some cleanup and new functions

JoeZiminski · Nov 30, 2017 · 7e8a3e7 · 7e8a3e7
1 parent f521c6f
commit 7e8a3e7
Show file tree

Hide file tree

Showing 21 changed files with 630 additions and 408 deletions.
diff --git a/HZmvntest.m b/HZmvntest.m
@@ -184,10 +184,6 @@
     return;
 end
 
-% remove NaNs
-% -----------
-X(find(sum(isnan(X),2)),:) = [];
-
 if c == 1  %covariance matrix normalizes by (n) [=default]
     S = cov(X,1);
 else   %covariance matrix normalizes by (n-1)

diff --git a/MC_corrpval.asv b/MC_corrpval.asv
@@ -0,0 +1,80 @@
+function [p_alpha,v] = MC_corrpval(n,p,method,alphav,pairs,D)
+
+% function to compute the alpha quantile estimate of the distribution of
+% minimal p-values under the null of correlations in a n*p matrix with null
+% covariance but variance D (I by default)
+%
+% FORMAT p_alpha = MC_corrpval(n,p,D)
+%
+% INPUT n the number of observations
+%       p the number of variables
+%       method can be 'Pearson', 'Spearman', 'Skipped Pearson', 'Skipped Spearman'
+%       pairs a m*2 matrix of variables to correlate (optional)
+%       D the variance of each variable (optional)
+%
+% p_alpha the alpha quantile estimate of the distribution of
+%         minimal p-values
+%
+%
+% Cyril Pernet v3 - Novembre 2017
+% ---------------------------------------------------
+%  Copyright (C) Corr_toolbox 2017
+
+%% deal with inputs
+if nargin == 0
+    help MC_corrpval
+    elsie nargin < 2
+    error('at least 2 inputs requested see help MC_corrpval');
+end
+
+if ~exist('pairs','var') || isempty(pairs)
+    pairs = nchoosek([1:p],2);
+end
+
+if ~exist('alphav','var')
+    alphav = 5/100;
+end
+
+%% generate the variance
+SIGMA = eye(p);
+if exist('D','var')
+    if length(D) ~= p
+        error('the vector D of variance must be of the same size as the number of variables p')
+    else
+        SIGMA(SIGMA==1) = D;
+    end
+end
+
+%% run the Monte Carlo simulation and keep smallest p values
+v = NaN(1,1000);
+parfor MC = 1:1000
+    fprintf('Running Monte Carlo %g\n',MC)
+    MVN = mvnrnd(zeros(1,p),SIGMA,n); % a multivariate normal distribution
+    if strcmp(method,'Pearson')
+        [~,~,pval] = Pearson(MVN,pairs);
+    elseif strcmp(method,'Pearson')
+        [~,~,pval] = Spearman(MVN,pairs);
+    elseif strcmp(method,'Skipped Pearson')
+        [r,t,pval] = skipped_Pearson(MVN,pairs);
+    elseif strcmp(method,'Skipped Spearman')
+        [r,t,pval] = skipped_Spearman(MVN,pairs);
+    end
+    v(MC) = min(pval);
+
+end
+
+%% get the Harell-Davis estimate of the alpha quantile
+    n       = length(v);
+for l=1:length(alphav)
+    q       = alphav(l)*10;  % for a decile
+    m1      = (n+1).*q;
+    m2      = (n+1).*(1-q);
+    vec     = 1:n;
+    w       = betacdf(vec./n,m1,m2)-betacdf((vec-1)./n,m1,m2);
+    y       = sort(v);
+    p_alpha(l) = sum(w(:).*y(:));
+end
+
+% For p=6, n=20 alpha =.05 .025 .01 get
+% 0.01122045  0.004343809 0.002354744
+% 0.0240      0.0069      0.000027
diff --git a/MC_corrpval.m b/MC_corrpval.m
@@ -0,0 +1,85 @@
+function [r_alpha,t_alpha,p_alpha,vv,vvv,v] = MC_corrpval(n,p,method,alphav,pairs,D)
+
+% function to compute the alpha quantile estimate of the distribution of
+% minimal p-values under the null of correlations in a n*p matrix with null
+% covariance but variance D (I by default)
+%
+% FORMAT p_alpha = MC_corrpval(n,p,D)
+%
+% INPUT n the number of observations
+%       p the number of variables
+%       method can be 'Pearson', 'Spearman', 'Skipped Pearson', 'Skipped Spearman'
+%       pairs a m*2 matrix of variables to correlate (optional)
+%       D the variance of each variable (optional)
+%
+% p_alpha the alpha quantile estimate of the distribution of
+%         minimal p-values
+%
+%
+% Cyril Pernet v3 - Novembre 2017
+% ---------------------------------------------------
+%  Copyright (C) Corr_toolbox 2017
+
+%% deal with inputs
+if nargin == 0
+    help MC_corrpval
+    elsie nargin < 2
+    error('at least 2 inputs requested see help MC_corrpval');
+end
+
+if ~exist('pairs','var') || isempty(pairs)
+    pairs = nchoosek([1:p],2);
+end
+
+if ~exist('alphav','var')
+    alphav = 5/100;
+end
+
+%% generate the variance
+SIGMA = eye(p);
+if exist('D','var')
+    if length(D) ~= p
+        error('the vector D of variance must be of the same size as the number of variables p')
+    else
+        SIGMA(SIGMA==1) = D;
+    end
+end
+
+%% run the Monte Carlo simulation and keep smallest p values
+v = NaN(1,1000);
+parfor MC = 1:1000
+    fprintf('Running Monte Carlo %g\n',MC)
+    MVN = mvnrnd(zeros(1,p),SIGMA,n); % a multivariate normal distribution
+    if strcmp(method,'Pearson')
+        [~,~,pval] = Pearson(MVN,pairs);
+    elseif strcmp(method,'Pearson')
+        [~,~,pval] = Spearman(MVN,pairs);
+    elseif strcmp(method,'Skipped Pearson')
+        [r,t,pval] = skipped_Pearson(MVN,pairs);
+    elseif strcmp(method,'Skipped Spearman')
+        [r,t,pval] = skipped_Spearman(MVN,pairs);
+    end
+    v(MC)   = min(pval);
+    vv(MC)  = max(r);
+    vvv(MC) = max(t);
+end
+
+%% get the Harell-Davis estimate of the alpha quantile
+    n       = length(v);
+for l=1:length(alphav)
+    q       = alphav(l)*10;  % for a decile
+    m1      = (n+1).*q;
+    m2      = (n+1).*(1-q);
+    vec     = 1:n;
+    w       = betacdf(vec./n,m1,m2)-betacdf((vec-1)./n,m1,m2);
+    y       = sort(v);
+    p_alpha(l) = sum(w(:).*y(:));
+    y       = sort(vv);
+    r_alpha(l) = sum(w(:).*y(:));
+    y       = sort(vvv);
+    t_alpha(l) = sum(w(:).*y(:));
+end
+
+% For p=6, n=20 alpha =.05 .025 .01 get
+% 0.01122045  0.004343809 0.002354744
+% 0.0240      0.0069      0.000027
diff --git a/Pearson.m b/Pearson.m
@@ -1,4 +1,4 @@
-function [r,t,pval,hboot,CI] = Pearson(X,Y,fig_flag,level)
+function [r,t,pval,hboot,CI,pboot] = Pearson(X,Y,fig_flag,level)
 
 % compute the Pearson correlation along with the bootstrap CI
 %
@@ -22,15 +22,11 @@
 %
 % If X and Y are matrices of size [n p], p correlations are computed
 % consequently, the CI are adjusted at a level alpha/p (Bonferonni
-% correction) and hboot is based on these adjusted CI but pval remains
-% uncorrected - note also that if some values are NaN, the adjustement 
-% is based on the largest n
-%
-% This function requires the nansum.m function
-% from the matlab stat toolbox. 
-%
-% Cyril Pernet v2 (10-01-2014 - deals with NaN)
-% ---------------------------------------------
+% correction) and hboot is based on these adjusted CI (pval remain
+% uncorrected)
+
+% Cyril Pernet v1
+% ---------------------------------
 %  Copyright (C) Corr_toolbox 2012
 
 %% data check
@@ -61,27 +57,20 @@
 end
 
 [n p] = size(X);
-if p==1
-    [X Y]=pairwise_cleanup(X,Y);
-    [n p] = size(X);
-end
 
 %% basic Pearson
 
 % compute r 
-r = nansum(demean(X).*demean(Y)) ./ ...
-    (nansum(demean(X).^2).*nansum(demean(Y).^2)).^(1/2);
+r = sum(detrend(X,'constant').*detrend(Y,'constant')) ./ ...
+    (sum(detrend(X,'constant').^2).*sum(detrend(Y,'constant').^2)).^(1/2);
 t = r.*sqrt((n-2)./(1-r.^2));
 pval = 2*tcdf(-abs(t),n-2);
 
-%% bootstrap 
 if nargout > 3
     % adjust boot parameters
     if p == 1
         nboot = 599;
         % adjust percentiles following Wilcox
-        % even if NaN n is fine since data have
-        % been cleaned up
         if n < 40
             low = 7 ; high = 593;
         elseif n >= 40 && n < 80
@@ -105,19 +94,15 @@
             high = nboot - low;
         end
     end
-
     % compute hboot and CI
     table = randi(n,n,nboot);
     for B=1:nboot
-        rb(B,:) = nansum(demean(X(table(:,B),:)).*demean(Y(table(:,B),:))) ./ ...
-            (nansum(demean(X(table(:,B),:)).^2).*nansum(demean(Y(table(:,B),:)).^2)).^(1/2);
-        if fig_flag ~= 0 % to make a nice figure get regression values
-            for c=1:size(X,2)
-                tmp = [X(table(:,B),c) Y(table(:,B),c)]; % take a pair
-                tmp(find(sum(isnan(tmp),2)),:) = []; % remove NaNs
-                b = pinv([tmp(:,1) ones(size(tmp,1),1)])*tmp(:,2); % solve
-                slope(B,c) = b(1); intercept(B,c) = b(2,:);
-            end
+        rb(B,:) = sum(detrend(X(table(:,B),:),'constant').*detrend(Y(table(:,B),:),'constant')) ./ ...
+            (sum(detrend(X(table(:,B),:),'constant').^2).*sum(detrend(Y(table(:,B),:),'constant').^2)).^(1/2);
+        for c=1:size(X,2)
+            b = pinv([X(table(:,B),c) ones(n,1)])*Y(table(:,B),c);
+            slope(B,c) = b(1); 
+            intercept(B,c) = b(2,:);
         end
     end
 

diff --git a/Spearman.m b/Spearman.m
@@ -20,14 +20,14 @@
 % If X and Y are matrices of size [n p], p correlations are computed
 % and the CIs are adjusted at the alpha/p level (Bonferonni
 % correction); hboot is based on these adjusted CIs but pval remains
-% uncorrected - note also that if some values are NaN, the adjustement 
-% is based on the largest n
+% uncorrected.
 %
-% This function requires the tiedrank.m and nansum.m functions
-% from the matlab stat toolbox. 
+% This function requires the tiedrank.m function from the matlab stat toolbox. 
 %
-% Cyril Pernet v2 (10-01-2014 - deals with NaN)
-% ---------------------------------------------
+% See also TIEDRANK.
+
+% Cyril Pernet v1
+% ---------------------------------
 %  Copyright (C) Corr_toolbox 2012
 
 %% data check
@@ -58,27 +58,21 @@
 end
 
 [n p] = size(X);
-if p==1
-    [X Y]=pairwise_cleanup(X,Y);
-    [n p] = size(X);
-end
 
 %% basic Spearman
 
-% The corr function in the stat toolbox uses
-% permutations for n<10 and some other fancy
-% things when n>10 and there are no ties among
-% ranks - we just do the standard way.
-
 % compute r (default)
 xrank = tiedrank(X,0);
 yrank = tiedrank(Y,0);
-r = nansum(demean(xrank).*demean(yrank)) ./ ...
-    (nansum(demean(xrank).^2).*nansum(demean(yrank).^2)).^(1/2);
+r = sum(detrend(xrank,'constant').*detrend(yrank,'constant')) ./ ...
+    (sum(detrend(xrank,'constant').^2).*sum(detrend(yrank,'constant').^2)).^(1/2);
 t = r.*(sqrt(n-2)) ./ sqrt((1-r.^2));
 pval = 2*tcdf(-abs(t),n-2);
+% The corr function in the stat toolbox uses
+% permutations for n<10 and some other fancy
+% things when n>10 and there are no ties among
+% ranks - we just do the standard way.
 
-%% bootstrap
 if nargout > 3
     nboot = 1000;
     if p > 1
@@ -96,15 +90,11 @@
     for B=1:nboot
         xrank = tiedrank(X(table(:,B),:),0);
         yrank = tiedrank(Y(table(:,B),:),0);
-        rb(B,:) = nansum(demean(xrank).*demean(yrank)) ./ ...
-            (nansum(demean(xrank).^2).*nansum(demean(yrank).^2)).^(1/2);
-        if fig_flag ~= 0 % to make a nice figure get regression values
-            for c=1:size(X,2)
-                tmp = [xrank(:,c) yrank(:,c)]; % take a pair
-                tmp(find(sum(isnan(tmp),2)),:) = []; % remove NaNs
-                b = pinv([tmp(:,1) ones(size(tmp,1),1)])*tmp(:,2); % solve
-                slope(B,c) = b(1); intercept(B,c) = b(2,:);
-            end
+        rb(B,:) = sum(detrend(xrank,'constant').*detrend(yrank,'constant')) ./ ...
+            (sum(detrend(xrank,'constant').^2).*sum(detrend(yrank,'constant').^2)).^(1/2);
+        for c=1:size(X,2)
+            b = pinv([xrank(:,c) ones(n,1)])*yrank(:,c);
+            slope(B,c) = b(1); intercept(B,c) = b(2,:);
         end
     end
 

diff --git a/bendcorr.m b/bendcorr.m
@@ -20,11 +20,10 @@
 %          pH is the p value for an omnibus test of independence between all pairs 
 %
 % The percentage bend correlation is a robust method that protects against
-% outliers among the marginal distributions. If NaN values are present, the
-% smallest available n is used across all pairs.
-%
+% outliers among the marginal distributions.
+
 % Cyril Pernet and Guillaume Rousselet 26-01-2011
-% Reformatted for Corr_toolbox 02-7-2012 
+% Reformatted for Corr_toolbox 02--7-2012 
 % ----------------------------------------------
 %  Copyright (C) Corr_toolbox 2012
 

diff --git a/bivariate_outliers.m b/bivariate_outliers.m
@@ -0,0 +1,47 @@
+function flag = bivariate_outliers(X)
+
+% routine that find the bivariate outliers using orthogonal projection and
+% box plot rule 
+
+% find the centre of the data cloud using mid-covariance determinant
+n   = size(X,1);
+result = mcdcov(X,'cor',1,'plots',0,'h',floor((n+size(X,2)*2+1)/2));
+center = result.center;
+
+% orthogonal projection to the lines joining the center
+% followed by outlier detection using box plot rule
+
+gval = sqrt(chi2inv(0.975,2)); % in fact depends on size(X,2) but here always = 2
+for i=1:n % for each row
+    dis = NaN(n,1);
+    B   = (X(i,:)-center)';
+    BB  = B.^2;
+    bot = sum(BB);
+    if bot~=0
+        for j=1:n
+            A = (X(j,:)-center)';
+            dis(j)= norm(A'*B/bot.*B);
+        end
+        % IQR rule
+        [ql,qu]=idealf(dis);
+        record{i} = (dis > median(dis)+gval.*(qu-ql)) ; % + (dis < median(dis)-gval.*(qu-ql));
+    end
+end
+
+try
+    flag = nan(n,1);
+    flag = sum(cell2mat(record),2); % if any point is flagged
+
+catch ME  % this can happen to have an empty cell so loop
+    flag = nan(n,size(record,2));
+    index = 1;
+    for s=1:size(record,2)
+        if ~isempty(record{s})
+            flag(:,index) = record{s};
+            index = index+1;
+        end
+    end
+    flag(:,index:end) = [];
+    flag = sum(flag,2);
+end
+