"""Dependence measures functions."""
import numpy as np
from scipy.spatial import procrustes
from scipy.spatial.distance import squareform, pdist
from scipy.stats import t as t_dist
import warnings
from joblib import Parallel, delayed
from sklearn.utils import check_random_state
from nltools.stats import _calc_pvalue
__all__ = ['double_center',
'u_center',
'distance_correlation',
'procrustes_similarity'
]
def double_center(mat):
    '''Double center a 2d array.

    Subtracts each column mean and each row mean from every entry and adds
    back the grand mean, so both the row means and the column means of the
    result are zero.

    Args:
        mat (ndarray): 2d numpy array

    Returns:
        mat (ndarray): double-centered version of input
    '''
    if mat.ndim != 2:
        raise ValueError('Array should be 2d')
    # keepdims keeps the means 2d so subtraction broadcasts along the right axis
    col_means = mat.mean(axis=0, keepdims=True)   # shape (1, n_cols)
    row_means = mat.mean(axis=1, keepdims=True)   # shape (n_rows, 1)
    overall_mean = mat.mean()
    return mat - col_means - row_means + overall_mean
def u_center(mat):
    '''U-center a 2d array. U-centering is a bias-corrected form of double-centering

    Args:
        mat (ndarray): 2d numpy array

    Returns:
        mat (ndarray): u-centered version of input
    '''
    if len(mat.shape) != 2:
        raise ValueError('Array should be 2d')
    dim = mat.shape[0]
    # Grand-mean term uses (n-1)(n-2); row/column terms use (n-2) — the
    # bias-corrected denominators of the U-statistic estimator.
    u_mu = mat.sum() / ((dim - 1) * (dim - 2))
    sum_cols = mat.sum(axis=0, keepdims=True)
    sum_rows = mat.sum(axis=1, keepdims=True)
    u_mu_cols = np.ones((dim, 1)).dot(sum_cols / (dim - 2))
    u_mu_rows = (sum_rows / (dim - 2)).dot(np.ones((1, dim)))
    # Cast to float so the in-place subtractions below work for integer input
    # (np.copy of an int array would raise on "-= float").
    out = mat.astype(float)
    # Do one operation at a time, to improve broadcasting memory usage.
    out -= u_mu_rows
    out -= u_mu_cols
    out += u_mu
    # By convention the diagonal of a u-centered matrix is zero
    out[np.eye(dim, dtype=bool)] = 0
    # BUGFIX: the result was computed but never returned (function returned None)
    return out
def distance_correlation(x, y, bias_corrected=True, return_all_stats=False):
    '''Compute the distance correlation between 2 arrays.

    Distance correlation involves computing the normalized covariance of two
    centered euclidean distance matrices. Each distance matrix is the euclidean
    distance between rows (if x or y are 2d) or scalars (if x or y are 1d).
    Each matrix is centered using u-centering, a bias-corrected form of
    double-centering. This permits inference of the normalized covariance
    between each distance matrix using a one-tailed directional t-test
    (Szekely & Rizzo, 2013). While distance correlation is normally bounded
    between 0 and 1, u-centering can produce negative estimates, which are
    never significant. Therefore these estimates are windsorized to 0, ala
    Geerligs, Cam-CAN, Henson, 2016.

    Args:
        x (ndarray): 1d or 2d numpy array of observations by features
        y (ndarray): 1d or 2d numpy array of observations by features
        bias_corrected (bool): if false use double-centering but no inference
            test is performed, if true use u-centering and perform inference;
            default True
        return_all_stats (bool): if true also return the distance covariance,
            the squared correlation, and the variances of each array;
            default False

    Returns:
        results (dict): dictionary with 'd_correlation_adjusted' (and, with
            bias_corrected, 't', 'p', 'df'); with return_all_stats also
            'd_covariance_squared', 'd_correlation_squared', 'x_var', 'y_var'
    '''
    if len(x.shape) > 2 or len(y.shape) > 2:
        raise ValueError("Both arrays must be 1d or 2d")

    # 1 compute euclidean distances between pairs of values in each array
    _x = x[:, np.newaxis] if len(x.shape) == 1 else x
    _y = y[:, np.newaxis] if len(y.shape) == 1 else y
    x_dist = squareform(pdist(_x))
    y_dist = squareform(pdist(_y))

    # 2 center each matrix
    if bias_corrected:
        # U-centering
        x_dist_cent = u_center(x_dist)
        y_dist_cent = u_center(y_dist)
        # Compute covariances using N*(N-3) in denominator (bias-corrected)
        adjusted_n = _x.shape[0] * (_x.shape[0] - 3)
        xy = np.multiply(x_dist_cent, y_dist_cent).sum() / adjusted_n
        xx = np.multiply(x_dist_cent, x_dist_cent).sum() / adjusted_n
        yy = np.multiply(y_dist_cent, y_dist_cent).sum() / adjusted_n
    else:
        # double-centering
        x_dist_cent = double_center(x_dist)
        y_dist_cent = double_center(y_dist)
        # Compute covariances using N^2 in denominator
        xy = np.multiply(x_dist_cent, y_dist_cent).mean()
        xx = np.multiply(x_dist_cent, x_dist_cent).mean()
        yy = np.multiply(y_dist_cent, y_dist_cent).mean()

    # 3 compute variances
    var_x = np.sqrt(xx)
    var_y = np.sqrt(yy)

    # 4 Normalize to get correlation; guard against zero variance
    denom = np.sqrt(xx * yy)
    r2 = xy / denom if denom > 0 else 0
    # Windsorize negative values that can arise from u-centering
    cor = np.sqrt(r2) if r2 > 0 else 0

    out = {'d_correlation_adjusted': cor}
    if return_all_stats:
        out['d_covariance_squared'] = xy
        # BUGFIX: r2 previously overwrote 'd_correlation_adjusted', so the
        # meaning of that key silently changed when return_all_stats=True.
        # The squared correlation now lives under its own key.
        out['d_correlation_squared'] = r2
        out['x_var'] = var_x
        out['y_var'] = var_y
    if bias_corrected:
        # One-tailed directional t-test on the bias-corrected estimate
        # (Szekely & Rizzo, 2013). Note: t is computed from r2 (possibly
        # negative) rather than the windsorized cor, so negative estimates
        # can never be significant.
        dof = (adjusted_n / 2) - 1
        t = np.sqrt(dof) * (r2 / np.sqrt(1 - r2 ** 2))
        p = 1 - t_dist.cdf(t, dof)
        out['t'] = t
        out['p'] = p
        out['df'] = dof
    return out
def procrustes_similarity(mat1, mat2, n_permute=5000, tail=1, n_jobs=-1, random_state=None):
    """Use procrustes super-position to perform a similarity test between 2 matrices.

    Matrices need to match in size on their first dimension only; the smaller
    matrix on the second dimension is padded with zeros. After aligning the two
    matrices with the procrustes transformation, the computed disparity between
    them (sum of squared error of elements) is used as a similarity metric.
    The rows of one matrix are shuffled and the disparity recomputed to perform
    inference (Peres-Neto & Jackson, 2001). Note: disparity is reversed so the
    result reads as a *similarity* measure (larger values mean more similar),
    like correlation rather than correlation distance.

    Args:
        mat1 (ndarray): 2d numpy array; must have same number of rows as mat2
        mat2 (ndarray): 1d or 2d numpy array; must have same number of rows as mat1
        n_permute (int): number of permutation iterations to perform
        tail (int): either 1 for one-tailed or 2 for two-tailed test; default 1
        n_jobs (int): The number of CPUs to use to do permutation; default -1 (all)
        random_state (int, RandomState, or None): seed or generator for the
            permutations; default None

    Returns:
        stats (dict): dictionary with 'similarity' (1 - disparity) and
            'p' (permuted p-value)
    """
    warnings.warn("This function needs to be edited to scale SSE to a proportion. It is currently WRONG.")
    if mat1.shape[0] != mat2.shape[0]:
        raise ValueError('Both arrays must match on their first dimension')

    random_state = check_random_state(random_state)

    # Promote 1d inputs to column vectors so padding/procrustes work uniformly
    if mat1.ndim < 2:
        mat1 = mat1[:, np.newaxis]
    if mat2.ndim < 2:
        mat2 = mat2[:, np.newaxis]

    # Zero-pad the narrower matrix so both share the same number of columns
    width = max(mat1.shape[1], mat2.shape[1])
    if mat2.shape[1] < width:
        mat2 = np.pad(mat2, ((0, 0), (0, width - mat2.shape[1])), 'constant')
    if mat1.shape[1] < width:
        mat1 = np.pad(mat1, ((0, 0), (0, width - mat1.shape[1])), 'constant')

    _, _, disparity = procrustes(mat1, mat2)
    similarity = 1 - disparity  # flip the distance into a similarity

    # Permutations are drawn eagerly in this process (delayed() evaluates its
    # arguments immediately), so random_state advances sequentially as intended.
    permuted = Parallel(n_jobs=n_jobs)(
        delayed(procrustes)(random_state.permutation(mat1), mat2)
        for _ in range(n_permute)
    )
    null_distribution = [1 - result[2] for result in permuted]

    stats = {
        'similarity': similarity,
        'p': _calc_pvalue(null_distribution, similarity, tail),
    }
    return stats