"""Dependence measures functions."""
import numpy as np
from scipy.spatial import procrustes
from scipy.spatial.distance import squareform, pdist
from scipy.stats import t as t_dist
import warnings
from joblib import Parallel, delayed
from sklearn.utils import check_random_state
from nltools.stats import _calc_pvalue
__all__ = ['double_center',
'u_center',
'distance_correlation',
'procrustes_similarity'
]
def double_center(mat):
    '''Double center a 2d array.

    Subtracts each column mean and each row mean from every entry and adds
    back the grand mean, so both the row means and the column means of the
    result are zero.

    Args:
        mat (ndarray): 2d numpy array

    Returns:
        mat (ndarray): double-centered version of input
    '''
    if mat.ndim != 2:
        raise ValueError('Array should be 2d')
    # keepdims keeps the means 2d so subtraction broadcasts along the right axis
    col_means = mat.mean(axis=0, keepdims=True)   # shape (1, n_cols)
    row_means = mat.mean(axis=1, keepdims=True)   # shape (n_rows, 1)
    overall_mean = mat.mean()
    return mat - col_means - row_means + overall_mean
def u_center(mat):
    '''U-center a 2d array. U-centering is a bias-corrected form of double-centering

    Args:
        mat (ndarray): 2d numpy array

    Returns:
        mat (ndarray): u-centered version of input
    '''
    if len(mat.shape) != 2:
        raise ValueError('Array should be 2d')
    dim = mat.shape[0]
    # Grand-mean term uses (n-1)(n-2); row/column terms use (n-2) — the
    # bias-corrected denominators of the U-statistic estimator.
    u_mu = mat.sum() / ((dim - 1) * (dim - 2))
    sum_cols = mat.sum(axis=0, keepdims=True)
    sum_rows = mat.sum(axis=1, keepdims=True)
    u_mu_cols = np.ones((dim, 1)).dot(sum_cols / (dim - 2))
    u_mu_rows = (sum_rows / (dim - 2)).dot(np.ones((1, dim)))
    # Cast to float so the in-place subtractions below work for integer input
    # (np.copy of an int array would raise on "-= float").
    out = mat.astype(float)
    # Do one operation at a time, to improve broadcasting memory usage.
    out -= u_mu_rows
    out -= u_mu_cols
    out += u_mu
    # By convention the diagonal of a u-centered matrix is zero
    out[np.eye(dim, dtype=bool)] = 0
    # BUGFIX: the result was computed but never returned (function returned None)
    return out
def distance_correlation(x, y, bias_corrected=True, return_all_stats=False):
    '''Compute the distance correlation between 2 arrays.

    Distance correlation involves computing the normalized covariance of two
    centered euclidean distance matrices. Each distance matrix is the euclidean
    distance between rows (if x or y are 2d) or scalars (if x or y are 1d).
    Each matrix is centered using u-centering, a bias-corrected form of
    double-centering. This permits inference of the normalized covariance
    between each distance matrix using a one-tailed directional t-test
    (Szekely & Rizzo, 2013). While distance correlation is normally bounded
    between 0 and 1, u-centering can produce negative estimates, which are
    never significant. Therefore these estimates are windsorized to 0, ala
    Geerligs, Cam-CAN, Henson, 2016.

    Args:
        x (ndarray): 1d or 2d numpy array of observations by features
        y (ndarray): 1d or 2d numpy array of observations by features
        bias_corrected (bool): if false use double-centering but no inference
            test is performed, if true use u-centering and perform inference;
            default True
        return_all_stats (bool): if true also return the distance covariance,
            the squared correlation, and the variances of each array;
            default False

    Returns:
        results (dict): dictionary with 'd_correlation_adjusted' (and, with
            bias_corrected, 't', 'p', 'df'); with return_all_stats also
            'd_covariance_squared', 'd_correlation_squared', 'x_var', 'y_var'
    '''
    if len(x.shape) > 2 or len(y.shape) > 2:
        raise ValueError("Both arrays must be 1d or 2d")

    # 1 compute euclidean distances between pairs of values in each array
    _x = x[:, np.newaxis] if len(x.shape) == 1 else x
    _y = y[:, np.newaxis] if len(y.shape) == 1 else y
    x_dist = squareform(pdist(_x))
    y_dist = squareform(pdist(_y))

    # 2 center each matrix
    if bias_corrected:
        # U-centering
        x_dist_cent = u_center(x_dist)
        y_dist_cent = u_center(y_dist)
        # Compute covariances using N*(N-3) in denominator (bias-corrected)
        adjusted_n = _x.shape[0] * (_x.shape[0] - 3)
        xy = np.multiply(x_dist_cent, y_dist_cent).sum() / adjusted_n
        xx = np.multiply(x_dist_cent, x_dist_cent).sum() / adjusted_n
        yy = np.multiply(y_dist_cent, y_dist_cent).sum() / adjusted_n
    else:
        # double-centering
        x_dist_cent = double_center(x_dist)
        y_dist_cent = double_center(y_dist)
        # Compute covariances using N^2 in denominator
        xy = np.multiply(x_dist_cent, y_dist_cent).mean()
        xx = np.multiply(x_dist_cent, x_dist_cent).mean()
        yy = np.multiply(y_dist_cent, y_dist_cent).mean()

    # 3 compute variances
    var_x = np.sqrt(xx)
    var_y = np.sqrt(yy)

    # 4 Normalize to get correlation; guard against zero variance
    denom = np.sqrt(xx * yy)
    r2 = xy / denom if denom > 0 else 0
    # Windsorize negative values that can arise from u-centering
    cor = np.sqrt(r2) if r2 > 0 else 0

    out = {'d_correlation_adjusted': cor}
    if return_all_stats:
        out['d_covariance_squared'] = xy
        # BUGFIX: r2 previously overwrote 'd_correlation_adjusted', so the
        # meaning of that key silently changed when return_all_stats=True.
        # The squared correlation now lives under its own key.
        out['d_correlation_squared'] = r2
        out['x_var'] = var_x
        out['y_var'] = var_y
    if bias_corrected:
        # One-tailed directional t-test on the bias-corrected estimate
        # (Szekely & Rizzo, 2013). Note: t is computed from r2 (possibly
        # negative) rather than the windsorized cor, so negative estimates
        # can never be significant.
        dof = (adjusted_n / 2) - 1
        t = np.sqrt(dof) * (r2 / np.sqrt(1 - r2 ** 2))
        p = 1 - t_dist.cdf(t, dof)
        out['t'] = t
        out['p'] = p
        out['df'] = dof
    return out
def procrustes_similarity(mat1, mat2, n_permute=5000, tail=1, n_jobs=-1, random_state=None):
    """Use procrustes super-position to perform a similarity test between 2 matrices.

    Matrices need to match in size on their first dimension only; the smaller
    matrix on the second dimension is padded with zeros. After aligning the two
    matrices with the procrustes transformation, the computed disparity between
    them (sum of squared error of elements) is used as a similarity metric.
    The rows of one matrix are shuffled and the disparity recomputed to perform
    inference (Peres-Neto & Jackson, 2001). Note: disparity is reversed so the
    result reads as a *similarity* measure (larger values mean more similar),
    like correlation rather than correlation distance.

    Args:
        mat1 (ndarray): 2d numpy array; must have same number of rows as mat2
        mat2 (ndarray): 1d or 2d numpy array; must have same number of rows as mat1
        n_permute (int): number of permutation iterations to perform
        tail (int): either 1 for one-tailed or 2 for two-tailed test; default 1
        n_jobs (int): The number of CPUs to use to do permutation; default -1 (all)
        random_state (int, RandomState, or None): seed or generator for the
            permutations; default None

    Returns:
        stats (dict): dictionary with 'similarity' (1 - disparity) and
            'p' (permuted p-value)
    """
    warnings.warn("This function needs to be edited to scale SSE to a proportion. It is currently WRONG.")
    if mat1.shape[0] != mat2.shape[0]:
        raise ValueError('Both arrays must match on their first dimension')

    random_state = check_random_state(random_state)

    # Promote 1d inputs to column vectors so padding/procrustes work uniformly
    if mat1.ndim < 2:
        mat1 = mat1[:, np.newaxis]
    if mat2.ndim < 2:
        mat2 = mat2[:, np.newaxis]

    # Zero-pad the narrower matrix so both share the same number of columns
    width = max(mat1.shape[1], mat2.shape[1])
    if mat2.shape[1] < width:
        mat2 = np.pad(mat2, ((0, 0), (0, width - mat2.shape[1])), 'constant')
    if mat1.shape[1] < width:
        mat1 = np.pad(mat1, ((0, 0), (0, width - mat1.shape[1])), 'constant')

    _, _, disparity = procrustes(mat1, mat2)
    similarity = 1 - disparity  # flip the distance into a similarity

    # Permutations are drawn eagerly in this process (delayed() evaluates its
    # arguments immediately), so random_state advances sequentially as intended.
    permuted = Parallel(n_jobs=n_jobs)(
        delayed(procrustes)(random_state.permutation(mat1), mat2)
        for _ in range(n_permute)
    )
    null_distribution = [1 - result[2] for result in permuted]

    stats = {
        'similarity': similarity,
        'p': _calc_pvalue(null_distribution, similarity, tail),
    }
    return stats