|
import warnings |
|
import numpy as np |
|
|
|
from scipy._lib._util import check_random_state, MapWrapper, rng_integers, _contains_nan |
|
from scipy._lib._bunch import _make_tuple_bunch |
|
from scipy.spatial.distance import cdist |
|
from scipy.ndimage import _measurements |
|
|
|
from ._stats import _local_correlations |
|
from . import distributions |
|
|
|
__all__ = ['multiscale_graphcorr'] |
|
|
|
|
|
|
|
|
|
class _ParallelP: |
|
"""Helper function to calculate parallel p-value.""" |
|
|
|
def __init__(self, x, y, random_states): |
|
self.x = x |
|
self.y = y |
|
self.random_states = random_states |
|
|
|
def __call__(self, index): |
|
        # permute the rows and columns of the second distance matrix
        order = self.random_states[index].permutation(self.y.shape[0])
        permy = self.y[order][:, order]

        # recompute the MGC statistic on the permuted data
        perm_stat = _mgc_stat(self.x, permy)[0]
|
|
|
return perm_stat |
|
|
|
|
|
def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None): |
|
r"""Helper function that calculates the p-value. See below for uses. |
|
|
|
Parameters |
|
---------- |
|
x, y : ndarray |
|
`x` and `y` have shapes ``(n, p)`` and ``(n, q)``. |
|
stat : float |
|
The sample test statistic. |
|
reps : int, optional |
|
The number of replications used to estimate the null when using the |
|
permutation test. The default is 1000 replications. |
|
workers : int or map-like callable, optional |
|
If `workers` is an int the population is subdivided into `workers` |
|
sections and evaluated in parallel (uses |
|
`multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores |
|
available to the Process. Alternatively supply a map-like callable, |
|
such as `multiprocessing.Pool.map` for evaluating the population in |
|
parallel. This evaluation is carried out as `workers(func, iterable)`. |
|
Requires that `func` be pickleable. |
|
random_state : {None, int, `numpy.random.Generator`, |
|
`numpy.random.RandomState`}, optional |
|
|
|
        If `random_state` is None (or `np.random`), the
        `numpy.random.RandomState` singleton is used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``Generator`` or ``RandomState``
        instance then that instance is used.
|
|
|
Returns |
|
------- |
|
pvalue : float |
|
The sample test p-value. |
|
null_dist : list |
|
The approximated null distribution. |
|
|
|
""" |
|
|
|
|
|
random_state = check_random_state(random_state) |
|
    # seed an independent RandomState for each permutation replicate
    random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32,
                     size=4, dtype=np.uint32)) for _ in range(reps)]
|
|
|
|
|
parallelp = _ParallelP(x=x, y=y, random_states=random_states) |
|
with MapWrapper(workers) as mapwrapper: |
|
null_dist = np.array(list(mapwrapper(parallelp, range(reps)))) |
|
|
|
|
|
    # the +1s count the observed statistic itself, so the p-value is never 0
    pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps)
|
|
|
return pvalue, null_dist |
|
|
|
|
|
def _euclidean_dist(x): |
|
return cdist(x, x) |
|
|
|
|
|
MGCResult = _make_tuple_bunch('MGCResult', |
|
['statistic', 'pvalue', 'mgc_dict'], []) |
|
|
|
|
|
def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000, |
|
workers=1, is_twosamp=False, random_state=None): |
|
r"""Computes the Multiscale Graph Correlation (MGC) test statistic. |
|
|
|
Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for |
|
one property (e.g. cloud density), and the :math:`l`-nearest neighbors for |
|
the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is |
|
called the "scale". A priori, however, it is not know which scales will be |
|
most informative. So, MGC computes all distance pairs, and then efficiently |
|
computes the distance correlations for all scales. The local correlations |
|
illustrate which scales are relatively informative about the relationship. |
|
The key, therefore, to successfully discover and decipher relationships |
|
between disparate data modalities is to adaptively determine which scales |
|
are the most informative, and the geometric implication for the most |
|
informative scales. Doing so not only provides an estimate of whether the |
|
modalities are related, but also provides insight into how the |
|
determination was made. This is especially important in high-dimensional |
|
data, where simple visualizations do not reveal relationships to the |
|
unaided human eye. Characterizations of this implementation in particular |
|
    have been derived from and benchmarked in [2]_.
|
|
|
Parameters |
|
---------- |
|
x, y : ndarray |
|
If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is |
|
the number of samples and `p` and `q` are the number of dimensions, |
|
then the MGC independence test will be run. Alternatively, ``x`` and |
|
``y`` can have shapes ``(n, n)`` if they are distance or similarity |
|
        matrices, and ``compute_distance`` must be set to ``None``. If ``x``
|
and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired |
|
two-sample MGC test will be run. |
|
compute_distance : callable, optional |
|
A function that computes the distance or similarity among the samples |
|
within each data matrix. Set to ``None`` if ``x`` and ``y`` are |
|
        already distance matrices. The default uses the Euclidean norm metric.
        If you are calling a custom function, either create the distance
        matrix beforehand or create a function of the form
|
``compute_distance(x)`` where `x` is the data matrix for which |
|
pairwise distances are calculated. |
|
reps : int, optional |
|
The number of replications used to estimate the null when using the |
|
permutation test. The default is ``1000``. |
|
workers : int or map-like callable, optional |
|
If ``workers`` is an int the population is subdivided into ``workers`` |
|
sections and evaluated in parallel (uses ``multiprocessing.Pool |
|
<multiprocessing>``). Supply ``-1`` to use all cores available to the |
|
Process. Alternatively supply a map-like callable, such as |
|
``multiprocessing.Pool.map`` for evaluating the p-value in parallel. |
|
This evaluation is carried out as ``workers(func, iterable)``. |
|
Requires that `func` be pickleable. The default is ``1``. |
|
is_twosamp : bool, optional |
|
If `True`, a two sample test will be run. If ``x`` and ``y`` have |
|
        shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and
|
set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes |
|
``(n, p)`` and a two sample test is desired. The default is ``False``. |
|
Note that this will not run if inputs are distance matrices. |
|
random_state : {None, int, `numpy.random.Generator`, |
|
`numpy.random.RandomState`}, optional |
|
|
|
        If `random_state` is None (or `np.random`), the
        `numpy.random.RandomState` singleton is used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``Generator`` or ``RandomState``
        instance then that instance is used.
|
|
|
Returns |
|
------- |
|
res : MGCResult |
|
An object containing attributes: |
|
|
|
statistic : float |
|
The sample MGC test statistic within ``[-1, 1]``. |
|
pvalue : float |
|
The p-value obtained via permutation. |
|
mgc_dict : dict |
|
Contains additional useful results: |
|
|
|
- mgc_map : ndarray |
|
A 2D representation of the latent geometry of the |
|
relationship. |
|
- opt_scale : (int, int) |
|
The estimated optimal scale as a ``(x, y)`` pair. |
|
- null_dist : list |
|
The null distribution derived from the permuted matrices. |
|
|
|
See Also |
|
-------- |
|
pearsonr : Pearson correlation coefficient and p-value for testing |
|
non-correlation. |
|
kendalltau : Calculates Kendall's tau. |
|
spearmanr : Calculates a Spearman rank-order correlation coefficient. |
|
|
|
Notes |
|
----- |
|
A description of the process of MGC and applications on neuroscience data |
|
can be found in [1]_. It is performed using the following steps: |
|
|
|
#. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and |
|
modified to be mean zero columnwise. This results in two |
|
:math:`n \times n` distance matrices :math:`A` and :math:`B` (the |
|
centering and unbiased modification) [3]_. |
|
|
|
#. For all values :math:`k` and :math:`l` from :math:`1, ..., n`, |
|
|
|
* The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs |
|
are calculated for each property. Here, :math:`G_k (i, j)` indicates |
|
the :math:`k`-smallest values of the :math:`i`-th row of :math:`A` |
|
         and :math:`H_l (i, j)` indicates the :math:`l`-smallest values of
         the :math:`i`-th row of :math:`B`.
|
|
|
       * Let :math:`\circ` denote the entry-wise matrix product; then local
|
correlations are summed and normalized using the following statistic: |
|
|
|
.. math:: |
|
|
|
c^{kl} = \frac{\sum_{ij} A G_k B H_l} |
|
{\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}} |
|
|
|
#. The MGC test statistic is the smoothed optimal local correlation of |
|
       :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`
       (which essentially sets all isolated large correlations to 0 and keeps
       connected large correlations the same as before; see [3]_). MGC is,
|
|
|
.. math:: |
|
|
|
MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right) |
|
\right) |
|
|
|
The test statistic returns a value between :math:`(-1, 1)` since it is |
|
normalized. |
|
|
|
The p-value returned is calculated using a permutation test. This process |
|
is completed by first randomly permuting :math:`y` to estimate the null |
|
distribution and then calculating the probability of observing a test |
|
statistic, under the null, at least as extreme as the observed test |
|
statistic. |
|
|
|
MGC requires at least 5 samples to run with reliable results. It can also |
|
handle high-dimensional data sets. |
|
In addition, by manipulating the input data matrices, the two-sample |
|
testing problem can be reduced to the independence testing problem [4]_. |
|
    Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n` and
    :math:`p \times m`, data matrices :math:`X` and :math:`Y` can be created as
|
follows: |
|
|
|
.. math:: |
|
|
|
X = [U | V] \in \mathcal{R}^{p \times (n + m)} |
|
Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)} |
|
|
|
Then, the MGC statistic can be calculated as normal. This methodology can |
|
be extended to similar tests such as distance correlation [4]_. |
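
    The transformation performed internally when a two-sample test is run is
    equivalent to the following sketch (sample sizes chosen for illustration):

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> u, v = rng.random((10, 3)), rng.random((12, 3))  # two samples
    >>> x = np.concatenate([u, v], axis=0)                # shape (n + m, p)
    >>> y = np.concatenate([np.zeros(10), np.ones(12)]).reshape(-1, 1)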
|
|
|
.. versionadded:: 1.4.0 |
|
|
|
References |
|
---------- |
|
.. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E., |
|
Maggioni, M., & Shen, C. (2019). Discovering and deciphering |
|
relationships across disparate data modalities. ELife. |
|
.. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A., |
|
Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019). |
|
mgcpy: A Comprehensive High Dimensional Independence Testing Python |
|
Package. :arXiv:`1907.02088` |
|
.. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance |
|
correlation to multiscale graph correlation. Journal of the American |
|
Statistical Association. |
|
.. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of |
|
Distance and Kernel Methods for Hypothesis Testing. |
|
:arXiv:`1806.05514` |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.stats import multiscale_graphcorr |
|
>>> x = np.arange(100) |
|
>>> y = x |
|
>>> res = multiscale_graphcorr(x, y) |
|
>>> res.statistic, res.pvalue |
|
(1.0, 0.001) |
|
|
|
To run an unpaired two-sample test, |
|
|
|
>>> x = np.arange(100) |
|
>>> y = np.arange(79) |
|
>>> res = multiscale_graphcorr(x, y) |
|
>>> res.statistic, res.pvalue # doctest: +SKIP |
|
(0.033258146255703246, 0.023) |
|
|
|
    or, if the shapes of the inputs are the same,
|
|
|
>>> x = np.arange(100) |
|
>>> y = x |
|
>>> res = multiscale_graphcorr(x, y, is_twosamp=True) |
|
>>> res.statistic, res.pvalue # doctest: +SKIP |
|
(-0.008021809890200488, 1.0) |
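
    If ``x`` and ``y`` are already distance matrices, set ``compute_distance``
    to ``None``. Precomputing the Euclidean distances this way should
    reproduce the first example:

    >>> from scipy.spatial.distance import cdist
    >>> x = np.arange(100).reshape(-1, 1)
    >>> y = x
    >>> dx, dy = cdist(x, x), cdist(y, y)
    >>> res = multiscale_graphcorr(dx, dy, compute_distance=None)
    >>> res.statistic, res.pvalue  # doctest: +SKIP
    (1.0, 0.001)

    The permutation test can also be evaluated in parallel by passing
    ``workers``, for example ``workers=-1`` to use all available cores:

    >>> res = multiscale_graphcorr(x, y, workers=-1)  # doctest: +SKIP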
|
|
|
""" |
|
if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray): |
|
raise ValueError("x and y must be ndarrays") |
|
|
|
|
|
if x.ndim == 1: |
|
x = x[:, np.newaxis] |
|
elif x.ndim != 2: |
|
raise ValueError(f"Expected a 2-D array `x`, found shape {x.shape}") |
|
if y.ndim == 1: |
|
y = y[:, np.newaxis] |
|
elif y.ndim != 2: |
|
raise ValueError(f"Expected a 2-D array `y`, found shape {y.shape}") |
|
|
|
nx, px = x.shape |
|
ny, py = y.shape |
|
|
|
|
|
_contains_nan(x, nan_policy='raise') |
|
_contains_nan(y, nan_policy='raise') |
|
|
|
|
|
if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0: |
|
raise ValueError("Inputs contain infinities") |
|
|
|
if nx != ny: |
|
        if px == py:
            # sample sizes differ but dimensions match, so run the unpaired
            # two-sample test
            is_twosamp = True
|
else: |
|
raise ValueError("Shape mismatch, x and y must have shape [n, p] " |
|
"and [n, q] or have shape [n, p] and [m, p].") |
|
|
|
if nx < 5 or ny < 5: |
|
raise ValueError("MGC requires at least 5 samples to give reasonable " |
|
"results.") |
|
|
|
|
|
x = x.astype(np.float64) |
|
y = y.astype(np.float64) |
|
|
|
|
|
    if not callable(compute_distance) and compute_distance is not None:
        raise ValueError("`compute_distance` must be a callable or None.")
|
|
|
|
|
|
|
    if not isinstance(reps, int) or reps < 1:
        raise ValueError("Number of reps must be an integer greater than 0.")
|
elif reps < 1000: |
|
msg = ("The number of replications is low (under 1000), and p-value " |
|
"calculations may be unreliable. Use the p-value result, with " |
|
"caution!") |
|
warnings.warn(msg, RuntimeWarning, stacklevel=2) |
|
|
|
    if is_twosamp:
        if compute_distance is None:
            raise ValueError("Cannot run if inputs are distance matrices")
        # concatenate the samples and use 0/1 labels as the second data matrix
        x, y = _two_sample_transform(x, y)
|
|
|
    if compute_distance is not None:
        # convert the data matrices to distance matrices
        x = compute_distance(x)
        y = compute_distance(y)
|
|
|
|
|
stat, stat_dict = _mgc_stat(x, y) |
|
stat_mgc_map = stat_dict["stat_mgc_map"] |
|
opt_scale = stat_dict["opt_scale"] |
|
|
|
|
|
pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers, |
|
random_state=random_state) |
|
|
|
|
|
mgc_dict = {"mgc_map": stat_mgc_map, |
|
"opt_scale": opt_scale, |
|
"null_dist": null_dist} |
|
|
|
|
|
res = MGCResult(stat, pvalue, mgc_dict) |
|
    res.stat = stat  # keep the old attribute name for backwards compatibility
|
return res |
|
|
|
|
|
def _mgc_stat(distx, disty): |
|
r"""Helper function that calculates the MGC stat. See above for use. |
|
|
|
Parameters |
|
---------- |
|
    distx, disty : ndarray
        `distx` and `disty` have shapes ``(n, p)`` and ``(n, q)``, or
        ``(n, n)`` and ``(n, n)`` if they are distance matrices.
|
|
|
Returns |
|
------- |
|
stat : float |
|
The sample MGC test statistic within ``[-1, 1]``. |
|
stat_dict : dict |
|
        Contains additional useful returns containing the following
        keys:
|
|
|
- stat_mgc_map : ndarray |
|
MGC-map of the statistics. |
|
- opt_scale : (float, float) |
|
The estimated optimal scale as a ``(x, y)`` pair. |
|
|
|
""" |
|
|
|
stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc') |
|
|
|
n, m = stat_mgc_map.shape |
|
    if m == 1 or n == 1:
        # there is no range of local scales to search over, so default to the
        # global scale, i.e. the correlation at the maximal neighborhood size
        stat = stat_mgc_map[m - 1][n - 1]
|
opt_scale = m * n |
|
else: |
|
        samp_size = len(distx) - 1

        # find a connected region of significant local correlations
        sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size)
|
|
|
|
|
        # take the smoothed maximal statistic (and its scale) in that region
        stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map)
|
|
|
stat_dict = {"stat_mgc_map": stat_mgc_map, |
|
"opt_scale": opt_scale} |
|
|
|
return stat, stat_dict |
|
|
|
|
|
def _threshold_mgc_map(stat_mgc_map, samp_size): |
|
r""" |
|
Finds a connected region of significance in the MGC-map by thresholding. |
|
|
|
Parameters |
|
---------- |
|
stat_mgc_map : ndarray |
|
All local correlations within ``[-1,1]``. |
|
samp_size : int |
|
The sample size of original data. |
|
|
|
Returns |
|
------- |
|
sig_connect : ndarray |
|
A binary matrix with 1's indicating the significant region. |
|
|
|
""" |
|
m, n = stat_mgc_map.shape |
|
|
|
|
|
|
|
|
|
    # 0.02 is an empirical significance level; the threshold is derived from
    # a beta approximation to the null distribution of the local correlations
    per_sig = 1 - (0.02 / samp_size)  # percentile regarded as significant
    threshold = samp_size * (samp_size - 3)/4 - 1/2  # beta shape parameter
    threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1
|
|
|
|
|
|
|
    # the threshold must be at least as large as the global correlation
    threshold = max(threshold, stat_mgc_map[m - 1][n - 1])
|
|
|
|
|
    # find the largest connected component of significant local correlations
    sig_connect = stat_mgc_map > threshold
    if np.sum(sig_connect) > 0:
        sig_connect, _ = _measurements.label(sig_connect)
        _, label_counts = np.unique(sig_connect, return_counts=True)

        # skip label 0, which counts the insignificant (background) entries
        max_label = np.argmax(label_counts[1:]) + 1
        sig_connect = sig_connect == max_label
|
else: |
|
sig_connect = np.array([[False]]) |
|
|
|
return sig_connect |
|
|
|
|
|
def _smooth_mgc_map(sig_connect, stat_mgc_map): |
|
"""Finds the smoothed maximal within the significant region R. |
|
|
|
If area of R is too small it returns the last local correlation. Otherwise, |
|
returns the maximum within significant_connected_region. |
|
|
|
Parameters |
|
---------- |
|
sig_connect : ndarray |
|
A binary matrix with 1's indicating the significant region. |
|
stat_mgc_map : ndarray |
|
All local correlations within ``[-1, 1]``. |
|
|
|
Returns |
|
------- |
|
stat : float |
|
The sample MGC statistic within ``[-1, 1]``. |
|
opt_scale: (float, float) |
|
The estimated optimal scale as an ``(x, y)`` pair. |
|
|
|
""" |
|
m, n = stat_mgc_map.shape |
|
|
|
|
|
|
|
    # default to the global scale, i.e. the local correlation at the maximal
    # neighborhood size
    stat = stat_mgc_map[m - 1][n - 1]
    opt_scale = [m, n]
|
|
|
    if np.linalg.norm(sig_connect) != 0:
        # proceed only when the significant region is sufficiently large;
        # 0.02 is an empirical threshold on its relative area
        if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n):
            max_corr = max(stat_mgc_map[sig_connect])
|
|
|
|
|
|
|
            # find the scale(s) at which the maximum correlation occurs
            max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect)
|
|
|
if max_corr >= stat: |
|
stat = max_corr |
|
|
|
                k, l = max_corr_index
                one_d_indices = k * n + l  # 2D to 1D indices
                k = np.max(one_d_indices) // n
                l = np.max(one_d_indices) % n
                opt_scale = [k+1, l+1]  # scales are reported 1-indexed
|
|
|
return stat, opt_scale |
|
|
|
|
|
def _two_sample_transform(u, v): |
|
"""Helper function that concatenates x and y for two sample MGC stat. |
|
|
|
See above for use. |
|
|
|
Parameters |
|
---------- |
|
u, v : ndarray |
|
`u` and `v` have shapes ``(n, p)`` and ``(m, p)``. |
|
|
|
Returns |
|
------- |
|
x : ndarray |
|
        Concatenation of `u` and `v` along ``axis=0``. `x` thus has shape
        ``(n + m, p)``.
    y : ndarray
        Label matrix for `x` where 0 refers to samples that come from `u` and
        1 refers to samples that come from `v`. `y` thus has shape
        ``(n + m, 1)``.
|
|
|
""" |
|
nx = u.shape[0] |
|
ny = v.shape[0] |
|
x = np.concatenate([u, v], axis=0) |
|
y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1) |
|
return x, y |
|
|