|
""" |
|
An extension of scipy.stats._stats_py to support masked arrays |
|
|
|
""" |
|
|
|
|
|
|
|
__all__ = ['argstoarray', |
|
'count_tied_groups', |
|
'describe', |
|
'f_oneway', 'find_repeats','friedmanchisquare', |
|
'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis', |
|
'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest', |
|
'ks_1samp', 'kstest', |
|
'linregress', |
|
'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign', |
|
'normaltest', |
|
'obrientransform', |
|
'pearsonr','plotting_positions','pointbiserialr', |
|
'rankdata', |
|
'scoreatpercentile','sem', |
|
'sen_seasonal_slopes','skew','skewtest','spearmanr', |
|
'siegelslopes', 'theilslopes', |
|
'tmax','tmean','tmin','trim','trimboth', |
|
'trimtail','trima','trimr','trimmed_mean','trimmed_std', |
|
'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp', |
|
'ttest_ind','ttest_rel','tvar', |
|
'variation', |
|
'winsorize', |
|
'brunnermunzel', |
|
] |
|
|
|
import numpy as np |
|
from numpy import ndarray |
|
import numpy.ma as ma |
|
from numpy.ma import masked, nomask |
|
import math |
|
|
|
import itertools |
|
import warnings |
|
from collections import namedtuple |
|
|
|
from . import distributions |
|
from scipy._lib._util import _rename_parameter, _contains_nan |
|
from scipy._lib._bunch import _make_tuple_bunch |
|
import scipy.special as special |
|
import scipy.stats._stats_py |
|
import scipy.stats._stats_py as _stats_py |
|
|
|
from ._stats_mstats_common import ( |
|
_find_repeats, |
|
theilslopes as stats_theilslopes, |
|
siegelslopes as stats_siegelslopes |
|
) |
|
|
|
|
|
def _chk_asarray(a, axis): |
|
|
|
a = ma.asanyarray(a) |
|
if axis is None: |
|
a = ma.ravel(a) |
|
outaxis = 0 |
|
else: |
|
outaxis = axis |
|
return a, outaxis |
|
|
|
|
|
def _chk2_asarray(a, b, axis): |
|
a = ma.asanyarray(a) |
|
b = ma.asanyarray(b) |
|
if axis is None: |
|
a = ma.ravel(a) |
|
b = ma.ravel(b) |
|
outaxis = 0 |
|
else: |
|
outaxis = axis |
|
return a, b, outaxis |
|
|
|
|
|
def _chk_size(a, b): |
|
a = ma.asanyarray(a) |
|
b = ma.asanyarray(b) |
|
(na, nb) = (a.size, b.size) |
|
if na != nb: |
|
raise ValueError("The size of the input array should match!" |
|
f" ({na} <> {nb})") |
|
return (a, b, na) |
|
|
|
|
|
def _ttest_finish(df, t, alternative): |
|
"""Common code between all 3 t-test functions.""" |
|
|
|
|
|
if alternative == 'less': |
|
pval = special._ufuncs.stdtr(df, t) |
|
elif alternative == 'greater': |
|
pval = special._ufuncs.stdtr(df, -t) |
|
elif alternative == 'two-sided': |
|
pval = special._ufuncs.stdtr(df, -np.abs(t))*2 |
|
else: |
|
raise ValueError("alternative must be " |
|
"'less', 'greater' or 'two-sided'") |
|
|
|
if t.ndim == 0: |
|
t = t[()] |
|
if pval.ndim == 0: |
|
pval = pval[()] |
|
|
|
return t, pval |
|
|
|
|
|
def argstoarray(*args): |
|
""" |
|
Constructs a 2D array from a group of sequences. |
|
|
|
Sequences are filled with missing values to match the length of the longest |
|
sequence. |
|
|
|
Parameters |
|
---------- |
|
*args : sequences |
|
Group of sequences. |
|
|
|
Returns |
|
------- |
|
argstoarray : MaskedArray |
|
A ( `m` x `n` ) masked array, where `m` is the number of arguments and |
|
`n` the length of the longest argument. |
|
|
|
Notes |
|
----- |
|
`numpy.ma.vstack` has identical behavior, but is called with a sequence |
|
of sequences. |
|
|
|
Examples |
|
-------- |
|
A 2D masked array constructed from a group of sequences is returned. |
|
|
|
>>> from scipy.stats.mstats import argstoarray |
|
>>> argstoarray([1, 2, 3], [4, 5, 6]) |
|
masked_array( |
|
data=[[1.0, 2.0, 3.0], |
|
[4.0, 5.0, 6.0]], |
|
mask=[[False, False, False], |
|
[False, False, False]], |
|
fill_value=1e+20) |
|
|
|
The returned masked array filled with missing values when the lengths of |
|
sequences are different. |
|
|
|
>>> argstoarray([1, 3], [4, 5, 6]) |
|
masked_array( |
|
data=[[1.0, 3.0, --], |
|
[4.0, 5.0, 6.0]], |
|
mask=[[False, False, True], |
|
[False, False, False]], |
|
fill_value=1e+20) |
|
|
|
""" |
|
if len(args) == 1 and not isinstance(args[0], ndarray): |
|
output = ma.asarray(args[0]) |
|
if output.ndim != 2: |
|
raise ValueError("The input should be 2D") |
|
else: |
|
n = len(args) |
|
m = max([len(k) for k in args]) |
|
output = ma.array(np.empty((n,m), dtype=float), mask=True) |
|
for (k,v) in enumerate(args): |
|
output[k,:len(v)] = v |
|
|
|
output[np.logical_not(np.isfinite(output._data))] = masked |
|
return output |
|
|
|
|
|
def find_repeats(arr): |
|
"""Find repeats in arr and return a tuple (repeats, repeat_count). |
|
|
|
The input is cast to float64. Masked values are discarded. |
|
|
|
Parameters |
|
---------- |
|
arr : sequence |
|
Input array. The array is flattened if it is not 1D. |
|
|
|
Returns |
|
------- |
|
repeats : ndarray |
|
Array of repeated values. |
|
counts : ndarray |
|
Array of counts. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.stats import mstats |
|
>>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5]) |
|
(array([2.]), array([4])) |
|
|
|
In the above example, 2 repeats 4 times. |
|
|
|
>>> mstats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]]) |
|
(array([4., 5.]), array([2, 2])) |
|
|
|
In the above example, both 4 and 5 repeat 2 times. |
|
|
|
""" |
|
|
|
|
|
compr = np.asarray(ma.compressed(arr), dtype=np.float64) |
|
try: |
|
need_copy = np.may_share_memory(compr, arr) |
|
except AttributeError: |
|
|
|
|
|
need_copy = False |
|
if need_copy: |
|
compr = compr.copy() |
|
return _find_repeats(compr) |
|
|
|
|
|
def count_tied_groups(x, use_missing=False): |
|
""" |
|
Counts the number of tied values. |
|
|
|
Parameters |
|
---------- |
|
x : sequence |
|
Sequence of data on which to counts the ties |
|
use_missing : bool, optional |
|
Whether to consider missing values as tied. |
|
|
|
Returns |
|
------- |
|
count_tied_groups : dict |
|
Returns a dictionary (nb of ties: nb of groups). |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.stats import mstats |
|
>>> import numpy as np |
|
>>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6] |
|
>>> mstats.count_tied_groups(z) |
|
{2: 1, 3: 2} |
|
|
|
In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x). |
|
|
|
>>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6]) |
|
>>> mstats.count_tied_groups(z) |
|
{2: 2, 3: 1} |
|
>>> z[[1,-1]] = np.ma.masked |
|
>>> mstats.count_tied_groups(z, use_missing=True) |
|
{2: 2, 3: 1} |
|
|
|
""" |
|
nmasked = ma.getmask(x).sum() |
|
|
|
data = ma.compressed(x).copy() |
|
(ties, counts) = find_repeats(data) |
|
nties = {} |
|
if len(ties): |
|
nties = dict(zip(np.unique(counts), itertools.repeat(1))) |
|
nties.update(dict(zip(*find_repeats(counts)))) |
|
|
|
if nmasked and use_missing: |
|
try: |
|
nties[nmasked] += 1 |
|
except KeyError: |
|
nties[nmasked] = 1 |
|
|
|
return nties |
|
|
|
|
|
def rankdata(data, axis=None, use_missing=False): |
|
"""Returns the rank (also known as order statistics) of each data point |
|
along the given axis. |
|
|
|
If some values are tied, their rank is averaged. |
|
If some values are masked, their rank is set to 0 if use_missing is False, |
|
or set to the average rank of the unmasked values if use_missing is True. |
|
|
|
Parameters |
|
---------- |
|
data : sequence |
|
Input data. The data is transformed to a masked array |
|
axis : {None,int}, optional |
|
Axis along which to perform the ranking. |
|
If None, the array is first flattened. An exception is raised if |
|
the axis is specified for arrays with a dimension larger than 2 |
|
use_missing : bool, optional |
|
Whether the masked values have a rank of 0 (False) or equal to the |
|
average rank of the unmasked values (True). |
|
|
|
""" |
|
def _rank1d(data, use_missing=False): |
|
n = data.count() |
|
rk = np.empty(data.size, dtype=float) |
|
idx = data.argsort() |
|
rk[idx[:n]] = np.arange(1,n+1) |
|
|
|
if use_missing: |
|
rk[idx[n:]] = (n+1)/2. |
|
else: |
|
rk[idx[n:]] = 0 |
|
|
|
repeats = find_repeats(data.copy()) |
|
for r in repeats[0]: |
|
condition = (data == r).filled(False) |
|
rk[condition] = rk[condition].mean() |
|
return rk |
|
|
|
data = ma.array(data, copy=False) |
|
if axis is None: |
|
if data.ndim > 1: |
|
return _rank1d(data.ravel(), use_missing).reshape(data.shape) |
|
else: |
|
return _rank1d(data, use_missing) |
|
else: |
|
return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray) |
|
|
|
|
|
ModeResult = namedtuple('ModeResult', ('mode', 'count')) |
|
|
|
|
|
def mode(a, axis=0): |
|
""" |
|
Returns an array of the modal (most common) value in the passed array. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
n-dimensional array of which to find mode(s). |
|
axis : int or None, optional |
|
Axis along which to operate. Default is 0. If None, compute over |
|
the whole array `a`. |
|
|
|
Returns |
|
------- |
|
mode : ndarray |
|
Array of modal values. |
|
count : ndarray |
|
Array of counts for each mode. |
|
|
|
Notes |
|
----- |
|
For more details, see `scipy.stats.mode`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy import stats |
|
>>> from scipy.stats import mstats |
|
>>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0]) |
|
>>> mstats.mode(m_arr) # note that most zeros are masked |
|
ModeResult(mode=array([1.]), count=array([2.])) |
|
|
|
""" |
|
return _mode(a, axis=axis, keepdims=True) |
|
|
|
|
|
def _mode(a, axis=0, keepdims=True): |
|
|
|
a, axis = _chk_asarray(a, axis) |
|
|
|
def _mode1D(a): |
|
(rep,cnt) = find_repeats(a) |
|
if not cnt.ndim: |
|
return (0, 0) |
|
elif cnt.size: |
|
return (rep[cnt.argmax()], cnt.max()) |
|
else: |
|
return (a.min(), 1) |
|
|
|
if axis is None: |
|
output = _mode1D(ma.ravel(a)) |
|
output = (ma.array(output[0]), ma.array(output[1])) |
|
else: |
|
output = ma.apply_along_axis(_mode1D, axis, a) |
|
if keepdims is None or keepdims: |
|
newshape = list(a.shape) |
|
newshape[axis] = 1 |
|
slices = [slice(None)] * output.ndim |
|
slices[axis] = 0 |
|
modes = output[tuple(slices)].reshape(newshape) |
|
slices[axis] = 1 |
|
counts = output[tuple(slices)].reshape(newshape) |
|
output = (modes, counts) |
|
else: |
|
output = np.moveaxis(output, axis, 0) |
|
|
|
return ModeResult(*output) |
|
|
|
|
|
def _betai(a, b, x): |
|
x = np.asanyarray(x) |
|
x = ma.where(x < 1.0, x, 1.0) |
|
return special.betainc(a, b, x) |
|
|
|
|
|
def msign(x): |
|
"""Returns the sign of x, or 0 if x is masked.""" |
|
return ma.filled(np.sign(x), 0) |
|
|
|
|
|
def pearsonr(x, y): |
|
r""" |
|
Pearson correlation coefficient and p-value for testing non-correlation. |
|
|
|
The Pearson correlation coefficient [1]_ measures the linear relationship |
|
between two datasets. The calculation of the p-value relies on the |
|
assumption that each dataset is normally distributed. (See Kowalski [3]_ |
|
for a discussion of the effects of non-normality of the input on the |
|
distribution of the correlation coefficient.) Like other correlation |
|
coefficients, this one varies between -1 and +1 with 0 implying no |
|
correlation. Correlations of -1 or +1 imply an exact linear relationship. |
|
|
|
Parameters |
|
---------- |
|
x : (N,) array_like |
|
Input array. |
|
y : (N,) array_like |
|
Input array. |
|
|
|
Returns |
|
------- |
|
r : float |
|
Pearson's correlation coefficient. |
|
p-value : float |
|
Two-tailed p-value. |
|
|
|
Warns |
|
----- |
|
`~scipy.stats.ConstantInputWarning` |
|
Raised if an input is a constant array. The correlation coefficient |
|
is not defined in this case, so ``np.nan`` is returned. |
|
|
|
`~scipy.stats.NearConstantInputWarning` |
|
Raised if an input is "nearly" constant. The array ``x`` is considered |
|
nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``. |
|
Numerical errors in the calculation ``x - mean(x)`` in this case might |
|
result in an inaccurate calculation of r. |
|
|
|
See Also |
|
-------- |
|
spearmanr : Spearman rank-order correlation coefficient. |
|
kendalltau : Kendall's tau, a correlation measure for ordinal data. |
|
|
|
Notes |
|
----- |
|
The correlation coefficient is calculated as follows: |
|
|
|
.. math:: |
|
|
|
r = \frac{\sum (x - m_x) (y - m_y)} |
|
{\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}} |
|
|
|
where :math:`m_x` is the mean of the vector x and :math:`m_y` is |
|
the mean of the vector y. |
|
|
|
Under the assumption that x and y are drawn from |
|
independent normal distributions (so the population correlation coefficient |
|
is 0), the probability density function of the sample correlation |
|
coefficient r is ([1]_, [2]_): |
|
|
|
.. math:: |
|
|
|
f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)} |
|
|
|
where n is the number of samples, and B is the beta function. This |
|
is sometimes referred to as the exact distribution of r. This is |
|
the distribution that is used in `pearsonr` to compute the p-value. |
|
The distribution is a beta distribution on the interval [-1, 1], |
|
with equal shape parameters a = b = n/2 - 1. In terms of SciPy's |
|
implementation of the beta distribution, the distribution of r is:: |
|
|
|
dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2) |
|
|
|
The p-value returned by `pearsonr` is a two-sided p-value. The p-value |
|
roughly indicates the probability of an uncorrelated system |
|
producing datasets that have a Pearson correlation at least as extreme |
|
as the one computed from these datasets. More precisely, for a |
|
given sample with correlation coefficient r, the p-value is |
|
the probability that abs(r') of a random sample x' and y' drawn from |
|
the population with zero correlation would be greater than or equal |
|
to abs(r). In terms of the object ``dist`` shown above, the p-value |
|
for a given r and length n can be computed as:: |
|
|
|
p = 2*dist.cdf(-abs(r)) |
|
|
|
When n is 2, the above continuous distribution is not well-defined. |
|
One can interpret the limit of the beta distribution as the shape |
|
parameters a and b approach a = b = 0 as a discrete distribution with |
|
equal probability masses at r = 1 and r = -1. More directly, one |
|
can observe that, given the data x = [x1, x2] and y = [y1, y2], and |
|
assuming x1 != x2 and y1 != y2, the only possible values for r are 1 |
|
and -1. Because abs(r') for any sample x' and y' with length 2 will |
|
be 1, the two-sided p-value for a sample of length 2 is always 1. |
|
|
|
References |
|
---------- |
|
.. [1] "Pearson correlation coefficient", Wikipedia, |
|
https://en.wikipedia.org/wiki/Pearson_correlation_coefficient |
|
.. [2] Student, "Probable error of a correlation coefficient", |
|
Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310. |
|
.. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution |
|
of the Sample Product-Moment Correlation Coefficient" |
|
Journal of the Royal Statistical Society. Series C (Applied |
|
Statistics), Vol. 21, No. 1 (1972), pp. 1-12. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy import stats |
|
>>> from scipy.stats import mstats |
|
>>> mstats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4]) |
|
(-0.7426106572325057, 0.1505558088534455) |
|
|
|
There is a linear dependence between x and y if y = a + b*x + e, where |
|
a,b are constants and e is a random error term, assumed to be independent |
|
of x. For simplicity, assume that x is standard normal, a=0, b=1 and let |
|
e follow a normal distribution with mean zero and standard deviation s>0. |
|
|
|
>>> s = 0.5 |
|
>>> x = stats.norm.rvs(size=500) |
|
>>> e = stats.norm.rvs(scale=s, size=500) |
|
>>> y = x + e |
|
>>> mstats.pearsonr(x, y) |
|
(0.9029601878969703, 8.428978827629898e-185) # may vary |
|
|
|
This should be close to the exact value given by |
|
|
|
>>> 1/np.sqrt(1 + s**2) |
|
0.8944271909999159 |
|
|
|
For s=0.5, we observe a high level of correlation. In general, a large |
|
variance of the noise reduces the correlation, while the correlation |
|
approaches one as the variance of the error goes to zero. |
|
|
|
It is important to keep in mind that no correlation does not imply |
|
independence unless (x, y) is jointly normal. Correlation can even be zero |
|
when there is a very simple dependence structure: if X follows a |
|
standard normal distribution, let y = abs(x). Note that the correlation |
|
between x and y is zero. Indeed, since the expectation of x is zero, |
|
cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero |
|
by symmetry. The following lines of code illustrate this observation: |
|
|
|
>>> y = np.abs(x) |
|
>>> mstats.pearsonr(x, y) |
|
(-0.016172891856853524, 0.7182823678751942) # may vary |
|
|
|
A non-zero correlation coefficient can be misleading. For example, if X has |
|
a standard normal distribution, define y = x if x < 0 and y = 0 otherwise. |
|
A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797..., |
|
implying a high level of correlation: |
|
|
|
>>> y = np.where(x < 0, x, 0) |
|
>>> mstats.pearsonr(x, y) |
|
(0.8537091583771509, 3.183461621422181e-143) # may vary |
|
|
|
This is unintuitive since there is no dependence of x and y if x is larger |
|
than zero which happens in about half of the cases if we sample x and y. |
|
""" |
|
(x, y, n) = _chk_size(x, y) |
|
(x, y) = (x.ravel(), y.ravel()) |
|
|
|
m = ma.mask_or(ma.getmask(x), ma.getmask(y)) |
|
n -= m.sum() |
|
df = n-2 |
|
if df < 0: |
|
return (masked, masked) |
|
|
|
return scipy.stats._stats_py.pearsonr( |
|
ma.masked_array(x, mask=m).compressed(), |
|
ma.masked_array(y, mask=m).compressed()) |
|
|
|
|
|
def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate', |
|
alternative='two-sided'): |
|
""" |
|
Calculates a Spearman rank-order correlation coefficient and the p-value |
|
to test for non-correlation. |
|
|
|
The Spearman correlation is a nonparametric measure of the linear |
|
relationship between two datasets. Unlike the Pearson correlation, the |
|
Spearman correlation does not assume that both datasets are normally |
|
distributed. Like other correlation coefficients, this one varies |
|
between -1 and +1 with 0 implying no correlation. Correlations of -1 or |
|
+1 imply a monotonic relationship. Positive correlations imply that |
|
as `x` increases, so does `y`. Negative correlations imply that as `x` |
|
increases, `y` decreases. |
|
|
|
Missing values are discarded pair-wise: if a value is missing in `x`, the |
|
corresponding value in `y` is masked. |
|
|
|
The p-value roughly indicates the probability of an uncorrelated system |
|
producing datasets that have a Spearman correlation at least as extreme |
|
as the one computed from these datasets. The p-values are not entirely |
|
reliable but are probably reasonable for datasets larger than 500 or so. |
|
|
|
Parameters |
|
---------- |
|
x, y : 1D or 2D array_like, y is optional |
|
One or two 1-D or 2-D arrays containing multiple variables and |
|
observations. When these are 1-D, each represents a vector of |
|
observations of a single variable. For the behavior in the 2-D case, |
|
see under ``axis``, below. |
|
use_ties : bool, optional |
|
DO NOT USE. Does not do anything, keyword is only left in place for |
|
backwards compatibility reasons. |
|
axis : int or None, optional |
|
If axis=0 (default), then each column represents a variable, with |
|
observations in the rows. If axis=1, the relationship is transposed: |
|
each row represents a variable, while the columns contain observations. |
|
If axis=None, then both arrays will be raveled. |
|
nan_policy : {'propagate', 'raise', 'omit'}, optional |
|
Defines how to handle when input contains nan. 'propagate' returns nan, |
|
'raise' throws an error, 'omit' performs the calculations ignoring nan |
|
values. Default is 'propagate'. |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the alternative hypothesis. Default is 'two-sided'. |
|
The following options are available: |
|
|
|
* 'two-sided': the correlation is nonzero |
|
* 'less': the correlation is negative (less than zero) |
|
* 'greater': the correlation is positive (greater than zero) |
|
|
|
.. versionadded:: 1.7.0 |
|
|
|
Returns |
|
------- |
|
res : SignificanceResult |
|
An object containing attributes: |
|
|
|
statistic : float or ndarray (2-D square) |
|
Spearman correlation matrix or correlation coefficient (if only 2 |
|
variables are given as parameters). Correlation matrix is square |
|
with length equal to total number of variables (columns or rows) in |
|
``a`` and ``b`` combined. |
|
pvalue : float |
|
The p-value for a hypothesis test whose null hypothesis |
|
is that two sets of data are linearly uncorrelated. See |
|
`alternative` above for alternative hypotheses. `pvalue` has the |
|
same shape as `statistic`. |
|
|
|
References |
|
---------- |
|
[CRCProbStat2000] section 14.7 |
|
|
|
""" |
|
if not use_ties: |
|
raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0") |
|
|
|
|
|
x, axisout = _chk_asarray(x, axis) |
|
if y is not None: |
|
|
|
y, _ = _chk_asarray(y, axis) |
|
if axisout == 0: |
|
x = ma.column_stack((x, y)) |
|
else: |
|
x = ma.vstack((x, y)) |
|
|
|
if axisout == 1: |
|
|
|
x = x.T |
|
|
|
if nan_policy == 'omit': |
|
x = ma.masked_invalid(x) |
|
|
|
def _spearmanr_2cols(x): |
|
|
|
|
|
x = ma.mask_rowcols(x, axis=0) |
|
x = x[~x.mask.any(axis=1), :] |
|
|
|
|
|
if not np.any(x.data): |
|
res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan) |
|
res.correlation = np.nan |
|
return res |
|
|
|
m = ma.getmask(x) |
|
n_obs = x.shape[0] |
|
dof = n_obs - 2 - int(m.sum(axis=0)[0]) |
|
if dof < 0: |
|
raise ValueError("The input must have at least 3 entries!") |
|
|
|
|
|
x_ranked = rankdata(x, axis=0) |
|
rs = ma.corrcoef(x_ranked, rowvar=False).data |
|
|
|
|
|
with np.errstate(divide='ignore'): |
|
|
|
|
|
t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0)) |
|
|
|
t, prob = _ttest_finish(dof, t, alternative) |
|
|
|
|
|
if rs.shape == (2, 2): |
|
res = scipy.stats._stats_py.SignificanceResult(rs[1, 0], |
|
prob[1, 0]) |
|
res.correlation = rs[1, 0] |
|
return res |
|
else: |
|
res = scipy.stats._stats_py.SignificanceResult(rs, prob) |
|
res.correlation = rs |
|
return res |
|
|
|
|
|
|
|
n_vars = x.shape[1] |
|
if n_vars == 2: |
|
return _spearmanr_2cols(x) |
|
else: |
|
rs = np.ones((n_vars, n_vars), dtype=float) |
|
prob = np.zeros((n_vars, n_vars), dtype=float) |
|
for var1 in range(n_vars - 1): |
|
for var2 in range(var1+1, n_vars): |
|
result = _spearmanr_2cols(x[:, [var1, var2]]) |
|
rs[var1, var2] = result.correlation |
|
rs[var2, var1] = result.correlation |
|
prob[var1, var2] = result.pvalue |
|
prob[var2, var1] = result.pvalue |
|
|
|
res = scipy.stats._stats_py.SignificanceResult(rs, prob) |
|
res.correlation = rs |
|
return res |
|
|
|
|
|
def _kendall_p_exact(n, c, alternative='two-sided'): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
in_right_tail = (c >= (n*(n-1))//2 - c) |
|
alternative_greater = (alternative == 'greater') |
|
c = int(min(c, (n*(n-1))//2 - c)) |
|
|
|
|
|
|
|
if n <= 0: |
|
raise ValueError(f'n ({n}) must be positive') |
|
elif c < 0 or 4*c > n*(n-1): |
|
raise ValueError(f'c ({c}) must satisfy 0 <= 4c <= n(n-1) = {n*(n-1)}.') |
|
elif n == 1: |
|
prob = 1.0 |
|
p_mass_at_c = 1 |
|
elif n == 2: |
|
prob = 1.0 |
|
p_mass_at_c = 0.5 |
|
elif c == 0: |
|
prob = 2.0/math.factorial(n) if n < 171 else 0.0 |
|
p_mass_at_c = prob/2 |
|
elif c == 1: |
|
prob = 2.0/math.factorial(n-1) if n < 172 else 0.0 |
|
p_mass_at_c = (n-1)/math.factorial(n) |
|
elif 4*c == n*(n-1) and alternative == 'two-sided': |
|
|
|
|
|
prob = 1.0 |
|
elif n < 171: |
|
new = np.zeros(c+1) |
|
new[0:2] = 1.0 |
|
for j in range(3,n+1): |
|
new = np.cumsum(new) |
|
if j <= c: |
|
new[j:] -= new[:c+1-j] |
|
prob = 2.0*np.sum(new)/math.factorial(n) |
|
p_mass_at_c = new[-1]/math.factorial(n) |
|
else: |
|
new = np.zeros(c+1) |
|
new[0:2] = 1.0 |
|
for j in range(3, n+1): |
|
new = np.cumsum(new)/j |
|
if j <= c: |
|
new[j:] -= new[:c+1-j] |
|
prob = np.sum(new) |
|
p_mass_at_c = new[-1]/2 |
|
|
|
if alternative != 'two-sided': |
|
|
|
|
|
if in_right_tail == alternative_greater: |
|
prob /= 2 |
|
else: |
|
prob = 1 - prob/2 + p_mass_at_c |
|
|
|
prob = np.clip(prob, 0, 1) |
|
|
|
return prob |
|
|
|
|
|
def kendalltau(x, y, use_ties=True, use_missing=False, method='auto', |
|
alternative='two-sided'): |
|
""" |
|
Computes Kendall's rank correlation tau on two variables *x* and *y*. |
|
|
|
Parameters |
|
---------- |
|
x : sequence |
|
First data list (for example, time). |
|
y : sequence |
|
Second data list. |
|
use_ties : {True, False}, optional |
|
Whether ties correction should be performed. |
|
use_missing : {False, True}, optional |
|
Whether missing data should be allocated a rank of 0 (False) or the |
|
average rank (True) |
|
method : {'auto', 'asymptotic', 'exact'}, optional |
|
Defines which method is used to calculate the p-value [1]_. |
|
'asymptotic' uses a normal approximation valid for large samples. |
|
'exact' computes the exact p-value, but can only be used if no ties |
|
are present. As the sample size increases, the 'exact' computation |
|
time may grow and the result may lose some precision. |
|
'auto' is the default and selects the appropriate |
|
method based on a trade-off between speed and accuracy. |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the alternative hypothesis. Default is 'two-sided'. |
|
The following options are available: |
|
|
|
* 'two-sided': the rank correlation is nonzero |
|
* 'less': the rank correlation is negative (less than zero) |
|
* 'greater': the rank correlation is positive (greater than zero) |
|
|
|
Returns |
|
------- |
|
res : SignificanceResult |
|
An object containing attributes: |
|
|
|
statistic : float |
|
The tau statistic. |
|
pvalue : float |
|
The p-value for a hypothesis test whose null hypothesis is |
|
an absence of association, tau = 0. |
|
|
|
References |
|
---------- |
|
.. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition), |
|
Charles Griffin & Co., 1970. |
|
|
|
""" |
|
(x, y, n) = _chk_size(x, y) |
|
(x, y) = (x.flatten(), y.flatten()) |
|
m = ma.mask_or(ma.getmask(x), ma.getmask(y)) |
|
if m is not nomask: |
|
x = ma.array(x, mask=m, copy=True) |
|
y = ma.array(y, mask=m, copy=True) |
|
|
|
|
|
|
|
n -= int(m.sum()) |
|
|
|
if n < 2: |
|
res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan) |
|
res.correlation = np.nan |
|
return res |
|
|
|
rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0) |
|
ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0) |
|
idx = rx.argsort() |
|
(rx, ry) = (rx[idx], ry[idx]) |
|
C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum() |
|
for i in range(len(ry)-1)], dtype=float) |
|
D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum() |
|
for i in range(len(ry)-1)], dtype=float) |
|
xties = count_tied_groups(x) |
|
yties = count_tied_groups(y) |
|
if use_ties: |
|
corr_x = np.sum([v*k*(k-1) for (k,v) in xties.items()], dtype=float) |
|
corr_y = np.sum([v*k*(k-1) for (k,v) in yties.items()], dtype=float) |
|
denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.) |
|
else: |
|
denom = n*(n-1)/2. |
|
tau = (C-D) / denom |
|
|
|
if method == 'exact' and (xties or yties): |
|
raise ValueError("Ties found, exact method cannot be used.") |
|
|
|
if method == 'auto': |
|
if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1): |
|
method = 'exact' |
|
else: |
|
method = 'asymptotic' |
|
|
|
if not xties and not yties and method == 'exact': |
|
prob = _kendall_p_exact(n, C, alternative) |
|
|
|
elif method == 'asymptotic': |
|
var_s = n*(n-1)*(2*n+5) |
|
if use_ties: |
|
var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in xties.items()]) |
|
var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in yties.items()]) |
|
v1 = (np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) * |
|
np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float)) |
|
v1 /= 2.*n*(n-1) |
|
if n > 2: |
|
v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in xties.items()], |
|
dtype=float) * \ |
|
np.sum([v*k*(k-1)*(k-2) for (k,v) in yties.items()], |
|
dtype=float) |
|
v2 /= 9.*n*(n-1)*(n-2) |
|
else: |
|
v2 = 0 |
|
else: |
|
v1 = v2 = 0 |
|
|
|
var_s /= 18. |
|
var_s += (v1 + v2) |
|
z = (C-D)/np.sqrt(var_s) |
|
prob = scipy.stats._stats_py._get_pvalue(z, distributions.norm, alternative) |
|
else: |
|
raise ValueError("Unknown method "+str(method)+" specified, please " |
|
"use auto, exact or asymptotic.") |
|
|
|
res = scipy.stats._stats_py.SignificanceResult(tau[()], prob[()]) |
|
res.correlation = tau |
|
return res |
|
|
|
|
|
def kendalltau_seasonal(x): |
|
""" |
|
Computes a multivariate Kendall's rank correlation tau, for seasonal data. |
|
|
|
Parameters |
|
---------- |
|
x : 2-D ndarray |
|
Array of seasonal data, with seasons in columns. |
|
|
|
""" |
|
x = ma.array(x, subok=True, copy=False, ndmin=2) |
|
(n,m) = x.shape |
|
n_p = x.count(0) |
|
|
|
S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n)) |
|
S_tot = S_szn.sum() |
|
|
|
n_tot = x.count() |
|
ties = count_tied_groups(x.compressed()) |
|
corr_ties = sum(v*k*(k-1) for (k,v) in ties.items()) |
|
denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2. |
|
|
|
R = rankdata(x, axis=0, use_missing=True) |
|
K = ma.empty((m,m), dtype=int) |
|
covmat = ma.empty((m,m), dtype=float) |
|
denom_szn = ma.empty(m, dtype=float) |
|
for j in range(m): |
|
ties_j = count_tied_groups(x[:,j].compressed()) |
|
corr_j = sum(v*k*(k-1) for (k,v) in ties_j.items()) |
|
cmb = n_p[j]*(n_p[j]-1) |
|
for k in range(j,m,1): |
|
K[j,k] = sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum() |
|
for i in range(n)) |
|
covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() - |
|
n*(n_p[j]+1)*(n_p[k]+1))/3. |
|
K[k,j] = K[j,k] |
|
covmat[k,j] = covmat[j,k] |
|
|
|
denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2. |
|
|
|
var_szn = covmat.diagonal() |
|
|
|
z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn) |
|
z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum()) |
|
z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum()) |
|
|
|
prob_szn = special.erfc(abs(z_szn.data)/np.sqrt(2)) |
|
prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2)) |
|
prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2)) |
|
|
|
chi2_tot = (z_szn*z_szn).sum() |
|
chi2_trd = m * z_szn.mean()**2 |
|
output = {'seasonal tau': S_szn/denom_szn, |
|
'global tau': S_tot/denom_tot, |
|
'global tau (alt)': S_tot/denom_szn.sum(), |
|
'seasonal p-value': prob_szn, |
|
'global p-value (indep)': prob_tot_ind, |
|
'global p-value (dep)': prob_tot_dep, |
|
'chi2 total': chi2_tot, |
|
'chi2 trend': chi2_trd, |
|
} |
|
return output |
|
|
|
|
|
PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation', |
|
'pvalue')) |
|
|
|
|
|
def pointbiserialr(x, y): |
|
"""Calculates a point biserial correlation coefficient and its p-value. |
|
|
|
Parameters |
|
---------- |
|
x : array_like of bools |
|
Input array. |
|
y : array_like |
|
Input array. |
|
|
|
Returns |
|
------- |
|
correlation : float |
|
R value |
|
pvalue : float |
|
2-tailed p-value |
|
|
|
Notes |
|
----- |
|
Missing values are considered pair-wise: if a value is missing in x, |
|
the corresponding value in y is masked. |
|
|
|
For more details on `pointbiserialr`, see `scipy.stats.pointbiserialr`. |
|
|
|
""" |
|
x = ma.fix_invalid(x, copy=True).astype(bool) |
|
y = ma.fix_invalid(y, copy=True).astype(float) |
|
|
|
m = ma.mask_or(ma.getmask(x), ma.getmask(y)) |
|
if m is not nomask: |
|
unmask = np.logical_not(m) |
|
x = x[unmask] |
|
y = y[unmask] |
|
|
|
n = len(x) |
|
|
|
phat = x.sum() / float(n) |
|
y0 = y[~x] |
|
y1 = y[x] |
|
y0m = y0.mean() |
|
y1m = y1.mean() |
|
|
|
rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std() |
|
|
|
df = n-2 |
|
t = rpb*ma.sqrt(df/(1.0-rpb**2)) |
|
prob = _betai(0.5*df, 0.5, df/(df+t*t)) |
|
|
|
return PointbiserialrResult(rpb, prob) |
|
|
|
|
|
def linregress(x, y=None): |
|
r""" |
|
Calculate a linear least-squares regression for two sets of measurements. |
|
|
|
Parameters |
|
---------- |
|
x, y : array_like |
|
Two sets of measurements. Both arrays should have the same length N. If |
|
only `x` is given (and ``y=None``), then it must be a two-dimensional |
|
array where one dimension has length 2. The two sets of measurements |
|
are then found by splitting the array along the length-2 dimension. In |
|
the case where ``y=None`` and `x` is a 2xN array, ``linregress(x)`` is |
|
equivalent to ``linregress(x[0], x[1])``. |
|
|
|
Returns |
|
------- |
|
result : ``LinregressResult`` instance |
|
The return value is an object with the following attributes: |
|
|
|
slope : float |
|
Slope of the regression line. |
|
intercept : float |
|
Intercept of the regression line. |
|
rvalue : float |
|
The Pearson correlation coefficient. The square of ``rvalue`` |
|
is equal to the coefficient of determination. |
|
pvalue : float |
|
The p-value for a hypothesis test whose null hypothesis is |
|
that the slope is zero, using Wald Test with t-distribution of |
|
the test statistic. See `alternative` above for alternative |
|
hypotheses. |
|
stderr : float |
|
Standard error of the estimated slope (gradient), under the |
|
assumption of residual normality. |
|
intercept_stderr : float |
|
Standard error of the estimated intercept, under the assumption |
|
of residual normality. |
|
|
|
See Also |
|
-------- |
|
scipy.optimize.curve_fit : |
|
Use non-linear least squares to fit a function to data. |
|
scipy.optimize.leastsq : |
|
Minimize the sum of squares of a set of equations. |
|
|
|
Notes |
|
----- |
|
Missing values are considered pair-wise: if a value is missing in `x`, |
|
the corresponding value in `y` is masked. |
|
|
|
For compatibility with older versions of SciPy, the return value acts |
|
like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``, |
|
``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write:: |
|
|
|
slope, intercept, r, p, se = linregress(x, y) |
|
|
|
With that style, however, the standard error of the intercept is not |
|
available. To have access to all the computed values, including the |
|
standard error of the intercept, use the return value as an object |
|
with attributes, e.g.:: |
|
|
|
result = linregress(x, y) |
|
print(result.intercept, result.intercept_stderr) |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> import matplotlib.pyplot as plt |
|
>>> from scipy import stats |
|
>>> rng = np.random.default_rng() |
|
|
|
Generate some data: |
|
|
|
>>> x = rng.random(10) |
|
>>> y = 1.6*x + rng.random(10) |
|
|
|
Perform the linear regression: |
|
|
|
>>> res = stats.mstats.linregress(x, y) |
|
|
|
Coefficient of determination (R-squared): |
|
|
|
>>> print(f"R-squared: {res.rvalue**2:.6f}") |
|
R-squared: 0.717533 |
|
|
|
Plot the data along with the fitted line: |
|
|
|
>>> plt.plot(x, y, 'o', label='original data') |
|
>>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line') |
|
>>> plt.legend() |
|
>>> plt.show() |
|
|
|
Calculate 95% confidence interval on slope and intercept: |
|
|
|
>>> # Two-sided inverse Students t-distribution |
|
>>> # p - probability, df - degrees of freedom |
|
>>> from scipy.stats import t |
|
>>> tinv = lambda p, df: abs(t.ppf(p/2, df)) |
|
|
|
>>> ts = tinv(0.05, len(x)-2) |
|
>>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}") |
|
slope (95%): 1.453392 +/- 0.743465 |
|
>>> print(f"intercept (95%): {res.intercept:.6f}" |
|
... f" +/- {ts*res.intercept_stderr:.6f}") |
|
intercept (95%): 0.616950 +/- 0.544475 |
|
|
|
""" |
|
if y is None: |
|
x = ma.array(x) |
|
if x.shape[0] == 2: |
|
x, y = x |
|
elif x.shape[1] == 2: |
|
x, y = x.T |
|
else: |
|
raise ValueError("If only `x` is given as input, " |
|
"it has to be of shape (2, N) or (N, 2), " |
|
f"provided shape was {x.shape}") |
|
else: |
|
x = ma.array(x) |
|
y = ma.array(y) |
|
|
|
x = x.flatten() |
|
y = y.flatten() |
|
|
|
if np.amax(x) == np.amin(x) and len(x) > 1: |
|
raise ValueError("Cannot calculate a linear regression " |
|
"if all x values are identical") |
|
|
|
m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False) |
|
if m is not nomask: |
|
x = ma.array(x, mask=m) |
|
y = ma.array(y, mask=m) |
|
if np.any(~m): |
|
result = _stats_py.linregress(x.data[~m], y.data[~m]) |
|
else: |
|
|
|
result = _stats_py.LinregressResult(slope=None, intercept=None, |
|
rvalue=None, pvalue=None, |
|
stderr=None, |
|
intercept_stderr=None) |
|
else: |
|
result = _stats_py.linregress(x.data, y.data) |
|
|
|
return result |
|
|
|
|
|
def theilslopes(y, x=None, alpha=0.95, method='separate'): |
|
r""" |
|
Computes the Theil-Sen estimator for a set of points (x, y). |
|
|
|
`theilslopes` implements a method for robust linear regression. It |
|
computes the slope as the median of all slopes between paired values. |
|
|
|
Parameters |
|
---------- |
|
y : array_like |
|
Dependent variable. |
|
x : array_like or None, optional |
|
Independent variable. If None, use ``arange(len(y))`` instead. |
|
alpha : float, optional |
|
Confidence degree between 0 and 1. Default is 95% confidence. |
|
Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are |
|
interpreted as "find the 90% confidence interval". |
|
method : {'joint', 'separate'}, optional |
|
Method to be used for computing estimate for intercept. |
|
Following methods are supported, |
|
|
|
* 'joint': Uses np.median(y - slope * x) as intercept. |
|
* 'separate': Uses np.median(y) - slope * np.median(x) |
|
as intercept. |
|
|
|
The default is 'separate'. |
|
|
|
.. versionadded:: 1.8.0 |
|
|
|
Returns |
|
------- |
|
result : ``TheilslopesResult`` instance |
|
The return value is an object with the following attributes: |
|
|
|
slope : float |
|
Theil slope. |
|
intercept : float |
|
Intercept of the Theil line. |
|
low_slope : float |
|
Lower bound of the confidence interval on `slope`. |
|
high_slope : float |
|
Upper bound of the confidence interval on `slope`. |
|
|
|
See Also |
|
-------- |
|
siegelslopes : a similar technique using repeated medians |
|
|
|
|
|
Notes |
|
----- |
|
For more details on `theilslopes`, see `scipy.stats.theilslopes`. |
|
|
|
""" |
|
y = ma.asarray(y).flatten() |
|
if x is None: |
|
x = ma.arange(len(y), dtype=float) |
|
else: |
|
x = ma.asarray(x).flatten() |
|
if len(x) != len(y): |
|
raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})") |
|
|
|
m = ma.mask_or(ma.getmask(x), ma.getmask(y)) |
|
y._mask = x._mask = m |
|
|
|
y = y.compressed() |
|
x = x.compressed().astype(float) |
|
|
|
return stats_theilslopes(y, x, alpha=alpha, method=method) |
|
|
|
|
|
def siegelslopes(y, x=None, method="hierarchical"): |
|
r""" |
|
Computes the Siegel estimator for a set of points (x, y). |
|
|
|
`siegelslopes` implements a method for robust linear regression |
|
using repeated medians to fit a line to the points (x, y). |
|
The method is robust to outliers with an asymptotic breakdown point |
|
of 50%. |
|
|
|
Parameters |
|
---------- |
|
y : array_like |
|
Dependent variable. |
|
x : array_like or None, optional |
|
Independent variable. If None, use ``arange(len(y))`` instead. |
|
method : {'hierarchical', 'separate'} |
|
If 'hierarchical', estimate the intercept using the estimated |
|
slope ``slope`` (default option). |
|
If 'separate', estimate the intercept independent of the estimated |
|
slope. See Notes for details. |
|
|
|
Returns |
|
------- |
|
result : ``SiegelslopesResult`` instance |
|
The return value is an object with the following attributes: |
|
|
|
slope : float |
|
Estimate of the slope of the regression line. |
|
intercept : float |
|
Estimate of the intercept of the regression line. |
|
|
|
See Also |
|
-------- |
|
theilslopes : a similar technique without repeated medians |
|
|
|
Notes |
|
----- |
|
For more details on `siegelslopes`, see `scipy.stats.siegelslopes`. |
|
|
|
""" |
|
y = ma.asarray(y).ravel() |
|
if x is None: |
|
x = ma.arange(len(y), dtype=float) |
|
else: |
|
x = ma.asarray(x).ravel() |
|
if len(x) != len(y): |
|
raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})") |
|
|
|
m = ma.mask_or(ma.getmask(x), ma.getmask(y)) |
|
y._mask = x._mask = m |
|
|
|
y = y.compressed() |
|
x = x.compressed().astype(float) |
|
|
|
return stats_siegelslopes(y, x, method=method) |
|
|
|
|
|
SenSeasonalSlopesResult = _make_tuple_bunch('SenSeasonalSlopesResult', |
|
['intra_slope', 'inter_slope']) |
|
|
|
|
|
def sen_seasonal_slopes(x): |
|
r""" |
|
Computes seasonal Theil-Sen and Kendall slope estimators. |
|
|
|
The seasonal generalization of Sen's slope computes the slopes between all |
|
pairs of values within a "season" (column) of a 2D array. It returns an |
|
array containing the median of these "within-season" slopes for each |
|
season (the Theil-Sen slope estimator of each season), and it returns the |
|
median of the within-season slopes across all seasons (the seasonal Kendall |
|
slope estimator). |
|
|
|
Parameters |
|
---------- |
|
x : 2D array_like |
|
Each column of `x` contains measurements of the dependent variable |
|
within a season. The independent variable (usually time) of each season |
|
is assumed to be ``np.arange(x.shape[0])``. |
|
|
|
Returns |
|
------- |
|
result : ``SenSeasonalSlopesResult`` instance |
|
The return value is an object with the following attributes: |
|
|
|
intra_slope : ndarray |
|
For each season, the Theil-Sen slope estimator: the median of |
|
within-season slopes. |
|
inter_slope : float |
|
The seasonal Kendall slope estimator: the median of within-season |
|
slopes *across all* seasons. |
|
|
|
See Also |
|
-------- |
|
theilslopes : the analogous function for non-seasonal data |
|
scipy.stats.theilslopes : non-seasonal slopes for non-masked arrays |
|
|
|
Notes |
|
----- |
|
The slopes :math:`d_{ijk}` within season :math:`i` are: |
|
|
|
.. math:: |
|
|
|
d_{ijk} = \frac{x_{ij} - x_{ik}} |
|
{j - k} |
|
|
|
for pairs of distinct integer indices :math:`j, k` of :math:`x`. |
|
|
|
Element :math:`i` of the returned `intra_slope` array is the median of the |
|
:math:`d_{ijk}` over all :math:`j < k`; this is the Theil-Sen slope |
|
estimator of season :math:`i`. The returned `inter_slope` value, better |
|
known as the seasonal Kendall slope estimator, is the median of the |
|
:math:`d_{ijk}` over all :math:`i, j, k`. |
|
|
|
References |
|
---------- |
|
.. [1] Hirsch, Robert M., James R. Slack, and Richard A. Smith. |
|
"Techniques of trend analysis for monthly water quality data." |
|
*Water Resources Research* 18.1 (1982): 107-121. |
|
|
|
Examples |
|
-------- |
|
Suppose we have 100 observations of a dependent variable for each of four |
|
seasons: |
|
|
|
>>> import numpy as np |
|
>>> rng = np.random.default_rng() |
|
>>> x = rng.random(size=(100, 4)) |
|
|
|
We compute the seasonal slopes as: |
|
|
|
>>> from scipy import stats |
|
>>> intra_slope, inter_slope = stats.mstats.sen_seasonal_slopes(x) |
|
|
|
If we define a function to compute all slopes between observations within |
|
a season: |
|
|
|
>>> def dijk(yi): |
|
... n = len(yi) |
|
... x = np.arange(n) |
|
... dy = yi - yi[:, np.newaxis] |
|
... dx = x - x[:, np.newaxis] |
|
... # we only want unique pairs of distinct indices |
|
... mask = np.triu(np.ones((n, n), dtype=bool), k=1) |
|
... return dy[mask]/dx[mask] |
|
|
|
then element ``i`` of ``intra_slope`` is the median of ``dijk[x[:, i]]``: |
|
|
|
>>> i = 2 |
|
>>> np.allclose(np.median(dijk(x[:, i])), intra_slope[i]) |
|
True |
|
|
|
and ``inter_slope`` is the median of the values returned by ``dijk`` for |
|
all seasons: |
|
|
|
>>> all_slopes = np.concatenate([dijk(x[:, i]) for i in range(x.shape[1])]) |
|
>>> np.allclose(np.median(all_slopes), inter_slope) |
|
True |
|
|
|
Because the data are randomly generated, we would expect the median slopes |
|
to be nearly zero both within and across all seasons, and indeed they are: |
|
|
|
>>> intra_slope.data |
|
array([ 0.00124504, -0.00277761, -0.00221245, -0.00036338]) |
|
>>> inter_slope |
|
-0.0010511779872922058 |
|
|
|
""" |
|
x = ma.array(x, subok=True, copy=False, ndmin=2) |
|
(n,_) = x.shape |
|
|
|
szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None] |
|
for i in range(n)]) |
|
szn_medslopes = ma.median(szn_slopes, axis=0) |
|
medslope = ma.median(szn_slopes, axis=None) |
|
return SenSeasonalSlopesResult(szn_medslopes, medslope) |
|
|
|
|
|
Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue')) |
|
|
|
|
|
def ttest_1samp(a, popmean, axis=0, alternative='two-sided'): |
|
""" |
|
Calculates the T-test for the mean of ONE group of scores. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
sample observation |
|
popmean : float or array_like |
|
expected value in null hypothesis, if array_like than it must have the |
|
same shape as `a` excluding the axis dimension |
|
axis : int or None, optional |
|
Axis along which to compute test. If None, compute over the whole |
|
array `a`. |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the alternative hypothesis. |
|
The following options are available (default is 'two-sided'): |
|
|
|
* 'two-sided': the mean of the underlying distribution of the sample |
|
is different than the given population mean (`popmean`) |
|
* 'less': the mean of the underlying distribution of the sample is |
|
less than the given population mean (`popmean`) |
|
* 'greater': the mean of the underlying distribution of the sample is |
|
greater than the given population mean (`popmean`) |
|
|
|
.. versionadded:: 1.7.0 |
|
|
|
Returns |
|
------- |
|
statistic : float or array |
|
t-statistic |
|
pvalue : float or array |
|
The p-value |
|
|
|
Notes |
|
----- |
|
For more details on `ttest_1samp`, see `scipy.stats.ttest_1samp`. |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
if a.size == 0: |
|
return (np.nan, np.nan) |
|
|
|
x = a.mean(axis=axis) |
|
v = a.var(axis=axis, ddof=1) |
|
n = a.count(axis=axis) |
|
|
|
df = ma.asanyarray(n - 1.0) |
|
svar = ((n - 1.0) * v) / df |
|
with np.errstate(divide='ignore', invalid='ignore'): |
|
t = (x - popmean) / ma.sqrt(svar / n) |
|
|
|
t, prob = _ttest_finish(df, t, alternative) |
|
return Ttest_1sampResult(t, prob) |
|
|
|
|
|
ttest_onesamp = ttest_1samp |
|
|
|
|
|
Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue')) |
|
|
|
|
|
def ttest_ind(a, b, axis=0, equal_var=True, alternative='two-sided'): |
|
""" |
|
Calculates the T-test for the means of TWO INDEPENDENT samples of scores. |
|
|
|
Parameters |
|
---------- |
|
a, b : array_like |
|
The arrays must have the same shape, except in the dimension |
|
corresponding to `axis` (the first, by default). |
|
axis : int or None, optional |
|
Axis along which to compute test. If None, compute over the whole |
|
arrays, `a`, and `b`. |
|
equal_var : bool, optional |
|
If True, perform a standard independent 2 sample test that assumes equal |
|
population variances. |
|
If False, perform Welch's t-test, which does not assume equal population |
|
variance. |
|
|
|
.. versionadded:: 0.17.0 |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the alternative hypothesis. |
|
The following options are available (default is 'two-sided'): |
|
|
|
* 'two-sided': the means of the distributions underlying the samples |
|
are unequal. |
|
* 'less': the mean of the distribution underlying the first sample |
|
is less than the mean of the distribution underlying the second |
|
sample. |
|
* 'greater': the mean of the distribution underlying the first |
|
sample is greater than the mean of the distribution underlying |
|
the second sample. |
|
|
|
.. versionadded:: 1.7.0 |
|
|
|
Returns |
|
------- |
|
statistic : float or array |
|
The calculated t-statistic. |
|
pvalue : float or array |
|
The p-value. |
|
|
|
Notes |
|
----- |
|
For more details on `ttest_ind`, see `scipy.stats.ttest_ind`. |
|
|
|
""" |
|
a, b, axis = _chk2_asarray(a, b, axis) |
|
|
|
if a.size == 0 or b.size == 0: |
|
return Ttest_indResult(np.nan, np.nan) |
|
|
|
(x1, x2) = (a.mean(axis), b.mean(axis)) |
|
(v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1)) |
|
(n1, n2) = (a.count(axis), b.count(axis)) |
|
|
|
if equal_var: |
|
|
|
df = ma.asanyarray(n1 + n2 - 2.0) |
|
svar = ((n1-1)*v1+(n2-1)*v2) / df |
|
denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2)) |
|
else: |
|
vn1 = v1/n1 |
|
vn2 = v2/n2 |
|
with np.errstate(divide='ignore', invalid='ignore'): |
|
df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1)) |
|
|
|
|
|
|
|
df = np.where(np.isnan(df), 1, df) |
|
denom = ma.sqrt(vn1 + vn2) |
|
|
|
with np.errstate(divide='ignore', invalid='ignore'): |
|
t = (x1-x2) / denom |
|
|
|
t, prob = _ttest_finish(df, t, alternative) |
|
return Ttest_indResult(t, prob) |
|
|
|
|
|
Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue')) |
|
|
|
|
|
def ttest_rel(a, b, axis=0, alternative='two-sided'): |
|
""" |
|
Calculates the T-test on TWO RELATED samples of scores, a and b. |
|
|
|
Parameters |
|
---------- |
|
a, b : array_like |
|
The arrays must have the same shape. |
|
axis : int or None, optional |
|
Axis along which to compute test. If None, compute over the whole |
|
arrays, `a`, and `b`. |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the alternative hypothesis. |
|
The following options are available (default is 'two-sided'): |
|
|
|
* 'two-sided': the means of the distributions underlying the samples |
|
are unequal. |
|
* 'less': the mean of the distribution underlying the first sample |
|
is less than the mean of the distribution underlying the second |
|
sample. |
|
* 'greater': the mean of the distribution underlying the first |
|
sample is greater than the mean of the distribution underlying |
|
the second sample. |
|
|
|
.. versionadded:: 1.7.0 |
|
|
|
Returns |
|
------- |
|
statistic : float or array |
|
t-statistic |
|
pvalue : float or array |
|
two-tailed p-value |
|
|
|
Notes |
|
----- |
|
For more details on `ttest_rel`, see `scipy.stats.ttest_rel`. |
|
|
|
""" |
|
a, b, axis = _chk2_asarray(a, b, axis) |
|
if len(a) != len(b): |
|
raise ValueError('unequal length arrays') |
|
|
|
if a.size == 0 or b.size == 0: |
|
return Ttest_relResult(np.nan, np.nan) |
|
|
|
n = a.count(axis) |
|
df = ma.asanyarray(n-1.0) |
|
d = (a-b).astype('d') |
|
dm = d.mean(axis) |
|
v = d.var(axis=axis, ddof=1) |
|
denom = ma.sqrt(v / n) |
|
with np.errstate(divide='ignore', invalid='ignore'): |
|
t = dm / denom |
|
|
|
t, prob = _ttest_finish(df, t, alternative) |
|
return Ttest_relResult(t, prob) |
|
|
|
|
|
MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', |
|
'pvalue')) |
|
|
|
|
|
def mannwhitneyu(x,y, use_continuity=True): |
|
""" |
|
Computes the Mann-Whitney statistic |
|
|
|
Missing values in `x` and/or `y` are discarded. |
|
|
|
Parameters |
|
---------- |
|
x : sequence |
|
Input |
|
y : sequence |
|
Input |
|
use_continuity : {True, False}, optional |
|
Whether a continuity correction (1/2.) should be taken into account. |
|
|
|
Returns |
|
------- |
|
statistic : float |
|
The minimum of the Mann-Whitney statistics |
|
pvalue : float |
|
Approximate two-sided p-value assuming a normal distribution. |
|
|
|
""" |
|
x = ma.asarray(x).compressed().view(ndarray) |
|
y = ma.asarray(y).compressed().view(ndarray) |
|
ranks = rankdata(np.concatenate([x,y])) |
|
(nx, ny) = (len(x), len(y)) |
|
nt = nx + ny |
|
U = ranks[:nx].sum() - nx*(nx+1)/2. |
|
U = max(U, nx*ny - U) |
|
u = nx*ny - U |
|
|
|
mu = (nx*ny)/2. |
|
sigsq = (nt**3 - nt)/12. |
|
ties = count_tied_groups(ranks) |
|
sigsq -= sum(v*(k**3-k) for (k,v) in ties.items())/12. |
|
sigsq *= nx*ny/float(nt*(nt-1)) |
|
|
|
if use_continuity: |
|
z = (U - 1/2. - mu) / ma.sqrt(sigsq) |
|
else: |
|
z = (U - mu) / ma.sqrt(sigsq) |
|
|
|
prob = special.erfc(abs(z)/np.sqrt(2)) |
|
return MannwhitneyuResult(u, prob) |
|
|
|
|
|
KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue')) |
|
|
|
|
|
def kruskal(*args): |
|
""" |
|
Compute the Kruskal-Wallis H-test for independent samples |
|
|
|
Parameters |
|
---------- |
|
sample1, sample2, ... : array_like |
|
Two or more arrays with the sample measurements can be given as |
|
arguments. |
|
|
|
Returns |
|
------- |
|
statistic : float |
|
The Kruskal-Wallis H statistic, corrected for ties |
|
pvalue : float |
|
The p-value for the test using the assumption that H has a chi |
|
square distribution |
|
|
|
Notes |
|
----- |
|
For more details on `kruskal`, see `scipy.stats.kruskal`. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.stats.mstats import kruskal |
|
|
|
Random samples from three different brands of batteries were tested |
|
to see how long the charge lasted. Results were as follows: |
|
|
|
>>> a = [6.3, 5.4, 5.7, 5.2, 5.0] |
|
>>> b = [6.9, 7.0, 6.1, 7.9] |
|
>>> c = [7.2, 6.9, 6.1, 6.5] |
|
|
|
Test the hypothesis that the distribution functions for all of the brands' |
|
durations are identical. Use 5% level of significance. |
|
|
|
>>> kruskal(a, b, c) |
|
KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164) |
|
|
|
The null hypothesis is rejected at the 5% level of significance |
|
because the returned p-value is less than the critical value of 5%. |
|
|
|
""" |
|
output = argstoarray(*args) |
|
ranks = ma.masked_equal(rankdata(output, use_missing=False), 0) |
|
sumrk = ranks.sum(-1) |
|
ngrp = ranks.count(-1) |
|
ntot = ranks.count() |
|
H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1) |
|
|
|
ties = count_tied_groups(ranks) |
|
T = 1. - sum(v*(k**3-k) for (k,v) in ties.items())/float(ntot**3-ntot) |
|
if T == 0: |
|
raise ValueError('All numbers are identical in kruskal') |
|
|
|
H /= T |
|
df = len(output) - 1 |
|
prob = distributions.chi2.sf(H, df) |
|
return KruskalResult(H, prob) |
|
|
|
|
|
kruskalwallis = kruskal |
|
|
|
|
|
@_rename_parameter("mode", "method") |
|
def ks_1samp(x, cdf, args=(), alternative="two-sided", method='auto'): |
|
""" |
|
Computes the Kolmogorov-Smirnov test on one sample of masked values. |
|
|
|
Missing values in `x` are discarded. |
|
|
|
Parameters |
|
---------- |
|
x : array_like |
|
a 1-D array of observations of random variables. |
|
cdf : str or callable |
|
If a string, it should be the name of a distribution in `scipy.stats`. |
|
If a callable, that callable is used to calculate the cdf. |
|
args : tuple, sequence, optional |
|
Distribution parameters, used if `cdf` is a string. |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Indicates the alternative hypothesis. Default is 'two-sided'. |
|
method : {'auto', 'exact', 'asymp'}, optional |
|
Defines the method used for calculating the p-value. |
|
The following options are available (default is 'auto'): |
|
|
|
* 'auto' : use 'exact' for small size arrays, 'asymp' for large |
|
* 'exact' : use approximation to exact distribution of test statistic |
|
* 'asymp' : use asymptotic distribution of test statistic |
|
|
|
Returns |
|
------- |
|
d : float |
|
Value of the Kolmogorov Smirnov test |
|
p : float |
|
Corresponding p-value. |
|
|
|
""" |
|
alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get( |
|
alternative.lower()[0], alternative) |
|
return scipy.stats._stats_py.ks_1samp( |
|
x, cdf, args=args, alternative=alternative, method=method) |
|
|
|
|
|
@_rename_parameter("mode", "method") |
|
def ks_2samp(data1, data2, alternative="two-sided", method='auto'): |
|
""" |
|
Computes the Kolmogorov-Smirnov test on two samples. |
|
|
|
Missing values in `x` and/or `y` are discarded. |
|
|
|
Parameters |
|
---------- |
|
data1 : array_like |
|
First data set |
|
data2 : array_like |
|
Second data set |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Indicates the alternative hypothesis. Default is 'two-sided'. |
|
method : {'auto', 'exact', 'asymp'}, optional |
|
Defines the method used for calculating the p-value. |
|
The following options are available (default is 'auto'): |
|
|
|
* 'auto' : use 'exact' for small size arrays, 'asymp' for large |
|
* 'exact' : use approximation to exact distribution of test statistic |
|
* 'asymp' : use asymptotic distribution of test statistic |
|
|
|
Returns |
|
------- |
|
d : float |
|
Value of the Kolmogorov Smirnov test |
|
p : float |
|
Corresponding p-value. |
|
|
|
""" |
|
|
|
|
|
|
|
alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get( |
|
alternative.lower()[0], alternative) |
|
return scipy.stats._stats_py.ks_2samp(data1, data2, |
|
alternative=alternative, |
|
method=method) |
|
|
|
|
|
ks_twosamp = ks_2samp |
|
|
|
|
|
@_rename_parameter("mode", "method") |
|
def kstest(data1, data2, args=(), alternative='two-sided', method='auto'): |
|
""" |
|
|
|
Parameters |
|
---------- |
|
data1 : array_like |
|
data2 : str, callable or array_like |
|
args : tuple, sequence, optional |
|
Distribution parameters, used if `data1` or `data2` are strings. |
|
alternative : str, as documented in stats.kstest |
|
method : str, as documented in stats.kstest |
|
|
|
Returns |
|
------- |
|
tuple of (K-S statistic, probability) |
|
|
|
""" |
|
return scipy.stats._stats_py.kstest(data1, data2, args, |
|
alternative=alternative, method=method) |
|
|
|
|
|
def trima(a, limits=None, inclusive=(True,True)): |
|
""" |
|
Trims an array by masking the data outside some given limits. |
|
|
|
Returns a masked version of the input array. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
Input array. |
|
limits : {None, tuple}, optional |
|
Tuple of (lower limit, upper limit) in absolute values. |
|
Values of the input array lower (greater) than the lower (upper) limit |
|
will be masked. A limit is None indicates an open interval. |
|
inclusive : (bool, bool) tuple, optional |
|
Tuple of (lower flag, upper flag), indicating whether values exactly |
|
equal to the lower (upper) limit are allowed. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.stats.mstats import trima |
|
>>> import numpy as np |
|
|
|
>>> a = np.arange(10) |
|
|
|
The interval is left-closed and right-open, i.e., `[2, 8)`. |
|
Trim the array by keeping only values in the interval. |
|
|
|
>>> trima(a, limits=(2, 8), inclusive=(True, False)) |
|
masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --], |
|
mask=[ True, True, False, False, False, False, False, False, |
|
True, True], |
|
fill_value=999999) |
|
|
|
""" |
|
a = ma.asarray(a) |
|
a.unshare_mask() |
|
if (limits is None) or (limits == (None, None)): |
|
return a |
|
|
|
(lower_lim, upper_lim) = limits |
|
(lower_in, upper_in) = inclusive |
|
condition = False |
|
if lower_lim is not None: |
|
if lower_in: |
|
condition |= (a < lower_lim) |
|
else: |
|
condition |= (a <= lower_lim) |
|
|
|
if upper_lim is not None: |
|
if upper_in: |
|
condition |= (a > upper_lim) |
|
else: |
|
condition |= (a >= upper_lim) |
|
|
|
a[condition.filled(True)] = masked |
|
return a |
|
|
|
|
|
def trimr(a, limits=None, inclusive=(True, True), axis=None): |
|
""" |
|
Trims an array by masking some proportion of the data on each end. |
|
Returns a masked version of the input array. |
|
|
|
Parameters |
|
---------- |
|
a : sequence |
|
Input array. |
|
limits : {None, tuple}, optional |
|
Tuple of the percentages to cut on each side of the array, with respect |
|
to the number of unmasked data, as floats between 0. and 1. |
|
Noting n the number of unmasked data before trimming, the |
|
(n*limits[0])th smallest data and the (n*limits[1])th largest data are |
|
masked, and the total number of unmasked data after trimming is |
|
n*(1.-sum(limits)). The value of one limit can be set to None to |
|
indicate an open interval. |
|
inclusive : {(True,True) tuple}, optional |
|
Tuple of flags indicating whether the number of data being masked on |
|
the left (right) end should be truncated (True) or rounded (False) to |
|
integers. |
|
axis : {None,int}, optional |
|
Axis along which to trim. If None, the whole array is trimmed, but its |
|
shape is maintained. |
|
|
|
""" |
|
def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive): |
|
n = a.count() |
|
idx = a.argsort() |
|
if low_limit: |
|
if low_inclusive: |
|
lowidx = int(low_limit*n) |
|
else: |
|
lowidx = int(np.round(low_limit*n)) |
|
a[idx[:lowidx]] = masked |
|
if up_limit is not None: |
|
if up_inclusive: |
|
upidx = n - int(n*up_limit) |
|
else: |
|
upidx = n - int(np.round(n*up_limit)) |
|
a[idx[upidx:]] = masked |
|
return a |
|
|
|
a = ma.asarray(a) |
|
a.unshare_mask() |
|
if limits is None: |
|
return a |
|
|
|
|
|
(lolim, uplim) = limits |
|
errmsg = "The proportion to cut from the %s should be between 0. and 1." |
|
if lolim is not None: |
|
if lolim > 1. or lolim < 0: |
|
raise ValueError(errmsg % 'beginning' + f"(got {lolim})") |
|
if uplim is not None: |
|
if uplim > 1. or uplim < 0: |
|
raise ValueError(errmsg % 'end' + f"(got {uplim})") |
|
|
|
(loinc, upinc) = inclusive |
|
|
|
if axis is None: |
|
shp = a.shape |
|
return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp) |
|
else: |
|
return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc) |
|
|
|
|
|
trimdoc = """ |
|
Parameters |
|
---------- |
|
a : sequence |
|
Input array |
|
limits : {None, tuple}, optional |
|
If `relative` is False, tuple (lower limit, upper limit) in absolute values. |
|
Values of the input array lower (greater) than the lower (upper) limit are |
|
masked. |
|
|
|
If `relative` is True, tuple (lower percentage, upper percentage) to cut |
|
on each side of the array, with respect to the number of unmasked data. |
|
|
|
Noting n the number of unmasked data before trimming, the (n*limits[0])th |
|
smallest data and the (n*limits[1])th largest data are masked, and the |
|
total number of unmasked data after trimming is n*(1.-sum(limits)) |
|
In each case, the value of one limit can be set to None to indicate an |
|
open interval. |
|
|
|
If limits is None, no trimming is performed |
|
inclusive : {(bool, bool) tuple}, optional |
|
If `relative` is False, tuple indicating whether values exactly equal |
|
to the absolute limits are allowed. |
|
If `relative` is True, tuple indicating whether the number of data |
|
being masked on each side should be rounded (True) or truncated |
|
(False). |
|
relative : bool, optional |
|
Whether to consider the limits as absolute values (False) or proportions |
|
to cut (True). |
|
axis : int, optional |
|
Axis along which to trim. |
|
""" |
|
|
|
|
|
def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None): |
|
""" |
|
Trims an array by masking the data outside some given limits. |
|
|
|
Returns a masked version of the input array. |
|
|
|
%s |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.stats.mstats import trim |
|
>>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10] |
|
>>> print(trim(z,(3,8))) |
|
[-- -- 3 4 5 6 7 8 -- --] |
|
>>> print(trim(z,(0.1,0.2),relative=True)) |
|
[-- 2 3 4 5 6 7 8 -- --] |
|
|
|
""" |
|
if relative: |
|
return trimr(a, limits=limits, inclusive=inclusive, axis=axis) |
|
else: |
|
return trima(a, limits=limits, inclusive=inclusive) |
|
|
|
|
|
if trim.__doc__: |
|
trim.__doc__ = trim.__doc__ % trimdoc |
|
|
|
|
|
def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None): |
|
""" |
|
Trims the smallest and largest data values. |
|
|
|
Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and |
|
``int(proportiontocut * n)`` largest values of data along the given axis, |
|
where n is the number of unmasked values before trimming. |
|
|
|
Parameters |
|
---------- |
|
data : ndarray |
|
Data to trim. |
|
proportiontocut : float, optional |
|
Percentage of trimming (as a float between 0 and 1). |
|
If n is the number of unmasked values before trimming, the number of |
|
values after trimming is ``(1 - 2*proportiontocut) * n``. |
|
Default is 0.2. |
|
inclusive : {(bool, bool) tuple}, optional |
|
Tuple indicating whether the number of data being masked on each side |
|
should be rounded (True) or truncated (False). |
|
axis : int, optional |
|
Axis along which to perform the trimming. |
|
If None, the input array is first flattened. |
|
|
|
""" |
|
return trimr(data, limits=(proportiontocut,proportiontocut), |
|
inclusive=inclusive, axis=axis) |
|
|
|
|
|
def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True), |
|
axis=None): |
|
""" |
|
Trims the data by masking values from one tail. |
|
|
|
Parameters |
|
---------- |
|
data : array_like |
|
Data to trim. |
|
proportiontocut : float, optional |
|
Percentage of trimming. If n is the number of unmasked values |
|
before trimming, the number of values after trimming is |
|
``(1 - proportiontocut) * n``. Default is 0.2. |
|
tail : {'left','right'}, optional |
|
If 'left' the `proportiontocut` lowest values will be masked. |
|
If 'right' the `proportiontocut` highest values will be masked. |
|
Default is 'left'. |
|
inclusive : {(bool, bool) tuple}, optional |
|
Tuple indicating whether the number of data being masked on each side |
|
should be rounded (True) or truncated (False). Default is |
|
(True, True). |
|
axis : int, optional |
|
Axis along which to perform the trimming. |
|
If None, the input array is first flattened. Default is None. |
|
|
|
Returns |
|
------- |
|
trimtail : ndarray |
|
Returned array of same shape as `data` with masked tail values. |
|
|
|
""" |
|
tail = str(tail).lower()[0] |
|
if tail == 'l': |
|
limits = (proportiontocut,None) |
|
elif tail == 'r': |
|
limits = (None, proportiontocut) |
|
else: |
|
raise TypeError("The tail argument should be in ('left','right')") |
|
|
|
return trimr(data, limits=limits, axis=axis, inclusive=inclusive) |
|
|
|
|
|
trim1 = trimtail |
|
|
|
|
|
def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, |
|
axis=None): |
|
"""Returns the trimmed mean of the data along the given axis. |
|
|
|
%s |
|
|
|
""" |
|
if (not isinstance(limits,tuple)) and isinstance(limits,float): |
|
limits = (limits, limits) |
|
if relative: |
|
return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis) |
|
else: |
|
return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis) |
|
|
|
|
|
if trimmed_mean.__doc__: |
|
trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc |
|
|
|
|
|
def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, |
|
axis=None, ddof=0): |
|
"""Returns the trimmed variance of the data along the given axis. |
|
|
|
%s |
|
ddof : {0,integer}, optional |
|
Means Delta Degrees of Freedom. The denominator used during computations |
|
is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- |
|
biased estimate of the variance. |
|
|
|
""" |
|
if (not isinstance(limits,tuple)) and isinstance(limits,float): |
|
limits = (limits, limits) |
|
if relative: |
|
out = trimr(a,limits=limits, inclusive=inclusive,axis=axis) |
|
else: |
|
out = trima(a,limits=limits,inclusive=inclusive) |
|
|
|
return out.var(axis=axis, ddof=ddof) |
|
|
|
|
|
if trimmed_var.__doc__: |
|
trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc |
|
|
|
|
|
def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, |
|
axis=None, ddof=0): |
|
"""Returns the trimmed standard deviation of the data along the given axis. |
|
|
|
%s |
|
ddof : {0,integer}, optional |
|
Means Delta Degrees of Freedom. The denominator used during computations |
|
is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- |
|
biased estimate of the variance. |
|
|
|
""" |
|
if (not isinstance(limits,tuple)) and isinstance(limits,float): |
|
limits = (limits, limits) |
|
if relative: |
|
out = trimr(a,limits=limits,inclusive=inclusive,axis=axis) |
|
else: |
|
out = trima(a,limits=limits,inclusive=inclusive) |
|
return out.std(axis=axis,ddof=ddof) |
|
|
|
|
|
if trimmed_std.__doc__: |
|
trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc |
|
|
|
|
|
def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None): |
|
""" |
|
Returns the standard error of the trimmed mean along the given axis. |
|
|
|
Parameters |
|
---------- |
|
a : sequence |
|
Input array |
|
limits : {(0.1,0.1), tuple of float}, optional |
|
tuple (lower percentage, upper percentage) to cut on each side of the |
|
array, with respect to the number of unmasked data. |
|
|
|
If n is the number of unmasked data before trimming, the values |
|
smaller than ``n * limits[0]`` and the values larger than |
|
``n * `limits[1]`` are masked, and the total number of unmasked |
|
data after trimming is ``n * (1.-sum(limits))``. In each case, |
|
the value of one limit can be set to None to indicate an open interval. |
|
If `limits` is None, no trimming is performed. |
|
inclusive : {(bool, bool) tuple} optional |
|
Tuple indicating whether the number of data being masked on each side |
|
should be rounded (True) or truncated (False). |
|
axis : int, optional |
|
Axis along which to trim. |
|
|
|
Returns |
|
------- |
|
trimmed_stde : scalar or ndarray |
|
|
|
""" |
|
def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive): |
|
"Returns the standard error of the trimmed mean for a 1D input data." |
|
n = a.count() |
|
idx = a.argsort() |
|
if low_limit: |
|
if low_inclusive: |
|
lowidx = int(low_limit*n) |
|
else: |
|
lowidx = np.round(low_limit*n) |
|
a[idx[:lowidx]] = masked |
|
if up_limit is not None: |
|
if up_inclusive: |
|
upidx = n - int(n*up_limit) |
|
else: |
|
upidx = n - np.round(n*up_limit) |
|
a[idx[upidx:]] = masked |
|
a[idx[:lowidx]] = a[idx[lowidx]] |
|
a[idx[upidx:]] = a[idx[upidx-1]] |
|
winstd = a.std(ddof=1) |
|
return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a))) |
|
|
|
a = ma.array(a, copy=True, subok=True) |
|
a.unshare_mask() |
|
if limits is None: |
|
return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis)) |
|
if (not isinstance(limits,tuple)) and isinstance(limits,float): |
|
limits = (limits, limits) |
|
|
|
|
|
(lolim, uplim) = limits |
|
errmsg = "The proportion to cut from the %s should be between 0. and 1." |
|
if lolim is not None: |
|
if lolim > 1. or lolim < 0: |
|
raise ValueError(errmsg % 'beginning' + f"(got {lolim})") |
|
if uplim is not None: |
|
if uplim > 1. or uplim < 0: |
|
raise ValueError(errmsg % 'end' + f"(got {uplim})") |
|
|
|
(loinc, upinc) = inclusive |
|
if (axis is None): |
|
return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc) |
|
else: |
|
if a.ndim > 2: |
|
raise ValueError("Array 'a' must be at most two dimensional, " |
|
"but got a.ndim = %d" % a.ndim) |
|
return ma.apply_along_axis(_trimmed_stde_1D, axis, a, |
|
lolim,uplim,loinc,upinc) |
|
|
|
|
|
def _mask_to_limits(a, limits, inclusive): |
|
"""Mask an array for values outside of given limits. |
|
|
|
This is primarily a utility function. |
|
|
|
Parameters |
|
---------- |
|
a : array |
|
limits : (float or None, float or None) |
|
A tuple consisting of the (lower limit, upper limit). Values in the |
|
input array less than the lower limit or greater than the upper limit |
|
will be masked out. None implies no limit. |
|
inclusive : (bool, bool) |
|
A tuple consisting of the (lower flag, upper flag). These flags |
|
determine whether values exactly equal to lower or upper are allowed. |
|
|
|
Returns |
|
------- |
|
A MaskedArray. |
|
|
|
Raises |
|
------ |
|
A ValueError if there are no values within the given limits. |
|
""" |
|
lower_limit, upper_limit = limits |
|
lower_include, upper_include = inclusive |
|
am = ma.MaskedArray(a) |
|
if lower_limit is not None: |
|
if lower_include: |
|
am = ma.masked_less(am, lower_limit) |
|
else: |
|
am = ma.masked_less_equal(am, lower_limit) |
|
|
|
if upper_limit is not None: |
|
if upper_include: |
|
am = ma.masked_greater(am, upper_limit) |
|
else: |
|
am = ma.masked_greater_equal(am, upper_limit) |
|
|
|
if am.count() == 0: |
|
raise ValueError("No array values within given limits") |
|
|
|
return am |
|
|
|
|
|
def tmean(a, limits=None, inclusive=(True, True), axis=None): |
|
""" |
|
Compute the trimmed mean. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
Array of values. |
|
limits : None or (lower limit, upper limit), optional |
|
Values in the input array less than the lower limit or greater than the |
|
upper limit will be ignored. When limits is None (default), then all |
|
values are used. Either of the limit values in the tuple can also be |
|
None representing a half-open interval. |
|
inclusive : (bool, bool), optional |
|
A tuple consisting of the (lower flag, upper flag). These flags |
|
determine whether values exactly equal to the lower or upper limits |
|
are included. The default value is (True, True). |
|
axis : int or None, optional |
|
Axis along which to operate. If None, compute over the |
|
whole array. Default is None. |
|
|
|
Returns |
|
------- |
|
tmean : float |
|
|
|
Notes |
|
----- |
|
For more details on `tmean`, see `scipy.stats.tmean`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.stats import mstats |
|
>>> a = np.array([[6, 8, 3, 0], |
|
... [3, 9, 1, 2], |
|
... [8, 7, 8, 2], |
|
... [5, 6, 0, 2], |
|
... [4, 5, 5, 2]]) |
|
... |
|
... |
|
>>> mstats.tmean(a, (2,5)) |
|
3.3 |
|
>>> mstats.tmean(a, (2,5), axis=0) |
|
masked_array(data=[4.0, 5.0, 4.0, 2.0], |
|
mask=[False, False, False, False], |
|
fill_value=1e+20) |
|
|
|
""" |
|
return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis) |
|
|
|
|
|
def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1): |
|
""" |
|
Compute the trimmed variance |
|
|
|
This function computes the sample variance of an array of values, |
|
while ignoring values which are outside of given `limits`. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
Array of values. |
|
limits : None or (lower limit, upper limit), optional |
|
Values in the input array less than the lower limit or greater than the |
|
upper limit will be ignored. When limits is None, then all values are |
|
used. Either of the limit values in the tuple can also be None |
|
representing a half-open interval. The default value is None. |
|
inclusive : (bool, bool), optional |
|
A tuple consisting of the (lower flag, upper flag). These flags |
|
determine whether values exactly equal to the lower or upper limits |
|
are included. The default value is (True, True). |
|
axis : int or None, optional |
|
Axis along which to operate. If None, compute over the |
|
whole array. Default is zero. |
|
ddof : int, optional |
|
Delta degrees of freedom. Default is 1. |
|
|
|
Returns |
|
------- |
|
tvar : float |
|
Trimmed variance. |
|
|
|
Notes |
|
----- |
|
For more details on `tvar`, see `scipy.stats.tvar`. |
|
|
|
""" |
|
a = a.astype(float).ravel() |
|
if limits is None: |
|
n = (~a.mask).sum() |
|
return np.ma.var(a) * n/(n-1.) |
|
am = _mask_to_limits(a, limits=limits, inclusive=inclusive) |
|
|
|
return np.ma.var(am, axis=axis, ddof=ddof) |
|
|
|
|
|
def tmin(a, lowerlimit=None, axis=0, inclusive=True): |
|
""" |
|
Compute the trimmed minimum |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
array of values |
|
lowerlimit : None or float, optional |
|
Values in the input array less than the given limit will be ignored. |
|
When lowerlimit is None, then all values are used. The default value |
|
is None. |
|
axis : int or None, optional |
|
Axis along which to operate. Default is 0. If None, compute over the |
|
whole array `a`. |
|
inclusive : {True, False}, optional |
|
This flag determines whether values exactly equal to the lower limit |
|
are included. The default value is True. |
|
|
|
Returns |
|
------- |
|
tmin : float, int or ndarray |
|
|
|
Notes |
|
----- |
|
For more details on `tmin`, see `scipy.stats.tmin`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.stats import mstats |
|
>>> a = np.array([[6, 8, 3, 0], |
|
... [3, 2, 1, 2], |
|
... [8, 1, 8, 2], |
|
... [5, 3, 0, 2], |
|
... [4, 7, 5, 2]]) |
|
... |
|
>>> mstats.tmin(a, 5) |
|
masked_array(data=[5, 7, 5, --], |
|
mask=[False, False, False, True], |
|
fill_value=999999) |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
am = trima(a, (lowerlimit, None), (inclusive, False)) |
|
return ma.minimum.reduce(am, axis) |
|
|
|
|
|
def tmax(a, upperlimit=None, axis=0, inclusive=True): |
|
""" |
|
Compute the trimmed maximum |
|
|
|
This function computes the maximum value of an array along a given axis, |
|
while ignoring values larger than a specified upper limit. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
array of values |
|
upperlimit : None or float, optional |
|
Values in the input array greater than the given limit will be ignored. |
|
When upperlimit is None, then all values are used. The default value |
|
is None. |
|
axis : int or None, optional |
|
Axis along which to operate. Default is 0. If None, compute over the |
|
whole array `a`. |
|
inclusive : {True, False}, optional |
|
This flag determines whether values exactly equal to the upper limit |
|
are included. The default value is True. |
|
|
|
Returns |
|
------- |
|
tmax : float, int or ndarray |
|
|
|
Notes |
|
----- |
|
For more details on `tmax`, see `scipy.stats.tmax`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.stats import mstats |
|
>>> a = np.array([[6, 8, 3, 0], |
|
... [3, 9, 1, 2], |
|
... [8, 7, 8, 2], |
|
... [5, 6, 0, 2], |
|
... [4, 5, 5, 2]]) |
|
... |
|
... |
|
>>> mstats.tmax(a, 4) |
|
masked_array(data=[4, --, 3, 2], |
|
mask=[False, True, False, False], |
|
fill_value=999999) |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
am = trima(a, (None, upperlimit), (False, inclusive)) |
|
return ma.maximum.reduce(am, axis) |
|
|
|
|
|
def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1): |
|
""" |
|
Compute the trimmed standard error of the mean. |
|
|
|
This function finds the standard error of the mean for given |
|
values, ignoring values outside the given `limits`. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
array of values |
|
limits : None or (lower limit, upper limit), optional |
|
Values in the input array less than the lower limit or greater than the |
|
upper limit will be ignored. When limits is None, then all values are |
|
used. Either of the limit values in the tuple can also be None |
|
representing a half-open interval. The default value is None. |
|
inclusive : (bool, bool), optional |
|
A tuple consisting of the (lower flag, upper flag). These flags |
|
determine whether values exactly equal to the lower or upper limits |
|
are included. The default value is (True, True). |
|
axis : int or None, optional |
|
Axis along which to operate. If None, compute over the |
|
whole array. Default is zero. |
|
ddof : int, optional |
|
Delta degrees of freedom. Default is 1. |
|
|
|
Returns |
|
------- |
|
tsem : float |
|
|
|
Notes |
|
----- |
|
For more details on `tsem`, see `scipy.stats.tsem`. |
|
|
|
""" |
|
a = ma.asarray(a).ravel() |
|
if limits is None: |
|
n = float(a.count()) |
|
return a.std(axis=axis, ddof=ddof)/ma.sqrt(n) |
|
|
|
am = trima(a.ravel(), limits, inclusive) |
|
sd = np.sqrt(am.var(axis=axis, ddof=ddof)) |
|
return sd / np.sqrt(am.count()) |
|
|
|
|
|
def winsorize(a, limits=None, inclusive=(True, True), inplace=False, |
|
axis=None, nan_policy='propagate'): |
|
"""Returns a Winsorized version of the input array. |
|
|
|
The (limits[0])th lowest values are set to the (limits[0])th percentile, |
|
and the (limits[1])th highest values are set to the (1 - limits[1])th |
|
percentile. |
|
Masked values are skipped. |
|
|
|
|
|
Parameters |
|
---------- |
|
a : sequence |
|
Input array. |
|
limits : {None, tuple of float}, optional |
|
Tuple of the percentages to cut on each side of the array, with respect |
|
to the number of unmasked data, as floats between 0. and 1. |
|
Noting n the number of unmasked data before trimming, the |
|
(n*limits[0])th smallest data and the (n*limits[1])th largest data are |
|
masked, and the total number of unmasked data after trimming |
|
is n*(1.-sum(limits)) The value of one limit can be set to None to |
|
indicate an open interval. |
|
inclusive : {(True, True) tuple}, optional |
|
Tuple indicating whether the number of data being masked on each side |
|
should be truncated (True) or rounded (False). |
|
inplace : {False, True}, optional |
|
Whether to winsorize in place (True) or to use a copy (False) |
|
axis : {None, int}, optional |
|
Axis along which to trim. If None, the whole array is trimmed, but its |
|
shape is maintained. |
|
nan_policy : {'propagate', 'raise', 'omit'}, optional |
|
Defines how to handle when input contains nan. |
|
The following options are available (default is 'propagate'): |
|
|
|
* 'propagate': allows nan values and may overwrite or propagate them |
|
* 'raise': throws an error |
|
* 'omit': performs the calculations ignoring nan values |
|
|
|
Notes |
|
----- |
|
This function is applied to reduce the effect of possibly spurious outliers |
|
by limiting the extreme values. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.stats.mstats import winsorize |
|
|
|
A shuffled array contains integers from 1 to 10. |
|
|
|
>>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6]) |
|
|
|
The 10% of the lowest value (i.e., ``1``) and the 20% of the highest |
|
values (i.e., ``9`` and ``10``) are replaced. |
|
|
|
>>> winsorize(a, limits=[0.1, 0.2]) |
|
masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6], |
|
mask=False, |
|
fill_value=999999) |
|
|
|
""" |
|
def _winsorize1D(a, low_limit, up_limit, low_include, up_include, |
|
contains_nan, nan_policy): |
|
n = a.count() |
|
idx = a.argsort() |
|
if contains_nan: |
|
nan_count = np.count_nonzero(np.isnan(a)) |
|
if low_limit: |
|
if low_include: |
|
lowidx = int(low_limit * n) |
|
else: |
|
lowidx = np.round(low_limit * n).astype(int) |
|
if contains_nan and nan_policy == 'omit': |
|
lowidx = min(lowidx, n-nan_count-1) |
|
a[idx[:lowidx]] = a[idx[lowidx]] |
|
if up_limit is not None: |
|
if up_include: |
|
upidx = n - int(n * up_limit) |
|
else: |
|
upidx = n - np.round(n * up_limit).astype(int) |
|
if contains_nan and nan_policy == 'omit': |
|
a[idx[upidx:-nan_count]] = a[idx[upidx - 1]] |
|
else: |
|
a[idx[upidx:]] = a[idx[upidx - 1]] |
|
return a |
|
|
|
contains_nan, nan_policy = _contains_nan(a, nan_policy) |
|
|
|
a = ma.array(a, copy=np.logical_not(inplace)) |
|
|
|
if limits is None: |
|
return a |
|
if (not isinstance(limits, tuple)) and isinstance(limits, float): |
|
limits = (limits, limits) |
|
|
|
|
|
(lolim, uplim) = limits |
|
errmsg = "The proportion to cut from the %s should be between 0. and 1." |
|
if lolim is not None: |
|
if lolim > 1. or lolim < 0: |
|
raise ValueError(errmsg % 'beginning' + f"(got {lolim})") |
|
if uplim is not None: |
|
if uplim > 1. or uplim < 0: |
|
raise ValueError(errmsg % 'end' + f"(got {uplim})") |
|
|
|
(loinc, upinc) = inclusive |
|
|
|
if axis is None: |
|
shp = a.shape |
|
return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc, |
|
contains_nan, nan_policy).reshape(shp) |
|
else: |
|
return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc, |
|
upinc, contains_nan, nan_policy) |
|
|
|
|
|
def moment(a, moment=1, axis=0): |
|
""" |
|
Calculates the nth moment about the mean for a sample. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
data |
|
moment : int, optional |
|
order of central moment that is returned |
|
axis : int or None, optional |
|
Axis along which the central moment is computed. Default is 0. |
|
If None, compute over the whole array `a`. |
|
|
|
Returns |
|
------- |
|
n-th central moment : ndarray or float |
|
The appropriate moment along the given axis or over all values if axis |
|
is None. The denominator for the moment calculation is the number of |
|
observations, no degrees of freedom correction is done. |
|
|
|
Notes |
|
----- |
|
For more details about `moment`, see `scipy.stats.moment`. |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
if a.size == 0: |
|
moment_shape = list(a.shape) |
|
del moment_shape[axis] |
|
dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64 |
|
|
|
out_shape = (moment_shape if np.isscalar(moment) |
|
else [len(moment)] + moment_shape) |
|
if len(out_shape) == 0: |
|
return dtype(np.nan) |
|
else: |
|
return ma.array(np.full(out_shape, np.nan, dtype=dtype)) |
|
|
|
|
|
if not np.isscalar(moment): |
|
mean = a.mean(axis, keepdims=True) |
|
mmnt = [_moment(a, i, axis, mean=mean) for i in moment] |
|
return ma.array(mmnt) |
|
else: |
|
return _moment(a, moment, axis) |
|
|
|
|
|
|
|
def _moment(a, moment, axis, *, mean=None): |
|
if np.abs(moment - np.round(moment)) > 0: |
|
raise ValueError("All moment parameters must be integers") |
|
|
|
if moment == 0 or moment == 1: |
|
|
|
|
|
shape = list(a.shape) |
|
del shape[axis] |
|
dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64 |
|
|
|
if len(shape) == 0: |
|
return dtype(1.0 if moment == 0 else 0.0) |
|
else: |
|
return (ma.ones(shape, dtype=dtype) if moment == 0 |
|
else ma.zeros(shape, dtype=dtype)) |
|
else: |
|
|
|
n_list = [moment] |
|
current_n = moment |
|
while current_n > 2: |
|
if current_n % 2: |
|
current_n = (current_n-1)/2 |
|
else: |
|
current_n /= 2 |
|
n_list.append(current_n) |
|
|
|
|
|
mean = a.mean(axis, keepdims=True) if mean is None else mean |
|
a_zero_mean = a - mean |
|
if n_list[-1] == 1: |
|
s = a_zero_mean.copy() |
|
else: |
|
s = a_zero_mean**2 |
|
|
|
|
|
for n in n_list[-2::-1]: |
|
s = s**2 |
|
if n % 2: |
|
s *= a_zero_mean |
|
return s.mean(axis) |
|
|
|
|
|
def variation(a, axis=0, ddof=0): |
|
""" |
|
Compute the coefficient of variation. |
|
|
|
The coefficient of variation is the standard deviation divided by the |
|
mean. This function is equivalent to:: |
|
|
|
np.std(x, axis=axis, ddof=ddof) / np.mean(x) |
|
|
|
The default for ``ddof`` is 0, but many definitions of the coefficient |
|
of variation use the square root of the unbiased sample variance |
|
for the sample standard deviation, which corresponds to ``ddof=1``. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
Input array. |
|
axis : int or None, optional |
|
Axis along which to calculate the coefficient of variation. Default |
|
is 0. If None, compute over the whole array `a`. |
|
ddof : int, optional |
|
Delta degrees of freedom. Default is 0. |
|
|
|
Returns |
|
------- |
|
variation : ndarray |
|
The calculated variation along the requested axis. |
|
|
|
Notes |
|
----- |
|
For more details about `variation`, see `scipy.stats.variation`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.stats.mstats import variation |
|
>>> a = np.array([2,8,4]) |
|
>>> variation(a) |
|
0.5345224838248487 |
|
>>> b = np.array([2,8,3,4]) |
|
>>> c = np.ma.masked_array(b, mask=[0,0,1,0]) |
|
>>> variation(c) |
|
0.5345224838248487 |
|
|
|
In the example above, it can be seen that this works the same as |
|
`scipy.stats.variation` except 'stats.mstats.variation' ignores masked |
|
array elements. |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
return a.std(axis, ddof=ddof)/a.mean(axis) |
|
|
|
|
|
def skew(a, axis=0, bias=True): |
|
""" |
|
Computes the skewness of a data set. |
|
|
|
Parameters |
|
---------- |
|
a : ndarray |
|
data |
|
axis : int or None, optional |
|
Axis along which skewness is calculated. Default is 0. |
|
If None, compute over the whole array `a`. |
|
bias : bool, optional |
|
If False, then the calculations are corrected for statistical bias. |
|
|
|
Returns |
|
------- |
|
skewness : ndarray |
|
The skewness of values along an axis, returning 0 where all values are |
|
equal. |
|
|
|
Notes |
|
----- |
|
For more details about `skew`, see `scipy.stats.skew`. |
|
|
|
""" |
|
a, axis = _chk_asarray(a,axis) |
|
mean = a.mean(axis, keepdims=True) |
|
m2 = _moment(a, 2, axis, mean=mean) |
|
m3 = _moment(a, 3, axis, mean=mean) |
|
zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2) |
|
with np.errstate(all='ignore'): |
|
vals = ma.where(zero, 0, m3 / m2**1.5) |
|
|
|
if not bias and zero is not ma.masked and m2 is not ma.masked: |
|
n = a.count(axis) |
|
can_correct = ~zero & (n > 2) |
|
if can_correct.any(): |
|
n = np.extract(can_correct, n) |
|
m2 = np.extract(can_correct, m2) |
|
m3 = np.extract(can_correct, m3) |
|
nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5 |
|
np.place(vals, can_correct, nval) |
|
return vals |
|
|
|
|
|
def kurtosis(a, axis=0, fisher=True, bias=True): |
|
""" |
|
Computes the kurtosis (Fisher or Pearson) of a dataset. |
|
|
|
Kurtosis is the fourth central moment divided by the square of the |
|
variance. If Fisher's definition is used, then 3.0 is subtracted from |
|
the result to give 0.0 for a normal distribution. |
|
|
|
If bias is False then the kurtosis is calculated using k statistics to |
|
eliminate bias coming from biased moment estimators |
|
|
|
Use `kurtosistest` to see if result is close enough to normal. |
|
|
|
Parameters |
|
---------- |
|
a : array |
|
data for which the kurtosis is calculated |
|
axis : int or None, optional |
|
Axis along which the kurtosis is calculated. Default is 0. |
|
If None, compute over the whole array `a`. |
|
fisher : bool, optional |
|
If True, Fisher's definition is used (normal ==> 0.0). If False, |
|
Pearson's definition is used (normal ==> 3.0). |
|
bias : bool, optional |
|
If False, then the calculations are corrected for statistical bias. |
|
|
|
Returns |
|
------- |
|
kurtosis : array |
|
The kurtosis of values along an axis. If all values are equal, |
|
return -3 for Fisher's definition and 0 for Pearson's definition. |
|
|
|
Notes |
|
----- |
|
For more details about `kurtosis`, see `scipy.stats.kurtosis`. |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
mean = a.mean(axis, keepdims=True) |
|
m2 = _moment(a, 2, axis, mean=mean) |
|
m4 = _moment(a, 4, axis, mean=mean) |
|
zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2) |
|
with np.errstate(all='ignore'): |
|
vals = ma.where(zero, 0, m4 / m2**2.0) |
|
|
|
if not bias and zero is not ma.masked and m2 is not ma.masked: |
|
n = a.count(axis) |
|
can_correct = ~zero & (n > 3) |
|
if can_correct.any(): |
|
n = np.extract(can_correct, n) |
|
m2 = np.extract(can_correct, m2) |
|
m4 = np.extract(can_correct, m4) |
|
nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0) |
|
np.place(vals, can_correct, nval+3.0) |
|
if fisher: |
|
return vals - 3 |
|
else: |
|
return vals |
|
|
|
|
|
DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean', |
|
'variance', 'skewness', |
|
'kurtosis')) |
|
|
|
|
|
def describe(a, axis=0, ddof=0, bias=True): |
|
""" |
|
Computes several descriptive statistics of the passed array. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
Data array |
|
axis : int or None, optional |
|
Axis along which to calculate statistics. Default 0. If None, |
|
compute over the whole array `a`. |
|
ddof : int, optional |
|
degree of freedom (default 0); note that default ddof is different |
|
from the same routine in stats.describe |
|
bias : bool, optional |
|
If False, then the skewness and kurtosis calculations are corrected for |
|
statistical bias. |
|
|
|
Returns |
|
------- |
|
nobs : int |
|
(size of the data (discarding missing values) |
|
|
|
minmax : (int, int) |
|
min, max |
|
|
|
mean : float |
|
arithmetic mean |
|
|
|
variance : float |
|
unbiased variance |
|
|
|
skewness : float |
|
biased skewness |
|
|
|
kurtosis : float |
|
biased kurtosis |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.stats.mstats import describe |
|
>>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1]) |
|
>>> describe(ma) |
|
DescribeResult(nobs=np.int64(3), minmax=(masked_array(data=0, |
|
mask=False, |
|
fill_value=999999), masked_array(data=2, |
|
mask=False, |
|
fill_value=999999)), mean=np.float64(1.0), |
|
variance=np.float64(0.6666666666666666), |
|
skewness=masked_array(data=0., mask=False, fill_value=1e+20), |
|
kurtosis=np.float64(-1.5)) |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
n = a.count(axis) |
|
mm = (ma.minimum.reduce(a, axis=axis), ma.maximum.reduce(a, axis=axis)) |
|
m = a.mean(axis) |
|
v = a.var(axis, ddof=ddof) |
|
sk = skew(a, axis, bias=bias) |
|
kurt = kurtosis(a, axis, bias=bias) |
|
|
|
return DescribeResult(n, mm, m, v, sk, kurt) |
|
|
|
|
|
def stde_median(data, axis=None): |
|
"""Returns the McKean-Schrader estimate of the standard error of the sample |
|
median along the given axis. masked values are discarded. |
|
|
|
Parameters |
|
---------- |
|
data : ndarray |
|
Data to trim. |
|
axis : {None,int}, optional |
|
Axis along which to perform the trimming. |
|
If None, the input array is first flattened. |
|
|
|
""" |
|
def _stdemed_1D(data): |
|
data = np.sort(data.compressed()) |
|
n = len(data) |
|
z = 2.5758293035489004 |
|
k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0)) |
|
return ((data[n-k] - data[k-1])/(2.*z)) |
|
|
|
data = ma.array(data, copy=False, subok=True) |
|
if (axis is None): |
|
return _stdemed_1D(data) |
|
else: |
|
if data.ndim > 2: |
|
raise ValueError("Array 'data' must be at most two dimensional, " |
|
"but got data.ndim = %d" % data.ndim) |
|
return ma.apply_along_axis(_stdemed_1D, axis, data) |
|
|
|
|
|
SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue')) |
|
|
|
|
|
def skewtest(a, axis=0, alternative='two-sided'): |
|
""" |
|
Tests whether the skew is different from the normal distribution. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
The data to be tested |
|
axis : int or None, optional |
|
Axis along which statistics are calculated. Default is 0. |
|
If None, compute over the whole array `a`. |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the alternative hypothesis. Default is 'two-sided'. |
|
The following options are available: |
|
|
|
* 'two-sided': the skewness of the distribution underlying the sample |
|
is different from that of the normal distribution (i.e. 0) |
|
* 'less': the skewness of the distribution underlying the sample |
|
is less than that of the normal distribution |
|
* 'greater': the skewness of the distribution underlying the sample |
|
is greater than that of the normal distribution |
|
|
|
.. versionadded:: 1.7.0 |
|
|
|
Returns |
|
------- |
|
statistic : array_like |
|
The computed z-score for this test. |
|
pvalue : array_like |
|
A p-value for the hypothesis test |
|
|
|
Notes |
|
----- |
|
For more details about `skewtest`, see `scipy.stats.skewtest`. |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
if axis is None: |
|
a = a.ravel() |
|
axis = 0 |
|
b2 = skew(a,axis) |
|
n = a.count(axis) |
|
if np.min(n) < 8: |
|
raise ValueError( |
|
"skewtest is not valid with less than 8 samples; %i samples" |
|
" were given." % np.min(n)) |
|
|
|
y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2))) |
|
beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9)) |
|
W2 = -1 + ma.sqrt(2*(beta2-1)) |
|
delta = 1/ma.sqrt(0.5*ma.log(W2)) |
|
alpha = ma.sqrt(2.0/(W2-1)) |
|
y = ma.where(y == 0, 1, y) |
|
Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1)) |
|
pvalue = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative) |
|
|
|
return SkewtestResult(Z[()], pvalue[()]) |
|
|
|
|
|
KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue')) |
|
|
|
|
|
def kurtosistest(a, axis=0, alternative='two-sided'): |
|
""" |
|
Tests whether a dataset has normal kurtosis |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
array of the sample data |
|
axis : int or None, optional |
|
Axis along which to compute test. Default is 0. If None, |
|
compute over the whole array `a`. |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the alternative hypothesis. |
|
The following options are available (default is 'two-sided'): |
|
|
|
* 'two-sided': the kurtosis of the distribution underlying the sample |
|
is different from that of the normal distribution |
|
* 'less': the kurtosis of the distribution underlying the sample |
|
is less than that of the normal distribution |
|
* 'greater': the kurtosis of the distribution underlying the sample |
|
is greater than that of the normal distribution |
|
|
|
.. versionadded:: 1.7.0 |
|
|
|
Returns |
|
------- |
|
statistic : array_like |
|
The computed z-score for this test. |
|
pvalue : array_like |
|
The p-value for the hypothesis test |
|
|
|
Notes |
|
----- |
|
For more details about `kurtosistest`, see `scipy.stats.kurtosistest`. |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
n = a.count(axis=axis) |
|
if np.min(n) < 5: |
|
raise ValueError( |
|
"kurtosistest requires at least 5 observations; %i observations" |
|
" were given." % np.min(n)) |
|
if np.min(n) < 20: |
|
warnings.warn( |
|
"kurtosistest only valid for n>=20 ... continuing anyway, n=%i" % np.min(n), |
|
stacklevel=2, |
|
) |
|
|
|
b2 = kurtosis(a, axis, fisher=False) |
|
E = 3.0*(n-1) / (n+1) |
|
varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) |
|
x = (b2-E)/ma.sqrt(varb2) |
|
sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / |
|
(n*(n-2)*(n-3))) |
|
A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) |
|
term1 = 1 - 2./(9.0*A) |
|
denom = 1 + x*ma.sqrt(2/(A-4.0)) |
|
if np.ma.isMaskedArray(denom): |
|
|
|
denom[denom == 0.0] = masked |
|
elif denom == 0.0: |
|
denom = masked |
|
|
|
term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0), |
|
-ma.power(-(1-2.0/A)/denom, 1/3.0)) |
|
Z = (term1 - term2) / np.sqrt(2/(9.0*A)) |
|
pvalue = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative) |
|
|
|
return KurtosistestResult(Z[()], pvalue[()]) |
|
|
|
|
|
NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue')) |
|
|
|
|
|
def normaltest(a, axis=0): |
|
""" |
|
Tests whether a sample differs from a normal distribution. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
The array containing the data to be tested. |
|
axis : int or None, optional |
|
Axis along which to compute test. Default is 0. If None, |
|
compute over the whole array `a`. |
|
|
|
Returns |
|
------- |
|
statistic : float or array |
|
``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and |
|
``k`` is the z-score returned by `kurtosistest`. |
|
pvalue : float or array |
|
A 2-sided chi squared probability for the hypothesis test. |
|
|
|
Notes |
|
----- |
|
For more details about `normaltest`, see `scipy.stats.normaltest`. |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
s, _ = skewtest(a, axis) |
|
k, _ = kurtosistest(a, axis) |
|
k2 = s*s + k*k |
|
|
|
return NormaltestResult(k2, distributions.chi2.sf(k2, 2)) |
|
|
|
|
|
def mquantiles(a, prob=(.25, .5, .75), alphap=.4, betap=.4, axis=None, |
|
limit=()): |
|
""" |
|
Computes empirical quantiles for a data array. |
|
|
|
Samples quantile are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``, |
|
where ``x[j]`` is the j-th order statistic, and gamma is a function of |
|
``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and |
|
``g = n*p + m - j``. |
|
|
|
Reinterpreting the above equations to compare to **R** lead to the |
|
equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)`` |
|
|
|
Typical values of (alphap,betap) are: |
|
- (0,1) : ``p(k) = k/n`` : linear interpolation of cdf |
|
(**R** type 4) |
|
- (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function |
|
(**R** type 5) |
|
- (0,0) : ``p(k) = k/(n+1)`` : |
|
(**R** type 6) |
|
- (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])]. |
|
(**R** type 7, **R** default) |
|
- (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])]. |
|
The resulting quantile estimates are approximately median-unbiased |
|
regardless of the distribution of x. |
|
(**R** type 8) |
|
- (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom. |
|
The resulting quantile estimates are approximately unbiased |
|
if x is normally distributed |
|
(**R** type 9) |
|
- (.4,.4) : approximately quantile unbiased (Cunnane) |
|
- (.35,.35): APL, used with PWM |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
Input data, as a sequence or array of dimension at most 2. |
|
prob : array_like, optional |
|
List of quantiles to compute. |
|
alphap : float, optional |
|
Plotting positions parameter, default is 0.4. |
|
betap : float, optional |
|
Plotting positions parameter, default is 0.4. |
|
axis : int, optional |
|
Axis along which to perform the trimming. |
|
If None (default), the input array is first flattened. |
|
limit : tuple, optional |
|
Tuple of (lower, upper) values. |
|
Values of `a` outside this open interval are ignored. |
|
|
|
Returns |
|
------- |
|
mquantiles : MaskedArray |
|
An array containing the calculated quantiles. |
|
|
|
Notes |
|
----- |
|
This formulation is very similar to **R** except the calculation of |
|
``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined |
|
with each type. |
|
|
|
References |
|
---------- |
|
.. [1] *R* statistical software: https://www.r-project.org/ |
|
.. [2] *R* ``quantile`` function: |
|
http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.stats.mstats import mquantiles |
|
>>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.]) |
|
>>> mquantiles(a) |
|
array([ 19.2, 40. , 42.8]) |
|
|
|
Using a 2D array, specifying axis and limit. |
|
|
|
>>> data = np.array([[ 6., 7., 1.], |
|
... [ 47., 15., 2.], |
|
... [ 49., 36., 3.], |
|
... [ 15., 39., 4.], |
|
... [ 42., 40., -999.], |
|
... [ 41., 41., -999.], |
|
... [ 7., -999., -999.], |
|
... [ 39., -999., -999.], |
|
... [ 43., -999., -999.], |
|
... [ 40., -999., -999.], |
|
... [ 36., -999., -999.]]) |
|
>>> print(mquantiles(data, axis=0, limit=(0, 50))) |
|
[[19.2 14.6 1.45] |
|
[40. 37.5 2.5 ] |
|
[42.8 40.05 3.55]] |
|
|
|
>>> data[:, 2] = -999. |
|
>>> print(mquantiles(data, axis=0, limit=(0, 50))) |
|
[[19.200000000000003 14.6 --] |
|
[40.0 37.5 --] |
|
[42.800000000000004 40.05 --]] |
|
|
|
""" |
|
def _quantiles1D(data,m,p): |
|
x = np.sort(data.compressed()) |
|
n = len(x) |
|
if n == 0: |
|
return ma.array(np.empty(len(p), dtype=float), mask=True) |
|
elif n == 1: |
|
return ma.array(np.resize(x, p.shape), mask=nomask) |
|
aleph = (n*p + m) |
|
k = np.floor(aleph.clip(1, n-1)).astype(int) |
|
gamma = (aleph-k).clip(0,1) |
|
return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()] |
|
|
|
data = ma.array(a, copy=False) |
|
if data.ndim > 2: |
|
raise TypeError("Array should be 2D at most !") |
|
|
|
if limit: |
|
condition = (limit[0] < data) & (data < limit[1]) |
|
data[~condition.filled(True)] = masked |
|
|
|
p = np.atleast_1d(np.asarray(prob)) |
|
m = alphap + p*(1.-alphap-betap) |
|
|
|
if (axis is None): |
|
return _quantiles1D(data, m, p) |
|
|
|
return ma.apply_along_axis(_quantiles1D, axis, data, m, p) |
|
|
|
|
|
def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4): |
|
"""Calculate the score at the given 'per' percentile of the |
|
sequence a. For example, the score at per=50 is the median. |
|
|
|
This function is a shortcut to mquantile |
|
|
|
""" |
|
if (per < 0) or (per > 100.): |
|
raise ValueError(f"The percentile should be between 0. and 100. ! (got {per})") |
|
|
|
return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap, |
|
limit=limit, axis=0).squeeze() |
|
|
|
|
|
def plotting_positions(data, alpha=0.4, beta=0.4): |
|
""" |
|
Returns plotting positions (or empirical percentile points) for the data. |
|
|
|
Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where: |
|
- i is the rank order statistics |
|
- n is the number of unmasked values along the given axis |
|
- `alpha` and `beta` are two parameters. |
|
|
|
Typical values for `alpha` and `beta` are: |
|
- (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4) |
|
- (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function |
|
(R, type 5) |
|
- (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6) |
|
- (1,1) : ``p(k) = (k-1)/(n-1)``, in this case, |
|
``p(k) = mode[F(x[k])]``. That's R default (R type 7) |
|
- (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then |
|
``p(k) ~ median[F(x[k])]``. |
|
The resulting quantile estimates are approximately median-unbiased |
|
regardless of the distribution of x. (R type 8) |
|
- (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom. |
|
The resulting quantile estimates are approximately unbiased |
|
if x is normally distributed (R type 9) |
|
- (.4,.4) : approximately quantile unbiased (Cunnane) |
|
- (.35,.35): APL, used with PWM |
|
- (.3175, .3175): used in scipy.stats.probplot |
|
|
|
Parameters |
|
---------- |
|
data : array_like |
|
Input data, as a sequence or array of dimension at most 2. |
|
alpha : float, optional |
|
Plotting positions parameter. Default is 0.4. |
|
beta : float, optional |
|
Plotting positions parameter. Default is 0.4. |
|
|
|
Returns |
|
------- |
|
positions : MaskedArray |
|
The calculated plotting positions. |
|
|
|
""" |
|
data = ma.array(data, copy=False).reshape(1,-1) |
|
n = data.count() |
|
plpos = np.empty(data.size, dtype=float) |
|
plpos[n:] = 0 |
|
plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) / |
|
(n + 1.0 - alpha - beta)) |
|
return ma.array(plpos, mask=data._mask) |
|
|
|
|
|
meppf = plotting_positions |
|
|
|
|
|
def obrientransform(*args): |
|
""" |
|
Computes a transform on input data (any number of columns). Used to |
|
test for homogeneity of variance prior to running one-way stats. Each |
|
array in ``*args`` is one level of a factor. If an `f_oneway()` run on |
|
the transformed data and found significant, variances are unequal. From |
|
Maxwell and Delaney, p.112. |
|
|
|
Returns: transformed data for use in an ANOVA |
|
""" |
|
data = argstoarray(*args).T |
|
v = data.var(axis=0,ddof=1) |
|
m = data.mean(0) |
|
n = data.count(0).astype(float) |
|
|
|
data -= m |
|
data **= 2 |
|
data *= (n-1.5)*n |
|
data -= 0.5*v*(n-1) |
|
data /= (n-1.)*(n-2.) |
|
if not ma.allclose(v,data.mean(0)): |
|
raise ValueError("Lack of convergence in obrientransform.") |
|
|
|
return data |
|
|
|
|
|
def sem(a, axis=0, ddof=1): |
|
""" |
|
Calculates the standard error of the mean of the input array. |
|
|
|
Also sometimes called standard error of measurement. |
|
|
|
Parameters |
|
---------- |
|
a : array_like |
|
An array containing the values for which the standard error is |
|
returned. |
|
axis : int or None, optional |
|
If axis is None, ravel `a` first. If axis is an integer, this will be |
|
the axis over which to operate. Defaults to 0. |
|
ddof : int, optional |
|
Delta degrees-of-freedom. How many degrees of freedom to adjust |
|
for bias in limited samples relative to the population estimate |
|
of variance. Defaults to 1. |
|
|
|
Returns |
|
------- |
|
s : ndarray or float |
|
The standard error of the mean in the sample(s), along the input axis. |
|
|
|
Notes |
|
----- |
|
The default value for `ddof` changed in scipy 0.15.0 to be consistent with |
|
`scipy.stats.sem` as well as with the most common definition used (like in |
|
the R documentation). |
|
|
|
Examples |
|
-------- |
|
Find standard error along the first axis: |
|
|
|
>>> import numpy as np |
|
>>> from scipy import stats |
|
>>> a = np.arange(20).reshape(5,4) |
|
>>> print(stats.mstats.sem(a)) |
|
[2.8284271247461903 2.8284271247461903 2.8284271247461903 |
|
2.8284271247461903] |
|
|
|
Find standard error across the whole array, using n degrees of freedom: |
|
|
|
>>> print(stats.mstats.sem(a, axis=None, ddof=0)) |
|
1.2893796958227628 |
|
|
|
""" |
|
a, axis = _chk_asarray(a, axis) |
|
n = a.count(axis=axis) |
|
s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n) |
|
return s |
|
|
|
|
|
F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue')) |
|
|
|
|
|
def f_oneway(*args): |
|
""" |
|
Performs a 1-way ANOVA, returning an F-value and probability given |
|
any number of groups. From Heiman, pp.394-7. |
|
|
|
Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays, |
|
one per treatment group. |
|
|
|
Returns |
|
------- |
|
statistic : float |
|
The computed F-value of the test. |
|
pvalue : float |
|
The associated p-value from the F-distribution. |
|
|
|
""" |
|
|
|
data = argstoarray(*args) |
|
ngroups = len(data) |
|
ntot = data.count() |
|
sstot = (data**2).sum() - (data.sum())**2/float(ntot) |
|
ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum() |
|
sswg = sstot-ssbg |
|
dfbg = ngroups-1 |
|
dfwg = ntot - ngroups |
|
msb = ssbg/float(dfbg) |
|
msw = sswg/float(dfwg) |
|
f = msb/msw |
|
prob = special.fdtrc(dfbg, dfwg, f) |
|
|
|
return F_onewayResult(f, prob) |
|
|
|
|
|
FriedmanchisquareResult = namedtuple('FriedmanchisquareResult', |
|
('statistic', 'pvalue')) |
|
|
|
|
|
def friedmanchisquare(*args): |
|
"""Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA. |
|
This function calculates the Friedman Chi-square test for repeated measures |
|
and returns the result, along with the associated probability value. |
|
|
|
Each input is considered a given group. Ideally, the number of treatments |
|
among each group should be equal. If this is not the case, only the first |
|
n treatments are taken into account, where n is the number of treatments |
|
of the smallest group. |
|
If a group has some missing values, the corresponding treatments are masked |
|
in the other groups. |
|
The test statistic is corrected for ties. |
|
|
|
Masked values in one group are propagated to the other groups. |
|
|
|
Returns |
|
------- |
|
statistic : float |
|
the test statistic. |
|
pvalue : float |
|
the associated p-value. |
|
|
|
""" |
|
data = argstoarray(*args).astype(float) |
|
k = len(data) |
|
if k < 3: |
|
raise ValueError("Less than 3 groups (%i): " % k + |
|
"the Friedman test is NOT appropriate.") |
|
|
|
ranked = ma.masked_values(rankdata(data, axis=0), 0) |
|
if ranked._mask is not nomask: |
|
ranked = ma.mask_cols(ranked) |
|
ranked = ranked.compressed().reshape(k,-1).view(ndarray) |
|
else: |
|
ranked = ranked._data |
|
(k,n) = ranked.shape |
|
|
|
repeats = [find_repeats(row) for row in ranked.T] |
|
ties = np.array([y for x, y in repeats if x.size > 0]) |
|
tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k)) |
|
|
|
ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2) |
|
chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction |
|
|
|
return FriedmanchisquareResult(chisq, |
|
distributions.chi2.sf(chisq, k-1)) |
|
|
|
|
|
BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue')) |
|
|
|
|
|
def brunnermunzel(x, y, alternative="two-sided", distribution="t"): |
|
""" |
|
Compute the Brunner-Munzel test on samples x and y. |
|
|
|
Any missing values in `x` and/or `y` are discarded. |
|
|
|
The Brunner-Munzel test is a nonparametric test of the null hypothesis that |
|
when values are taken one by one from each group, the probabilities of |
|
getting large values in both groups are equal. |
|
Unlike the Wilcoxon-Mann-Whitney's U test, this does not require the |
|
assumption of equivariance of two groups. Note that this does not assume |
|
the distributions are same. This test works on two independent samples, |
|
which may have different sizes. |
|
|
|
Parameters |
|
---------- |
|
x, y : array_like |
|
Array of samples, should be one-dimensional. |
|
alternative : 'less', 'two-sided', or 'greater', optional |
|
Whether to get the p-value for the one-sided hypothesis ('less' |
|
or 'greater') or for the two-sided hypothesis ('two-sided'). |
|
Defaults value is 'two-sided' . |
|
distribution : 't' or 'normal', optional |
|
Whether to get the p-value by t-distribution or by standard normal |
|
distribution. |
|
Defaults value is 't' . |
|
|
|
Returns |
|
------- |
|
statistic : float |
|
The Brunner-Munzer W statistic. |
|
pvalue : float |
|
p-value assuming an t distribution. One-sided or |
|
two-sided, depending on the choice of `alternative` and `distribution`. |
|
|
|
See Also |
|
-------- |
|
mannwhitneyu : Mann-Whitney rank test on two samples. |
|
|
|
Notes |
|
----- |
|
For more details on `brunnermunzel`, see `scipy.stats.brunnermunzel`. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.stats.mstats import brunnermunzel |
|
>>> import numpy as np |
|
>>> x1 = [1, 2, np.nan, np.nan, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1] |
|
>>> x2 = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4] |
|
>>> brunnermunzel(x1, x2) |
|
BrunnerMunzelResult(statistic=1.4723186918922935, pvalue=0.15479415300426624) # may vary |
|
|
|
""" |
|
x = ma.asarray(x).compressed().view(ndarray) |
|
y = ma.asarray(y).compressed().view(ndarray) |
|
nx = len(x) |
|
ny = len(y) |
|
if nx == 0 or ny == 0: |
|
return BrunnerMunzelResult(np.nan, np.nan) |
|
rankc = rankdata(np.concatenate((x,y))) |
|
rankcx = rankc[0:nx] |
|
rankcy = rankc[nx:nx+ny] |
|
rankcx_mean = np.mean(rankcx) |
|
rankcy_mean = np.mean(rankcy) |
|
rankx = rankdata(x) |
|
ranky = rankdata(y) |
|
rankx_mean = np.mean(rankx) |
|
ranky_mean = np.mean(ranky) |
|
|
|
Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0)) |
|
Sx /= nx - 1 |
|
Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0)) |
|
Sy /= ny - 1 |
|
|
|
wbfn = nx * ny * (rankcy_mean - rankcx_mean) |
|
wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy) |
|
|
|
if distribution == "t": |
|
df_numer = np.power(nx * Sx + ny * Sy, 2.0) |
|
df_denom = np.power(nx * Sx, 2.0) / (nx - 1) |
|
df_denom += np.power(ny * Sy, 2.0) / (ny - 1) |
|
df = df_numer / df_denom |
|
p = distributions.t.cdf(wbfn, df) |
|
elif distribution == "normal": |
|
p = distributions.norm.cdf(wbfn) |
|
else: |
|
raise ValueError( |
|
"distribution should be 't' or 'normal'") |
|
|
|
if alternative == "greater": |
|
pass |
|
elif alternative == "less": |
|
p = 1 - p |
|
elif alternative == "two-sided": |
|
p = 2 * np.min([p, 1-p]) |
|
else: |
|
raise ValueError( |
|
"alternative should be 'less', 'greater' or 'two-sided'") |
|
|
|
return BrunnerMunzelResult(wbfn, p) |
|
|