""" |
|
Created on Fri Apr 2 09:06:05 2021 |
|
|
|
@author: matth |
|
""" |
|
|
|
import math |
|
import numpy as np |
|
from scipy import special |
|
from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays |
|
from scipy._lib._array_api import array_namespace, xp_moveaxis_to_end |
|
|
|
__all__ = ['entropy', 'differential_entropy'] |
|
|
|
|
|
@_axis_nan_policy_factory(
    lambda x: x,
    n_samples=lambda kwgs: (
        2 if ("qk" in kwgs and kwgs["qk"] is not None)
        else 1
    ),
    n_outputs=1, result_to_tuple=lambda x: (x,), paired=True,
    too_small=-1
)
def entropy(pk: np.typing.ArrayLike,
            qk: np.typing.ArrayLike | None = None,
            base: float | None = None,
            axis: int = 0
            ) -> np.number | np.ndarray:
""" |
|
Calculate the Shannon entropy/relative entropy of given distribution(s). |
|
|
|
If only probabilities `pk` are given, the Shannon entropy is calculated as |
|
``H = -sum(pk * log(pk))``. |
|
|
|
If `qk` is not None, then compute the relative entropy |
|
``D = sum(pk * log(pk / qk))``. This quantity is also known |
|
as the Kullback-Leibler divergence. |
|
|
|
This routine will normalize `pk` and `qk` if they don't sum to 1. |
|
|
|
Parameters |
|
---------- |
|
pk : array_like |
|
Defines the (discrete) distribution. Along each axis-slice of ``pk``, |
|
element ``i`` is the (possibly unnormalized) probability of event |
|
``i``. |
|
qk : array_like, optional |
|
Sequence against which the relative entropy is computed. Should be in |
|
the same format as `pk`. |
|
base : float, optional |
|
The logarithmic base to use, defaults to ``e`` (natural logarithm). |
|
axis : int, optional |
|
The axis along which the entropy is calculated. Default is 0. |
|
|
|
Returns |
|
------- |
|
S : {float, array_like} |
|
The calculated entropy. |
|
|
|
    Notes
    -----
    Informally, the Shannon entropy quantifies the expected uncertainty
    inherent in the possible outcomes of a discrete random variable.
    For example, if messages consisting of sequences of symbols from a set
    are to be encoded and transmitted over a noiseless channel, then the
    Shannon entropy ``H(pk)`` gives a tight lower bound for the average
    number of units of information needed per symbol if the symbols occur
    with frequencies governed by the discrete distribution `pk` [1]_. The
    choice of base determines the choice of units; e.g., ``e`` for nats,
    ``2`` for bits, etc.

    The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
    number of units of information needed per symbol if the encoding is
    optimized for the probability distribution `qk` instead of the true
    distribution `pk`. Informally, the relative entropy quantifies the
    expected excess in surprise experienced if one believes the true
    distribution is `qk` when it is actually `pk`.

    A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
    equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
    the formula ``CE = -sum(pk * log(qk))``. It gives the average
    number of units of information needed per symbol if an encoding is
    optimized for the probability distribution `qk` when the true distribution
    is `pk`. It is not computed directly by `entropy`, but it can be computed
    using two calls to the function (see Examples).

    See [2]_ for more information.

    References
    ----------
    .. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
           Bell System Technical Journal, 27: 379-423.
           https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
    .. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
           Theory (Wiley Series in Telecommunications and Signal Processing).
           Wiley-Interscience, USA.

    Examples
    --------
    The outcome of a fair coin is the most uncertain:

    >>> import numpy as np
    >>> from scipy.stats import entropy
    >>> base = 2  # work in units of bits
    >>> pk = np.array([1/2, 1/2])  # fair coin
    >>> H = entropy(pk, base=base)
    >>> H
    1.0
    >>> H == -np.sum(pk * np.log(pk)) / np.log(base)
    True

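    Because this routine normalizes `pk`, the same result is obtained from
    unnormalized input such as raw counts (a small illustration; the counts
    here are arbitrary):

    >>> counts = np.array([10, 10])  # e.g. observed heads/tails counts
    >>> entropy(counts, base=base)
    1.0
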
    The outcome of a biased coin is less uncertain:

    >>> qk = np.array([9/10, 1/10])  # biased coin
    >>> entropy(qk, base=base)
    0.46899559358928117

    The relative entropy between the fair coin and biased coin is calculated
    as:

    >>> D = entropy(pk, qk, base=base)
    >>> D
    0.7369655941662062
    >>> np.isclose(D, np.sum(pk * np.log(pk/qk)) / np.log(base), rtol=4e-16, atol=0)
    True

    The cross entropy can be calculated as the sum of the entropy and
    relative entropy:

    >>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
    >>> CE
    1.736965594166206
    >>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
    True

    """
    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")

    xp = array_namespace(pk) if qk is None else array_namespace(pk, qk)

    pk = xp.asarray(pk)
    with np.errstate(invalid='ignore'):
        pk = 1.0*pk / xp.sum(pk, axis=axis, keepdims=True)
    if qk is None:
        vec = special.entr(pk)
    else:
        qk = xp.asarray(qk)
        pk, qk = _broadcast_arrays((pk, qk), axis=None, xp=xp)
        sum_kwargs = dict(axis=axis, keepdims=True)
        qk = 1.0*qk / xp.sum(qk, **sum_kwargs)
        vec = special.rel_entr(pk, qk)
    S = xp.sum(vec, axis=axis)
    if base is not None:
        S /= math.log(base)
    return S


def _differential_entropy_is_too_small(samples, kwargs, axis=-1):
    values = samples[0]
    n = values.shape[axis]
    window_length = kwargs.get("window_length",
                               math.floor(math.sqrt(n) + 0.5))
    if not 2 <= 2 * window_length < n:
        return True
    return False


@_axis_nan_policy_factory(
    lambda x: x, n_outputs=1, result_to_tuple=lambda x: (x,),
    too_small=_differential_entropy_is_too_small
)
def differential_entropy(
    values: np.typing.ArrayLike,
    *,
    window_length: int | None = None,
    base: float | None = None,
    axis: int = 0,
    method: str = "auto",
) -> np.number | np.ndarray:
r"""Given a sample of a distribution, estimate the differential entropy. |
|
|
|
Several estimation methods are available using the `method` parameter. By |
|
default, a method is selected based the size of the sample. |
|
|
|
    Parameters
    ----------
    values : sequence
        Sample from a continuous distribution.
    window_length : int, optional
        Window length for computing Vasicek estimate. Must be an integer
        between 1 and half of the sample size. If ``None`` (the default), it
        uses the heuristic value

        .. math::
            \left \lfloor \sqrt{n} + 0.5 \right \rfloor

        where :math:`n` is the sample size. This heuristic was originally
        proposed in [2]_ and has become common in the literature.
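        For example, a sample of size ``n = 100`` gives the default
        ``window_length = 10``.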
    base : float, optional
        The logarithmic base to use, defaults to ``e`` (natural logarithm).
    axis : int, optional
        The axis along which the differential entropy is calculated.
        Default is 0.
    method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
        The method used to estimate the differential entropy from the sample.
        Default is ``'auto'``. See Notes for more information.

    Returns
    -------
    entropy : float
        The calculated differential entropy.

    Notes
    -----
    This function will converge to the true differential entropy in the limit

    .. math::
        n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0

    where :math:`n` is the sample size and :math:`m` is ``window_length``.

    The optimal choice of ``window_length`` for a given sample size depends on
    the (unknown) distribution. Typically, the smoother the density of the
    distribution, the larger the optimal value of ``window_length`` [1]_.

    The following options are available for the `method` parameter.

    * ``'vasicek'`` uses the estimator presented in [1]_. This is
      one of the first and most influential estimators of differential
      entropy.
    * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
      is not only consistent but, under some conditions, asymptotically
      normal.
    * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
      in simulation to have smaller bias and mean squared error than
      the Vasicek estimator.
    * ``'correa'`` uses the estimator presented in [5]_ based on local linear
      regression. In a simulation study, it had consistently smaller mean
      square error than the Vasicek estimator, but it is more expensive to
      compute.
    * ``'auto'`` selects the method automatically (default). Currently,
      this selects ``'van es'`` for very small samples (``n <= 10``),
      ``'ebrahimi'`` for moderate sample sizes (``10 < n <= 1000``), and
      ``'vasicek'`` for larger samples, but this behavior is subject to
      change in future versions.

    All estimators are implemented as described in [6]_.

    References
    ----------
    .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
           Journal of the Royal Statistical Society:
           Series B (Methodological), 38(1), 54-59.
    .. [2] Grzegorzewski, P., & Wieczorkowski, R. (1999). Entropy-based
           goodness-of-fit test for exponentiality. Communications in
           Statistics-Theory and Methods, 28(5), 1183-1202.
    .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
           class of statistics based on spacings. Scandinavian Journal of
           Statistics, 61-72.
    .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
           of sample entropy. Statistics & Probability Letters, 20(3),
           225-234.
    .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
           in Statistics-Theory and Methods, 24(10), 2439-2449.
    .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
           Annals of Data Science, 2(2), 231-241.
           https://link.springer.com/article/10.1007/s40745-015-0045-9

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import differential_entropy, norm

    Entropy of a standard normal distribution (the exact value will vary,
    since `rng` is not seeded):

    >>> rng = np.random.default_rng()
    >>> values = rng.standard_normal(100)
    >>> differential_entropy(values)
    1.3407817436640392

    Compare with the true entropy:

    >>> float(norm.entropy())
    1.4189385332046727

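    For a sample of this size, the selection rule described in the Notes
    means that the default ``method='auto'`` currently falls back to the
    ``'ebrahimi'`` estimator (an illustration only; the rule may change in
    future versions):

    >>> res_auto = differential_entropy(values)  # method='auto' is default
    >>> res_ebrahimi = differential_entropy(values, method='ebrahimi')
    >>> bool(res_auto == res_ebrahimi)
    True
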
    For several sample sizes between 5 and 1000, compare the accuracy of
    the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods.
    Specifically, compare the root mean squared error (over 1000 trials)
    between the estimate and the true differential entropy of the
    distribution.

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>>
    >>> def rmse(res, expected):
    ...     '''Root mean squared error'''
    ...     return np.sqrt(np.mean((res - expected)**2))
    >>>
    >>> a, b = np.log10(5), np.log10(1000)
    >>> ns = np.round(np.logspace(a, b, 10)).astype(int)
    >>> reps = 1000  # number of repetitions for each sample size
    >>> expected = stats.expon.entropy()
    >>>
    >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
    >>> for method in method_errors:
    ...     for n in ns:
    ...         rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
    ...         res = stats.differential_entropy(rvs, method=method, axis=-1)
    ...         error = rmse(res, expected)
    ...         method_errors[method].append(error)
    >>>
    >>> for method, errors in method_errors.items():
    ...     plt.loglog(ns, errors, label=method)
    >>>
    >>> plt.legend()
    >>> plt.xlabel('sample size')
    >>> plt.ylabel('RMSE (1000 trials)')
    >>> plt.title('Entropy Estimator Error (Exponential Distribution)')

""" |
|
    xp = array_namespace(values)
    values = xp.asarray(values)
    if xp.isdtype(values.dtype, "integral"):
        values = xp.astype(values, xp.asarray(1.).dtype)
    values = xp_moveaxis_to_end(values, axis, xp=xp)
    n = values.shape[-1]

    if window_length is None:
        window_length = math.floor(math.sqrt(n) + 0.5)

    if not 2 <= 2 * window_length < n:
        raise ValueError(
            f"Window length ({window_length}) must be positive and less "
            f"than half the sample size ({n}).",
        )

    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")

    sorted_data = xp.sort(values, axis=-1)

    methods = {"vasicek": _vasicek_entropy,
               "van es": _van_es_entropy,
               "correa": _correa_entropy,
               "ebrahimi": _ebrahimi_entropy,
               "auto": _vasicek_entropy}
    method = method.lower()
    if method not in methods:
        message = f"`method` must be one of {set(methods)}"
        raise ValueError(message)

    if method == "auto":
        if n <= 10:
            method = 'van es'
        elif n <= 1000:
            method = 'ebrahimi'
        else:
            method = 'vasicek'

    res = methods[method](sorted_data, window_length, xp=xp)

    if base is not None:
        res /= math.log(base)

    return xp.astype(res, values.dtype)


def _pad_along_last_axis(X, m, *, xp):
    """Pad the data for computing the rolling window difference."""
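    # Repeat the smallest/largest observation m times on each side so that
    # out-of-range "order statistics" X_(i) with i < 1 or i > n are clamped
    # to X_(1) and X_(n), as required by the window-based estimators below.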
    shape = X.shape[:-1] + (m,)
    Xl = xp.broadcast_to(X[..., :1], shape)
    Xr = xp.broadcast_to(X[..., -1:], shape)
    return xp.concat((Xl, X, Xr), axis=-1)


def _vasicek_entropy(X, m, *, xp):
    """Compute the Vasicek estimator as described in [6] Eq. 1.3."""
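    # In the notation of [6], this computes
    #     HV = (1/n) * sum_{i=1..n} log(n/(2m) * (X_(i+m) - X_(i-m))),
    # with out-of-range order statistics clamped by _pad_along_last_axis.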
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m, xp=xp)
    differences = X[..., 2 * m:] - X[..., : -2 * m:]
    logs = xp.log(n/(2*m) * differences)
    return xp.mean(logs, axis=-1)


def _van_es_entropy(X, m, *, xp):
    """Compute the van Es estimator as described in [6]."""
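    # In the notation of [6], this computes
    #     HVE = 1/(n-m) * sum_{i=1..n-m} log((n+1)/m * (X_(i+m) - X_(i)))
    #           + sum_{k=m..n} 1/k + log(m/(n+1)).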
    n = X.shape[-1]
    difference = X[..., m:] - X[..., :-m]
    term1 = 1/(n-m) * xp.sum(xp.log((n+1)/m * difference), axis=-1)
    k = xp.arange(m, n+1, dtype=term1.dtype)
    return term1 + xp.sum(1/k) + math.log(m) - math.log(n+1)


def _ebrahimi_entropy(X, m, *, xp):
    """Compute the Ebrahimi estimator as described in [6]."""
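    # Same windowed log-spacing average as the Vasicek estimator, but each
    # spacing is rescaled by a coefficient ci that corrects the bias of the
    # first and last m terms (ci < 2 near the boundaries, ci = 2 elsewhere).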
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m, xp=xp)

    differences = X[..., 2 * m:] - X[..., : -2 * m:]

    i = xp.arange(1, n+1, dtype=X.dtype)
    ci = xp.ones_like(i)*2
    ci[i <= m] = 1 + (i[i <= m] - 1)/m
    ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m

    logs = xp.log(n * differences / (ci * m))
    return xp.mean(logs, axis=-1)


def _correa_entropy(X, m, *, xp):
    """Compute the Correa estimator as described in [6]."""
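    # In the notation of [6], this computes
    #     HC = -(1/n) * sum_{i=1..n} log(b_i),  with
    #     b_i = sum_j (X_(j) - Xbar_i)*(j - i)
    #           / (n * sum_j (X_(j) - Xbar_i)**2),
    # where the sums run over j = i-m..i+m and Xbar_i is the local mean.
    # `j0` is the 0-indexed version of `j` into the padded array.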
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m, xp=xp)

    i = xp.arange(1, n+1)
    dj = xp.arange(-m, m+1)[:, None]
    j = i + dj
    j0 = j + m - 1

    Xibar = xp.mean(X[..., j0], axis=-2, keepdims=True)
    difference = X[..., j0] - Xibar
    num = xp.sum(difference*dj, axis=-2)
    den = n*xp.sum(difference**2, axis=-2)
    return -xp.mean(xp.log(num/den), axis=-1)