|
"""Quasi-Monte Carlo engines and helpers.""" |
|
import copy |
|
import math |
|
import numbers |
|
import os |
|
import warnings |
|
from abc import ABC, abstractmethod |
|
from functools import partial |
|
from typing import ( |
|
ClassVar, |
|
Literal, |
|
overload, |
|
TYPE_CHECKING, |
|
) |
|
from collections.abc import Callable |
|
|
|
import numpy as np |
|
|
|
from scipy._lib._util import DecimalNumber, GeneratorType, IntNumber, SeedType |
|
|
|
if TYPE_CHECKING: |
|
import numpy.typing as npt |
|
|
|
import scipy.stats as stats |
|
from scipy._lib._util import rng_integers, _rng_spawn, _transition_to_rng |
|
from scipy.sparse.csgraph import minimum_spanning_tree |
|
from scipy.spatial import distance, Voronoi |
|
from scipy.special import gammainc |
|
from ._sobol import ( |
|
_initialize_v, _cscramble, _fill_p_cumulative, _draw, _fast_forward, |
|
_categorize, _MAXDIM |
|
) |
|
from ._qmc_cy import ( |
|
_cy_wrapper_centered_discrepancy, |
|
_cy_wrapper_wrap_around_discrepancy, |
|
_cy_wrapper_mixture_discrepancy, |
|
_cy_wrapper_l2_star_discrepancy, |
|
_cy_wrapper_update_discrepancy, |
|
_cy_van_der_corput_scrambled, |
|
_cy_van_der_corput, |
|
) |
|
|
|
|
|
__all__ = ['scale', 'discrepancy', 'geometric_discrepancy', 'update_discrepancy', |
|
'QMCEngine', 'Sobol', 'Halton', 'LatinHypercube', 'PoissonDisk', |
|
'MultinomialQMC', 'MultivariateNormalQMC'] |
|
|
|
|
|
@overload |
|
def check_random_state(seed: IntNumber | None = ...) -> np.random.Generator: |
|
... |
|
|
|
|
|
@overload |
|
def check_random_state(seed: GeneratorType) -> GeneratorType: |
|
... |
|
|
|
|
|
|
|
|
|
|
|
def check_random_state(seed=None): |
|
"""Turn `seed` into a `numpy.random.Generator` instance. |
|
|
|
Parameters |
|
---------- |
|
seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional |
|
If `seed` is an int or None, a new `numpy.random.Generator` is |
|
created using ``np.random.default_rng(seed)``. |
|
If `seed` is already a ``Generator`` or ``RandomState`` instance, then |
|
the provided instance is used. |
|
|
|
Returns |
|
------- |
|
seed : {`numpy.random.Generator`, `numpy.random.RandomState`} |
|
Random number generator. |
|
|
|
""" |
|
if seed is None or isinstance(seed, (numbers.Integral, np.integer)): |
|
return np.random.default_rng(seed) |
|
elif isinstance(seed, (np.random.RandomState, np.random.Generator)): |
|
return seed |
|
else: |
|
raise ValueError(f'{seed!r} cannot be used to seed a' |
|
' numpy.random.Generator instance') |
|
|
|
|
|
def scale( |
|
sample: "npt.ArrayLike", |
|
l_bounds: "npt.ArrayLike", |
|
u_bounds: "npt.ArrayLike", |
|
*, |
|
reverse: bool = False |
|
) -> np.ndarray: |
|
r"""Sample scaling from unit hypercube to different bounds. |
|
|
|
To convert a sample from :math:`[0, 1)` to :math:`[a, b), b>a`, |
|
with :math:`a` the lower bounds and :math:`b` the upper bounds. |
|
The following transformation is used: |
|
|
|
.. math:: |
|
|
|
(b - a) \cdot \text{sample} + a |
|
|
|
Parameters |
|
---------- |
|
sample : array_like (n, d) |
|
Sample to scale. |
|
l_bounds, u_bounds : array_like (d,) |
|
Lower and upper bounds (resp. :math:`a`, :math:`b`) of transformed |
|
data. If `reverse` is True, range of the original data to transform |
|
to the unit hypercube. |
|
reverse : bool, optional |
|
Reverse the transformation from different bounds to the unit hypercube. |
|
Default is False. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
Scaled sample. |
|
|
|
Examples |
|
-------- |
|
Transform 3 samples in the unit hypercube to bounds: |
|
|
|
>>> from scipy.stats import qmc |
|
>>> l_bounds = [-2, 0] |
|
>>> u_bounds = [6, 5] |
|
>>> sample = [[0.5 , 0.75], |
|
... [0.5 , 0.5], |
|
... [0.75, 0.25]] |
|
>>> sample_scaled = qmc.scale(sample, l_bounds, u_bounds) |
|
>>> sample_scaled |
|
array([[2. , 3.75], |
|
[2. , 2.5 ], |
|
[4. , 1.25]]) |
|
|
|
And convert back to the unit hypercube: |
|
|
|
>>> sample_ = qmc.scale(sample_scaled, l_bounds, u_bounds, reverse=True) |
|
>>> sample_ |
|
array([[0.5 , 0.75], |
|
[0.5 , 0.5 ], |
|
[0.75, 0.25]]) |
|
|
|
""" |
|
sample = np.asarray(sample) |
|
|
|
|
|
if not sample.ndim == 2: |
|
raise ValueError('Sample is not a 2D array') |
|
|
|
lower, upper = _validate_bounds( |
|
l_bounds=l_bounds, u_bounds=u_bounds, d=sample.shape[1] |
|
) |
|
|
|
if not reverse: |
|
|
|
if (sample.max() > 1.) or (sample.min() < 0.): |
|
raise ValueError('Sample is not in unit hypercube') |
|
|
|
return sample * (upper - lower) + lower |
|
else: |
|
|
|
if not (np.all(sample >= lower) and np.all(sample <= upper)): |
|
raise ValueError('Sample is out of bounds') |
|
|
|
return (sample - lower) / (upper - lower) |
|
|
|
|
|
def _ensure_in_unit_hypercube(sample: "npt.ArrayLike") -> np.ndarray: |
|
"""Ensure that sample is a 2D array and is within a unit hypercube |
|
|
|
Parameters |
|
---------- |
|
sample : array_like (n, d) |
|
A 2D array of points. |
|
|
|
Returns |
|
------- |
|
np.ndarray |
|
The array interpretation of the input sample |
|
|
|
Raises |
|
------ |
|
ValueError |
|
If the input is not a 2D array or contains points outside of |
|
a unit hypercube. |
|
""" |
|
sample = np.asarray(sample, dtype=np.float64, order="C") |
|
|
|
if not sample.ndim == 2: |
|
raise ValueError("Sample is not a 2D array") |
|
|
|
if (sample.max() > 1.) or (sample.min() < 0.): |
|
raise ValueError("Sample is not in unit hypercube") |
|
|
|
return sample |
|
|
|
|
|
def discrepancy( |
|
sample: "npt.ArrayLike", |
|
*, |
|
iterative: bool = False, |
|
method: Literal["CD", "WD", "MD", "L2-star"] = "CD", |
|
workers: IntNumber = 1) -> float: |
|
"""Discrepancy of a given sample. |
|
|
|
Parameters |
|
---------- |
|
sample : array_like (n, d) |
|
The sample to compute the discrepancy from. |
|
iterative : bool, optional |
|
Must be False if not using it for updating the discrepancy. |
|
Default is False. Refer to the notes for more details. |
|
method : str, optional |
|
Type of discrepancy, can be ``CD``, ``WD``, ``MD`` or ``L2-star``. |
|
Refer to the notes for more details. Default is ``CD``. |
|
workers : int, optional |
|
Number of workers to use for parallel processing. If -1 is given all |
|
CPU threads are used. Default is 1. |
|
|
|
Returns |
|
------- |
|
discrepancy : float |
|
Discrepancy. |
|
|
|
See Also |
|
-------- |
|
geometric_discrepancy |
|
|
|
Notes |
|
----- |
|
The discrepancy is a uniformity criterion used to assess the space filling |
|
of a number of samples in a hypercube. A discrepancy quantifies the |
|
distance between the continuous uniform distribution on a hypercube and the |
|
discrete uniform distribution on :math:`n` distinct sample points. |
|
|
|
The lower the value is, the better the coverage of the parameter space is. |
|
|
|
For a collection of subsets of the hypercube, the discrepancy is the |
|
difference between the fraction of sample points in one of those |
|
subsets and the volume of that subset. There are different definitions of |
|
discrepancy corresponding to different collections of subsets. Some |
|
versions take a root mean square difference over subsets instead of |
|
a maximum. |
|
|
|
A measure of uniformity is reasonable if it satisfies the following |
|
criteria [1]_: |
|
|
|
1. It is invariant under permuting factors and/or runs. |
|
2. It is invariant under rotation of the coordinates. |
|
3. It can measure not only uniformity of the sample over the hypercube, |
|
but also the projection uniformity of the sample over non-empty |
|
subset of lower dimension hypercubes. |
|
4. There is some reasonable geometric meaning. |
|
5. It is easy to compute. |
|
6. It satisfies the Koksma-Hlawka-like inequality. |
|
7. It is consistent with other criteria in experimental design. |
|
|
|
Four methods are available: |
|
|
|
* ``CD``: Centered Discrepancy - subspace involves a corner of the |
|
hypercube |
|
* ``WD``: Wrap-around Discrepancy - subspace can wrap around bounds |
|
* ``MD``: Mixture Discrepancy - mix between CD/WD covering more criteria |
|
* ``L2-star``: L2-star discrepancy - like CD BUT variant to rotation |
|
|
|
See [2]_ for precise definitions of each method. |
|
|
|
Lastly, using ``iterative=True``, it is possible to compute the |
|
discrepancy as if we had :math:`n+1` samples. This is useful if we want |
|
to add a point to a sampling and check the candidate which would give the |
|
lowest discrepancy. Then you could just update the discrepancy with |
|
each candidate using `update_discrepancy`. This method is faster than |
|
computing the discrepancy for a large number of candidates. |
|
|
|
References |
|
---------- |
|
.. [1] Fang et al. "Design and modeling for computer experiments". |
|
Computer Science and Data Analysis Series, 2006. |
|
.. [2] Zhou Y.-D. et al. "Mixture discrepancy for quasi-random point sets." |
|
Journal of Complexity, 29 (3-4) , pp. 283-301, 2013. |
|
.. [3] T. T. Warnock. "Computational investigations of low discrepancy |
|
point sets." Applications of Number Theory to Numerical |
|
Analysis, Academic Press, pp. 319-343, 1972. |
|
|
|
Examples |
|
-------- |
|
Calculate the quality of the sample using the discrepancy: |
|
|
|
>>> import numpy as np |
|
>>> from scipy.stats import qmc |
|
>>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]]) |
|
>>> l_bounds = [0.5, 0.5] |
|
>>> u_bounds = [6.5, 6.5] |
|
>>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True) |
|
>>> space |
|
array([[0.08333333, 0.41666667], |
|
[0.25 , 0.91666667], |
|
[0.41666667, 0.25 ], |
|
[0.58333333, 0.75 ], |
|
[0.75 , 0.08333333], |
|
[0.91666667, 0.58333333]]) |
|
>>> qmc.discrepancy(space) |
|
0.008142039609053464 |
|
|
|
We can also compute iteratively the ``CD`` discrepancy by using |
|
``iterative=True``. |
|
|
|
>>> disc_init = qmc.discrepancy(space[:-1], iterative=True) |
|
>>> disc_init |
|
0.04769081147119336 |
|
>>> qmc.update_discrepancy(space[-1], space[:-1], disc_init) |
|
0.008142039609053513 |
|
|
|
""" |
|
sample = _ensure_in_unit_hypercube(sample) |
|
|
|
workers = _validate_workers(workers) |
|
|
|
methods = { |
|
"CD": _cy_wrapper_centered_discrepancy, |
|
"WD": _cy_wrapper_wrap_around_discrepancy, |
|
"MD": _cy_wrapper_mixture_discrepancy, |
|
"L2-star": _cy_wrapper_l2_star_discrepancy, |
|
} |
|
|
|
if method in methods: |
|
return methods[method](sample, iterative, workers=workers) |
|
else: |
|
raise ValueError(f"{method!r} is not a valid method. It must be one of" |
|
f" {set(methods)!r}") |
|
|
|
|
|
def geometric_discrepancy( |
|
sample: "npt.ArrayLike", |
|
method: Literal["mindist", "mst"] = "mindist", |
|
metric: str = "euclidean") -> float: |
|
"""Discrepancy of a given sample based on its geometric properties. |
|
|
|
Parameters |
|
---------- |
|
sample : array_like (n, d) |
|
The sample to compute the discrepancy from. |
|
method : {"mindist", "mst"}, optional |
|
The method to use. One of ``mindist`` for minimum distance (default) |
|
or ``mst`` for minimum spanning tree. |
|
metric : str or callable, optional |
|
The distance metric to use. See the documentation |
|
for `scipy.spatial.distance.pdist` for the available metrics and |
|
the default. |
|
|
|
Returns |
|
------- |
|
discrepancy : float |
|
Discrepancy (higher values correspond to greater sample uniformity). |
|
|
|
See Also |
|
-------- |
|
discrepancy |
|
|
|
Notes |
|
----- |
|
The discrepancy can serve as a simple measure of quality of a random sample. |
|
This measure is based on the geometric properties of the distribution of points |
|
in the sample, such as the minimum distance between any pair of points, or |
|
the mean edge length in a minimum spanning tree. |
|
|
|
The higher the value is, the better the coverage of the parameter space is. |
|
Note that this is different from `scipy.stats.qmc.discrepancy`, where lower |
|
values correspond to higher quality of the sample. |
|
|
|
Also note that when comparing different sampling strategies using this function, |
|
the sample size must be kept constant. |
|
|
|
It is possible to calculate two metrics from the minimum spanning tree: |
|
the mean edge length and the standard deviation of edges lengths. Using |
|
both metrics offers a better picture of uniformity than either metric alone, |
|
with higher mean and lower standard deviation being preferable (see [1]_ |
|
for a brief discussion). This function currently only calculates the mean |
|
edge length. |
|
|
|
References |
|
---------- |
|
.. [1] Franco J. et al. "Minimum Spanning Tree: A new approach to assess the quality |
|
of the design of computer experiments." Chemometrics and Intelligent Laboratory |
|
Systems, 97 (2), pp. 164-169, 2009. |
|
|
|
Examples |
|
-------- |
|
Calculate the quality of the sample using the minimum euclidean distance |
|
(the defaults): |
|
|
|
>>> import numpy as np |
|
>>> from scipy.stats import qmc |
|
>>> rng = np.random.default_rng(191468432622931918890291693003068437394) |
|
>>> sample = qmc.LatinHypercube(d=2, rng=rng).random(50) |
|
>>> qmc.geometric_discrepancy(sample) |
|
0.03708161435687876 |
|
|
|
Calculate the quality using the mean edge length in the minimum |
|
spanning tree: |
|
|
|
>>> qmc.geometric_discrepancy(sample, method='mst') |
|
0.1105149978798376 |
|
|
|
Display the minimum spanning tree and the points with |
|
the smallest distance: |
|
|
|
>>> import matplotlib.pyplot as plt |
|
>>> from matplotlib.lines import Line2D |
|
>>> from scipy.sparse.csgraph import minimum_spanning_tree |
|
>>> from scipy.spatial.distance import pdist, squareform |
|
>>> dist = pdist(sample) |
|
>>> mst = minimum_spanning_tree(squareform(dist)) |
|
>>> edges = np.where(mst.toarray() > 0) |
|
>>> edges = np.asarray(edges).T |
|
>>> min_dist = np.min(dist) |
|
>>> min_idx = np.argwhere(squareform(dist) == min_dist)[0] |
|
>>> fig, ax = plt.subplots(figsize=(10, 5)) |
|
>>> _ = ax.set(aspect='equal', xlabel=r'$x_1$', ylabel=r'$x_2$', |
|
... xlim=[0, 1], ylim=[0, 1]) |
|
>>> for edge in edges: |
|
... ax.plot(sample[edge, 0], sample[edge, 1], c='k') |
|
>>> ax.scatter(sample[:, 0], sample[:, 1]) |
|
>>> ax.add_patch(plt.Circle(sample[min_idx[0]], min_dist, color='red', fill=False)) |
|
>>> markers = [ |
|
... Line2D([0], [0], marker='o', lw=0, label='Sample points'), |
|
... Line2D([0], [0], color='k', label='Minimum spanning tree'), |
|
... Line2D([0], [0], marker='o', lw=0, markerfacecolor='w', markeredgecolor='r', |
|
... label='Minimum point-to-point distance'), |
|
... ] |
|
>>> ax.legend(handles=markers, loc='center left', bbox_to_anchor=(1, 0.5)); |
|
>>> plt.show() |
|
|
|
""" |
|
sample = _ensure_in_unit_hypercube(sample) |
|
if sample.shape[0] < 2: |
|
raise ValueError("Sample must contain at least two points") |
|
|
|
distances = distance.pdist(sample, metric=metric) |
|
|
|
if np.any(distances == 0.0): |
|
warnings.warn("Sample contains duplicate points.", stacklevel=2) |
|
|
|
if method == "mindist": |
|
return np.min(distances[distances.nonzero()]) |
|
elif method == "mst": |
|
fully_connected_graph = distance.squareform(distances) |
|
mst = minimum_spanning_tree(fully_connected_graph) |
|
distances = mst[mst.nonzero()] |
|
|
|
|
|
return np.mean(distances) |
|
else: |
|
raise ValueError(f"{method!r} is not a valid method. " |
|
f"It must be one of {{'mindist', 'mst'}}") |
|
|
|
|
|
def update_discrepancy( |
|
x_new: "npt.ArrayLike", |
|
sample: "npt.ArrayLike", |
|
initial_disc: DecimalNumber) -> float: |
|
"""Update the centered discrepancy with a new sample. |
|
|
|
Parameters |
|
---------- |
|
x_new : array_like (1, d) |
|
The new sample to add in `sample`. |
|
sample : array_like (n, d) |
|
The initial sample. |
|
initial_disc : float |
|
Centered discrepancy of the `sample`. |
|
|
|
Returns |
|
------- |
|
discrepancy : float |
|
Centered discrepancy of the sample composed of `x_new` and `sample`. |
|
|
|
Examples |
|
-------- |
|
We can also compute iteratively the discrepancy by using |
|
``iterative=True``. |
|
|
|
>>> import numpy as np |
|
>>> from scipy.stats import qmc |
|
>>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]]) |
|
>>> l_bounds = [0.5, 0.5] |
|
>>> u_bounds = [6.5, 6.5] |
|
>>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True) |
|
>>> disc_init = qmc.discrepancy(space[:-1], iterative=True) |
|
>>> disc_init |
|
0.04769081147119336 |
|
>>> qmc.update_discrepancy(space[-1], space[:-1], disc_init) |
|
0.008142039609053513 |
|
|
|
""" |
|
sample = np.asarray(sample, dtype=np.float64, order="C") |
|
x_new = np.asarray(x_new, dtype=np.float64, order="C") |
|
|
|
|
|
if not sample.ndim == 2: |
|
raise ValueError('Sample is not a 2D array') |
|
|
|
if (sample.max() > 1.) or (sample.min() < 0.): |
|
raise ValueError('Sample is not in unit hypercube') |
|
|
|
|
|
if not x_new.ndim == 1: |
|
raise ValueError('x_new is not a 1D array') |
|
|
|
if not (np.all(x_new >= 0) and np.all(x_new <= 1)): |
|
raise ValueError('x_new is not in unit hypercube') |
|
|
|
if x_new.shape[0] != sample.shape[1]: |
|
raise ValueError("x_new and sample must be broadcastable") |
|
|
|
return _cy_wrapper_update_discrepancy(x_new, sample, initial_disc) |
|
|
|
|
|
def _perturb_discrepancy(sample: np.ndarray, i1: int, i2: int, k: int, |
|
disc: float): |
|
"""Centered discrepancy after an elementary perturbation of a LHS. |
|
|
|
An elementary perturbation consists of an exchange of coordinates between |
|
two points: ``sample[i1, k] <-> sample[i2, k]``. By construction, |
|
this operation conserves the LHS properties. |
|
|
|
Parameters |
|
---------- |
|
sample : array_like (n, d) |
|
The sample (before permutation) to compute the discrepancy from. |
|
i1 : int |
|
The first line of the elementary permutation. |
|
i2 : int |
|
The second line of the elementary permutation. |
|
k : int |
|
The column of the elementary permutation. |
|
disc : float |
|
Centered discrepancy of the design before permutation. |
|
|
|
Returns |
|
------- |
|
discrepancy : float |
|
Centered discrepancy of the design after permutation. |
|
|
|
References |
|
---------- |
|
.. [1] Jin et al. "An efficient algorithm for constructing optimal design |
|
of computer experiments", Journal of Statistical Planning and |
|
Inference, 2005. |
|
|
|
""" |
|
n = sample.shape[0] |
|
|
|
z_ij = sample - 0.5 |
|
|
|
|
|
c_i1j = (1. / n ** 2. |
|
* np.prod(0.5 * (2. + abs(z_ij[i1, :]) |
|
+ abs(z_ij) - abs(z_ij[i1, :] - z_ij)), axis=1)) |
|
c_i2j = (1. / n ** 2. |
|
* np.prod(0.5 * (2. + abs(z_ij[i2, :]) |
|
+ abs(z_ij) - abs(z_ij[i2, :] - z_ij)), axis=1)) |
|
|
|
|
|
c_i1i1 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i1, :])) |
|
- 2. / n * np.prod(1. + 0.5 * abs(z_ij[i1, :]) |
|
- 0.5 * z_ij[i1, :] ** 2)) |
|
c_i2i2 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i2, :])) |
|
- 2. / n * np.prod(1. + 0.5 * abs(z_ij[i2, :]) |
|
- 0.5 * z_ij[i2, :] ** 2)) |
|
|
|
|
|
num = (2 + abs(z_ij[i2, k]) + abs(z_ij[:, k]) |
|
- abs(z_ij[i2, k] - z_ij[:, k])) |
|
denum = (2 + abs(z_ij[i1, k]) + abs(z_ij[:, k]) |
|
- abs(z_ij[i1, k] - z_ij[:, k])) |
|
gamma = num / denum |
|
|
|
|
|
c_p_i1j = gamma * c_i1j |
|
|
|
c_p_i2j = c_i2j / gamma |
|
|
|
alpha = (1 + abs(z_ij[i2, k])) / (1 + abs(z_ij[i1, k])) |
|
beta = (2 - abs(z_ij[i2, k])) / (2 - abs(z_ij[i1, k])) |
|
|
|
g_i1 = np.prod(1. + abs(z_ij[i1, :])) |
|
g_i2 = np.prod(1. + abs(z_ij[i2, :])) |
|
h_i1 = np.prod(1. + 0.5 * abs(z_ij[i1, :]) - 0.5 * (z_ij[i1, :] ** 2)) |
|
h_i2 = np.prod(1. + 0.5 * abs(z_ij[i2, :]) - 0.5 * (z_ij[i2, :] ** 2)) |
|
|
|
|
|
c_p_i1i1 = ((g_i1 * alpha) / (n ** 2) - 2. * alpha * beta * h_i1 / n) |
|
|
|
c_p_i2i2 = ((g_i2 / ((n ** 2) * alpha)) - (2. * h_i2 / (n * alpha * beta))) |
|
|
|
|
|
sum_ = c_p_i1j - c_i1j + c_p_i2j - c_i2j |
|
|
|
mask = np.ones(n, dtype=bool) |
|
mask[[i1, i2]] = False |
|
sum_ = sum(sum_[mask]) |
|
|
|
disc_ep = (disc + c_p_i1i1 - c_i1i1 + c_p_i2i2 - c_i2i2 + 2 * sum_) |
|
|
|
return disc_ep |
|
|
|
|
|
def primes_from_2_to(n: int) -> np.ndarray: |
|
"""Prime numbers from 2 to *n*. |
|
|
|
Parameters |
|
---------- |
|
n : int |
|
Sup bound with ``n >= 6``. |
|
|
|
Returns |
|
------- |
|
primes : list(int) |
|
Primes in ``2 <= p < n``. |
|
|
|
Notes |
|
----- |
|
Taken from [1]_ by P.T. Roy, written consent given on 23.04.2021 |
|
by the original author, Bruno Astrolino, for free use in SciPy under |
|
the 3-clause BSD. |
|
|
|
References |
|
---------- |
|
.. [1] `StackOverflow <https://stackoverflow.com/questions/2068372>`_. |
|
|
|
""" |
|
sieve = np.ones(n // 3 + (n % 6 == 2), dtype=bool) |
|
for i in range(1, int(n ** 0.5) // 3 + 1): |
|
k = 3 * i + 1 | 1 |
|
sieve[k * k // 3::2 * k] = False |
|
sieve[k * (k - 2 * (i & 1) + 4) // 3::2 * k] = False |
|
return np.r_[2, 3, ((3 * np.nonzero(sieve)[0][1:] + 1) | 1)] |
|
|
|
|
|
def n_primes(n: IntNumber) -> list[int]: |
|
"""List of the n-first prime numbers. |
|
|
|
Parameters |
|
---------- |
|
n : int |
|
Number of prime numbers wanted. |
|
|
|
Returns |
|
------- |
|
primes : list(int) |
|
List of primes. |
|
|
|
""" |
|
primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, |
|
61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, |
|
131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, |
|
197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, |
|
271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, |
|
353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, |
|
433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, |
|
509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, |
|
601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673, |
|
677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761, |
|
769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857, |
|
859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947, |
|
953, 967, 971, 977, 983, 991, 997][:n] |
|
|
|
if len(primes) < n: |
|
big_number = 2000 |
|
while 'Not enough primes': |
|
primes = primes_from_2_to(big_number)[:n] |
|
if len(primes) == n: |
|
break |
|
big_number += 1000 |
|
|
|
return primes |
|
|
|
|
|
def _van_der_corput_permutations( |
|
base: IntNumber, *, rng: SeedType = None |
|
) -> np.ndarray: |
|
"""Permutations for scrambling a Van der Corput sequence. |
|
|
|
Parameters |
|
---------- |
|
base : int |
|
Base of the sequence. |
|
rng : `numpy.random.Generator`, optional |
|
Pseudorandom number generator state. When `rng` is None, a new |
|
`numpy.random.Generator` is created using entropy from the |
|
operating system. Types other than `numpy.random.Generator` are |
|
passed to `numpy.random.default_rng` to instantiate a ``Generator``. |
|
|
|
.. versionchanged:: 1.15.0 |
|
|
|
As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_ |
|
transition from use of `numpy.random.RandomState` to |
|
`numpy.random.Generator`, this keyword was changed from `seed` to |
|
`rng`. During the transition, the behavior documented above is not |
|
accurate; see `check_random_state` for actual behavior. After the |
|
transition, this admonition can be removed. |
|
|
|
Returns |
|
------- |
|
permutations : array_like |
|
Permutation indices. |
|
|
|
Notes |
|
----- |
|
In Algorithm 1 of Owen 2017, a permutation of `np.arange(base)` is |
|
created for each positive integer `k` such that ``1 - base**-k < 1`` |
|
using floating-point arithmetic. For double precision floats, the |
|
condition ``1 - base**-k < 1`` can also be written as ``base**-k > |
|
2**-54``, which makes it more apparent how many permutations we need |
|
to create. |
|
""" |
|
rng = check_random_state(rng) |
|
count = math.ceil(54 / math.log2(base)) - 1 |
|
permutations = np.repeat(np.arange(base)[None], count, axis=0) |
|
for perm in permutations: |
|
rng.shuffle(perm) |
|
|
|
return permutations |
|
|
|
|
|
def van_der_corput( |
|
n: IntNumber, |
|
base: IntNumber = 2, |
|
*, |
|
start_index: IntNumber = 0, |
|
scramble: bool = False, |
|
permutations: "npt.ArrayLike | None" = None, |
|
rng: SeedType = None, |
|
workers: IntNumber = 1) -> np.ndarray: |
|
"""Van der Corput sequence. |
|
|
|
Pseudo-random number generator based on a b-adic expansion. |
|
|
|
Scrambling uses permutations of the remainders (see [1]_). Multiple |
|
permutations are applied to construct a point. The sequence of |
|
permutations has to be the same for all points of the sequence. |
|
|
|
Parameters |
|
---------- |
|
n : int |
|
Number of element of the sequence. |
|
base : int, optional |
|
Base of the sequence. Default is 2. |
|
start_index : int, optional |
|
Index to start the sequence from. Default is 0. |
|
scramble : bool, optional |
|
If True, use Owen scrambling. Otherwise no scrambling is done. |
|
Default is True. |
|
permutations : array_like, optional |
|
Permutations used for scrambling. |
|
rng : `numpy.random.Generator`, optional |
|
Pseudorandom number generator state. When `rng` is None, a new |
|
`numpy.random.Generator` is created using entropy from the |
|
operating system. Types other than `numpy.random.Generator` are |
|
passed to `numpy.random.default_rng` to instantiate a ``Generator``. |
|
workers : int, optional |
|
Number of workers to use for parallel processing. If -1 is |
|
given all CPU threads are used. Default is 1. |
|
|
|
Returns |
|
------- |
|
sequence : list (n,) |
|
Sequence of Van der Corput. |
|
|
|
References |
|
---------- |
|
.. [1] A. B. Owen. "A randomized Halton algorithm in R", |
|
:arxiv:`1706.02808`, 2017. |
|
|
|
""" |
|
if base < 2: |
|
raise ValueError("'base' must be at least 2") |
|
|
|
if scramble: |
|
if permutations is None: |
|
permutations = _van_der_corput_permutations( |
|
base=base, rng=rng |
|
) |
|
else: |
|
permutations = np.asarray(permutations) |
|
|
|
permutations = permutations.astype(np.int64) |
|
return _cy_van_der_corput_scrambled(n, base, start_index, |
|
permutations, workers) |
|
|
|
else: |
|
return _cy_van_der_corput(n, base, start_index, workers) |
|
|
|
|
|
class QMCEngine(ABC): |
|
"""A generic Quasi-Monte Carlo sampler class meant for subclassing. |
|
|
|
QMCEngine is a base class to construct a specific Quasi-Monte Carlo |
|
sampler. It cannot be used directly as a sampler. |
|
|
|
Parameters |
|
---------- |
|
d : int |
|
Dimension of the parameter space. |
|
optimization : {None, "random-cd", "lloyd"}, optional |
|
Whether to use an optimization scheme to improve the quality after |
|
sampling. Note that this is a post-processing step that does not |
|
guarantee that all properties of the sample will be conserved. |
|
Default is None. |
|
|
|
* ``random-cd``: random permutations of coordinates to lower the |
|
centered discrepancy. The best sample based on the centered |
|
discrepancy is constantly updated. Centered discrepancy-based |
|
sampling shows better space-filling robustness toward 2D and 3D |
|
subprojections compared to using other discrepancy measures. |
|
* ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm. |
|
The process converges to equally spaced samples. |
|
|
|
.. versionadded:: 1.10.0 |
|
|
|
rng : `numpy.random.Generator`, optional |
|
Pseudorandom number generator state. When `rng` is None, a new |
|
`numpy.random.Generator` is created using entropy from the |
|
operating system. Types other than `numpy.random.Generator` are |
|
passed to `numpy.random.default_rng` to instantiate a ``Generator``. |
|
|
|
.. versionchanged:: 1.15.0 |
|
|
|
As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_ |
|
transition from use of `numpy.random.RandomState` to |
|
`numpy.random.Generator`, this keyword was changed from `seed` to |
|
`rng`. For an interim period, both keywords will continue to work, although |
|
only one may be specified at a time. After the interim period, function |
|
calls using the `seed` keyword will emit warnings. Following a |
|
deprecation period, the `seed` keyword will be removed. |
|
|
|
Notes |
|
----- |
|
By convention samples are distributed over the half-open interval |
|
``[0, 1)``. Instances of the class can access the attributes: ``d`` for |
|
the dimension; and ``rng`` for the random number generator. |
|
|
|
**Subclassing** |
|
|
|
When subclassing `QMCEngine` to create a new sampler, ``__init__`` and |
|
``random`` must be redefined. |
|
|
|
* ``__init__(d, rng=None)``: at least fix the dimension. If the sampler |
|
does not take advantage of a ``rng`` (deterministic methods like |
|
Halton), this parameter can be omitted. |
|
* ``_random(n, *, workers=1)``: draw ``n`` from the engine. ``workers`` |
|
is used for parallelism. See `Halton` for example. |
|
|
|
Optionally, two other methods can be overwritten by subclasses: |
|
|
|
* ``reset``: Reset the engine to its original state. |
|
* ``fast_forward``: If the sequence is deterministic (like Halton |
|
sequence), then ``fast_forward(n)`` is skipping the ``n`` first draw. |
|
|
|
Examples |
|
-------- |
|
To create a random sampler based on ``np.random.random``, we would do the |
|
following: |
|
|
|
>>> from scipy.stats import qmc |
|
>>> class RandomEngine(qmc.QMCEngine): |
|
... def __init__(self, d, rng=None): |
|
... super().__init__(d=d, rng=rng) |
|
... |
|
... |
|
... def _random(self, n=1, *, workers=1): |
|
... return self.rng.random((n, self.d)) |
|
... |
|
... |
|
... def reset(self): |
|
... super().__init__(d=self.d, rng=self.rng_seed) |
|
... return self |
|
... |
|
... |
|
... def fast_forward(self, n): |
|
... self.random(n) |
|
... return self |
|
|
|
After subclassing `QMCEngine` to define the sampling strategy we want to |
|
use, we can create an instance to sample from. |
|
|
|
>>> engine = RandomEngine(2) |
|
>>> engine.random(5) |
|
array([[0.22733602, 0.31675834], # random |
|
[0.79736546, 0.67625467], |
|
[0.39110955, 0.33281393], |
|
[0.59830875, 0.18673419], |
|
[0.67275604, 0.94180287]]) |
|
|
|
We can also reset the state of the generator and resample again. |
|
|
|
>>> _ = engine.reset() |
|
>>> engine.random(5) |
|
array([[0.22733602, 0.31675834], # random |
|
[0.79736546, 0.67625467], |
|
[0.39110955, 0.33281393], |
|
[0.59830875, 0.18673419], |
|
[0.67275604, 0.94180287]]) |
|
|
|
""" |
|
|
|
@abstractmethod |
|
@_transition_to_rng('seed', replace_doc=False) |
|
def __init__( |
|
self, |
|
d: IntNumber, |
|
*, |
|
optimization: Literal["random-cd", "lloyd"] | None = None, |
|
rng: SeedType = None |
|
) -> None: |
|
self._initialize(d, optimization=optimization, rng=rng) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _initialize( |
|
self, |
|
d: IntNumber, |
|
*, |
|
optimization: Literal["random-cd", "lloyd"] | None = None, |
|
rng: SeedType = None |
|
) -> None: |
|
if not np.issubdtype(type(d), np.integer) or d < 0: |
|
raise ValueError('d must be a non-negative integer value') |
|
|
|
self.d = d |
|
|
|
if isinstance(rng, np.random.Generator): |
|
|
|
self.rng = _rng_spawn(rng, 1)[0] |
|
else: |
|
|
|
|
|
self.rng = check_random_state(rng) |
|
self.rng_seed = copy.deepcopy(self.rng) |
|
|
|
self.num_generated = 0 |
|
|
|
config = { |
|
|
|
"n_nochange": 100, |
|
"n_iters": 10_000, |
|
"rng": self.rng, |
|
|
|
|
|
"tol": 1e-5, |
|
"maxiter": 10, |
|
"qhull_options": None, |
|
} |
|
self._optimization = optimization |
|
self.optimization_method = _select_optimizer(optimization, config) |
|
|
|
@abstractmethod |
|
def _random( |
|
self, n: IntNumber = 1, *, workers: IntNumber = 1 |
|
) -> np.ndarray: |
|
... |
|
|
|
def random( |
|
self, n: IntNumber = 1, *, workers: IntNumber = 1 |
|
) -> np.ndarray: |
|
"""Draw `n` in the half-open interval ``[0, 1)``. |
|
|
|
Parameters |
|
---------- |
|
n : int, optional |
|
Number of samples to generate in the parameter space. |
|
Default is 1. |
|
workers : int, optional |
|
Only supported with `Halton`. |
|
Number of workers to use for parallel processing. If -1 is |
|
given all CPU threads are used. Default is 1. It becomes faster |
|
than one worker for `n` greater than :math:`10^3`. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
QMC sample. |
|
|
|
""" |
|
sample = self._random(n, workers=workers) |
|
if self.optimization_method is not None: |
|
sample = self.optimization_method(sample) |
|
|
|
self.num_generated += n |
|
return sample |
|
|
|
def integers( |
|
self, |
|
l_bounds: "npt.ArrayLike", |
|
*, |
|
u_bounds: "npt.ArrayLike | None" = None, |
|
n: IntNumber = 1, |
|
endpoint: bool = False, |
|
workers: IntNumber = 1 |
|
) -> np.ndarray: |
|
r""" |
|
Draw `n` integers from `l_bounds` (inclusive) to `u_bounds` |
|
(exclusive), or if endpoint=True, `l_bounds` (inclusive) to |
|
`u_bounds` (inclusive). |
|
|
|
Parameters |
|
---------- |
|
l_bounds : int or array-like of ints |
|
Lowest (signed) integers to be drawn (unless ``u_bounds=None``, |
|
in which case this parameter is 0 and this value is used for |
|
`u_bounds`). |
|
u_bounds : int or array-like of ints, optional |
|
If provided, one above the largest (signed) integer to be drawn |
|
(see above for behavior if ``u_bounds=None``). |
|
If array-like, must contain integer values. |
|
n : int, optional |
|
Number of samples to generate in the parameter space. |
|
Default is 1. |
|
endpoint : bool, optional |
|
If true, sample from the interval ``[l_bounds, u_bounds]`` instead |
|
of the default ``[l_bounds, u_bounds)``. Defaults is False. |
|
workers : int, optional |
|
Number of workers to use for parallel processing. If -1 is |
|
given all CPU threads are used. Only supported when using `Halton` |
|
Default is 1. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
QMC sample. |
|
|
|
Notes |
|
----- |
|
It is safe to just use the same ``[0, 1)`` to integer mapping |
|
with QMC that you would use with MC. You still get unbiasedness, |
|
a strong law of large numbers, an asymptotically infinite variance |
|
reduction and a finite sample variance bound. |
|
|
|
To convert a sample from :math:`[0, 1)` to :math:`[a, b), b>a`, |
|
with :math:`a` the lower bounds and :math:`b` the upper bounds, |
|
the following transformation is used: |
|
|
|
.. math:: |
|
|
|
\text{floor}((b - a) \cdot \text{sample} + a) |
|
|
|
""" |
|
if u_bounds is None: |
|
u_bounds = l_bounds |
|
l_bounds = 0 |
|
|
|
u_bounds = np.atleast_1d(u_bounds) |
|
l_bounds = np.atleast_1d(l_bounds) |
|
|
|
if endpoint: |
|
u_bounds = u_bounds + 1 |
|
|
|
if (not np.issubdtype(l_bounds.dtype, np.integer) or |
|
not np.issubdtype(u_bounds.dtype, np.integer)): |
|
message = ("'u_bounds' and 'l_bounds' must be integers or" |
|
" array-like of integers") |
|
raise ValueError(message) |
|
|
|
if isinstance(self, Halton): |
|
sample = self.random(n=n, workers=workers) |
|
else: |
|
sample = self.random(n=n) |
|
|
|
sample = scale(sample, l_bounds=l_bounds, u_bounds=u_bounds) |
|
sample = np.floor(sample).astype(np.int64) |
|
|
|
return sample |
|
|
|
def reset(self) -> "QMCEngine": |
|
"""Reset the engine to base state. |
|
|
|
Returns |
|
------- |
|
engine : QMCEngine |
|
Engine reset to its base state. |
|
|
|
""" |
|
rng = copy.deepcopy(self.rng_seed) |
|
self.rng = check_random_state(rng) |
|
self.num_generated = 0 |
|
return self |
|
|
|
def fast_forward(self, n: IntNumber) -> "QMCEngine": |
|
"""Fast-forward the sequence by `n` positions. |
|
|
|
Parameters |
|
---------- |
|
n : int |
|
Number of points to skip in the sequence. |
|
|
|
Returns |
|
------- |
|
engine : QMCEngine |
|
Engine reset to its base state. |
|
|
|
""" |
|
self.random(n=n) |
|
return self |
|
|
|
|
|
class Halton(QMCEngine): |
|
"""Halton sequence. |
|
|
|
Pseudo-random number generator that generalize the Van der Corput sequence |
|
for multiple dimensions. The Halton sequence uses the base-two Van der |
|
Corput sequence for the first dimension, base-three for its second and |
|
base-:math:`n` for its n-dimension. |
|
|
|
Parameters |
|
---------- |
|
d : int |
|
Dimension of the parameter space. |
|
scramble : bool, optional |
|
If True, use Owen scrambling. Otherwise no scrambling is done. |
|
Default is True. |
|
optimization : {None, "random-cd", "lloyd"}, optional |
|
Whether to use an optimization scheme to improve the quality after |
|
sampling. Note that this is a post-processing step that does not |
|
guarantee that all properties of the sample will be conserved. |
|
Default is None. |
|
|
|
* ``random-cd``: random permutations of coordinates to lower the |
|
centered discrepancy. The best sample based on the centered |
|
discrepancy is constantly updated. Centered discrepancy-based |
|
sampling shows better space-filling robustness toward 2D and 3D |
|
subprojections compared to using other discrepancy measures. |
|
* ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm. |
|
The process converges to equally spaced samples. |
|
|
|
.. versionadded:: 1.10.0 |
|
|
|
rng : `numpy.random.Generator`, optional |
|
Pseudorandom number generator state. When `rng` is None, a new |
|
`numpy.random.Generator` is created using entropy from the |
|
operating system. Types other than `numpy.random.Generator` are |
|
passed to `numpy.random.default_rng` to instantiate a ``Generator``. |
|
|
|
.. versionchanged:: 1.15.0 |
|
|
|
As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_ |
|
transition from use of `numpy.random.RandomState` to |
|
`numpy.random.Generator`, this keyword was changed from `seed` to |
|
`rng`. For an interim period, both keywords will continue to work, although |
|
only one may be specified at a time. After the interim period, function |
|
calls using the `seed` keyword will emit warnings. Following a |
|
deprecation period, the `seed` keyword will be removed. |
|
|
|
Notes |
|
----- |
|
The Halton sequence has severe striping artifacts for even modestly |
|
large dimensions. These can be ameliorated by scrambling. Scrambling |
|
also supports replication-based error estimates and extends |
|
applicability to unbounded integrands. |
|
|
|
References |
|
---------- |
|
.. [1] Halton, "On the efficiency of certain quasi-random sequences of |
|
points in evaluating multi-dimensional integrals", Numerische |
|
Mathematik, 1960. |
|
.. [2] A. B. Owen. "A randomized Halton algorithm in R", |
|
:arxiv:`1706.02808`, 2017. |
|
|
|
Examples |
|
-------- |
|
Generate samples from a low discrepancy sequence of Halton. |
|
|
|
>>> from scipy.stats import qmc |
|
>>> sampler = qmc.Halton(d=2, scramble=False) |
|
>>> sample = sampler.random(n=5) |
|
>>> sample |
|
array([[0. , 0. ], |
|
[0.5 , 0.33333333], |
|
[0.25 , 0.66666667], |
|
[0.75 , 0.11111111], |
|
[0.125 , 0.44444444]]) |
|
|
|
Compute the quality of the sample using the discrepancy criterion. |
|
|
|
>>> qmc.discrepancy(sample) |
|
0.088893711419753 |
|
|
|
If some wants to continue an existing design, extra points can be obtained |
|
by calling again `random`. Alternatively, you can skip some points like: |
|
|
|
>>> _ = sampler.fast_forward(5) |
|
>>> sample_continued = sampler.random(n=5) |
|
>>> sample_continued |
|
array([[0.3125 , 0.37037037], |
|
[0.8125 , 0.7037037 ], |
|
[0.1875 , 0.14814815], |
|
[0.6875 , 0.48148148], |
|
[0.4375 , 0.81481481]]) |
|
|
|
Finally, samples can be scaled to bounds. |
|
|
|
>>> l_bounds = [0, 2] |
|
>>> u_bounds = [10, 5] |
|
>>> qmc.scale(sample_continued, l_bounds, u_bounds) |
|
array([[3.125 , 3.11111111], |
|
[8.125 , 4.11111111], |
|
[1.875 , 2.44444444], |
|
[6.875 , 3.44444444], |
|
[4.375 , 4.44444444]]) |
|
|
|
""" |
|
@_transition_to_rng('seed', replace_doc=False) |
|
def __init__( |
|
self, d: IntNumber, *, scramble: bool = True, |
|
optimization: Literal["random-cd", "lloyd"] | None = None, |
|
rng: SeedType = None |
|
) -> None: |
|
|
|
self._init_quad = {'d': d, 'scramble': True, |
|
'optimization': optimization} |
|
super()._initialize(d=d, optimization=optimization, rng=rng) |
|
|
|
|
|
self.base = [int(bdim) for bdim in n_primes(d)] |
|
self.scramble = scramble |
|
|
|
self._initialize_permutations() |
|
|
|
def _initialize_permutations(self) -> None: |
|
"""Initialize permutations for all Van der Corput sequences. |
|
|
|
Permutations are only needed for scrambling. |
|
""" |
|
self._permutations: list = [None] * len(self.base) |
|
if self.scramble: |
|
for i, bdim in enumerate(self.base): |
|
permutations = _van_der_corput_permutations( |
|
base=bdim, rng=self.rng |
|
) |
|
|
|
self._permutations[i] = permutations |
|
|
|
def _random( |
|
self, n: IntNumber = 1, *, workers: IntNumber = 1 |
|
) -> np.ndarray: |
|
"""Draw `n` in the half-open interval ``[0, 1)``. |
|
|
|
Parameters |
|
---------- |
|
n : int, optional |
|
Number of samples to generate in the parameter space. Default is 1. |
|
workers : int, optional |
|
Number of workers to use for parallel processing. If -1 is |
|
given all CPU threads are used. Default is 1. It becomes faster |
|
than one worker for `n` greater than :math:`10^3`. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
QMC sample. |
|
|
|
""" |
|
workers = _validate_workers(workers) |
|
|
|
sample = [van_der_corput(n, bdim, start_index=self.num_generated, |
|
scramble=self.scramble, |
|
permutations=self._permutations[i], |
|
workers=workers) |
|
for i, bdim in enumerate(self.base)] |
|
|
|
return np.array(sample).T.reshape(n, self.d) |
|
|
|
|
|
class LatinHypercube(QMCEngine): |
|
r"""Latin hypercube sampling (LHS). |
|
|
|
A Latin hypercube sample [1]_ generates :math:`n` points in |
|
:math:`[0,1)^{d}`. Each univariate marginal distribution is stratified, |
|
placing exactly one point in :math:`[j/n, (j+1)/n)` for |
|
:math:`j=0,1,...,n-1`. They are still applicable when :math:`n << d`. |
|
|
|
Parameters |
|
---------- |
|
d : int |
|
Dimension of the parameter space. |
|
scramble : bool, optional |
|
When False, center samples within cells of a multi-dimensional grid. |
|
Otherwise, samples are randomly placed within cells of the grid. |
|
|
|
.. note:: |
|
Setting ``scramble=False`` does not ensure deterministic output. |
|
For that, use the `rng` parameter. |
|
|
|
Default is True. |
|
|
|
.. versionadded:: 1.10.0 |
|
|
|
optimization : {None, "random-cd", "lloyd"}, optional |
|
Whether to use an optimization scheme to improve the quality after |
|
sampling. Note that this is a post-processing step that does not |
|
guarantee that all properties of the sample will be conserved. |
|
Default is None. |
|
|
|
* ``random-cd``: random permutations of coordinates to lower the |
|
centered discrepancy. The best sample based on the centered |
|
discrepancy is constantly updated. Centered discrepancy-based |
|
sampling shows better space-filling robustness toward 2D and 3D |
|
subprojections compared to using other discrepancy measures. |
|
* ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm. |
|
The process converges to equally spaced samples. |
|
|
|
.. versionadded:: 1.8.0 |
|
.. versionchanged:: 1.10.0 |
|
Add ``lloyd``. |
|
|
|
strength : {1, 2}, optional |
|
Strength of the LHS. ``strength=1`` produces a plain LHS while |
|
``strength=2`` produces an orthogonal array based LHS of strength 2 |
|
[7]_, [8]_. In that case, only ``n=p**2`` points can be sampled, |
|
with ``p`` a prime number. It also constrains ``d <= p + 1``. |
|
Default is 1. |
|
|
|
.. versionadded:: 1.8.0 |
|
|
|
rng : `numpy.random.Generator`, optional |
|
Pseudorandom number generator state. When `rng` is None, a new |
|
`numpy.random.Generator` is created using entropy from the |
|
operating system. Types other than `numpy.random.Generator` are |
|
passed to `numpy.random.default_rng` to instantiate a ``Generator``. |
|
|
|
.. versionchanged:: 1.15.0 |
|
|
|
As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_ |
|
transition from use of `numpy.random.RandomState` to |
|
`numpy.random.Generator`, this keyword was changed from `seed` to |
|
`rng`. For an interim period, both keywords will continue to work, although |
|
only one may be specified at a time. After the interim period, function |
|
calls using the `seed` keyword will emit warnings. Following a |
|
deprecation period, the `seed` keyword will be removed. |
|
|
|
See Also |
|
-------- |
|
:ref:`quasi-monte-carlo` |
|
|
|
Notes |
|
----- |
|
|
|
When LHS is used for integrating a function :math:`f` over :math:`n`, |
|
LHS is extremely effective on integrands that are nearly additive [2]_. |
|
With a LHS of :math:`n` points, the variance of the integral is always |
|
lower than plain MC on :math:`n-1` points [3]_. There is a central limit |
|
theorem for LHS on the mean and variance of the integral [4]_, but not |
|
necessarily for optimized LHS due to the randomization. |
|
|
|
:math:`A` is called an orthogonal array of strength :math:`t` if in each |
|
n-row-by-t-column submatrix of :math:`A`: all :math:`p^t` possible |
|
distinct rows occur the same number of times. The elements of :math:`A` |
|
are in the set :math:`\{0, 1, ..., p-1\}`, also called symbols. |
|
The constraint that :math:`p` must be a prime number is to allow modular |
|
arithmetic. Increasing strength adds some symmetry to the sub-projections |
|
of a sample. With strength 2, samples are symmetric along the diagonals of |
|
2D sub-projections. This may be undesirable, but on the other hand, the |
|
sample dispersion is improved. |
|
|
|
Strength 1 (plain LHS) brings an advantage over strength 0 (MC) and |
|
strength 2 is a useful increment over strength 1. Going to strength 3 is |
|
a smaller increment and scrambled QMC like Sobol', Halton are more |
|
performant [7]_. |
|
|
|
To create a LHS of strength 2, the orthogonal array :math:`A` is |
|
randomized by applying a random, bijective map of the set of symbols onto |
|
itself. For example, in column 0, all 0s might become 2; in column 1, |
|
all 0s might become 1, etc. |
|
Then, for each column :math:`i` and symbol :math:`j`, we add a plain, |
|
one-dimensional LHS of size :math:`p` to the subarray where |
|
:math:`A^i = j`. The resulting matrix is finally divided by :math:`p`. |
|
|
|
References |
|
---------- |
|
.. [1] Mckay et al., "A Comparison of Three Methods for Selecting Values |
|
of Input Variables in the Analysis of Output from a Computer Code." |
|
Technometrics, 1979. |
|
.. [2] M. Stein, "Large sample properties of simulations using Latin |
|
hypercube sampling." Technometrics 29, no. 2: 143-151, 1987. |
|
.. [3] A. B. Owen, "Monte Carlo variance of scrambled net quadrature." |
|
SIAM Journal on Numerical Analysis 34, no. 5: 1884-1910, 1997 |
|
.. [4] Loh, W.-L. "On Latin hypercube sampling." The annals of statistics |
|
24, no. 5: 2058-2080, 1996. |
|
.. [5] Fang et al. "Design and modeling for computer experiments". |
|
Computer Science and Data Analysis Series, 2006. |
|
.. [6] Damblin et al., "Numerical studies of space filling designs: |
|
optimization of Latin Hypercube Samples and subprojection properties." |
|
Journal of Simulation, 2013. |
|
.. [7] A. B. Owen , "Orthogonal arrays for computer experiments, |
|
integration and visualization." Statistica Sinica, 1992. |
|
.. [8] B. Tang, "Orthogonal Array-Based Latin Hypercubes." |
|
Journal of the American Statistical Association, 1993. |
|
.. [9] Seaholm, Susan K. et al. (1988). Latin hypercube sampling and the |
|
sensitivity analysis of a Monte Carlo epidemic model. Int J Biomed |
|
Comput, 23(1-2), 97-112. :doi:`10.1016/0020-7101(88)90067-0` |
|
|
|
Examples |
|
-------- |
|
Generate samples from a Latin hypercube generator. |
|
|
|
>>> from scipy.stats import qmc |
|
>>> sampler = qmc.LatinHypercube(d=2) |
|
>>> sample = sampler.random(n=5) |
|
>>> sample |
|
array([[0.1545328 , 0.53664833], # random |
|
[0.84052691, 0.06474907], |
|
[0.52177809, 0.93343721], |
|
[0.68033825, 0.36265316], |
|
[0.26544879, 0.61163943]]) |
|
|
|
Compute the quality of the sample using the discrepancy criterion. |
|
|
|
>>> qmc.discrepancy(sample) |
|
0.0196... # random |
|
|
|
Samples can be scaled to bounds. |
|
|
|
>>> l_bounds = [0, 2] |
|
>>> u_bounds = [10, 5] |
|
>>> qmc.scale(sample, l_bounds, u_bounds) |
|
array([[1.54532796, 3.609945 ], # random |
|
[8.40526909, 2.1942472 ], |
|
[5.2177809 , 4.80031164], |
|
[6.80338249, 3.08795949], |
|
[2.65448791, 3.83491828]]) |
|
|
|
Below are other examples showing alternative ways to construct LHS with |
|
even better coverage of the space. |
|
|
|
Using a base LHS as a baseline. |
|
|
|
>>> sampler = qmc.LatinHypercube(d=2) |
|
>>> sample = sampler.random(n=5) |
|
>>> qmc.discrepancy(sample) |
|
0.0196... # random |
|
|
|
Use the `optimization` keyword argument to produce a LHS with |
|
lower discrepancy at higher computational cost. |
|
|
|
>>> sampler = qmc.LatinHypercube(d=2, optimization="random-cd") |
|
>>> sample = sampler.random(n=5) |
|
>>> qmc.discrepancy(sample) |
|
0.0176... # random |
|
|
|
Use the `strength` keyword argument to produce an orthogonal array based |
|
LHS of strength 2. In this case, the number of sample points must be the |
|
square of a prime number. |
|
|
|
>>> sampler = qmc.LatinHypercube(d=2, strength=2) |
|
>>> sample = sampler.random(n=9) |
|
>>> qmc.discrepancy(sample) |
|
0.00526... # random |
|
|
|
Options could be combined to produce an optimized centered |
|
orthogonal array based LHS. After optimization, the result would not |
|
be guaranteed to be of strength 2. |
|
|
|
**Real-world example** |
|
|
|
In [9]_, a Latin Hypercube sampling (LHS) strategy was used to sample a |
|
parameter space to study the importance of each parameter of an epidemic |
|
model. Such analysis is also called a sensitivity analysis. |
|
|
|
Since the dimensionality of the problem is high (6), it is computationally |
|
expensive to cover the space. When numerical experiments are costly, QMC |
|
enables analysis that may not be possible if using a grid. |
|
|
|
The six parameters of the model represented the probability of illness, |
|
the probability of withdrawal, and four contact probabilities. The |
|
authors assumed uniform distributions for all parameters and generated |
|
50 samples. |
|
|
|
Using `scipy.stats.qmc.LatinHypercube` to replicate the protocol, |
|
the first step is to create a sample in the unit hypercube: |
|
|
|
>>> from scipy.stats import qmc |
|
>>> sampler = qmc.LatinHypercube(d=6) |
|
>>> sample = sampler.random(n=50) |
|
|
|
Then the sample can be scaled to the appropriate bounds: |
|
|
|
>>> l_bounds = [0.000125, 0.01, 0.0025, 0.05, 0.47, 0.7] |
|
>>> u_bounds = [0.000375, 0.03, 0.0075, 0.15, 0.87, 0.9] |
|
>>> sample_scaled = qmc.scale(sample, l_bounds, u_bounds) |
|
|
|
Such a sample was used to run the model 50 times, and a polynomial |
|
response surface was constructed. This allowed the authors to study the |
|
relative importance of each parameter across the range of possibilities |
|
of every other parameter. |
|
|
|
In this computer experiment, they showed a 14-fold reduction in the |
|
number of samples required to maintain an error below 2% on their |
|
response surface when compared to a grid sampling. |
|
|
|
""" |
|
|
|
@_transition_to_rng('seed', replace_doc=False) |
|
def __init__( |
|
self, d: IntNumber, *, |
|
scramble: bool = True, |
|
strength: int = 1, |
|
optimization: Literal["random-cd", "lloyd"] | None = None, |
|
rng: SeedType = None |
|
) -> None: |
|
|
|
self._init_quad = {'d': d, 'scramble': True, 'strength': strength, |
|
'optimization': optimization} |
|
super()._initialize(d=d, rng=rng, optimization=optimization) |
|
self.scramble = scramble |
|
|
|
lhs_method_strength = { |
|
1: self._random_lhs, |
|
2: self._random_oa_lhs |
|
} |
|
|
|
try: |
|
self.lhs_method: Callable = lhs_method_strength[strength] |
|
except KeyError as exc: |
|
message = (f"{strength!r} is not a valid strength. It must be one" |
|
f" of {set(lhs_method_strength)!r}") |
|
raise ValueError(message) from exc |
|
|
|
def _random( |
|
self, n: IntNumber = 1, *, workers: IntNumber = 1 |
|
) -> np.ndarray: |
|
lhs = self.lhs_method(n) |
|
return lhs |
|
|
|
def _random_lhs(self, n: IntNumber = 1) -> np.ndarray: |
|
"""Base LHS algorithm.""" |
|
if not self.scramble: |
|
samples: np.ndarray | float = 0.5 |
|
else: |
|
samples = self.rng.uniform(size=(n, self.d)) |
|
|
|
perms = np.tile(np.arange(1, n + 1), |
|
(self.d, 1)) |
|
for i in range(self.d): |
|
self.rng.shuffle(perms[i, :]) |
|
perms = perms.T |
|
|
|
samples = (perms - samples) / n |
|
return samples |
|
|
|
def _random_oa_lhs(self, n: IntNumber = 4) -> np.ndarray: |
|
"""Orthogonal array based LHS of strength 2.""" |
|
p = np.sqrt(n).astype(int) |
|
n_row = p**2 |
|
n_col = p + 1 |
|
|
|
primes = primes_from_2_to(p + 1) |
|
if p not in primes or n != n_row: |
|
raise ValueError( |
|
"n is not the square of a prime number. Close" |
|
f" values are {primes[-2:]**2}" |
|
) |
|
if self.d > p + 1: |
|
raise ValueError("n is too small for d. Must be n > (d-1)**2") |
|
|
|
oa_sample = np.zeros(shape=(n_row, n_col), dtype=int) |
|
|
|
|
|
arrays = np.tile(np.arange(p), (2, 1)) |
|
oa_sample[:, :2] = np.stack(np.meshgrid(*arrays), |
|
axis=-1).reshape(-1, 2) |
|
for p_ in range(1, p): |
|
oa_sample[:, 2+p_-1] = np.mod(oa_sample[:, 0] |
|
+ p_*oa_sample[:, 1], p) |
|
|
|
|
|
oa_sample_ = np.empty(shape=(n_row, n_col), dtype=int) |
|
for j in range(n_col): |
|
perms = self.rng.permutation(p) |
|
oa_sample_[:, j] = perms[oa_sample[:, j]] |
|
|
|
oa_sample = oa_sample_ |
|
|
|
oa_lhs_sample = np.zeros(shape=(n_row, n_col)) |
|
lhs_engine = LatinHypercube(d=1, scramble=self.scramble, strength=1, |
|
rng=self.rng) |
|
for j in range(n_col): |
|
for k in range(p): |
|
idx = oa_sample[:, j] == k |
|
lhs = lhs_engine.random(p).flatten() |
|
oa_lhs_sample[:, j][idx] = lhs + oa_sample[:, j][idx] |
|
|
|
oa_lhs_sample /= p |
|
|
|
return oa_lhs_sample[:, :self.d] |
|
|
|
|
|
class Sobol(QMCEngine): |
|
"""Engine for generating (scrambled) Sobol' sequences. |
|
|
|
Sobol' sequences are low-discrepancy, quasi-random numbers. Points |
|
can be drawn using two methods: |
|
|
|
* `random_base2`: safely draw :math:`n=2^m` points. This method |
|
guarantees the balance properties of the sequence. |
|
* `random`: draw an arbitrary number of points from the |
|
sequence. See warning below. |
|
|
|
Parameters |
|
---------- |
|
d : int |
|
Dimensionality of the sequence. Max dimensionality is 21201. |
|
scramble : bool, optional |
|
If True, use LMS+shift scrambling. Otherwise, no scrambling is done. |
|
Default is True. |
|
bits : int, optional |
|
Number of bits of the generator. Control the maximum number of points |
|
that can be generated, which is ``2**bits``. Maximal value is 64. |
|
It does not correspond to the return type, which is always |
|
``np.float64`` to prevent points from repeating themselves. |
|
Default is None, which for backward compatibility, corresponds to 30. |
|
|
|
.. versionadded:: 1.9.0 |
|
optimization : {None, "random-cd", "lloyd"}, optional |
|
Whether to use an optimization scheme to improve the quality after |
|
sampling. Note that this is a post-processing step that does not |
|
guarantee that all properties of the sample will be conserved. |
|
Default is None. |
|
|
|
* ``random-cd``: random permutations of coordinates to lower the |
|
centered discrepancy. The best sample based on the centered |
|
discrepancy is constantly updated. Centered discrepancy-based |
|
sampling shows better space-filling robustness toward 2D and 3D |
|
subprojections compared to using other discrepancy measures. |
|
* ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm. |
|
The process converges to equally spaced samples. |
|
|
|
.. versionadded:: 1.10.0 |
|
|
|
rng : `numpy.random.Generator`, optional |
|
Pseudorandom number generator state. When `rng` is None, a new |
|
`numpy.random.Generator` is created using entropy from the |
|
operating system. Types other than `numpy.random.Generator` are |
|
passed to `numpy.random.default_rng` to instantiate a ``Generator``. |
|
|
|
.. versionchanged:: 1.15.0 |
|
|
|
As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_ |
|
transition from use of `numpy.random.RandomState` to |
|
`numpy.random.Generator`, this keyword was changed from `seed` to |
|
`rng`. For an interim period, both keywords will continue to work, although |
|
only one may be specified at a time. After the interim period, function |
|
calls using the `seed` keyword will emit warnings. Following a |
|
deprecation period, the `seed` keyword will be removed. |
|
|
|
Notes |
|
----- |
|
Sobol' sequences [1]_ provide :math:`n=2^m` low discrepancy points in |
|
:math:`[0,1)^{d}`. Scrambling them [3]_ makes them suitable for singular |
|
integrands, provides a means of error estimation, and can improve their |
|
rate of convergence. The scrambling strategy which is implemented is a |
|
(left) linear matrix scramble (LMS) followed by a digital random shift |
|
(LMS+shift) [2]_. |
|
|
|
There are many versions of Sobol' sequences depending on their |
|
'direction numbers'. This code uses direction numbers from [4]_. Hence, |
|
the maximum number of dimension is 21201. The direction numbers have been |
|
precomputed with search criterion 6 and can be retrieved at |
|
https://web.maths.unsw.edu.au/~fkuo/sobol/. |
|
|
|
.. warning:: |
|
|
|
Sobol' sequences are a quadrature rule and they lose their balance |
|
properties if one uses a sample size that is not a power of 2, or skips |
|
the first point, or thins the sequence [5]_. |
|
|
|
If :math:`n=2^m` points are not enough then one should take :math:`2^M` |
|
points for :math:`M>m`. When scrambling, the number R of independent |
|
replicates does not have to be a power of 2. |
|
|
|
Sobol' sequences are generated to some number :math:`B` of bits. |
|
After :math:`2^B` points have been generated, the sequence would |
|
repeat. Hence, an error is raised. |
|
The number of bits can be controlled with the parameter `bits`. |
|
|
|
References |
|
---------- |
|
.. [1] I. M. Sobol', "The distribution of points in a cube and the accurate |
|
evaluation of integrals." Zh. Vychisl. Mat. i Mat. Phys., 7:784-802, |
|
1967. |
|
.. [2] J. Matousek, "On the L2-discrepancy for anchored boxes." |
|
J. of Complexity 14, 527-556, 1998. |
|
.. [3] Art B. Owen, "Scrambling Sobol and Niederreiter-Xing points." |
|
Journal of Complexity, 14(4):466-489, December 1998. |
|
.. [4] S. Joe and F. Y. Kuo, "Constructing sobol sequences with better |
|
two-dimensional projections." SIAM Journal on Scientific Computing, |
|
30(5):2635-2654, 2008. |
|
.. [5] Art B. Owen, "On dropping the first Sobol' point." |
|
:arxiv:`2008.08051`, 2020. |
|
|
|
Examples |
|
-------- |
|
Generate samples from a low discrepancy sequence of Sobol'. |
|
|
|
>>> from scipy.stats import qmc |
|
>>> sampler = qmc.Sobol(d=2, scramble=False) |
|
>>> sample = sampler.random_base2(m=3) |
|
>>> sample |
|
array([[0. , 0. ], |
|
[0.5 , 0.5 ], |
|
[0.75 , 0.25 ], |
|
[0.25 , 0.75 ], |
|
[0.375, 0.375], |
|
[0.875, 0.875], |
|
[0.625, 0.125], |
|
[0.125, 0.625]]) |
|
|
|
Compute the quality of the sample using the discrepancy criterion. |
|
|
|
>>> qmc.discrepancy(sample) |
|
0.013882107204860938 |
|
|
|
To continue an existing design, extra points can be obtained |
|
by calling again `random_base2`. Alternatively, you can skip some |
|
points like: |
|
|
|
>>> _ = sampler.reset() |
|
>>> _ = sampler.fast_forward(4) |
|
>>> sample_continued = sampler.random_base2(m=2) |
|
>>> sample_continued |
|
array([[0.375, 0.375], |
|
[0.875, 0.875], |
|
[0.625, 0.125], |
|
[0.125, 0.625]]) |
|
|
|
Finally, samples can be scaled to bounds. |
|
|
|
>>> l_bounds = [0, 2] |
|
>>> u_bounds = [10, 5] |
|
>>> qmc.scale(sample_continued, l_bounds, u_bounds) |
|
array([[3.75 , 3.125], |
|
[8.75 , 4.625], |
|
[6.25 , 2.375], |
|
[1.25 , 3.875]]) |
|
|
|
""" |
|
|
|
MAXDIM: ClassVar[int] = _MAXDIM |
|
|
|
@_transition_to_rng('seed', replace_doc=False) |
|
def __init__( |
|
self, d: IntNumber, *, scramble: bool = True, |
|
bits: IntNumber | None = None, rng: SeedType = None, |
|
optimization: Literal["random-cd", "lloyd"] | None = None |
|
) -> None: |
|
|
|
self._init_quad = {'d': d, 'scramble': True, 'bits': bits, |
|
'optimization': optimization} |
|
|
|
super()._initialize(d=d, optimization=optimization, rng=rng) |
|
if d > self.MAXDIM: |
|
raise ValueError( |
|
f"Maximum supported dimensionality is {self.MAXDIM}." |
|
) |
|
|
|
self.bits = bits |
|
self.dtype_i: type |
|
self.scramble = scramble |
|
|
|
if self.bits is None: |
|
self.bits = 30 |
|
|
|
if self.bits <= 32: |
|
self.dtype_i = np.uint32 |
|
elif 32 < self.bits <= 64: |
|
self.dtype_i = np.uint64 |
|
else: |
|
raise ValueError("Maximum supported 'bits' is 64") |
|
|
|
self.maxn = 2**self.bits |
|
|
|
|
|
self._sv: np.ndarray = np.zeros((d, self.bits), dtype=self.dtype_i) |
|
_initialize_v(self._sv, dim=d, bits=self.bits) |
|
|
|
if not scramble: |
|
self._shift: np.ndarray = np.zeros(d, dtype=self.dtype_i) |
|
else: |
|
|
|
self._scramble() |
|
|
|
self._quasi = self._shift.copy() |
|
|
|
|
|
|
|
self._scale = 1.0 / 2 ** self.bits |
|
|
|
self._first_point = (self._quasi * self._scale).reshape(1, -1) |
|
|
|
self._first_point = self._first_point.astype(np.float64) |
|
|
|
def _scramble(self) -> None: |
|
"""Scramble the sequence using LMS+shift.""" |
|
|
|
self._shift = np.dot( |
|
rng_integers(self.rng, 2, size=(self.d, self.bits), |
|
dtype=self.dtype_i), |
|
2 ** np.arange(self.bits, dtype=self.dtype_i), |
|
) |
|
|
|
ltm = np.tril(rng_integers(self.rng, 2, |
|
size=(self.d, self.bits, self.bits), |
|
dtype=self.dtype_i)) |
|
_cscramble( |
|
dim=self.d, bits=self.bits, |
|
ltm=ltm, sv=self._sv |
|
) |
|
|
|
def _random( |
|
self, n: IntNumber = 1, *, workers: IntNumber = 1 |
|
) -> np.ndarray: |
|
"""Draw next point(s) in the Sobol' sequence. |
|
|
|
Parameters |
|
---------- |
|
n : int, optional |
|
Number of samples to generate in the parameter space. Default is 1. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
Sobol' sample. |
|
|
|
""" |
|
sample: np.ndarray = np.empty((n, self.d), dtype=np.float64) |
|
|
|
if n == 0: |
|
return sample |
|
|
|
total_n = self.num_generated + n |
|
if total_n > self.maxn: |
|
msg = ( |
|
f"At most 2**{self.bits}={self.maxn} distinct points can be " |
|
f"generated. {self.num_generated} points have been previously " |
|
f"generated, then: n={self.num_generated}+{n}={total_n}. " |
|
) |
|
if self.bits != 64: |
|
msg += "Consider increasing `bits`." |
|
raise ValueError(msg) |
|
|
|
if self.num_generated == 0: |
|
|
|
if not (n & (n - 1) == 0): |
|
warnings.warn("The balance properties of Sobol' points require" |
|
" n to be a power of 2.", stacklevel=2) |
|
|
|
if n == 1: |
|
sample = self._first_point |
|
else: |
|
_draw( |
|
n=n - 1, num_gen=self.num_generated, dim=self.d, |
|
scale=self._scale, sv=self._sv, quasi=self._quasi, |
|
sample=sample |
|
) |
|
sample = np.concatenate( |
|
[self._first_point, sample] |
|
)[:n] |
|
else: |
|
_draw( |
|
n=n, num_gen=self.num_generated - 1, dim=self.d, |
|
scale=self._scale, sv=self._sv, quasi=self._quasi, |
|
sample=sample |
|
) |
|
|
|
return sample |
|
|
|
def random_base2(self, m: IntNumber) -> np.ndarray: |
|
"""Draw point(s) from the Sobol' sequence. |
|
|
|
This function draws :math:`n=2^m` points in the parameter space |
|
ensuring the balance properties of the sequence. |
|
|
|
Parameters |
|
---------- |
|
m : int |
|
Logarithm in base 2 of the number of samples; i.e., n = 2^m. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
Sobol' sample. |
|
|
|
""" |
|
n = 2 ** m |
|
|
|
total_n = self.num_generated + n |
|
if not (total_n & (total_n - 1) == 0): |
|
raise ValueError('The balance properties of Sobol\' points require ' |
|
f'n to be a power of 2. {self.num_generated} points ' |
|
'have been previously generated, then: ' |
|
f'n={self.num_generated}+2**{m}={total_n}. ' |
|
'If you still want to do this, the function ' |
|
'\'Sobol.random()\' can be used.' |
|
) |
|
|
|
return self.random(n) |
|
|
|
def reset(self) -> "Sobol": |
|
"""Reset the engine to base state. |
|
|
|
Returns |
|
------- |
|
engine : Sobol |
|
Engine reset to its base state. |
|
|
|
""" |
|
super().reset() |
|
self._quasi = self._shift.copy() |
|
return self |
|
|
|
def fast_forward(self, n: IntNumber) -> "Sobol": |
|
"""Fast-forward the sequence by `n` positions. |
|
|
|
Parameters |
|
---------- |
|
n : int |
|
Number of points to skip in the sequence. |
|
|
|
Returns |
|
------- |
|
engine : Sobol |
|
The fast-forwarded engine. |
|
|
|
""" |
|
if self.num_generated == 0: |
|
_fast_forward( |
|
n=n - 1, num_gen=self.num_generated, dim=self.d, |
|
sv=self._sv, quasi=self._quasi |
|
) |
|
else: |
|
_fast_forward( |
|
n=n, num_gen=self.num_generated - 1, dim=self.d, |
|
sv=self._sv, quasi=self._quasi |
|
) |
|
self.num_generated += n |
|
return self |
|
|
|
|
|
class PoissonDisk(QMCEngine): |
|
"""Poisson disk sampling. |
|
|
|
Parameters |
|
---------- |
|
d : int |
|
Dimension of the parameter space. |
|
radius : float |
|
Minimal distance to keep between points when sampling new candidates. |
|
hypersphere : {"volume", "surface"}, optional |
|
Sampling strategy to generate potential candidates to be added in the |
|
final sample. Default is "volume". |
|
|
|
* ``volume``: original Bridson algorithm as described in [1]_. |
|
New candidates are sampled *within* the hypersphere. |
|
* ``surface``: only sample the surface of the hypersphere. |
|
ncandidates : int |
|
Number of candidates to sample per iteration. More candidates result |
|
in a denser sampling as more candidates can be accepted per iteration. |
|
optimization : {None, "random-cd", "lloyd"}, optional |
|
Whether to use an optimization scheme to improve the quality after |
|
sampling. Note that this is a post-processing step that does not |
|
guarantee that all properties of the sample will be conserved. |
|
Default is None. |
|
|
|
* ``random-cd``: random permutations of coordinates to lower the |
|
centered discrepancy. The best sample based on the centered |
|
discrepancy is constantly updated. Centered discrepancy-based |
|
sampling shows better space-filling robustness toward 2D and 3D |
|
subprojections compared to using other discrepancy measures. |
|
* ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm. |
|
The process converges to equally spaced samples. |
|
|
|
.. versionadded:: 1.10.0 |
|
|
|
rng : `numpy.random.Generator`, optional |
|
Pseudorandom number generator state. When `rng` is None, a new |
|
`numpy.random.Generator` is created using entropy from the |
|
operating system. Types other than `numpy.random.Generator` are |
|
passed to `numpy.random.default_rng` to instantiate a ``Generator``. |
|
|
|
.. versionchanged:: 1.15.0 |
|
|
|
As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_ |
|
transition from use of `numpy.random.RandomState` to |
|
`numpy.random.Generator`, this keyword was changed from `seed` to |
|
`rng`. For an interim period, both keywords will continue to work, although |
|
only one may be specified at a time. After the interim period, function |
|
calls using the `seed` keyword will emit warnings. Following a |
|
deprecation period, the `seed` keyword will be removed. |
|
|
|
l_bounds, u_bounds : array_like (d,) |
|
Lower and upper bounds of target sample data. |
|
|
|
Notes |
|
----- |
|
Poisson disk sampling is an iterative sampling strategy. Starting from |
|
a seed sample, `ncandidates` are sampled in the hypersphere |
|
surrounding the seed. Candidates below a certain `radius` or outside the |
|
domain are rejected. New samples are added in a pool of sample seed. The |
|
process stops when the pool is empty or when the number of required |
|
samples is reached. |
|
|
|
The maximum number of point that a sample can contain is directly linked |
|
to the `radius`. As the dimension of the space increases, a higher radius |
|
spreads the points further and help overcome the curse of dimensionality. |
|
See the :ref:`quasi monte carlo tutorial <quasi-monte-carlo>` for more |
|
details. |
|
|
|
.. warning:: |
|
|
|
The algorithm is more suitable for low dimensions and sampling size |
|
due to its iterative nature and memory requirements. |
|
Selecting a small radius with a high dimension would |
|
mean that the space could contain more samples than using lower |
|
dimension or a bigger radius. |
|
|
|
Some code taken from [2]_, written consent given on 31.03.2021 |
|
by the original author, Shamis, for free use in SciPy under |
|
the 3-clause BSD. |
|
|
|
References |
|
---------- |
|
.. [1] Robert Bridson, "Fast Poisson Disk Sampling in Arbitrary |
|
Dimensions." SIGGRAPH, 2007. |
|
.. [2] `StackOverflow <https://stackoverflow.com/questions/66047540>`__. |
|
|
|
Examples |
|
-------- |
|
Generate a 2D sample using a `radius` of 0.2. |
|
|
|
>>> import numpy as np |
|
>>> import matplotlib.pyplot as plt |
|
>>> from matplotlib.collections import PatchCollection |
|
>>> from scipy.stats import qmc |
|
>>> |
|
>>> rng = np.random.default_rng() |
|
>>> radius = 0.2 |
|
>>> engine = qmc.PoissonDisk(d=2, radius=radius, rng=rng) |
|
>>> sample = engine.random(20) |
|
|
|
Visualizing the 2D sample and showing that no points are closer than |
|
`radius`. ``radius/2`` is used to visualize non-intersecting circles. |
|
If two samples are exactly at `radius` from each other, then their circle |
|
of radius ``radius/2`` will touch. |
|
|
|
>>> fig, ax = plt.subplots() |
|
>>> _ = ax.scatter(sample[:, 0], sample[:, 1]) |
|
>>> circles = [plt.Circle((xi, yi), radius=radius/2, fill=False) |
|
... for xi, yi in sample] |
|
>>> collection = PatchCollection(circles, match_original=True) |
|
>>> ax.add_collection(collection) |
|
>>> _ = ax.set(aspect='equal', xlabel=r'$x_1$', ylabel=r'$x_2$', |
|
... xlim=[0, 1], ylim=[0, 1]) |
|
>>> plt.show() |
|
|
|
Such visualization can be seen as circle packing: how many circle can |
|
we put in the space. It is a np-hard problem. The method `fill_space` |
|
can be used to add samples until no more samples can be added. This is |
|
a hard problem and parameters may need to be adjusted manually. Beware of |
|
the dimension: as the dimensionality increases, the number of samples |
|
required to fill the space increases exponentially |
|
(curse-of-dimensionality). |
|
|
|
""" |
|
|
|
@_transition_to_rng('seed', replace_doc=False) |
|
def __init__( |
|
self, |
|
d: IntNumber, |
|
*, |
|
radius: DecimalNumber = 0.05, |
|
hypersphere: Literal["volume", "surface"] = "volume", |
|
ncandidates: IntNumber = 30, |
|
optimization: Literal["random-cd", "lloyd"] | None = None, |
|
rng: SeedType = None, |
|
l_bounds: "npt.ArrayLike | None" = None, |
|
u_bounds: "npt.ArrayLike | None" = None, |
|
) -> None: |
|
|
|
self._init_quad = {'d': d, 'radius': radius, |
|
'hypersphere': hypersphere, |
|
'ncandidates': ncandidates, |
|
'optimization': optimization} |
|
super()._initialize(d=d, optimization=optimization, rng=rng) |
|
|
|
hypersphere_sample = { |
|
"volume": self._hypersphere_volume_sample, |
|
"surface": self._hypersphere_surface_sample |
|
} |
|
|
|
try: |
|
self.hypersphere_method = hypersphere_sample[hypersphere] |
|
except KeyError as exc: |
|
message = ( |
|
f"{hypersphere!r} is not a valid hypersphere sampling" |
|
f" method. It must be one of {set(hypersphere_sample)!r}") |
|
raise ValueError(message) from exc |
|
|
|
|
|
|
|
|
|
|
|
self.radius_factor = 2 if hypersphere == "volume" else 1.001 |
|
self.radius = radius |
|
self.radius_squared = self.radius**2 |
|
|
|
|
|
self.ncandidates = ncandidates |
|
|
|
if u_bounds is None: |
|
u_bounds = np.ones(d) |
|
if l_bounds is None: |
|
l_bounds = np.zeros(d) |
|
self.l_bounds, self.u_bounds = _validate_bounds( |
|
l_bounds=l_bounds, u_bounds=u_bounds, d=int(d) |
|
) |
|
|
|
with np.errstate(divide='ignore'): |
|
self.cell_size = self.radius / np.sqrt(self.d) |
|
self.grid_size = ( |
|
np.ceil((self.u_bounds - self.l_bounds) / self.cell_size) |
|
).astype(int) |
|
|
|
self._initialize_grid_pool() |
|
|
|
def _initialize_grid_pool(self): |
|
"""Sampling pool and sample grid.""" |
|
self.sample_pool = [] |
|
|
|
|
|
self.sample_grid = np.empty( |
|
np.append(self.grid_size, self.d), |
|
dtype=np.float32 |
|
) |
|
|
|
self.sample_grid.fill(np.nan) |
|
|
|
def _random( |
|
self, n: IntNumber = 1, *, workers: IntNumber = 1 |
|
) -> np.ndarray: |
|
"""Draw `n` in the interval ``[l_bounds, u_bounds]``. |
|
|
|
Note that it can return fewer samples if the space is full. |
|
See the note section of the class. |
|
|
|
Parameters |
|
---------- |
|
n : int, optional |
|
Number of samples to generate in the parameter space. Default is 1. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
QMC sample. |
|
|
|
""" |
|
if n == 0 or self.d == 0: |
|
return np.empty((n, self.d)) |
|
|
|
def in_limits(sample: np.ndarray) -> bool: |
|
for i in range(self.d): |
|
if (sample[i] > self.u_bounds[i] or sample[i] < self.l_bounds[i]): |
|
return False |
|
return True |
|
|
|
def in_neighborhood(candidate: np.ndarray, n: int = 2) -> bool: |
|
""" |
|
Check if there are samples closer than ``radius_squared`` to the |
|
`candidate` sample. |
|
""" |
|
indices = ((candidate - self.l_bounds) / self.cell_size).astype(int) |
|
ind_min = np.maximum(indices - n, self.l_bounds.astype(int)) |
|
ind_max = np.minimum(indices + n + 1, self.grid_size) |
|
|
|
|
|
if not np.isnan(self.sample_grid[tuple(indices)][0]): |
|
return True |
|
|
|
a = [slice(ind_min[i], ind_max[i]) for i in range(self.d)] |
|
|
|
|
|
|
|
with np.errstate(invalid='ignore'): |
|
if np.any( |
|
np.sum( |
|
np.square(candidate - self.sample_grid[tuple(a)]), |
|
axis=self.d |
|
) < self.radius_squared |
|
): |
|
return True |
|
|
|
return False |
|
|
|
def add_sample(candidate: np.ndarray) -> None: |
|
self.sample_pool.append(candidate) |
|
indices = ((candidate - self.l_bounds) / self.cell_size).astype(int) |
|
self.sample_grid[tuple(indices)] = candidate |
|
curr_sample.append(candidate) |
|
|
|
curr_sample: list[np.ndarray] = [] |
|
|
|
if len(self.sample_pool) == 0: |
|
|
|
add_sample(self.rng.uniform(self.l_bounds, self.u_bounds)) |
|
num_drawn = 1 |
|
else: |
|
num_drawn = 0 |
|
|
|
|
|
while len(self.sample_pool) and num_drawn < n: |
|
|
|
idx_center = rng_integers(self.rng, len(self.sample_pool)) |
|
center = self.sample_pool[idx_center] |
|
del self.sample_pool[idx_center] |
|
|
|
|
|
candidates = self.hypersphere_method( |
|
center, self.radius * self.radius_factor, self.ncandidates |
|
) |
|
|
|
|
|
for candidate in candidates: |
|
if in_limits(candidate) and not in_neighborhood(candidate): |
|
add_sample(candidate) |
|
|
|
num_drawn += 1 |
|
if num_drawn >= n: |
|
break |
|
|
|
self.num_generated += num_drawn |
|
return np.array(curr_sample) |
|
|
|
def fill_space(self) -> np.ndarray: |
|
"""Draw ``n`` samples in the interval ``[l_bounds, u_bounds]``. |
|
|
|
Unlike `random`, this method will try to add points until |
|
the space is full. Depending on ``candidates`` (and to a lesser extent |
|
other parameters), some empty areas can still be present in the sample. |
|
|
|
.. warning:: |
|
|
|
This can be extremely slow in high dimensions or if the |
|
``radius`` is very small-with respect to the dimensionality. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
QMC sample. |
|
|
|
""" |
|
return self.random(np.inf) |
|
|
|
def reset(self) -> "PoissonDisk": |
|
"""Reset the engine to base state. |
|
|
|
Returns |
|
------- |
|
engine : PoissonDisk |
|
Engine reset to its base state. |
|
|
|
""" |
|
super().reset() |
|
self._initialize_grid_pool() |
|
return self |
|
|
|
def _hypersphere_volume_sample( |
|
self, center: np.ndarray, radius: DecimalNumber, |
|
candidates: IntNumber = 1 |
|
) -> np.ndarray: |
|
"""Uniform sampling within hypersphere.""" |
|
|
|
x = self.rng.standard_normal(size=(candidates, self.d)) |
|
ssq = np.sum(x**2, axis=1) |
|
fr = radius * gammainc(self.d/2, ssq/2)**(1/self.d) / np.sqrt(ssq) |
|
fr_tiled = np.tile( |
|
fr.reshape(-1, 1), (1, self.d) |
|
) |
|
p = center + np.multiply(x, fr_tiled) |
|
return p |
|
|
|
def _hypersphere_surface_sample( |
|
self, center: np.ndarray, radius: DecimalNumber, |
|
candidates: IntNumber = 1 |
|
) -> np.ndarray: |
|
"""Uniform sampling on the hypersphere's surface.""" |
|
vec = self.rng.standard_normal(size=(candidates, self.d)) |
|
vec /= np.linalg.norm(vec, axis=1)[:, None] |
|
p = center + np.multiply(vec, radius) |
|
return p |
|
|
|
|
|
class MultivariateNormalQMC: |
|
r"""QMC sampling from a multivariate Normal :math:`N(\mu, \Sigma)`. |
|
|
|
Parameters |
|
---------- |
|
mean : array_like (d,) |
|
The mean vector. Where ``d`` is the dimension. |
|
cov : array_like (d, d), optional |
|
The covariance matrix. If omitted, use `cov_root` instead. |
|
If both `cov` and `cov_root` are omitted, use the identity matrix. |
|
cov_root : array_like (d, d'), optional |
|
A root decomposition of the covariance matrix, where ``d'`` may be less |
|
than ``d`` if the covariance is not full rank. If omitted, use `cov`. |
|
inv_transform : bool, optional |
|
If True, use inverse transform instead of Box-Muller. Default is True. |
|
engine : QMCEngine, optional |
|
Quasi-Monte Carlo engine sampler. If None, `Sobol` is used. |
|
rng : `numpy.random.Generator`, optional |
|
Pseudorandom number generator state. When `rng` is None, a new |
|
`numpy.random.Generator` is created using entropy from the |
|
operating system. Types other than `numpy.random.Generator` are |
|
passed to `numpy.random.default_rng` to instantiate a ``Generator``. |
|
|
|
.. versionchanged:: 1.15.0 |
|
|
|
As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_ |
|
transition from use of `numpy.random.RandomState` to |
|
`numpy.random.Generator`, this keyword was changed from `seed` to |
|
`rng`. For an interim period, both keywords will continue to work, although |
|
only one may be specified at a time. After the interim period, function |
|
calls using the `seed` keyword will emit warnings. Following a |
|
deprecation period, the `seed` keyword will be removed. |
|
|
|
Examples |
|
-------- |
|
>>> import matplotlib.pyplot as plt |
|
>>> from scipy.stats import qmc |
|
>>> dist = qmc.MultivariateNormalQMC(mean=[0, 5], cov=[[1, 0], [0, 1]]) |
|
>>> sample = dist.random(512) |
|
>>> _ = plt.scatter(sample[:, 0], sample[:, 1]) |
|
>>> plt.show() |
|
|
|
""" |
|
|
|
@_transition_to_rng('seed', replace_doc=False) |
|
def __init__( |
|
self, |
|
mean: "npt.ArrayLike", |
|
cov: "npt.ArrayLike | None" = None, |
|
*, |
|
cov_root: "npt.ArrayLike | None" = None, |
|
inv_transform: bool = True, |
|
engine: QMCEngine | None = None, |
|
rng: SeedType = None, |
|
) -> None: |
|
mean = np.asarray(np.atleast_1d(mean)) |
|
d = mean.shape[0] |
|
if cov is not None: |
|
|
|
cov = np.asarray(np.atleast_2d(cov)) |
|
|
|
|
|
if not mean.shape[0] == cov.shape[0]: |
|
raise ValueError("Dimension mismatch between mean and " |
|
"covariance.") |
|
if not np.allclose(cov, cov.transpose()): |
|
raise ValueError("Covariance matrix is not symmetric.") |
|
|
|
try: |
|
cov_root = np.linalg.cholesky(cov).transpose() |
|
except np.linalg.LinAlgError: |
|
eigval, eigvec = np.linalg.eigh(cov) |
|
if not np.all(eigval >= -1.0e-8): |
|
raise ValueError("Covariance matrix not PSD.") |
|
eigval = np.clip(eigval, 0.0, None) |
|
cov_root = (eigvec * np.sqrt(eigval)).transpose() |
|
elif cov_root is not None: |
|
|
|
cov_root = np.atleast_2d(cov_root) |
|
if not mean.shape[0] == cov_root.shape[0]: |
|
raise ValueError("Dimension mismatch between mean and " |
|
"covariance.") |
|
else: |
|
|
|
cov_root = None |
|
|
|
self._inv_transform = inv_transform |
|
|
|
if not inv_transform: |
|
|
|
engine_dim = 2 * math.ceil(d / 2) |
|
else: |
|
engine_dim = d |
|
if engine is None: |
|
|
|
|
|
kwarg = "seed" if isinstance(rng, np.random.RandomState) else "rng" |
|
kwargs = {kwarg: rng} |
|
self.engine = Sobol( |
|
d=engine_dim, scramble=True, bits=30, **kwargs |
|
) |
|
elif isinstance(engine, QMCEngine): |
|
if engine.d != engine_dim: |
|
raise ValueError("Dimension of `engine` must be consistent" |
|
" with dimensions of mean and covariance." |
|
" If `inv_transform` is False, it must be" |
|
" an even number.") |
|
self.engine = engine |
|
else: |
|
raise ValueError("`engine` must be an instance of " |
|
"`scipy.stats.qmc.QMCEngine` or `None`.") |
|
|
|
self._mean = mean |
|
self._corr_matrix = cov_root |
|
|
|
self._d = d |
|
|
|
def random(self, n: IntNumber = 1) -> np.ndarray: |
|
"""Draw `n` QMC samples from the multivariate Normal. |
|
|
|
Parameters |
|
---------- |
|
n : int, optional |
|
Number of samples to generate in the parameter space. Default is 1. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
Sample. |
|
|
|
""" |
|
base_samples = self._standard_normal_samples(n) |
|
return self._correlate(base_samples) |
|
|
|
def _correlate(self, base_samples: np.ndarray) -> np.ndarray: |
|
if self._corr_matrix is not None: |
|
return base_samples @ self._corr_matrix + self._mean |
|
else: |
|
|
|
return base_samples + self._mean |
|
|
|
def _standard_normal_samples(self, n: IntNumber = 1) -> np.ndarray: |
|
"""Draw `n` QMC samples from the standard Normal :math:`N(0, I_d)`. |
|
|
|
Parameters |
|
---------- |
|
n : int, optional |
|
Number of samples to generate in the parameter space. Default is 1. |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
Sample. |
|
|
|
""" |
|
|
|
samples = self.engine.random(n) |
|
if self._inv_transform: |
|
|
|
|
|
return stats.norm.ppf(0.5 + (1 - 1e-10) * (samples - 0.5)) |
|
else: |
|
|
|
even = np.arange(0, samples.shape[-1], 2) |
|
Rs = np.sqrt(-2 * np.log(samples[:, even])) |
|
thetas = 2 * math.pi * samples[:, 1 + even] |
|
cos = np.cos(thetas) |
|
sin = np.sin(thetas) |
|
transf_samples = np.stack([Rs * cos, Rs * sin], |
|
-1).reshape(n, -1) |
|
|
|
return transf_samples[:, : self._d] |
|
|
|
|
|
class MultinomialQMC: |
|
r"""QMC sampling from a multinomial distribution. |
|
|
|
Parameters |
|
---------- |
|
pvals : array_like (k,) |
|
Vector of probabilities of size ``k``, where ``k`` is the number |
|
of categories. Elements must be non-negative and sum to 1. |
|
n_trials : int |
|
Number of trials. |
|
engine : QMCEngine, optional |
|
Quasi-Monte Carlo engine sampler. If None, `Sobol` is used. |
|
rng : `numpy.random.Generator`, optional |
|
Pseudorandom number generator state. When `rng` is None, a new |
|
`numpy.random.Generator` is created using entropy from the |
|
operating system. Types other than `numpy.random.Generator` are |
|
passed to `numpy.random.default_rng` to instantiate a ``Generator``. |
|
|
|
.. versionchanged:: 1.15.0 |
|
|
|
As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_ |
|
transition from use of `numpy.random.RandomState` to |
|
`numpy.random.Generator`, this keyword was changed from `seed` to |
|
`rng`. For an interim period, both keywords will continue to work, although |
|
only one may be specified at a time. After the interim period, function |
|
calls using the `seed` keyword will emit warnings. Following a |
|
deprecation period, the `seed` keyword will be removed. |
|
|
|
Examples |
|
-------- |
|
Let's define 3 categories and for a given sample, the sum of the trials |
|
of each category is 8. The number of trials per category is determined |
|
by the `pvals` associated to each category. |
|
Then, we sample this distribution 64 times. |
|
|
|
>>> import matplotlib.pyplot as plt |
|
>>> from scipy.stats import qmc |
|
>>> dist = qmc.MultinomialQMC( |
|
... pvals=[0.2, 0.4, 0.4], n_trials=10, engine=qmc.Halton(d=1) |
|
... ) |
|
>>> sample = dist.random(64) |
|
|
|
We can plot the sample and verify that the median of number of trials |
|
for each category is following the `pvals`. That would be |
|
``pvals * n_trials = [2, 4, 4]``. |
|
|
|
>>> fig, ax = plt.subplots() |
|
>>> ax.yaxis.get_major_locator().set_params(integer=True) |
|
>>> _ = ax.boxplot(sample) |
|
>>> ax.set(xlabel="Categories", ylabel="Trials") |
|
>>> plt.show() |
|
|
|
""" |
|
|
|
@_transition_to_rng('seed', replace_doc=False) |
|
def __init__( |
|
self, |
|
pvals: "npt.ArrayLike", |
|
n_trials: IntNumber, |
|
*, |
|
engine: QMCEngine | None = None, |
|
rng: SeedType = None, |
|
) -> None: |
|
self.pvals = np.atleast_1d(np.asarray(pvals)) |
|
if np.min(pvals) < 0: |
|
raise ValueError('Elements of pvals must be non-negative.') |
|
if not np.isclose(np.sum(pvals), 1): |
|
raise ValueError('Elements of pvals must sum to 1.') |
|
self.n_trials = n_trials |
|
if engine is None: |
|
|
|
|
|
kwarg = "seed" if isinstance(rng, np.random.RandomState) else "rng" |
|
kwargs = {kwarg: rng} |
|
self.engine = Sobol( |
|
d=1, scramble=True, bits=30, **kwargs |
|
) |
|
elif isinstance(engine, QMCEngine): |
|
if engine.d != 1: |
|
raise ValueError("Dimension of `engine` must be 1.") |
|
self.engine = engine |
|
else: |
|
raise ValueError("`engine` must be an instance of " |
|
"`scipy.stats.qmc.QMCEngine` or `None`.") |
|
|
|
def random(self, n: IntNumber = 1) -> np.ndarray: |
|
"""Draw `n` QMC samples from the multinomial distribution. |
|
|
|
Parameters |
|
---------- |
|
n : int, optional |
|
Number of samples to generate in the parameter space. Default is 1. |
|
|
|
Returns |
|
------- |
|
samples : array_like (n, pvals) |
|
Sample. |
|
|
|
""" |
|
sample = np.empty((n, len(self.pvals))) |
|
for i in range(n): |
|
base_draws = self.engine.random(self.n_trials).ravel() |
|
p_cumulative = np.empty_like(self.pvals, dtype=float) |
|
_fill_p_cumulative(np.array(self.pvals, dtype=float), p_cumulative) |
|
sample_ = np.zeros_like(self.pvals, dtype=np.intp) |
|
_categorize(base_draws, p_cumulative, sample_) |
|
sample[i] = sample_ |
|
return sample |
|
|
|
|
|
def _select_optimizer( |
|
optimization: Literal["random-cd", "lloyd"] | None, config: dict |
|
) -> Callable | None: |
|
"""A factory for optimization methods.""" |
|
optimization_method: dict[str, Callable] = { |
|
"random-cd": _random_cd, |
|
"lloyd": _lloyd_centroidal_voronoi_tessellation |
|
} |
|
|
|
optimizer: partial | None |
|
if optimization is not None: |
|
try: |
|
optimization = optimization.lower() |
|
optimizer_ = optimization_method[optimization] |
|
except KeyError as exc: |
|
message = (f"{optimization!r} is not a valid optimization" |
|
f" method. It must be one of" |
|
f" {set(optimization_method)!r}") |
|
raise ValueError(message) from exc |
|
|
|
|
|
optimizer = partial(optimizer_, **config) |
|
else: |
|
optimizer = None |
|
|
|
return optimizer |
|
|
|
|
|
def _random_cd( |
|
best_sample: np.ndarray, n_iters: int, n_nochange: int, rng: GeneratorType, |
|
**kwargs: dict |
|
) -> np.ndarray: |
|
"""Optimal LHS on CD. |
|
|
|
Create a base LHS and do random permutations of coordinates to |
|
lower the centered discrepancy. |
|
Because it starts with a normal LHS, it also works with the |
|
`scramble` keyword argument. |
|
|
|
Two stopping criterion are used to stop the algorithm: at most, |
|
`n_iters` iterations are performed; or if there is no improvement |
|
for `n_nochange` consecutive iterations. |
|
""" |
|
del kwargs |
|
|
|
n, d = best_sample.shape |
|
|
|
if d == 0 or n == 0: |
|
return np.empty((n, d)) |
|
|
|
if d == 1 or n == 1: |
|
|
|
return best_sample |
|
|
|
best_disc = discrepancy(best_sample) |
|
|
|
bounds = ([0, d - 1], |
|
[0, n - 1], |
|
[0, n - 1]) |
|
|
|
n_nochange_ = 0 |
|
n_iters_ = 0 |
|
while n_nochange_ < n_nochange and n_iters_ < n_iters: |
|
n_iters_ += 1 |
|
|
|
col = rng_integers(rng, *bounds[0], endpoint=True) |
|
row_1 = rng_integers(rng, *bounds[1], endpoint=True) |
|
row_2 = rng_integers(rng, *bounds[2], endpoint=True) |
|
disc = _perturb_discrepancy(best_sample, |
|
row_1, row_2, col, |
|
best_disc) |
|
if disc < best_disc: |
|
best_sample[row_1, col], best_sample[row_2, col] = ( |
|
best_sample[row_2, col], best_sample[row_1, col]) |
|
|
|
best_disc = disc |
|
n_nochange_ = 0 |
|
else: |
|
n_nochange_ += 1 |
|
|
|
return best_sample |
|
|
|
|
|
def _l1_norm(sample: np.ndarray) -> float: |
|
return distance.pdist(sample, 'cityblock').min() |
|
|
|
|
|
def _lloyd_iteration( |
|
sample: np.ndarray, |
|
decay: float, |
|
qhull_options: str |
|
) -> np.ndarray: |
|
"""Lloyd-Max algorithm iteration. |
|
|
|
Based on the implementation of Stéfan van der Walt: |
|
|
|
https://github.com/stefanv/lloyd |
|
|
|
which is: |
|
|
|
Copyright (c) 2021-04-21 Stéfan van der Walt |
|
https://github.com/stefanv/lloyd |
|
MIT License |
|
|
|
Parameters |
|
---------- |
|
sample : array_like (n, d) |
|
The sample to iterate on. |
|
decay : float |
|
Relaxation decay. A positive value would move the samples toward |
|
their centroid, and negative value would move them away. |
|
1 would move the samples to their centroid. |
|
qhull_options : str |
|
Additional options to pass to Qhull. See Qhull manual |
|
for details. (Default: "Qbb Qc Qz Qj Qx" for ndim > 4 and |
|
"Qbb Qc Qz Qj" otherwise.) |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
The sample after an iteration of Lloyd's algorithm. |
|
|
|
""" |
|
new_sample = np.empty_like(sample) |
|
|
|
voronoi = Voronoi(sample, qhull_options=qhull_options) |
|
|
|
for ii, idx in enumerate(voronoi.point_region): |
|
|
|
|
|
region = [i for i in voronoi.regions[idx] if i != -1] |
|
|
|
|
|
verts = voronoi.vertices[region] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
centroid = np.mean(verts, axis=0) |
|
new_sample[ii] = sample[ii] + (centroid - sample[ii]) * decay |
|
|
|
|
|
is_valid = np.all(np.logical_and(new_sample >= 0, new_sample <= 1), axis=1) |
|
sample[is_valid] = new_sample[is_valid] |
|
|
|
return sample |
|
|
|
|
|
def _lloyd_centroidal_voronoi_tessellation( |
|
sample: "npt.ArrayLike", |
|
*, |
|
tol: DecimalNumber = 1e-5, |
|
maxiter: IntNumber = 10, |
|
qhull_options: str | None = None, |
|
**kwargs: dict |
|
) -> np.ndarray: |
|
"""Approximate Centroidal Voronoi Tessellation. |
|
|
|
Perturb samples in N-dimensions using Lloyd-Max algorithm. |
|
|
|
Parameters |
|
---------- |
|
sample : array_like (n, d) |
|
The sample to iterate on. With ``n`` the number of samples and ``d`` |
|
the dimension. Samples must be in :math:`[0, 1]^d`, with ``d>=2``. |
|
tol : float, optional |
|
Tolerance for termination. If the min of the L1-norm over the samples |
|
changes less than `tol`, it stops the algorithm. Default is 1e-5. |
|
maxiter : int, optional |
|
Maximum number of iterations. It will stop the algorithm even if |
|
`tol` is above the threshold. |
|
Too many iterations tend to cluster the samples as a hypersphere. |
|
Default is 10. |
|
qhull_options : str, optional |
|
Additional options to pass to Qhull. See Qhull manual |
|
for details. (Default: "Qbb Qc Qz Qj Qx" for ndim > 4 and |
|
"Qbb Qc Qz Qj" otherwise.) |
|
|
|
Returns |
|
------- |
|
sample : array_like (n, d) |
|
The sample after being processed by Lloyd-Max algorithm. |
|
|
|
Notes |
|
----- |
|
Lloyd-Max algorithm is an iterative process with the purpose of improving |
|
the dispersion of samples. For given sample: (i) compute a Voronoi |
|
Tessellation; (ii) find the centroid of each Voronoi cell; (iii) move the |
|
samples toward the centroid of their respective cell. See [1]_, [2]_. |
|
|
|
A relaxation factor is used to control how fast samples can move at each |
|
iteration. This factor is starting at 2 and ending at 1 after `maxiter` |
|
following an exponential decay. |
|
|
|
The process converges to equally spaced samples. It implies that measures |
|
like the discrepancy could suffer from too many iterations. On the other |
|
hand, L1 and L2 distances should improve. This is especially true with |
|
QMC methods which tend to favor the discrepancy over other criteria. |
|
|
|
.. note:: |
|
|
|
The current implementation does not intersect the Voronoi Tessellation |
|
with the boundaries. This implies that for a low number of samples, |
|
empirically below 20, no Voronoi cell is touching the boundaries. |
|
Hence, samples cannot be moved close to the boundaries. |
|
|
|
Further improvements could consider the samples at infinity so that |
|
all boundaries are segments of some Voronoi cells. This would fix |
|
the computation of the centroid position. |
|
|
|
.. warning:: |
|
|
|
The Voronoi Tessellation step is expensive and quickly becomes |
|
intractable with dimensions as low as 10 even for a sample |
|
of size as low as 1000. |
|
|
|
.. versionadded:: 1.9.0 |
|
|
|
References |
|
---------- |
|
.. [1] Lloyd. "Least Squares Quantization in PCM". |
|
IEEE Transactions on Information Theory, 1982. |
|
.. [2] Max J. "Quantizing for minimum distortion". |
|
IEEE Transactions on Information Theory, 1960. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.spatial import distance |
|
>>> from scipy.stats._qmc import _lloyd_centroidal_voronoi_tessellation |
|
>>> rng = np.random.default_rng() |
|
>>> sample = rng.random((128, 2)) |
|
|
|
.. note:: |
|
|
|
The samples need to be in :math:`[0, 1]^d`. `scipy.stats.qmc.scale` |
|
can be used to scale the samples from their |
|
original bounds to :math:`[0, 1]^d`. And back to their original bounds. |
|
|
|
Compute the quality of the sample using the L1 criterion. |
|
|
|
>>> def l1_norm(sample): |
|
... return distance.pdist(sample, 'cityblock').min() |
|
|
|
>>> l1_norm(sample) |
|
0.00161... # random |
|
|
|
Now process the sample using Lloyd's algorithm and check the improvement |
|
on the L1. The value should increase. |
|
|
|
>>> sample = _lloyd_centroidal_voronoi_tessellation(sample) |
|
>>> l1_norm(sample) |
|
0.0278... # random |
|
|
|
""" |
|
del kwargs |
|
|
|
sample = np.asarray(sample).copy() |
|
|
|
if not sample.ndim == 2: |
|
raise ValueError('`sample` is not a 2D array') |
|
|
|
if not sample.shape[1] >= 2: |
|
raise ValueError('`sample` dimension is not >= 2') |
|
|
|
|
|
if (sample.max() > 1.) or (sample.min() < 0.): |
|
raise ValueError('`sample` is not in unit hypercube') |
|
|
|
if qhull_options is None: |
|
qhull_options = 'Qbb Qc Qz QJ' |
|
|
|
if sample.shape[1] >= 5: |
|
qhull_options += ' Qx' |
|
|
|
|
|
|
|
|
|
root = -maxiter / np.log(0.1) |
|
decay = [np.exp(-x / root)+0.9 for x in range(maxiter)] |
|
|
|
l1_old = _l1_norm(sample=sample) |
|
for i in range(maxiter): |
|
sample = _lloyd_iteration( |
|
sample=sample, decay=decay[i], |
|
qhull_options=qhull_options, |
|
) |
|
|
|
l1_new = _l1_norm(sample=sample) |
|
|
|
if abs(l1_new - l1_old) < tol: |
|
break |
|
else: |
|
l1_old = l1_new |
|
|
|
return sample |
|
|
|
|
|
def _validate_workers(workers: IntNumber = 1) -> IntNumber: |
|
"""Validate `workers` based on platform and value. |
|
|
|
Parameters |
|
---------- |
|
workers : int, optional |
|
Number of workers to use for parallel processing. If -1 is |
|
given all CPU threads are used. Default is 1. |
|
|
|
Returns |
|
------- |
|
Workers : int |
|
Number of CPU used by the algorithm |
|
|
|
""" |
|
workers = int(workers) |
|
if workers == -1: |
|
workers = os.cpu_count() |
|
if workers is None: |
|
raise NotImplementedError( |
|
"Cannot determine the number of cpus using os.cpu_count(), " |
|
"cannot use -1 for the number of workers" |
|
) |
|
elif workers <= 0: |
|
raise ValueError(f"Invalid number of workers: {workers}, must be -1 " |
|
"or > 0") |
|
|
|
return workers |
|
|
|
|
|
def _validate_bounds( |
|
l_bounds: "npt.ArrayLike", u_bounds: "npt.ArrayLike", d: int |
|
) -> "tuple[npt.NDArray[np.generic], npt.NDArray[np.generic]]": |
|
"""Bounds input validation. |
|
|
|
Parameters |
|
---------- |
|
l_bounds, u_bounds : array_like (d,) |
|
Lower and upper bounds. |
|
d : int |
|
Dimension to use for broadcasting. |
|
|
|
Returns |
|
------- |
|
l_bounds, u_bounds : array_like (d,) |
|
Lower and upper bounds. |
|
|
|
""" |
|
try: |
|
lower = np.broadcast_to(l_bounds, d) |
|
upper = np.broadcast_to(u_bounds, d) |
|
except ValueError as exc: |
|
msg = ("'l_bounds' and 'u_bounds' must be broadcastable and respect" |
|
" the sample dimension") |
|
raise ValueError(msg) from exc |
|
|
|
if not np.all(lower < upper): |
|
raise ValueError("Bounds are not consistent 'l_bounds' < 'u_bounds'") |
|
|
|
return lower, upper |
|
|