|
|
|
|
|
|
|
""" |
|
Random utility function |
|
======================= |
|
This module complements missing features of ``numpy.random``. |
|
|
|
The module contains: |
|
* Several algorithms to sample integers without replacement. |
|
* Fast rand_r alternative based on xor shifts |
|
""" |
|
import numpy as np |
|
from . import check_random_state |
|
|
|
from ._typedefs cimport intp_t |
|
|
|
|
|
cdef uint32_t DEFAULT_SEED = 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
ctypedef fused default_int: |
|
intp_t |
|
long |
|
|
|
|
|
cpdef _sample_without_replacement_check_input(default_int n_population, |
|
default_int n_samples): |
|
""" Check that input are consistent for sample_without_replacement""" |
|
if n_population < 0: |
|
raise ValueError('n_population should be greater than 0, got %s.' |
|
% n_population) |
|
|
|
if n_samples > n_population: |
|
raise ValueError('n_population should be greater or equal than ' |
|
'n_samples, got n_samples > n_population (%s > %s)' |
|
% (n_samples, n_population)) |
|
|
|
|
|
cpdef _sample_without_replacement_with_tracking_selection( |
|
default_int n_population, |
|
default_int n_samples, |
|
random_state=None): |
|
r"""Sample integers without replacement. |
|
|
|
Select n_samples integers from the set [0, n_population) without |
|
replacement. |
|
|
|
Time complexity: |
|
- Worst-case: unbounded |
|
- Average-case: |
|
O(O(np.random.randint) * \sum_{i=1}^n_samples 1 / |
|
(1 - i / n_population))) |
|
<= O(O(np.random.randint) * |
|
n_population * ln((n_population - 2) |
|
/(n_population - 1 - n_samples))) |
|
<= O(O(np.random.randint) * |
|
n_population * 1 / (1 - n_samples / n_population)) |
|
|
|
Space complexity of O(n_samples) in a python set. |
|
|
|
|
|
Parameters |
|
---------- |
|
n_population : int |
|
The size of the set to sample from. |
|
|
|
n_samples : int |
|
The number of integer to sample. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
If int, random_state is the seed used by the random number generator; |
|
If RandomState instance, random_state is the random number generator; |
|
If None, the random number generator is the RandomState instance used |
|
by `np.random`. |
|
|
|
Returns |
|
------- |
|
out : ndarray of shape (n_samples,) |
|
The sampled subsets of integer. |
|
""" |
|
_sample_without_replacement_check_input(n_population, n_samples) |
|
|
|
cdef default_int i |
|
cdef default_int j |
|
cdef default_int[::1] out = np.empty((n_samples, ), dtype=int) |
|
|
|
rng = check_random_state(random_state) |
|
rng_randint = rng.randint |
|
|
|
|
|
|
|
cdef set selected = set() |
|
|
|
for i in range(n_samples): |
|
j = rng_randint(n_population) |
|
while j in selected: |
|
j = rng_randint(n_population) |
|
selected.add(j) |
|
out[i] = j |
|
|
|
return np.asarray(out) |
|
|
|
|
|
cpdef _sample_without_replacement_with_pool(default_int n_population, |
|
default_int n_samples, |
|
random_state=None): |
|
"""Sample integers without replacement. |
|
|
|
Select n_samples integers from the set [0, n_population) without |
|
replacement. |
|
|
|
Time complexity: O(n_population + O(np.random.randint) * n_samples) |
|
|
|
Space complexity of O(n_population + n_samples). |
|
|
|
|
|
Parameters |
|
---------- |
|
n_population : int |
|
The size of the set to sample from. |
|
|
|
n_samples : int |
|
The number of integer to sample. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
If int, random_state is the seed used by the random number generator; |
|
If RandomState instance, random_state is the random number generator; |
|
If None, the random number generator is the RandomState instance used |
|
by `np.random`. |
|
|
|
Returns |
|
------- |
|
out : ndarray of shape (n_samples,) |
|
The sampled subsets of integer. |
|
""" |
|
_sample_without_replacement_check_input(n_population, n_samples) |
|
|
|
cdef default_int i |
|
cdef default_int j |
|
cdef default_int[::1] out = np.empty((n_samples,), dtype=int) |
|
cdef default_int[::1] pool = np.empty((n_population,), dtype=int) |
|
|
|
rng = check_random_state(random_state) |
|
rng_randint = rng.randint |
|
|
|
|
|
for i in range(n_population): |
|
pool[i] = i |
|
|
|
|
|
|
|
for i in range(n_samples): |
|
j = rng_randint(n_population - i) |
|
out[i] = pool[j] |
|
pool[j] = pool[n_population - i - 1] |
|
|
|
return np.asarray(out) |
|
|
|
|
|
cpdef _sample_without_replacement_with_reservoir_sampling( |
|
default_int n_population, |
|
default_int n_samples, |
|
random_state=None |
|
): |
|
"""Sample integers without replacement. |
|
|
|
Select n_samples integers from the set [0, n_population) without |
|
replacement. |
|
|
|
Time complexity of |
|
O((n_population - n_samples) * O(np.random.randint) + n_samples) |
|
Space complexity of O(n_samples) |
|
|
|
|
|
Parameters |
|
---------- |
|
n_population : int |
|
The size of the set to sample from. |
|
|
|
n_samples : int |
|
The number of integer to sample. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
If int, random_state is the seed used by the random number generator; |
|
If RandomState instance, random_state is the random number generator; |
|
If None, the random number generator is the RandomState instance used |
|
by `np.random`. |
|
|
|
Returns |
|
------- |
|
out : ndarray of shape (n_samples,) |
|
The sampled subsets of integer. The order of the items is not |
|
necessarily random. Use a random permutation of the array if the order |
|
of the items has to be randomized. |
|
""" |
|
_sample_without_replacement_check_input(n_population, n_samples) |
|
|
|
cdef default_int i |
|
cdef default_int j |
|
cdef default_int[::1] out = np.empty((n_samples, ), dtype=int) |
|
|
|
rng = check_random_state(random_state) |
|
rng_randint = rng.randint |
|
|
|
|
|
|
|
|
|
|
|
for i in range(n_samples): |
|
out[i] = i |
|
|
|
for i from n_samples <= i < n_population: |
|
j = rng_randint(0, i + 1) |
|
if j < n_samples: |
|
out[j] = i |
|
|
|
return np.asarray(out) |
|
|
|
|
|
cdef _sample_without_replacement(default_int n_population, |
|
default_int n_samples, |
|
method="auto", |
|
random_state=None): |
|
"""Sample integers without replacement. |
|
|
|
Private function for the implementation, see sample_without_replacement |
|
documentation for more details. |
|
""" |
|
_sample_without_replacement_check_input(n_population, n_samples) |
|
|
|
all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool") |
|
|
|
ratio = <double> n_samples / n_population if n_population != 0.0 else 1.0 |
|
|
|
|
|
if method == "auto" and ratio > 0.01 and ratio < 0.99: |
|
rng = check_random_state(random_state) |
|
return rng.permutation(n_population)[:n_samples] |
|
|
|
if method == "auto" or method == "tracking_selection": |
|
|
|
|
|
|
|
|
|
|
|
if ratio < 0.2: |
|
return _sample_without_replacement_with_tracking_selection( |
|
n_population, n_samples, random_state) |
|
else: |
|
return _sample_without_replacement_with_reservoir_sampling( |
|
n_population, n_samples, random_state) |
|
|
|
elif method == "reservoir_sampling": |
|
return _sample_without_replacement_with_reservoir_sampling( |
|
n_population, n_samples, random_state) |
|
|
|
elif method == "pool": |
|
return _sample_without_replacement_with_pool(n_population, n_samples, |
|
random_state) |
|
else: |
|
raise ValueError('Expected a method name in %s, got %s. ' |
|
% (all_methods, method)) |
|
|
|
|
|
def sample_without_replacement( |
|
object n_population, object n_samples, method="auto", random_state=None): |
|
"""Sample integers without replacement. |
|
|
|
Select n_samples integers from the set [0, n_population) without |
|
replacement. |
|
|
|
|
|
Parameters |
|
---------- |
|
n_population : int |
|
The size of the set to sample from. |
|
|
|
n_samples : int |
|
The number of integer to sample. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
If int, random_state is the seed used by the random number generator; |
|
If RandomState instance, random_state is the random number generator; |
|
If None, the random number generator is the RandomState instance used |
|
by `np.random`. |
|
|
|
method : {"auto", "tracking_selection", "reservoir_sampling", "pool"}, \ |
|
default='auto' |
|
If method == "auto", the ratio of n_samples / n_population is used |
|
to determine which algorithm to use: |
|
If ratio is between 0 and 0.01, tracking selection is used. |
|
If ratio is between 0.01 and 0.99, numpy.random.permutation is used. |
|
If ratio is greater than 0.99, reservoir sampling is used. |
|
The order of the selected integers is undefined. If a random order is |
|
desired, the selected subset should be shuffled. |
|
|
|
If method =="tracking_selection", a set based implementation is used |
|
which is suitable for `n_samples` <<< `n_population`. |
|
|
|
If method == "reservoir_sampling", a reservoir sampling algorithm is |
|
used which is suitable for high memory constraint or when |
|
O(`n_samples`) ~ O(`n_population`). |
|
The order of the selected integers is undefined. If a random order is |
|
desired, the selected subset should be shuffled. |
|
|
|
If method == "pool", a pool based algorithm is particularly fast, even |
|
faster than the tracking selection method. However, a vector containing |
|
the entire population has to be initialized. |
|
If n_samples ~ n_population, the reservoir sampling method is faster. |
|
|
|
Returns |
|
------- |
|
out : ndarray of shape (n_samples,) |
|
The sampled subsets of integer. The subset of selected integer might |
|
not be randomized, see the method argument. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.utils.random import sample_without_replacement |
|
>>> sample_without_replacement(10, 5, random_state=42) |
|
array([8, 1, 5, 0, 7]) |
|
""" |
|
cdef: |
|
intp_t n_pop_intp, n_samples_intp |
|
long n_pop_long, n_samples_long |
|
|
|
|
|
|
|
|
|
if np.int_ is np.intp: |
|
|
|
|
|
|
|
|
|
n_pop_intp = int(n_population) |
|
n_samples_intp = int(n_samples) |
|
return _sample_without_replacement( |
|
n_pop_intp, n_samples_intp, method, random_state) |
|
else: |
|
|
|
n_pop_long = n_population |
|
n_samples_long = n_samples |
|
return _sample_without_replacement( |
|
n_pop_long, n_samples_long, method, random_state) |
|
|
|
|
|
def _our_rand_r_py(seed): |
|
"""Python utils to test the our_rand_r function""" |
|
cdef uint32_t my_seed = seed |
|
return our_rand_r(&my_seed) |
|
|