|
|
|
|
|
|
|
import numbers |
|
import sys |
|
import warnings |
|
from collections import UserList |
|
from itertools import compress, islice |
|
|
|
import numpy as np |
|
from scipy.sparse import issparse |
|
|
|
from ._array_api import _is_numpy_namespace, get_namespace |
|
from ._param_validation import Interval, validate_params |
|
from .extmath import _approximate_mode |
|
from .validation import ( |
|
_is_arraylike_not_scalar, |
|
_is_pandas_df, |
|
_is_polars_df_or_series, |
|
_use_interchange_protocol, |
|
check_array, |
|
check_consistent_length, |
|
check_random_state, |
|
) |
|
|
|
|
|
def _array_indexing(array, key, key_dtype, axis):
    """Index an array or scipy.sparse consistently across NumPy version."""
    namespace, uses_array_api = get_namespace(array)
    # Non-NumPy array API containers are indexed through their namespace.
    if uses_array_api:
        return namespace.take(array, key, axis=axis)
    # Sparse matrices do not accept a plain Python list as a boolean mask.
    if issparse(array) and key_dtype == "bool":
        key = np.asarray(key)
    # A tuple key would be interpreted as a multi-dimensional index.
    if isinstance(key, tuple):
        key = list(key)
    if axis == 0:
        return array[key, ...]
    return array[:, key]
|
|
|
|
|
def _pandas_indexing(X, key, key_dtype, axis):
    """Index a pandas dataframe or a series."""
    if _is_arraylike_not_scalar(key):
        key = np.asarray(key)

    is_positional_array = key_dtype == "int" and not (
        isinstance(key, slice) or np.isscalar(key)
    )
    if is_positional_array:
        # take() handles positional array indexing on both axes.
        return X.take(key, axis=axis)
    # iloc for positional (integer) keys, loc for label-based keys.
    indexer = X.iloc if key_dtype == "int" else X.loc
    if axis:
        return indexer[:, key]
    return indexer[key]
|
|
|
|
|
def _list_indexing(X, key, key_dtype): |
|
"""Index a Python list.""" |
|
if np.isscalar(key) or isinstance(key, slice): |
|
|
|
return X[key] |
|
if key_dtype == "bool": |
|
|
|
return list(compress(X, key)) |
|
|
|
return [X[idx] for idx in key] |
|
|
|
|
|
def _polars_indexing(X, key, key_dtype, axis):
    """Indexing X with polars interchange protocol."""
    # Polars does not accept NumPy arrays (nor arbitrary iterables) as
    # indexers, so normalize them to plain Python lists first.
    if isinstance(key, np.ndarray):
        key = key.tolist()
    elif not (np.isscalar(key) or isinstance(key, slice)):
        key = list(key)

    if axis == 1:
        # Column selection uses polars' native 2D indexing.
        return X[:, key]

    # Row selection with a boolean mask goes through filter().
    if key_dtype == "bool":
        return X.filter(key)

    subset = X[key]
    if np.isscalar(key) and len(X.shape) == 2:
        # A scalar row key on a DataFrame yields a one-row frame; convert it
        # to a Series so the result is 1D, consistent with other containers.
        pl = sys.modules["polars"]
        return pl.Series(subset.row(0))
    return subset
|
|
|
|
|
def _determine_key_type(key, accept_slice=True): |
|
"""Determine the data type of key. |
|
|
|
Parameters |
|
---------- |
|
key : scalar, slice or array-like |
|
The key from which we want to infer the data type. |
|
|
|
accept_slice : bool, default=True |
|
Whether or not to raise an error if the key is a slice. |
|
|
|
Returns |
|
------- |
|
dtype : {'int', 'str', 'bool', None} |
|
Returns the data type of key. |
|
""" |
|
err_msg = ( |
|
"No valid specification of the columns. Only a scalar, list or " |
|
"slice of all integers or all strings, or boolean mask is " |
|
"allowed" |
|
) |
|
|
|
dtype_to_str = {int: "int", str: "str", bool: "bool", np.bool_: "bool"} |
|
array_dtype_to_str = { |
|
"i": "int", |
|
"u": "int", |
|
"b": "bool", |
|
"O": "str", |
|
"U": "str", |
|
"S": "str", |
|
} |
|
|
|
if key is None: |
|
return None |
|
if isinstance(key, tuple(dtype_to_str.keys())): |
|
try: |
|
return dtype_to_str[type(key)] |
|
except KeyError: |
|
raise ValueError(err_msg) |
|
if isinstance(key, slice): |
|
if not accept_slice: |
|
raise TypeError( |
|
"Only array-like or scalar are supported. A Python slice was given." |
|
) |
|
if key.start is None and key.stop is None: |
|
return None |
|
key_start_type = _determine_key_type(key.start) |
|
key_stop_type = _determine_key_type(key.stop) |
|
if key_start_type is not None and key_stop_type is not None: |
|
if key_start_type != key_stop_type: |
|
raise ValueError(err_msg) |
|
if key_start_type is not None: |
|
return key_start_type |
|
return key_stop_type |
|
|
|
|
|
if isinstance(key, (list, tuple, UserList)): |
|
unique_key = set(key) |
|
key_type = {_determine_key_type(elt) for elt in unique_key} |
|
if not key_type: |
|
return None |
|
if len(key_type) != 1: |
|
raise ValueError(err_msg) |
|
return key_type.pop() |
|
if hasattr(key, "dtype"): |
|
xp, is_array_api = get_namespace(key) |
|
|
|
|
|
|
|
if is_array_api and not _is_numpy_namespace(xp): |
|
if xp.isdtype(key.dtype, "bool"): |
|
return "bool" |
|
elif xp.isdtype(key.dtype, "integral"): |
|
return "int" |
|
else: |
|
raise ValueError(err_msg) |
|
else: |
|
try: |
|
return array_dtype_to_str[key.dtype.kind] |
|
except KeyError: |
|
raise ValueError(err_msg) |
|
raise ValueError(err_msg) |
|
|
|
|
|
def _safe_indexing(X, indices, *, axis=0):
    """Return rows, items or columns of X using indices.

    .. warning::

        This utility is documented, but **private**. This means that
        backward compatibility might be broken without any deprecation
        cycle.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
        Data from which to sample rows, items or columns. `list` are only
        supported when `axis=0`.
    indices : bool, int, str, slice, array-like
        - If `axis=0`, boolean and integer array-like, integer slice,
          and scalar integer are supported.
        - If `axis=1`:
            - to select a single column, `indices` can be of `int` type for
              all `X` types and `str` only for dataframe. The selected subset
              will be 1D, unless `X` is a sparse matrix in which case it will
              be 2D.
            - to select multiples columns, `indices` can be one of the
              following: `list`, `array`, `slice`. The type used in
              these containers can be one of the following: `int`, 'bool' and
              `str`. However, `str` is only supported when `X` is a dataframe.
              The selected subset will be 2D.
    axis : int, default=0
        The axis along which `X` will be subsampled. `axis=0` will select
        rows while `axis=1` will select columns.

    Returns
    -------
    subset
        Subset of X on axis 0 or 1.

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils import _safe_indexing
    >>> data = np.array([[1, 2], [3, 4], [5, 6]])
    >>> _safe_indexing(data, 0, axis=0)  # select the first row
    array([1, 2])
    >>> _safe_indexing(data, 0, axis=1)  # select the first column
    array([1, 3, 5])
    """
    # A None key is a no-op: return the data untouched.
    if indices is None:
        return X

    if axis not in (0, 1):
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index "
            " column). Got {} instead.".format(axis)
        )

    key_type = _determine_key_type(indices)

    # Validation order is significant: each check below reports the most
    # specific problem for the given (X, indices, axis) combination.
    if axis == 0 and key_type == "str":
        raise ValueError("String indexing is not supported with 'axis=0'")

    if axis == 1 and isinstance(X, list):
        raise ValueError("axis=1 is not supported for lists")

    if axis == 1 and hasattr(X, "shape") and len(X.shape) != 2:
        raise ValueError(
            "'X' should be a 2D NumPy array, 2D sparse matrix or "
            "dataframe when indexing the columns (i.e. 'axis=1'). "
            "Got {} instead with {} dimension(s).".format(type(X), len(X.shape))
        )

    str_key_without_dataframe = (
        axis == 1
        and key_type == "str"
        and not (_is_pandas_df(X) or _use_interchange_protocol(X))
    )
    if str_key_without_dataframe:
        raise ValueError(
            "Specifying the columns using strings is only supported for dataframes."
        )

    # Dispatch to the container-specific indexing helper.
    if hasattr(X, "iloc"):
        # pandas object: preserve index/column metadata.
        return _pandas_indexing(X, indices, key_type, axis=axis)
    if _is_polars_df_or_series(X):
        return _polars_indexing(X, indices, key_type, axis=axis)
    if hasattr(X, "shape"):
        # NumPy array, array API container, or sparse matrix.
        return _array_indexing(X, indices, key_type, axis=axis)
    return _list_indexing(X, indices, key_type)
|
|
|
|
|
def _safe_assign(X, values, *, row_indexer=None, column_indexer=None): |
|
"""Safe assignment to a numpy array, sparse matrix, or pandas dataframe. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse-matrix, dataframe} |
|
Array to be modified. It is expected to be 2-dimensional. |
|
|
|
values : ndarray |
|
The values to be assigned to `X`. |
|
|
|
row_indexer : array-like, dtype={int, bool}, default=None |
|
A 1-dimensional array to select the rows of interest. If `None`, all |
|
rows are selected. |
|
|
|
column_indexer : array-like, dtype={int, bool}, default=None |
|
A 1-dimensional array to select the columns of interest. If `None`, all |
|
columns are selected. |
|
""" |
|
row_indexer = slice(None, None, None) if row_indexer is None else row_indexer |
|
column_indexer = ( |
|
slice(None, None, None) if column_indexer is None else column_indexer |
|
) |
|
|
|
if hasattr(X, "iloc"): |
|
with warnings.catch_warnings(): |
|
|
|
|
|
|
|
|
|
|
|
warnings.simplefilter("ignore", FutureWarning) |
|
X.iloc[row_indexer, column_indexer] = values |
|
else: |
|
X[row_indexer, column_indexer] = values |
|
|
|
|
|
def _get_column_indices_for_bool_or_int(key, n_columns):
    # Resolve boolean masks, integers and integer slices to a list of
    # positional indices by indexing an arange with the key.
    try:
        positions = _safe_indexing(np.arange(n_columns), key)
    except IndexError as e:
        raise ValueError(
            f"all features must be in [0, {n_columns - 1}] or [-{n_columns}, 0]"
        ) from e
    # A scalar key yields a 0d result; atleast_1d normalizes it to a list.
    return np.atleast_1d(positions).tolist()
|
|
|
|
|
def _get_column_indices(X, key):
    """Get feature column indices for input data X and key.

    For accepted values of `key`, see the docstring of
    :func:`_safe_indexing`.
    """
    key_dtype = _determine_key_type(key)
    if _use_interchange_protocol(X):
        return _get_column_indices_interchange(X.__dataframe__(), key, key_dtype)

    n_columns = X.shape[1]
    if isinstance(key, (list, tuple)) and not key:
        # Empty selection: nothing to resolve.
        return []
    if key_dtype in ("bool", "int"):
        return _get_column_indices_for_bool_or_int(key, n_columns)

    # From here on the key is string-based, which requires named columns.
    try:
        all_columns = X.columns
    except AttributeError:
        raise ValueError(
            "Specifying the columns using strings is only supported for dataframes."
        )
    if isinstance(key, str):
        columns = [key]
    elif isinstance(key, slice):
        start, stop = key.start, key.stop
        if start is not None:
            start = all_columns.get_loc(start)
        if stop is not None:
            # Label slices include the right boundary (pandas convention).
            stop = all_columns.get_loc(stop) + 1
        else:
            stop = n_columns + 1
        return list(islice(range(n_columns), start, stop))
    else:
        columns = list(key)

    def _resolve(col):
        # Resolve one label to a single positional index; a non-integer
        # result from get_loc means the label is duplicated.
        idx = all_columns.get_loc(col)
        if not isinstance(idx, numbers.Integral):
            raise ValueError(
                f"Selected columns, {columns}, are not unique in dataframe"
            )
        return idx

    try:
        column_indices = [_resolve(col) for col in columns]
    except KeyError as e:
        raise ValueError("A given column is not a column of the dataframe") from e

    return column_indices
|
|
|
|
|
def _get_column_indices_interchange(X_interchange, key, key_dtype): |
|
"""Same as _get_column_indices but for X with __dataframe__ protocol.""" |
|
|
|
n_columns = X_interchange.num_columns() |
|
|
|
if isinstance(key, (list, tuple)) and not key: |
|
|
|
return [] |
|
elif key_dtype in ("bool", "int"): |
|
return _get_column_indices_for_bool_or_int(key, n_columns) |
|
else: |
|
column_names = list(X_interchange.column_names()) |
|
|
|
if isinstance(key, slice): |
|
if key.step not in [1, None]: |
|
raise NotImplementedError("key.step must be 1 or None") |
|
start, stop = key.start, key.stop |
|
if start is not None: |
|
start = column_names.index(start) |
|
|
|
if stop is not None: |
|
stop = column_names.index(stop) + 1 |
|
else: |
|
stop = n_columns + 1 |
|
return list(islice(range(n_columns), start, stop)) |
|
|
|
selected_columns = [key] if np.isscalar(key) else key |
|
|
|
try: |
|
return [column_names.index(col) for col in selected_columns] |
|
except ValueError as e: |
|
raise ValueError("A given column is not a column of the dataframe") from e |
|
|
|
|
|
@validate_params(
    {
        "replace": ["boolean"],
        "n_samples": [Interval(numbers.Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
        "stratify": ["array-like", "sparse matrix", None],
    },
    prefer_skip_nested_validation=True,
)
def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):
    """Resample arrays or sparse matrices in a consistent way.

    The default strategy implements one step of the bootstrapping
    procedure.

    Parameters
    ----------
    *arrays : sequence of array-like of shape (n_samples,) or \
            (n_samples, n_outputs)
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    replace : bool, default=True
        Implements resampling with replacement. If False, this will implement
        (sliced) random permutations.

    n_samples : int, default=None
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays.
        If replace is False it should not be larger than the length of
        arrays.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for shuffling
        the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    stratify : {array-like, sparse matrix} of shape (n_samples,) or \
            (n_samples, n_outputs), default=None
        If not None, data is split in a stratified fashion, using this as
        the class labels.

    Returns
    -------
    resampled_arrays : sequence of array-like of shape (n_samples,) or \
            (n_samples, n_outputs)
        Sequence of resampled copies of the collections. The original arrays
        are not impacted.

    See Also
    --------
    shuffle : Shuffle arrays or sparse matrices in a consistent way.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> import numpy as np
      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import resample
      >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
      >>> X
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse
      <Compressed Sparse Row sparse matrix of dtype 'float64'
          with 4 stored elements and shape (3, 2)>

      >>> X_sparse.toarray()
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([0, 1, 0])

      >>> resample(y, n_samples=2, random_state=0)
      array([0, 1])

    Example using stratification::

      >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]
      >>> resample(y, n_samples=5, replace=False, stratify=y,
      ...          random_state=0)
      [1, 1, 1, 0, 1]
    """
    # `n_samples` is reused below as the number of available samples, so the
    # requested output size is kept under a separate name.
    max_n_samples = n_samples
    random_state = check_random_state(random_state)

    if len(arrays) == 0:
        return None

    first = arrays[0]
    # Number of available samples, taken from the first array.
    n_samples = first.shape[0] if hasattr(first, "shape") else len(first)

    if max_n_samples is None:
        max_n_samples = n_samples
    elif (max_n_samples > n_samples) and (not replace):
        # Without replacement we cannot draw more samples than we have.
        raise ValueError(
            "Cannot sample %d out of arrays with dim %d when replace is False"
            % (max_n_samples, n_samples)
        )

    check_consistent_length(*arrays)

    if stratify is None:
        if replace:
            # Bootstrap: draw indices uniformly with replacement.
            indices = random_state.randint(0, n_samples, size=(max_n_samples,))
        else:
            # Random permutation truncated to the requested size.
            indices = np.arange(n_samples)
            random_state.shuffle(indices)
            indices = indices[:max_n_samples]
    else:
        # Stratified resampling: preserve the class proportions of `stratify`.
        y = check_array(stratify, ensure_2d=False, dtype=None)
        if y.ndim == 2:
            # Multi-output labels: treat each unique row as one joint class
            # by joining the row values into a single string label.
            y = np.array([" ".join(row.astype("str")) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)

        # Stable (mergesort) argsort groups sample indices by class;
        # splitting at the cumulative counts gives one index array per class.
        class_indices = np.split(
            np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
        )

        # Number of samples to draw from each class, allocated proportionally.
        n_i = _approximate_mode(class_counts, max_n_samples, random_state)

        indices = []

        for i in range(n_classes):
            indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace)
            indices.extend(indices_i)

        # Shuffle so the output is not ordered by class.
        indices = random_state.permutation(indices)

    # COO matrices do not support row indexing; convert them to CSR first.
    arrays = [a.tocsr() if issparse(a) else a for a in arrays]
    resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
    if len(resampled_arrays) == 1:
        # Single input: return the array itself rather than a 1-element list.
        return resampled_arrays[0]
    else:
        return resampled_arrays
|
|
|
|
|
def shuffle(*arrays, random_state=None, n_samples=None):
    """Shuffle arrays or sparse matrices in a consistent way.

    This is a convenience alias to ``resample(*arrays, replace=False)`` to do
    random permutations of the collections.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for shuffling
        the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_samples : int, default=None
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays. It should
        not be larger than the length of arrays.

    Returns
    -------
    shuffled_arrays : sequence of indexable data-structures
        Sequence of shuffled copies of the collections. The original arrays
        are not impacted.

    See Also
    --------
    resample : Resample arrays or sparse matrices in a consistent way.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> import numpy as np
      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import shuffle
      >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
      >>> X
      array([[0., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse
      <Compressed Sparse Row sparse matrix of dtype 'float64'
          with 3 stored elements and shape (3, 2)>

      >>> X_sparse.toarray()
      array([[0., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([2, 1, 0])

      >>> shuffle(y, n_samples=2, random_state=0)
      array([0, 1])
    """
    # Shuffling is sampling without replacement; everything else is
    # delegated to resample.
    return resample(
        *arrays, replace=False, random_state=random_state, n_samples=n_samples
    )
|
|