|
""" |
|
HDBSCAN: Hierarchical Density-Based Spatial Clustering
         of Applications with Noise
"""
|
from numbers import Integral, Real |
|
from warnings import warn |
|
|
|
import numpy as np |
|
from scipy.sparse import csgraph, issparse |
|
|
|
from ...base import BaseEstimator, ClusterMixin, _fit_context |
|
from ...metrics import pairwise_distances |
|
from ...metrics._dist_metrics import DistanceMetric |
|
from ...metrics.pairwise import _VALID_METRICS |
|
from ...neighbors import BallTree, KDTree, NearestNeighbors |
|
from ...utils._param_validation import Interval, StrOptions |
|
from ...utils.validation import ( |
|
_allclose_dense_sparse, |
|
_assert_all_finite, |
|
validate_data, |
|
) |
|
from ._linkage import ( |
|
MST_edge_dtype, |
|
make_single_linkage, |
|
mst_from_data_matrix, |
|
mst_from_mutual_reachability, |
|
) |
|
from ._reachability import mutual_reachability_graph |
|
from ._tree import HIERARCHY_dtype, labelling_at_cut, tree_to_labels |
|
|
|
FAST_METRICS = set(KDTree.valid_metrics + BallTree.valid_metrics)

# Encodings are arbitrary but must be strictly negative.
# The current encodings are chosen as extensions to the -1 noise label.
# Enums are avoided so that the end user only deals with simple labels.
|
_OUTLIER_ENCODING: dict = {
    "infinite": {
        "label": -2,
        # The probability could also be 1, since infinite points are certainly
        # "infinite" outliers; 0 follows the convention of the original HDBSCAN
        # library implementation.
        "prob": 0,
    },
    "missing": {
        "label": -3,
        # A NaN probability emphasizes that the corresponding sample was not
        # considered in the clustering problem at all.
        "prob": np.nan,
    },
}
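
# A minimal sketch of how this encoding is consumed downstream (hypothetical
# `labels`/`probabilities` arrays and outlier index arrays; see `HDBSCAN.fit`
# for the actual usage):
#
#   labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"]      # -2
#   probabilities[missing_index] = _OUTLIER_ENCODING["missing"]["prob"]  # nan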
|
|
|
|
|
def _brute_mst(mutual_reachability, min_samples): |
|
""" |
|
Builds a minimum spanning tree (MST) from the provided mutual-reachability |
|
values. This function dispatches to a custom Cython implementation for |
|
dense arrays, and `scipy.sparse.csgraph.minimum_spanning_tree` for sparse |
|
arrays/matrices. |
|
|
|
Parameters |
|
---------- |
|
    mutual_reachability : {ndarray, sparse matrix} of shape \
|
(n_samples, n_samples) |
|
Weighted adjacency matrix of the mutual reachability graph. |
|
|
|
    min_samples : int
|
The number of samples in a neighborhood for a point |
|
to be considered as a core point. This includes the point itself. |
|
|
|
Returns |
|
------- |
|
mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype |
|
The MST representation of the mutual-reachability graph. The MST is |
|
represented as a collection of edges. |
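
    Notes
    -----
    A minimal sketch of the dense code path (illustrative only; the sparse
    path additionally validates neighborhood sizes and connectivity)::

        import numpy as np

        # Symmetric mutual-reachability matrix for three points.
        mutual_reachability = np.array(
            [
                [0.0, 1.0, 4.0],
                [1.0, 0.0, 2.0],
                [4.0, 2.0, 0.0],
            ]
        )
        mst = _brute_mst(mutual_reachability, min_samples=2)
        # `mst` holds n_samples - 1 edges of `MST_edge_dtype`, i.e. records
        # of (current_node, next_node, distance).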
|
""" |
|
if not issparse(mutual_reachability): |
|
        return mst_from_mutual_reachability(mutual_reachability)

    # Check whether any row of the sparse mutual-reachability matrix has fewer
    # than `min_samples` non-zero entries.
indptr = mutual_reachability.indptr |
|
num_points = mutual_reachability.shape[0] |
|
if any((indptr[i + 1] - indptr[i]) < min_samples for i in range(num_points)): |
|
        raise ValueError(
            f"There exist points with fewer than {min_samples} neighbors. Ensure"
            " that your distance matrix has non-zero values for at least"
            f" `min_samples`={min_samples} neighbors for each point (i.e. a k-NN"
            " graph), or specify a `max_distance` in `metric_params` to use when"
            " distances are missing."
|
        )

    # The MST is only defined on a connected graph; detect a disconnected
    # mutual-reachability graph early and report it.
n_components = csgraph.connected_components( |
|
mutual_reachability, directed=False, return_labels=False |
|
) |
|
if n_components > 1: |
|
        raise ValueError(
            f"Sparse mutual reachability matrix has {n_components} connected"
            " components. HDBSCAN cannot be performed on a disconnected graph."
            " Ensure that the sparse distance matrix has only one connected"
            " component."
|
        )

    # Compute the minimum spanning tree over the sparse mutual-reachability
    # graph.
sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability) |
|
rows, cols = sparse_min_spanning_tree.nonzero() |
|
mst = np.rec.fromarrays( |
|
[rows, cols, sparse_min_spanning_tree.data], |
|
dtype=MST_edge_dtype, |
|
) |
|
return mst |
|
|
|
|
|
def _process_mst(min_spanning_tree): |
|
""" |
|
Builds a single-linkage tree (SLT) from the provided minimum spanning tree |
|
(MST). The MST is first sorted then processed by a custom Cython routine. |
|
|
|
Parameters |
|
---------- |
|
min_spanning_tree : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype |
|
The MST representation of the mutual-reachability graph. The MST is |
|
represented as a collection of edges. |
|
|
|
Returns |
|
------- |
|
single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype |
|
        The single-linkage tree (dendrogram) built from the MST.
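
    Notes
    -----
    A minimal sketch chaining the MST construction and processing steps
    (dense path, illustrative only; `mutual_reachability` as in the
    `_brute_mst` sketch above)::

        mst = _brute_mst(mutual_reachability, min_samples=2)
        single_linkage = _process_mst(mst)
        # Each entry of `single_linkage` encodes one merge of the dendrogram.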
|
""" |
|
|
|
    # Sort the MST edges by weight before building the single-linkage tree.
    row_order = np.argsort(min_spanning_tree["distance"])
|
min_spanning_tree = min_spanning_tree[row_order] |
|
|
|
return make_single_linkage(min_spanning_tree) |
|
|
|
|
|
def _hdbscan_brute( |
|
X, |
|
min_samples=5, |
|
    alpha=1.0,
|
metric="euclidean", |
|
n_jobs=None, |
|
copy=False, |
|
**metric_params, |
|
): |
|
""" |
|
Builds a single-linkage tree (SLT) from the input data `X`. If |
|
`metric="precomputed"` then `X` must be a symmetric array of distances. |
|
Otherwise, the pairwise distances are calculated directly and passed to |
|
`mutual_reachability_graph`. |
|
|
|
Parameters |
|
---------- |
|
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) |
|
Either the raw data from which to compute the pairwise distances, |
|
or the precomputed distances. |
|
|
|
    min_samples : int, default=5
|
The number of samples in a neighborhood for a point |
|
to be considered as a core point. This includes the point itself. |
|
|
|
alpha : float, default=1.0 |
|
A distance scaling parameter as used in robust single linkage. |
|
|
|
metric : str or callable, default='euclidean' |
|
The metric to use when calculating distance between instances in a |
|
feature array. |
|
|
|
- If metric is a string or callable, it must be one of |
|
the options allowed by :func:`~sklearn.metrics.pairwise_distances` |
|
for its metric parameter. |
|
|
|
- If metric is "precomputed", X is assumed to be a distance matrix and |
|
must be square. |
|
|
|
n_jobs : int, default=None |
|
The number of jobs to use for computing the pairwise distances. This |
|
works by breaking down the pairwise matrix into n_jobs even slices and |
|
computing them in parallel. This parameter is passed directly to |
|
:func:`~sklearn.metrics.pairwise_distances`. |
|
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
|
|
    copy : bool, default=False
        If `copy=True` then any time in-place modifications would otherwise
        overwrite `X`, a copy is made first, guaranteeing that the original
        data remains unchanged. Currently, this only applies when
        `metric="precomputed"`, when passing a dense array or a CSR sparse
        array/matrix.
|
|
|
metric_params : dict, default=None |
|
Arguments passed to the distance metric. |
|
|
|
Returns |
|
------- |
|
single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype |
|
        The single-linkage tree (dendrogram) built from the MST.
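
    Notes
    -----
    A minimal sketch with a precomputed distance matrix (hypothetical data;
    `copy=True` keeps `D` intact, since the mutual-reachability computation
    may otherwise modify it in place)::

        import numpy as np
        from sklearn.metrics import pairwise_distances

        X = np.random.RandomState(0).uniform(size=(20, 2))
        D = pairwise_distances(X)
        slt = _hdbscan_brute(D, min_samples=3, metric="precomputed", copy=True)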
|
""" |
|
if metric == "precomputed": |
|
if X.shape[0] != X.shape[1]: |
|
            raise ValueError(
                "The precomputed distance matrix is expected to be square,"
                f" however it has shape {X.shape}. Please verify that the"
                " distance matrix was constructed correctly."
|
) |
|
if not _allclose_dense_sparse(X, X.T): |
|
raise ValueError( |
|
"The precomputed distance matrix is expected to be symmetric, however" |
|
" its values appear to be asymmetric. Please verify that the distance" |
|
" matrix was constructed correctly." |
|
) |
|
|
|
distance_matrix = X.copy() if copy else X |
|
else: |
|
distance_matrix = pairwise_distances( |
|
X, metric=metric, n_jobs=n_jobs, **metric_params |
|
) |
|
distance_matrix /= alpha |
|
|
|
max_distance = metric_params.get("max_distance", 0.0) |
|
    if issparse(distance_matrix) and distance_matrix.format != "csr":
        # The sparse helpers downstream assume CSR (e.g. the `indptr` access
        # in `_brute_mst`).
        distance_matrix = distance_matrix.tocsr()
|
|
|
|
|
|
|
    # `distance_matrix` may be modified in place from here on; it is not
    # needed again past this point, so this is safe (see the `copy`
    # parameter).
    mutual_reachability_ = mutual_reachability_graph(
|
distance_matrix, min_samples=min_samples, max_distance=max_distance |
|
) |
|
min_spanning_tree = _brute_mst(mutual_reachability_, min_samples=min_samples) |
|
|
|
if np.isinf(min_spanning_tree["distance"]).any(): |
|
warn( |
|
            (
                "The minimum spanning tree contains edge weights with value "
                "infinity. You may be missing too many distances in the "
                "initial distance matrix for the given neighborhood size."
|
), |
|
UserWarning, |
|
) |
|
return _process_mst(min_spanning_tree) |
|
|
|
|
|
def _hdbscan_prims( |
|
X, |
|
algo, |
|
min_samples=5, |
|
alpha=1.0, |
|
metric="euclidean", |
|
leaf_size=40, |
|
n_jobs=None, |
|
**metric_params, |
|
): |
|
""" |
|
    Builds a single-linkage tree (SLT) from the input data `X` using Prim's
    algorithm. Core distances are computed via a nearest-neighbors search
    backed by a KDTree or BallTree, and the MST is then constructed over the
    induced mutual-reachability space.
|
|
|
Parameters |
|
---------- |
|
X : ndarray of shape (n_samples, n_features) |
|
        The raw data.

    algo : {"kd_tree", "ball_tree"}
        Which nearest-neighbors implementation to use for computing the core
        distances.

    min_samples : int, default=5
|
The number of samples in a neighborhood for a point |
|
to be considered as a core point. This includes the point itself. |
|
|
|
alpha : float, default=1.0 |
|
A distance scaling parameter as used in robust single linkage. |
|
|
|
metric : str or callable, default='euclidean' |
|
The metric to use when calculating distance between instances in a |
|
feature array. `metric` must be one of the options allowed by |
|
:func:`~sklearn.metrics.pairwise_distances` for its metric |
|
        parameter.

    leaf_size : int, default=40
        Leaf size passed to the KDTree or BallTree used for the core-distance
        queries.
|
|
|
n_jobs : int, default=None |
|
The number of jobs to use for computing the pairwise distances. This |
|
works by breaking down the pairwise matrix into n_jobs even slices and |
|
computing them in parallel. This parameter is passed directly to |
|
:func:`~sklearn.metrics.pairwise_distances`. |
|
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
|
|
|
|
|
metric_params : dict, default=None |
|
Arguments passed to the distance metric. |
|
|
|
Returns |
|
------- |
|
single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype |
|
        The single-linkage tree (dendrogram) built from the MST.
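
    Notes
    -----
    A minimal sketch on raw features (hypothetical data)::

        import numpy as np

        X = np.random.RandomState(0).uniform(size=(50, 2))
        slt = _hdbscan_prims(X, algo="kd_tree", min_samples=5)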
|
""" |
|
|
|
    # The Cython MST routine requires a C-contiguous array.
    X = np.asarray(X, order="C")
|
|
|
|
|
nbrs = NearestNeighbors( |
|
n_neighbors=min_samples, |
|
algorithm=algo, |
|
leaf_size=leaf_size, |
|
metric=metric, |
|
metric_params=metric_params, |
|
n_jobs=n_jobs, |
|
p=None, |
|
).fit(X) |
|
|
|
neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True) |
|
core_distances = np.ascontiguousarray(neighbors_distances[:, -1]) |
|
dist_metric = DistanceMetric.get_metric(metric, **metric_params) |
|
|
|
|
|
    # The mutual-reachability distances are computed implicitly inside
    # `mst_from_data_matrix`.
    min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha)
|
return _process_mst(min_spanning_tree) |
|
|
|
|
|
def remap_single_linkage_tree(tree, internal_to_raw, non_finite): |
|
""" |
|
    Takes an internal single-linkage tree structure, adds back the points that
    were initially detected as non-finite, and returns the new tree.
    These points are all merged into the final node at `np.inf` distance and
    considered noise points.
|
|
|
Parameters |
|
---------- |
|
    tree : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
        The single-linkage tree (dendrogram) built from the MST.

    internal_to_raw : dict
        A mapping from the internal integer index to the raw integer index.

    non_finite : iterable
        The raw indices of the samples that were detected as non-finite.

    Returns
    -------
    tree : ndarray, dtype=HIERARCHY_dtype
        The remapped tree, with the non-finite points merged into the final
        node at `np.inf` distance.
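
    Notes
    -----
    A minimal sketch of the index bookkeeping (hypothetical values)::

        # Five raw samples where raw index 2 was non-finite: the clustering
        # ran on internal indices 0..3, so internal indices 2 and 3 map back
        # to raw indices 3 and 4.
        internal_to_raw = {0: 0, 1: 1, 2: 3, 3: 4}
        non_finite = [2]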
|
""" |
|
finite_count = len(internal_to_raw) |
|
|
|
outlier_count = len(non_finite) |
|
for i, _ in enumerate(tree): |
|
left = tree[i]["left_node"] |
|
right = tree[i]["right_node"] |
|
|
|
if left < finite_count: |
|
tree[i]["left_node"] = internal_to_raw[left] |
|
else: |
|
tree[i]["left_node"] = left + outlier_count |
|
if right < finite_count: |
|
tree[i]["right_node"] = internal_to_raw[right] |
|
else: |
|
tree[i]["right_node"] = right + outlier_count |
|
|
|
outlier_tree = np.zeros(len(non_finite), dtype=HIERARCHY_dtype) |
|
last_cluster_id = max( |
|
tree[tree.shape[0] - 1]["left_node"], tree[tree.shape[0] - 1]["right_node"] |
|
) |
|
last_cluster_size = tree[tree.shape[0] - 1]["cluster_size"] |
|
for i, outlier in enumerate(non_finite): |
|
outlier_tree[i] = (outlier, last_cluster_id + 1, np.inf, last_cluster_size + 1) |
|
last_cluster_id += 1 |
|
last_cluster_size += 1 |
|
tree = np.concatenate([tree, outlier_tree]) |
|
return tree |
|
|
|
|
|
def _get_finite_row_indices(matrix): |
|
""" |
|
Returns the indices of the purely finite rows of a |
|
    sparse matrix or dense ndarray.
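
    A minimal sketch (illustrative only)::

        import numpy as np

        X = np.array([[0.0, 1.0], [np.nan, 2.0], [3.0, np.inf]])
        _get_finite_row_indices(X)  # array([0])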
|
""" |
|
if issparse(matrix): |
|
row_indices = np.array( |
|
[i for i, row in enumerate(matrix.tolil().data) if np.all(np.isfinite(row))] |
|
) |
|
else: |
|
(row_indices,) = np.isfinite(matrix.sum(axis=1)).nonzero() |
|
return row_indices |
|
|
|
|
|
class HDBSCAN(ClusterMixin, BaseEstimator): |
|
"""Cluster data using hierarchical density-based clustering. |
|
|
|
HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications |
|
with Noise. Performs :class:`~sklearn.cluster.DBSCAN` over varying epsilon |
|
values and integrates the result to find a clustering that gives the best |
|
stability over epsilon. |
|
This allows HDBSCAN to find clusters of varying densities (unlike |
|
:class:`~sklearn.cluster.DBSCAN`), and be more robust to parameter selection. |
|
Read more in the :ref:`User Guide <hdbscan>`. |
|
|
|
For an example of how to use HDBSCAN, as well as a comparison to |
|
:class:`~sklearn.cluster.DBSCAN`, please see the :ref:`plotting demo |
|
<sphx_glr_auto_examples_cluster_plot_hdbscan.py>`. |
|
|
|
.. versionadded:: 1.3 |
|
|
|
Parameters |
|
---------- |
|
min_cluster_size : int, default=5 |
|
The minimum number of samples in a group for that group to be |
|
considered a cluster; groupings smaller than this size will be left |
|
as noise. |
|
|
|
min_samples : int, default=None |
|
The parameter `k` used to calculate the distance between a point |
|
`x_p` and its k-th nearest neighbor. |
|
When `None`, defaults to `min_cluster_size`. |
|
|
|
cluster_selection_epsilon : float, default=0.0 |
|
A distance threshold. Clusters below this value will be merged. |
|
See [5]_ for more information. |
|
|
|
max_cluster_size : int, default=None |
|
A limit to the size of clusters returned by the `"eom"` cluster |
|
selection algorithm. There is no limit when `max_cluster_size=None`. |
|
Has no effect if `cluster_selection_method="leaf"`. |
|
|
|
metric : str or callable, default='euclidean' |
|
The metric to use when calculating distance between instances in a |
|
feature array. |
|
|
|
- If metric is a string or callable, it must be one of |
|
the options allowed by :func:`~sklearn.metrics.pairwise_distances` |
|
for its metric parameter. |
|
|
|
- If metric is "precomputed", X is assumed to be a distance matrix and |
|
must be square. |
|
|
|
metric_params : dict, default=None |
|
Arguments passed to the distance metric. |
|
|
|
alpha : float, default=1.0 |
|
A distance scaling parameter as used in robust single linkage. |
|
See [3]_ for more information. |
|
|
|
algorithm : {"auto", "brute", "kd_tree", "ball_tree"}, default="auto" |
|
        Exactly which algorithm to use for computing core distances; by
        default this is set to `"auto"`, which attempts to use a
        :class:`~sklearn.neighbors.KDTree` if possible, otherwise falling
        back to a :class:`~sklearn.neighbors.BallTree`. Both the `"kd_tree"`
        and `"ball_tree"` options use the
        :class:`~sklearn.neighbors.NearestNeighbors` estimator.
|
|
|
If the `X` passed during `fit` is sparse or `metric` is invalid for |
|
both :class:`~sklearn.neighbors.KDTree` and |
|
:class:`~sklearn.neighbors.BallTree`, then it resolves to use the |
|
`"brute"` algorithm. |
|
|
|
leaf_size : int, default=40 |
|
Leaf size for trees responsible for fast nearest neighbour queries when |
|
a KDTree or a BallTree are used as core-distance algorithms. A large |
|
dataset size and small `leaf_size` may induce excessive memory usage. |
|
If you are running out of memory consider increasing the `leaf_size` |
|
parameter. Ignored for `algorithm="brute"`. |
|
|
|
n_jobs : int, default=None |
|
Number of jobs to run in parallel to calculate distances. |
|
`None` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
`-1` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
|
|
cluster_selection_method : {"eom", "leaf"}, default="eom" |
|
The method used to select clusters from the condensed tree. The |
|
standard approach for HDBSCAN* is to use an Excess of Mass (`"eom"`) |
|
algorithm to find the most persistent clusters. Alternatively you can |
|
instead select the clusters at the leaves of the tree -- this provides |
|
the most fine grained and homogeneous clusters. |
|
|
|
allow_single_cluster : bool, default=False |
|
By default HDBSCAN* will not produce a single cluster, setting this |
|
to True will override this and allow single cluster results in |
|
the case that you feel this is a valid result for your dataset. |
|
|
|
store_centers : str, default=None |
|
Which, if any, cluster centers to compute and store. The options are: |
|
|
|
        - `None` which does not compute or store any centers.
|
- `"centroid"` which calculates the center by taking the weighted |
|
          average of the positions of the cluster's members. Note that the
          algorithm uses the euclidean metric and does not guarantee that the
          output will be an observed data point.
|
- `"medoid"` which calculates the center by taking the point in the |
|
fitted data which minimizes the distance to all other points in |
|
the cluster. This is slower than "centroid" since it requires |
|
computing additional pairwise distances between points of the |
|
same cluster but guarantees the output is an observed data point. |
|
The medoid is also well-defined for arbitrary metrics, and does not |
|
depend on a euclidean metric. |
|
- `"both"` which computes and stores both forms of centers. |
|
|
|
copy : bool, default=False |
|
        If `copy=True` then any time in-place modifications would otherwise
        overwrite data passed to :term:`fit`, a copy is made first,
        guaranteeing that the original data remains unchanged.
        Currently, it only applies when `metric="precomputed"`, when passing
        a dense array or a CSR sparse matrix and when `algorithm="brute"`.
|
|
|
Attributes |
|
---------- |
|
labels_ : ndarray of shape (n_samples,) |
|
Cluster labels for each point in the dataset given to :term:`fit`. |
|
Outliers are labeled as follows: |
|
|
|
- Noisy samples are given the label -1. |
|
- Samples with infinite elements (+/- np.inf) are given the label -2. |
|
- Samples with missing data are given the label -3, even if they |
|
also have infinite elements. |
|
|
|
probabilities_ : ndarray of shape (n_samples,) |
|
The strength with which each sample is a member of its assigned |
|
cluster. |
|
|
|
- Clustered samples have probabilities proportional to the degree that |
|
they persist as part of the cluster. |
|
- Noisy samples have probability zero. |
|
- Samples with infinite elements (+/- np.inf) have probability 0. |
|
- Samples with missing data have probability `np.nan`. |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
centroids_ : ndarray of shape (n_clusters, n_features) |
|
A collection containing the centroid of each cluster calculated under |
|
the standard euclidean metric. The centroids may fall "outside" their |
|
respective clusters if the clusters themselves are non-convex. |
|
|
|
Note that `n_clusters` only counts non-outlier clusters. That is to |
|
say, the `-1, -2, -3` labels for the outlier clusters are excluded. |
|
|
|
medoids_ : ndarray of shape (n_clusters, n_features) |
|
A collection containing the medoid of each cluster calculated under |
|
        whichever metric was passed to the `metric` parameter. The
|
medoids are points in the original cluster which minimize the average |
|
distance to all other points in that cluster under the chosen metric. |
|
These can be thought of as the result of projecting the `metric`-based |
|
centroid back onto the cluster. |
|
|
|
Note that `n_clusters` only counts non-outlier clusters. That is to |
|
say, the `-1, -2, -3` labels for the outlier clusters are excluded. |
|
|
|
See Also |
|
-------- |
|
DBSCAN : Density-Based Spatial Clustering of Applications |
|
with Noise. |
|
OPTICS : Ordering Points To Identify the Clustering Structure. |
|
Birch : Memory-efficient, online-learning algorithm. |
|
|
|
Notes |
|
----- |
|
The `min_samples` parameter includes the point itself, whereas the implementation in |
|
`scikit-learn-contrib/hdbscan <https://github.com/scikit-learn-contrib/hdbscan>`_ |
|
    does not. To get the same results in both versions, the value of
    `min_samples` here must be 1 greater than the value used in
    `scikit-learn-contrib/hdbscan
    <https://github.com/scikit-learn-contrib/hdbscan>`_. For example,
    `min_samples=5` here corresponds to `min_samples=4` there.
|
|
|
References |
|
---------- |
|
|
|
.. [1] :doi:`Campello, R. J., Moulavi, D., & Sander, J. Density-based clustering |
|
based on hierarchical density estimates. |
|
<10.1007/978-3-642-37456-2_14>` |
|
.. [2] :doi:`Campello, R. J., Moulavi, D., Zimek, A., & Sander, J. |
|
Hierarchical density estimates for data clustering, visualization, |
|
and outlier detection.<10.1145/2733381>` |
|
|
|
.. [3] `Chaudhuri, K., & Dasgupta, S. Rates of convergence for the |
|
cluster tree. |
|
<https://papers.nips.cc/paper/2010/hash/ |
|
b534ba68236ba543ae44b22bd110a1d6-Abstract.html>`_ |
|
|
|
.. [4] `Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and |
|
Sander, J. Density-Based Clustering Validation. |
|
<https://www.dbs.ifi.lmu.de/~zimek/publications/SDM2014/DBCV.pdf>`_ |
|
|
|
.. [5] :arxiv:`Malzer, C., & Baum, M. "A Hybrid Approach To Hierarchical |
|
Density-based Cluster Selection."<1911.02282>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.cluster import HDBSCAN |
|
>>> from sklearn.datasets import load_digits |
|
>>> X, _ = load_digits(return_X_y=True) |
|
>>> hdb = HDBSCAN(min_cluster_size=20) |
|
>>> hdb.fit(X) |
|
HDBSCAN(min_cluster_size=20) |
|
>>> hdb.labels_.shape == (X.shape[0],) |
|
True |
|
>>> np.unique(hdb.labels_).tolist() |
|
[-1, 0, 1, 2, 3, 4, 5, 6, 7] |
|
""" |
|
|
|
_parameter_constraints = { |
|
"min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], |
|
"min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], |
|
"cluster_selection_epsilon": [ |
|
Interval(Real, left=0, right=None, closed="left") |
|
], |
|
"max_cluster_size": [ |
|
None, |
|
Interval(Integral, left=1, right=None, closed="left"), |
|
], |
|
"metric": [ |
|
StrOptions(FAST_METRICS | set(_VALID_METRICS) | {"precomputed"}), |
|
callable, |
|
], |
|
"metric_params": [dict, None], |
|
"alpha": [Interval(Real, left=0, right=None, closed="neither")], |
|
"algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})], |
|
"leaf_size": [Interval(Integral, left=1, right=None, closed="left")], |
|
"n_jobs": [Integral, None], |
|
"cluster_selection_method": [StrOptions({"eom", "leaf"})], |
|
"allow_single_cluster": ["boolean"], |
|
"store_centers": [None, StrOptions({"centroid", "medoid", "both"})], |
|
"copy": ["boolean"], |
|
} |
|
|
|
def __init__( |
|
self, |
|
min_cluster_size=5, |
|
min_samples=None, |
|
cluster_selection_epsilon=0.0, |
|
max_cluster_size=None, |
|
metric="euclidean", |
|
metric_params=None, |
|
alpha=1.0, |
|
algorithm="auto", |
|
leaf_size=40, |
|
n_jobs=None, |
|
cluster_selection_method="eom", |
|
allow_single_cluster=False, |
|
store_centers=None, |
|
copy=False, |
|
): |
|
self.min_cluster_size = min_cluster_size |
|
self.min_samples = min_samples |
|
self.alpha = alpha |
|
self.max_cluster_size = max_cluster_size |
|
self.cluster_selection_epsilon = cluster_selection_epsilon |
|
self.metric = metric |
|
self.metric_params = metric_params |
|
self.algorithm = algorithm |
|
self.leaf_size = leaf_size |
|
self.n_jobs = n_jobs |
|
self.cluster_selection_method = cluster_selection_method |
|
self.allow_single_cluster = allow_single_cluster |
|
self.store_centers = store_centers |
|
self.copy = copy |
|
|
|
@_fit_context( |
|
|
|
        # HDBSCAN.metric is not validated yet at this point.
        prefer_skip_nested_validation=False
|
) |
|
def fit(self, X, y=None): |
|
"""Find clusters based on hierarchical density-based clustering. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ |
|
ndarray of shape (n_samples, n_samples) |
|
A feature array, or array of distances between samples if |
|
`metric='precomputed'`. |
|
|
|
y : None |
|
Ignored. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Returns self. |
|
""" |
|
if self.metric == "precomputed" and self.store_centers is not None: |
|
raise ValueError( |
|
"Cannot store centers when using a precomputed distance matrix." |
|
) |
|
|
|
self._metric_params = self.metric_params or {} |
|
        if self.metric != "precomputed":
            # Non-finite values are allowed here and are handled separately
            # below.
X = validate_data( |
|
self, |
|
X, |
|
accept_sparse=["csr", "lil"], |
|
ensure_all_finite=False, |
|
dtype=np.float64, |
|
) |
|
self._raw_data = X |
|
all_finite = True |
|
try: |
|
_assert_all_finite(X.data if issparse(X) else X) |
|
except ValueError: |
|
all_finite = False |
|
|
|
            if not all_finite:
                # Pass only the purely finite indices into hdbscan. We will
                # later assign all non-finite points their corresponding
                # labels, as specified in `_OUTLIER_ENCODING`.

                # Reduce X to make the checks for missing/outlier samples
                # more convenient.
                reduced_X = X.sum(axis=1)

                # Samples with missing data are denoted by the presence of
                # `np.nan`.
                missing_index = np.isnan(reduced_X).nonzero()[0]

                # Outlier samples are denoted by the presence of `np.inf`.
                infinite_index = np.isinf(reduced_X).nonzero()[0]

                # Continue the computation with only the finite samples.
finite_index = _get_finite_row_indices(X) |
|
internal_to_raw = {x: y for x, y in enumerate(finite_index)} |
|
X = X[finite_index] |
|
        elif issparse(X):
            # Handle sparse precomputed distance matrices separately.
X = validate_data( |
|
self, |
|
X, |
|
accept_sparse=["csr", "lil"], |
|
dtype=np.float64, |
|
force_writeable=True, |
|
) |
|
        else:
            # Only dense, precomputed distance matrices are handled here. They
            # may contain infinite values, but `np.nan` values are rejected
            # below.
X = validate_data( |
|
self, X, ensure_all_finite=False, dtype=np.float64, force_writeable=True |
|
) |
|
            if np.isnan(X).any():
                # TODO: Support np.nan in the Cython implementation for dense
                # precomputed distance matrices.
                raise ValueError(
                    "np.nan values found in a dense precomputed distance"
                    " matrix."
                )
|
if X.shape[0] == 1: |
|
raise ValueError("n_samples=1 while HDBSCAN requires more than one sample") |
|
self._min_samples = ( |
|
self.min_cluster_size if self.min_samples is None else self.min_samples |
|
) |
|
|
|
if self._min_samples > X.shape[0]: |
|
raise ValueError( |
|
f"min_samples ({self._min_samples}) must be at most the number of" |
|
f" samples in X ({X.shape[0]})" |
|
) |
|
|
|
mst_func = None |
|
kwargs = dict( |
|
X=X, |
|
min_samples=self._min_samples, |
|
alpha=self.alpha, |
|
metric=self.metric, |
|
n_jobs=self.n_jobs, |
|
**self._metric_params, |
|
) |
|
if self.algorithm == "kd_tree" and self.metric not in KDTree.valid_metrics: |
|
raise ValueError( |
|
f"{self.metric} is not a valid metric for a KDTree-based algorithm." |
|
" Please select a different metric." |
|
) |
|
elif ( |
|
self.algorithm == "ball_tree" and self.metric not in BallTree.valid_metrics |
|
): |
|
raise ValueError( |
|
f"{self.metric} is not a valid metric for a BallTree-based algorithm." |
|
" Please select a different metric." |
|
) |
|
|
|
if self.algorithm != "auto": |
|
if ( |
|
self.metric != "precomputed" |
|
and issparse(X) |
|
and self.algorithm != "brute" |
|
): |
|
raise ValueError("Sparse data matrices only support algorithm `brute`.") |
|
|
|
if self.algorithm == "brute": |
|
mst_func = _hdbscan_brute |
|
kwargs["copy"] = self.copy |
|
elif self.algorithm == "kd_tree": |
|
mst_func = _hdbscan_prims |
|
kwargs["algo"] = "kd_tree" |
|
kwargs["leaf_size"] = self.leaf_size |
|
else: |
|
mst_func = _hdbscan_prims |
|
kwargs["algo"] = "ball_tree" |
|
kwargs["leaf_size"] = self.leaf_size |
|
else: |
|
            if issparse(X) or self.metric not in FAST_METRICS:
                # Sparse input and slow metrics can only use the brute-force
                # implementation.
mst_func = _hdbscan_brute |
|
kwargs["copy"] = self.copy |
|
            elif self.metric in KDTree.valid_metrics:
                # Prefer the KDTree whenever the metric supports it.
mst_func = _hdbscan_prims |
|
kwargs["algo"] = "kd_tree" |
|
kwargs["leaf_size"] = self.leaf_size |
|
            else:
                # The remaining fast metrics are valid BallTree metrics.
mst_func = _hdbscan_prims |
|
kwargs["algo"] = "ball_tree" |
|
kwargs["leaf_size"] = self.leaf_size |
|
|
|
self._single_linkage_tree_ = mst_func(**kwargs) |
|
|
|
self.labels_, self.probabilities_ = tree_to_labels( |
|
self._single_linkage_tree_, |
|
self.min_cluster_size, |
|
self.cluster_selection_method, |
|
self.allow_single_cluster, |
|
self.cluster_selection_epsilon, |
|
self.max_cluster_size, |
|
) |
|
        if self.metric != "precomputed" and not all_finite:
            # Remap indices to align with the original data in the case of
            # non-finite entries.
self._single_linkage_tree_ = remap_single_linkage_tree( |
|
self._single_linkage_tree_, |
|
internal_to_raw, |
|
|
|
                # A sample may contain both np.inf and np.nan values; a set
                # avoids double-counting such points.
                non_finite=set(np.hstack([infinite_index, missing_index])),
|
) |
|
new_labels = np.empty(self._raw_data.shape[0], dtype=np.int32) |
|
new_labels[finite_index] = self.labels_ |
|
new_labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"] |
|
new_labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] |
|
self.labels_ = new_labels |
|
|
|
new_probabilities = np.zeros(self._raw_data.shape[0], dtype=np.float64) |
|
new_probabilities[finite_index] = self.probabilities_ |
|
|
|
|
|
            # Infinite outliers keep probability 0 and missing samples NaN, as
            # specified in `_OUTLIER_ENCODING`.
            new_probabilities[infinite_index] = _OUTLIER_ENCODING["infinite"]["prob"]
|
new_probabilities[missing_index] = _OUTLIER_ENCODING["missing"]["prob"] |
|
self.probabilities_ = new_probabilities |
|
|
|
if self.store_centers: |
|
self._weighted_cluster_center(X) |
|
return self |
|
|
|
def fit_predict(self, X, y=None): |
|
"""Cluster X and return the associated cluster labels. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ |
|
ndarray of shape (n_samples, n_samples) |
|
A feature array, or array of distances between samples if |
|
`metric='precomputed'`. |
|
|
|
y : None |
|
Ignored. |
|
|
|
Returns |
|
------- |
|
y : ndarray of shape (n_samples,) |
|
Cluster labels. |
|
""" |
|
self.fit(X) |
|
return self.labels_ |
|
|
|
def _weighted_cluster_center(self, X): |
|
"""Calculate and store the centroids/medoids of each cluster. |
|
|
|
This requires `X` to be a raw feature array, not precomputed |
|
distances. Rather than return outputs directly, this helper method |
|
instead stores them in the `self.{centroids, medoids}_` attributes. |
|
The choice for which attributes are calculated and stored is mediated |
|
by the value of `self.store_centers`. |
|
|
|
Parameters |
|
---------- |
|
X : ndarray of shape (n_samples, n_features) |
|
The feature array that the estimator was fit with. |
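
        Notes
        -----
        A minimal sketch of how this helper is triggered (hypothetical data)::

            hdb = HDBSCAN(min_cluster_size=5, store_centers="both").fit(X)
            hdb.centroids_  # shape (n_clusters, n_features)
            hdb.medoids_    # shape (n_clusters, n_features)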
|
|
|
""" |
|
|
|
        # Number of non-outlier clusters: the negative outlier labels
        # (-1, -2, -3) are excluded.
        n_clusters = len(set(self.labels_) - {-1, -2, -3})
|
mask = np.empty((X.shape[0],), dtype=np.bool_) |
|
make_centroids = self.store_centers in ("centroid", "both") |
|
make_medoids = self.store_centers in ("medoid", "both") |
|
|
|
if make_centroids: |
|
self.centroids_ = np.empty((n_clusters, X.shape[1]), dtype=np.float64) |
|
if make_medoids: |
|
self.medoids_ = np.empty((n_clusters, X.shape[1]), dtype=np.float64) |
|
|
|
|
|
|
|
        # Clusters may have different sizes, hence each center is computed
        # iteratively rather than in one vectorized pass.
        for idx in range(n_clusters):
|
mask = self.labels_ == idx |
|
data = X[mask] |
|
strength = self.probabilities_[mask] |
|
if make_centroids: |
|
self.centroids_[idx] = np.average(data, weights=strength, axis=0) |
|
if make_medoids: |
|
|
|
                # Weight the pairwise distances by each point's
                # cluster-membership strength so the medoid minimizes the
                # strength-weighted distance to the other members.
                dist_mat = pairwise_distances(
|
data, metric=self.metric, **self._metric_params |
|
) |
|
dist_mat = dist_mat * strength |
|
medoid_index = np.argmin(dist_mat.sum(axis=1)) |
|
self.medoids_[idx] = data[medoid_index] |
|
return |
|
|
|
def dbscan_clustering(self, cut_distance, min_cluster_size=5): |
|
"""Return clustering given by DBSCAN without border points. |
|
|
|
        Return the clustering that would be equivalent to running DBSCAN* for
        a particular cut_distance (or epsilon). DBSCAN* can be thought of as
|
DBSCAN without the border points. As such these results may differ |
|
slightly from `cluster.DBSCAN` due to the difference in implementation |
|
over the non-core points. |
|
|
|
        This can also be thought of as a flat clustering derived from a
        constant-height cut through the single-linkage tree.
|
|
|
This represents the result of selecting a cut value for robust single linkage |
|
clustering. The `min_cluster_size` allows the flat clustering to declare noise |
|
        points (and to treat clusters smaller than `min_cluster_size` as noise).
|
|
|
Parameters |
|
---------- |
|
cut_distance : float |
|
The mutual reachability distance cut value to use to generate a |
|
flat clustering. |
|
|
|
min_cluster_size : int, default=5 |
|
            Clusters smaller than this value will be called 'noise' and remain
|
unclustered in the resulting flat clustering. |
|
|
|
Returns |
|
------- |
|
labels : ndarray of shape (n_samples,) |
|
An array of cluster labels, one per datapoint. |
|
Outliers are labeled as follows: |
|
|
|
- Noisy samples are given the label -1. |
|
- Samples with infinite elements (+/- np.inf) are given the label -2. |
|
- Samples with missing data are given the label -3, even if they |
|
also have infinite elements. |
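
        Notes
        -----
        A minimal usage sketch (hypothetical data)::

            import numpy as np
            from sklearn.cluster import HDBSCAN

            X = np.random.RandomState(42).uniform(size=(100, 2))
            hdb = HDBSCAN(min_cluster_size=5).fit(X)
            flat_labels = hdb.dbscan_clustering(cut_distance=0.1)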
|
""" |
|
labels = labelling_at_cut( |
|
self._single_linkage_tree_, cut_distance, min_cluster_size |
|
) |
|
|
|
        # Infer the outlier indices from the labels generated during `fit`.
        infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"]
|
missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"] |
|
|
|
|
|
        # Overwrite the infinite/missing outlier samples, which would
        # otherwise be labelled as plain noise.
        labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"]
|
labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] |
|
return labels |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.input_tags.sparse = True |
|
tags.input_tags.allow_nan = self.metric != "precomputed" |
|
return tags |
|
|