"""Unsupervised evaluation metrics."""

import functools
from numbers import Integral

import numpy as np
from scipy.sparse import issparse

from ...preprocessing import LabelEncoder
from ...utils import _safe_indexing, check_random_state, check_X_y
from ...utils._array_api import _atol_for_type
from ...utils._param_validation import (
    Interval,
    StrOptions,
    validate_params,
)
from ..pairwise import _VALID_METRICS, pairwise_distances, pairwise_distances_chunked


def check_number_of_labels(n_labels, n_samples):
    """Check that the number of labels is valid.

    Parameters
    ----------
    n_labels : int
        Number of labels.

    n_samples : int
        Number of samples.
    """
    if not 1 < n_labels < n_samples:
        raise ValueError(
            "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
            % n_labels
        )
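
# A minimal sanity sketch of the guard above, using hypothetical values (these
# calls are illustrative, not part of the public API): with 10 samples, any
# label count from 2 to 9 is accepted, while 1 (a single cluster) or 10 (one
# cluster per sample) raises ValueError.
#
#     check_number_of_labels(3, 10)   # passes silently
#     check_number_of_labels(1, 10)   # ValueError
#     check_number_of_labels(10, 10)  # ValueError
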
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "labels": ["array-like"],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
        "sample_size": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def silhouette_score(
    X, labels, *, metric="euclidean", sample_size=None, random_state=None, **kwds
):
    """Compute the mean Silhouette Coefficient of all samples.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
    b)``. To clarify, ``b`` is the distance between a sample and the nearest
    cluster that the sample is not a part of.
    Note that the Silhouette Coefficient is only defined if the number of
    labels satisfies ``2 <= n_labels <= n_samples - 1``.

    This function returns the mean Silhouette Coefficient over all samples.
    To obtain the values for each sample, use :func:`silhouette_samples`.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters. Negative values generally indicate that a sample has
    been assigned to the wrong cluster, as a different cluster is more similar.

    Read more in the :ref:`User Guide <silhouette_coefficient>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
            "precomputed" or (n_samples_a, n_features) otherwise
        An array of pairwise distances between samples, or a feature array.

    labels : array-like of shape (n_samples,)
        Predicted labels for each sample.

    metric : str or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by :func:`~sklearn.metrics.pairwise_distances`. If ``X`` is
        the distance array itself, use ``metric="precomputed"``.

    sample_size : int, default=None
        The size of the sample to use when computing the Silhouette Coefficient
        on a random subset of the data.
        If ``sample_size is None``, no sampling is used.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for selecting a subset of samples.
        Used when ``sample_size is not None``.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <https://www.sciencedirect.com/science/article/pii/0377042787901257>`_

    .. [2] `Wikipedia entry on the Silhouette Coefficient
       <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> from sklearn.cluster import KMeans
    >>> from sklearn.metrics import silhouette_score
    >>> X, y = make_blobs(random_state=42)
    >>> kmeans = KMeans(n_clusters=2, random_state=42)
    >>> silhouette_score(X, kmeans.fit_predict(X))
    np.float64(0.49...)
    """
    if sample_size is not None:
        X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"])
        random_state = check_random_state(random_state)
        indices = random_state.permutation(X.shape[0])[:sample_size]
        if metric == "precomputed":
            X, labels = X[indices].T[indices].T, labels[indices]
        else:
            X, labels = X[indices], labels[indices]
    return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
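
# A worked instance of the per-sample formula documented above, with
# hypothetical distances (not drawn from any dataset): for a mean
# intra-cluster distance a = 1.0 and a mean nearest-cluster distance b = 3.0,
# the coefficient is (b - a) / max(a, b) = (3.0 - 1.0) / 3.0 = 0.666..., i.e.
# a reasonably well-separated sample; a == b would give 0 (overlapping
# clusters), and a > b would give a negative value (likely misassignment).
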
def _silhouette_reduce(D_chunk, start, labels, label_freqs):
    """Accumulate silhouette statistics for vertical chunk of X.

    Parameters
    ----------
    D_chunk : {array-like, sparse matrix} of shape (n_chunk_samples, n_samples)
        Precomputed distances for a chunk. If a sparse matrix is provided,
        only CSR format is accepted.
    start : int
        First index in the chunk.
    labels : array-like of shape (n_samples,)
        Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.
    label_freqs : array-like
        Distribution of cluster labels in ``labels``.

    Returns
    -------
    intra_cluster_distances : ndarray of shape (n_chunk_samples,)
        Summed distances from each chunk sample to the other members of its
        own cluster (averaged over cluster size by the caller).
    inter_cluster_distances : ndarray of shape (n_chunk_samples,)
        Mean distance from each chunk sample to its nearest other cluster.
    """
    n_chunk_samples = D_chunk.shape[0]
    # accumulate distances from each sample to each cluster
    cluster_distances = np.zeros(
        (n_chunk_samples, len(label_freqs)), dtype=D_chunk.dtype
    )

    if issparse(D_chunk):
        if D_chunk.format != "csr":
            raise TypeError(
                "Expected CSR matrix. Please pass sparse matrix in CSR format."
            )
        for i in range(n_chunk_samples):
            indptr = D_chunk.indptr
            indices = D_chunk.indices[indptr[i] : indptr[i + 1]]
            sample_weights = D_chunk.data[indptr[i] : indptr[i + 1]]
            sample_labels = np.take(labels, indices)
            cluster_distances[i] += np.bincount(
                sample_labels, weights=sample_weights, minlength=len(label_freqs)
            )
    else:
        for i in range(n_chunk_samples):
            sample_weights = D_chunk[i]
            sample_labels = labels
            cluster_distances[i] += np.bincount(
                sample_labels, weights=sample_weights, minlength=len(label_freqs)
            )

    # intra_index selects each chunk sample's own-cluster column
    end = start + n_chunk_samples
    intra_index = (np.arange(n_chunk_samples), labels[start:end])
    # intra_cluster_distances are averaged over cluster size outside this function
    intra_cluster_distances = cluster_distances[intra_index]
    # of the remaining distances we normalise and extract the minimum
    cluster_distances[intra_index] = np.inf
    cluster_distances /= label_freqs
    inter_cluster_distances = cluster_distances.min(axis=1)
    return intra_cluster_distances, inter_cluster_distances
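
# A minimal sketch of how the reducer above is driven; the toy values below
# are hypothetical and for illustration only. pairwise_distances_chunked
# calls _silhouette_reduce once per row-chunk of the distance matrix, and
# silhouette_samples concatenates the per-chunk results.
#
#     D = pairwise_distances(np.array([[0.0], [1.0], [10.0]]))
#     labels_enc = np.array([0, 0, 1])  # already LabelEncoder-encoded
#     freqs = np.bincount(labels_enc)   # array([2, 1])
#     intra, inter = _silhouette_reduce(D, 0, labels_enc, freqs)
#     # intra -> array([1., 1., 0.])    summed intra-cluster distances
#     # inter -> array([10., 9., 9.5])  mean distance to nearest other cluster
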
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "labels": ["array-like"],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
    },
    prefer_skip_nested_validation=True,
)
def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
    """Compute the Silhouette Coefficient for each sample.

    The Silhouette Coefficient is a measure of how well samples are clustered
    with samples that are similar to themselves. Clustering models with a high
    Silhouette Coefficient are said to be dense, where samples in the same
    cluster are similar to each other, and well separated, where samples in
    different clusters are not very similar to each other.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
    b)``.
    Note that the Silhouette Coefficient is only defined if the number of
    labels satisfies ``2 <= n_labels <= n_samples - 1``.

    This function returns the Silhouette Coefficient for each sample.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters.

    Read more in the :ref:`User Guide <silhouette_coefficient>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
            "precomputed" or (n_samples_a, n_features) otherwise
        An array of pairwise distances between samples, or a feature array. If
        a sparse matrix is provided, CSR format should be favoured to avoid
        an additional copy.

    labels : array-like of shape (n_samples,)
        Label values for each sample.

    metric : str or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by :func:`~sklearn.metrics.pairwise_distances`.
        If ``X`` is the distance array itself, use "precomputed" as the metric.
        Precomputed distance matrices must have 0 along the diagonal.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a ``scipy.spatial.distance`` metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    silhouette : array-like of shape (n_samples,)
        Silhouette Coefficients for each sample.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <https://www.sciencedirect.com/science/article/pii/0377042787901257>`_

    .. [2] `Wikipedia entry on the Silhouette Coefficient
       <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from sklearn.metrics import silhouette_samples
    >>> from sklearn.datasets import make_blobs
    >>> from sklearn.cluster import KMeans
    >>> X, y = make_blobs(n_samples=50, random_state=42)
    >>> kmeans = KMeans(n_clusters=3, random_state=42)
    >>> labels = kmeans.fit_predict(X)
    >>> silhouette_samples(X, labels)
    array([...])
    """
    X, labels = check_X_y(X, labels, accept_sparse=["csr"])

    # Check for non-zero diagonal entries in the precomputed distance matrix
    if metric == "precomputed":
        error_msg = ValueError(
            "The precomputed distance matrix contains non-zero "
            "elements on the diagonal. Use np.fill_diagonal(X, 0)."
        )
        if X.dtype.kind == "f":
            atol = _atol_for_type(X.dtype)
            if np.any(np.abs(X.diagonal()) > atol):
                raise error_msg
        elif np.any(X.diagonal() != 0):  # non-float dtypes
            raise error_msg

    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples = len(labels)
    label_freqs = np.bincount(labels)
    check_number_of_labels(len(le.classes_), n_samples)

    kwds["metric"] = metric
    reduce_func = functools.partial(
        _silhouette_reduce, labels=labels, label_freqs=label_freqs
    )
    results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds))
    intra_clust_dists, inter_clust_dists = results
    intra_clust_dists = np.concatenate(intra_clust_dists)
    inter_clust_dists = np.concatenate(inter_clust_dists)

    # average the summed intra-cluster distances; the denominator excludes the
    # sample's zero distance to itself, hence cluster size minus one
    denom = (label_freqs - 1).take(labels, mode="clip")
    with np.errstate(divide="ignore", invalid="ignore"):
        intra_clust_dists /= denom

    sil_samples = inter_clust_dists - intra_clust_dists
    with np.errstate(divide="ignore", invalid="ignore"):
        sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples)
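
# A minimal sketch of the "precomputed" path documented above (hypothetical
# toy data): passing a feature array with the default metric and passing its
# distance matrix with metric="precomputed" should yield the same result, as
# long as the diagonal is exactly zero.
#
#     X = np.array([[0.0], [1.0], [10.0], [11.0]])
#     labels = [0, 0, 1, 1]
#     D = pairwise_distances(X)  # zero diagonal by construction
#     s1 = silhouette_samples(X, labels)
#     s2 = silhouette_samples(D, labels, metric="precomputed")
#     # np.allclose(s1, s2) -> True
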
@validate_params(
    {
        "X": ["array-like"],
        "labels": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def calinski_harabasz_score(X, labels):
    """Compute the Calinski and Harabasz score.

    It is also known as the Variance Ratio Criterion.

    The score is defined as the ratio of the sum of between-cluster dispersion
    to the sum of within-cluster dispersion.

    Read more in the :ref:`User Guide <calinski_harabasz_index>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        A list of ``n_features``-dimensional data points. Each row corresponds
        to a single data point.

    labels : array-like of shape (n_samples,)
        Predicted labels for each sample.

    Returns
    -------
    score : float
        The resulting Calinski-Harabasz score.

    References
    ----------
    .. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
       analysis". Communications in Statistics
       <https://www.tandfonline.com/doi/abs/10.1080/03610927408827101>`_

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> from sklearn.cluster import KMeans
    >>> from sklearn.metrics import calinski_harabasz_score
    >>> X, _ = make_blobs(random_state=0)
    >>> kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
    >>> calinski_harabasz_score(X, kmeans.labels_)
    np.float64(114.8...)
    """
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    n_samples, _ = X.shape
    n_labels = len(le.classes_)

    check_number_of_labels(n_labels, n_samples)

    extra_disp, intra_disp = 0.0, 0.0
    mean = np.mean(X, axis=0)
    for k in range(n_labels):
        cluster_k = X[labels == k]
        mean_k = np.mean(cluster_k, axis=0)
        extra_disp += len(cluster_k) * np.sum((mean_k - mean) ** 2)
        intra_disp += np.sum((cluster_k - mean_k) ** 2)

    return (
        1.0
        if intra_disp == 0.0
        else extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1.0))
    )
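
# A worked check of the return expression above, using hypothetical dispersion
# values: for n_samples = 100 and n_labels = 3, a between-cluster dispersion
# extra_disp = 500 and a within-cluster dispersion intra_disp = 200 give
#
#     extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1))
#     = 500 * 97 / (200 * 2) = 121.25
#
# which is the classical (tr(B_k) / (k - 1)) / (tr(W_k) / (n - k)) ratio.
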
@validate_params(
    {
        "X": ["array-like"],
        "labels": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def davies_bouldin_score(X, labels):
    """Compute the Davies-Bouldin score.

    The score is defined as the average similarity measure of each cluster with
    its most similar cluster, where similarity is the ratio of within-cluster
    distances to between-cluster distances. Thus, clusters which are farther
    apart and less dispersed will result in a better score.

    The minimum score is zero, with lower values indicating better clustering.

    Read more in the :ref:`User Guide <davies-bouldin_index>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        A list of ``n_features``-dimensional data points. Each row corresponds
        to a single data point.

    labels : array-like of shape (n_samples,)
        Predicted labels for each sample.

    Returns
    -------
    score : float
        The resulting Davies-Bouldin score.

    References
    ----------
    .. [1] Davies, David L.; Bouldin, Donald W. (1979).
       `"A Cluster Separation Measure"
       <https://ieeexplore.ieee.org/document/4766909>`__.
       IEEE Transactions on Pattern Analysis and Machine Intelligence.
       PAMI-1 (2): 224-227

    Examples
    --------
    >>> from sklearn.metrics import davies_bouldin_score
    >>> X = [[0, 1], [1, 1], [3, 4]]
    >>> labels = [0, 0, 1]
    >>> davies_bouldin_score(X, labels)
    np.float64(0.12...)
    """
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples, _ = X.shape
    n_labels = len(le.classes_)
    check_number_of_labels(n_labels, n_samples)

    intra_dists = np.zeros(n_labels)
    centroids = np.zeros((n_labels, len(X[0])), dtype=float)
    for k in range(n_labels):
        cluster_k = _safe_indexing(X, labels == k)
        centroid = cluster_k.mean(axis=0)
        centroids[k] = centroid
        intra_dists[k] = np.average(pairwise_distances(cluster_k, [centroid]))

    centroid_distances = pairwise_distances(centroids)

    if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
        return 0.0

    centroid_distances[centroid_distances == 0] = np.inf
    combined_intra_dists = intra_dists[:, None] + intra_dists
    scores = np.max(combined_intra_dists / centroid_distances, axis=1)
    return np.mean(scores)
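
# A minimal sketch of the arithmetic above, worked on the docstring example
# ([[0, 1], [1, 1], [3, 4]] with labels [0, 0, 1]): cluster 0 has centroid
# (0.5, 1) and mean intra-cluster distance 0.5, cluster 1 is a single point
# with intra-cluster distance 0, and the centroid distance is
# sqrt(2.5**2 + 3**2) ≈ 3.905. Each cluster's similarity to the other is
# (0.5 + 0) / 3.905 ≈ 0.128, so the mean score is ≈ 0.128, consistent with
# the np.float64(0.12...) shown in the docstring.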