|
""" |
|
Hierarchical clustering (:mod:`scipy.cluster.hierarchy`) |
|
======================================================== |
|
|
|
.. currentmodule:: scipy.cluster.hierarchy |
|
|
|
These functions cut hierarchical clusterings into flat clusterings |
|
or find the roots of the forest formed by a cut by providing the flat |
|
cluster ids of each observation. |
|
|
|
.. autosummary:: |
|
:toctree: generated/ |
|
|
|
fcluster |
|
fclusterdata |
|
leaders |
|
|
|
These are routines for agglomerative clustering. |
|
|
|
.. autosummary:: |
|
:toctree: generated/ |
|
|
|
linkage |
|
single |
|
complete |
|
average |
|
weighted |
|
centroid |
|
median |
|
ward |
|
|
|
These routines compute statistics on hierarchies. |
|
|
|
.. autosummary:: |
|
:toctree: generated/ |
|
|
|
cophenet |
|
from_mlab_linkage |
|
inconsistent |
|
maxinconsts |
|
maxdists |
|
maxRstat |
|
to_mlab_linkage |
|
|
|
Routines for visualizing flat clusters. |
|
|
|
.. autosummary:: |
|
:toctree: generated/ |
|
|
|
dendrogram |
|
|
|
These are data structures and routines for representing hierarchies as |
|
tree objects. |
|
|
|
.. autosummary:: |
|
:toctree: generated/ |
|
|
|
ClusterNode |
|
leaves_list |
|
to_tree |
|
cut_tree |
|
optimal_leaf_ordering |
|
|
|
These are predicates for checking the validity of linkage and |
|
inconsistency matrices as well as for checking isomorphism of two |
|
flat cluster assignments. |
|
|
|
.. autosummary:: |
|
:toctree: generated/ |
|
|
|
is_valid_im |
|
is_valid_linkage |
|
is_isomorphic |
|
is_monotonic |
|
correspond |
|
num_obs_linkage |
|
|
|
Utility routines for plotting: |
|
|
|
.. autosummary:: |
|
:toctree: generated/ |
|
|
|
set_link_color_palette |
|
|
|
Utility classes: |
|
|
|
.. autosummary:: |
|
:toctree: generated/ |
|
|
|
DisjointSet -- data structure for incremental connectivity queries |
|
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import warnings |
|
import bisect |
|
from collections import deque |
|
|
|
import numpy as np |
|
from . import _hierarchy, _optimal_leaf_ordering |
|
import scipy.spatial.distance as distance |
|
from scipy._lib._array_api import array_namespace, _asarray, xp_copy, is_jax |
|
from scipy._lib._disjoint_set import DisjointSet |
|
|
|
|
|
_LINKAGE_METHODS = {'single': 0, 'complete': 1, 'average': 2, 'centroid': 3, |
|
'median': 4, 'ward': 5, 'weighted': 6} |
|
_EUCLIDEAN_METHODS = ('centroid', 'median', 'ward') |
|
|
|
__all__ = ['ClusterNode', 'DisjointSet', 'average', 'centroid', 'complete', |
|
'cophenet', 'correspond', 'cut_tree', 'dendrogram', 'fcluster', |
|
'fclusterdata', 'from_mlab_linkage', 'inconsistent', |
|
'is_isomorphic', 'is_monotonic', 'is_valid_im', 'is_valid_linkage', |
|
'leaders', 'leaves_list', 'linkage', 'maxRstat', 'maxdists', |
|
'maxinconsts', 'median', 'num_obs_linkage', 'optimal_leaf_ordering', |
|
'set_link_color_palette', 'single', 'to_mlab_linkage', 'to_tree', |
|
'ward', 'weighted'] |
|
|
|
|
|
class ClusterWarning(UserWarning): |
|
pass |
|
|
|
|
|
def _warning(s): |
|
warnings.warn(f'scipy.cluster: {s}', ClusterWarning, stacklevel=3) |
|
|
|
|
|
def int_floor(arr, xp): |
|
|
|
|
|
return int(xp.astype(xp.asarray(arr), xp.int64)) |
|
|
|
|
|
def single(y): |
|
""" |
|
Perform single/min/nearest linkage on the condensed distance matrix ``y``. |
|
|
|
Parameters |
|
---------- |
|
y : ndarray |
|
The upper triangular of the distance matrix. The result of |
|
``pdist`` is returned in this form. |
|
|
|
Returns |
|
------- |
|
Z : ndarray |
|
The linkage matrix. |
|
|
|
See Also |
|
-------- |
|
linkage : for advanced creation of hierarchical clusterings. |
|
scipy.spatial.distance.pdist : pairwise distance metrics |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import single, fcluster |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
First, we need a toy dataset to play with:: |
|
|
|
x x x x |
|
x x |
|
|
|
x x |
|
x x x x |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
Then, we get a condensed distance matrix from this dataset: |
|
|
|
>>> y = pdist(X) |
|
|
|
Finally, we can perform the clustering: |
|
|
|
>>> Z = single(y) |
|
>>> Z |
|
array([[ 0., 1., 1., 2.], |
|
[ 2., 12., 1., 3.], |
|
[ 3., 4., 1., 2.], |
|
[ 5., 14., 1., 3.], |
|
[ 6., 7., 1., 2.], |
|
[ 8., 16., 1., 3.], |
|
[ 9., 10., 1., 2.], |
|
[11., 18., 1., 3.], |
|
[13., 15., 2., 6.], |
|
[17., 20., 2., 9.], |
|
[19., 21., 2., 12.]]) |
|
|
|
The linkage matrix ``Z`` represents a dendrogram - see |
|
`scipy.cluster.hierarchy.linkage` for a detailed explanation of its |
|
contents. |
|
|
|
We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster |
|
each initial point would belong given a distance threshold: |
|
|
|
>>> fcluster(Z, 0.9, criterion='distance') |
|
array([ 7, 8, 9, 10, 11, 12, 4, 5, 6, 1, 2, 3], dtype=int32) |
|
>>> fcluster(Z, 1, criterion='distance') |
|
array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32) |
|
>>> fcluster(Z, 2, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) |
|
|
|
Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a |
|
plot of the dendrogram. |
|
""" |
|
return linkage(y, method='single', metric='euclidean') |
|
|
|
|
|
def complete(y): |
|
""" |
|
Perform complete/max/farthest point linkage on a condensed distance matrix. |
|
|
|
Parameters |
|
---------- |
|
y : ndarray |
|
The upper triangular of the distance matrix. The result of |
|
``pdist`` is returned in this form. |
|
|
|
Returns |
|
------- |
|
Z : ndarray |
|
A linkage matrix containing the hierarchical clustering. See |
|
the `linkage` function documentation for more information |
|
on its structure. |
|
|
|
See Also |
|
-------- |
|
linkage : for advanced creation of hierarchical clusterings. |
|
scipy.spatial.distance.pdist : pairwise distance metrics |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import complete, fcluster |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
First, we need a toy dataset to play with:: |
|
|
|
x x x x |
|
x x |
|
|
|
x x |
|
x x x x |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
Then, we get a condensed distance matrix from this dataset: |
|
|
|
>>> y = pdist(X) |
|
|
|
Finally, we can perform the clustering: |
|
|
|
>>> Z = complete(y) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.41421356, 3. ], |
|
[ 5. , 13. , 1.41421356, 3. ], |
|
[ 8. , 14. , 1.41421356, 3. ], |
|
[11. , 15. , 1.41421356, 3. ], |
|
[16. , 17. , 4.12310563, 6. ], |
|
[18. , 19. , 4.12310563, 6. ], |
|
[20. , 21. , 5.65685425, 12. ]]) |
|
|
|
The linkage matrix ``Z`` represents a dendrogram - see |
|
`scipy.cluster.hierarchy.linkage` for a detailed explanation of its |
|
contents. |
|
|
|
We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster |
|
each initial point would belong given a distance threshold: |
|
|
|
>>> fcluster(Z, 0.9, criterion='distance') |
|
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32) |
|
>>> fcluster(Z, 1.5, criterion='distance') |
|
array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) |
|
>>> fcluster(Z, 4.5, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32) |
|
>>> fcluster(Z, 6, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) |
|
|
|
Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a |
|
plot of the dendrogram. |
|
""" |
|
return linkage(y, method='complete', metric='euclidean') |
|
|
|
|
|
def average(y): |
|
""" |
|
Perform average/UPGMA linkage on a condensed distance matrix. |
|
|
|
Parameters |
|
---------- |
|
y : ndarray |
|
The upper triangular of the distance matrix. The result of |
|
``pdist`` is returned in this form. |
|
|
|
Returns |
|
------- |
|
Z : ndarray |
|
A linkage matrix containing the hierarchical clustering. See |
|
`linkage` for more information on its structure. |
|
|
|
See Also |
|
-------- |
|
linkage : for advanced creation of hierarchical clusterings. |
|
scipy.spatial.distance.pdist : pairwise distance metrics |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import average, fcluster |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
First, we need a toy dataset to play with:: |
|
|
|
x x x x |
|
x x |
|
|
|
x x |
|
x x x x |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
Then, we get a condensed distance matrix from this dataset: |
|
|
|
>>> y = pdist(X) |
|
|
|
Finally, we can perform the clustering: |
|
|
|
>>> Z = average(y) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.20710678, 3. ], |
|
[ 5. , 13. , 1.20710678, 3. ], |
|
[ 8. , 14. , 1.20710678, 3. ], |
|
[11. , 15. , 1.20710678, 3. ], |
|
[16. , 17. , 3.39675184, 6. ], |
|
[18. , 19. , 3.39675184, 6. ], |
|
[20. , 21. , 4.09206523, 12. ]]) |
|
|
|
The linkage matrix ``Z`` represents a dendrogram - see |
|
`scipy.cluster.hierarchy.linkage` for a detailed explanation of its |
|
contents. |
|
|
|
We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster |
|
each initial point would belong given a distance threshold: |
|
|
|
>>> fcluster(Z, 0.9, criterion='distance') |
|
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32) |
|
>>> fcluster(Z, 1.5, criterion='distance') |
|
array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) |
|
>>> fcluster(Z, 4, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32) |
|
>>> fcluster(Z, 6, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) |
|
|
|
Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a |
|
plot of the dendrogram. |
|
|
|
""" |
|
return linkage(y, method='average', metric='euclidean') |
|
|
|
|
|
def weighted(y): |
|
""" |
|
Perform weighted/WPGMA linkage on the condensed distance matrix. |
|
|
|
See `linkage` for more information on the return |
|
structure and algorithm. |
|
|
|
Parameters |
|
---------- |
|
y : ndarray |
|
The upper triangular of the distance matrix. The result of |
|
``pdist`` is returned in this form. |
|
|
|
Returns |
|
------- |
|
Z : ndarray |
|
A linkage matrix containing the hierarchical clustering. See |
|
`linkage` for more information on its structure. |
|
|
|
See Also |
|
-------- |
|
linkage : for advanced creation of hierarchical clusterings. |
|
scipy.spatial.distance.pdist : pairwise distance metrics |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import weighted, fcluster |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
First, we need a toy dataset to play with:: |
|
|
|
x x x x |
|
x x |
|
|
|
x x |
|
x x x x |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
Then, we get a condensed distance matrix from this dataset: |
|
|
|
>>> y = pdist(X) |
|
|
|
Finally, we can perform the clustering: |
|
|
|
>>> Z = weighted(y) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 9. , 11. , 1. , 2. ], |
|
[ 2. , 12. , 1.20710678, 3. ], |
|
[ 8. , 13. , 1.20710678, 3. ], |
|
[ 5. , 14. , 1.20710678, 3. ], |
|
[10. , 15. , 1.20710678, 3. ], |
|
[18. , 19. , 3.05595762, 6. ], |
|
[16. , 17. , 3.32379407, 6. ], |
|
[20. , 21. , 4.06357713, 12. ]]) |
|
|
|
The linkage matrix ``Z`` represents a dendrogram - see |
|
`scipy.cluster.hierarchy.linkage` for a detailed explanation of its |
|
contents. |
|
|
|
We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster |
|
each initial point would belong given a distance threshold: |
|
|
|
>>> fcluster(Z, 0.9, criterion='distance') |
|
array([ 7, 8, 9, 1, 2, 3, 10, 11, 12, 4, 6, 5], dtype=int32) |
|
>>> fcluster(Z, 1.5, criterion='distance') |
|
array([3, 3, 3, 1, 1, 1, 4, 4, 4, 2, 2, 2], dtype=int32) |
|
>>> fcluster(Z, 4, criterion='distance') |
|
array([2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1], dtype=int32) |
|
>>> fcluster(Z, 6, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) |
|
|
|
Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a |
|
plot of the dendrogram. |
|
|
|
""" |
|
return linkage(y, method='weighted', metric='euclidean') |
|
|
|
|
|
def centroid(y): |
|
""" |
|
Perform centroid/UPGMC linkage. |
|
|
|
See `linkage` for more information on the input matrix, |
|
return structure, and algorithm. |
|
|
|
The following are common calling conventions: |
|
|
|
1. ``Z = centroid(y)`` |
|
|
|
Performs centroid/UPGMC linkage on the condensed distance |
|
matrix ``y``. |
|
|
|
2. ``Z = centroid(X)`` |
|
|
|
Performs centroid/UPGMC linkage on the observation matrix ``X`` |
|
using Euclidean distance as the distance metric. |
|
|
|
Parameters |
|
---------- |
|
y : ndarray |
|
A condensed distance matrix. A condensed |
|
distance matrix is a flat array containing the upper |
|
triangular of the distance matrix. This is the form that |
|
``pdist`` returns. Alternatively, a collection of |
|
m observation vectors in n dimensions may be passed as |
|
an m by n array. |
|
|
|
Returns |
|
------- |
|
Z : ndarray |
|
A linkage matrix containing the hierarchical clustering. See |
|
the `linkage` function documentation for more information |
|
on its structure. |
|
|
|
See Also |
|
-------- |
|
linkage : for advanced creation of hierarchical clusterings. |
|
scipy.spatial.distance.pdist : pairwise distance metrics |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import centroid, fcluster |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
First, we need a toy dataset to play with:: |
|
|
|
x x x x |
|
x x |
|
|
|
x x |
|
x x x x |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
Then, we get a condensed distance matrix from this dataset: |
|
|
|
>>> y = pdist(X) |
|
|
|
Finally, we can perform the clustering: |
|
|
|
>>> Z = centroid(y) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 2. , 12. , 1.11803399, 3. ], |
|
[ 5. , 13. , 1.11803399, 3. ], |
|
[ 8. , 15. , 1.11803399, 3. ], |
|
[11. , 14. , 1.11803399, 3. ], |
|
[18. , 19. , 3.33333333, 6. ], |
|
[16. , 17. , 3.33333333, 6. ], |
|
[20. , 21. , 3.33333333, 12. ]]) # may vary |
|
|
|
The linkage matrix ``Z`` represents a dendrogram - see |
|
`scipy.cluster.hierarchy.linkage` for a detailed explanation of its |
|
contents. |
|
|
|
We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster |
|
each initial point would belong given a distance threshold: |
|
|
|
>>> fcluster(Z, 0.9, criterion='distance') |
|
array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32) # may vary |
|
>>> fcluster(Z, 1.1, criterion='distance') |
|
array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32) # may vary |
|
>>> fcluster(Z, 2, criterion='distance') |
|
array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32) # may vary |
|
>>> fcluster(Z, 4, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) |
|
|
|
Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a |
|
plot of the dendrogram. |
|
|
|
""" |
|
return linkage(y, method='centroid', metric='euclidean') |
|
|
|
|
|
def median(y): |
|
""" |
|
Perform median/WPGMC linkage. |
|
|
|
See `linkage` for more information on the return structure |
|
and algorithm. |
|
|
|
The following are common calling conventions: |
|
|
|
1. ``Z = median(y)`` |
|
|
|
Performs median/WPGMC linkage on the condensed distance matrix |
|
``y``. See ``linkage`` for more information on the return |
|
structure and algorithm. |
|
|
|
2. ``Z = median(X)`` |
|
|
|
Performs median/WPGMC linkage on the observation matrix ``X`` |
|
using Euclidean distance as the distance metric. See `linkage` |
|
for more information on the return structure and algorithm. |
|
|
|
Parameters |
|
---------- |
|
y : ndarray |
|
A condensed distance matrix. A condensed |
|
distance matrix is a flat array containing the upper |
|
triangular of the distance matrix. This is the form that |
|
``pdist`` returns. Alternatively, a collection of |
|
m observation vectors in n dimensions may be passed as |
|
an m by n array. |
|
|
|
Returns |
|
------- |
|
Z : ndarray |
|
The hierarchical clustering encoded as a linkage matrix. |
|
|
|
See Also |
|
-------- |
|
linkage : for advanced creation of hierarchical clusterings. |
|
scipy.spatial.distance.pdist : pairwise distance metrics |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import median, fcluster |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
First, we need a toy dataset to play with:: |
|
|
|
x x x x |
|
x x |
|
|
|
x x |
|
x x x x |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
Then, we get a condensed distance matrix from this dataset: |
|
|
|
>>> y = pdist(X) |
|
|
|
Finally, we can perform the clustering: |
|
|
|
>>> Z = median(y) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 2. , 12. , 1.11803399, 3. ], |
|
[ 5. , 13. , 1.11803399, 3. ], |
|
[ 8. , 15. , 1.11803399, 3. ], |
|
[11. , 14. , 1.11803399, 3. ], |
|
[18. , 19. , 3. , 6. ], |
|
[16. , 17. , 3.5 , 6. ], |
|
[20. , 21. , 3.25 , 12. ]]) |
|
|
|
The linkage matrix ``Z`` represents a dendrogram - see |
|
`scipy.cluster.hierarchy.linkage` for a detailed explanation of its |
|
contents. |
|
|
|
We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster |
|
each initial point would belong given a distance threshold: |
|
|
|
>>> fcluster(Z, 0.9, criterion='distance') |
|
array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32) |
|
>>> fcluster(Z, 1.1, criterion='distance') |
|
array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32) |
|
>>> fcluster(Z, 2, criterion='distance') |
|
array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32) |
|
>>> fcluster(Z, 4, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) |
|
|
|
Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a |
|
plot of the dendrogram. |
|
|
|
""" |
|
return linkage(y, method='median', metric='euclidean') |
|
|
|
|
|
def ward(y): |
|
""" |
|
Perform Ward's linkage on a condensed distance matrix. |
|
|
|
See `linkage` for more information on the return structure |
|
and algorithm. |
|
|
|
The following are common calling conventions: |
|
|
|
1. ``Z = ward(y)`` |
|
Performs Ward's linkage on the condensed distance matrix ``y``. |
|
|
|
2. ``Z = ward(X)`` |
|
Performs Ward's linkage on the observation matrix ``X`` using |
|
Euclidean distance as the distance metric. |
|
|
|
Parameters |
|
---------- |
|
y : ndarray |
|
A condensed distance matrix. A condensed |
|
distance matrix is a flat array containing the upper |
|
triangular of the distance matrix. This is the form that |
|
``pdist`` returns. Alternatively, a collection of |
|
m observation vectors in n dimensions may be passed as |
|
an m by n array. |
|
|
|
Returns |
|
------- |
|
Z : ndarray |
|
The hierarchical clustering encoded as a linkage matrix. See |
|
`linkage` for more information on the return structure and |
|
algorithm. |
|
|
|
See Also |
|
-------- |
|
linkage : for advanced creation of hierarchical clusterings. |
|
scipy.spatial.distance.pdist : pairwise distance metrics |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import ward, fcluster |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
First, we need a toy dataset to play with:: |
|
|
|
x x x x |
|
x x |
|
|
|
x x |
|
x x x x |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
Then, we get a condensed distance matrix from this dataset: |
|
|
|
>>> y = pdist(X) |
|
|
|
Finally, we can perform the clustering: |
|
|
|
>>> Z = ward(y) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.29099445, 3. ], |
|
[ 5. , 13. , 1.29099445, 3. ], |
|
[ 8. , 14. , 1.29099445, 3. ], |
|
[11. , 15. , 1.29099445, 3. ], |
|
[16. , 17. , 5.77350269, 6. ], |
|
[18. , 19. , 5.77350269, 6. ], |
|
[20. , 21. , 8.16496581, 12. ]]) |
|
|
|
The linkage matrix ``Z`` represents a dendrogram - see |
|
`scipy.cluster.hierarchy.linkage` for a detailed explanation of its |
|
contents. |
|
|
|
We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster |
|
each initial point would belong given a distance threshold: |
|
|
|
>>> fcluster(Z, 0.9, criterion='distance') |
|
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32) |
|
>>> fcluster(Z, 1.1, criterion='distance') |
|
array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32) |
|
>>> fcluster(Z, 3, criterion='distance') |
|
array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) |
|
>>> fcluster(Z, 9, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) |
|
|
|
Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a |
|
plot of the dendrogram. |
|
|
|
""" |
|
return linkage(y, method='ward', metric='euclidean') |
|
|
|
|
|
def linkage(y, method='single', metric='euclidean', optimal_ordering=False): |
|
""" |
|
Perform hierarchical/agglomerative clustering. |
|
|
|
The input y may be either a 1-D condensed distance matrix |
|
or a 2-D array of observation vectors. |
|
|
|
If y is a 1-D condensed distance matrix, |
|
then y must be a :math:`\\binom{n}{2}` sized |
|
vector, where n is the number of original observations paired |
|
in the distance matrix. The behavior of this function is very |
|
similar to the MATLAB linkage function. |
|
|
|
A :math:`(n-1)` by 4 matrix ``Z`` is returned. At the |
|
:math:`i`-th iteration, clusters with indices ``Z[i, 0]`` and |
|
``Z[i, 1]`` are combined to form cluster :math:`n + i`. A |
|
cluster with an index less than :math:`n` corresponds to one of |
|
the :math:`n` original observations. The distance between |
|
clusters ``Z[i, 0]`` and ``Z[i, 1]`` is given by ``Z[i, 2]``. The |
|
fourth value ``Z[i, 3]`` represents the number of original |
|
observations in the newly formed cluster. |
|
|
|
The following linkage methods are used to compute the distance |
|
:math:`d(s, t)` between two clusters :math:`s` and |
|
:math:`t`. The algorithm begins with a forest of clusters that |
|
have yet to be used in the hierarchy being formed. When two |
|
clusters :math:`s` and :math:`t` from this forest are combined |
|
into a single cluster :math:`u`, :math:`s` and :math:`t` are |
|
removed from the forest, and :math:`u` is added to the |
|
forest. When only one cluster remains in the forest, the algorithm |
|
stops, and this cluster becomes the root. |
|
|
|
A distance matrix is maintained at each iteration. The ``d[i,j]`` |
|
entry corresponds to the distance between cluster :math:`i` and |
|
:math:`j` in the original forest. |
|
|
|
At each iteration, the algorithm must update the distance matrix |
|
to reflect the distance of the newly formed cluster u with the |
|
remaining clusters in the forest. |
|
|
|
Suppose there are :math:`|u|` original observations |
|
:math:`u[0], \\ldots, u[|u|-1]` in cluster :math:`u` and |
|
:math:`|v|` original objects :math:`v[0], \\ldots, v[|v|-1]` in |
|
cluster :math:`v`. Recall, :math:`s` and :math:`t` are |
|
combined to form cluster :math:`u`. Let :math:`v` be any |
|
remaining cluster in the forest that is not :math:`u`. |
|
|
|
The following are methods for calculating the distance between the |
|
newly formed cluster :math:`u` and each :math:`v`. |
|
|
|
* method='single' assigns |
|
|
|
.. math:: |
|
d(u,v) = \\min(dist(u[i],v[j])) |
|
|
|
for all points :math:`i` in cluster :math:`u` and |
|
:math:`j` in cluster :math:`v`. This is also known as the |
|
Nearest Point Algorithm. |
|
|
|
* method='complete' assigns |
|
|
|
.. math:: |
|
d(u, v) = \\max(dist(u[i],v[j])) |
|
|
|
for all points :math:`i` in cluster u and :math:`j` in |
|
cluster :math:`v`. This is also known by the Farthest Point |
|
Algorithm or Voor Hees Algorithm. |
|
|
|
* method='average' assigns |
|
|
|
.. math:: |
|
d(u,v) = \\sum_{ij} \\frac{d(u[i], v[j])} |
|
{(|u|*|v|)} |
|
|
|
for all points :math:`i` and :math:`j` where :math:`|u|` |
|
and :math:`|v|` are the cardinalities of clusters :math:`u` |
|
and :math:`v`, respectively. This is also called the UPGMA |
|
algorithm. |
|
|
|
* method='weighted' assigns |
|
|
|
.. math:: |
|
d(u,v) = (dist(s,v) + dist(t,v))/2 |
|
|
|
where cluster u was formed with cluster s and t and v |
|
is a remaining cluster in the forest (also called WPGMA). |
|
|
|
* method='centroid' assigns |
|
|
|
.. math:: |
|
dist(s,t) = ||c_s-c_t||_2 |
|
|
|
where :math:`c_s` and :math:`c_t` are the centroids of |
|
clusters :math:`s` and :math:`t`, respectively. When two |
|
clusters :math:`s` and :math:`t` are combined into a new |
|
cluster :math:`u`, the new centroid is computed over all the |
|
original objects in clusters :math:`s` and :math:`t`. The |
|
distance then becomes the Euclidean distance between the |
|
centroid of :math:`u` and the centroid of a remaining cluster |
|
:math:`v` in the forest. This is also known as the UPGMC |
|
algorithm. |
|
|
|
* method='median' assigns :math:`d(s,t)` like the ``centroid`` |
|
method. When two clusters :math:`s` and :math:`t` are combined |
|
into a new cluster :math:`u`, the average of centroids s and t |
|
give the new centroid :math:`u`. This is also known as the |
|
WPGMC algorithm. |
|
|
|
* method='ward' uses the Ward variance minimization algorithm. |
|
The new entry :math:`d(u,v)` is computed as follows, |
|
|
|
.. math:: |
|
|
|
d(u,v) = \\sqrt{\\frac{|v|+|s|} |
|
{T}d(v,s)^2 |
|
+ \\frac{|v|+|t|} |
|
{T}d(v,t)^2 |
|
- \\frac{|v|} |
|
{T}d(s,t)^2} |
|
|
|
where :math:`u` is the newly joined cluster consisting of |
|
clusters :math:`s` and :math:`t`, :math:`v` is an unused |
|
cluster in the forest, :math:`T=|v|+|s|+|t|`, and |
|
:math:`|*|` is the cardinality of its argument. This is also |
|
known as the incremental algorithm. |
|
|
|
Warning: When the minimum distance pair in the forest is chosen, there |
|
may be two or more pairs with the same minimum distance. This |
|
implementation may choose a different minimum than the MATLAB |
|
version. |
|
|
|
Parameters |
|
---------- |
|
y : ndarray |
|
A condensed distance matrix. A condensed distance matrix |
|
is a flat array containing the upper triangular of the distance matrix. |
|
This is the form that ``pdist`` returns. Alternatively, a collection of |
|
:math:`m` observation vectors in :math:`n` dimensions may be passed as |
|
an :math:`m` by :math:`n` array. All elements of the condensed distance |
|
matrix must be finite, i.e., no NaNs or infs. |
|
method : str, optional |
|
The linkage algorithm to use. See the ``Linkage Methods`` section below |
|
for full descriptions. |
|
metric : str or function, optional |
|
The distance metric to use in the case that y is a collection of |
|
observation vectors; ignored otherwise. See the ``pdist`` |
|
function for a list of valid distance metrics. A custom distance |
|
function can also be used. |
|
optimal_ordering : bool, optional |
|
If True, the linkage matrix will be reordered so that the distance |
|
between successive leaves is minimal. This results in a more intuitive |
|
tree structure when the data are visualized. defaults to False, because |
|
this algorithm can be slow, particularly on large datasets [2]_. See |
|
also the `optimal_leaf_ordering` function. |
|
|
|
.. versionadded:: 1.0.0 |
|
|
|
Returns |
|
------- |
|
Z : ndarray |
|
The hierarchical clustering encoded as a linkage matrix. |
|
|
|
Notes |
|
----- |
|
1. For method 'single', an optimized algorithm based on minimum spanning |
|
tree is implemented. It has time complexity :math:`O(n^2)`. |
|
For methods 'complete', 'average', 'weighted' and 'ward', an algorithm |
|
called nearest-neighbors chain is implemented. It also has time |
|
complexity :math:`O(n^2)`. |
|
For other methods, a naive algorithm is implemented with :math:`O(n^3)` |
|
time complexity. |
|
All algorithms use :math:`O(n^2)` memory. |
|
Refer to [1]_ for details about the algorithms. |
|
2. Methods 'centroid', 'median', and 'ward' are correctly defined only if |
|
Euclidean pairwise metric is used. If `y` is passed as precomputed |
|
pairwise distances, then it is the user's responsibility to assure that |
|
these distances are in fact Euclidean, otherwise the produced result |
|
will be incorrect. |
|
|
|
See Also |
|
-------- |
|
scipy.spatial.distance.pdist : pairwise distance metrics |
|
|
|
References |
|
---------- |
|
.. [1] Daniel Mullner, "Modern hierarchical, agglomerative clustering |
|
algorithms", :arXiv:`1109.2378v1`. |
|
.. [2] Ziv Bar-Joseph, David K. Gifford, Tommi S. Jaakkola, "Fast optimal |
|
leaf ordering for hierarchical clustering", 2001. Bioinformatics |
|
:doi:`10.1093/bioinformatics/17.suppl_1.S22` |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import dendrogram, linkage |
|
>>> from matplotlib import pyplot as plt |
|
>>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]] |
|
|
|
>>> Z = linkage(X, 'ward') |
|
>>> fig = plt.figure(figsize=(25, 10)) |
|
>>> dn = dendrogram(Z) |
|
|
|
>>> Z = linkage(X, 'single') |
|
>>> fig = plt.figure(figsize=(25, 10)) |
|
>>> dn = dendrogram(Z) |
|
>>> plt.show() |
|
""" |
|
xp = array_namespace(y) |
|
y = _asarray(y, order='C', dtype=xp.float64, xp=xp) |
|
|
|
if method not in _LINKAGE_METHODS: |
|
raise ValueError(f"Invalid method: {method}") |
|
|
|
if method in _EUCLIDEAN_METHODS and metric != 'euclidean' and y.ndim == 2: |
|
msg = f"`method={method}` requires the distance metric to be Euclidean" |
|
raise ValueError(msg) |
|
|
|
if y.ndim == 1: |
|
distance.is_valid_y(y, throw=True, name='y') |
|
elif y.ndim == 2: |
|
if (y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0) and |
|
xp.all(y >= 0) and np.allclose(y, y.T)): |
|
warnings.warn('The symmetric non-negative hollow observation ' |
|
'matrix looks suspiciously like an uncondensed ' |
|
'distance matrix', |
|
ClusterWarning, stacklevel=2) |
|
y = distance.pdist(y, metric) |
|
y = xp.asarray(y) |
|
else: |
|
raise ValueError("`y` must be 1 or 2 dimensional.") |
|
|
|
if not xp.all(xp.isfinite(y)): |
|
raise ValueError("The condensed distance matrix must contain only " |
|
"finite values.") |
|
|
|
n = int(distance.num_obs_y(y)) |
|
method_code = _LINKAGE_METHODS[method] |
|
|
|
y = np.asarray(y) |
|
if method == 'single': |
|
result = _hierarchy.mst_single_linkage(y, n) |
|
elif method in ['complete', 'average', 'weighted', 'ward']: |
|
result = _hierarchy.nn_chain(y, n, method_code) |
|
else: |
|
result = _hierarchy.fast_linkage(y, n, method_code) |
|
result = xp.asarray(result) |
|
|
|
if optimal_ordering: |
|
y = xp.asarray(y) |
|
return optimal_leaf_ordering(result, y) |
|
else: |
|
return result |
|
|
|
|
|
class ClusterNode: |
|
""" |
|
A tree node class for representing a cluster. |
|
|
|
Leaf nodes correspond to original observations, while non-leaf nodes |
|
correspond to non-singleton clusters. |
|
|
|
The `to_tree` function converts a matrix returned by the linkage |
|
function into an easy-to-use tree representation. |
|
|
|
All parameter names are also attributes. |
|
|
|
Parameters |
|
---------- |
|
id : int |
|
The node id. |
|
left : ClusterNode instance, optional |
|
The left child tree node. |
|
right : ClusterNode instance, optional |
|
The right child tree node. |
|
dist : float, optional |
|
Distance for this cluster in the linkage matrix. |
|
count : int, optional |
|
The number of samples in this cluster. |
|
|
|
See Also |
|
-------- |
|
to_tree : for converting a linkage matrix ``Z`` into a tree object. |
|
|
|
""" |
|
|
|
def __init__(self, id, left=None, right=None, dist=0.0, count=1): |
|
if id < 0: |
|
raise ValueError('The id must be non-negative.') |
|
if dist < 0: |
|
raise ValueError('The distance must be non-negative.') |
|
if (left is None and right is not None) or \ |
|
(left is not None and right is None): |
|
raise ValueError('Only full or proper binary trees are permitted.' |
|
' This node has one child.') |
|
if count < 1: |
|
raise ValueError('A cluster must contain at least one original ' |
|
'observation.') |
|
self.id = id |
|
self.left = left |
|
self.right = right |
|
self.dist = dist |
|
if self.left is None: |
|
self.count = count |
|
else: |
|
self.count = left.count + right.count |
|
|
|
def __lt__(self, node): |
|
if not isinstance(node, ClusterNode): |
|
raise ValueError("Can't compare ClusterNode " |
|
f"to type {type(node)}") |
|
return self.dist < node.dist |
|
|
|
def __gt__(self, node): |
|
if not isinstance(node, ClusterNode): |
|
raise ValueError("Can't compare ClusterNode " |
|
f"to type {type(node)}") |
|
return self.dist > node.dist |
|
|
|
def __eq__(self, node): |
|
if not isinstance(node, ClusterNode): |
|
raise ValueError("Can't compare ClusterNode " |
|
f"to type {type(node)}") |
|
return self.dist == node.dist |
|
|
|
def get_id(self): |
|
""" |
|
The identifier of the target node. |
|
|
|
For ``0 <= i < n``, `i` corresponds to original observation i. |
|
For ``n <= i < 2n-1``, `i` corresponds to non-singleton cluster formed |
|
at iteration ``i-n``. |
|
|
|
Returns |
|
------- |
|
id : int |
|
The identifier of the target node. |
|
|
|
""" |
|
return self.id |
|
|
|
def get_count(self): |
|
""" |
|
The number of leaf nodes (original observations) belonging to |
|
the cluster node nd. If the target node is a leaf, 1 is |
|
returned. |
|
|
|
Returns |
|
------- |
|
get_count : int |
|
The number of leaf nodes below the target node. |
|
|
|
""" |
|
return self.count |
|
|
|
def get_left(self): |
|
""" |
|
Return a reference to the left child tree object. |
|
|
|
Returns |
|
------- |
|
left : ClusterNode |
|
The left child of the target node. If the node is a leaf, |
|
None is returned. |
|
|
|
""" |
|
return self.left |
|
|
|
def get_right(self): |
|
""" |
|
Return a reference to the right child tree object. |
|
|
|
Returns |
|
------- |
|
right : ClusterNode |
|
The left child of the target node. If the node is a leaf, |
|
None is returned. |
|
|
|
""" |
|
return self.right |
|
|
|
def is_leaf(self): |
|
""" |
|
Return True if the target node is a leaf. |
|
|
|
Returns |
|
------- |
|
leafness : bool |
|
True if the target node is a leaf node. |
|
|
|
""" |
|
return self.left is None |
|
|
|
def pre_order(self, func=(lambda x: x.id)): |
|
""" |
|
Perform pre-order traversal without recursive function calls. |
|
|
|
When a leaf node is first encountered, ``func`` is called with |
|
the leaf node as its argument, and its result is appended to |
|
the list. |
|
|
|
For example, the statement:: |
|
|
|
ids = root.pre_order(lambda x: x.id) |
|
|
|
returns a list of the node ids corresponding to the leaf nodes |
|
of the tree as they appear from left to right. |
|
|
|
Parameters |
|
---------- |
|
func : function |
|
Applied to each leaf ClusterNode object in the pre-order traversal. |
|
Given the ``i``-th leaf node in the pre-order traversal ``n[i]``, |
|
the result of ``func(n[i])`` is stored in ``L[i]``. If not |
|
provided, the index of the original observation to which the node |
|
corresponds is used. |
|
|
|
Returns |
|
------- |
|
L : list |
|
The pre-order traversal. |
|
|
|
""" |
|
|
|
|
|
n = self.count |
|
|
|
curNode = [None] * (2 * n) |
|
lvisited = set() |
|
rvisited = set() |
|
curNode[0] = self |
|
k = 0 |
|
preorder = [] |
|
while k >= 0: |
|
nd = curNode[k] |
|
ndid = nd.id |
|
if nd.is_leaf(): |
|
preorder.append(func(nd)) |
|
k = k - 1 |
|
else: |
|
if ndid not in lvisited: |
|
curNode[k + 1] = nd.left |
|
lvisited.add(ndid) |
|
k = k + 1 |
|
elif ndid not in rvisited: |
|
curNode[k + 1] = nd.right |
|
rvisited.add(ndid) |
|
k = k + 1 |
|
|
|
|
|
else: |
|
k = k - 1 |
|
|
|
return preorder |
|
|
|
|
|
_cnode_bare = ClusterNode(0) |
|
_cnode_type = type(ClusterNode) |
|
|
|
|
|
def _order_cluster_tree(Z): |
|
""" |
|
Return clustering nodes in bottom-up order by distance. |
|
|
|
Parameters |
|
---------- |
|
Z : scipy.cluster.linkage array |
|
The linkage matrix. |
|
|
|
Returns |
|
------- |
|
nodes : list |
|
A list of ClusterNode objects. |
|
""" |
|
q = deque() |
|
tree = to_tree(Z) |
|
q.append(tree) |
|
nodes = [] |
|
|
|
while q: |
|
node = q.popleft() |
|
if not node.is_leaf(): |
|
bisect.insort_left(nodes, node) |
|
q.append(node.get_right()) |
|
q.append(node.get_left()) |
|
return nodes |
|
|
|
|
|
def cut_tree(Z, n_clusters=None, height=None): |
|
""" |
|
Given a linkage matrix Z, return the cut tree. |
|
|
|
Parameters |
|
---------- |
|
Z : scipy.cluster.linkage array |
|
The linkage matrix. |
|
n_clusters : array_like, optional |
|
Number of clusters in the tree at the cut point. |
|
height : array_like, optional |
|
The height at which to cut the tree. Only possible for ultrametric |
|
trees. |
|
|
|
Returns |
|
------- |
|
cutree : array |
|
An array indicating group membership at each agglomeration step. I.e., |
|
for a full cut tree, in the first column each data point is in its own |
|
cluster. At the next step, two nodes are merged. Finally, all |
|
singleton and non-singleton clusters are in one group. If `n_clusters` |
|
or `height` are given, the columns correspond to the columns of |
|
`n_clusters` or `height`. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy import cluster |
|
>>> import numpy as np |
|
>>> from numpy.random import default_rng |
|
>>> rng = default_rng() |
|
>>> X = rng.random((50, 4)) |
|
>>> Z = cluster.hierarchy.ward(X) |
|
>>> cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10]) |
|
>>> cutree[:10] |
|
array([[0, 0], |
|
[1, 1], |
|
[2, 2], |
|
[3, 3], |
|
[3, 4], |
|
[2, 2], |
|
[0, 0], |
|
[1, 5], |
|
[3, 6], |
|
[4, 7]]) # random |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
nobs = num_obs_linkage(Z) |
|
nodes = _order_cluster_tree(Z) |
|
|
|
if height is not None and n_clusters is not None: |
|
raise ValueError("At least one of either height or n_clusters " |
|
"must be None") |
|
elif height is None and n_clusters is None: |
|
cols_idx = xp.arange(nobs) |
|
elif height is not None: |
|
height = xp.asarray(height) |
|
heights = xp.asarray([x.dist for x in nodes]) |
|
cols_idx = xp.searchsorted(heights, height) |
|
else: |
|
n_clusters = xp.asarray(n_clusters) |
|
cols_idx = nobs - xp.searchsorted(xp.arange(nobs), n_clusters) |
|
|
|
try: |
|
n_cols = len(cols_idx) |
|
except TypeError: |
|
n_cols = 1 |
|
cols_idx = xp.asarray([cols_idx]) |
|
|
|
groups = xp.zeros((n_cols, nobs), dtype=xp.int64) |
|
last_group = xp.arange(nobs) |
|
if 0 in cols_idx: |
|
groups[0] = last_group |
|
|
|
for i, node in enumerate(nodes): |
|
idx = node.pre_order() |
|
this_group = xp_copy(last_group, xp=xp) |
|
|
|
this_group[idx] = xp.min(last_group[idx]) |
|
this_group[this_group > xp.max(last_group[idx])] -= 1 |
|
if i + 1 in cols_idx: |
|
groups[np.nonzero(i + 1 == cols_idx)[0]] = this_group |
|
last_group = this_group |
|
|
|
return groups.T |
|
|
|
|
|
def to_tree(Z, rd=False): |
|
""" |
|
Convert a linkage matrix into an easy-to-use tree object. |
|
|
|
The reference to the root `ClusterNode` object is returned (by default). |
|
|
|
Each `ClusterNode` object has a ``left``, ``right``, ``dist``, ``id``, |
|
and ``count`` attribute. The left and right attributes point to |
|
ClusterNode objects that were combined to generate the cluster. |
|
If both are None then the `ClusterNode` object is a leaf node, its count |
|
must be 1, and its distance is meaningless but set to 0. |
|
|
|
*Note: This function is provided for the convenience of the library |
|
user. ClusterNodes are not used as input to any of the functions in this |
|
library.* |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The linkage matrix in proper form (see the `linkage` |
|
function documentation). |
|
rd : bool, optional |
|
When False (default), a reference to the root `ClusterNode` object is |
|
returned. Otherwise, a tuple ``(r, d)`` is returned. ``r`` is a |
|
reference to the root node while ``d`` is a list of `ClusterNode` |
|
objects - one per original entry in the linkage matrix plus entries |
|
for all clustering steps. If a cluster id is |
|
less than the number of samples ``n`` in the data that the linkage |
|
matrix describes, then it corresponds to a singleton cluster (leaf |
|
node). |
|
See `linkage` for more information on the assignment of cluster ids |
|
to clusters. |
|
|
|
Returns |
|
------- |
|
tree : ClusterNode or tuple (ClusterNode, list of ClusterNode) |
|
If ``rd`` is False, a `ClusterNode`. |
|
If ``rd`` is True, a list of length ``2*n - 1``, with ``n`` the number |
|
of samples. See the description of `rd` above for more details. |
|
|
|
See Also |
|
-------- |
|
linkage, is_valid_linkage, ClusterNode |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.cluster import hierarchy |
|
>>> rng = np.random.default_rng() |
|
>>> x = rng.random((5, 2)) |
|
>>> Z = hierarchy.linkage(x) |
|
>>> hierarchy.to_tree(Z) |
|
<scipy.cluster.hierarchy.ClusterNode object at ... |
|
>>> rootnode, nodelist = hierarchy.to_tree(Z, rd=True) |
|
>>> rootnode |
|
<scipy.cluster.hierarchy.ClusterNode object at ... |
|
>>> len(nodelist) |
|
9 |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='c', xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
|
|
|
|
n = Z.shape[0] + 1 |
|
|
|
|
|
d = [None] * (n * 2 - 1) |
|
|
|
|
|
for i in range(0, n): |
|
d[i] = ClusterNode(i) |
|
|
|
nd = None |
|
|
|
for i in range(Z.shape[0]): |
|
row = Z[i, :] |
|
|
|
fi = int_floor(row[0], xp) |
|
fj = int_floor(row[1], xp) |
|
if fi > i + n: |
|
raise ValueError(('Corrupt matrix Z. Index to derivative cluster ' |
|
'is used before it is formed. See row %d, ' |
|
'column 0') % fi) |
|
if fj > i + n: |
|
raise ValueError(('Corrupt matrix Z. Index to derivative cluster ' |
|
'is used before it is formed. See row %d, ' |
|
'column 1') % fj) |
|
|
|
nd = ClusterNode(i + n, d[fi], d[fj], row[2]) |
|
|
|
if row[3] != nd.count: |
|
raise ValueError(('Corrupt matrix Z. The count Z[%d,3] is ' |
|
'incorrect.') % i) |
|
d[n + i] = nd |
|
|
|
if rd: |
|
return (nd, d) |
|
else: |
|
return nd |
|
|
|
|
|
def optimal_leaf_ordering(Z, y, metric='euclidean'): |
|
""" |
|
Given a linkage matrix Z and distance, reorder the cut tree. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The hierarchical clustering encoded as a linkage matrix. See |
|
`linkage` for more information on the return structure and |
|
algorithm. |
|
y : ndarray |
|
The condensed distance matrix from which Z was generated. |
|
Alternatively, a collection of m observation vectors in n |
|
dimensions may be passed as an m by n array. |
|
metric : str or function, optional |
|
The distance metric to use in the case that y is a collection of |
|
observation vectors; ignored otherwise. See the ``pdist`` |
|
function for a list of valid distance metrics. A custom distance |
|
function can also be used. |
|
|
|
Returns |
|
------- |
|
Z_ordered : ndarray |
|
A copy of the linkage matrix Z, reordered to minimize the distance |
|
between adjacent leaves. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.cluster import hierarchy |
|
>>> rng = np.random.default_rng() |
|
>>> X = rng.standard_normal((10, 10)) |
|
>>> Z = hierarchy.ward(X) |
|
>>> hierarchy.leaves_list(Z) |
|
array([0, 3, 1, 9, 2, 5, 7, 4, 6, 8], dtype=int32) |
|
>>> hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, X)) |
|
array([3, 0, 2, 5, 7, 4, 8, 6, 9, 1], dtype=int32) |
|
|
|
""" |
|
xp = array_namespace(Z, y) |
|
Z = _asarray(Z, order='C', xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
|
|
y = _asarray(y, order='C', dtype=xp.float64, xp=xp) |
|
|
|
if y.ndim == 1: |
|
distance.is_valid_y(y, throw=True, name='y') |
|
elif y.ndim == 2: |
|
if (y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0) and |
|
np.all(y >= 0) and np.allclose(y, y.T)): |
|
warnings.warn('The symmetric non-negative hollow observation ' |
|
'matrix looks suspiciously like an uncondensed ' |
|
'distance matrix', |
|
ClusterWarning, stacklevel=2) |
|
y = distance.pdist(y, metric) |
|
y = xp.asarray(y) |
|
else: |
|
raise ValueError("`y` must be 1 or 2 dimensional.") |
|
|
|
if not xp.all(xp.isfinite(y)): |
|
raise ValueError("The condensed distance matrix must contain only " |
|
"finite values.") |
|
|
|
Z = np.asarray(Z) |
|
y = np.asarray(y) |
|
return xp.asarray(_optimal_leaf_ordering.optimal_leaf_ordering(Z, y)) |
|
|
|
|
|
def cophenet(Z, Y=None): |
|
""" |
|
Calculate the cophenetic distances between each observation in |
|
the hierarchical clustering defined by the linkage ``Z``. |
|
|
|
Suppose ``p`` and ``q`` are original observations in |
|
disjoint clusters ``s`` and ``t``, respectively and |
|
``s`` and ``t`` are joined by a direct parent cluster |
|
``u``. The cophenetic distance between observations |
|
``i`` and ``j`` is simply the distance between |
|
clusters ``s`` and ``t``. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The hierarchical clustering encoded as an array |
|
(see `linkage` function). |
|
Y : ndarray (optional) |
|
Calculates the cophenetic correlation coefficient ``c`` of a |
|
hierarchical clustering defined by the linkage matrix `Z` |
|
of a set of :math:`n` observations in :math:`m` |
|
dimensions. `Y` is the condensed distance matrix from which |
|
`Z` was generated. |
|
|
|
Returns |
|
------- |
|
c : ndarray |
|
The cophentic correlation distance (if ``Y`` is passed). |
|
d : ndarray |
|
The cophenetic distance matrix in condensed form. The |
|
:math:`ij` th entry is the cophenetic distance between |
|
original observations :math:`i` and :math:`j`. |
|
|
|
See Also |
|
-------- |
|
linkage : |
|
for a description of what a linkage matrix is. |
|
scipy.spatial.distance.squareform : |
|
transforming condensed matrices into square ones. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import single, cophenet |
|
>>> from scipy.spatial.distance import pdist, squareform |
|
|
|
Given a dataset ``X`` and a linkage matrix ``Z``, the cophenetic distance |
|
between two points of ``X`` is the distance between the largest two |
|
distinct clusters that each of the points: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
``X`` corresponds to this dataset :: |
|
|
|
x x x x |
|
x x |
|
|
|
x x |
|
x x x x |
|
|
|
>>> Z = single(pdist(X)) |
|
>>> Z |
|
array([[ 0., 1., 1., 2.], |
|
[ 2., 12., 1., 3.], |
|
[ 3., 4., 1., 2.], |
|
[ 5., 14., 1., 3.], |
|
[ 6., 7., 1., 2.], |
|
[ 8., 16., 1., 3.], |
|
[ 9., 10., 1., 2.], |
|
[11., 18., 1., 3.], |
|
[13., 15., 2., 6.], |
|
[17., 20., 2., 9.], |
|
[19., 21., 2., 12.]]) |
|
>>> cophenet(Z) |
|
array([1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2., |
|
2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 2., |
|
2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., |
|
1., 1., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1.]) |
|
|
|
The output of the `scipy.cluster.hierarchy.cophenet` method is |
|
represented in condensed form. We can use |
|
`scipy.spatial.distance.squareform` to see the output as a |
|
regular matrix (where each element ``ij`` denotes the cophenetic distance |
|
between each ``i``, ``j`` pair of points in ``X``): |
|
|
|
>>> squareform(cophenet(Z)) |
|
array([[0., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.], |
|
[1., 0., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.], |
|
[1., 1., 0., 2., 2., 2., 2., 2., 2., 2., 2., 2.], |
|
[2., 2., 2., 0., 1., 1., 2., 2., 2., 2., 2., 2.], |
|
[2., 2., 2., 1., 0., 1., 2., 2., 2., 2., 2., 2.], |
|
[2., 2., 2., 1., 1., 0., 2., 2., 2., 2., 2., 2.], |
|
[2., 2., 2., 2., 2., 2., 0., 1., 1., 2., 2., 2.], |
|
[2., 2., 2., 2., 2., 2., 1., 0., 1., 2., 2., 2.], |
|
[2., 2., 2., 2., 2., 2., 1., 1., 0., 2., 2., 2.], |
|
[2., 2., 2., 2., 2., 2., 2., 2., 2., 0., 1., 1.], |
|
[2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 0., 1.], |
|
[2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 0.]]) |
|
|
|
In this example, the cophenetic distance between points on ``X`` that are |
|
very close (i.e., in the same corner) is 1. For other pairs of points is 2, |
|
because the points will be located in clusters at different |
|
corners - thus, the distance between these clusters will be larger. |
|
|
|
""" |
|
xp = array_namespace(Z, Y) |
|
|
|
Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
n = Z.shape[0] + 1 |
|
zz = np.zeros((n * (n-1)) // 2, dtype=np.float64) |
|
|
|
Z = np.asarray(Z) |
|
_hierarchy.cophenetic_distances(Z, zz, int(n)) |
|
zz = xp.asarray(zz) |
|
if Y is None: |
|
return zz |
|
|
|
Y = _asarray(Y, order='C', xp=xp) |
|
distance.is_valid_y(Y, throw=True, name='Y') |
|
|
|
z = xp.mean(zz) |
|
y = xp.mean(Y) |
|
Yy = Y - y |
|
Zz = zz - z |
|
numerator = (Yy * Zz) |
|
denomA = Yy**2 |
|
denomB = Zz**2 |
|
c = xp.sum(numerator) / xp.sqrt(xp.sum(denomA) * xp.sum(denomB)) |
|
return (c, zz) |
|
|
|
|
|
def inconsistent(Z, d=2): |
|
r""" |
|
Calculate inconsistency statistics on a linkage matrix. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The :math:`(n-1)` by 4 matrix encoding the linkage (hierarchical |
|
clustering). See `linkage` documentation for more information on its |
|
form. |
|
d : int, optional |
|
The number of links up to `d` levels below each non-singleton cluster. |
|
|
|
Returns |
|
------- |
|
R : ndarray |
|
A :math:`(n-1)` by 4 matrix where the ``i``'th row contains the link |
|
statistics for the non-singleton cluster ``i``. The link statistics are |
|
computed over the link heights for links :math:`d` levels below the |
|
cluster ``i``. ``R[i,0]`` and ``R[i,1]`` are the mean and standard |
|
deviation of the link heights, respectively; ``R[i,2]`` is the number |
|
of links included in the calculation; and ``R[i,3]`` is the |
|
inconsistency coefficient, |
|
|
|
.. math:: \frac{\mathtt{Z[i,2]} - \mathtt{R[i,0]}} {R[i,1]} |
|
|
|
Notes |
|
----- |
|
This function behaves similarly to the MATLAB(TM) ``inconsistent`` |
|
function. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import inconsistent, linkage |
|
>>> from matplotlib import pyplot as plt |
|
>>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]] |
|
>>> Z = linkage(X, 'ward') |
|
>>> print(Z) |
|
[[ 5. 6. 0. 2. ] |
|
[ 2. 7. 0. 2. ] |
|
[ 0. 4. 1. 2. ] |
|
[ 1. 8. 1.15470054 3. ] |
|
[ 9. 10. 2.12132034 4. ] |
|
[ 3. 12. 4.11096096 5. ] |
|
[11. 13. 14.07183949 8. ]] |
|
>>> inconsistent(Z) |
|
array([[ 0. , 0. , 1. , 0. ], |
|
[ 0. , 0. , 1. , 0. ], |
|
[ 1. , 0. , 1. , 0. ], |
|
[ 0.57735027, 0.81649658, 2. , 0.70710678], |
|
[ 1.04044011, 1.06123822, 3. , 1.01850858], |
|
[ 3.11614065, 1.40688837, 2. , 0.70710678], |
|
[ 6.44583366, 6.76770586, 3. , 1.12682288]]) |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
|
|
if (not d == np.floor(d)) or d < 0: |
|
raise ValueError('The second argument d must be a nonnegative ' |
|
'integer value.') |
|
|
|
n = Z.shape[0] + 1 |
|
R = np.zeros((n - 1, 4), dtype=np.float64) |
|
|
|
Z = np.asarray(Z) |
|
_hierarchy.inconsistent(Z, R, int(n), int(d)) |
|
R = xp.asarray(R) |
|
return R |
|
|
|
|
|
def from_mlab_linkage(Z): |
|
""" |
|
Convert a linkage matrix generated by MATLAB(TM) to a new |
|
linkage matrix compatible with this module. |
|
|
|
The conversion does two things: |
|
|
|
* the indices are converted from ``1..N`` to ``0..(N-1)`` form, |
|
and |
|
|
|
* a fourth column ``Z[:,3]`` is added where ``Z[i,3]`` represents the |
|
number of original observations (leaves) in the non-singleton |
|
cluster ``i``. |
|
|
|
This function is useful when loading in linkages from legacy data |
|
files generated by MATLAB. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
A linkage matrix generated by MATLAB(TM). |
|
|
|
Returns |
|
------- |
|
ZS : ndarray |
|
A linkage matrix compatible with ``scipy.cluster.hierarchy``. |
|
|
|
See Also |
|
-------- |
|
linkage : for a description of what a linkage matrix is. |
|
to_mlab_linkage : transform from SciPy to MATLAB format. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.cluster.hierarchy import ward, from_mlab_linkage |
|
|
|
Given a linkage matrix in MATLAB format ``mZ``, we can use |
|
`scipy.cluster.hierarchy.from_mlab_linkage` to import |
|
it into SciPy format: |
|
|
|
>>> mZ = np.array([[1, 2, 1], [4, 5, 1], [7, 8, 1], |
|
... [10, 11, 1], [3, 13, 1.29099445], |
|
... [6, 14, 1.29099445], |
|
... [9, 15, 1.29099445], |
|
... [12, 16, 1.29099445], |
|
... [17, 18, 5.77350269], |
|
... [19, 20, 5.77350269], |
|
... [21, 22, 8.16496581]]) |
|
|
|
>>> Z = from_mlab_linkage(mZ) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.29099445, 3. ], |
|
[ 5. , 13. , 1.29099445, 3. ], |
|
[ 8. , 14. , 1.29099445, 3. ], |
|
[ 11. , 15. , 1.29099445, 3. ], |
|
[ 16. , 17. , 5.77350269, 6. ], |
|
[ 18. , 19. , 5.77350269, 6. ], |
|
[ 20. , 21. , 8.16496581, 12. ]]) |
|
|
|
As expected, the linkage matrix ``Z`` returned includes an |
|
additional column counting the number of original samples in |
|
each cluster. Also, all cluster indices are reduced by 1 |
|
(MATLAB format uses 1-indexing, whereas SciPy uses 0-indexing). |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, dtype=xp.float64, order='C', xp=xp) |
|
Zs = Z.shape |
|
|
|
|
|
if len(Zs) == 0 or (len(Zs) == 1 and Zs[0] == 0): |
|
return xp_copy(Z, xp=xp) |
|
|
|
if len(Zs) != 2: |
|
raise ValueError("The linkage array must be rectangular.") |
|
|
|
|
|
if Zs[0] == 0: |
|
return xp_copy(Z, xp=xp) |
|
|
|
if xp.min(Z[:, 0:2]) != 1.0 and xp.max(Z[:, 0:2]) != 2 * Zs[0]: |
|
raise ValueError('The format of the indices is not 1..N') |
|
|
|
Zpart = xp.concat((Z[:, 0:2] - 1.0, Z[:, 2:]), axis=1) |
|
CS = np.zeros((Zs[0],), dtype=np.float64) |
|
if is_jax(xp): |
|
|
|
Zpart = np.array(Zpart, copy=True) |
|
else: |
|
Zpart = np.asarray(Zpart) |
|
_hierarchy.calculate_cluster_sizes(Zpart, CS, int(Zs[0]) + 1) |
|
res = np.hstack([Zpart, CS.reshape(Zs[0], 1)]) |
|
return xp.asarray(res) |
|
|
|
|
|
def to_mlab_linkage(Z): |
|
""" |
|
Convert a linkage matrix to a MATLAB(TM) compatible one. |
|
|
|
Converts a linkage matrix ``Z`` generated by the linkage function |
|
of this module to a MATLAB(TM) compatible one. The return linkage |
|
matrix has the last column removed and the cluster indices are |
|
converted to ``1..N`` indexing. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
A linkage matrix generated by ``scipy.cluster.hierarchy``. |
|
|
|
Returns |
|
------- |
|
to_mlab_linkage : ndarray |
|
A linkage matrix compatible with MATLAB(TM)'s hierarchical |
|
clustering functions. |
|
|
|
The return linkage matrix has the last column removed |
|
and the cluster indices are converted to ``1..N`` indexing. |
|
|
|
See Also |
|
-------- |
|
linkage : for a description of what a linkage matrix is. |
|
from_mlab_linkage : transform from Matlab to SciPy format. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import ward, to_mlab_linkage |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = ward(pdist(X)) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.29099445, 3. ], |
|
[ 5. , 13. , 1.29099445, 3. ], |
|
[ 8. , 14. , 1.29099445, 3. ], |
|
[11. , 15. , 1.29099445, 3. ], |
|
[16. , 17. , 5.77350269, 6. ], |
|
[18. , 19. , 5.77350269, 6. ], |
|
[20. , 21. , 8.16496581, 12. ]]) |
|
|
|
After a linkage matrix ``Z`` has been created, we can use |
|
`scipy.cluster.hierarchy.to_mlab_linkage` to convert it |
|
into MATLAB format: |
|
|
|
>>> mZ = to_mlab_linkage(Z) |
|
>>> mZ |
|
array([[ 1. , 2. , 1. ], |
|
[ 4. , 5. , 1. ], |
|
[ 7. , 8. , 1. ], |
|
[ 10. , 11. , 1. ], |
|
[ 3. , 13. , 1.29099445], |
|
[ 6. , 14. , 1.29099445], |
|
[ 9. , 15. , 1.29099445], |
|
[ 12. , 16. , 1.29099445], |
|
[ 17. , 18. , 5.77350269], |
|
[ 19. , 20. , 5.77350269], |
|
[ 21. , 22. , 8.16496581]]) |
|
|
|
The new linkage matrix ``mZ`` uses 1-indexing for all the |
|
clusters (instead of 0-indexing). Also, the last column of |
|
the original linkage matrix has been dropped. |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp) |
|
Zs = Z.shape |
|
if len(Zs) == 0 or (len(Zs) == 1 and Zs[0] == 0): |
|
return xp_copy(Z, xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
|
|
return xp.concat((Z[:, :2] + 1.0, Z[:, 2:3]), axis=1) |
|
|
|
|
|
def is_monotonic(Z): |
|
""" |
|
Return True if the linkage passed is monotonic. |
|
|
|
The linkage is monotonic if for every cluster :math:`s` and :math:`t` |
|
joined, the distance between them is no less than the distance |
|
between any previously joined clusters. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The linkage matrix to check for monotonicity. |
|
|
|
Returns |
|
------- |
|
b : bool |
|
A boolean indicating whether the linkage is monotonic. |
|
|
|
See Also |
|
-------- |
|
linkage : for a description of what a linkage matrix is. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import median, ward, is_monotonic |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
By definition, some hierarchical clustering algorithms - such as |
|
`scipy.cluster.hierarchy.ward` - produce monotonic assignments of |
|
samples to clusters; however, this is not always true for other |
|
hierarchical methods - e.g. `scipy.cluster.hierarchy.median`. |
|
|
|
Given a linkage matrix ``Z`` (as the result of a hierarchical clustering |
|
method) we can test programmatically whether it has the monotonicity |
|
property or not, using `scipy.cluster.hierarchy.is_monotonic`: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = ward(pdist(X)) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.29099445, 3. ], |
|
[ 5. , 13. , 1.29099445, 3. ], |
|
[ 8. , 14. , 1.29099445, 3. ], |
|
[11. , 15. , 1.29099445, 3. ], |
|
[16. , 17. , 5.77350269, 6. ], |
|
[18. , 19. , 5.77350269, 6. ], |
|
[20. , 21. , 8.16496581, 12. ]]) |
|
>>> is_monotonic(Z) |
|
True |
|
|
|
>>> Z = median(pdist(X)) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 2. , 12. , 1.11803399, 3. ], |
|
[ 5. , 13. , 1.11803399, 3. ], |
|
[ 8. , 15. , 1.11803399, 3. ], |
|
[11. , 14. , 1.11803399, 3. ], |
|
[18. , 19. , 3. , 6. ], |
|
[16. , 17. , 3.5 , 6. ], |
|
[20. , 21. , 3.25 , 12. ]]) |
|
>>> is_monotonic(Z) |
|
False |
|
|
|
Note that this method is equivalent to just verifying that the distances |
|
in the third column of the linkage matrix appear in a monotonically |
|
increasing order. |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='c', xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
|
|
|
|
return xp.all(Z[1:, 2] >= Z[:-1, 2]) |
|
|
|
|
|
def is_valid_im(R, warning=False, throw=False, name=None): |
|
"""Return True if the inconsistency matrix passed is valid. |
|
|
|
It must be a :math:`n` by 4 array of doubles. The standard |
|
deviations ``R[:,1]`` must be nonnegative. The link counts |
|
``R[:,2]`` must be positive and no greater than :math:`n-1`. |
|
|
|
Parameters |
|
---------- |
|
R : ndarray |
|
The inconsistency matrix to check for validity. |
|
warning : bool, optional |
|
When True, issues a Python warning if the linkage |
|
matrix passed is invalid. |
|
throw : bool, optional |
|
When True, throws a Python exception if the linkage |
|
matrix passed is invalid. |
|
name : str, optional |
|
This string refers to the variable name of the invalid |
|
linkage matrix. |
|
|
|
Returns |
|
------- |
|
b : bool |
|
True if the inconsistency matrix is valid. |
|
|
|
See Also |
|
-------- |
|
linkage : for a description of what a linkage matrix is. |
|
inconsistent : for the creation of a inconsistency matrix. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import ward, inconsistent, is_valid_im |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
Given a data set ``X``, we can apply a clustering method to obtain a |
|
linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can |
|
be also used to obtain the inconsistency matrix ``R`` associated to |
|
this clustering process: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = ward(pdist(X)) |
|
>>> R = inconsistent(Z) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.29099445, 3. ], |
|
[ 5. , 13. , 1.29099445, 3. ], |
|
[ 8. , 14. , 1.29099445, 3. ], |
|
[11. , 15. , 1.29099445, 3. ], |
|
[16. , 17. , 5.77350269, 6. ], |
|
[18. , 19. , 5.77350269, 6. ], |
|
[20. , 21. , 8.16496581, 12. ]]) |
|
>>> R |
|
array([[1. , 0. , 1. , 0. ], |
|
[1. , 0. , 1. , 0. ], |
|
[1. , 0. , 1. , 0. ], |
|
[1. , 0. , 1. , 0. ], |
|
[1.14549722, 0.20576415, 2. , 0.70710678], |
|
[1.14549722, 0.20576415, 2. , 0.70710678], |
|
[1.14549722, 0.20576415, 2. , 0.70710678], |
|
[1.14549722, 0.20576415, 2. , 0.70710678], |
|
[2.78516386, 2.58797734, 3. , 1.15470054], |
|
[2.78516386, 2.58797734, 3. , 1.15470054], |
|
[6.57065706, 1.38071187, 3. , 1.15470054]]) |
|
|
|
Now we can use `scipy.cluster.hierarchy.is_valid_im` to verify that |
|
``R`` is correct: |
|
|
|
>>> is_valid_im(R) |
|
True |
|
|
|
However, if ``R`` is wrongly constructed (e.g., one of the standard |
|
deviations is set to a negative value), then the check will fail: |
|
|
|
>>> R[-1,1] = R[-1,1] * -1 |
|
>>> is_valid_im(R) |
|
False |
|
|
|
""" |
|
xp = array_namespace(R) |
|
R = _asarray(R, order='c', xp=xp) |
|
valid = True |
|
name_str = f"{name!r} " if name else '' |
|
try: |
|
if R.dtype != xp.float64: |
|
raise TypeError(f'Inconsistency matrix {name_str}must contain doubles ' |
|
'(double).') |
|
if len(R.shape) != 2: |
|
raise ValueError(f'Inconsistency matrix {name_str}must have shape=2 (i.e. ' |
|
'be two-dimensional).') |
|
if R.shape[1] != 4: |
|
raise ValueError(f'Inconsistency matrix {name_str}' |
|
'must have 4 columns.') |
|
if R.shape[0] < 1: |
|
raise ValueError(f'Inconsistency matrix {name_str}' |
|
'must have at least one row.') |
|
if xp.any(R[:, 0] < 0): |
|
raise ValueError(f'Inconsistency matrix {name_str}' |
|
'contains negative link height means.') |
|
if xp.any(R[:, 1] < 0): |
|
raise ValueError(f'Inconsistency matrix {name_str}' |
|
'contains negative link height standard deviations.') |
|
if xp.any(R[:, 2] < 0): |
|
raise ValueError(f'Inconsistency matrix {name_str}' |
|
'contains negative link counts.') |
|
except Exception as e: |
|
if throw: |
|
raise |
|
if warning: |
|
_warning(str(e)) |
|
valid = False |
|
|
|
return valid |
|
|
|
|
|
def is_valid_linkage(Z, warning=False, throw=False, name=None): |
|
""" |
|
Check the validity of a linkage matrix. |
|
|
|
A linkage matrix is valid if it is a 2-D array (type double) |
|
with :math:`n` rows and 4 columns. The first two columns must contain |
|
indices between 0 and :math:`2n-1`. For a given row ``i``, the following |
|
two expressions have to hold: |
|
|
|
.. math:: |
|
|
|
0 \\leq \\mathtt{Z[i,0]} \\leq i+n-1 |
|
0 \\leq Z[i,1] \\leq i+n-1 |
|
|
|
I.e., a cluster cannot join another cluster unless the cluster being joined |
|
has been generated. |
|
|
|
The fourth column of `Z` represents the number of original observations |
|
in a cluster, so a valid ``Z[i, 3]`` value may not exceed the number of |
|
original observations. |
|
|
|
Parameters |
|
---------- |
|
Z : array_like |
|
Linkage matrix. |
|
warning : bool, optional |
|
When True, issues a Python warning if the linkage |
|
matrix passed is invalid. |
|
throw : bool, optional |
|
When True, throws a Python exception if the linkage |
|
matrix passed is invalid. |
|
name : str, optional |
|
This string refers to the variable name of the invalid |
|
linkage matrix. |
|
|
|
Returns |
|
------- |
|
b : bool |
|
True if the inconsistency matrix is valid. |
|
|
|
See Also |
|
-------- |
|
linkage: for a description of what a linkage matrix is. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import ward, is_valid_linkage |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
All linkage matrices generated by the clustering methods in this module |
|
will be valid (i.e., they will have the appropriate dimensions and the two |
|
required expressions will hold for all the rows). |
|
|
|
We can check this using `scipy.cluster.hierarchy.is_valid_linkage`: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = ward(pdist(X)) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.29099445, 3. ], |
|
[ 5. , 13. , 1.29099445, 3. ], |
|
[ 8. , 14. , 1.29099445, 3. ], |
|
[11. , 15. , 1.29099445, 3. ], |
|
[16. , 17. , 5.77350269, 6. ], |
|
[18. , 19. , 5.77350269, 6. ], |
|
[20. , 21. , 8.16496581, 12. ]]) |
|
>>> is_valid_linkage(Z) |
|
True |
|
|
|
However, if we create a linkage matrix in a wrong way - or if we modify |
|
a valid one in a way that any of the required expressions don't hold |
|
anymore, then the check will fail: |
|
|
|
>>> Z[3][1] = 20 # the cluster number 20 is not defined at this point |
|
>>> is_valid_linkage(Z) |
|
False |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='c', xp=xp) |
|
valid = True |
|
name_str = f"{name!r} " if name else '' |
|
try: |
|
if Z.dtype != xp.float64: |
|
raise TypeError(f'Linkage matrix {name_str}must contain doubles.') |
|
if len(Z.shape) != 2: |
|
raise ValueError(f'Linkage matrix {name_str}must have shape=2 (i.e. be' |
|
' two-dimensional).') |
|
if Z.shape[1] != 4: |
|
raise ValueError(f'Linkage matrix {name_str}must have 4 columns.') |
|
if Z.shape[0] == 0: |
|
raise ValueError('Linkage must be computed on at least two ' |
|
'observations.') |
|
n = Z.shape[0] |
|
if n > 1: |
|
if (xp.any(Z[:, 0] < 0) or xp.any(Z[:, 1] < 0)): |
|
raise ValueError(f'Linkage {name_str}contains negative indices.') |
|
if xp.any(Z[:, 2] < 0): |
|
raise ValueError(f'Linkage {name_str}contains negative distances.') |
|
if xp.any(Z[:, 3] < 0): |
|
raise ValueError(f'Linkage {name_str}contains negative counts.') |
|
if xp.any(Z[:, 3] > (Z.shape[0] + 1)): |
|
raise ValueError('Linkage matrix contains excessive observations' |
|
'in a cluster') |
|
if _check_hierarchy_uses_cluster_before_formed(Z): |
|
raise ValueError(f'Linkage {name_str}uses non-singleton cluster before' |
|
' it is formed.') |
|
if _check_hierarchy_uses_cluster_more_than_once(Z): |
|
raise ValueError(f'Linkage {name_str}uses the same cluster more than once.') |
|
except Exception as e: |
|
if throw: |
|
raise |
|
if warning: |
|
_warning(str(e)) |
|
valid = False |
|
|
|
return valid |
|
|
|
|
|
def _check_hierarchy_uses_cluster_before_formed(Z): |
|
n = Z.shape[0] + 1 |
|
for i in range(0, n - 1): |
|
if Z[i, 0] >= n + i or Z[i, 1] >= n + i: |
|
return True |
|
return False |
|
|
|
|
|
def _check_hierarchy_uses_cluster_more_than_once(Z): |
|
n = Z.shape[0] + 1 |
|
chosen = set() |
|
for i in range(0, n - 1): |
|
used_more_than_once = ( |
|
(float(Z[i, 0]) in chosen) |
|
or (float(Z[i, 1]) in chosen) |
|
or Z[i, 0] == Z[i, 1] |
|
) |
|
if used_more_than_once: |
|
return True |
|
chosen.add(float(Z[i, 0])) |
|
chosen.add(float(Z[i, 1])) |
|
return False |
|
|
|
|
|
def _check_hierarchy_not_all_clusters_used(Z): |
|
n = Z.shape[0] + 1 |
|
chosen = set() |
|
for i in range(0, n - 1): |
|
chosen.add(int(Z[i, 0])) |
|
chosen.add(int(Z[i, 1])) |
|
must_chosen = set(range(0, 2 * n - 2)) |
|
return len(must_chosen.difference(chosen)) > 0 |
|
|
|
|
|
def num_obs_linkage(Z): |
|
""" |
|
Return the number of original observations of the linkage matrix passed. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The linkage matrix on which to perform the operation. |
|
|
|
Returns |
|
------- |
|
n : int |
|
The number of original observations in the linkage. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import ward, num_obs_linkage |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = ward(pdist(X)) |
|
|
|
``Z`` is a linkage matrix obtained after using the Ward clustering method |
|
with ``X``, a dataset with 12 data points. |
|
|
|
>>> num_obs_linkage(Z) |
|
12 |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='c', xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
return (Z.shape[0] + 1) |
|
|
|
|
|
def correspond(Z, Y): |
|
""" |
|
Check for correspondence between linkage and condensed distance matrices. |
|
|
|
They must have the same number of original observations for |
|
the check to succeed. |
|
|
|
This function is useful as a sanity check in algorithms that make |
|
extensive use of linkage and distance matrices that must |
|
correspond to the same set of original observations. |
|
|
|
Parameters |
|
---------- |
|
Z : array_like |
|
The linkage matrix to check for correspondence. |
|
Y : array_like |
|
The condensed distance matrix to check for correspondence. |
|
|
|
Returns |
|
------- |
|
b : bool |
|
A boolean indicating whether the linkage matrix and distance |
|
matrix could possibly correspond to one another. |
|
|
|
See Also |
|
-------- |
|
linkage : for a description of what a linkage matrix is. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import ward, correspond |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
This method can be used to check if a given linkage matrix ``Z`` has been |
|
obtained from the application of a cluster method over a dataset ``X``: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
>>> X_condensed = pdist(X) |
|
>>> Z = ward(X_condensed) |
|
|
|
Here, we can compare ``Z`` and ``X`` (in condensed form): |
|
|
|
>>> correspond(Z, X_condensed) |
|
True |
|
|
|
""" |
|
is_valid_linkage(Z, throw=True) |
|
distance.is_valid_y(Y, throw=True) |
|
xp = array_namespace(Z, Y) |
|
Z = _asarray(Z, order='c', xp=xp) |
|
Y = _asarray(Y, order='c', xp=xp) |
|
return distance.num_obs_y(Y) == num_obs_linkage(Z) |
|
|
|
|
|
def fcluster(Z, t, criterion='inconsistent', depth=2, R=None, monocrit=None): |
|
""" |
|
Form flat clusters from the hierarchical clustering defined by |
|
the given linkage matrix. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The hierarchical clustering encoded with the matrix returned |
|
by the `linkage` function. |
|
t : scalar |
|
For criteria 'inconsistent', 'distance' or 'monocrit', |
|
this is the threshold to apply when forming flat clusters. |
|
For 'maxclust' or 'maxclust_monocrit' criteria, |
|
this would be max number of clusters requested. |
|
criterion : str, optional |
|
The criterion to use in forming flat clusters. This can |
|
be any of the following values: |
|
|
|
``inconsistent`` : |
|
If a cluster node and all its |
|
descendants have an inconsistent value less than or equal |
|
to `t`, then all its leaf descendants belong to the |
|
same flat cluster. When no non-singleton cluster meets |
|
this criterion, every node is assigned to its own |
|
cluster. (Default) |
|
|
|
``distance`` : |
|
Forms flat clusters so that the original |
|
observations in each flat cluster have no greater a |
|
cophenetic distance than `t`. |
|
|
|
``maxclust`` : |
|
Finds a minimum threshold ``r`` so that |
|
the cophenetic distance between any two original |
|
observations in the same flat cluster is no more than |
|
``r`` and no more than `t` flat clusters are formed. |
|
|
|
``monocrit`` : |
|
Forms a flat cluster from a cluster node c |
|
with index i when ``monocrit[j] <= t``. |
|
|
|
For example, to threshold on the maximum mean distance |
|
as computed in the inconsistency matrix R with a |
|
threshold of 0.8 do:: |
|
|
|
MR = maxRstat(Z, R, 3) |
|
fcluster(Z, t=0.8, criterion='monocrit', monocrit=MR) |
|
|
|
``maxclust_monocrit`` : |
|
Forms a flat cluster from a |
|
non-singleton cluster node ``c`` when ``monocrit[i] <= |
|
r`` for all cluster indices ``i`` below and including |
|
``c``. ``r`` is minimized such that no more than ``t`` |
|
flat clusters are formed. monocrit must be |
|
monotonic. For example, to minimize the threshold t on |
|
maximum inconsistency values so that no more than 3 flat |
|
clusters are formed, do:: |
|
|
|
MI = maxinconsts(Z, R) |
|
fcluster(Z, t=3, criterion='maxclust_monocrit', monocrit=MI) |
|
depth : int, optional |
|
The maximum depth to perform the inconsistency calculation. |
|
It has no meaning for the other criteria. Default is 2. |
|
R : ndarray, optional |
|
The inconsistency matrix to use for the ``'inconsistent'`` |
|
criterion. This matrix is computed if not provided. |
|
monocrit : ndarray, optional |
|
An array of length n-1. `monocrit[i]` is the |
|
statistics upon which non-singleton i is thresholded. The |
|
monocrit vector must be monotonic, i.e., given a node c with |
|
index i, for all node indices j corresponding to nodes |
|
below c, ``monocrit[i] >= monocrit[j]``. |
|
|
|
Returns |
|
------- |
|
fcluster : ndarray |
|
An array of length ``n``. ``T[i]`` is the flat cluster number to |
|
which original observation ``i`` belongs. |
|
|
|
See Also |
|
-------- |
|
linkage : for information about hierarchical clustering methods work. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import ward, fcluster |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
All cluster linkage methods - e.g., `scipy.cluster.hierarchy.ward` |
|
generate a linkage matrix ``Z`` as their output: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = ward(pdist(X)) |
|
|
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.29099445, 3. ], |
|
[ 5. , 13. , 1.29099445, 3. ], |
|
[ 8. , 14. , 1.29099445, 3. ], |
|
[11. , 15. , 1.29099445, 3. ], |
|
[16. , 17. , 5.77350269, 6. ], |
|
[18. , 19. , 5.77350269, 6. ], |
|
[20. , 21. , 8.16496581, 12. ]]) |
|
|
|
This matrix represents a dendrogram, where the first and second elements |
|
are the two clusters merged at each step, the third element is the |
|
distance between these clusters, and the fourth element is the size of |
|
the new cluster - the number of original data points included. |
|
|
|
`scipy.cluster.hierarchy.fcluster` can be used to flatten the |
|
dendrogram, obtaining as a result an assignation of the original data |
|
points to single clusters. |
|
|
|
This assignation mostly depends on a distance threshold ``t`` - the maximum |
|
inter-cluster distance allowed: |
|
|
|
>>> fcluster(Z, t=0.9, criterion='distance') |
|
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32) |
|
|
|
>>> fcluster(Z, t=1.1, criterion='distance') |
|
array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32) |
|
|
|
>>> fcluster(Z, t=3, criterion='distance') |
|
array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) |
|
|
|
>>> fcluster(Z, t=9, criterion='distance') |
|
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) |
|
|
|
In the first case, the threshold ``t`` is too small to allow any two |
|
samples in the data to form a cluster, so 12 different clusters are |
|
returned. |
|
|
|
In the second case, the threshold is large enough to allow the first |
|
4 points to be merged with their nearest neighbors. So, here, only 8 |
|
clusters are returned. |
|
|
|
The third case, with a much higher threshold, allows for up to 8 data |
|
points to be connected - so 4 clusters are returned here. |
|
|
|
Lastly, the threshold of the fourth case is large enough to allow for |
|
all data points to be merged together - so a single cluster is returned. |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
|
|
n = Z.shape[0] + 1 |
|
T = np.zeros((n,), dtype='i') |
|
|
|
if monocrit is not None: |
|
monocrit = np.asarray(monocrit, order='C', dtype=np.float64) |
|
|
|
Z = np.asarray(Z) |
|
monocrit = np.asarray(monocrit) |
|
if criterion == 'inconsistent': |
|
if R is None: |
|
R = inconsistent(Z, depth) |
|
else: |
|
R = _asarray(R, order='C', dtype=xp.float64, xp=xp) |
|
is_valid_im(R, throw=True, name='R') |
|
|
|
|
|
R = np.asarray(R) |
|
_hierarchy.cluster_in(Z, R, T, float(t), int(n)) |
|
elif criterion == 'distance': |
|
_hierarchy.cluster_dist(Z, T, float(t), int(n)) |
|
elif criterion == 'maxclust': |
|
_hierarchy.cluster_maxclust_dist(Z, T, int(n), t) |
|
elif criterion == 'monocrit': |
|
_hierarchy.cluster_monocrit(Z, monocrit, T, float(t), int(n)) |
|
elif criterion == 'maxclust_monocrit': |
|
_hierarchy.cluster_maxclust_monocrit(Z, monocrit, T, int(n), int(t)) |
|
else: |
|
raise ValueError(f'Invalid cluster formation criterion: {str(criterion)}') |
|
return xp.asarray(T) |
|
|
|
|
|
def fclusterdata(X, t, criterion='inconsistent', |
|
metric='euclidean', depth=2, method='single', R=None): |
|
""" |
|
Cluster observation data using a given metric. |
|
|
|
Clusters the original observations in the n-by-m data |
|
matrix X (n observations in m dimensions), using the euclidean |
|
distance metric to calculate distances between original observations, |
|
performs hierarchical clustering using the single linkage algorithm, |
|
and forms flat clusters using the inconsistency method with `t` as the |
|
cut-off threshold. |
|
|
|
A 1-D array ``T`` of length ``n`` is returned. ``T[i]`` is |
|
the index of the flat cluster to which the original observation ``i`` |
|
belongs. |
|
|
|
Parameters |
|
---------- |
|
X : (N, M) ndarray |
|
N by M data matrix with N observations in M dimensions. |
|
t : scalar |
|
For criteria 'inconsistent', 'distance' or 'monocrit', |
|
this is the threshold to apply when forming flat clusters. |
|
For 'maxclust' or 'maxclust_monocrit' criteria, |
|
this would be max number of clusters requested. |
|
criterion : str, optional |
|
Specifies the criterion for forming flat clusters. Valid |
|
values are 'inconsistent' (default), 'distance', or 'maxclust' |
|
cluster formation algorithms. See `fcluster` for descriptions. |
|
metric : str or function, optional |
|
The distance metric for calculating pairwise distances. See |
|
``distance.pdist`` for descriptions and linkage to verify |
|
compatibility with the linkage method. |
|
depth : int, optional |
|
The maximum depth for the inconsistency calculation. See |
|
`inconsistent` for more information. |
|
method : str, optional |
|
The linkage method to use (single, complete, average, |
|
weighted, median centroid, ward). See `linkage` for more |
|
information. Default is "single". |
|
R : ndarray, optional |
|
The inconsistency matrix. It will be computed if necessary |
|
if it is not passed. |
|
|
|
Returns |
|
------- |
|
fclusterdata : ndarray |
|
A vector of length n. T[i] is the flat cluster number to |
|
which original observation i belongs. |
|
|
|
See Also |
|
-------- |
|
scipy.spatial.distance.pdist : pairwise distance metrics |
|
|
|
Notes |
|
----- |
|
This function is similar to the MATLAB function ``clusterdata``. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import fclusterdata |
|
|
|
This is a convenience method that abstracts all the steps to perform in a |
|
typical SciPy's hierarchical clustering workflow. |
|
|
|
* Transform the input data into a condensed matrix with |
|
`scipy.spatial.distance.pdist`. |
|
|
|
* Apply a clustering method. |
|
|
|
* Obtain flat clusters at a user defined distance threshold ``t`` using |
|
`scipy.cluster.hierarchy.fcluster`. |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> fclusterdata(X, t=1) |
|
array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32) |
|
|
|
The output here (for the dataset ``X``, distance threshold ``t``, and the |
|
default settings) is four clusters with three data points each. |
|
|
|
""" |
|
xp = array_namespace(X) |
|
X = _asarray(X, order='C', dtype=xp.float64, xp=xp) |
|
|
|
if X.ndim != 2: |
|
raise TypeError('The observation matrix X must be an n by m ' |
|
'array.') |
|
|
|
Y = distance.pdist(X, metric=metric) |
|
Y = xp.asarray(Y) |
|
Z = linkage(Y, method=method) |
|
if R is None: |
|
R = inconsistent(Z, d=depth) |
|
else: |
|
R = _asarray(R, order='c', xp=xp) |
|
T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t) |
|
return T |
|
|
|
|
|
def leaves_list(Z): |
|
""" |
|
Return a list of leaf node ids. |
|
|
|
The return corresponds to the observation vector index as it appears |
|
in the tree from left to right. Z is a linkage matrix. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The hierarchical clustering encoded as a matrix. `Z` is |
|
a linkage matrix. See `linkage` for more information. |
|
|
|
Returns |
|
------- |
|
leaves_list : ndarray |
|
The list of leaf node ids. |
|
|
|
See Also |
|
-------- |
|
dendrogram : for information about dendrogram structure. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import ward, dendrogram, leaves_list |
|
>>> from scipy.spatial.distance import pdist |
|
>>> from matplotlib import pyplot as plt |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = ward(pdist(X)) |
|
|
|
The linkage matrix ``Z`` represents a dendrogram, that is, a tree that |
|
encodes the structure of the clustering performed. |
|
`scipy.cluster.hierarchy.leaves_list` shows the mapping between |
|
indices in the ``X`` dataset and leaves in the dendrogram: |
|
|
|
>>> leaves_list(Z) |
|
array([ 2, 0, 1, 5, 3, 4, 8, 6, 7, 11, 9, 10], dtype=int32) |
|
|
|
>>> fig = plt.figure(figsize=(25, 10)) |
|
>>> dn = dendrogram(Z) |
|
>>> plt.show() |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='C', xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
n = Z.shape[0] + 1 |
|
ML = np.zeros((n,), dtype='i') |
|
Z = np.asarray(Z) |
|
_hierarchy.prelist(Z, ML, n) |
|
return xp.asarray(ML) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_dtextsizes = {20: 12, 30: 10, 50: 8, 85: 6, np.inf: 5} |
|
_drotation = {20: 0, 40: 45, np.inf: 90} |
|
_dtextsortedkeys = list(_dtextsizes.keys()) |
|
_dtextsortedkeys.sort() |
|
_drotationsortedkeys = list(_drotation.keys()) |
|
_drotationsortedkeys.sort() |
|
|
|
|
|
def _remove_dups(L): |
|
""" |
|
Remove duplicates AND preserve the original order of the elements. |
|
|
|
The set class is not guaranteed to do this. |
|
""" |
|
seen_before = set() |
|
L2 = [] |
|
for i in L: |
|
if i not in seen_before: |
|
seen_before.add(i) |
|
L2.append(i) |
|
return L2 |
|
|
|
|
|
def _get_tick_text_size(p): |
|
for k in _dtextsortedkeys: |
|
if p <= k: |
|
return _dtextsizes[k] |
|
|
|
|
|
def _get_tick_rotation(p): |
|
for k in _drotationsortedkeys: |
|
if p <= k: |
|
return _drotation[k] |
|
|
|
|
|
def _plot_dendrogram(icoords, dcoords, ivl, p, n, mh, orientation, |
|
no_labels, color_list, leaf_font_size=None, |
|
leaf_rotation=None, contraction_marks=None, |
|
ax=None, above_threshold_color='C0'): |
|
|
|
|
|
try: |
|
|
|
if ax is None: |
|
import matplotlib.pylab |
|
import matplotlib.patches |
|
import matplotlib.collections |
|
except ImportError as e: |
|
raise ImportError("You must install the matplotlib library to plot " |
|
"the dendrogram. Use no_plot=True to calculate the " |
|
"dendrogram without plotting.") from e |
|
|
|
if ax is None: |
|
ax = matplotlib.pylab.gca() |
|
|
|
trigger_redraw = True |
|
else: |
|
trigger_redraw = False |
|
|
|
|
|
ivw = len(ivl) * 10 |
|
|
|
dvw = mh + mh * 0.05 |
|
|
|
iv_ticks = np.arange(5, len(ivl) * 10 + 5, 10) |
|
if orientation in ('top', 'bottom'): |
|
if orientation == 'top': |
|
ax.set_ylim([0, dvw]) |
|
ax.set_xlim([0, ivw]) |
|
else: |
|
ax.set_ylim([dvw, 0]) |
|
ax.set_xlim([0, ivw]) |
|
|
|
xlines = icoords |
|
ylines = dcoords |
|
if no_labels: |
|
ax.set_xticks([]) |
|
ax.set_xticklabels([]) |
|
else: |
|
ax.set_xticks(iv_ticks) |
|
|
|
if orientation == 'top': |
|
ax.xaxis.set_ticks_position('bottom') |
|
else: |
|
ax.xaxis.set_ticks_position('top') |
|
|
|
|
|
for line in ax.get_xticklines(): |
|
line.set_visible(False) |
|
|
|
leaf_rot = (float(_get_tick_rotation(len(ivl))) |
|
if (leaf_rotation is None) else leaf_rotation) |
|
leaf_font = (float(_get_tick_text_size(len(ivl))) |
|
if (leaf_font_size is None) else leaf_font_size) |
|
ax.set_xticklabels(ivl, rotation=leaf_rot, size=leaf_font) |
|
|
|
elif orientation in ('left', 'right'): |
|
if orientation == 'left': |
|
ax.set_xlim([dvw, 0]) |
|
ax.set_ylim([0, ivw]) |
|
else: |
|
ax.set_xlim([0, dvw]) |
|
ax.set_ylim([0, ivw]) |
|
|
|
xlines = dcoords |
|
ylines = icoords |
|
if no_labels: |
|
ax.set_yticks([]) |
|
ax.set_yticklabels([]) |
|
else: |
|
ax.set_yticks(iv_ticks) |
|
|
|
if orientation == 'left': |
|
ax.yaxis.set_ticks_position('right') |
|
else: |
|
ax.yaxis.set_ticks_position('left') |
|
|
|
|
|
for line in ax.get_yticklines(): |
|
line.set_visible(False) |
|
|
|
leaf_font = (float(_get_tick_text_size(len(ivl))) |
|
if (leaf_font_size is None) else leaf_font_size) |
|
|
|
if leaf_rotation is not None: |
|
ax.set_yticklabels(ivl, rotation=leaf_rotation, size=leaf_font) |
|
else: |
|
ax.set_yticklabels(ivl, size=leaf_font) |
|
|
|
|
|
|
|
colors_used = _remove_dups(color_list) |
|
color_to_lines = {} |
|
for color in colors_used: |
|
color_to_lines[color] = [] |
|
for (xline, yline, color) in zip(xlines, ylines, color_list): |
|
color_to_lines[color].append(list(zip(xline, yline))) |
|
|
|
colors_to_collections = {} |
|
|
|
for color in colors_used: |
|
coll = matplotlib.collections.LineCollection(color_to_lines[color], |
|
colors=(color,)) |
|
colors_to_collections[color] = coll |
|
|
|
|
|
for color in colors_used: |
|
if color != above_threshold_color: |
|
ax.add_collection(colors_to_collections[color]) |
|
|
|
if above_threshold_color in colors_to_collections: |
|
ax.add_collection(colors_to_collections[above_threshold_color]) |
|
|
|
if contraction_marks is not None: |
|
Ellipse = matplotlib.patches.Ellipse |
|
for (x, y) in contraction_marks: |
|
if orientation in ('left', 'right'): |
|
e = Ellipse((y, x), width=dvw / 100, height=1.0) |
|
else: |
|
e = Ellipse((x, y), width=1.0, height=dvw / 100) |
|
ax.add_artist(e) |
|
e.set_clip_box(ax.bbox) |
|
e.set_alpha(0.5) |
|
e.set_facecolor('k') |
|
|
|
if trigger_redraw: |
|
matplotlib.pylab.draw_if_interactive() |
|
|
|
|
|
|
|
_link_line_colors_default = ('C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9') |
|
_link_line_colors = list(_link_line_colors_default) |
|
|
|
|
|
def set_link_color_palette(palette): |
|
""" |
|
Set list of matplotlib color codes for use by dendrogram. |
|
|
|
Note that this palette is global (i.e., setting it once changes the colors |
|
for all subsequent calls to `dendrogram`) and that it affects only the |
|
the colors below ``color_threshold``. |
|
|
|
Note that `dendrogram` also accepts a custom coloring function through its |
|
``link_color_func`` keyword, which is more flexible and non-global. |
|
|
|
Parameters |
|
---------- |
|
palette : list of str or None |
|
A list of matplotlib color codes. The order of the color codes is the |
|
order in which the colors are cycled through when color thresholding in |
|
the dendrogram. |
|
|
|
If ``None``, resets the palette to its default (which are matplotlib |
|
default colors C1 to C9). |
|
|
|
Returns |
|
------- |
|
None |
|
|
|
See Also |
|
-------- |
|
dendrogram |
|
|
|
Notes |
|
----- |
|
Ability to reset the palette with ``None`` added in SciPy 0.17.0. |
|
|
|
Thread safety: using this function in a multi-threaded fashion may |
|
result in `dendrogram` producing plots with unexpected colors. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.cluster import hierarchy |
|
>>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268., |
|
... 400., 754., 564., 138., 219., 869., 669.]) |
|
>>> Z = hierarchy.linkage(ytdist, 'single') |
|
>>> dn = hierarchy.dendrogram(Z, no_plot=True) |
|
>>> dn['color_list'] |
|
['C1', 'C0', 'C0', 'C0', 'C0'] |
|
>>> hierarchy.set_link_color_palette(['c', 'm', 'y', 'k']) |
|
>>> dn = hierarchy.dendrogram(Z, no_plot=True, above_threshold_color='b') |
|
>>> dn['color_list'] |
|
['c', 'b', 'b', 'b', 'b'] |
|
>>> dn = hierarchy.dendrogram(Z, no_plot=True, color_threshold=267, |
|
... above_threshold_color='k') |
|
>>> dn['color_list'] |
|
['c', 'm', 'm', 'k', 'k'] |
|
|
|
Now, reset the color palette to its default: |
|
|
|
>>> hierarchy.set_link_color_palette(None) |
|
|
|
""" |
|
if palette is None: |
|
|
|
palette = _link_line_colors_default |
|
elif not isinstance(palette, (list, tuple)): |
|
raise TypeError("palette must be a list or tuple") |
|
_ptypes = [isinstance(p, str) for p in palette] |
|
|
|
if False in _ptypes: |
|
raise TypeError("all palette list elements must be color strings") |
|
|
|
global _link_line_colors |
|
_link_line_colors = palette |
|
|
|
|
|
def dendrogram(Z, p=30, truncate_mode=None, color_threshold=None, |
|
get_leaves=True, orientation='top', labels=None, |
|
count_sort=False, distance_sort=False, show_leaf_counts=True, |
|
no_plot=False, no_labels=False, leaf_font_size=None, |
|
leaf_rotation=None, leaf_label_func=None, |
|
show_contracted=False, link_color_func=None, ax=None, |
|
above_threshold_color='C0'): |
|
""" |
|
Plot the hierarchical clustering as a dendrogram. |
|
|
|
The dendrogram illustrates how each cluster is |
|
composed by drawing a U-shaped link between a non-singleton |
|
cluster and its children. The top of the U-link indicates a |
|
cluster merge. The two legs of the U-link indicate which clusters |
|
were merged. The length of the two legs of the U-link represents |
|
the distance between the child clusters. It is also the |
|
cophenetic distance between original observations in the two |
|
children clusters. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The linkage matrix encoding the hierarchical clustering to |
|
render as a dendrogram. See the ``linkage`` function for more |
|
information on the format of ``Z``. |
|
p : int, optional |
|
The ``p`` parameter for ``truncate_mode``. |
|
truncate_mode : str, optional |
|
The dendrogram can be hard to read when the original |
|
observation matrix from which the linkage is derived is |
|
large. Truncation is used to condense the dendrogram. There |
|
are several modes: |
|
|
|
``None`` |
|
No truncation is performed (default). |
|
Note: ``'none'`` is an alias for ``None`` that's kept for |
|
backward compatibility. |
|
|
|
``'lastp'`` |
|
The last ``p`` non-singleton clusters formed in the linkage are the |
|
only non-leaf nodes in the linkage; they correspond to rows |
|
``Z[n-p-2:end]`` in ``Z``. All other non-singleton clusters are |
|
contracted into leaf nodes. |
|
|
|
``'level'`` |
|
No more than ``p`` levels of the dendrogram tree are displayed. |
|
A "level" includes all nodes with ``p`` merges from the final merge. |
|
|
|
Note: ``'mtica'`` is an alias for ``'level'`` that's kept for |
|
backward compatibility. |
|
|
|
color_threshold : double, optional |
|
For brevity, let :math:`t` be the ``color_threshold``. |
|
Colors all the descendent links below a cluster node |
|
:math:`k` the same color if :math:`k` is the first node below |
|
the cut threshold :math:`t`. All links connecting nodes with |
|
distances greater than or equal to the threshold are colored |
|
with de default matplotlib color ``'C0'``. If :math:`t` is less |
|
than or equal to zero, all nodes are colored ``'C0'``. |
|
If ``color_threshold`` is None or 'default', |
|
corresponding with MATLAB(TM) behavior, the threshold is set to |
|
``0.7*max(Z[:,2])``. |
|
|
|
get_leaves : bool, optional |
|
Includes a list ``R['leaves']=H`` in the result |
|
dictionary. For each :math:`i`, ``H[i] == j``, cluster node |
|
``j`` appears in position ``i`` in the left-to-right traversal |
|
of the leaves, where :math:`j < 2n-1` and :math:`i < n`. |
|
orientation : str, optional |
|
The direction to plot the dendrogram, which can be any |
|
of the following strings: |
|
|
|
``'top'`` |
|
Plots the root at the top, and plot descendent links going downwards. |
|
(default). |
|
|
|
``'bottom'`` |
|
Plots the root at the bottom, and plot descendent links going |
|
upwards. |
|
|
|
``'left'`` |
|
Plots the root at the left, and plot descendent links going right. |
|
|
|
``'right'`` |
|
Plots the root at the right, and plot descendent links going left. |
|
|
|
labels : ndarray, optional |
|
By default, ``labels`` is None so the index of the original observation |
|
is used to label the leaf nodes. Otherwise, this is an :math:`n`-sized |
|
sequence, with ``n == Z.shape[0] + 1``. The ``labels[i]`` value is the |
|
text to put under the :math:`i` th leaf node only if it corresponds to |
|
an original observation and not a non-singleton cluster. |
|
count_sort : str or bool, optional |
|
For each node n, the order (visually, from left-to-right) n's |
|
two descendent links are plotted is determined by this |
|
parameter, which can be any of the following values: |
|
|
|
``False`` |
|
Nothing is done. |
|
|
|
``'ascending'`` or ``True`` |
|
The child with the minimum number of original objects in its cluster |
|
is plotted first. |
|
|
|
``'descending'`` |
|
The child with the maximum number of original objects in its cluster |
|
is plotted first. |
|
|
|
Note, ``distance_sort`` and ``count_sort`` cannot both be True. |
|
distance_sort : str or bool, optional |
|
For each node n, the order (visually, from left-to-right) n's |
|
two descendent links are plotted is determined by this |
|
parameter, which can be any of the following values: |
|
|
|
``False`` |
|
Nothing is done. |
|
|
|
``'ascending'`` or ``True`` |
|
The child with the minimum distance between its direct descendents is |
|
plotted first. |
|
|
|
``'descending'`` |
|
The child with the maximum distance between its direct descendents is |
|
plotted first. |
|
|
|
Note ``distance_sort`` and ``count_sort`` cannot both be True. |
|
show_leaf_counts : bool, optional |
|
When True, leaf nodes representing :math:`k>1` original |
|
observation are labeled with the number of observations they |
|
contain in parentheses. |
|
no_plot : bool, optional |
|
When True, the final rendering is not performed. This is |
|
useful if only the data structures computed for the rendering |
|
are needed or if matplotlib is not available. |
|
no_labels : bool, optional |
|
When True, no labels appear next to the leaf nodes in the |
|
rendering of the dendrogram. |
|
leaf_rotation : double, optional |
|
Specifies the angle (in degrees) to rotate the leaf |
|
labels. When unspecified, the rotation is based on the number of |
|
nodes in the dendrogram (default is 0). |
|
leaf_font_size : int, optional |
|
Specifies the font size (in points) of the leaf labels. When |
|
unspecified, the size based on the number of nodes in the |
|
dendrogram. |
|
leaf_label_func : lambda or function, optional |
|
When ``leaf_label_func`` is a callable function, for each |
|
leaf with cluster index :math:`k < 2n-1`. The function |
|
is expected to return a string with the label for the |
|
leaf. |
|
|
|
Indices :math:`k < n` correspond to original observations |
|
while indices :math:`k \\geq n` correspond to non-singleton |
|
clusters. |
|
|
|
For example, to label singletons with their node id and |
|
non-singletons with their id, count, and inconsistency |
|
coefficient, simply do:: |
|
|
|
# First define the leaf label function. |
|
def llf(id): |
|
if id < n: |
|
return str(id) |
|
else: |
|
return '[%d %d %1.2f]' % (id, count, R[n-id,3]) |
|
|
|
# The text for the leaf nodes is going to be big so force |
|
# a rotation of 90 degrees. |
|
dendrogram(Z, leaf_label_func=llf, leaf_rotation=90) |
|
|
|
# leaf_label_func can also be used together with ``truncate_mode``, |
|
# in which case you will get your leaves labeled after truncation: |
|
dendrogram(Z, leaf_label_func=llf, leaf_rotation=90, |
|
truncate_mode='level', p=2) |
|
|
|
show_contracted : bool, optional |
|
When True the heights of non-singleton nodes contracted |
|
into a leaf node are plotted as crosses along the link |
|
connecting that leaf node. This really is only useful when |
|
truncation is used (see ``truncate_mode`` parameter). |
|
link_color_func : callable, optional |
|
If given, `link_color_function` is called with each non-singleton id |
|
corresponding to each U-shaped link it will paint. The function is |
|
expected to return the color to paint the link, encoded as a matplotlib |
|
color string code. For example:: |
|
|
|
dendrogram(Z, link_color_func=lambda k: colors[k]) |
|
|
|
colors the direct links below each untruncated non-singleton node |
|
``k`` using ``colors[k]``. |
|
ax : matplotlib Axes instance, optional |
|
If None and `no_plot` is not True, the dendrogram will be plotted |
|
on the current axes. Otherwise if `no_plot` is not True the |
|
dendrogram will be plotted on the given ``Axes`` instance. This can be |
|
useful if the dendrogram is part of a more complex figure. |
|
above_threshold_color : str, optional |
|
This matplotlib color string sets the color of the links above the |
|
color_threshold. The default is ``'C0'``. |
|
|
|
Returns |
|
------- |
|
R : dict |
|
A dictionary of data structures computed to render the |
|
dendrogram. Its has the following keys: |
|
|
|
``'color_list'`` |
|
A list of color names. The k'th element represents the color of the |
|
k'th link. |
|
|
|
``'icoord'`` and ``'dcoord'`` |
|
Each of them is a list of lists. Let ``icoord = [I1, I2, ..., Ip]`` |
|
where ``Ik = [xk1, xk2, xk3, xk4]`` and ``dcoord = [D1, D2, ..., Dp]`` |
|
where ``Dk = [yk1, yk2, yk3, yk4]``, then the k'th link painted is |
|
``(xk1, yk1)`` - ``(xk2, yk2)`` - ``(xk3, yk3)`` - ``(xk4, yk4)``. |
|
|
|
``'ivl'`` |
|
A list of labels corresponding to the leaf nodes. |
|
|
|
``'leaves'`` |
|
For each i, ``H[i] == j``, cluster node ``j`` appears in position |
|
``i`` in the left-to-right traversal of the leaves, where |
|
:math:`j < 2n-1` and :math:`i < n`. If ``j`` is less than ``n``, the |
|
``i``-th leaf node corresponds to an original observation. |
|
Otherwise, it corresponds to a non-singleton cluster. |
|
|
|
``'leaves_color_list'`` |
|
A list of color names. The k'th element represents the color of the |
|
k'th leaf. |
|
|
|
See Also |
|
-------- |
|
linkage, set_link_color_palette |
|
|
|
Notes |
|
----- |
|
It is expected that the distances in ``Z[:,2]`` be monotonic, otherwise |
|
crossings appear in the dendrogram. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy.cluster import hierarchy |
|
>>> import matplotlib.pyplot as plt |
|
|
|
A very basic example: |
|
|
|
>>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268., |
|
... 400., 754., 564., 138., 219., 869., 669.]) |
|
>>> Z = hierarchy.linkage(ytdist, 'single') |
|
>>> plt.figure() |
|
>>> dn = hierarchy.dendrogram(Z) |
|
|
|
Now, plot in given axes, improve the color scheme and use both vertical and |
|
horizontal orientations: |
|
|
|
>>> hierarchy.set_link_color_palette(['m', 'c', 'y', 'k']) |
|
>>> fig, axes = plt.subplots(1, 2, figsize=(8, 3)) |
|
>>> dn1 = hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y', |
|
... orientation='top') |
|
>>> dn2 = hierarchy.dendrogram(Z, ax=axes[1], |
|
... above_threshold_color='#bcbddc', |
|
... orientation='right') |
|
>>> hierarchy.set_link_color_palette(None) # reset to default after use |
|
>>> plt.show() |
|
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='c', xp=xp) |
|
|
|
if orientation not in ["top", "left", "bottom", "right"]: |
|
raise ValueError("orientation must be one of 'top', 'left', " |
|
"'bottom', or 'right'") |
|
|
|
if labels is not None: |
|
try: |
|
len_labels = len(labels) |
|
except (TypeError, AttributeError): |
|
len_labels = labels.shape[0] |
|
if Z.shape[0] + 1 != len_labels: |
|
raise ValueError("Dimensions of Z and labels must be consistent.") |
|
|
|
is_valid_linkage(Z, throw=True, name='Z') |
|
Zs = Z.shape |
|
n = Zs[0] + 1 |
|
if isinstance(p, (int, float)): |
|
p = int(p) |
|
else: |
|
raise TypeError('The second argument must be a number') |
|
|
|
if truncate_mode not in ('lastp', 'mtica', 'level', 'none', None): |
|
|
|
raise ValueError('Invalid truncation mode.') |
|
|
|
if truncate_mode == 'lastp': |
|
if p > n or p == 0: |
|
p = n |
|
|
|
if truncate_mode == 'mtica': |
|
|
|
truncate_mode = 'level' |
|
|
|
if truncate_mode == 'level': |
|
if p <= 0: |
|
p = np.inf |
|
|
|
if get_leaves: |
|
lvs = [] |
|
else: |
|
lvs = None |
|
|
|
icoord_list = [] |
|
dcoord_list = [] |
|
color_list = [] |
|
current_color = [0] |
|
currently_below_threshold = [False] |
|
ivl = [] |
|
|
|
if color_threshold is None or (isinstance(color_threshold, str) and |
|
color_threshold == 'default'): |
|
color_threshold = xp.max(Z[:, 2]) * 0.7 |
|
|
|
R = {'icoord': icoord_list, 'dcoord': dcoord_list, 'ivl': ivl, |
|
'leaves': lvs, 'color_list': color_list} |
|
|
|
|
|
contraction_marks = [] if show_contracted else None |
|
|
|
_dendrogram_calculate_info( |
|
Z=Z, p=p, |
|
truncate_mode=truncate_mode, |
|
color_threshold=color_threshold, |
|
get_leaves=get_leaves, |
|
orientation=orientation, |
|
labels=labels, |
|
count_sort=count_sort, |
|
distance_sort=distance_sort, |
|
show_leaf_counts=show_leaf_counts, |
|
i=2*n - 2, |
|
iv=0.0, |
|
ivl=ivl, |
|
n=n, |
|
icoord_list=icoord_list, |
|
dcoord_list=dcoord_list, |
|
lvs=lvs, |
|
current_color=current_color, |
|
color_list=color_list, |
|
currently_below_threshold=currently_below_threshold, |
|
leaf_label_func=leaf_label_func, |
|
contraction_marks=contraction_marks, |
|
link_color_func=link_color_func, |
|
above_threshold_color=above_threshold_color) |
|
|
|
if not no_plot: |
|
mh = xp.max(Z[:, 2]) |
|
_plot_dendrogram(icoord_list, dcoord_list, ivl, p, n, mh, orientation, |
|
no_labels, color_list, |
|
leaf_font_size=leaf_font_size, |
|
leaf_rotation=leaf_rotation, |
|
contraction_marks=contraction_marks, |
|
ax=ax, |
|
above_threshold_color=above_threshold_color) |
|
|
|
R["leaves_color_list"] = _get_leaves_color_list(R) |
|
|
|
return R |
|
|
|
|
|
def _get_leaves_color_list(R): |
|
leaves_color_list = [None] * len(R['leaves']) |
|
for link_x, link_y, link_color in zip(R['icoord'], |
|
R['dcoord'], |
|
R['color_list']): |
|
for (xi, yi) in zip(link_x, link_y): |
|
if yi == 0.0 and (xi % 5 == 0 and xi % 2 == 1): |
|
|
|
|
|
|
|
|
|
leaf_index = (int(xi) - 5) // 10 |
|
|
|
leaves_color_list[leaf_index] = link_color |
|
return leaves_color_list |
|
|
|
|
|
def _append_singleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func, |
|
i, labels): |
|
|
|
|
|
|
|
|
|
if lvs is not None: |
|
lvs.append(int(i)) |
|
|
|
|
|
if ivl is not None: |
|
|
|
|
|
|
|
if leaf_label_func: |
|
ivl.append(leaf_label_func(int(i))) |
|
else: |
|
|
|
|
|
if labels is not None: |
|
ivl.append(labels[int(i - n)]) |
|
else: |
|
|
|
ivl.append(str(int(i))) |
|
|
|
|
|
def _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func, |
|
i, labels, show_leaf_counts): |
|
|
|
|
|
|
|
|
|
if lvs is not None: |
|
lvs.append(int(i)) |
|
if ivl is not None: |
|
if leaf_label_func: |
|
ivl.append(leaf_label_func(int(i))) |
|
else: |
|
if show_leaf_counts: |
|
ivl.append("(" + str(np.asarray(Z[i - n, 3], dtype=np.int64)) + ")") |
|
else: |
|
ivl.append("") |
|
|
|
|
|
def _append_contraction_marks(Z, iv, i, n, contraction_marks, xp): |
|
_append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 0], xp), |
|
n, contraction_marks, xp) |
|
_append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 1], xp), |
|
n, contraction_marks, xp) |
|
|
|
|
|
def _append_contraction_marks_sub(Z, iv, i, n, contraction_marks, xp): |
|
if i >= n: |
|
contraction_marks.append((iv, Z[i - n, 2])) |
|
_append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 0], xp), |
|
n, contraction_marks, xp) |
|
_append_contraction_marks_sub(Z, iv, int_floor(Z[i - n, 1], xp), |
|
n, contraction_marks, xp) |
|
|
|
|
|
def _dendrogram_calculate_info(Z, p, truncate_mode, |
|
color_threshold=np.inf, get_leaves=True, |
|
orientation='top', labels=None, |
|
count_sort=False, distance_sort=False, |
|
show_leaf_counts=False, i=-1, iv=0.0, |
|
ivl=None, n=0, icoord_list=None, dcoord_list=None, |
|
lvs=None, mhr=False, |
|
current_color=None, color_list=None, |
|
currently_below_threshold=None, |
|
leaf_label_func=None, level=0, |
|
contraction_marks=None, |
|
link_color_func=None, |
|
above_threshold_color='C0'): |
|
""" |
|
Calculate the endpoints of the links as well as the labels for the |
|
the dendrogram rooted at the node with index i. iv is the independent |
|
variable value to plot the left-most leaf node below the root node i |
|
(if orientation='top', this would be the left-most x value where the |
|
plotting of this root node i and its descendents should begin). |
|
|
|
ivl is a list to store the labels of the leaf nodes. The leaf_label_func |
|
is called whenever ivl != None, labels == None, and |
|
leaf_label_func != None. When ivl != None and labels != None, the |
|
labels list is used only for labeling the leaf nodes. When |
|
ivl == None, no labels are generated for leaf nodes. |
|
|
|
When get_leaves==True, a list of leaves is built as they are visited |
|
in the dendrogram. |
|
|
|
Returns a tuple with l being the independent variable coordinate that |
|
corresponds to the midpoint of cluster to the left of cluster i if |
|
i is non-singleton, otherwise the independent coordinate of the leaf |
|
node if i is a leaf node. |
|
|
|
Returns |
|
------- |
|
A tuple (left, w, h, md), where: |
|
* left is the independent variable coordinate of the center of the |
|
the U of the subtree |
|
|
|
* w is the amount of space used for the subtree (in independent |
|
variable units) |
|
|
|
* h is the height of the subtree in dependent variable units |
|
|
|
* md is the ``max(Z[*,2]``) for all nodes ``*`` below and including |
|
the target node. |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
if n == 0: |
|
raise ValueError("Invalid singleton cluster count n.") |
|
|
|
if i == -1: |
|
raise ValueError("Invalid root cluster index i.") |
|
|
|
if truncate_mode == 'lastp': |
|
|
|
|
|
|
|
if 2*n - p > i >= n: |
|
d = Z[i - n, 2] |
|
_append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, |
|
leaf_label_func, i, labels, |
|
show_leaf_counts) |
|
if contraction_marks is not None: |
|
_append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks, xp) |
|
return (iv + 5.0, 10.0, 0.0, d) |
|
elif i < n: |
|
_append_singleton_leaf_node(Z, p, n, level, lvs, ivl, |
|
leaf_label_func, i, labels) |
|
return (iv + 5.0, 10.0, 0.0, 0.0) |
|
elif truncate_mode == 'level': |
|
if i > n and level > p: |
|
d = Z[i - n, 2] |
|
_append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, |
|
leaf_label_func, i, labels, |
|
show_leaf_counts) |
|
if contraction_marks is not None: |
|
_append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks, xp) |
|
return (iv + 5.0, 10.0, 0.0, d) |
|
elif i < n: |
|
_append_singleton_leaf_node(Z, p, n, level, lvs, ivl, |
|
leaf_label_func, i, labels) |
|
return (iv + 5.0, 10.0, 0.0, 0.0) |
|
|
|
|
|
|
|
|
|
if i < n: |
|
_append_singleton_leaf_node(Z, p, n, level, lvs, ivl, |
|
leaf_label_func, i, labels) |
|
return (iv + 5.0, 10.0, 0.0, 0.0) |
|
|
|
|
|
|
|
|
|
aa = int_floor(Z[i - n, 0], xp) |
|
ab = int_floor(Z[i - n, 1], xp) |
|
if aa >= n: |
|
|
|
na = Z[aa - n, 3] |
|
|
|
da = Z[aa - n, 2] |
|
else: |
|
na = 1 |
|
da = 0.0 |
|
if ab >= n: |
|
nb = Z[ab - n, 3] |
|
db = Z[ab - n, 2] |
|
else: |
|
nb = 1 |
|
db = 0.0 |
|
|
|
if count_sort == 'ascending' or count_sort is True: |
|
|
|
|
|
if na > nb: |
|
|
|
|
|
ua = ab |
|
ub = aa |
|
else: |
|
ua = aa |
|
ub = ab |
|
elif count_sort == 'descending': |
|
|
|
|
|
|
|
if na > nb: |
|
ua = aa |
|
ub = ab |
|
else: |
|
ua = ab |
|
ub = aa |
|
elif distance_sort == 'ascending' or distance_sort is True: |
|
|
|
|
|
if da > db: |
|
ua = ab |
|
ub = aa |
|
else: |
|
ua = aa |
|
ub = ab |
|
elif distance_sort == 'descending': |
|
|
|
|
|
|
|
if da > db: |
|
ua = aa |
|
ub = ab |
|
else: |
|
ua = ab |
|
ub = aa |
|
else: |
|
ua = aa |
|
ub = ab |
|
|
|
|
|
(uiva, uwa, uah, uamd) = \ |
|
_dendrogram_calculate_info( |
|
Z=Z, p=p, |
|
truncate_mode=truncate_mode, |
|
color_threshold=color_threshold, |
|
get_leaves=get_leaves, |
|
orientation=orientation, |
|
labels=labels, |
|
count_sort=count_sort, |
|
distance_sort=distance_sort, |
|
show_leaf_counts=show_leaf_counts, |
|
i=ua, iv=iv, ivl=ivl, n=n, |
|
icoord_list=icoord_list, |
|
dcoord_list=dcoord_list, lvs=lvs, |
|
current_color=current_color, |
|
color_list=color_list, |
|
currently_below_threshold=currently_below_threshold, |
|
leaf_label_func=leaf_label_func, |
|
level=level + 1, contraction_marks=contraction_marks, |
|
link_color_func=link_color_func, |
|
above_threshold_color=above_threshold_color) |
|
|
|
h = Z[i - n, 2] |
|
if h >= color_threshold or color_threshold <= 0: |
|
c = above_threshold_color |
|
|
|
if currently_below_threshold[0]: |
|
current_color[0] = (current_color[0] + 1) % len(_link_line_colors) |
|
currently_below_threshold[0] = False |
|
else: |
|
currently_below_threshold[0] = True |
|
c = _link_line_colors[current_color[0]] |
|
|
|
(uivb, uwb, ubh, ubmd) = \ |
|
_dendrogram_calculate_info( |
|
Z=Z, p=p, |
|
truncate_mode=truncate_mode, |
|
color_threshold=color_threshold, |
|
get_leaves=get_leaves, |
|
orientation=orientation, |
|
labels=labels, |
|
count_sort=count_sort, |
|
distance_sort=distance_sort, |
|
show_leaf_counts=show_leaf_counts, |
|
i=ub, iv=iv + uwa, ivl=ivl, n=n, |
|
icoord_list=icoord_list, |
|
dcoord_list=dcoord_list, lvs=lvs, |
|
current_color=current_color, |
|
color_list=color_list, |
|
currently_below_threshold=currently_below_threshold, |
|
leaf_label_func=leaf_label_func, |
|
level=level + 1, contraction_marks=contraction_marks, |
|
link_color_func=link_color_func, |
|
above_threshold_color=above_threshold_color) |
|
|
|
max_dist = max(uamd, ubmd, h) |
|
|
|
icoord_list.append([uiva, uiva, uivb, uivb]) |
|
dcoord_list.append([uah, h, h, ubh]) |
|
if link_color_func is not None: |
|
v = link_color_func(int(i)) |
|
if not isinstance(v, str): |
|
raise TypeError("link_color_func must return a matplotlib " |
|
"color string!") |
|
color_list.append(v) |
|
else: |
|
color_list.append(c) |
|
|
|
return (((uiva + uivb) / 2), uwa + uwb, h, max_dist) |
|
|
|
|
|
def is_isomorphic(T1, T2): |
|
""" |
|
Determine if two different cluster assignments are equivalent. |
|
|
|
Parameters |
|
---------- |
|
T1 : array_like |
|
An assignment of singleton cluster ids to flat cluster ids. |
|
T2 : array_like |
|
An assignment of singleton cluster ids to flat cluster ids. |
|
|
|
Returns |
|
------- |
|
b : bool |
|
Whether the flat cluster assignments `T1` and `T2` are |
|
equivalent. |
|
|
|
See Also |
|
-------- |
|
linkage : for a description of what a linkage matrix is. |
|
fcluster : for the creation of flat cluster assignments. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import fcluster, is_isomorphic |
|
>>> from scipy.cluster.hierarchy import single, complete |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
Two flat cluster assignments can be isomorphic if they represent the same |
|
cluster assignment, with different labels. |
|
|
|
For example, we can use the `scipy.cluster.hierarchy.single`: method |
|
and flatten the output to four clusters: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = single(pdist(X)) |
|
>>> T = fcluster(Z, 1, criterion='distance') |
|
>>> T |
|
array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32) |
|
|
|
We can then do the same using the |
|
`scipy.cluster.hierarchy.complete`: method: |
|
|
|
>>> Z = complete(pdist(X)) |
|
>>> T_ = fcluster(Z, 1.5, criterion='distance') |
|
>>> T_ |
|
array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) |
|
|
|
As we can see, in both cases we obtain four clusters and all the data |
|
points are distributed in the same way - the only thing that changes |
|
are the flat cluster labels (3 => 1, 4 =>2, 2 =>3 and 4 =>1), so both |
|
cluster assignments are isomorphic: |
|
|
|
>>> is_isomorphic(T, T_) |
|
True |
|
|
|
""" |
|
T1 = np.asarray(T1, order='c') |
|
T2 = np.asarray(T2, order='c') |
|
|
|
T1S = T1.shape |
|
T2S = T2.shape |
|
|
|
if len(T1S) != 1: |
|
raise ValueError('T1 must be one-dimensional.') |
|
if len(T2S) != 1: |
|
raise ValueError('T2 must be one-dimensional.') |
|
if T1S[0] != T2S[0]: |
|
raise ValueError('T1 and T2 must have the same number of elements.') |
|
n = T1S[0] |
|
d1 = {} |
|
d2 = {} |
|
for i in range(0, n): |
|
if T1[i] in d1: |
|
if T2[i] not in d2: |
|
return False |
|
if d1[T1[i]] != T2[i] or d2[T2[i]] != T1[i]: |
|
return False |
|
elif T2[i] in d2: |
|
return False |
|
else: |
|
d1[T1[i]] = T2[i] |
|
d2[T2[i]] = T1[i] |
|
return True |
|
|
|
|
|
def maxdists(Z): |
|
""" |
|
Return the maximum distance between any non-singleton cluster. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The hierarchical clustering encoded as a matrix. See |
|
``linkage`` for more information. |
|
|
|
Returns |
|
------- |
|
maxdists : ndarray |
|
A ``(n-1)`` sized numpy array of doubles; ``MD[i]`` represents |
|
the maximum distance between any cluster (including |
|
singletons) below and including the node with index i. More |
|
specifically, ``MD[i] = Z[Q(i)-n, 2].max()`` where ``Q(i)`` is the |
|
set of all node indices below and including node i. |
|
|
|
See Also |
|
-------- |
|
linkage : for a description of what a linkage matrix is. |
|
is_monotonic : for testing for monotonicity of a linkage matrix. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import median, maxdists |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
Given a linkage matrix ``Z``, `scipy.cluster.hierarchy.maxdists` |
|
computes for each new cluster generated (i.e., for each row of the linkage |
|
matrix) what is the maximum distance between any two child clusters. |
|
|
|
Due to the nature of hierarchical clustering, in many cases this is going |
|
to be just the distance between the two child clusters that were merged |
|
to form the current one - that is, Z[:,2]. |
|
|
|
However, for non-monotonic cluster assignments such as |
|
`scipy.cluster.hierarchy.median` clustering this is not always the |
|
case: There may be cluster formations were the distance between the two |
|
clusters merged is smaller than the distance between their children. |
|
|
|
We can see this in an example: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = median(pdist(X)) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 2. , 12. , 1.11803399, 3. ], |
|
[ 5. , 13. , 1.11803399, 3. ], |
|
[ 8. , 15. , 1.11803399, 3. ], |
|
[11. , 14. , 1.11803399, 3. ], |
|
[18. , 19. , 3. , 6. ], |
|
[16. , 17. , 3.5 , 6. ], |
|
[20. , 21. , 3.25 , 12. ]]) |
|
>>> maxdists(Z) |
|
array([1. , 1. , 1. , 1. , 1.11803399, |
|
1.11803399, 1.11803399, 1.11803399, 3. , 3.5 , |
|
3.5 ]) |
|
|
|
Note that while the distance between the two clusters merged when creating the |
|
last cluster is 3.25, there are two children (clusters 16 and 17) whose distance |
|
is larger (3.5). Thus, `scipy.cluster.hierarchy.maxdists` returns 3.5 in |
|
this case. |
|
|
|
""" |
|
xp = array_namespace(Z) |
|
Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
|
|
n = Z.shape[0] + 1 |
|
MD = np.zeros((n - 1,)) |
|
Z = np.asarray(Z) |
|
_hierarchy.get_max_dist_for_each_cluster(Z, MD, int(n)) |
|
MD = xp.asarray(MD) |
|
return MD |
|
|
|
|
|
def maxinconsts(Z, R): |
|
""" |
|
Return the maximum inconsistency coefficient for each |
|
non-singleton cluster and its children. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The hierarchical clustering encoded as a matrix. See |
|
`linkage` for more information. |
|
R : ndarray |
|
The inconsistency matrix. |
|
|
|
Returns |
|
------- |
|
MI : ndarray |
|
A monotonic ``(n-1)``-sized numpy array of doubles. |
|
|
|
See Also |
|
-------- |
|
linkage : for a description of what a linkage matrix is. |
|
inconsistent : for the creation of a inconsistency matrix. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import median, inconsistent, maxinconsts |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
Given a data set ``X``, we can apply a clustering method to obtain a |
|
linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can |
|
be also used to obtain the inconsistency matrix ``R`` associated to |
|
this clustering process: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = median(pdist(X)) |
|
>>> R = inconsistent(Z) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 2. , 12. , 1.11803399, 3. ], |
|
[ 5. , 13. , 1.11803399, 3. ], |
|
[ 8. , 15. , 1.11803399, 3. ], |
|
[11. , 14. , 1.11803399, 3. ], |
|
[18. , 19. , 3. , 6. ], |
|
[16. , 17. , 3.5 , 6. ], |
|
[20. , 21. , 3.25 , 12. ]]) |
|
>>> R |
|
array([[1. , 0. , 1. , 0. ], |
|
[1. , 0. , 1. , 0. ], |
|
[1. , 0. , 1. , 0. ], |
|
[1. , 0. , 1. , 0. ], |
|
[1.05901699, 0.08346263, 2. , 0.70710678], |
|
[1.05901699, 0.08346263, 2. , 0.70710678], |
|
[1.05901699, 0.08346263, 2. , 0.70710678], |
|
[1.05901699, 0.08346263, 2. , 0.70710678], |
|
[1.74535599, 1.08655358, 3. , 1.15470054], |
|
[1.91202266, 1.37522872, 3. , 1.15470054], |
|
[3.25 , 0.25 , 3. , 0. ]]) |
|
|
|
Here, `scipy.cluster.hierarchy.maxinconsts` can be used to compute |
|
the maximum value of the inconsistency statistic (the last column of |
|
``R``) for each non-singleton cluster and its children: |
|
|
|
>>> maxinconsts(Z, R) |
|
array([0. , 0. , 0. , 0. , 0.70710678, |
|
0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054, |
|
1.15470054]) |
|
|
|
""" |
|
xp = array_namespace(Z, R) |
|
Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp) |
|
R = _asarray(R, order='C', dtype=xp.float64, xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
is_valid_im(R, throw=True, name='R') |
|
|
|
n = Z.shape[0] + 1 |
|
if Z.shape[0] != R.shape[0]: |
|
raise ValueError("The inconsistency matrix and linkage matrix each " |
|
"have a different number of rows.") |
|
MI = np.zeros((n - 1,)) |
|
Z = np.asarray(Z) |
|
R = np.asarray(R) |
|
_hierarchy.get_max_Rfield_for_each_cluster(Z, R, MI, int(n), 3) |
|
MI = xp.asarray(MI) |
|
return MI |
|
|
|
|
|
def maxRstat(Z, R, i): |
|
""" |
|
Return the maximum statistic for each non-singleton cluster and its |
|
children. |
|
|
|
Parameters |
|
---------- |
|
Z : array_like |
|
The hierarchical clustering encoded as a matrix. See `linkage` for more |
|
information. |
|
R : array_like |
|
The inconsistency matrix. |
|
i : int |
|
The column of `R` to use as the statistic. |
|
|
|
Returns |
|
------- |
|
MR : ndarray |
|
Calculates the maximum statistic for the i'th column of the |
|
inconsistency matrix `R` for each non-singleton cluster |
|
node. ``MR[j]`` is the maximum over ``R[Q(j)-n, i]``, where |
|
``Q(j)`` the set of all node ids corresponding to nodes below |
|
and including ``j``. |
|
|
|
See Also |
|
-------- |
|
linkage : for a description of what a linkage matrix is. |
|
inconsistent : for the creation of a inconsistency matrix. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import median, inconsistent, maxRstat |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
Given a data set ``X``, we can apply a clustering method to obtain a |
|
linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can |
|
be also used to obtain the inconsistency matrix ``R`` associated to |
|
this clustering process: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = median(pdist(X)) |
|
>>> R = inconsistent(Z) |
|
>>> R |
|
array([[1. , 0. , 1. , 0. ], |
|
[1. , 0. , 1. , 0. ], |
|
[1. , 0. , 1. , 0. ], |
|
[1. , 0. , 1. , 0. ], |
|
[1.05901699, 0.08346263, 2. , 0.70710678], |
|
[1.05901699, 0.08346263, 2. , 0.70710678], |
|
[1.05901699, 0.08346263, 2. , 0.70710678], |
|
[1.05901699, 0.08346263, 2. , 0.70710678], |
|
[1.74535599, 1.08655358, 3. , 1.15470054], |
|
[1.91202266, 1.37522872, 3. , 1.15470054], |
|
[3.25 , 0.25 , 3. , 0. ]]) |
|
|
|
`scipy.cluster.hierarchy.maxRstat` can be used to compute |
|
the maximum value of each column of ``R``, for each non-singleton |
|
cluster and its children: |
|
|
|
>>> maxRstat(Z, R, 0) |
|
array([1. , 1. , 1. , 1. , 1.05901699, |
|
1.05901699, 1.05901699, 1.05901699, 1.74535599, 1.91202266, |
|
3.25 ]) |
|
>>> maxRstat(Z, R, 1) |
|
array([0. , 0. , 0. , 0. , 0.08346263, |
|
0.08346263, 0.08346263, 0.08346263, 1.08655358, 1.37522872, |
|
1.37522872]) |
|
>>> maxRstat(Z, R, 3) |
|
array([0. , 0. , 0. , 0. , 0.70710678, |
|
0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054, |
|
1.15470054]) |
|
|
|
""" |
|
xp = array_namespace(Z, R) |
|
Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp) |
|
R = _asarray(R, order='C', dtype=xp.float64, xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
is_valid_im(R, throw=True, name='R') |
|
|
|
if not isinstance(i, int): |
|
raise TypeError('The third argument must be an integer.') |
|
|
|
if i < 0 or i > 3: |
|
raise ValueError('i must be an integer between 0 and 3 inclusive.') |
|
|
|
if Z.shape[0] != R.shape[0]: |
|
raise ValueError("The inconsistency matrix and linkage matrix each " |
|
"have a different number of rows.") |
|
|
|
n = Z.shape[0] + 1 |
|
MR = np.zeros((n - 1,)) |
|
Z = np.asarray(Z) |
|
R = np.asarray(R) |
|
_hierarchy.get_max_Rfield_for_each_cluster(Z, R, MR, int(n), i) |
|
MR = xp.asarray(MR) |
|
return MR |
|
|
|
|
|
def leaders(Z, T): |
|
""" |
|
Return the root nodes in a hierarchical clustering. |
|
|
|
Returns the root nodes in a hierarchical clustering corresponding |
|
to a cut defined by a flat cluster assignment vector ``T``. See |
|
the ``fcluster`` function for more information on the format of ``T``. |
|
|
|
For each flat cluster :math:`j` of the :math:`k` flat clusters |
|
represented in the n-sized flat cluster assignment vector ``T``, |
|
this function finds the lowest cluster node :math:`i` in the linkage |
|
tree Z, such that: |
|
|
|
* leaf descendants belong only to flat cluster j |
|
(i.e., ``T[p]==j`` for all :math:`p` in :math:`S(i)`, where |
|
:math:`S(i)` is the set of leaf ids of descendant leaf nodes |
|
with cluster node :math:`i`) |
|
|
|
* there does not exist a leaf that is not a descendant with |
|
:math:`i` that also belongs to cluster :math:`j` |
|
(i.e., ``T[q]!=j`` for all :math:`q` not in :math:`S(i)`). If |
|
this condition is violated, ``T`` is not a valid cluster |
|
assignment vector, and an exception will be thrown. |
|
|
|
Parameters |
|
---------- |
|
Z : ndarray |
|
The hierarchical clustering encoded as a matrix. See |
|
`linkage` for more information. |
|
T : ndarray |
|
The flat cluster assignment vector. |
|
|
|
Returns |
|
------- |
|
L : ndarray |
|
The leader linkage node id's stored as a k-element 1-D array, |
|
where ``k`` is the number of flat clusters found in ``T``. |
|
|
|
``L[j]=i`` is the linkage cluster node id that is the |
|
leader of flat cluster with id M[j]. If ``i < n``, ``i`` |
|
corresponds to an original observation, otherwise it |
|
corresponds to a non-singleton cluster. |
|
M : ndarray |
|
The leader linkage node id's stored as a k-element 1-D array, where |
|
``k`` is the number of flat clusters found in ``T``. This allows the |
|
set of flat cluster ids to be any arbitrary set of ``k`` integers. |
|
|
|
For example: if ``L[3]=2`` and ``M[3]=8``, the flat cluster with |
|
id 8's leader is linkage node 2. |
|
|
|
See Also |
|
-------- |
|
fcluster : for the creation of flat cluster assignments. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.cluster.hierarchy import ward, fcluster, leaders |
|
>>> from scipy.spatial.distance import pdist |
|
|
|
Given a linkage matrix ``Z`` - obtained after apply a clustering method |
|
to a dataset ``X`` - and a flat cluster assignment array ``T``: |
|
|
|
>>> X = [[0, 0], [0, 1], [1, 0], |
|
... [0, 4], [0, 3], [1, 4], |
|
... [4, 0], [3, 0], [4, 1], |
|
... [4, 4], [3, 4], [4, 3]] |
|
|
|
>>> Z = ward(pdist(X)) |
|
>>> Z |
|
array([[ 0. , 1. , 1. , 2. ], |
|
[ 3. , 4. , 1. , 2. ], |
|
[ 6. , 7. , 1. , 2. ], |
|
[ 9. , 10. , 1. , 2. ], |
|
[ 2. , 12. , 1.29099445, 3. ], |
|
[ 5. , 13. , 1.29099445, 3. ], |
|
[ 8. , 14. , 1.29099445, 3. ], |
|
[11. , 15. , 1.29099445, 3. ], |
|
[16. , 17. , 5.77350269, 6. ], |
|
[18. , 19. , 5.77350269, 6. ], |
|
[20. , 21. , 8.16496581, 12. ]]) |
|
|
|
>>> T = fcluster(Z, 3, criterion='distance') |
|
>>> T |
|
array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) |
|
|
|
`scipy.cluster.hierarchy.leaders` returns the indices of the nodes |
|
in the dendrogram that are the leaders of each flat cluster: |
|
|
|
>>> L, M = leaders(Z, T) |
|
>>> L |
|
array([16, 17, 18, 19], dtype=int32) |
|
|
|
(remember that indices 0-11 point to the 12 data points in ``X``, |
|
whereas indices 12-22 point to the 11 rows of ``Z``) |
|
|
|
`scipy.cluster.hierarchy.leaders` also returns the indices of |
|
the flat clusters in ``T``: |
|
|
|
>>> M |
|
array([1, 2, 3, 4], dtype=int32) |
|
|
|
""" |
|
xp = array_namespace(Z, T) |
|
Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp) |
|
T = _asarray(T, order='C', xp=xp) |
|
is_valid_linkage(Z, throw=True, name='Z') |
|
|
|
if T.dtype != xp.int32: |
|
raise TypeError('T must be a 1-D array of dtype int32.') |
|
|
|
if T.shape[0] != Z.shape[0] + 1: |
|
raise ValueError('Mismatch: len(T)!=Z.shape[0] + 1.') |
|
|
|
n_clusters = int(xp.unique_values(T).shape[0]) |
|
n_obs = int(Z.shape[0] + 1) |
|
L = np.zeros(n_clusters, dtype=np.int32) |
|
M = np.zeros(n_clusters, dtype=np.int32) |
|
Z = np.asarray(Z) |
|
T = np.asarray(T, dtype=np.int32) |
|
s = _hierarchy.leaders(Z, T, L, M, n_clusters, n_obs) |
|
if s >= 0: |
|
raise ValueError(('T is not a valid assignment vector. Error found ' |
|
'when examining linkage node %d (< 2n-1).') % s) |
|
L, M = xp.asarray(L), xp.asarray(M) |
|
return (L, M) |
|
|