@@ -0,0 +1,64 @@
"""Popular unsupervised clustering algorithms."""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from sklearn.cluster._affinity_propagation import (
    AffinityPropagation,
    affinity_propagation,
)
from sklearn.cluster._agglomerative import (
    AgglomerativeClustering,
    FeatureAgglomeration,
    linkage_tree,
    ward_tree,
)
from sklearn.cluster._bicluster import SpectralBiclustering, SpectralCoclustering
from sklearn.cluster._birch import Birch
from sklearn.cluster._bisect_k_means import BisectingKMeans
from sklearn.cluster._dbscan import DBSCAN, dbscan
from sklearn.cluster._hdbscan.hdbscan import HDBSCAN
from sklearn.cluster._kmeans import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus
from sklearn.cluster._mean_shift import (
    MeanShift,
    estimate_bandwidth,
    get_bin_seeds,
    mean_shift,
)
from sklearn.cluster._optics import (
    OPTICS,
    cluster_optics_dbscan,
    cluster_optics_xi,
    compute_optics_graph,
)
from sklearn.cluster._spectral import SpectralClustering, spectral_clustering

__all__ = [
    "DBSCAN",
    "HDBSCAN",
    "OPTICS",
    "AffinityPropagation",
    "AgglomerativeClustering",
    "Birch",
    "BisectingKMeans",
    "FeatureAgglomeration",
    "KMeans",
    "MeanShift",
    "MiniBatchKMeans",
    "SpectralBiclustering",
    "SpectralClustering",
    "SpectralCoclustering",
    "affinity_propagation",
    "cluster_optics_dbscan",
    "cluster_optics_xi",
    "compute_optics_graph",
    "dbscan",
    "estimate_bandwidth",
    "get_bin_seeds",
    "k_means",
    "kmeans_plusplus",
    "linkage_tree",
    "mean_shift",
    "spectral_clustering",
    "ward_tree",
]
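A minimal usage sketch of the re-exported public API above (illustrative; any estimator listed in ``__all__`` is importable the same way):

>>> import numpy as np
>>> from sklearn.cluster import KMeans
>>> X = np.array([[1, 2], [1, 4], [4, 2], [4, 4]])
>>> kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
>>> kmeans.labels_.shape
(4,)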
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,607 @@
"""Affinity Propagation clustering algorithm."""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from numbers import Integral, Real

import numpy as np

from sklearn._config import config_context
from sklearn.base import BaseEstimator, ClusterMixin, _fit_context
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import euclidean_distances, pairwise_distances_argmin
from sklearn.utils import check_random_state
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
from sklearn.utils.validation import check_is_fitted, validate_data


def _equal_similarities_and_preferences(S, preference):
    def all_equal_preferences():
        return np.all(preference == preference.flat[0])

    def all_equal_similarities():
        # Create mask to ignore diagonal of S
        mask = np.ones(S.shape, dtype=bool)
        np.fill_diagonal(mask, 0)

        return np.all(S[mask].flat == S[mask].flat[0])

    return all_equal_preferences() and all_equal_similarities()

def _affinity_propagation(
    S,
    *,
    preference,
    convergence_iter,
    max_iter,
    damping,
    verbose,
    return_n_iter,
    random_state,
):
    """Main affinity propagation algorithm."""
    n_samples = S.shape[0]
    if n_samples == 1 or _equal_similarities_and_preferences(S, preference):
        # It makes no sense to run the algorithm in this case, so return 1 or
        # n_samples clusters, depending on preferences
        warnings.warn(
            "All samples have mutually equal similarities. "
            "Returning arbitrary cluster center(s)."
        )
        if preference.flat[0] > S.flat[n_samples - 1]:
            return (
                (np.arange(n_samples), np.arange(n_samples), 0)
                if return_n_iter
                else (np.arange(n_samples), np.arange(n_samples))
            )
        else:
            return (
                (np.array([0]), np.array([0] * n_samples), 0)
                if return_n_iter
                else (np.array([0]), np.array([0] * n_samples))
            )

    # Place preference on the diagonal of S
    S.flat[:: (n_samples + 1)] = preference

    A = np.zeros((n_samples, n_samples))
    R = np.zeros((n_samples, n_samples))  # Initialize messages
    # Intermediate results
    tmp = np.zeros((n_samples, n_samples))

    # Remove degeneracies
    S += (
        np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100
    ) * random_state.standard_normal(size=(n_samples, n_samples))

    # Execute parallel affinity propagation updates
    e = np.zeros((n_samples, convergence_iter))

    ind = np.arange(n_samples)

    for it in range(max_iter):
        # tmp = A + S; compute responsibilities
        np.add(A, S, tmp)
        I = np.argmax(tmp, axis=1)
        Y = tmp[ind, I]  # np.max(A + S, axis=1)
        tmp[ind, I] = -np.inf
        Y2 = np.max(tmp, axis=1)

        # tmp = Rnew
        np.subtract(S, Y[:, None], tmp)
        tmp[ind, I] = S[ind, I] - Y2

        # Damping
        tmp *= 1 - damping
        R *= damping
        R += tmp

        # tmp = Rp; compute availabilities
        np.maximum(R, 0, out=tmp)
        tmp.flat[:: n_samples + 1] = R.flat[:: n_samples + 1]

        # tmp = -Anew
        tmp -= np.sum(tmp, axis=0)
        dA = np.diag(tmp).copy()
        tmp.clip(0, np.inf, tmp)
        tmp.flat[:: n_samples + 1] = dA

        # Damping
        tmp *= 1 - damping
        A *= damping
        A -= tmp

        # Check for convergence
        E = (np.diag(A) + np.diag(R)) > 0
        e[:, it % convergence_iter] = E
        K = np.sum(E, axis=0)

        if it >= convergence_iter:
            se = np.sum(e, axis=1)
            unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples
            if (not unconverged and (K > 0)) or (it == max_iter):
                never_converged = False
                if verbose:
                    print("Converged after %d iterations." % it)
                break
    else:
        never_converged = True
        if verbose:
            print("Did not converge")

    I = np.flatnonzero(E)
    K = I.size  # Identify exemplars

    if K > 0:
        if never_converged:
            warnings.warn(
                (
                    "Affinity propagation did not converge, this model "
                    "may return degenerate cluster centers and labels."
                ),
                ConvergenceWarning,
            )
        c = np.argmax(S[:, I], axis=1)
        c[I] = np.arange(K)  # Identify clusters
        # Refine the final set of exemplars and clusters and return results
        for k in range(K):
            ii = np.asarray(c == k).nonzero()[0]
            j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
            I[k] = ii[j]

        c = np.argmax(S[:, I], axis=1)
        c[I] = np.arange(K)
        labels = I[c]
        # Reduce labels to a sorted, gapless list
        cluster_centers_indices = np.unique(labels)
        labels = np.searchsorted(cluster_centers_indices, labels)
    else:
        warnings.warn(
            (
                "Affinity propagation did not converge and this model "
                "will not have any cluster centers."
            ),
            ConvergenceWarning,
        )
        labels = np.array([-1] * n_samples)
        cluster_centers_indices = []

    if return_n_iter:
        return cluster_centers_indices, labels, it + 1
    else:
        return cluster_centers_indices, labels

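
# For reference, the loop above implements the standard Frey & Dueck (2007)
# message-passing updates, written informally as:
#
#   responsibility: r(i, k) <- s(i, k) - max_{k' != k} [a(i, k') + s(i, k')]
#   availability:   a(i, k) <- min(0, r(k, k) + sum_{i' not in {i, k}} max(0, r(i', k)))
#                   a(k, k) <- sum_{i' != k} max(0, r(i', k))
#
# damped as new = damping * old + (1 - damping) * update, with point i
# labeled an exemplar when a(i, i) + r(i, i) > 0.
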
###############################################################################
# Public API


@validate_params(
    {
        "S": ["array-like"],
        "return_n_iter": ["boolean"],
    },
    prefer_skip_nested_validation=False,
)
def affinity_propagation(
    S,
    *,
    preference=None,
    convergence_iter=15,
    max_iter=200,
    damping=0.5,
    copy=True,
    verbose=False,
    return_n_iter=False,
    random_state=None,
):
    """Perform Affinity Propagation Clustering of data.

    Read more in the :ref:`User Guide <affinity_propagation>`.

    Parameters
    ----------
    S : array-like of shape (n_samples, n_samples)
        Matrix of similarities between points.

    preference : array-like of shape (n_samples,) or float, default=None
        Preferences for each point - points with larger values of
        preferences are more likely to be chosen as exemplars. The number of
        exemplars, i.e. of clusters, is influenced by the input preferences
        value. If the preferences are not passed as arguments, they will be
        set to the median of the input similarities (resulting in a moderate
        number of clusters). For a smaller number of clusters, this can be set
        to the minimum value of the similarities.

    convergence_iter : int, default=15
        Number of iterations with no change in the number
        of estimated clusters that stops the convergence.

    max_iter : int, default=200
        Maximum number of iterations.

    damping : float, default=0.5
        Damping factor between 0.5 and 1.

    copy : bool, default=True
        If copy is False, the affinity matrix is modified inplace by the
        algorithm, for memory efficiency.

    verbose : bool, default=False
        The verbosity level.

    return_n_iter : bool, default=False
        Whether or not to return the number of iterations.

    random_state : int, RandomState instance or None, default=None
        Pseudo-random number generator to control the starting state.
        Use an int for reproducible results across function calls.
        See the :term:`Glossary <random_state>`.

        .. versionadded:: 0.23
            This parameter was previously hardcoded as 0.

    Returns
    -------
    cluster_centers_indices : ndarray of shape (n_clusters,)
        Indices of cluster centers.

    labels : ndarray of shape (n_samples,)
        Cluster labels for each point.

    n_iter : int
        Number of iterations run. Returned only if `return_n_iter` is
        set to True.

    Notes
    -----
    For an example usage,
    see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`.
    You may also check out
    :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py`.

    When the algorithm does not converge, it will still return an array of
    ``cluster_center_indices`` and labels if there are any exemplars/clusters;
    however, they may be degenerate and should be used with caution.

    When all training samples have equal similarities and equal preferences,
    the assignment of cluster centers and labels depends on the preference.
    If the preference is smaller than the similarities, a single cluster center
    and label ``0`` for every sample will be returned. Otherwise, every
    training sample becomes its own cluster center and is assigned a unique
    label.

    References
    ----------
    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
    Between Data Points", Science Feb. 2007

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import affinity_propagation
    >>> from sklearn.metrics.pairwise import euclidean_distances
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> S = -euclidean_distances(X, squared=True)
    >>> cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
    >>> cluster_centers_indices
    array([0, 3])
    >>> labels
    array([0, 0, 0, 1, 1, 1])
    """
    estimator = AffinityPropagation(
        damping=damping,
        max_iter=max_iter,
        convergence_iter=convergence_iter,
        copy=copy,
        preference=preference,
        affinity="precomputed",
        verbose=verbose,
        random_state=random_state,
    ).fit(S)

    if return_n_iter:
        return estimator.cluster_centers_indices_, estimator.labels_, estimator.n_iter_
    return estimator.cluster_centers_indices_, estimator.labels_


class AffinityPropagation(ClusterMixin, BaseEstimator):
    """Perform Affinity Propagation Clustering of data.

    Read more in the :ref:`User Guide <affinity_propagation>`.

    Parameters
    ----------
    damping : float, default=0.5
        Damping factor in the range `[0.5, 1.0)`: the extent to
        which the current value is maintained relative to
        incoming values (weighted 1 - damping). This is in order
        to avoid numerical oscillations when updating these
        values (messages).

    max_iter : int, default=200
        Maximum number of iterations.

    convergence_iter : int, default=15
        Number of iterations with no change in the number
        of estimated clusters that stops the convergence.

    copy : bool, default=True
        Make a copy of input data.

    preference : array-like of shape (n_samples,) or float, default=None
        Preferences for each point - points with larger values of
        preferences are more likely to be chosen as exemplars. The number
        of exemplars, i.e. of clusters, is influenced by the input
        preferences value. If the preferences are not passed as arguments,
        they will be set to the median of the input similarities.

    affinity : {'euclidean', 'precomputed'}, default='euclidean'
        Which affinity to use. At the moment 'precomputed' and
        'euclidean' are supported. 'euclidean' uses the
        negative squared euclidean distance between points.

    verbose : bool, default=False
        Whether to be verbose.

    random_state : int, RandomState instance or None, default=None
        Pseudo-random number generator to control the starting state.
        Use an int for reproducible results across function calls.
        See the :term:`Glossary <random_state>`.

        .. versionadded:: 0.23
            This parameter was previously hardcoded as 0.

    Attributes
    ----------
    cluster_centers_indices_ : ndarray of shape (n_clusters,)
        Indices of cluster centers.

    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Cluster centers (if affinity != ``precomputed``).

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    affinity_matrix_ : ndarray of shape (n_samples, n_samples)
        Stores the affinity matrix used in ``fit``.

    n_iter_ : int
        Number of iterations taken to converge.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    AgglomerativeClustering : Recursively merges the pair of
        clusters that minimally increases a given linkage distance.
    FeatureAgglomeration : Similar to AgglomerativeClustering,
        but recursively merges features instead of samples.
    KMeans : K-Means clustering.
    MiniBatchKMeans : Mini-Batch K-Means clustering.
    MeanShift : Mean shift clustering using a flat kernel.
    SpectralClustering : Apply clustering to a projection
        of the normalized Laplacian.

    Notes
    -----
    The algorithmic complexity of affinity propagation is quadratic
    in the number of points.

    When the algorithm does not converge, it will still return an array of
    ``cluster_center_indices`` and labels if there are any exemplars/clusters;
    however, they may be degenerate and should be used with caution.

    When ``fit`` does not converge, ``cluster_centers_`` is still populated;
    however, it may be degenerate. In such a case, proceed with caution.
    If ``fit`` does not converge and fails to produce any ``cluster_centers_``
    then ``predict`` will label every sample as ``-1``.

    When all training samples have equal similarities and equal preferences,
    the assignment of cluster centers and labels depends on the preference.
    If the preference is smaller than the similarities, ``fit`` will result in
    a single cluster center and label ``0`` for every sample. Otherwise, every
    training sample becomes its own cluster center and is assigned a unique
    label.

    References
    ----------

    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
    Between Data Points", Science Feb. 2007

    Examples
    --------
    >>> from sklearn.cluster import AffinityPropagation
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> clustering = AffinityPropagation(random_state=5).fit(X)
    >>> clustering
    AffinityPropagation(random_state=5)
    >>> clustering.labels_
    array([0, 0, 0, 1, 1, 1])
    >>> clustering.predict([[0, 0], [4, 4]])
    array([0, 1])
    >>> clustering.cluster_centers_
    array([[1, 2],
           [4, 2]])

    For an example usage,
    see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`.

    For a comparison of Affinity Propagation with other clustering algorithms,
    see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`.
    """

    _parameter_constraints: dict = {
        "damping": [Interval(Real, 0.5, 1.0, closed="left")],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "convergence_iter": [Interval(Integral, 1, None, closed="left")],
        "copy": ["boolean"],
        "preference": [
            "array-like",
            Interval(Real, None, None, closed="neither"),
            None,
        ],
        "affinity": [StrOptions({"euclidean", "precomputed"})],
        "verbose": ["verbose"],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        *,
        damping=0.5,
        max_iter=200,
        convergence_iter=15,
        copy=True,
        preference=None,
        affinity="euclidean",
        verbose=False,
        random_state=None,
    ):
        self.damping = damping
        self.max_iter = max_iter
        self.convergence_iter = convergence_iter
        self.copy = copy
        self.verbose = verbose
        self.preference = preference
        self.affinity = affinity
        self.random_state = random_state

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.pairwise = self.affinity == "precomputed"
        tags.input_tags.sparse = self.affinity != "precomputed"
        return tags

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the clustering from features, or affinity matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
                array-like of shape (n_samples, n_samples)
            Training instances to cluster, or similarities / affinities between
            instances if ``affinity='precomputed'``. If a sparse feature matrix
            is provided, it will be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Returns the instance itself.
        """
        if self.affinity == "precomputed":
            X = validate_data(self, X, copy=self.copy, force_writeable=True)
            self.affinity_matrix_ = X
        else:  # self.affinity == "euclidean"
            X = validate_data(self, X, accept_sparse="csr")
            self.affinity_matrix_ = -euclidean_distances(X, squared=True)

        if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]:
            raise ValueError(
                "The matrix of similarities must be a square array. "
                f"Got {self.affinity_matrix_.shape} instead."
            )

        if self.preference is None:
            preference = np.median(self.affinity_matrix_)
        else:
            preference = self.preference
        preference = np.asarray(preference)

        random_state = check_random_state(self.random_state)

        (
            self.cluster_centers_indices_,
            self.labels_,
            self.n_iter_,
        ) = _affinity_propagation(
            self.affinity_matrix_,
            max_iter=self.max_iter,
            convergence_iter=self.convergence_iter,
            preference=preference,
            damping=self.damping,
            verbose=self.verbose,
            return_n_iter=True,
            random_state=random_state,
        )

        if self.affinity != "precomputed":
            self.cluster_centers_ = X[self.cluster_centers_indices_].copy()

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to predict. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        check_is_fitted(self)
        X = validate_data(self, X, reset=False, accept_sparse="csr")
        if not hasattr(self, "cluster_centers_"):
            raise ValueError(
                "Predict method is not supported when affinity='precomputed'."
            )

        if self.cluster_centers_.shape[0] > 0:
            with config_context(assume_finite=True):
                return pairwise_distances_argmin(X, self.cluster_centers_)
        else:
            warnings.warn(
                (
                    "This model does not have any cluster centers "
                    "because affinity propagation did not converge. "
                    "Labeling every sample as '-1'."
                ),
                ConvergenceWarning,
            )
            return np.array([-1] * X.shape[0])

    def fit_predict(self, X, y=None):
        """Fit clustering from features/affinity matrix; return cluster labels.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
                array-like of shape (n_samples, n_samples)
            Training instances to cluster, or similarities / affinities between
            instances if ``affinity='precomputed'``. If a sparse feature matrix
            is provided, it will be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        return super().fit_predict(X, y)
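A short sketch of the ``preference`` knob documented above (illustrative; the exact number of clusters depends on the data): lowering the preference toward the minimum similarity tends to yield no more exemplars than the default median.

>>> import numpy as np
>>> from sklearn.cluster import affinity_propagation
>>> from sklearn.metrics.pairwise import euclidean_distances
>>> X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
>>> S = -euclidean_distances(X, squared=True)
>>> centers_med, _ = affinity_propagation(S, random_state=0)
>>> centers_min, _ = affinity_propagation(S, preference=S.min(), random_state=0)
>>> len(centers_min) <= len(centers_med)
True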
File diff suppressed because it is too large
@@ -0,0 +1,622 @@
"""Spectral biclustering algorithms."""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from abc import ABCMeta, abstractmethod
from numbers import Integral

import numpy as np
from scipy.linalg import norm
from scipy.sparse import dia_matrix, issparse
from scipy.sparse.linalg import eigsh, svds

from sklearn.base import BaseEstimator, BiclusterMixin, _fit_context
from sklearn.cluster._kmeans import KMeans, MiniBatchKMeans
from sklearn.utils import check_random_state, check_scalar
from sklearn.utils._param_validation import Interval, StrOptions
from sklearn.utils.extmath import _randomized_svd, make_nonnegative, safe_sparse_dot
from sklearn.utils.validation import assert_all_finite, validate_data

__all__ = ["SpectralBiclustering", "SpectralCoclustering"]


def _scale_normalize(X):
    """Normalize ``X`` by scaling rows and columns independently.

    Returns the normalized matrix and the row and column scaling
    factors.
    """
    X = make_nonnegative(X)
    row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
    col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze()
    row_diag = np.where(np.isnan(row_diag), 0, row_diag)
    col_diag = np.where(np.isnan(col_diag), 0, col_diag)
    if issparse(X):
        n_rows, n_cols = X.shape
        r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
        c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
        an = r @ X @ c
    else:
        an = row_diag[:, np.newaxis] * X * col_diag
    return an, row_diag, col_diag


def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
    """Normalize rows and columns of ``X`` simultaneously so that all
    rows sum to one constant and all columns sum to a different
    constant.
    """
    # According to the paper, this can also be done more efficiently with
    # deviation reduction and balancing algorithms.
    X = make_nonnegative(X)
    X_scaled = X
    for _ in range(max_iter):
        X_new, _, _ = _scale_normalize(X_scaled)
        if issparse(X):
            dist = norm(X_scaled.data - X_new.data)
        else:
            dist = norm(X_scaled - X_new)
        X_scaled = X_new
        if dist is not None and dist < tol:
            break
    return X_scaled

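
# Hypothetical demo helper (illustrative only, not part of scikit-learn):
# after convergence, the output of ``_bistochastic_normalize`` has
# (approximately) constant row sums and constant column sums.
def _bistochastic_normalize_demo():  # pragma: no cover
    rng = np.random.default_rng(0)
    B = _bistochastic_normalize(rng.random((5, 7)) + 0.1)
    assert np.allclose(B.sum(axis=1), B.sum(axis=1)[0], atol=1e-3)
    assert np.allclose(B.sum(axis=0), B.sum(axis=0)[0], atol=1e-3)
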
def _log_normalize(X):
    """Normalize ``X`` according to Kluger's log-interactions scheme."""
    X = make_nonnegative(X, min_value=1)
    if issparse(X):
        raise ValueError(
            "Cannot compute log of a sparse matrix,"
            " because log(x) diverges to -infinity as x"
            " goes to 0."
        )
    L = np.log(X)
    row_avg = L.mean(axis=1)[:, np.newaxis]
    col_avg = L.mean(axis=0)
    avg = L.mean()
    return L - row_avg - col_avg + avg


class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):
    """Base class for spectral biclustering."""

    _parameter_constraints: dict = {
        "svd_method": [StrOptions({"randomized", "arpack"})],
        "n_svd_vecs": [Interval(Integral, 0, None, closed="left"), None],
        "mini_batch": ["boolean"],
        "init": [StrOptions({"k-means++", "random"}), np.ndarray],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "random_state": ["random_state"],
    }

    @abstractmethod
    def __init__(
        self,
        n_clusters=3,
        svd_method="randomized",
        n_svd_vecs=None,
        mini_batch=False,
        init="k-means++",
        n_init=10,
        random_state=None,
    ):
        self.n_clusters = n_clusters
        self.svd_method = svd_method
        self.n_svd_vecs = n_svd_vecs
        self.mini_batch = mini_batch
        self.init = init
        self.n_init = n_init
        self.random_state = random_state

    @abstractmethod
    def _check_parameters(self, n_samples):
        """Validate parameters depending on the input data."""

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Create a biclustering for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            SpectralBiclustering instance.
        """
        X = validate_data(self, X, accept_sparse="csr", dtype=np.float64)
        self._check_parameters(X.shape[0])
        self._fit(X)
        return self

    def _svd(self, array, n_components, n_discard):
        """Return the first `n_components` left and right singular
        vectors u and v, discarding the first `n_discard`.
        """
        if self.svd_method == "randomized":
            kwargs = {}
            if self.n_svd_vecs is not None:
                kwargs["n_oversamples"] = self.n_svd_vecs
            u, _, vt = _randomized_svd(
                array, n_components, random_state=self.random_state, **kwargs
            )

        elif self.svd_method == "arpack":
            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
            if np.any(np.isnan(vt)):
                # Some eigenvalues of A * A.T are negative, causing
                # sqrt() to be np.nan. This causes some vectors in vt
                # to be np.nan.
                A = safe_sparse_dot(array.T, array)
                random_state = check_random_state(self.random_state)
                # initialize with [-1,1] as in ARPACK
                v0 = random_state.uniform(-1, 1, A.shape[0])
                _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
                vt = v.T
            if np.any(np.isnan(u)):
                A = safe_sparse_dot(array, array.T)
                random_state = check_random_state(self.random_state)
                # initialize with [-1,1] as in ARPACK
                v0 = random_state.uniform(-1, 1, A.shape[0])
                _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)

        assert_all_finite(u)
        assert_all_finite(vt)
        u = u[:, n_discard:]
        vt = vt[n_discard:]
        return u, vt.T

    def _k_means(self, data, n_clusters):
        if self.mini_batch:
            model = MiniBatchKMeans(
                n_clusters,
                init=self.init,
                n_init=self.n_init,
                random_state=self.random_state,
            )
        else:
            model = KMeans(
                n_clusters,
                init=self.init,
                n_init=self.n_init,
                random_state=self.random_state,
            )
        model.fit(data)
        centroid = model.cluster_centers_
        labels = model.labels_
        return centroid, labels

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        return tags


class SpectralCoclustering(BaseSpectral):
    """Spectral Co-Clustering algorithm (Dhillon, 2001) [1]_.

    Clusters rows and columns of an array `X` to solve the relaxed
    normalized cut of the bipartite graph created from `X` as follows:
    the edge between row vertex `i` and column vertex `j` has weight
    `X[i, j]`.

    The resulting bicluster structure is block-diagonal, since each
    row and each column belongs to exactly one bicluster.

    Supports sparse matrices, as long as they are nonnegative.

    Read more in the :ref:`User Guide <spectral_coclustering>`.

    Parameters
    ----------
    n_clusters : int, default=3
        The number of biclusters to find.

    svd_method : {'randomized', 'arpack'}, default='randomized'
        Selects the algorithm for finding singular vectors. May be
        'randomized' or 'arpack'. If 'randomized', use
        :func:`sklearn.utils.extmath.randomized_svd`, which may be faster
        for large matrices. If 'arpack', use
        :func:`scipy.sparse.linalg.svds`, which is more accurate, but
        possibly slower in some cases.

    n_svd_vecs : int, default=None
        Number of vectors to use in calculating the SVD. Corresponds
        to `ncv` when `svd_method='arpack'` and `n_oversamples` when
        `svd_method` is 'randomized'.

    mini_batch : bool, default=False
        Whether to use mini-batch k-means, which is faster but may get
        different results.

    init : {'k-means++', 'random'}, or ndarray of shape \
            (n_clusters, n_features), default='k-means++'
        Method for initialization of k-means algorithm; defaults to
        'k-means++'.

    n_init : int, default=10
        Number of random initializations that are tried with the
        k-means algorithm.

        If mini-batch k-means is used, the best initialization is
        chosen and the algorithm runs once. Otherwise, the algorithm
        is run for each initialization and the best solution chosen.

    random_state : int, RandomState instance, default=None
        Used for randomizing the singular value decomposition and the k-means
        initialization. Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    rows_ : array-like of shape (n_row_clusters, n_rows)
        Results of the clustering. `rows[i, r]` is True if
        cluster `i` contains row `r`. Available only after calling ``fit``.

    columns_ : array-like of shape (n_column_clusters, n_columns)
        Results of the clustering, like `rows`.

    row_labels_ : array-like of shape (n_rows,)
        The bicluster label of each row.

    column_labels_ : array-like of shape (n_cols,)
        The bicluster label of each column.

    biclusters_ : tuple of two ndarrays
        The tuple contains the `rows_` and `columns_` arrays.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    SpectralBiclustering : Partitions rows and columns under the assumption
        that the data has an underlying checkerboard structure.

    References
    ----------
    .. [1] :doi:`Dhillon, Inderjit S, 2001. Co-clustering documents and words using
        bipartite spectral graph partitioning.
        <10.1145/502512.502550>`

    Examples
    --------
    >>> from sklearn.cluster import SpectralCoclustering
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)
    >>> clustering.row_labels_ #doctest: +SKIP
    array([0, 1, 1, 0, 0, 0], dtype=int32)
    >>> clustering.column_labels_ #doctest: +SKIP
    array([0, 0], dtype=int32)
    >>> clustering
    SpectralCoclustering(n_clusters=2, random_state=0)

    For a more detailed example, see the following:
    :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_coclustering.py`.
    """

    _parameter_constraints: dict = {
        **BaseSpectral._parameter_constraints,
        "n_clusters": [Interval(Integral, 1, None, closed="left")],
    }

    def __init__(
        self,
        n_clusters=3,
        *,
        svd_method="randomized",
        n_svd_vecs=None,
        mini_batch=False,
        init="k-means++",
        n_init=10,
        random_state=None,
    ):
        super().__init__(
            n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
        )

    def _check_parameters(self, n_samples):
        if self.n_clusters > n_samples:
            raise ValueError(
                f"n_clusters should be <= n_samples={n_samples}. Got"
                f" {self.n_clusters} instead."
            )

    def _fit(self, X):
        normalized_data, row_diag, col_diag = _scale_normalize(X)
        n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
        u, v = self._svd(normalized_data, n_sv, n_discard=1)
        z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v))

        _, labels = self._k_means(z, self.n_clusters)

        n_rows = X.shape[0]
        self.row_labels_ = labels[:n_rows]
        self.column_labels_ = labels[n_rows:]

        self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)])
        self.columns_ = np.vstack(
            [self.column_labels_ == c for c in range(self.n_clusters)]
        )

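
# Note on ``SpectralCoclustering._fit`` above: following Dhillon (2001), the
# relaxed bipartite cut is solved by taking l = 1 + ceil(log2(n_clusters))
# singular vectors of A_n = D_1^{-1/2} A D_2^{-1/2} (the output of
# ``_scale_normalize``), discarding the first pair, rescaling the rest by the
# row/column scaling factors, and running k-means on the stacked result.
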
class SpectralBiclustering(BaseSpectral):
    """Spectral biclustering (Kluger, 2003) [1]_.

    Partitions rows and columns under the assumption that the data has
    an underlying checkerboard structure. For instance, if there are
    two row partitions and three column partitions, each row will
    belong to three biclusters, and each column will belong to two
    biclusters. The outer product of the corresponding row and column
    label vectors gives this checkerboard structure.

    Read more in the :ref:`User Guide <spectral_biclustering>`.

    Parameters
    ----------
    n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3
        The number of row and column clusters in the checkerboard
        structure.

    method : {'bistochastic', 'scale', 'log'}, default='bistochastic'
        Method of normalizing and converting singular vectors into
        biclusters. May be one of 'scale', 'bistochastic', or 'log'.
        The authors recommend using 'log'. If the data is sparse,
        however, log normalization will not work, which is why the
        default is 'bistochastic'.

        .. warning::
            if `method='log'`, the data must not be sparse.

    n_components : int, default=6
        Number of singular vectors to check.

    n_best : int, default=3
        Number of best singular vectors to which to project the data
        for clustering.

    svd_method : {'randomized', 'arpack'}, default='randomized'
        Selects the algorithm for finding singular vectors. May be
        'randomized' or 'arpack'. If 'randomized', uses
        :func:`~sklearn.utils.extmath.randomized_svd`, which may be faster
        for large matrices. If 'arpack', uses
        `scipy.sparse.linalg.svds`, which is more accurate, but
        possibly slower in some cases.

    n_svd_vecs : int, default=None
        Number of vectors to use in calculating the SVD. Corresponds
        to `ncv` when `svd_method='arpack'` and `n_oversamples` when
        `svd_method` is 'randomized'.

    mini_batch : bool, default=False
        Whether to use mini-batch k-means, which is faster but may get
        different results.

    init : {'k-means++', 'random'} or ndarray of shape (n_clusters, n_features), \
            default='k-means++'
        Method for initialization of k-means algorithm; defaults to
        'k-means++'.

    n_init : int, default=10
        Number of random initializations that are tried with the
        k-means algorithm.

        If mini-batch k-means is used, the best initialization is
        chosen and the algorithm runs once. Otherwise, the algorithm
        is run for each initialization and the best solution chosen.

    random_state : int, RandomState instance, default=None
        Used for randomizing the singular value decomposition and the k-means
        initialization. Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    rows_ : array-like of shape (n_row_clusters, n_rows)
        Results of the clustering. `rows[i, r]` is True if
        cluster `i` contains row `r`. Available only after calling ``fit``.

    columns_ : array-like of shape (n_column_clusters, n_columns)
        Results of the clustering, like `rows`.

    row_labels_ : array-like of shape (n_rows,)
        Row partition labels.

    column_labels_ : array-like of shape (n_cols,)
        Column partition labels.

    biclusters_ : tuple of two ndarrays
        The tuple contains the `rows_` and `columns_` arrays.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    SpectralCoclustering : Clusters rows and columns of an array `X` to solve the
        relaxed normalized cut of the bipartite graph created from `X`.

    References
    ----------

    .. [1] :doi:`Kluger, Yuval, et. al., 2003. Spectral biclustering of microarray
        data: coclustering genes and conditions.
        <10.1101/gr.648603>`

    Examples
    --------
    >>> from sklearn.cluster import SpectralBiclustering
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)
    >>> clustering.row_labels_
    array([1, 1, 1, 0, 0, 0], dtype=int32)
    >>> clustering.column_labels_
    array([1, 0], dtype=int32)
    >>> clustering
    SpectralBiclustering(n_clusters=2, random_state=0)

    For a more detailed example, see
    :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_biclustering.py`
    """

    _parameter_constraints: dict = {
        **BaseSpectral._parameter_constraints,
        "n_clusters": [Interval(Integral, 1, None, closed="left"), tuple],
        "method": [StrOptions({"bistochastic", "scale", "log"})],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "n_best": [Interval(Integral, 1, None, closed="left")],
    }

    def __init__(
        self,
        n_clusters=3,
        *,
        method="bistochastic",
        n_components=6,
        n_best=3,
        svd_method="randomized",
        n_svd_vecs=None,
        mini_batch=False,
        init="k-means++",
        n_init=10,
        random_state=None,
    ):
        super().__init__(
            n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
        )
        self.method = method
        self.n_components = n_components
        self.n_best = n_best

    def _check_parameters(self, n_samples):
        if isinstance(self.n_clusters, Integral):
            if self.n_clusters > n_samples:
                raise ValueError(
                    f"n_clusters should be <= n_samples={n_samples}. Got"
                    f" {self.n_clusters} instead."
                )
        else:  # tuple
            try:
                n_row_clusters, n_column_clusters = self.n_clusters
                check_scalar(
                    n_row_clusters,
                    "n_row_clusters",
                    target_type=Integral,
                    min_val=1,
                    max_val=n_samples,
                )
                check_scalar(
                    n_column_clusters,
                    "n_column_clusters",
                    target_type=Integral,
                    min_val=1,
                    max_val=n_samples,
                )
            except (ValueError, TypeError) as e:
                raise ValueError(
                    "Incorrect parameter n_clusters has value:"
                    f" {self.n_clusters}. It should either be a single integer"
                    " or an iterable with two integers:"
                    " (n_row_clusters, n_column_clusters),"
                    " and the values should be in the"
                    " range (1, n_samples)."
                ) from e

        if self.n_best > self.n_components:
            raise ValueError(
                f"n_best={self.n_best} must be <= n_components={self.n_components}."
            )

    def _fit(self, X):
        n_sv = self.n_components
        if self.method == "bistochastic":
            normalized_data = _bistochastic_normalize(X)
            n_sv += 1
        elif self.method == "scale":
            normalized_data, _, _ = _scale_normalize(X)
            n_sv += 1
        elif self.method == "log":
            normalized_data = _log_normalize(X)
        n_discard = 0 if self.method == "log" else 1
        u, v = self._svd(normalized_data, n_sv, n_discard)
        ut = u.T
        vt = v.T

        try:
            n_row_clusters, n_col_clusters = self.n_clusters
        except TypeError:
            n_row_clusters = n_col_clusters = self.n_clusters

        best_ut = self._fit_best_piecewise(ut, self.n_best, n_row_clusters)

        best_vt = self._fit_best_piecewise(vt, self.n_best, n_col_clusters)

        self.row_labels_ = self._project_and_cluster(X, best_vt.T, n_row_clusters)

        self.column_labels_ = self._project_and_cluster(X.T, best_ut.T, n_col_clusters)

        self.rows_ = np.vstack(
            [
                self.row_labels_ == label
                for label in range(n_row_clusters)
                for _ in range(n_col_clusters)
            ]
        )
        self.columns_ = np.vstack(
            [
                self.column_labels_ == label
                for _ in range(n_row_clusters)
                for label in range(n_col_clusters)
            ]
        )

    def _fit_best_piecewise(self, vectors, n_best, n_clusters):
        """Find the ``n_best`` vectors that are best approximated by piecewise
        constant vectors.

        The piecewise vectors are found by k-means; the best is chosen
        according to Euclidean distance.
        """

        def make_piecewise(v):
            centroid, labels = self._k_means(v.reshape(-1, 1), n_clusters)
            return centroid[labels].ravel()

        piecewise_vectors = np.apply_along_axis(make_piecewise, axis=1, arr=vectors)
        dists = np.apply_along_axis(norm, axis=1, arr=(vectors - piecewise_vectors))
        result = vectors[np.argsort(dists)[:n_best]]
        return result

    def _project_and_cluster(self, data, vectors, n_clusters):
        """Project ``data`` to ``vectors`` and cluster the result."""
        projected = safe_sparse_dot(data, vectors)
        _, labels = self._k_means(projected, n_clusters)
        return labels
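A quick sketch contrasting the two estimators above (illustrative): ``SpectralCoclustering`` assigns each row and column to exactly one bicluster, while ``SpectralBiclustering`` fits a checkerboard of row-by-column clusters.

import numpy as np
from sklearn.cluster import SpectralBiclustering
from sklearn.datasets import make_checkerboard

data, _, _ = make_checkerboard(shape=(30, 30), n_clusters=(2, 3), random_state=0)
model = SpectralBiclustering(n_clusters=(2, 3), method="log", random_state=0).fit(data)
# Expect 2 distinct row labels and 3 distinct column labels:
print(np.unique(model.row_labels_), np.unique(model.column_labels_))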
@@ -0,0 +1,730 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from math import sqrt
from numbers import Integral, Real

import numpy as np
from scipy import sparse

from sklearn._config import config_context
from sklearn.base import (
    BaseEstimator,
    ClassNamePrefixFeaturesOutMixin,
    ClusterMixin,
    TransformerMixin,
    _fit_context,
)
from sklearn.cluster import AgglomerativeClustering
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils._param_validation import Interval
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_is_fitted, validate_data


def _iterate_sparse_X(X):
    """This little hack returns a densified row when iterating over a sparse
    matrix, instead of constructing a sparse matrix for every row, which is
    expensive.
    """
    n_samples = X.shape[0]
    X_indices = X.indices
    X_data = X.data
    X_indptr = X.indptr

    for i in range(n_samples):
        row = np.zeros(X.shape[1])
        startptr, endptr = X_indptr[i], X_indptr[i + 1]
        nonzero_indices = X_indices[startptr:endptr]
        row[nonzero_indices] = X_data[startptr:endptr]
        yield row


def _split_node(node, threshold, branching_factor):
    """The node has to be split if there is no place for a new subcluster
    in the node.
    1. Two empty nodes and two empty subclusters are initialized.
    2. The pair of most distant subclusters is found.
    3. The properties of the empty subclusters and nodes are updated
       according to the nearest distance between the subclusters to the
       pair of distant subclusters.
    4. The two nodes are set as children to the two subclusters.
    """
    new_subcluster1 = _CFSubcluster()
    new_subcluster2 = _CFSubcluster()
    new_node1 = _CFNode(
        threshold=threshold,
        branching_factor=branching_factor,
        is_leaf=node.is_leaf,
        n_features=node.n_features,
        dtype=node.init_centroids_.dtype,
    )
    new_node2 = _CFNode(
        threshold=threshold,
        branching_factor=branching_factor,
        is_leaf=node.is_leaf,
        n_features=node.n_features,
        dtype=node.init_centroids_.dtype,
    )
    new_subcluster1.child_ = new_node1
    new_subcluster2.child_ = new_node2

    if node.is_leaf:
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = new_node1
        new_node1.prev_leaf_ = node.prev_leaf_
        new_node1.next_leaf_ = new_node2
        new_node2.prev_leaf_ = new_node1
        new_node2.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = new_node2

    dist = euclidean_distances(
        node.centroids_, Y_norm_squared=node.squared_norm_, squared=True
    )
    n_clusters = dist.shape[0]

    farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))
    node1_dist, node2_dist = dist[(farthest_idx,)]

    node1_closer = node1_dist < node2_dist
    # Make sure node1 is closest to itself even if all distances are equal.
    # This can only happen when all node.centroids_ are duplicates, leading to
    # all distances between centroids being zero.
    node1_closer[farthest_idx[0]] = True

    for idx, subcluster in enumerate(node.subclusters_):
        if node1_closer[idx]:
            new_node1.append_subcluster(subcluster)
            new_subcluster1.update(subcluster)
        else:
            new_node2.append_subcluster(subcluster)
            new_subcluster2.update(subcluster)
    return new_subcluster1, new_subcluster2


class _CFNode:
    """Each node in a CFTree is called a CFNode.

    The CFNode can have a maximum of branching_factor
    number of CFSubclusters.

    Parameters
    ----------
    threshold : float
        Threshold needed for a new subcluster to enter a CFSubcluster.

    branching_factor : int
        Maximum number of CF subclusters in each node.

    is_leaf : bool
        We need to know if the CFNode is a leaf or not, in order to
        retrieve the final subclusters.

    n_features : int
        The number of features.

    Attributes
    ----------
    subclusters_ : list
        List of subclusters for a particular CFNode.

    prev_leaf_ : _CFNode
        Previous leaf. Useful only if is_leaf is True.

    next_leaf_ : _CFNode
        Next leaf. Useful only if is_leaf is True.

    init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
        Manipulate ``init_centroids_`` throughout rather than ``centroids_``,
        since the centroids are just a view of the ``init_centroids_``.

    init_sq_norm_ : ndarray of shape (branching_factor + 1,)
        Manipulate ``init_sq_norm_`` throughout; similar to ``init_centroids_``.

    centroids_ : ndarray of shape (branching_factor + 1, n_features)
        View of ``init_centroids_``.

    squared_norm_ : ndarray of shape (branching_factor + 1,)
        View of ``init_sq_norm_``.
    """

    def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.is_leaf = is_leaf
        self.n_features = n_features

        # The list of subclusters, centroids and squared norms
        # to manipulate throughout.
        self.subclusters_ = []
        self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
        self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype)
        self.squared_norm_ = []
        self.prev_leaf_ = None
        self.next_leaf_ = None

    def append_subcluster(self, subcluster):
        n_samples = len(self.subclusters_)
        self.subclusters_.append(subcluster)
        self.init_centroids_[n_samples] = subcluster.centroid_
        self.init_sq_norm_[n_samples] = subcluster.sq_norm_

        # Keep centroids_ and squared_norm_ as views. In this way, it is
        # sufficient to update init_centroids_ and init_sq_norm_.
        self.centroids_ = self.init_centroids_[: n_samples + 1, :]
        self.squared_norm_ = self.init_sq_norm_[: n_samples + 1]

    def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
        """Remove a subcluster from a node and update it with the
        split subclusters.
        """
        ind = self.subclusters_.index(subcluster)
        self.subclusters_[ind] = new_subcluster1
        self.init_centroids_[ind] = new_subcluster1.centroid_
        self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
        self.append_subcluster(new_subcluster2)

    def insert_cf_subcluster(self, subcluster):
        """Insert a new subcluster into the node."""
        if not self.subclusters_:
            self.append_subcluster(subcluster)
            return False

        threshold = self.threshold
        branching_factor = self.branching_factor
        # We need to find the closest subcluster among all the
        # subclusters so that we can insert our new subcluster.
        dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
        dist_matrix *= -2.0
        dist_matrix += self.squared_norm_
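        # dist_matrix now holds ||c_j||^2 - 2 <c_j, x>, which differs from the
        # true squared distance ||c_j - x||^2 only by the constant ||x||^2, so
        # the argmin below is unchanged.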
closest_index = np.argmin(dist_matrix)
|
||||
closest_subcluster = self.subclusters_[closest_index]
|
||||
|
||||
# If the subcluster has a child, we need a recursive strategy.
|
||||
if closest_subcluster.child_ is not None:
|
||||
split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster)
|
||||
|
||||
if not split_child:
|
||||
# If it is determined that the child need not be split, we
|
||||
# can just update the closest_subcluster
|
||||
closest_subcluster.update(subcluster)
|
||||
self.init_centroids_[closest_index] = self.subclusters_[
|
||||
closest_index
|
||||
].centroid_
|
||||
self.init_sq_norm_[closest_index] = self.subclusters_[
|
||||
closest_index
|
||||
].sq_norm_
|
||||
return False
|
||||
|
||||
# things not too good. we need to redistribute the subclusters in
|
||||
# our child node, and add a new subcluster in the parent
|
||||
# subcluster to accommodate the new child.
|
||||
else:
|
||||
new_subcluster1, new_subcluster2 = _split_node(
|
||||
closest_subcluster.child_,
|
||||
threshold,
|
||||
branching_factor,
|
||||
)
|
||||
self.update_split_subclusters(
|
||||
closest_subcluster, new_subcluster1, new_subcluster2
|
||||
)
|
||||
|
||||
if len(self.subclusters_) > self.branching_factor:
|
||||
return True
|
||||
return False
|
||||
|
||||
# good to go!
|
||||
else:
|
||||
merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
|
||||
if merged:
|
||||
self.init_centroids_[closest_index] = closest_subcluster.centroid_
|
||||
self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
|
||||
return False
|
||||
|
||||
# not close to any other subclusters, and we still
|
||||
# have space, so add.
|
||||
elif len(self.subclusters_) < self.branching_factor:
|
||||
self.append_subcluster(subcluster)
|
||||
return False
|
||||
|
||||
# We do not have enough space nor is it closer to an
|
||||
# other subcluster. We need to split.
|
||||
else:
|
||||
self.append_subcluster(subcluster)
|
||||
return True
|
||||
|
||||
|
||||


class _CFSubcluster:
    """Each subcluster in a CFNode is called a CFSubcluster.

    A CFSubcluster can have a CFNode as its child.

    Parameters
    ----------
    linear_sum : ndarray of shape (n_features,), default=None
        Sample. This is kept optional to allow initialization of empty
        subclusters.

    Attributes
    ----------
    n_samples_ : int
        Number of samples that belong to each subcluster.

    linear_sum_ : ndarray
        Linear sum of all the samples in a subcluster. Prevents holding
        all sample data in memory.

    squared_sum_ : float
        Sum of the squared l2 norms of all samples belonging to a subcluster.

    centroid_ : ndarray of shape (branching_factor + 1, n_features)
        Centroid of the subcluster. Prevents recomputing of centroids when
        ``CFNode.centroids_`` is called.

    child_ : _CFNode
        Child node of the subcluster. Once a given _CFNode is set as the child
        of the _CFNode, it is set to ``self.child_``.

    sq_norm_ : ndarray of shape (branching_factor + 1,)
        Squared norm of the subcluster. Used to prevent recomputing when
        pairwise minimum distances are computed.
    """

    def __init__(self, *, linear_sum=None):
        if linear_sum is None:
            self.n_samples_ = 0
            self.squared_sum_ = 0.0
            self.centroid_ = self.linear_sum_ = 0
        else:
            self.n_samples_ = 1
            self.centroid_ = self.linear_sum_ = linear_sum
            self.squared_sum_ = self.sq_norm_ = np.dot(
                self.linear_sum_, self.linear_sum_
            )
        self.child_ = None

    def update(self, subcluster):
        self.n_samples_ += subcluster.n_samples_
        self.linear_sum_ += subcluster.linear_sum_
        self.squared_sum_ += subcluster.squared_sum_
        self.centroid_ = self.linear_sum_ / self.n_samples_
        self.sq_norm_ = np.dot(self.centroid_, self.centroid_)

    def merge_subcluster(self, nominee_cluster, threshold):
        """Check if a cluster is worthy enough to be merged. If
        yes, then merge.
        """
        new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
        new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
        new_n = self.n_samples_ + nominee_cluster.n_samples_
        new_centroid = (1 / new_n) * new_ls
        new_sq_norm = np.dot(new_centroid, new_centroid)

        # The squared radius of the cluster is defined:
        #     r^2 = sum_i ||x_i - c||^2 / n
        # with x_i the n points assigned to the cluster and c its centroid:
        #     c = sum_i x_i / n
        # This can be expanded to:
        #     r^2 = sum_i ||x_i||^2 / n - 2 <sum_i x_i / n, c> + n ||c||^2 / n
        # and therefore simplifies to:
        #     r^2 = sum_i ||x_i||^2 / n - ||c||^2
        sq_radius = new_ss / new_n - new_sq_norm

        if sq_radius <= threshold**2:
            (
                self.n_samples_,
                self.linear_sum_,
                self.squared_sum_,
                self.centroid_,
                self.sq_norm_,
            ) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm)
            return True
        return False

    @property
    def radius(self):
        """Return the radius of the subcluster."""
        # Because of numerical issues, this could become negative.
        sq_radius = self.squared_sum_ / self.n_samples_ - self.sq_norm_
        return sqrt(max(0, sq_radius))
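

# A minimal illustrative sketch, not part of the library source: the radius
# computations above rely on the identity
#     r^2 = sum_i ||x_i - c||^2 / n = sum_i ||x_i||^2 / n - ||c||^2,
# which is what lets a subcluster track only (n, linear sum, squared sum)
# instead of its member points. The helper below (a hypothetical name) checks
# the identity numerically on random data; only numpy is assumed.
def _demo_cf_radius_identity():
    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(10, 4))
    n = X.shape[0]
    centroid = X.mean(axis=0)
    direct = ((X - centroid) ** 2).sum() / n
    # Same quantity computed only from the clustering-feature statistics.
    from_cf = (X**2).sum() / n - centroid @ centroid
    assert np.isclose(direct, from_cf)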


class Birch(
    ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
):
    """Implements the BIRCH clustering algorithm.

    It is a memory-efficient, online-learning algorithm provided as an
    alternative to :class:`MiniBatchKMeans`. It constructs a tree
    data structure with the cluster centroids being read off the leaves.
    These can be either the final cluster centroids or can be provided as input
    to another clustering algorithm such as :class:`AgglomerativeClustering`.

    Read more in the :ref:`User Guide <birch>`.

    .. versionadded:: 0.16

    Parameters
    ----------
    threshold : float, default=0.5
        The radius of the subcluster obtained by merging a new sample and the
        closest subcluster should be less than the threshold. Otherwise a new
        subcluster is started. Setting this value very low promotes
        splitting, and vice-versa.

    branching_factor : int, default=50
        Maximum number of CF subclusters in each node. If a new sample enters
        such that the number of subclusters exceeds the branching_factor, then
        that node is split into two nodes with the subclusters redistributed
        in each. The parent subcluster of that node is removed and two new
        subclusters are added as parents of the 2 split nodes.

    n_clusters : int, instance of sklearn.cluster model or None, default=3
        Number of clusters after the final clustering step, which treats the
        subclusters from the leaves as new samples.

        - `None` : the final clustering step is not performed and the
          subclusters are returned as they are.

        - :mod:`sklearn.cluster` Estimator : If a model is provided, the model
          is fit treating the subclusters as new samples and the initial data
          is mapped to the label of the closest subcluster.

        - `int` : the model fit is :class:`AgglomerativeClustering` with
          `n_clusters` set to be equal to the int.

    compute_labels : bool, default=True
        Whether or not to compute labels for each fit.

    Attributes
    ----------
    root_ : _CFNode
        Root of the CFTree.

    dummy_leaf_ : _CFNode
        Start pointer to all the leaves.

    subcluster_centers_ : ndarray
        Centroids of all subclusters read directly from the leaves.

    subcluster_labels_ : ndarray
        Labels assigned to the centroids of the subclusters after
        they are clustered globally.

    labels_ : ndarray of shape (n_samples,)
        Array of labels assigned to the input data.
        If ``partial_fit`` is used instead of ``fit``, they are assigned to
        the last batch of data.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    MiniBatchKMeans : Alternative implementation that does incremental updates
        of the centers' positions using mini-batches.

    Notes
    -----
    The tree data structure consists of nodes, with each node consisting of
    a number of subclusters. The maximum number of subclusters in a node
    is determined by the branching factor. Each subcluster maintains a
    linear sum, squared sum and the number of samples in that subcluster.
    In addition, each subcluster can also have a node as its child, if the
    subcluster is not a member of a leaf node.

    For a new point entering the root, it is merged with the subcluster closest
    to it, and the linear sum, squared sum and the number of samples of that
    subcluster are updated. This is done recursively till the properties of
    the leaf node are updated.

    See :ref:`sphx_glr_auto_examples_cluster_plot_birch_vs_minibatchkmeans.py` for a
    comparison with :class:`~sklearn.cluster.MiniBatchKMeans`.

    References
    ----------
    * Tian Zhang, Raghu Ramakrishnan, Miron Livny
      BIRCH: An efficient data clustering method for large databases.
      https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf

    * Roberto Perdisci
      JBirch - Java implementation of BIRCH clustering algorithm
      https://code.google.com/archive/p/jbirch

    Examples
    --------
    >>> from sklearn.cluster import Birch
    >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
    >>> brc = Birch(n_clusters=None)
    >>> brc.fit(X)
    Birch(n_clusters=None)
    >>> brc.predict(X)
    array([0, 0, 0, 1, 1, 1])

    For a comparison of the BIRCH clustering algorithm with other clustering
    algorithms, see
    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
    """

    _parameter_constraints: dict = {
        "threshold": [Interval(Real, 0.0, None, closed="neither")],
        "branching_factor": [Interval(Integral, 1, None, closed="neither")],
        "n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")],
        "compute_labels": ["boolean"],
    }

    def __init__(
        self,
        *,
        threshold=0.5,
        branching_factor=50,
        n_clusters=3,
        compute_labels=True,
    ):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.compute_labels = compute_labels

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Build a CF Tree for the input data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        return self._fit(X, partial=False)

    def _fit(self, X, partial):
        has_root = getattr(self, "root_", None)
        first_call = not (partial and has_root)

        X = validate_data(
            self,
            X,
            accept_sparse="csr",
            reset=first_call,
            dtype=[np.float64, np.float32],
        )
        threshold = self.threshold
        branching_factor = self.branching_factor

        n_samples, n_features = X.shape

        # If partial_fit is called for the first time, or fit is called, we
        # start a new tree.
        if first_call:
            # The first root is the leaf. Manipulate this object throughout.
            self.root_ = _CFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=X.dtype,
            )

            # To enable getting back subclusters.
            self.dummy_leaf_ = _CFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=X.dtype,
            )
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Cannot vectorize. Enough to convince us to use Cython.
        if not sparse.issparse(X):
            iter_func = iter
        else:
            iter_func = _iterate_sparse_X

        for sample in iter_func(X):
            subcluster = _CFSubcluster(linear_sum=sample)
            split = self.root_.insert_cf_subcluster(subcluster)

            if split:
                new_subcluster1, new_subcluster2 = _split_node(
                    self.root_, threshold, branching_factor
                )
                del self.root_
                self.root_ = _CFNode(
                    threshold=threshold,
                    branching_factor=branching_factor,
                    is_leaf=False,
                    n_features=n_features,
                    dtype=X.dtype,
                )
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

        centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
        self.subcluster_centers_ = centroids
        self._n_features_out = self.subcluster_centers_.shape[0]

        self._global_clustering(X)
        return self

    def _get_leaves(self):
        """
        Retrieve the leaves of the CF Node.

        Returns
        -------
        leaves : list of shape (n_leaves,)
            List of the leaf nodes.
        """
        leaf_ptr = self.dummy_leaf_.next_leaf_
        leaves = []
        while leaf_ptr is not None:
            leaves.append(leaf_ptr)
            leaf_ptr = leaf_ptr.next_leaf_
        return leaves

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X=None, y=None):
        """
        Online learning. Prevents rebuilding the CFTree from scratch.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), \
                default=None
            Input data. If X is not provided, only the global clustering
            step is done.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        if X is None:
            # Perform just the final global clustering step.
            self._global_clustering()
            return self
        else:
            return self._fit(X, partial=True)

    def predict(self, X):
        """
        Predict data using the ``centroids_`` of subclusters.

        Avoids computation of the row norms of X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Labelled data.
        """
        check_is_fitted(self)
        X = validate_data(self, X, accept_sparse="csr", reset=False)
        return self._predict(X)

    def _predict(self, X):
        """Predict data using the ``centroids_`` of subclusters."""
        kwargs = {"Y_norm_squared": self._subcluster_norms}

        with config_context(assume_finite=True):
            argmin = pairwise_distances_argmin(
                X, self.subcluster_centers_, metric_kwargs=kwargs
            )
        return self.subcluster_labels_[argmin]

    def transform(self, X):
        """
        Transform X into the subcluster-centroid dimension.

        Each dimension represents the distance from the sample point to each
        cluster centroid.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
            Transformed data.
        """
        check_is_fitted(self)
        X = validate_data(self, X, accept_sparse="csr", reset=False)
        with config_context(assume_finite=True):
            return euclidean_distances(X, self.subcluster_centers_)

    def _global_clustering(self, X=None):
        """
        Global clustering for the subclusters obtained after fitting.
        """
        clusterer = self.n_clusters
        centroids = self.subcluster_centers_
        compute_labels = (X is not None) and self.compute_labels

        # Preprocessing for the global clustering.
        not_enough_centroids = False
        if isinstance(clusterer, Integral):
            clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
            # There is no need to perform the global clustering step.
            if len(centroids) < self.n_clusters:
                not_enough_centroids = True

        # To use in predict to avoid recalculation.
        self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)

        if clusterer is None or not_enough_centroids:
            self.subcluster_labels_ = np.arange(len(centroids))
            if not_enough_centroids:
                warnings.warn(
                    "Number of subclusters found (%d) by BIRCH is less "
                    "than the requested number of clusters (%d). Decrease "
                    "the threshold." % (len(centroids), self.n_clusters),
                    ConvergenceWarning,
                )
        else:
            # The global clustering step that clusters the subclusters of
            # the leaves. It treats the centroids of the subclusters as
            # samples and finds the final centroids.
            self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)

        if compute_labels:
            self.labels_ = self._predict(X)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.transformer_tags.preserves_dtype = ["float64", "float32"]
        tags.input_tags.sparse = True
        return tags
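

# A minimal usage sketch, not part of the library source: Birch accepts data
# in batches via ``partial_fit``, and calling ``partial_fit`` with no data
# afterwards reruns only the global clustering step over the accumulated
# subclusters. The function name and values below are illustrative only.
def _demo_birch_streaming():
    import numpy as np

    rng = np.random.default_rng(0)
    brc = Birch(n_clusters=3)
    for _ in range(5):  # five batches of streaming data
        batch = rng.normal(size=(100, 2))
        brc.partial_fit(batch)
    brc.partial_fit()  # X=None: redo only the final global clustering step
    return brc.subcluster_centers_.shape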
@@ -0,0 +1,543 @@
"""Bisecting K-means clustering."""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings

import numpy as np
import scipy.sparse as sp

from sklearn.base import _fit_context
from sklearn.cluster._k_means_common import _inertia_dense, _inertia_sparse
from sklearn.cluster._kmeans import (
    _BaseKMeans,
    _kmeans_single_elkan,
    _kmeans_single_lloyd,
    _labels_inertia_threadpool_limit,
)
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils._param_validation import Integral, Interval, StrOptions
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import (
    _check_sample_weight,
    check_is_fitted,
    check_random_state,
    validate_data,
)


class _BisectingTree:
    """Tree structure representing the hierarchical clusters of BisectingKMeans."""

    def __init__(self, center, indices, score):
        """Create a new cluster node in the tree.

        The node holds the center of this cluster and the indices of the data points
        that belong to it.
        """
        self.center = center
        self.indices = indices
        self.score = score

        self.left = None
        self.right = None

    def split(self, labels, centers, scores):
        """Split the cluster node into two subclusters."""
        self.left = _BisectingTree(
            indices=self.indices[labels == 0], center=centers[0], score=scores[0]
        )
        self.right = _BisectingTree(
            indices=self.indices[labels == 1], center=centers[1], score=scores[1]
        )

        # Reset the indices attribute to save memory.
        self.indices = None

    def get_cluster_to_bisect(self):
        """Return the cluster node to bisect next.

        It's based on the score of the cluster, which can be either the number of
        data points assigned to that cluster or the inertia of that cluster
        (see `bisecting_strategy` for details).
        """
        max_score = None

        for cluster_leaf in self.iter_leaves():
            if max_score is None or cluster_leaf.score > max_score:
                max_score = cluster_leaf.score
                best_cluster_leaf = cluster_leaf

        return best_cluster_leaf

    def iter_leaves(self):
        """Iterate over all the cluster leaves in the tree."""
        if self.left is None:
            yield self
        else:
            yield from self.left.iter_leaves()
            yield from self.right.iter_leaves()
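

# A minimal illustrative sketch, not part of the library source: the tree
# above always bisects the leaf with the highest score (cluster inertia or
# cluster size, depending on ``bisecting_strategy``). The helper below (a
# hypothetical name) builds a tiny tree by hand and shows that
# ``get_cluster_to_bisect`` returns the highest-scoring leaf; the
# module-level numpy import is assumed.
def _demo_bisecting_tree_choice():
    root = _BisectingTree(center=None, indices=np.arange(6), score=0)
    root.split(
        labels=np.array([0, 0, 0, 1, 1, 1]),
        centers=np.array([[0.0], [1.0]]),
        scores=np.array([2.0, 5.0]),
    )
    # The right leaf has the larger score (5.0 > 2.0), so it is bisected next.
    assert root.get_cluster_to_bisect() is root.right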


class BisectingKMeans(_BaseKMeans):
    """Bisecting K-Means clustering.

    Read more in the :ref:`User Guide <bisect_k_means>`.

    .. versionadded:: 1.1

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form as well as the number of
        centroids to generate.

    init : {'k-means++', 'random'} or callable, default='random'
        Method for initialization:

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose `n_clusters` observations (rows) at random from data
        for the initial centroids.

        If a callable is passed, it should take arguments X, n_clusters and a
        random state and return an initialization.

    n_init : int, default=1
        Number of times the inner k-means algorithm will be run with different
        centroid seeds in each bisection.
        Each bisection keeps the best output of `n_init` consecutive runs
        in terms of inertia.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for centroid initialization
        in the inner K-Means. Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    max_iter : int, default=300
        Maximum number of iterations of the inner k-means algorithm at each
        bisection.

    verbose : int, default=0
        Verbosity mode.

    tol : float, default=1e-4
        Relative tolerance with regards to the Frobenius norm of the difference
        in the cluster centers of two consecutive iterations to declare
        convergence. Used in the inner k-means algorithm at each bisection to
        pick the best possible clusters.

    copy_x : bool, default=True
        When pre-computing distances it is more numerically accurate to center
        the data first. If copy_x is True (default), then the original data is
        not modified. If False, the original data is modified, and put back
        before the function returns, but small numerical differences may be
        introduced by subtracting and then adding the data mean. Note that if
        the original data is not C-contiguous, a copy will be made even if
        copy_x is False. If the original data is sparse, but not in CSR format,
        a copy will be made even if copy_x is False.

    algorithm : {"lloyd", "elkan"}, default="lloyd"
        Inner K-means algorithm used in bisection.
        The classical EM-style algorithm is `"lloyd"`.
        The `"elkan"` variation can be more efficient on some datasets with
        well-defined clusters, by using the triangle inequality. However it's
        more memory intensive due to the allocation of an extra array of shape
        `(n_samples, n_clusters)`.

    bisecting_strategy : {"biggest_inertia", "largest_cluster"},\
            default="biggest_inertia"
        Defines how bisection should be performed:

        - "biggest_inertia" means that BisectingKMeans will always check
          all calculated clusters for the cluster with the biggest SSE
          (Sum of Squared Errors) and bisect it. This approach concentrates on
          precision, but may be costly in terms of execution time (especially
          for larger numbers of data points).

        - "largest_cluster" - BisectingKMeans will always split the cluster
          with the largest number of points assigned to it from all clusters
          previously calculated. That should work faster than picking by SSE
          ('biggest_inertia') and may produce similar results in most cases.

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers. If the algorithm stops before fully
        converging (see ``tol`` and ``max_iter``), these will not be
        consistent with ``labels_``.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    inertia_ : float
        Sum of squared distances of samples to their closest cluster center,
        weighted by the sample weights if provided.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    See Also
    --------
    KMeans : Original implementation of the K-Means algorithm.

    Notes
    -----
    It might be inefficient when n_clusters is less than 3, due to unnecessary
    calculations for that case.

    Examples
    --------
    >>> from sklearn.cluster import BisectingKMeans
    >>> import numpy as np
    >>> X = np.array([[1, 1], [10, 1], [3, 1],
    ...               [10, 0], [2, 1], [10, 2],
    ...               [10, 8], [10, 9], [10, 10]])
    >>> bisect_means = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
    >>> bisect_means.labels_
    array([0, 2, 0, 2, 0, 2, 1, 1, 1], dtype=int32)
    >>> bisect_means.predict([[0, 0], [12, 3]])
    array([0, 2], dtype=int32)
    >>> bisect_means.cluster_centers_
    array([[ 2.,  1.],
           [10.,  9.],
           [10.,  1.]])

    For a comparison between BisectingKMeans and K-Means refer to the example
    :ref:`sphx_glr_auto_examples_cluster_plot_bisect_kmeans.py`.
    """

    _parameter_constraints: dict = {
        **_BaseKMeans._parameter_constraints,
        "init": [StrOptions({"k-means++", "random"}), callable],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "copy_x": ["boolean"],
        "algorithm": [StrOptions({"lloyd", "elkan"})],
        "bisecting_strategy": [StrOptions({"biggest_inertia", "largest_cluster"})],
    }

    def __init__(
        self,
        n_clusters=8,
        *,
        init="random",
        n_init=1,
        random_state=None,
        max_iter=300,
        verbose=0,
        tol=1e-4,
        copy_x=True,
        algorithm="lloyd",
        bisecting_strategy="biggest_inertia",
    ):
        super().__init__(
            n_clusters=n_clusters,
            init=init,
            max_iter=max_iter,
            verbose=verbose,
            random_state=random_state,
            tol=tol,
            n_init=n_init,
        )

        self.copy_x = copy_x
        self.algorithm = algorithm
        self.bisecting_strategy = bisecting_strategy

    def _warn_mkl_vcomp(self, n_active_threads):
        """Warn when vcomp and mkl are both present."""
        warnings.warn(
            "BisectingKMeans is known to have a memory leak on Windows "
            "with MKL, when there are fewer chunks than available "
            "threads. You can avoid it by setting the environment"
            f" variable OMP_NUM_THREADS={n_active_threads}."
        )

    def _inertia_per_cluster(self, X, centers, labels, sample_weight):
        """Calculate the sum of squared errors (inertia) per cluster.

        Parameters
        ----------
        X : {ndarray, csr_matrix} of shape (n_samples, n_features)
            The input samples.

        centers : ndarray of shape (n_clusters=2, n_features)
            The cluster centers.

        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.

        sample_weight : ndarray of shape (n_samples,)
            The weights for each observation in X.

        Returns
        -------
        inertia_per_cluster : ndarray of shape (n_clusters=2,)
            Sum of squared errors (inertia) for each cluster.
        """
        n_clusters = centers.shape[0]  # = 2 since centers comes from a bisection
        _inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense

        inertia_per_cluster = np.empty(n_clusters)
        for label in range(n_clusters):
            inertia_per_cluster[label] = _inertia(
                X, sample_weight, centers, labels, self._n_threads, single_label=label
            )

        return inertia_per_cluster

    def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect):
        """Split a cluster into 2 subclusters.

        Parameters
        ----------
        X : {ndarray, csr_matrix} of shape (n_samples, n_features)
            Training instances to cluster.

        x_squared_norms : ndarray of shape (n_samples,)
            Squared euclidean norm of each data point.

        sample_weight : ndarray of shape (n_samples,)
            The weights for each observation in X.

        cluster_to_bisect : _BisectingTree node object
            The cluster node to split.
        """
        X = X[cluster_to_bisect.indices]
        x_squared_norms = x_squared_norms[cluster_to_bisect.indices]
        sample_weight = sample_weight[cluster_to_bisect.indices]

        best_inertia = None

        # Split samples in X into 2 clusters.
        # Repeat `n_init` times to obtain the best clusters.
        for _ in range(self.n_init):
            centers_init = self._init_centroids(
                X,
                x_squared_norms=x_squared_norms,
                init=self.init,
                random_state=self._random_state,
                n_centroids=2,
                sample_weight=sample_weight,
            )

            labels, inertia, centers, _ = self._kmeans_single(
                X,
                sample_weight,
                centers_init,
                max_iter=self.max_iter,
                verbose=self.verbose,
                tol=self.tol,
                n_threads=self._n_threads,
            )

            # Allow a small tolerance on the inertia to accommodate
            # non-deterministic rounding errors due to parallel computation.
            if best_inertia is None or inertia < best_inertia * (1 - 1e-6):
                best_labels = labels
                best_centers = centers
                best_inertia = inertia

        if self.verbose:
            print(f"New centroids from bisection: {best_centers}")

        if self.bisecting_strategy == "biggest_inertia":
            scores = self._inertia_per_cluster(
                X, best_centers, best_labels, sample_weight
            )
        else:  # bisecting_strategy == "largest_cluster"
            # Using minlength to make sure that we have the counts for both labels even
            # if all samples are labelled 0.
            scores = np.bincount(best_labels, minlength=2)

        cluster_to_bisect.split(best_labels, best_centers, scores)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """Compute bisecting k-means clustering.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training instances to cluster.

            .. note:: The data will be converted to C ordering,
                which will cause a memory copy
                if the given data is not C-contiguous.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like of shape (n_samples,), default=None
            The weights for each observation in X. If None, all observations
            are assigned equal weight. `sample_weight` is not used during
            initialization if `init` is a callable.

        Returns
        -------
        self
            Fitted estimator.
        """
        X = validate_data(
            self,
            X,
            accept_sparse="csr",
            dtype=[np.float64, np.float32],
            order="C",
            copy=self.copy_x,
            accept_large_sparse=False,
        )

        self._check_params_vs_input(X)

        self._random_state = check_random_state(self.random_state)
        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
        self._n_threads = _openmp_effective_n_threads()

        if self.algorithm == "lloyd" or self.n_clusters == 1:
            self._kmeans_single = _kmeans_single_lloyd
            self._check_mkl_vcomp(X, X.shape[0])
        else:
            self._kmeans_single = _kmeans_single_elkan

        # Subtract the mean of X for more accurate distance computations.
        if not sp.issparse(X):
            self._X_mean = X.mean(axis=0)
            X -= self._X_mean

        # Initialize the hierarchical clusters tree.
        self._bisecting_tree = _BisectingTree(
            indices=np.arange(X.shape[0]),
            center=X.mean(axis=0),
            score=0,
        )

        x_squared_norms = row_norms(X, squared=True)

        for _ in range(self.n_clusters - 1):
            # Choose the cluster to bisect.
            cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect()

            # Split this cluster into 2 subclusters.
            self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect)

        # Aggregate final labels and centers from the bisecting tree.
        self.labels_ = np.full(X.shape[0], -1, dtype=np.int32)
        self.cluster_centers_ = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)

        for i, cluster_node in enumerate(self._bisecting_tree.iter_leaves()):
            self.labels_[cluster_node.indices] = i
            self.cluster_centers_[i] = cluster_node.center
            cluster_node.label = i  # label final clusters for future prediction
            cluster_node.indices = None  # release memory

        # Restore the original data.
        if not sp.issparse(X):
            X += self._X_mean
            self.cluster_centers_ += self._X_mean

        _inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense
        self.inertia_ = _inertia(
            X, sample_weight, self.cluster_centers_, self.labels_, self._n_threads
        )

        self._n_features_out = self.cluster_centers_.shape[0]

        return self

    def predict(self, X):
        """Predict which cluster each sample in X belongs to.

        Prediction is made by going down the hierarchical tree,
        searching for the closest leaf cluster.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to predict.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self)

        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)

        # Sample weights are unused but necessary in the Cython helpers.
        sample_weight = np.ones_like(x_squared_norms)

        labels = self._predict_recursive(X, sample_weight, self._bisecting_tree)

        return labels

    def _predict_recursive(self, X, sample_weight, cluster_node):
        """Predict recursively by going down the hierarchical tree.

        Parameters
        ----------
        X : {ndarray, csr_matrix} of shape (n_samples, n_features)
            The data points, currently assigned to `cluster_node`, to predict between
            the subclusters of this node.

        sample_weight : ndarray of shape (n_samples,)
            The weights for each observation in X.

        cluster_node : _BisectingTree node object
            The cluster node of the hierarchical tree.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        if cluster_node.left is None:
            # This cluster has no subcluster. Labels are just the label of the cluster.
            return np.full(X.shape[0], cluster_node.label, dtype=np.int32)

        # Determine if data points belong to the left or right subcluster.
        centers = np.vstack((cluster_node.left.center, cluster_node.right.center))
        if hasattr(self, "_X_mean"):
            centers += self._X_mean

        cluster_labels = _labels_inertia_threadpool_limit(
            X,
            sample_weight,
            centers,
            self._n_threads,
            return_inertia=False,
        )
        mask = cluster_labels == 0

        # Compute the labels for each subset of the data points.
        labels = np.full(X.shape[0], -1, dtype=np.int32)

        labels[mask] = self._predict_recursive(
            X[mask], sample_weight[mask], cluster_node.left
        )

        labels[~mask] = self._predict_recursive(
            X[~mask], sample_weight[~mask], cluster_node.right
        )

        return labels

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        tags.transformer_tags.preserves_dtype = ["float64", "float32"]
        return tags
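

# A minimal usage sketch, not part of the library source: the two
# ``bisecting_strategy`` options trade accuracy for speed. "biggest_inertia"
# rescans every leaf for the largest SSE before each split, while
# "largest_cluster" simply splits the most populous leaf. The function name
# and values below are illustrative only.
def _demo_bisecting_strategies():
    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 2))
    for strategy in ("biggest_inertia", "largest_cluster"):
        model = BisectingKMeans(
            n_clusters=4, bisecting_strategy=strategy, random_state=0
        ).fit(X)
        # Prediction descends the bisecting tree rather than scanning all
        # centers, so it stays consistent with the hierarchy built in fit.
        assert model.predict(X[:5]).shape == (5,)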
@@ -0,0 +1,512 @@
"""
DBSCAN: Density-Based Spatial Clustering of Applications with Noise
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from numbers import Integral, Real

import numpy as np
from scipy import sparse

from sklearn.base import BaseEstimator, ClusterMixin, _fit_context
from sklearn.cluster._dbscan_inner import dbscan_inner
from sklearn.metrics.pairwise import _VALID_METRICS
from sklearn.neighbors import NearestNeighbors
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
from sklearn.utils.validation import _check_sample_weight, validate_data


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "sample_weight": ["array-like", None],
    },
    prefer_skip_nested_validation=False,
)
def dbscan(
    X,
    eps=0.5,
    *,
    min_samples=5,
    metric="minkowski",
    metric_params=None,
    algorithm="auto",
    leaf_size=30,
    p=2,
    sample_weight=None,
    n_jobs=None,
):
    """Perform DBSCAN clustering from vector array or distance matrix.

    This function is a wrapper around :class:`~cluster.DBSCAN`, suitable for
    quick, standalone clustering tasks. For estimator-based workflows, where
    estimator attributes or pipeline integration are required, prefer
    :class:`~cluster.DBSCAN`.

    DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a
    density-based clustering algorithm that groups together points that are
    closely packed while marking points in low-density regions as outliers.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : {array-like, scipy sparse matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``. When using precomputed distances, X must
        be a square symmetric matrix.

    eps : float, default=0.5
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function. Smaller values result in more clusters,
        while larger values result in fewer, larger clusters.

    min_samples : int, default=5
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        Higher values yield fewer, denser clusters, while lower values yield
        more, sparser clusters.

    metric : str or callable, default='minkowski'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit.
        X may be a :term:`sparse graph <sparse graph>`,
        in which case only "nonzero" elements may be considered neighbors.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        'auto' will attempt to decide the most appropriate algorithm
        based on the values passed to the :meth:`fit` method.
        See :class:`~sklearn.neighbors.NearestNeighbors` documentation for
        details.

    leaf_size : int, default=30
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem. Generally, smaller leaf sizes
        lead to faster queries but slower construction.

    p : float, default=2
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        This parameter is expected to be positive.

    sample_weight : array-like of shape (n_samples,), default=None
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int, default=None
        The number of parallel jobs to run for the neighbors search. ``None``
        means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1``
        means using all processors. See :term:`Glossary <n_jobs>` for more
        details. If precomputed distances are used, parallel execution is not
        available and thus n_jobs will have no effect.

    Returns
    -------
    core_samples : ndarray of shape (n_core_samples,)
        Indices of core samples.

    labels : ndarray of shape (n_samples,)
        Cluster labels for each point. Noisy samples are given the label -1.
        Non-negative integers indicate cluster membership.

    See Also
    --------
    DBSCAN : An estimator interface for this clustering algorithm.
    OPTICS : A similar estimator interface clustering at multiple values of
        eps. Our implementation is optimized for memory usage.

    Notes
    -----
    For an example, see :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n·d), where d is the average number of
    neighbors, while the original DBSCAN had memory complexity O(n). Depending
    on the ``algorithm``, querying these nearest neighborhoods may itself
    incur a higher memory cost.

    One way to avoid the query complexity is to pre-compute sparse
    neighborhoods in chunks using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here.

    Another way to reduce memory and computation time is to remove
    (near-)duplicate points and use ``sample_weight`` instead.

    :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower
    memory usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
    <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
    <10.1145/3068335>`
    ACM Transactions on Database Systems (TODS), 42(3), 19.

    Examples
    --------
    >>> from sklearn.cluster import dbscan
    >>> X = [[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]
    >>> core_samples, labels = dbscan(X, eps=3, min_samples=2)
    >>> core_samples
    array([0, 1, 2, 3, 4])
    >>> labels
    array([ 0,  0,  0,  1,  1, -1])
    """

    est = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        metric=metric,
        metric_params=metric_params,
        algorithm=algorithm,
        leaf_size=leaf_size,
        p=p,
        n_jobs=n_jobs,
    )
    est.fit(X, sample_weight=sample_weight)
    return est.core_sample_indices_, est.labels_
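

# A minimal sketch, not part of the library source, of the memory-saving
# workflow described in the Notes above: precompute a sparse radius
# neighborhood graph (the docs suggest doing this in chunks; done in one shot
# here for brevity) and cluster it with ``metric='precomputed'``. The
# function name and values below are illustrative only.
def _demo_dbscan_precomputed_graph():
    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 2))
    # Sparse graph of pairwise distances within the eps radius.
    graph = (
        NearestNeighbors(radius=0.5)
        .fit(X)
        .radius_neighbors_graph(X, mode="distance")
    )
    labels = DBSCAN(eps=0.5, min_samples=5, metric="precomputed").fit_predict(graph)
    return labels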


class DBSCAN(ClusterMixin, BaseEstimator):
    """Perform DBSCAN clustering from vector array or distance matrix.

    DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
    Finds core samples of high density and expands clusters from them.
    This algorithm is particularly well suited to data that contains clusters
    of similar density, and can find clusters of arbitrary shape.

    Unlike K-means, DBSCAN does not require specifying the number of clusters
    in advance and can identify outliers as noise points.

    This implementation has a worst case memory complexity of :math:`O({n}^2)`,
    which can occur when the `eps` param is large and `min_samples` is low,
    while the original DBSCAN only uses linear memory.
    For further details, see the Notes below.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    eps : float, default=0.5
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function. Smaller values generally lead to more clusters.

    min_samples : int, default=5
        The number of samples (or total weight) in a neighborhood for a point to
        be considered as a core point. This includes the point itself. If
        `min_samples` is set to a higher value, DBSCAN will find denser clusters,
        whereas if it is set to a lower value, the found clusters will be more
        sparse.

    metric : str, or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a :term:`sparse graph`, in which
        case only "nonzero" elements may be considered neighbors for DBSCAN.

        .. versionadded:: 0.17
           metric *precomputed* to accept precomputed sparse matrix.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        'auto' will attempt to decide the most appropriate algorithm
        based on the values passed to the :meth:`fit` method.
        See :class:`~sklearn.neighbors.NearestNeighbors` documentation for
        details.

    leaf_size : int, default=30
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, default=None
        The power of the Minkowski metric to be used to calculate distance
        between points. If None, then ``p=2`` (equivalent to the Euclidean
        distance). When p=1, this is equivalent to the Manhattan distance.

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    core_sample_indices_ : ndarray of shape (n_core_samples,)
        Indices of core samples.

    components_ : ndarray of shape (n_core_samples, n_features)
        Copy of each core sample found by training.

    labels_ : ndarray of shape (n_samples,)
        Cluster labels for each point in the dataset given to fit().
        Noisy samples are given the label -1. Non-negative integers
        indicate cluster membership.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    OPTICS : A similar clustering at multiple values of eps. Our implementation
        is optimized for memory usage.

    Notes
    -----
    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n·d), where d is the average number of
    neighbors, while the original DBSCAN had memory complexity O(n). Depending
    on the ``algorithm``, querying these nearest neighborhoods may itself
    incur a higher memory cost.

    One way to avoid the query complexity is to pre-compute sparse
    neighborhoods in chunks using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here.

    Another way to reduce memory and computation time is to remove
    (near-)duplicate points and use ``sample_weight`` instead.

    :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower
    memory usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
    <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
    <10.1145/3068335>`
    ACM Transactions on Database Systems (TODS), 42(3), 19.

    Examples
    --------
    >>> from sklearn.cluster import DBSCAN
    >>> import numpy as np
    >>> X = np.array([[1, 2], [2, 2], [2, 3],
    ...               [8, 7], [8, 8], [25, 80]])
    >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)
    >>> clustering.labels_
    array([ 0,  0,  0,  1,  1, -1])
    >>> clustering
    DBSCAN(eps=3, min_samples=2)

    For an example, see
    :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`.

    For a comparison of DBSCAN with other clustering algorithms, see
    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
    """

    _parameter_constraints: dict = {
        "eps": [Interval(Real, 0.0, None, closed="neither")],
        "min_samples": [Interval(Integral, 1, None, closed="left")],
        "metric": [
            StrOptions(set(_VALID_METRICS) | {"precomputed"}),
            callable,
        ],
        "metric_params": [dict, None],
        "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})],
        "leaf_size": [Interval(Integral, 1, None, closed="left")],
        "p": [Interval(Real, 0.0, None, closed="left"), None],
        "n_jobs": [Integral, None],
    }

    def __init__(
        self,
        eps=0.5,
        *,
        min_samples=5,
        metric="euclidean",
        metric_params=None,
        algorithm="auto",
        leaf_size=30,
        p=None,
        n_jobs=None,
    ):
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric
        self.metric_params = metric_params
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.n_jobs = n_jobs

    @_fit_context(
        # DBSCAN.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None, sample_weight=None):
        """Perform DBSCAN clustering from features, or distance matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
                (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like of shape (n_samples,), default=None
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        X = validate_data(self, X, accept_sparse="csr")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        # Calculate the neighborhood of every sample. This leaves each point
        # in its own neighborhood, which must be accounted for later (point i
        # being in the neighborhood of point i is trivially true but carries
        # no information).
        if self.metric == "precomputed" and sparse.issparse(X):
            # Set the diagonal to explicit values, as a point is its own
            # neighbor.
            X = X.copy()  # copy to avoid in-place modification
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
                X.setdiag(X.diagonal())

        neighbors_model = NearestNeighbors(
            radius=self.eps,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            metric_params=self.metric_params,
            p=self.p,
            n_jobs=self.n_jobs,
        )
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity.
        neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)

        if sample_weight is None:
            n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
        else:
            n_neighbors = np.array(
                [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]
            )

        # Initially, all samples are noise.
        labels = np.full(X.shape[0], -1, dtype=np.intp)

        # A list of all core samples found.
        core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
        dbscan_inner(core_samples, neighborhoods, labels)

        self.core_sample_indices_ = np.where(core_samples)[0]
        self.labels_ = labels

        if len(self.core_sample_indices_):
            # fix for scipy sparse indexing issue
            self.components_ = X[self.core_sample_indices_].copy()
        else:
            # no core samples
            self.components_ = np.empty((0, X.shape[1]))
        return self

    def fit_predict(self, X, y=None, sample_weight=None):
        """Compute clusters from a data or distance matrix and predict labels.

        This method fits the model and returns the cluster labels in a single
        step. It is equivalent to calling ``fit(X).labels_``.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
                (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like of shape (n_samples,), default=None
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels. Noisy samples are given the label -1.
            Non-negative integers indicate cluster membership.
        """
        self.fit(X, sample_weight=sample_weight)
        return self.labels_

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.pairwise = self.metric == "precomputed"
        tags.input_tags.sparse = True
        return tags
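

# A minimal sketch, not part of the library source, of the other
# memory-saving strategy from the Notes above: collapse duplicate points and
# pass their multiplicities as ``sample_weight`` so the density estimate is
# unchanged. The function name and values below are illustrative only.
def _demo_dbscan_sample_weight_dedup():
    X = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [5.0, 5.0]])
    # Collapse duplicate rows; `counts` records how many times each occurred.
    X_unique, counts = np.unique(X, axis=0, return_counts=True)
    labels = DBSCAN(eps=0.5, min_samples=3).fit_predict(
        X_unique, sample_weight=counts
    )
    # The tripled origin is a core point on its own; the singleton is noise.
    assert labels.tolist() == [0, -1]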
Binary file not shown.
@@ -0,0 +1,41 @@
# Fast inner loop for DBSCAN.

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from libcpp.vector cimport vector

from sklearn.utils._typedefs cimport uint8_t, intp_t


def dbscan_inner(const uint8_t[::1] is_core,
                 object[:] neighborhoods,
                 intp_t[::1] labels):
    cdef intp_t i, label_num = 0, v
    cdef intp_t[:] neighb
    cdef vector[intp_t] stack

    for i in range(labels.shape[0]):
        if labels[i] != -1 or not is_core[i]:
            continue

        # Depth-first search starting from i, ending at the non-core points.
        # This is very similar to the classic algorithm for computing connected
        # components, the difference being that we label non-core points as
        # part of a cluster (component), but don't expand their neighborhoods.
        while True:
            if labels[i] == -1:
                labels[i] = label_num
                if is_core[i]:
                    neighb = neighborhoods[i]
                    for i in range(neighb.shape[0]):
                        v = neighb[i]
                        if labels[v] == -1:
                            stack.push_back(v)

            if stack.size() == 0:
                break
            i = stack.back()
            stack.pop_back()

        label_num += 1
|
||||
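
# Illustrative sketch (not part of the committed sources; the function name is
# hypothetical): a pure-Python rendering of the stack-based expansion above,
# handy for stepping through the Cython logic.
def dbscan_inner_py(is_core, neighborhoods, labels):
    label_num = 0
    stack = []
    for i in range(len(labels)):
        if labels[i] != -1 or not is_core[i]:
            continue
        # Depth-first search from i; non-core points get labeled but are
        # never expanded, exactly as in the Cython loop above.
        while True:
            if labels[i] == -1:
                labels[i] = label_num
                if is_core[i]:
                    for v in neighborhoods[i]:
                        if labels[v] == -1:
                            stack.append(v)
            if not stack:
                break
            i = stack.pop()
        label_num += 1
    return labels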
@@ -0,0 +1,76 @@
"""
Feature agglomeration. Base classes and functions for performing feature
agglomeration.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
from scipy.sparse import issparse

from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_is_fitted, validate_data

###############################################################################
# Mixin class for feature agglomeration.


class AgglomerationTransform(TransformerMixin):
    """
    A class for feature agglomeration via the transform interface.
    """

    def transform(self, X):
        """
        Transform a new matrix using the built clustering.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            An M by N array of M observations in N dimensions or a length
            M array of M one-dimensional observations.

        Returns
        -------
        Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,)
            The pooled values for each feature cluster.
        """
        check_is_fitted(self)

        X = validate_data(self, X, reset=False)
        if self.pooling_func == np.mean and not issparse(X):
            size = np.bincount(self.labels_)
            n_samples = X.shape[0]
            # a fast way to compute the mean of grouped features
            nX = np.array(
                [np.bincount(self.labels_, X[i, :]) / size for i in range(n_samples)]
            )
        else:
            nX = [
                self.pooling_func(X[:, self.labels_ == l], axis=1)
                for l in np.unique(self.labels_)
            ]
            nX = np.array(nX).T
        return nX

    def inverse_transform(self, X):
        """
        Invert the transformation and return a vector of size `n_features`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_clusters) or (n_clusters,)
            The values to be assigned to each cluster of samples.

        Returns
        -------
        X_original : ndarray of shape (n_samples, n_features) or (n_features,)
            A vector of size `n_samples` with the values of `X` assigned to
            each of the clusters of samples.
        """
        check_is_fitted(self)

        unil, inverse = np.unique(self.labels_, return_inverse=True)
        return X[..., inverse]
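
# Illustrative sketch (not part of the committed sources) of the transform /
# inverse_transform round trip provided by the mixin above, via the public
# FeatureAgglomeration estimator.
import numpy as np
from sklearn.cluster import FeatureAgglomeration

X = np.random.RandomState(0).randn(20, 6)
agglo = FeatureAgglomeration(n_clusters=2).fit(X)
Xt = agglo.transform(X)               # (20, 2): one pooled column per feature cluster
X_back = agglo.inverse_transform(Xt)  # (20, 6): pooled values broadcast back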
@@ -0,0 +1,2 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,274 @@
# Minimum spanning tree single linkage implementation for hdbscan

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.

# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.

# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

cimport numpy as cnp
from libc.float cimport DBL_MAX

import numpy as np
from sklearn.metrics._dist_metrics cimport DistanceMetric64
from sklearn.cluster._hierarchical_fast cimport UnionFind
from sklearn.cluster._hdbscan._tree cimport HIERARCHY_t
from sklearn.cluster._hdbscan._tree import HIERARCHY_dtype
from sklearn.utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t

cnp.import_array()

cdef extern from "numpy/arrayobject.h":
    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)

# Numpy structured dtype representing a single ordered edge in Prim's algorithm
MST_edge_dtype = np.dtype([
    ("current_node", np.int64),
    ("next_node", np.int64),
    ("distance", np.float64),
])

# Packed shouldn't make a difference since they're all 8-byte quantities,
# but it's included just to be safe.
ctypedef packed struct MST_edge_t:
    int64_t current_node
    int64_t next_node
    float64_t distance

cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
    cnp.ndarray[float64_t, ndim=2] mutual_reachability
):
    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
    reachability graph using Prim's algorithm.

    Parameters
    ----------
    mutual_reachability : ndarray of shape (n_samples, n_samples)
        Array of mutual-reachabilities between samples.

    Returns
    -------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.
    """
    cdef:
        # Note: we utilize ndarray's over memory-views to make use of numpy
        # binary indexing and sub-selection below.
        cnp.ndarray[int64_t, ndim=1, mode='c'] current_labels
        cnp.ndarray[float64_t, ndim=1, mode='c'] min_reachability, left, right
        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst

        cnp.ndarray[uint8_t, mode='c'] label_filter

        int64_t n_samples = PyArray_SHAPE(<cnp.PyArrayObject*> mutual_reachability)[0]
        int64_t current_node, new_node_index, new_node, i

    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
    current_labels = np.arange(n_samples, dtype=np.int64)
    current_node = 0
    min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
    for i in range(0, n_samples - 1):
        label_filter = current_labels != current_node
        current_labels = current_labels[label_filter]
        left = min_reachability[label_filter]
        right = mutual_reachability[current_node][current_labels]
        min_reachability = np.minimum(left, right)

        new_node_index = np.argmin(min_reachability)
        new_node = current_labels[new_node_index]
        mst[i].current_node = current_node
        mst[i].next_node = new_node
        mst[i].distance = min_reachability[new_node_index]
        current_node = new_node

    return mst


cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
    const float64_t[:, ::1] raw_data,
    const float64_t[::1] core_distances,
    DistanceMetric64 dist_metric,
    float64_t alpha=1.0
):
    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
    reachability graph generated from the provided `raw_data` and
    `core_distances` using Prim's algorithm.

    Parameters
    ----------
    raw_data : ndarray of shape (n_samples, n_features)
        Input array of data samples.

    core_distances : ndarray of shape (n_samples,)
        An array containing the core-distance calculated for each corresponding
        sample.

    dist_metric : DistanceMetric
        The distance metric to use when calculating pairwise distances for
        determining mutual-reachability.

    Returns
    -------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.
    """

    cdef:
        uint8_t[::1] in_tree
        float64_t[::1] min_reachability
        int64_t[::1] current_sources
        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst

        int64_t current_node, source_node, new_node, next_node_source
        int64_t i, j, n_samples, num_features

        float64_t current_node_core_dist, new_reachability, mutual_reachability_distance
        float64_t next_node_min_reach, pair_distance, next_node_core_dist

    n_samples = raw_data.shape[0]
    num_features = raw_data.shape[1]

    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)

    in_tree = np.zeros(n_samples, dtype=np.uint8)
    min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
    current_sources = np.ones(n_samples, dtype=np.int64)

    current_node = 0

    # The following loop dynamically updates minimum reachability node-by-node,
    # avoiding unnecessary computation where possible.
    for i in range(0, n_samples - 1):

        in_tree[current_node] = 1

        current_node_core_dist = core_distances[current_node]

        new_reachability = DBL_MAX
        source_node = 0
        new_node = 0

        for j in range(n_samples):
            if in_tree[j]:
                continue

            next_node_min_reach = min_reachability[j]
            next_node_source = current_sources[j]

            pair_distance = dist_metric.dist(
                &raw_data[current_node, 0],
                &raw_data[j, 0],
                num_features
            )

            pair_distance /= alpha

            next_node_core_dist = core_distances[j]
            mutual_reachability_distance = max(
                current_node_core_dist,
                next_node_core_dist,
                pair_distance
            )

            # If MRD(i, j) is smaller than node j's min_reachability, we update
            # node j's min_reachability for future reference.
            if mutual_reachability_distance < next_node_min_reach:
                min_reachability[j] = mutual_reachability_distance
                current_sources[j] = current_node

                # If MRD(i, j) is also smaller than node i's current
                # min_reachability, we update and set their edge as the current
                # MST edge candidate.
                if mutual_reachability_distance < new_reachability:
                    new_reachability = mutual_reachability_distance
                    source_node = current_node
                    new_node = j

            # If the node j is closer to another node already in the tree, we
            # make their edge the current MST candidate edge.
            elif next_node_min_reach < new_reachability:
                new_reachability = next_node_min_reach
                source_node = next_node_source
                new_node = j

        mst[i].current_node = source_node
        mst[i].next_node = new_node
        mst[i].distance = new_reachability
        current_node = new_node

    return mst

cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst):
    """Construct a single-linkage tree from an MST.

    Parameters
    ----------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.

    Returns
    -------
    single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
        The single-linkage tree (dendrogram) built from the MST. Each element
        of the array represents the following:

        - left node/cluster
        - right node/cluster
        - distance
        - new cluster size
    """
    cdef:
        cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage

        # Note mst.shape[0] is one fewer than the number of samples
        int64_t n_samples = mst.shape[0] + 1
        intp_t current_node_cluster, next_node_cluster
        int64_t current_node, next_node, i
        float64_t distance
        UnionFind U = UnionFind(n_samples)

    single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype)

    for i in range(n_samples - 1):

        current_node = mst[i].current_node
        next_node = mst[i].next_node
        distance = mst[i].distance

        current_node_cluster = U.fast_find(current_node)
        next_node_cluster = U.fast_find(next_node)

        single_linkage[i].left_node = current_node_cluster
        single_linkage[i].right_node = next_node_cluster
        single_linkage[i].value = distance
        single_linkage[i].cluster_size = U.size[current_node_cluster] + U.size[next_node_cluster]

        U.union(current_node_cluster, next_node_cluster)

    return single_linkage
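
# Illustrative sketch (not part of the committed sources; the function name is
# hypothetical): the Prim's-algorithm loop of mst_from_mutual_reachability in
# plain NumPy, returning (source, target, distance) rows as floats instead of
# the structured MST_edge_dtype.
import numpy as np

def mst_prim_dense(mutual_reachability):
    n = mutual_reachability.shape[0]
    mst = np.empty((n - 1, 3))
    current_labels = np.arange(n)
    min_reach = np.full(n, np.inf)
    current = 0
    for i in range(n - 1):
        keep = current_labels != current          # drop the node just added
        current_labels = current_labels[keep]
        min_reach = np.minimum(min_reach[keep],
                               mutual_reachability[current][current_labels])
        j = np.argmin(min_reach)                  # cheapest edge into the tree
        new = current_labels[j]
        mst[i] = (current, new, min_reach[j])
        current = new
    return mst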
Binary file not shown.
@@ -0,0 +1,210 @@
# mutual reachability distance computations

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.

# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.

# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

cimport numpy as cnp

import numpy as np
from scipy.sparse import issparse
from cython cimport floating, integral
from libc.math cimport isfinite, INFINITY
from sklearn.utils._typedefs cimport intp_t
cnp.import_array()


def mutual_reachability_graph(
    distance_matrix, min_samples=5, max_distance=0.0
):
    """Compute the weighted adjacency matrix of the mutual reachability graph.

    The mutual reachability distance used to build the graph is defined as::

        max(d_core(x_p), d_core(x_q), d(x_p, x_q))

    and the core distance `d_core` is defined as the distance between a point
    `x_p` and its k-th nearest neighbor.

    Note that all computations are done in-place.

    Parameters
    ----------
    distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples)
        Array of distances between samples. If sparse, the array must be in
        `CSR` format.

    min_samples : int, default=5
        The parameter `k` used to calculate the distance between a point
        `x_p` and its k-th nearest neighbor.

    max_distance : float, default=0.0
        The distance which `np.inf` is replaced with. When the true mutual-
        reachability distance is measured to be infinite, it is instead
        truncated to `max_dist`. Only used when `distance_matrix` is a sparse
        matrix.

    Returns
    -------
    mutual_reachability_graph : {ndarray, sparse matrix} of shape \
            (n_samples, n_samples)
        Weighted adjacency matrix of the mutual reachability graph.

    References
    ----------
    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
       Density-based clustering based on hierarchical density estimates.
       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
       (pp. 160-172). Springer Berlin Heidelberg.
    """
    further_neighbor_idx = min_samples - 1
    if issparse(distance_matrix):
        if distance_matrix.format != "csr":
            raise ValueError(
                "Only sparse CSR matrices are supported for `distance_matrix`."
            )
        _sparse_mutual_reachability_graph(
            distance_matrix.data,
            distance_matrix.indices,
            distance_matrix.indptr,
            distance_matrix.shape[0],
            further_neighbor_idx=further_neighbor_idx,
            max_distance=max_distance,
        )
    else:
        _dense_mutual_reachability_graph(
            distance_matrix, further_neighbor_idx=further_neighbor_idx
        )
    return distance_matrix


def _dense_mutual_reachability_graph(
    floating[:, :] distance_matrix,
    intp_t further_neighbor_idx,
):
    """Dense implementation of mutual reachability graph.

    The computation is done in-place, i.e. the distance matrix is modified
    directly.

    Parameters
    ----------
    distance_matrix : ndarray of shape (n_samples, n_samples)
        Array of distances between samples.

    further_neighbor_idx : int
        The index of the furthest neighbor to use to define the core distances.
    """
    cdef:
        intp_t i, j, n_samples = distance_matrix.shape[0]
        floating mutual_reachability_distance
        floating[::1] core_distances

    # We assume that the distance matrix is symmetric. We choose to sort every
    # row to have the same implementation as the sparse case, which requires a
    # CSR matrix.
    core_distances = np.ascontiguousarray(
        np.partition(
            distance_matrix, further_neighbor_idx, axis=1
        )[:, further_neighbor_idx]
    )

    with nogil:
        # TODO: Update w/ prange with thread count based on
        # _openmp_effective_n_threads
        for i in range(n_samples):
            for j in range(n_samples):
                mutual_reachability_distance = max(
                    core_distances[i],
                    core_distances[j],
                    distance_matrix[i, j],
                )
                distance_matrix[i, j] = mutual_reachability_distance


def _sparse_mutual_reachability_graph(
    cnp.ndarray[floating, ndim=1, mode="c"] data,
    cnp.ndarray[integral, ndim=1, mode="c"] indices,
    cnp.ndarray[integral, ndim=1, mode="c"] indptr,
    intp_t n_samples,
    intp_t further_neighbor_idx,
    floating max_distance,
):
    """Sparse implementation of mutual reachability graph.

    The computation is done in-place, i.e. the distance matrix is modified
    directly. This implementation only accepts `CSR` format sparse matrices.

    Parameters
    ----------
    distance_matrix : sparse matrix of shape (n_samples, n_samples)
        Sparse matrix of distances between samples. The sparse format should
        be `CSR`.

    further_neighbor_idx : int
        The index of the furthest neighbor to use to define the core distances.

    max_distance : float
        The distance which `np.inf` is replaced with. When the true mutual-
        reachability distance is measured to be infinite, it is instead
        truncated to `max_dist`. Only used when `distance_matrix` is a sparse
        matrix.
    """
    cdef:
        integral i, col_ind, row_ind
        floating mutual_reachability_distance
        floating[:] core_distances
        floating[:] row_data

    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    core_distances = np.empty(n_samples, dtype=dtype)

    for i in range(n_samples):
        row_data = data[indptr[i]:indptr[i + 1]]
        if further_neighbor_idx < row_data.size:
            core_distances[i] = np.partition(
                row_data, further_neighbor_idx
            )[further_neighbor_idx]
        else:
            core_distances[i] = INFINITY

    with nogil:
        for row_ind in range(n_samples):
            for i in range(indptr[row_ind], indptr[row_ind + 1]):
                col_ind = indices[i]
                mutual_reachability_distance = max(
                    core_distances[row_ind], core_distances[col_ind], data[i]
                )
                if isfinite(mutual_reachability_distance):
                    data[i] = mutual_reachability_distance
                elif max_distance > 0:
                    data[i] = max_distance
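
# Illustrative sketch (not part of the committed sources; the function name is
# hypothetical) of the dense mutual-reachability formula above, without the
# in-place nogil loop.
import numpy as np

def mutual_reachability_dense(D, min_samples=5):
    k = min_samples - 1  # index of the k-th nearest neighbor within each row
    core = np.partition(D, k, axis=1)[:, k]
    # max(d_core(x_p), d_core(x_q), d(x_p, x_q)) for every pair (p, q)
    return np.maximum(np.maximum(core[:, None], core[None, :]), D)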
Binary file not shown.
@@ -0,0 +1,49 @@
|
||||
# Copyright (c) 2015, Leland McInnes
|
||||
# All rights reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from sklearn.utils._typedefs cimport intp_t, float64_t, uint8_t
|
||||
cimport numpy as cnp
|
||||
|
||||
# This corresponds to the scipy.cluster.hierarchy format
|
||||
ctypedef packed struct HIERARCHY_t:
|
||||
intp_t left_node
|
||||
intp_t right_node
|
||||
float64_t value
|
||||
intp_t cluster_size
|
||||
|
||||
# Effectively an edgelist encoding a parent/child pair, along with a value and
|
||||
# the corresponding cluster_size in each row providing a tree structure.
|
||||
ctypedef packed struct CONDENSED_t:
|
||||
intp_t parent
|
||||
intp_t child
|
||||
float64_t value
|
||||
intp_t cluster_size
|
||||
|
||||
cdef extern from "numpy/arrayobject.h":
|
||||
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
|
||||
@@ -0,0 +1,799 @@
# Tree handling (condensing, finding stable clusters) for hdbscan

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.

# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.

# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


cimport numpy as cnp
from libc.math cimport isinf
import cython

import numpy as np

cnp.import_array()

cdef extern from "numpy/arrayobject.h":
    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)

cdef cnp.float64_t INFTY = np.inf
cdef cnp.intp_t NOISE = -1

HIERARCHY_dtype = np.dtype([
    ("left_node", np.intp),
    ("right_node", np.intp),
    ("value", np.float64),
    ("cluster_size", np.intp),
])

CONDENSED_dtype = np.dtype([
    ("parent", np.intp),
    ("child", np.intp),
    ("value", np.float64),
    ("cluster_size", np.intp),
])

cpdef tuple tree_to_labels(
    const HIERARCHY_t[::1] single_linkage_tree,
    cnp.intp_t min_cluster_size=10,
    cluster_selection_method="eom",
    bint allow_single_cluster=False,
    cnp.float64_t cluster_selection_epsilon=0.0,
    max_cluster_size=None,
):
    cdef:
        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities

    condensed_tree = _condense_tree(single_linkage_tree, min_cluster_size)
    labels, probabilities = _get_clusters(
        condensed_tree,
        _compute_stability(condensed_tree),
        cluster_selection_method,
        allow_single_cluster,
        cluster_selection_epsilon,
        max_cluster_size,
    )

    return (labels, probabilities)

cdef list bfs_from_hierarchy(
    const HIERARCHY_t[::1] hierarchy,
    cnp.intp_t bfs_root
):
    """
    Perform a breadth first search on a tree in scipy hclust format.
    """

    cdef list process_queue, next_queue, result
    cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1
    cdef cnp.intp_t node
    process_queue = [bfs_root]
    result = []

    while process_queue:
        result.extend(process_queue)
        # By construction, node i is formed by the union of nodes
        # hierarchy[i - n_samples, 0] and hierarchy[i - n_samples, 1]
        process_queue = [
            x - n_samples
            for x in process_queue
            if x >= n_samples
        ]
        if process_queue:
            next_queue = []
            for node in process_queue:
                next_queue.extend(
                    [
                        hierarchy[node].left_node,
                        hierarchy[node].right_node,
                    ]
                )
            process_queue = next_queue
    return result


cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
    const HIERARCHY_t[::1] hierarchy,
    cnp.intp_t min_cluster_size=10
):
    """Condense a tree according to a minimum cluster size. This is akin
    to the runt pruning procedure of Stuetzle. The result is a much simpler
    tree that is easier to visualize. We include extra information on the
    lambda value at which individual points depart clusters for later
    analysis and computation.

    Parameters
    ----------
    hierarchy : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype
        A single linkage hierarchy in scipy.cluster.hierarchy format.

    min_cluster_size : int, optional (default 10)
        The minimum size of clusters to consider. Clusters smaller than this
        are pruned from the tree.

    Returns
    -------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Effectively an edgelist encoding a parent/child pair, along with a
        value and the corresponding cluster_size in each row providing a tree
        structure.
    """

    cdef:
        cnp.intp_t root = 2 * hierarchy.shape[0]
        cnp.intp_t n_samples = hierarchy.shape[0] + 1
        cnp.intp_t next_label = n_samples + 1
        list result_list, node_list = bfs_from_hierarchy(hierarchy, root)

        cnp.intp_t[::1] relabel
        cnp.uint8_t[::1] ignore

        cnp.intp_t node, sub_node, left, right
        cnp.float64_t lambda_value, distance
        cnp.intp_t left_count, right_count
        HIERARCHY_t children

    relabel = np.empty(root + 1, dtype=np.intp)
    relabel[root] = n_samples
    result_list = []
    ignore = np.zeros(len(node_list), dtype=bool)

    for node in node_list:
        if ignore[node] or node < n_samples:
            continue

        children = hierarchy[node - n_samples]
        left = children.left_node
        right = children.right_node
        distance = children.value
        if distance > 0.0:
            lambda_value = 1.0 / distance
        else:
            lambda_value = INFTY

        if left >= n_samples:
            left_count = hierarchy[left - n_samples].cluster_size
        else:
            left_count = 1

        if right >= n_samples:
            right_count = hierarchy[right - n_samples].cluster_size
        else:
            right_count = 1

        if left_count >= min_cluster_size and right_count >= min_cluster_size:
            relabel[left] = next_label
            next_label += 1
            result_list.append(
                (relabel[node], relabel[left], lambda_value, left_count)
            )

            relabel[right] = next_label
            next_label += 1
            result_list.append(
                (relabel[node], relabel[right], lambda_value, right_count)
            )

        elif left_count < min_cluster_size and right_count < min_cluster_size:
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

        elif left_count < min_cluster_size:
            relabel[right] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

        else:
            relabel[left] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

    return np.array(result_list, dtype=CONDENSED_dtype)
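
# Illustrative sketch (not part of the committed sources; the rows are made
# up) of the CONDENSED_dtype encoding produced above: each row says "child
# leaves parent at lambda = 1 / distance".
condensed_example = np.array(
    [
        (5, 0, 1.0 / 0.25, 1),  # point 0 falls out of cluster 5 at distance 0.25
        (5, 6, 1.0 / 0.50, 3),  # cluster 6 (3 points) splits off at distance 0.50
    ],
    dtype=CONDENSED_dtype,
)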

cdef dict _compute_stability(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
):

    cdef:
        cnp.float64_t[::1] result, births
        cnp.intp_t[:] parents = condensed_tree['parent']

        cnp.intp_t parent, cluster_size, result_index, idx
        cnp.float64_t lambda_val
        CONDENSED_t condensed_node
        cnp.intp_t largest_child = condensed_tree['child'].max()
        cnp.intp_t smallest_cluster = np.min(parents)
        cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1
        dict stability_dict = {}

    largest_child = max(largest_child, smallest_cluster)
    births = np.full(largest_child + 1, np.nan, dtype=np.float64)

    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        condensed_node = condensed_tree[idx]
        births[condensed_node.child] = condensed_node.value

    births[smallest_cluster] = 0.0

    result = np.zeros(num_clusters, dtype=np.float64)
    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        condensed_node = condensed_tree[idx]
        parent = condensed_node.parent
        lambda_val = condensed_node.value
        cluster_size = condensed_node.cluster_size

        result_index = parent - smallest_cluster
        result[result_index] += (lambda_val - births[parent]) * cluster_size

    for idx in range(num_clusters):
        stability_dict[idx + smallest_cluster] = result[idx]

    return stability_dict
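
# Illustrative sketch (not part of the committed sources; the function name is
# hypothetical) of the stability computation just above, over plain
# (parent, child, lambda, size) tuples: each parent accumulates
# (lambda_child - lambda_birth(parent)) * size.
def compute_stability_py(rows):
    births = {child: value for parent, child, value, size in rows}
    births[min(parent for parent, _, _, _ in rows)] = 0.0  # the root is born at 0
    stability = {}
    for parent, child, value, size in rows:
        stability[parent] = stability.get(parent, 0.0) + (value - births[parent]) * size
    return stability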

cdef list bfs_from_cluster_tree(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    cnp.intp_t bfs_root
):

    cdef:
        list result = []
        cnp.ndarray[cnp.intp_t, ndim=1] process_queue = (
            np.array([bfs_root], dtype=np.intp)
        )
        cnp.ndarray[cnp.intp_t, ndim=1] children = condensed_tree['child']
        cnp.intp_t[:] parents = condensed_tree['parent']

    while len(process_queue) > 0:
        result.extend(process_queue.tolist())
        process_queue = children[np.isin(parents, process_queue)]

    return result


cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree):

    cdef:
        cnp.intp_t parent, current_parent, idx
        cnp.float64_t lambda_val, max_lambda
        cnp.float64_t[::1] deaths
        cnp.intp_t largest_parent = condensed_tree['parent'].max()

    deaths = np.zeros(largest_parent + 1, dtype=np.float64)
    current_parent = condensed_tree[0].parent
    max_lambda = condensed_tree[0].value

    for idx in range(1, PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        parent = condensed_tree[idx].parent
        lambda_val = condensed_tree[idx].value

        if parent == current_parent:
            max_lambda = max(max_lambda, lambda_val)
        else:
            deaths[current_parent] = max_lambda
            current_parent = parent
            max_lambda = lambda_val

    deaths[current_parent] = max_lambda  # value for last parent
    return deaths


@cython.final
cdef class TreeUnionFind:

    cdef cnp.intp_t[:, ::1] data
    cdef cnp.uint8_t[::1] is_component

    def __init__(self, size):
        cdef cnp.intp_t idx
        self.data = np.zeros((size, 2), dtype=np.intp)
        for idx in range(size):
            self.data[idx, 0] = idx
        self.is_component = np.ones(size, dtype=np.uint8)

    cdef void union(self, cnp.intp_t x, cnp.intp_t y):
        cdef cnp.intp_t x_root = self.find(x)
        cdef cnp.intp_t y_root = self.find(y)

        if self.data[x_root, 1] < self.data[y_root, 1]:
            self.data[x_root, 0] = y_root
        elif self.data[x_root, 1] > self.data[y_root, 1]:
            self.data[y_root, 0] = x_root
        else:
            self.data[y_root, 0] = x_root
            self.data[x_root, 1] += 1
        return

    cdef cnp.intp_t find(self, cnp.intp_t x):
        if self.data[x, 0] != x:
            self.data[x, 0] = self.find(self.data[x, 0])
            self.is_component[x] = False
        return self.data[x, 0]
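
# Illustrative sketch (not part of the committed sources; the class name is
# hypothetical): the same union-by-rank plus path-compression structure as
# TreeUnionFind above, in plain Python.
class TreeUnionFindPy:
    def __init__(self, size):
        self.parent = list(range(size))
        self.rank = [0] * size

    def find(self, x):
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])  # compress the path
        return self.parent[x]

    def union(self, x, y):
        x_root, y_root = self.find(x), self.find(y)
        if self.rank[x_root] < self.rank[y_root]:
            self.parent[x_root] = y_root
        elif self.rank[x_root] > self.rank[y_root]:
            self.parent[y_root] = x_root
        else:
            self.parent[y_root] = x_root
            self.rank[x_root] += 1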

cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
    const HIERARCHY_t[::1] linkage,
    cnp.float64_t cut,
    cnp.intp_t min_cluster_size
):
    """Given a single linkage tree and a cut value, return the
    vector of cluster labels at that cut value. This is useful
    for Robust Single Linkage, and extracting DBSCAN results
    from a single HDBSCAN run.

    Parameters
    ----------
    linkage : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype
        The single linkage tree in scipy.cluster.hierarchy format.

    cut : double
        The cut value at which to find clusters.

    min_cluster_size : int
        The minimum cluster size; clusters below this size at
        the cut will be considered noise.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        The cluster labels for each point in the data set;
        a label of -1 denotes a noise assignment.
    """

    cdef:
        cnp.intp_t n, cluster, root, n_samples, cluster_label
        cnp.intp_t[::1] unique_labels, cluster_size
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
        TreeUnionFind union_find
        dict cluster_label_map
        HIERARCHY_t node

    root = 2 * linkage.shape[0]
    n_samples = root // 2 + 1
    result = np.empty(n_samples, dtype=np.intp)
    union_find = TreeUnionFind(root + 1)

    cluster = n_samples
    for node in linkage:
        if node.value < cut:
            union_find.union(node.left_node, cluster)
            union_find.union(node.right_node, cluster)
        cluster += 1

    cluster_size = np.zeros(cluster, dtype=np.intp)
    for n in range(n_samples):
        cluster = union_find.find(n)
        cluster_size[cluster] += 1
        result[n] = cluster

    cluster_label_map = {-1: NOISE}
    cluster_label = 0
    unique_labels = np.unique(result)

    for cluster in unique_labels:
        if cluster_size[cluster] < min_cluster_size:
            cluster_label_map[cluster] = NOISE
        else:
            cluster_label_map[cluster] = cluster_label
            cluster_label += 1

    for n in range(n_samples):
        result[n] = cluster_label_map[result[n]]

    return result
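
# Illustrative sketch (not part of the committed sources): labelling_at_cut is
# what backs the public HDBSCAN.dbscan_clustering method (assuming
# scikit-learn >= 1.3), which extracts DBSCAN*-style labels at a fixed cut of
# the single-linkage tree.
import numpy as np
from sklearn.cluster import HDBSCAN

X = np.random.RandomState(0).randn(50, 2)
hdb = HDBSCAN(min_cluster_size=5).fit(X)
labels_at_cut = hdb.dbscan_clustering(cut_distance=0.5, min_cluster_size=5)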

cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    set clusters,
    dict cluster_label_map,
    cnp.intp_t allow_single_cluster,
    cnp.float64_t cluster_selection_epsilon
):
    """Given a condensed tree, clusters and a labeling map for the clusters,
    return an array containing the labels of each point based on cluster
    membership. Note that this is where points may be marked as noisy
    outliers. The determination of some points as noise in large,
    single-cluster datasets is controlled by the `allow_single_cluster` and
    `cluster_selection_epsilon` parameters.

    Parameters
    ----------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Effectively an edgelist encoding a parent/child pair, along with a
        value and the corresponding cluster_size in each row providing a tree
        structure.

    clusters : set
        The set of nodes corresponding to identified clusters. These node
        values should be the same as those present in `condensed_tree`.

    cluster_label_map : dict
        A mapping from the node values present in `clusters` to the labels
        which will be returned.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        The cluster labels for each point in the data set;
        a label of -1 denotes a noise assignment.
    """

    cdef:
        cnp.intp_t root_cluster
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
        cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array
        cnp.ndarray[cnp.float64_t, ndim=1] lambda_array
        TreeUnionFind union_find
        cnp.intp_t n, parent, child, cluster
        cnp.float64_t threshold

    child_array = condensed_tree['child']
    parent_array = condensed_tree['parent']
    lambda_array = condensed_tree['value']

    root_cluster = np.min(parent_array)
    result = np.empty(root_cluster, dtype=np.intp)
    union_find = TreeUnionFind(np.max(parent_array) + 1)

    for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        child = child_array[n]
        parent = parent_array[n]
        if child not in clusters:
            union_find.union(parent, child)

    for n in range(root_cluster):
        cluster = union_find.find(n)
        label = NOISE
        if cluster != root_cluster:
            label = cluster_label_map[cluster]
        elif len(clusters) == 1 and allow_single_cluster:
            # There can only be one edge with this particular child, hence this
            # expression extracts a unique, scalar lambda value.
            parent_lambda = lambda_array[child_array == n]
            if cluster_selection_epsilon != 0.0:
                threshold = 1 / cluster_selection_epsilon
            else:
                # The threshold should be calculated per-sample based on the
                # largest lambda of any sibling node.
                threshold = lambda_array[parent_array == cluster].max()
            if parent_lambda >= threshold:
                label = cluster_label_map[cluster]

        result[n] = label

    return result


cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    dict cluster_map,
    cnp.intp_t[::1] labels
):

    cdef:
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result
        cnp.float64_t[:] lambda_array
        cnp.float64_t[::1] deaths
        cnp.intp_t[:] child_array, parent_array
        cnp.intp_t root_cluster, n, point, cluster_num, cluster
        cnp.float64_t max_lambda, lambda_val

    child_array = condensed_tree['child']
    parent_array = condensed_tree['parent']
    lambda_array = condensed_tree['value']

    result = np.zeros(labels.shape[0])
    deaths = max_lambdas(condensed_tree)
    root_cluster = np.min(parent_array)

    for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        point = child_array[n]
        if point >= root_cluster:
            continue

        cluster_num = labels[point]
        if cluster_num == -1:
            continue

        cluster = cluster_map[cluster_num]
        max_lambda = deaths[cluster]
        if max_lambda == 0.0 or isinf(lambda_array[n]):
            result[point] = 1.0
        else:
            lambda_val = min(lambda_array[n], max_lambda)
            result[point] = lambda_val / max_lambda

    return result


cpdef list recurse_leaf_dfs(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.intp_t current_node
):
    cdef cnp.intp_t[:] children
    cdef cnp.intp_t child

    children = cluster_tree[cluster_tree['parent'] == current_node]['child']
    if children.shape[0] == 0:
        return [current_node,]
    else:
        return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], [])


cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree):
    cdef cnp.intp_t root
    if PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] == 0:
        return []
    root = cluster_tree['parent'].min()
    return recurse_leaf_dfs(cluster_tree, root)

cdef cnp.intp_t traverse_upwards(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.float64_t cluster_selection_epsilon,
    cnp.intp_t leaf,
    cnp.intp_t allow_single_cluster
):
    cdef cnp.intp_t root, parent
    cdef cnp.float64_t parent_eps

    root = cluster_tree['parent'].min()
    parent = cluster_tree[cluster_tree['child'] == leaf]['parent']
    if parent == root:
        if allow_single_cluster:
            return parent
        else:
            return leaf  # return node closest to root

    parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value']
    if parent_eps > cluster_selection_epsilon:
        return parent
    else:
        return traverse_upwards(
            cluster_tree,
            cluster_selection_epsilon,
            parent,
            allow_single_cluster
        )

cdef set epsilon_search(
    set leaves,
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.float64_t cluster_selection_epsilon,
    cnp.intp_t allow_single_cluster
):
    cdef:
        list selected_clusters = list()
        list processed = list()
        cnp.intp_t leaf, epsilon_child, sub_node
        cnp.float64_t eps
        cnp.uint8_t[:] leaf_nodes
        cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child']
        cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value']

    for leaf in leaves:
        leaf_nodes = children == leaf
        eps = 1 / distances[leaf_nodes][0]
        if eps < cluster_selection_epsilon:
            if leaf not in processed:
                epsilon_child = traverse_upwards(
                    cluster_tree,
                    cluster_selection_epsilon,
                    leaf,
                    allow_single_cluster
                )
                selected_clusters.append(epsilon_child)

                for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
                    if sub_node != epsilon_child:
                        processed.append(sub_node)
        else:
            selected_clusters.append(leaf)

    return set(selected_clusters)


@cython.wraparound(True)
cdef tuple _get_clusters(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    dict stability,
    cluster_selection_method='eom',
    cnp.uint8_t allow_single_cluster=False,
    cnp.float64_t cluster_selection_epsilon=0.0,
    max_cluster_size=None
):
    """Given a tree and stability dict, produce the cluster labels
    (and probabilities) for a flat clustering based on the chosen
    cluster selection method.

    Parameters
    ----------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Effectively an edgelist encoding a parent/child pair, along with a
        value and the corresponding cluster_size in each row providing a tree
        structure.

    stability : dict
        A dictionary mapping cluster_ids to stability values

    cluster_selection_method : string, optional (default 'eom')
        The method of selecting clusters. The default is the
        Excess of Mass algorithm specified by 'eom'. The alternate
        option is 'leaf'.

    allow_single_cluster : boolean, optional (default False)
        Whether to allow a single cluster to be selected by the
        Excess of Mass algorithm.

    cluster_selection_epsilon : double, optional (default 0.0)
        A distance threshold for cluster splits.

    max_cluster_size : int, default=None
        The maximum size for clusters located by the EOM clusterer. Can
        be overridden by the cluster_selection_epsilon parameter in
        rare cases.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        An integer array of cluster labels, with -1 denoting noise.

    probabilities : ndarray (n_samples,)
        The cluster membership strength of each sample.
    """
    cdef:
        list node_list
        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree
        cnp.uint8_t[::1] child_selection
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
        dict is_cluster, cluster_sizes
        cnp.float64_t subtree_stability
        cnp.intp_t node, sub_node, cluster, n_samples
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs

    # Assume clusters are ordered by numeric id equivalent to
    # a topological sort of the tree; this is valid given the
    # current implementation above, so don't change that ... or
    # if you do, change this accordingly!
    if allow_single_cluster:
        node_list = sorted(stability.keys(), reverse=True)
    else:
        node_list = sorted(stability.keys(), reverse=True)[:-1]
        # (exclude root)

    cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1]
    is_cluster = {cluster: True for cluster in node_list}
    n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1

    if max_cluster_size is None:
        max_cluster_size = n_samples + 1  # Set to a value that will never be triggered
    cluster_sizes = {
        child: cluster_size for child, cluster_size
        in zip(cluster_tree['child'], cluster_tree['cluster_size'])
    }
    if allow_single_cluster:
        # Compute cluster size for the root node
        cluster_sizes[node_list[-1]] = np.sum(
            cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size'])

    if cluster_selection_method == 'eom':
        for node in node_list:
            child_selection = (cluster_tree['parent'] == node)
            subtree_stability = np.sum([
                stability[child] for
                child in cluster_tree['child'][child_selection]])
            if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size:
                is_cluster[node] = False
                stability[node] = subtree_stability
            else:
                for sub_node in bfs_from_cluster_tree(cluster_tree, node):
                    if sub_node != node:
                        is_cluster[sub_node] = False

        if cluster_selection_epsilon != 0.0 and PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] > 0:
            eom_clusters = [c for c in is_cluster if is_cluster[c]]
            selected_clusters = []
            # first check if eom_clusters only has root node, which skips epsilon check.
            if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()):
                if allow_single_cluster:
                    selected_clusters = eom_clusters
            else:
                selected_clusters = epsilon_search(
                    set(eom_clusters),
                    cluster_tree,
                    cluster_selection_epsilon,
                    allow_single_cluster
                )
            for c in is_cluster:
                if c in selected_clusters:
                    is_cluster[c] = True
                else:
                    is_cluster[c] = False

    elif cluster_selection_method == 'leaf':
        leaves = set(get_cluster_tree_leaves(cluster_tree))
        if len(leaves) == 0:
            for c in is_cluster:
                is_cluster[c] = False
            is_cluster[condensed_tree['parent'].min()] = True

        if cluster_selection_epsilon != 0.0:
            selected_clusters = epsilon_search(
                leaves,
                cluster_tree,
                cluster_selection_epsilon,
                allow_single_cluster
            )
        else:
            selected_clusters = leaves

        for c in is_cluster:
            if c in selected_clusters:
                is_cluster[c] = True
            else:
                is_cluster[c] = False

    clusters = {c for c in is_cluster if is_cluster[c]}
    cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
    reverse_cluster_map = {n: c for c, n in cluster_map.items()}

    labels = _do_labelling(
        condensed_tree,
        clusters,
        cluster_map,
        allow_single_cluster,
        cluster_selection_epsilon
    )
    probs = get_probabilities(condensed_tree, reverse_cluster_map, labels)

    return (labels, probs)
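
# Illustrative sketch (not part of the committed sources) of the public knobs
# that reach _get_clusters: the selection method and the epsilon threshold.
from sklearn.cluster import HDBSCAN

eom = HDBSCAN(min_cluster_size=10, cluster_selection_method="eom")
leaf = HDBSCAN(min_cluster_size=10, cluster_selection_method="leaf",
               cluster_selection_epsilon=0.1)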
File diff suppressed because it is too large
@@ -0,0 +1,15 @@
cluster_hdbscan_extension_metadata = {
  '_linkage': {'sources': [cython_gen.process('_linkage.pyx'), metrics_cython_tree]},
  '_reachability': {'sources': [cython_gen.process('_reachability.pyx')]},
  '_tree': {'sources': [cython_gen.process('_tree.pyx')]}
}

foreach ext_name, ext_dict : cluster_hdbscan_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: [np_dep],
    subdir: 'sklearn/cluster/_hdbscan',
    install: true
  )
endforeach
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,63 @@
import numpy as np
import pytest

from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
from sklearn.utils._testing import (
    _convert_container,
    assert_allclose,
)


def test_mutual_reachability_graph_error_sparse_format():
    """Check that we raise an error if the sparse format is not CSR."""
    rng = np.random.RandomState(0)
    X = rng.randn(10, 10)
    X = X.T @ X
    np.fill_diagonal(X, 0.0)
    X = _convert_container(X, "sparse_csc")

    err_msg = "Only sparse CSR matrices are supported"
    with pytest.raises(ValueError, match=err_msg):
        mutual_reachability_graph(X)


@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
def test_mutual_reachability_graph_inplace(array_type):
    """Check that the operation is happening inplace."""
    rng = np.random.RandomState(0)
    X = rng.randn(10, 10)
    X = X.T @ X
    np.fill_diagonal(X, 0.0)
    X = _convert_container(X, array_type)

    mr_graph = mutual_reachability_graph(X)

    assert id(mr_graph) == id(X)


def test_mutual_reachability_graph_equivalence_dense_sparse():
    """Check that we get the same results for the dense and sparse implementations."""
    rng = np.random.RandomState(0)
    X = rng.randn(5, 5)
    X_dense = X.T @ X
    X_sparse = _convert_container(X_dense, "sparse_csr")

    mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3)
    mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3)

    assert_allclose(mr_graph_dense, mr_graph_sparse.toarray())


@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_mutual_reachability_graph_preserves_dtype(array_type, dtype):
    """Check that the computation preserves dtype thanks to fused types."""
    rng = np.random.RandomState(0)
    X = rng.randn(10, 10)
    X = (X.T @ X).astype(dtype)
    np.fill_diagonal(X, 0.0)
    X = _convert_container(X, array_type)

    assert X.dtype == dtype
    mr_graph = mutual_reachability_graph(X)
    assert mr_graph.dtype == dtype
Binary file not shown.
@@ -0,0 +1,9 @@
from sklearn.utils._typedefs cimport intp_t

cdef class UnionFind:
    cdef intp_t next_label
    cdef intp_t[:] parent
    cdef intp_t[:] size

    cdef void union(self, intp_t m, intp_t n) noexcept
    cdef intp_t fast_find(self, intp_t n) noexcept
@@ -0,0 +1,507 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
cimport cython

from sklearn.metrics._dist_metrics cimport DistanceMetric64
from sklearn.utils._fast_dict cimport IntFloatDict
from sklearn.utils._typedefs cimport float64_t, intp_t, uint8_t

# C++
from cython.operator cimport dereference as deref, preincrement as inc
from libcpp.map cimport map as cpp_map
from libc.math cimport fmax, INFINITY


###############################################################################
# Utilities for computing the ward momentum

def compute_ward_dist(
    const float64_t[::1] m_1,
    const float64_t[:, ::1] m_2,
    const intp_t[::1] coord_row,
    const intp_t[::1] coord_col,
    float64_t[::1] res
):
    cdef intp_t size_max = coord_row.shape[0]
    cdef intp_t n_features = m_2.shape[1]
    cdef intp_t i, j, row, col
    cdef float64_t pa, n

    for i in range(size_max):
        row = coord_row[i]
        col = coord_col[i]
        n = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col])
        pa = 0.
        for j in range(n_features):
            pa += (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2
        res[i] = pa * n
|
||||
|
||||
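

# Editor's illustration (not part of the scikit-learn sources): given cluster
# sizes `m_1` and per-cluster feature sums `m_2`, compute_ward_dist evaluates
# the classic Ward merge cost for each requested pair (u, v). With cluster
# means mu_u = m_2[u] / m_1[u], the cost is
#     (m_1[u] * m_1[v] / (m_1[u] + m_1[v])) * ||mu_u - mu_v||**2.
def _ward_dist_reference(m_1, m_2, u, v):
    mu_u = m_2[u] / m_1[u]
    mu_v = m_2[v] / m_1[v]
    n = m_1[u] * m_1[v] / (m_1[u] + m_1[v])
    return n * np.sum((mu_u - mu_v) ** 2)

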
###############################################################################
# Utilities for cutting and exploring a hierarchical tree

def _hc_get_descendent(intp_t node, children, intp_t n_leaves):
    """
    Function returning all the descendant leaves of a node in the tree.

    Parameters
    ----------
    node : integer
        The node for which we want the descendants.

    children : list of pairs, length n_nodes
        The children of each non-leaf node. Values less than `n_samples` refer
        to leaves of the tree. A greater value `i` indicates a node with
        children `children[i - n_samples]`.

    n_leaves : integer
        Number of leaves.

    Returns
    -------
    descendent : list of int
    """
    ind = [node]
    if node < n_leaves:
        return ind
    descendent = []

    # It is actually faster to do the accounting of the number of
    # elements in the list ourselves: len is a lengthy operation on a
    # chained list
    cdef intp_t i, n_indices = 1

    while n_indices:
        i = ind.pop()
        if i < n_leaves:
            descendent.append(i)
            n_indices -= 1
        else:
            ind.extend(children[i - n_leaves])
            n_indices += 1
    return descendent


def hc_get_heads(intp_t[:] parents, copy=True):
    """Returns the heads of the forest, as defined by parents.

    Parameters
    ----------
    parents : array of integers
        The parent structure defining the forest (ensemble of trees)

    copy : boolean
        If copy is False, the input 'parents' array is modified inplace

    Returns
    -------
    heads : array of integers of same shape as parents
        The indices in 'parents' of the tree heads
    """
    cdef intp_t parent, node0, node, size
    if copy:
        parents = np.copy(parents)
    size = parents.size

    # Start from the top of the tree and go down
    for node0 in range(size - 1, -1, -1):
        node = node0
        parent = parents[node]
        while parent != node:
            parents[node0] = parent
            node = parent
            parent = parents[node]
    return parents


def _get_parents(
    nodes,
    heads,
    const intp_t[:] parents,
    uint8_t[::1] not_visited
):
    """Returns the heads of the given nodes, as defined by parents.

    Modifies 'heads' and 'not_visited' in-place.

    Parameters
    ----------
    nodes : list of integers
        The nodes to start from

    heads : list of integers
        A list to hold the results (modified inplace)

    parents : array of integers
        The parent structure defining the tree

    not_visited : array of 0/1 flags, dtype=uint8
        The tree nodes to consider (modified inplace)
    """
    cdef intp_t parent, node

    for node in nodes:
        parent = parents[node]
        while parent != node:
            node = parent
            parent = parents[node]
        if not_visited[node]:
            not_visited[node] = 0
            heads.append(node)


###############################################################################
# Merge strategies implemented on IntFloatDicts

# These are used in the hierarchical clustering code, to implement
# merging between two clusters, defined as a dict containing node number
# as keys and edge weights as values.


def max_merge(
    IntFloatDict a,
    IntFloatDict b,
    const intp_t[:] mask,
    intp_t n_a,
    intp_t n_b
):
    """Merge two IntFloatDicts with the max strategy: when the same key is
    present in the two dicts, the max of the two values is used.

    Parameters
    ==========
    a, b : IntFloatDict object
        The IntFloatDicts to merge

    mask : 1-D ndarray of integer dtype
        A mask for keys to ignore: if not mask[key], the corresponding key
        is skipped in the output dictionary

    n_a, n_b : int
        n_a and n_b are weights for a and b for the merge strategy.
        They are not used in the case of a max merge.

    Returns
    =======
    out : IntFloatDict object
        The IntFloatDict resulting from the merge
    """
    cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
    cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
    cdef intp_t key
    cdef float64_t value
    # First copy a into out
    while a_it != a_end:
        key = deref(a_it).first
        if mask[key]:
            out_obj.my_map[key] = deref(a_it).second
        inc(a_it)

    # Then merge b into out
    cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
    cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
    while b_it != b_end:
        key = deref(b_it).first
        value = deref(b_it).second
        if mask[key]:
            out_it = out_obj.my_map.find(key)
            if out_it == out_end:
                # Key not found
                out_obj.my_map[key] = value
            else:
                deref(out_it).second = fmax(deref(out_it).second, value)
        inc(b_it)
    return out_obj


def average_merge(
    IntFloatDict a,
    IntFloatDict b,
    const intp_t[:] mask,
    intp_t n_a,
    intp_t n_b
):
    """Merge two IntFloatDicts with the average strategy: when the
    same key is present in the two dicts, the weighted average of the two
    values is used.

    Parameters
    ==========
    a, b : IntFloatDict object
        The IntFloatDicts to merge

    mask : 1-D ndarray of integer dtype
        A mask for keys to ignore: if not mask[key], the corresponding key
        is skipped in the output dictionary

    n_a, n_b : int
        n_a and n_b are weights for a and b for the merge strategy.
        They are used for a weighted mean.

    Returns
    =======
    out : IntFloatDict object
        The IntFloatDict resulting from the merge
    """
    cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
    cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
    cdef intp_t key
    cdef float64_t value
    cdef float64_t n_out = <float64_t> (n_a + n_b)
    # First copy a into out
    while a_it != a_end:
        key = deref(a_it).first
        if mask[key]:
            out_obj.my_map[key] = deref(a_it).second
        inc(a_it)

    # Then merge b into out
    cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
    cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
    while b_it != b_end:
        key = deref(b_it).first
        value = deref(b_it).second
        if mask[key]:
            out_it = out_obj.my_map.find(key)
            if out_it == out_end:
                # Key not found
                out_obj.my_map[key] = value
            else:
                deref(out_it).second = (n_a * deref(out_it).second
                                        + n_b * value) / n_out
        inc(b_it)
    return out_obj
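

# Editor's illustration (not part of the scikit-learn sources): the two merge
# strategies above, restated on plain Python dicts. `max_merge` keeps the
# larger edge weight for shared keys; `average_merge` keeps the size-weighted
# mean. Masked-out keys are dropped entirely.
def _merge_reference(a, b, mask, n_a, n_b, strategy="max"):
    out = {key: value for key, value in a.items() if mask[key]}
    for key, value in b.items():
        if not mask[key]:
            continue
        if key not in out:
            out[key] = value
        elif strategy == "max":
            out[key] = max(out[key], value)
        else:  # size-weighted average
            out[key] = (n_a * out[key] + n_b * value) / (n_a + n_b)
    return out

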
###############################################################################
# An edge object for fast comparisons

cdef class WeightedEdge:
    cdef public intp_t a
    cdef public intp_t b
    cdef public float64_t weight

    def __init__(self, float64_t weight, intp_t a, intp_t b):
        self.weight = weight
        self.a = a
        self.b = b

    def __richcmp__(self, WeightedEdge other, int op):
        """Cython-specific comparison method.

        op is the comparison code::

            <    0    <=   1
            ==   2    !=   3
            >    4    >=   5
        """
        if op == 0:
            return self.weight < other.weight
        elif op == 1:
            return self.weight <= other.weight
        elif op == 2:
            return self.weight == other.weight
        elif op == 3:
            return self.weight != other.weight
        elif op == 4:
            return self.weight > other.weight
        elif op == 5:
            return self.weight >= other.weight

    def __repr__(self):
        return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__,
                                              self.weight,
                                              self.a, self.b)


################################################################################
# Efficient labelling/conversion of MSTs to single linkage hierarchies

cdef class UnionFind(object):

    def __init__(self, N):
        self.parent = np.full(2 * N - 1, -1., dtype=np.intp, order='C')
        self.next_label = N
        self.size = np.hstack((np.ones(N, dtype=np.intp),
                               np.zeros(N - 1, dtype=np.intp)))

    cdef void union(self, intp_t m, intp_t n) noexcept:
        self.parent[m] = self.next_label
        self.parent[n] = self.next_label
        self.size[self.next_label] = self.size[m] + self.size[n]
        self.next_label += 1
        return

    @cython.wraparound(True)
    cdef intp_t fast_find(self, intp_t n) noexcept:
        cdef intp_t p
        p = n
        # find the highest node in the linkage graph so far
        while self.parent[n] != -1:
            n = self.parent[n]
        # provide a shortcut up to the highest node
        while self.parent[p] != n:
            p, self.parent[p] = self.parent[p], n
        return n


def _single_linkage_label(const float64_t[:, :] L):
    """
    Convert a linkage array or MST to a tree by labelling clusters at merges.
    This is done by using a Union find structure to keep track of merges
    efficiently. This is the private version of the function that assumes that
    ``L`` has been properly validated. See ``single_linkage_label`` for the
    user facing version of this function.

    Parameters
    ----------
    L: array of shape (n_samples - 1, 3)
        The linkage array or MST where each row specifies two samples
        to be merged and a distance or weight at which the merge occurs. This
        array is assumed to be sorted by the distance/weight.

    Returns
    -------
    A tree in the format used by scipy.cluster.hierarchy.
    """

    cdef float64_t[:, ::1] result_arr

    cdef intp_t left, left_cluster, right, right_cluster, index
    cdef float64_t delta

    result_arr = np.zeros((L.shape[0], 4), dtype=np.float64)
    U = UnionFind(L.shape[0] + 1)

    for index in range(L.shape[0]):

        left = <intp_t> L[index, 0]
        right = <intp_t> L[index, 1]
        delta = L[index, 2]

        left_cluster = U.fast_find(left)
        right_cluster = U.fast_find(right)

        result_arr[index][0] = left_cluster
        result_arr[index][1] = right_cluster
        result_arr[index][2] = delta
        result_arr[index][3] = U.size[left_cluster] + U.size[right_cluster]

        U.union(left_cluster, right_cluster)

    return np.asarray(result_arr)


@cython.wraparound(True)
def single_linkage_label(L):
    """
    Convert a linkage array or MST to a tree by labelling clusters at merges.
    This is done by using a Union find structure to keep track of merges
    efficiently.

    Parameters
    ----------
    L: array of shape (n_samples - 1, 3)
        The linkage array or MST where each row specifies two samples
        to be merged and a distance or weight at which the merge occurs. This
        array is assumed to be sorted by the distance/weight.

    Returns
    -------
    A tree in the format used by scipy.cluster.hierarchy.
    """
    # Validate L
    if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1:
        raise ValueError("Input MST array is not a validly formatted MST array")

    is_sorted = lambda x: np.all(x[:-1] <= x[1:])
    if not is_sorted(L[:, 2]):
        raise ValueError("Input MST array must be sorted by weight")

    return _single_linkage_label(L)
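

# Editor's usage sketch (not part of the scikit-learn sources): label a tiny
# MST over three samples, already sorted by weight. Each input row is
# (sample_i, sample_j, distance); each output row is
# (cluster_i, cluster_j, distance, size of the merged cluster).
#
#     mst = np.array([[0., 1., 0.5],
#                     [1., 2., 2.0]])
#     single_linkage_label(mst)
#     # array([[0. , 1. , 0.5, 2. ],
#     #        [3. , 2. , 2. , 3. ]])

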
# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378
def mst_linkage_core(
        const float64_t [:, ::1] raw_data,
        DistanceMetric64 dist_metric):
    """
    Compute the necessary elements of a minimum spanning
    tree for computation of single linkage clustering. This
    represents the MST-LINKAGE-CORE algorithm (Figure 6) from
    :arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering
    algorithms" <1109.2378>`.

    In contrast to the scipy implementation, it never computes
    a full distance matrix, generating distances only as they
    are needed and releasing them when no longer needed.

    Parameters
    ----------
    raw_data: array of shape (n_samples, n_features)
        The array of feature data to be clustered. Must be C-aligned.

    dist_metric: DistanceMetric64
        A DistanceMetric64 object conforming to the API from
        ``sklearn.metrics._dist_metrics.pxd`` that will be
        used to compute distances.

    Returns
    -------
    mst_core_data: array of shape (n_samples - 1, 3)
        An array providing information from which one
        can either compute an MST, or the linkage hierarchy
        very efficiently. See :arxiv:`Daniel Mullner, "Modern hierarchical,
        agglomerative clustering algorithms" <1109.2378>` algorithm
        MST-LINKAGE-CORE for more details.
    """
    cdef:
        intp_t n_samples = raw_data.shape[0]
        uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool)
        float64_t[:, ::1] result = np.zeros((n_samples - 1, 3))

        intp_t current_node = 0
        intp_t new_node
        intp_t i
        intp_t j
        intp_t num_features = raw_data.shape[1]

        float64_t right_value
        float64_t left_value
        float64_t new_distance

        float64_t[:] current_distances = np.full(n_samples, INFINITY)

    for i in range(n_samples - 1):

        in_tree[current_node] = 1

        new_distance = INFINITY
        new_node = 0

        for j in range(n_samples):
            if in_tree[j]:
                continue

            right_value = current_distances[j]
            left_value = dist_metric.dist(&raw_data[current_node, 0],
                                          &raw_data[j, 0],
                                          num_features)

            if left_value < right_value:
                current_distances[j] = left_value

            if current_distances[j] < new_distance:
                new_distance = current_distances[j]
                new_node = j

        result[i, 0] = current_node
        result[i, 1] = new_node
        result[i, 2] = new_distance
        current_node = new_node

    return np.array(result)
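

# Editor's illustration (not part of the scikit-learn sources): the same
# Prim-style loop in plain NumPy with Euclidean distances. The tree grows by
# one vertex per iteration, always taking the closest out-of-tree vertex, and
# only one row of distances is materialised at a time.
def _mst_linkage_core_reference(X):
    n_samples = X.shape[0]
    in_tree = np.zeros(n_samples, dtype=bool)
    current_distances = np.full(n_samples, np.inf)
    result = np.zeros((n_samples - 1, 3))
    current_node = 0
    for i in range(n_samples - 1):
        in_tree[current_node] = True
        dist = np.sqrt(((X - X[current_node]) ** 2).sum(axis=1))
        current_distances = np.minimum(current_distances, dist)
        candidates = np.where(in_tree, np.inf, current_distances)
        new_node = int(np.argmin(candidates))
        result[i] = (current_node, new_node, candidates[new_node])
        current_node = new_node
    return result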
Binary file not shown.
@@ -0,0 +1,48 @@
from cython cimport floating


cdef floating _euclidean_dense_dense(
    const floating*,
    const floating*,
    int,
    bint
) noexcept nogil

cdef floating _euclidean_sparse_dense(
    const floating[::1],
    const int[::1],
    const floating[::1],
    floating,
    bint
) noexcept nogil

cpdef void _relocate_empty_clusters_dense(
    const floating[:, ::1],
    const floating[::1],
    const floating[:, ::1],
    floating[:, ::1],
    floating[::1],
    const int[::1]
)

cpdef void _relocate_empty_clusters_sparse(
    const floating[::1],
    const int[::1],
    const int[::1],
    const floating[::1],
    const floating[:, ::1],
    floating[:, ::1],
    floating[::1],
    const int[::1]
)

cdef void _average_centers(
    floating[:, ::1],
    const floating[::1]
)

cdef void _center_shift(
    const floating[:, ::1],
    const floating[:, ::1],
    floating[::1]
)
@@ -0,0 +1,328 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
from cython cimport floating
from cython.parallel cimport prange
from libc.math cimport sqrt

from sklearn.utils.extmath import row_norms


# Number of samples per data chunk defined as a global constant.
CHUNK_SIZE = 256


cdef floating _euclidean_dense_dense(
        const floating* a,  # IN
        const floating* b,  # IN
        int n_features,
        bint squared
) noexcept nogil:
    """Euclidean distance between a dense and b dense"""
    cdef:
        int i
        int n = n_features // 4
        int rem = n_features % 4
        floating result = 0

    # We manually unroll the loop for better cache optimization.
    for i in range(n):
        result += (
            (a[0] - b[0]) * (a[0] - b[0]) +
            (a[1] - b[1]) * (a[1] - b[1]) +
            (a[2] - b[2]) * (a[2] - b[2]) +
            (a[3] - b[3]) * (a[3] - b[3])
        )
        a += 4
        b += 4

    for i in range(rem):
        result += (a[i] - b[i]) * (a[i] - b[i])

    return result if squared else sqrt(result)


def _euclidean_dense_dense_wrapper(
        const floating[::1] a,
        const floating[::1] b,
        bint squared
):
    """Wrapper of _euclidean_dense_dense for testing purpose"""
    return _euclidean_dense_dense(&a[0], &b[0], a.shape[0], squared)


cdef floating _euclidean_sparse_dense(
        const floating[::1] a_data,     # IN
        const int[::1] a_indices,       # IN
        const floating[::1] b,          # IN
        floating b_squared_norm,
        bint squared
) noexcept nogil:
    """Euclidean distance between a sparse and b dense"""
    cdef:
        int nnz = a_indices.shape[0]
        int i
        floating tmp, bi
        floating result = 0.0

    for i in range(nnz):
        bi = b[a_indices[i]]
        tmp = a_data[i] - bi
        result += tmp * tmp - bi * bi

    result += b_squared_norm

    if result < 0:
        result = 0.0

    return result if squared else sqrt(result)


def _euclidean_sparse_dense_wrapper(
        const floating[::1] a_data,
        const int[::1] a_indices,
        const floating[::1] b,
        floating b_squared_norm,
        bint squared
):
    """Wrapper of _euclidean_sparse_dense for testing purpose"""
    return _euclidean_sparse_dense(
        a_data, a_indices, b, b_squared_norm, squared)
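

# Why the sparse-dense distance above only needs the nonzeros of `a`
# (editor's note, not part of the scikit-learn sources): expanding the square,
#     ||a - b||^2 = sum_{i in nnz(a)} ((a_i - b_i)^2 - b_i^2) + ||b||^2,
# since wherever a_i == 0 the term (a_i - b_i)^2 equals b_i^2, which is
# already counted inside ||b||^2. A quick NumPy check of the identity:
def _sparse_dense_identity_check():
    rng = np.random.RandomState(0)
    b = rng.randn(10)
    a = np.zeros(10)
    a[[1, 4, 7]] = rng.randn(3)
    nnz = np.flatnonzero(a)
    lhs = ((a - b) ** 2).sum()
    rhs = ((a[nnz] - b[nnz]) ** 2 - b[nnz] ** 2).sum() + (b ** 2).sum()
    assert np.isclose(lhs, rhs)

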
cpdef floating _inertia_dense(
        const floating[:, ::1] X,           # IN
        const floating[::1] sample_weight,  # IN
        const floating[:, ::1] centers,     # IN
        const int[::1] labels,              # IN
        int n_threads,
        int single_label=-1,
):
    """Compute inertia for dense input data

    Sum of squared distance between each sample and its assigned center.

    If single_label is >= 0, the inertia is computed only for that label.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int i, j

        floating sq_dist = 0.0
        floating inertia = 0.0

    for i in prange(n_samples, nogil=True, num_threads=n_threads,
                    schedule='static'):
        j = labels[i]
        if single_label < 0 or single_label == j:
            sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
                                             n_features, True)
            inertia += sq_dist * sample_weight[i]

    return inertia


cpdef floating _inertia_sparse(
        X,                                  # IN
        const floating[::1] sample_weight,  # IN
        const floating[:, ::1] centers,     # IN
        const int[::1] labels,              # IN
        int n_threads,
        int single_label=-1,
):
    """Compute inertia for sparse input data

    Sum of squared distance between each sample and its assigned center.

    If single_label is >= 0, the inertia is computed only for that label.
    """
    cdef:
        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr

        int n_samples = X.shape[0]
        int i, j

        floating sq_dist = 0.0
        floating inertia = 0.0

        floating[::1] centers_squared_norms = row_norms(centers, squared=True)

    for i in prange(n_samples, nogil=True, num_threads=n_threads,
                    schedule='static'):
        j = labels[i]
        if single_label < 0 or single_label == j:
            sq_dist = _euclidean_sparse_dense(
                X_data[X_indptr[i]: X_indptr[i + 1]],
                X_indices[X_indptr[i]: X_indptr[i + 1]],
                centers[j], centers_squared_norms[j], True)
            inertia += sq_dist * sample_weight[i]

    return inertia


cpdef void _relocate_empty_clusters_dense(
        const floating[:, ::1] X,            # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # INOUT
        floating[::1] weight_in_clusters,    # INOUT
        const int[::1] labels                # IN
):
    """Relocate centers which have no sample assigned to them."""
    cdef:
        int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
        int n_empty = empty_clusters.shape[0]

    if n_empty == 0:
        return

    cdef:
        int n_features = X.shape[1]

        floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1)
        int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)

        int new_cluster_id, old_cluster_id, far_idx, idx, k
        floating weight

    if np.max(distances) == 0:
        # Happens when there are more clusters than non-duplicate samples.
        # Relocating is pointless in this case.
        return

    for idx in range(n_empty):

        new_cluster_id = empty_clusters[idx]

        far_idx = far_from_centers[idx]
        weight = sample_weight[far_idx]

        old_cluster_id = labels[far_idx]

        for k in range(n_features):
            centers_new[old_cluster_id, k] -= X[far_idx, k] * weight
            centers_new[new_cluster_id, k] = X[far_idx, k] * weight

        weight_in_clusters[new_cluster_id] = weight
        weight_in_clusters[old_cluster_id] -= weight
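

# Editor's illustration (not part of the scikit-learn sources): the seeding
# idea behind the relocation above, in plain NumPy. Empty clusters are
# re-seeded with the samples farthest from their assigned centers; the full
# implementation additionally moves each sample's weighted mass from its old
# center to the new one.
def _relocate_empty_reference(X, labels, centers, sample_weight):
    counts = np.bincount(labels, weights=sample_weight,
                         minlength=centers.shape[0])
    empty = np.flatnonzero(counts == 0)
    if empty.size == 0:
        return centers
    distances = ((X - centers[labels]) ** 2).sum(axis=1)
    farthest = np.argsort(distances)[::-1][:empty.size]
    centers = centers.copy()
    centers[empty] = X[farthest]
    return centers

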
cpdef void _relocate_empty_clusters_sparse(
        const floating[::1] X_data,          # IN
        const int[::1] X_indices,            # IN
        const int[::1] X_indptr,             # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # INOUT
        floating[::1] weight_in_clusters,    # INOUT
        const int[::1] labels                # IN
):
    """Relocate centers which have no sample assigned to them."""
    cdef:
        int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
        int n_empty = empty_clusters.shape[0]

    if n_empty == 0:
        return

    cdef:
        int n_samples = X_indptr.shape[0] - 1
        int i, j, k

        floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype)
        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)

    for i in range(n_samples):
        j = labels[i]
        distances[i] = _euclidean_sparse_dense(
            X_data[X_indptr[i]: X_indptr[i + 1]],
            X_indices[X_indptr[i]: X_indptr[i + 1]],
            centers_old[j], centers_squared_norms[j], True)

    if np.max(distances) == 0:
        # Happens when there are more clusters than non-duplicate samples.
        # Relocating is pointless in this case.
        return

    cdef:
        int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)

        int new_cluster_id, old_cluster_id, far_idx, idx
        floating weight

    for idx in range(n_empty):

        new_cluster_id = empty_clusters[idx]

        far_idx = far_from_centers[idx]
        weight = sample_weight[far_idx]

        old_cluster_id = labels[far_idx]

        for k in range(X_indptr[far_idx], X_indptr[far_idx + 1]):
            centers_new[old_cluster_id, X_indices[k]] -= X_data[k] * weight
            centers_new[new_cluster_id, X_indices[k]] = X_data[k] * weight

        weight_in_clusters[new_cluster_id] = weight
        weight_in_clusters[old_cluster_id] -= weight


cdef void _average_centers(
        floating[:, ::1] centers,                # INOUT
        const floating[::1] weight_in_clusters   # IN
):
    """Average new centers wrt weights."""
    cdef:
        int n_clusters = centers.shape[0]
        int n_features = centers.shape[1]
        int j, k
        floating alpha
        int argmax_weight = np.argmax(weight_in_clusters)

    for j in range(n_clusters):
        if weight_in_clusters[j] > 0:
            alpha = 1.0 / weight_in_clusters[j]
            for k in range(n_features):
                centers[j, k] *= alpha
        else:
            # For convenience, we avoid setting empty clusters at the origin
            # but place them at the location of the biggest cluster.
            for k in range(n_features):
                centers[j, k] = centers[argmax_weight, k]


cdef void _center_shift(
        const floating[:, ::1] centers_old,  # IN
        const floating[:, ::1] centers_new,  # IN
        floating[::1] center_shift           # OUT
):
    """Compute shift between old and new centers."""
    cdef:
        int n_clusters = centers_old.shape[0]
        int n_features = centers_old.shape[1]
        int j

    for j in range(n_clusters):
        center_shift[j] = _euclidean_dense_dense(
            &centers_new[j, 0], &centers_old[j, 0], n_features, False)


def _is_same_clustering(
        const int[::1] labels1,
        const int[::1] labels2,
        n_clusters
):
    """Check if two arrays of labels are the same up to a permutation of the labels"""
    cdef int[::1] mapping = np.full(fill_value=-1, shape=(n_clusters,), dtype=np.int32)
    cdef int i

    for i in range(labels1.shape[0]):
        if mapping[labels1[i]] == -1:
            mapping[labels1[i]] = labels2[i]
        elif mapping[labels1[i]] != labels2[i]:
            return False
    return True
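

# Editor's usage sketch (not part of the scikit-learn sources): the mapping
# {0 -> 1, 1 -> 0} is consistent across all positions, so the two labelings
# below describe the same clustering.
#
#     labels1 = np.array([0, 0, 1, 1], dtype=np.int32)
#     labels2 = np.array([1, 1, 0, 0], dtype=np.int32)
#     _is_same_clustering(labels1, labels2, n_clusters=2)  # True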
Binary file not shown.
@@ -0,0 +1,686 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from cython cimport floating
from cython.parallel import prange, parallel
from libc.stdlib cimport calloc, free
from libc.string cimport memset

from sklearn.utils._openmp_helpers cimport omp_lock_t
from sklearn.utils._openmp_helpers cimport omp_init_lock
from sklearn.utils._openmp_helpers cimport omp_destroy_lock
from sklearn.utils._openmp_helpers cimport omp_set_lock
from sklearn.utils._openmp_helpers cimport omp_unset_lock
from sklearn.utils.extmath import row_norms
from sklearn.cluster._k_means_common import CHUNK_SIZE
from sklearn.cluster._k_means_common cimport _relocate_empty_clusters_dense
from sklearn.cluster._k_means_common cimport _relocate_empty_clusters_sparse
from sklearn.cluster._k_means_common cimport _euclidean_dense_dense
from sklearn.cluster._k_means_common cimport _euclidean_sparse_dense
from sklearn.cluster._k_means_common cimport _average_centers
from sklearn.cluster._k_means_common cimport _center_shift


def init_bounds_dense(
        const floating[:, ::1] X,                      # IN
        const floating[:, ::1] centers,                # IN
        const floating[:, ::1] center_half_distances,  # IN
        int[::1] labels,                               # OUT
        floating[::1] upper_bounds,                    # OUT
        floating[:, ::1] lower_bounds,                 # OUT
        int n_threads):
    """Initialize upper and lower bounds for each sample for dense input data.

    Given X, centers and the pairwise distances divided by 2.0 between the
    centers, this calculates the upper bounds and lower bounds for each sample.
    The upper bound for each sample is set to the distance between the sample
    and the closest center.

    The lower bound for each sample is a one-dimensional array of n_clusters.
    For each sample i, assume that the previously assigned cluster is c1 and
    the previous closest distance is dist; for a new cluster c2,
    lower_bound[i][c2] is set to the distance between the sample and this new
    cluster if and only if dist > center_half_distances[c1][c2]. This prevents
    the computation of unnecessary distances for each sample to the clusters
    that it is unlikely to be assigned to.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features), dtype=floating
        The input data.

    centers : ndarray of shape (n_clusters, n_features), dtype=floating
        The cluster centers.

    center_half_distances : ndarray of shape (n_clusters, n_clusters), \
            dtype=floating
        The half of the distance between any 2 clusters centers.

    labels : ndarray of shape (n_samples,), dtype=int
        The label for each sample. This array is modified in place.

    upper_bounds : ndarray of shape (n_samples,), dtype=floating
        The upper bound on the distance between each sample and its closest
        cluster center. This array is modified in place.

    lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
        The lower bound on the distance between each sample and each cluster
        center. This array is modified in place.

    n_threads : int
        The number of threads to be used by openmp.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_clusters = centers.shape[0]
        int n_features = X.shape[1]

        floating min_dist, dist
        int best_cluster, i, j

    for i in prange(
        n_samples, num_threads=n_threads, schedule='static', nogil=True
    ):
        best_cluster = 0
        min_dist = _euclidean_dense_dense(&X[i, 0], &centers[0, 0],
                                          n_features, False)
        lower_bounds[i, 0] = min_dist
        for j in range(1, n_clusters):
            if min_dist > center_half_distances[best_cluster, j]:
                dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
                                              n_features, False)
                lower_bounds[i, j] = dist
                if dist < min_dist:
                    min_dist = dist
                    best_cluster = j
        labels[i] = best_cluster
        upper_bounds[i] = min_dist
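

# Editor's illustration (not part of the scikit-learn sources): the bounds
# computed above, restated in plain NumPy without the triangle-inequality
# pruning. The pruning only skips distance evaluations (pruned entries of
# lower_bounds stay at 0, which is still a valid lower bound); it does not
# change the resulting labels or upper bounds.
def _init_bounds_reference(X, centers):
    import numpy as np  # local import keeps this illustration self-contained
    distances = np.sqrt(
        ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2))
    labels = distances.argmin(axis=1)
    upper_bounds = distances.min(axis=1)
    return labels, upper_bounds, distances

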
def init_bounds_sparse(
        X,                                             # IN
        const floating[:, ::1] centers,                # IN
        const floating[:, ::1] center_half_distances,  # IN
        int[::1] labels,                               # OUT
        floating[::1] upper_bounds,                    # OUT
        floating[:, ::1] lower_bounds,                 # OUT
        int n_threads):
    """Initialize upper and lower bounds for each sample for sparse input data.

    Given X, centers and the pairwise distances divided by 2.0 between the
    centers, this calculates the upper bounds and lower bounds for each sample.
    The upper bound for each sample is set to the distance between the sample
    and the closest center.

    The lower bound for each sample is a one-dimensional array of n_clusters.
    For each sample i, assume that the previously assigned cluster is c1 and
    the previous closest distance is dist; for a new cluster c2,
    lower_bound[i][c2] is set to the distance between the sample and this new
    cluster if and only if dist > center_half_distances[c1][c2]. This prevents
    the computation of unnecessary distances for each sample to the clusters
    that it is unlikely to be assigned to.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features), dtype=floating
        The input data. Must be in CSR format.

    centers : ndarray of shape (n_clusters, n_features), dtype=floating
        The cluster centers.

    center_half_distances : ndarray of shape (n_clusters, n_clusters), \
            dtype=floating
        The half of the distance between any 2 clusters centers.

    labels : ndarray of shape (n_samples,), dtype=int
        The label for each sample. This array is modified in place.

    upper_bounds : ndarray of shape (n_samples,), dtype=floating
        The upper bound on the distance between each sample and its closest
        cluster center. This array is modified in place.

    lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
        The lower bound on the distance between each sample and each cluster
        center. This array is modified in place.

    n_threads : int
        The number of threads to be used by openmp.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_clusters = centers.shape[0]

        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr

        floating min_dist, dist
        int best_cluster, i, j

        floating[::1] centers_squared_norms = row_norms(centers, squared=True)

    for i in prange(
        n_samples, num_threads=n_threads, schedule='static', nogil=True
    ):
        best_cluster = 0
        min_dist = _euclidean_sparse_dense(
            X_data[X_indptr[i]: X_indptr[i + 1]],
            X_indices[X_indptr[i]: X_indptr[i + 1]],
            centers[0], centers_squared_norms[0], False)

        lower_bounds[i, 0] = min_dist
        for j in range(1, n_clusters):
            if min_dist > center_half_distances[best_cluster, j]:
                dist = _euclidean_sparse_dense(
                    X_data[X_indptr[i]: X_indptr[i + 1]],
                    X_indices[X_indptr[i]: X_indptr[i + 1]],
                    centers[j], centers_squared_norms[j], False)
                lower_bounds[i, j] = dist
                if dist < min_dist:
                    min_dist = dist
                    best_cluster = j
        labels[i] = best_cluster
        upper_bounds[i] = min_dist


def elkan_iter_chunked_dense(
        const floating[:, ::1] X,                      # IN
        const floating[::1] sample_weight,             # IN
        const floating[:, ::1] centers_old,            # IN
        floating[:, ::1] centers_new,                  # OUT
        floating[::1] weight_in_clusters,              # OUT
        const floating[:, ::1] center_half_distances,  # IN
        const floating[::1] distance_next_center,      # IN
        floating[::1] upper_bounds,                    # INOUT
        floating[:, ::1] lower_bounds,                 # INOUT
        int[::1] labels,                               # INOUT
        floating[::1] center_shift,                    # OUT
        int n_threads,
        bint update_centers=True):
    """Single iteration of K-means Elkan algorithm with dense input.

    Update labels and centers (inplace), for one iteration, distributed
    over data chunks.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features), dtype=floating
        The observations to cluster.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers before previous iteration, placeholder for the centers after
        previous iteration.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers after previous iteration, placeholder for the new centers
        computed during this iteration.

    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
        Placeholder for the sums of the weights of every observation assigned
        to each center.

    center_half_distances : ndarray of shape (n_clusters, n_clusters), \
            dtype=floating
        Half pairwise distances between centers.

    distance_next_center : ndarray of shape (n_clusters,), dtype=floating
        Distance between each center and its closest center.

    upper_bounds : ndarray of shape (n_samples,), dtype=floating
        Upper bound for the distance between each sample and its center,
        updated inplace.

    lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
        Lower bound for the distance between each sample and each center,
        updated inplace.

    labels : ndarray of shape (n_samples,), dtype=int
        labels assignment.

    center_shift : ndarray of shape (n_clusters,), dtype=floating
        Distance between old and new centers.

    n_threads : int
        The number of threads to be used by openmp.

    update_centers : bool
        - If True, the labels and the new centers will be computed, i.e. runs
          the E-step and the M-step of the algorithm.
        - If False, only the labels will be computed, i.e. runs the E-step of
          the algorithm. This is useful especially when calling predict on a
          fitted model.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int n_clusters = centers_new.shape[0]

    if n_samples == 0:
        # An empty array was passed, do nothing and return early (before
        # attempting to compute n_chunks). This can typically happen when
        # calling the prediction function of a bisecting k-means model with a
        # large fraction of outliers.
        return

    cdef:
        # hard-coded number of samples per chunk. Splitting in chunks is
        # necessary to get parallelism. Chunk size chosen to be same as lloyd's
        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
        int n_chunks = n_samples // n_samples_chunk
        int n_samples_rem = n_samples % n_samples_chunk
        int chunk_idx
        int start, end

        int i, j, k

        floating *centers_new_chunk
        floating *weight_in_clusters_chunk

        omp_lock_t lock

    # count remainder chunk in total number of chunks
    n_chunks += n_samples != n_chunks * n_samples_chunk

    # number of threads should not be bigger than number of chunks
    n_threads = min(n_threads, n_chunks)

    if update_centers:
        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
        omp_init_lock(&lock)

    with nogil, parallel(num_threads=n_threads):
        # thread local buffers
        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))

        for chunk_idx in prange(n_chunks, schedule='static'):
            start = chunk_idx * n_samples_chunk
            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
                end = start + n_samples_rem
            else:
                end = start + n_samples_chunk

            _update_chunk_dense(
                X[start: end],
                sample_weight[start: end],
                centers_old,
                center_half_distances,
                distance_next_center,
                labels[start: end],
                upper_bounds[start: end],
                lower_bounds[start: end],
                centers_new_chunk,
                weight_in_clusters_chunk,
                update_centers)

        # reduction from local buffers.
        if update_centers:
            # The lock is necessary to avoid race conditions when aggregating
            # info from different thread-local buffers.
            omp_set_lock(&lock)
            for j in range(n_clusters):
                weight_in_clusters[j] += weight_in_clusters_chunk[j]
                for k in range(n_features):
                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
            omp_unset_lock(&lock)

        free(centers_new_chunk)
        free(weight_in_clusters_chunk)

    if update_centers:
        omp_destroy_lock(&lock)
        _relocate_empty_clusters_dense(X, sample_weight, centers_old,
                                       centers_new, weight_in_clusters, labels)

        _average_centers(centers_new, weight_in_clusters)
        _center_shift(centers_old, centers_new, center_shift)

        # update lower and upper bounds
        for i in range(n_samples):
            upper_bounds[i] += center_shift[labels[i]]

            for j in range(n_clusters):
                lower_bounds[i, j] -= center_shift[j]
                if lower_bounds[i, j] < 0:
                    lower_bounds[i, j] = 0
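

# Editor's note (not part of the scikit-learn sources): why the final bound
# update above is valid. If a center c moves by center_shift[c] between
# iterations, the triangle inequality gives, for every sample x,
#     d(x, c_new) <= d(x, c_old) + center_shift[c]
#     d(x, c_new) >= d(x, c_old) - center_shift[c]
# so adding the assigned center's shift keeps upper_bounds an upper bound,
# and subtracting each center's shift (clipped at 0) keeps lower_bounds a
# lower bound.

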
cdef void _update_chunk_dense(
        const floating[:, ::1] X,                      # IN
        const floating[::1] sample_weight,             # IN
        const floating[:, ::1] centers_old,            # IN
        const floating[:, ::1] center_half_distances,  # IN
        const floating[::1] distance_next_center,      # IN
        int[::1] labels,                               # INOUT
        floating[::1] upper_bounds,                    # INOUT
        floating[:, ::1] lower_bounds,                 # INOUT
        floating *centers_new,                         # OUT
        floating *weight_in_clusters,                  # OUT
        bint update_centers) noexcept nogil:
    """K-means combined EM step for one dense data chunk.

    Compute the partial contribution of a single data chunk to the labels and
    centers.
    """
    cdef:
        int n_samples = labels.shape[0]
        int n_clusters = centers_old.shape[0]
        int n_features = centers_old.shape[1]

        floating upper_bound, distance
        int i, j, k, label
        bint bounds_tight  # declared explicitly so it stays a C variable

    for i in range(n_samples):
        upper_bound = upper_bounds[i]
        bounds_tight = 0
        label = labels[i]

        # Next center is not far away from the currently assigned center.
        # Sample might need to be assigned to another center.
        if not distance_next_center[label] >= upper_bound:

            for j in range(n_clusters):

                # If this holds, then j is a good candidate for the
                # sample to be relabelled, and we need to confirm this by
                # recomputing the upper and lower bounds.
                if (
                    j != label
                    and (upper_bound > lower_bounds[i, j])
                    and (upper_bound > center_half_distances[label, j])
                ):

                    # Recompute upper bound by calculating the actual distance
                    # between the sample and its current assigned center.
                    if not bounds_tight:
                        upper_bound = _euclidean_dense_dense(
                            &X[i, 0], &centers_old[label, 0], n_features, False)
                        lower_bounds[i, label] = upper_bound
                        bounds_tight = 1

                    # If the condition still holds, then compute the actual
                    # distance between the sample and center. If this is less
                    # than the previous distance, reassign label.
                    if (
                        upper_bound > lower_bounds[i, j]
                        or (upper_bound > center_half_distances[label, j])
                    ):

                        distance = _euclidean_dense_dense(
                            &X[i, 0], &centers_old[j, 0], n_features, False)
                        lower_bounds[i, j] = distance
                        if distance < upper_bound:
                            label = j
                            upper_bound = distance

        labels[i] = label
        upper_bounds[i] = upper_bound

        if update_centers:
            weight_in_clusters[label] += sample_weight[i]
            for k in range(n_features):
                centers_new[label * n_features + k] += X[i, k] * sample_weight[i]


def elkan_iter_chunked_sparse(
        X,                                             # IN
        const floating[::1] sample_weight,             # IN
        const floating[:, ::1] centers_old,            # IN
        floating[:, ::1] centers_new,                  # OUT
        floating[::1] weight_in_clusters,              # OUT
        const floating[:, ::1] center_half_distances,  # IN
        const floating[::1] distance_next_center,      # IN
        floating[::1] upper_bounds,                    # INOUT
        floating[:, ::1] lower_bounds,                 # INOUT
        int[::1] labels,                               # INOUT
        floating[::1] center_shift,                    # OUT
        int n_threads,
        bint update_centers=True):
    """Single iteration of K-means Elkan algorithm with sparse input.

    Update labels and centers (inplace), for one iteration, distributed
    over data chunks.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        The observations to cluster. Must be in CSR format.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers before previous iteration, placeholder for the centers after
        previous iteration.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers after previous iteration, placeholder for the new centers
        computed during this iteration.

    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
        Placeholder for the sums of the weights of every observation assigned
        to each center.

    center_half_distances : ndarray of shape (n_clusters, n_clusters), \
            dtype=floating
        Half pairwise distances between centers.

    distance_next_center : ndarray of shape (n_clusters,), dtype=floating
        Distance between each center and its closest center.

    upper_bounds : ndarray of shape (n_samples,), dtype=floating
        Upper bound for the distance between each sample and its center,
        updated inplace.

    lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
        Lower bound for the distance between each sample and each center,
        updated inplace.

    labels : ndarray of shape (n_samples,), dtype=int
        labels assignment.

    center_shift : ndarray of shape (n_clusters,), dtype=floating
        Distance between old and new centers.

    n_threads : int
        The number of threads to be used by openmp.

    update_centers : bool
        - If True, the labels and the new centers will be computed, i.e. runs
          the E-step and the M-step of the algorithm.
        - If False, only the labels will be computed, i.e. runs the E-step of
          the algorithm. This is useful especially when calling predict on a
          fitted model.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int n_clusters = centers_new.shape[0]

    if n_samples == 0:
        # An empty array was passed, do nothing and return early (before
        # attempting to compute n_chunks). This can typically happen when
        # calling the prediction function of a bisecting k-means model with a
        # large fraction of outliers.
        return

    cdef:
        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr

        # hard-coded number of samples per chunk. Splitting in chunks is
        # necessary to get parallelism. Chunk size chosen to be same as lloyd's
        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
        int n_chunks = n_samples // n_samples_chunk
        int n_samples_rem = n_samples % n_samples_chunk
        int chunk_idx
        int start, end

        int i, j, k

        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)

        floating *centers_new_chunk
        floating *weight_in_clusters_chunk

        omp_lock_t lock

    # count remainder chunk in total number of chunks
    n_chunks += n_samples != n_chunks * n_samples_chunk

    # number of threads should not be bigger than number of chunks
    n_threads = min(n_threads, n_chunks)

    if update_centers:
        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
        omp_init_lock(&lock)

    with nogil, parallel(num_threads=n_threads):
        # thread local buffers
        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))

        for chunk_idx in prange(n_chunks, schedule='static'):
            start = chunk_idx * n_samples_chunk
            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
                end = start + n_samples_rem
            else:
                end = start + n_samples_chunk

            _update_chunk_sparse(
                X_data[X_indptr[start]: X_indptr[end]],
                X_indices[X_indptr[start]: X_indptr[end]],
                X_indptr[start: end+1],
                sample_weight[start: end],
                centers_old,
                centers_squared_norms,
                center_half_distances,
                distance_next_center,
                labels[start: end],
                upper_bounds[start: end],
                lower_bounds[start: end],
                centers_new_chunk,
                weight_in_clusters_chunk,
                update_centers)

        # reduction from local buffers.
        if update_centers:
            # The lock is necessary to avoid race conditions when aggregating
            # info from different thread-local buffers.
            omp_set_lock(&lock)
            for j in range(n_clusters):
                weight_in_clusters[j] += weight_in_clusters_chunk[j]
                for k in range(n_features):
                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
            omp_unset_lock(&lock)

        free(centers_new_chunk)
        free(weight_in_clusters_chunk)

    if update_centers:
        omp_destroy_lock(&lock)
        _relocate_empty_clusters_sparse(
            X_data, X_indices, X_indptr, sample_weight,
            centers_old, centers_new, weight_in_clusters, labels)

        _average_centers(centers_new, weight_in_clusters)
        _center_shift(centers_old, centers_new, center_shift)

        # update lower and upper bounds
        for i in range(n_samples):
            upper_bounds[i] += center_shift[labels[i]]

            for j in range(n_clusters):
                lower_bounds[i, j] -= center_shift[j]
                if lower_bounds[i, j] < 0:
                    lower_bounds[i, j] = 0


cdef void _update_chunk_sparse(
        const floating[::1] X_data,                    # IN
        const int[::1] X_indices,                      # IN
        const int[::1] X_indptr,                       # IN
        const floating[::1] sample_weight,             # IN
        const floating[:, ::1] centers_old,            # IN
        const floating[::1] centers_squared_norms,     # IN
        const floating[:, ::1] center_half_distances,  # IN
        const floating[::1] distance_next_center,      # IN
        int[::1] labels,                               # INOUT
        floating[::1] upper_bounds,                    # INOUT
        floating[:, ::1] lower_bounds,                 # INOUT
        floating *centers_new,                         # OUT
        floating *weight_in_clusters,                  # OUT
        bint update_centers) noexcept nogil:
    """K-means combined EM step for one sparse data chunk.

    Compute the partial contribution of a single data chunk to the labels and
    centers.
    """
    cdef:
        int n_samples = labels.shape[0]
        int n_clusters = centers_old.shape[0]
        int n_features = centers_old.shape[1]

        floating upper_bound, distance
        int i, j, k, label
        int s = X_indptr[0]
        bint bounds_tight  # declared explicitly so it stays a C variable

    for i in range(n_samples):
        upper_bound = upper_bounds[i]
        bounds_tight = 0
        label = labels[i]

        # Next center is not far away from the currently assigned center.
        # Sample might need to be assigned to another center.
        if not distance_next_center[label] >= upper_bound:

            for j in range(n_clusters):

                # If this holds, then j is a good candidate for the
                # sample to be relabelled, and we need to confirm this by
                # recomputing the upper and lower bounds.
                if (
                    j != label
                    and (upper_bound > lower_bounds[i, j])
                    and (upper_bound > center_half_distances[label, j])
                ):

                    # Recompute upper bound by calculating the actual distance
                    # between the sample and its current assigned center.
                    if not bounds_tight:
                        upper_bound = _euclidean_sparse_dense(
                            X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
                            X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
                            centers_old[label], centers_squared_norms[label], False)
                        lower_bounds[i, label] = upper_bound
                        bounds_tight = 1

                    # If the condition still holds, then compute the actual
                    # distance between the sample and center. If this is less
                    # than the previous distance, reassign label.
                    if (
                        upper_bound > lower_bounds[i, j]
                        or (upper_bound > center_half_distances[label, j])
                    ):
                        distance = _euclidean_sparse_dense(
                            X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
                            X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
                            centers_old[j], centers_squared_norms[j], False)
                        lower_bounds[i, j] = distance
                        if distance < upper_bound:
                            label = j
                            upper_bound = distance

        labels[i] = label
        upper_bounds[i] = upper_bound

        if update_centers:
            weight_in_clusters[label] += sample_weight[i]
            for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
                centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i]
Binary file not shown.
@@ -0,0 +1,420 @@
# Licence: BSD 3 clause

from cython cimport floating
from cython.parallel import prange, parallel
from libc.stdlib cimport malloc, calloc, free
from libc.string cimport memset
from libc.float cimport DBL_MAX, FLT_MAX

from sklearn.utils._openmp_helpers cimport omp_lock_t
from sklearn.utils._openmp_helpers cimport omp_init_lock
from sklearn.utils._openmp_helpers cimport omp_destroy_lock
from sklearn.utils._openmp_helpers cimport omp_set_lock
from sklearn.utils._openmp_helpers cimport omp_unset_lock
from sklearn.utils.extmath import row_norms
from sklearn.utils._cython_blas cimport _gemm
from sklearn.utils._cython_blas cimport RowMajor, Trans, NoTrans
from sklearn.cluster._k_means_common import CHUNK_SIZE
from sklearn.cluster._k_means_common cimport _relocate_empty_clusters_dense
from sklearn.cluster._k_means_common cimport _relocate_empty_clusters_sparse
from sklearn.cluster._k_means_common cimport _average_centers, _center_shift
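

# Editor's illustration (not part of the scikit-learn sources): one Lloyd
# iteration in plain NumPy, which the chunked implementation below
# parallelises and fuses. Empty-cluster handling and the BLAS-based distance
# computation are omitted here.
def _lloyd_iter_reference(X, sample_weight, centers):
    import numpy as np  # local import keeps this illustration self-contained
    distances = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    labels = distances.argmin(axis=1)                          # E-step
    centers_new = np.zeros_like(centers)
    weight_in_clusters = np.zeros(centers.shape[0])
    np.add.at(centers_new, labels, X * sample_weight[:, None])
    np.add.at(weight_in_clusters, labels, sample_weight)
    nonzero = weight_in_clusters > 0
    centers_new[nonzero] /= weight_in_clusters[nonzero][:, None]  # M-step
    return labels, centers_new

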
def lloyd_iter_chunked_dense(
        const floating[:, ::1] X,            # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_in_clusters,    # OUT
        int[::1] labels,                     # OUT
        floating[::1] center_shift,          # OUT
        int n_threads,
        bint update_centers=True):
    """Single iteration of K-means Lloyd algorithm with dense input.

    Update labels and centers (inplace), for one iteration, distributed
    over data chunks.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features), dtype=floating
        The observations to cluster.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers before previous iteration, placeholder for the centers after
        previous iteration.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers after previous iteration, placeholder for the new centers
        computed during this iteration. `centers_new` can be `None` if
        `update_centers` is False.

    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
        Placeholder for the sums of the weights of every observation assigned
        to each center. `weight_in_clusters` can be `None` if `update_centers`
        is False.

    labels : ndarray of shape (n_samples,), dtype=int
        labels assignment.

    center_shift : ndarray of shape (n_clusters,), dtype=floating
        Distance between old and new centers.

    n_threads : int
        The number of threads to be used by openmp.

    update_centers : bool
        - If True, the labels and the new centers will be computed, i.e. runs
          the E-step and the M-step of the algorithm.
        - If False, only the labels will be computed, i.e. runs the E-step of
          the algorithm. This is useful especially when calling predict on a
          fitted model.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int n_clusters = centers_old.shape[0]

    if n_samples == 0:
        # An empty array was passed, do nothing and return early (before
        # attempting to compute n_chunks). This can typically happen when
        # calling the prediction function of a bisecting k-means model with a
        # large fraction of outliers.
        return

    cdef:
        # hard-coded number of samples per chunk. Appeared to be close to
        # optimal in all situations.
        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
        int n_chunks = n_samples // n_samples_chunk
        int n_samples_rem = n_samples % n_samples_chunk
        int chunk_idx
        int start, end

        int j, k

        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)

        floating *centers_new_chunk
        floating *weight_in_clusters_chunk
        floating *pairwise_distances_chunk

        omp_lock_t lock

    # count remainder chunk in total number of chunks
    n_chunks += n_samples != n_chunks * n_samples_chunk

    # number of threads should not be bigger than number of chunks
    n_threads = min(n_threads, n_chunks)

    if update_centers:
        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
        omp_init_lock(&lock)

    with nogil, parallel(num_threads=n_threads):
        # thread local buffers
        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
        pairwise_distances_chunk = <floating*> malloc(n_samples_chunk * n_clusters * sizeof(floating))

        for chunk_idx in prange(n_chunks, schedule='static'):
            start = chunk_idx * n_samples_chunk
            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
                end = start + n_samples_rem
            else:
                end = start + n_samples_chunk

            _update_chunk_dense(
                X[start: end],
                sample_weight[start: end],
                centers_old,
                centers_squared_norms,
                labels[start: end],
                centers_new_chunk,
                weight_in_clusters_chunk,
                pairwise_distances_chunk,
                update_centers)

        # reduction from local buffers.
        if update_centers:
            # The lock is necessary to avoid race conditions when aggregating
            # info from different thread-local buffers.
            omp_set_lock(&lock)
            for j in range(n_clusters):
                weight_in_clusters[j] += weight_in_clusters_chunk[j]
                for k in range(n_features):
|
||||
centers_new[j, k] += centers_new_chunk[j * n_features + k]
|
||||
|
||||
omp_unset_lock(&lock)
|
||||
|
||||
free(centers_new_chunk)
|
||||
free(weight_in_clusters_chunk)
|
||||
free(pairwise_distances_chunk)
|
||||
|
||||
if update_centers:
|
||||
omp_destroy_lock(&lock)
|
||||
_relocate_empty_clusters_dense(
|
||||
X, sample_weight, centers_old, centers_new, weight_in_clusters, labels
|
||||
)
|
||||
|
||||
_average_centers(centers_new, weight_in_clusters)
|
||||
_center_shift(centers_old, centers_new, center_shift)
|
||||
|
||||
|
||||
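The chunked kernel above is an optimized form of one full Lloyd iteration. For reference, the same E-step/M-step pair in plain NumPy (a sketch that ignores chunking, OpenMP threads and empty-cluster relocation):

import numpy as np

def lloyd_iter_dense_reference(X, sample_weight, centers_old):
    # E-step: assign each sample to its nearest center.
    sq_norms = (centers_old ** 2).sum(axis=1)
    # argmin over ||x - c||^2 equals argmin over -2 x.c + ||c||^2
    labels = (X @ centers_old.T * -2 + sq_norms).argmin(axis=1)
    # M-step: each center becomes the weighted mean of its members.
    n_clusters = centers_old.shape[0]
    centers_new = np.zeros_like(centers_old)
    weight_in_clusters = np.zeros(n_clusters)
    np.add.at(weight_in_clusters, labels, sample_weight)
    np.add.at(centers_new, labels, X * sample_weight[:, None])
    centers_new /= weight_in_clusters[:, None]  # assumes no empty cluster
    center_shift = np.linalg.norm(centers_new - centers_old, axis=1)
    return labels, centers_new, weight_in_clusters, center_shift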
cdef void _update_chunk_dense(
        const floating[:, ::1] X,                   # IN
        const floating[::1] sample_weight,          # IN
        const floating[:, ::1] centers_old,         # IN
        const floating[::1] centers_squared_norms,  # IN
        int[::1] labels,                            # OUT
        floating *centers_new,                      # OUT
        floating *weight_in_clusters,               # OUT
        floating *pairwise_distances,               # OUT
        bint update_centers) noexcept nogil:
    """K-means combined EM step for one dense data chunk.

    Compute the partial contribution of a single data chunk to the labels and
    centers.
    """
    cdef:
        int n_samples = labels.shape[0]
        int n_clusters = centers_old.shape[0]
        int n_features = centers_old.shape[1]

        floating sq_dist, min_sq_dist
        int i, j, k, label

    # Instead of computing the full pairwise squared distances matrix,
    # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store
    # the - 2 X.C^T + ||C||² term since the argmin for a given sample only
    # depends on the centers.
    # pairwise_distances = ||C||²
    for i in range(n_samples):
        for j in range(n_clusters):
            pairwise_distances[i * n_clusters + j] = centers_squared_norms[j]

    # pairwise_distances += -2 * X.dot(C.T)
    _gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features,
          -2.0, &X[0, 0], n_features, &centers_old[0, 0], n_features,
          1.0, pairwise_distances, n_clusters)

    for i in range(n_samples):
        min_sq_dist = pairwise_distances[i * n_clusters]
        label = 0
        for j in range(1, n_clusters):
            sq_dist = pairwise_distances[i * n_clusters + j]
            if sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                label = j
        labels[i] = label

        if update_centers:
            weight_in_clusters[label] += sample_weight[i]
            for k in range(n_features):
                centers_new[label * n_features + k] += X[i, k] * sample_weight[i]
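The comment in _update_chunk_dense relies on the identity argmin_j ||x - c_j||^2 = argmin_j (-2 x.c_j + ||c_j||^2), since ||x||^2 does not depend on j. A quick NumPy sanity check of that equivalence (illustrative only):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(10, 3))
C = rng.normal(size=(4, 3))

full = ((X[:, None, :] - C[None, :, :]) ** 2).sum(-1).argmin(1)
trick = (-2 * X @ C.T + (C ** 2).sum(1)).argmin(1)
assert (full == trick).all()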
def lloyd_iter_chunked_sparse(
        X,                                   # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_in_clusters,    # OUT
        int[::1] labels,                     # OUT
        floating[::1] center_shift,          # OUT
        int n_threads,
        bint update_centers=True):
    """Single iteration of K-means lloyd algorithm with sparse input.

    Update labels and centers (inplace), for one iteration, distributed
    over data chunks.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features), dtype=floating
        The observations to cluster. Must be in CSR format.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers before previous iteration, placeholder for the centers after
        previous iteration.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers after previous iteration, placeholder for the new centers
        computed during this iteration. `centers_new` can be `None` if
        `update_centers` is False.

    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
        Placeholder for the sums of the weights of every observation assigned
        to each center. `weight_in_clusters` can be `None` if `update_centers`
        is False.

    labels : ndarray of shape (n_samples,), dtype=int
        Labels assignment.

    center_shift : ndarray of shape (n_clusters,), dtype=floating
        Distance between old and new centers.

    n_threads : int
        The number of threads to be used by openmp.

    update_centers : bool
        - If True, the labels and the new centers will be computed, i.e. runs
          the E-step and the M-step of the algorithm.
        - If False, only the labels will be computed, i.e. runs the E-step of
          the algorithm. This is useful especially when calling predict on a
          fitted model.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int n_clusters = centers_old.shape[0]

    if n_samples == 0:
        # An empty array was passed, do nothing and return early (before
        # attempting to compute n_chunks). This can typically happen when
        # calling the prediction function of a bisecting k-means model with a
        # large fraction of outliers.
        return

    cdef:
        # Choose same as for dense. Does not have the same impact since with
        # sparse data the pairwise distances matrix is not precomputed.
        # However, splitting in chunks is necessary to get parallelism.
        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
        int n_chunks = n_samples // n_samples_chunk
        int n_samples_rem = n_samples % n_samples_chunk
        int chunk_idx
        int start = 0, end = 0

        int j, k

        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr

        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)

        floating *centers_new_chunk
        floating *weight_in_clusters_chunk

        omp_lock_t lock

    # count remainder chunk in total number of chunks
    n_chunks += n_samples != n_chunks * n_samples_chunk

    # number of threads should not be bigger than number of chunks
    n_threads = min(n_threads, n_chunks)

    if update_centers:
        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
        omp_init_lock(&lock)

    with nogil, parallel(num_threads=n_threads):
        # thread local buffers
        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))

        for chunk_idx in prange(n_chunks, schedule='static'):
            start = chunk_idx * n_samples_chunk
            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
                end = start + n_samples_rem
            else:
                end = start + n_samples_chunk

            _update_chunk_sparse(
                X_data[X_indptr[start]: X_indptr[end]],
                X_indices[X_indptr[start]: X_indptr[end]],
                X_indptr[start: end+1],
                sample_weight[start: end],
                centers_old,
                centers_squared_norms,
                labels[start: end],
                centers_new_chunk,
                weight_in_clusters_chunk,
                update_centers)

        # reduction from local buffers.
        if update_centers:
            # The lock is necessary to avoid race conditions when aggregating
            # info from different thread-local buffers.
            omp_set_lock(&lock)
            for j in range(n_clusters):
                weight_in_clusters[j] += weight_in_clusters_chunk[j]
                for k in range(n_features):
                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
            omp_unset_lock(&lock)

        free(centers_new_chunk)
        free(weight_in_clusters_chunk)

    if update_centers:
        omp_destroy_lock(&lock)
        _relocate_empty_clusters_sparse(
            X_data, X_indices, X_indptr, sample_weight,
            centers_old, centers_new, weight_in_clusters, labels)

        _average_centers(centers_new, weight_in_clusters)
        _center_shift(centers_old, centers_new, center_shift)
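The three slices passed to _update_chunk_sparse carve one row-chunk out of a CSR matrix without copying the whole array; inside the kernel, s = X_indptr[0] rebases the indptr offsets to the chunk. A small scipy.sparse illustration of the same bookkeeping (a sketch):

import numpy as np
from scipy import sparse

X = sparse.random(10, 5, density=0.4, format="csr", random_state=0)
start, end = 3, 7  # rows of one chunk

chunk_data = X.data[X.indptr[start]: X.indptr[end]]
chunk_indices = X.indices[X.indptr[start]: X.indptr[end]]
chunk_indptr = X.indptr[start: end + 1]  # offsets still refer to the full array

# Inside the kernel, s = chunk_indptr[0] rebases the offsets to the chunk.
s = chunk_indptr[0]
row0 = chunk_data[chunk_indptr[0] - s: chunk_indptr[1] - s]
assert np.allclose(row0, X[start].data)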
cdef void _update_chunk_sparse(
        const floating[::1] X_data,                 # IN
        const int[::1] X_indices,                   # IN
        const int[::1] X_indptr,                    # IN
        const floating[::1] sample_weight,          # IN
        const floating[:, ::1] centers_old,         # IN
        const floating[::1] centers_squared_norms,  # IN
        int[::1] labels,                            # OUT
        floating *centers_new,                      # OUT
        floating *weight_in_clusters,               # OUT
        bint update_centers) noexcept nogil:
    """K-means combined EM step for one sparse data chunk.

    Compute the partial contribution of a single data chunk to the labels and
    centers.
    """
    cdef:
        int n_samples = labels.shape[0]
        int n_clusters = centers_old.shape[0]
        int n_features = centers_old.shape[1]

        floating sq_dist, min_sq_dist
        int i, j, k, label
        floating max_floating = FLT_MAX if floating is float else DBL_MAX
        int s = X_indptr[0]

    # XXX Precomputing the pairwise distances matrix is currently not worth it
    # for sparse input. This should be revisited when a BLAS (sparse x dense)
    # matrix multiplication becomes available.
    for i in range(n_samples):
        min_sq_dist = max_floating
        label = 0

        for j in range(n_clusters):
            sq_dist = 0.0
            for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
                sq_dist += centers_old[j, X_indices[k]] * X_data[k]

            # Instead of computing the full squared distance with each cluster,
            # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to compute
            # the - 2 X.C^T + ||C||² term since the argmin for a given sample
            # only depends on the centers C.
            sq_dist = centers_squared_norms[j] - 2 * sq_dist
            if sq_dist < min_sq_dist:
                min_sq_dist = sq_dist
                label = j

        labels[i] = label

        if update_centers:
            weight_in_clusters[label] += sample_weight[i]
            for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
                centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i]
Binary file not shown.
@@ -0,0 +1,218 @@
from cython cimport floating
from cython.parallel cimport parallel, prange
from libc.stdlib cimport malloc, free


def _minibatch_update_dense(
        const floating[:, ::1] X,            # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_sums,           # INOUT
        const int[::1] labels,               # IN
        int n_threads):
    """Update of the centers for dense MiniBatchKMeans.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features), dtype=floating
        The observations to cluster.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers before previous iteration, placeholder for the centers after
        previous iteration.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers after previous iteration, placeholder for the new centers
        computed during this iteration.

    weight_sums : ndarray of shape (n_clusters,), dtype=floating
        Current sums of the accumulated weights for each center.

    labels : ndarray of shape (n_samples,), dtype=int
        Labels assignment.

    n_threads : int
        The number of threads to be used by openmp.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_clusters = centers_old.shape[0]
        int cluster_idx

        int *indices

    with nogil, parallel(num_threads=n_threads):
        indices = <int*> malloc(n_samples * sizeof(int))

        for cluster_idx in prange(n_clusters, schedule="static"):
            update_center_dense(cluster_idx, X, sample_weight,
                                centers_old, centers_new, weight_sums, labels,
                                indices)

        free(indices)
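update_center_dense below implements the classic MiniBatchKMeans running weighted mean: undo the previous scaling of the center, accumulate the members of the current batch, then rescale by the updated weight sum, i.e. c_new = (c_old * W + sum_i w_i x_i) / (W + sum_i w_i). A small NumPy check of that identity (a sketch with made-up values, one cluster only):

import numpy as np

c_old, W = np.array([1.0, 2.0]), 4.0        # current center and its weight sum
batch = np.array([[2.0, 2.0], [4.0, 0.0]])  # batch members assigned to this cluster
w = np.array([1.0, 3.0])                    # their sample weights

c_new = (c_old * W + (batch * w[:, None]).sum(axis=0)) / (W + w.sum())

# Same as the weighted mean over old mass plus new points.
pseudo_points = np.vstack([c_old, batch])
pseudo_w = np.concatenate([[W], w])
assert np.allclose(c_new, np.average(pseudo_points, axis=0, weights=pseudo_w))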
cdef void update_center_dense(
        int cluster_idx,
        const floating[:, ::1] X,            # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_sums,           # INOUT
        const int[::1] labels,               # IN
        int *indices) noexcept nogil:        # TMP
    """Update of a single center for dense MiniBatchKMeans."""
    cdef:
        int n_samples = sample_weight.shape[0]
        int n_features = centers_old.shape[1]
        floating alpha
        int n_indices
        int k, sample_idx, feature_idx

        floating wsum = 0

    # indices = np.where(labels == cluster_idx)[0]
    k = 0
    for sample_idx in range(n_samples):
        if labels[sample_idx] == cluster_idx:
            indices[k] = sample_idx
            wsum += sample_weight[sample_idx]
            k += 1
    n_indices = k

    if wsum > 0:
        # Undo the previous count-based scaling for this cluster center
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx]

        # Update cluster with new point members
        for k in range(n_indices):
            sample_idx = indices[k]
            for feature_idx in range(n_features):
                centers_new[cluster_idx, feature_idx] += X[sample_idx, feature_idx] * sample_weight[sample_idx]

        # Update the count statistics for this center
        weight_sums[cluster_idx] += wsum

        # Rescale to compute mean of all points (old and new)
        alpha = 1 / weight_sums[cluster_idx]
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] *= alpha
    else:
        # No sample was assigned to this cluster in this batch of data
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx]
def _minibatch_update_sparse(
        X,                                   # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_sums,           # INOUT
        const int[::1] labels,               # IN
        int n_threads):
    """Update of the centers for sparse MiniBatchKMeans.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features), dtype=floating
        The observations to cluster. Must be in CSR format.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers before previous iteration, placeholder for the centers after
        previous iteration.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers after previous iteration, placeholder for the new centers
        computed during this iteration.

    weight_sums : ndarray of shape (n_clusters,), dtype=floating
        Current sums of the accumulated weights for each center.

    labels : ndarray of shape (n_samples,), dtype=int
        Labels assignment.

    n_threads : int
        The number of threads to be used by openmp.
    """
    cdef:
        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr
        int n_samples = X.shape[0]
        int n_clusters = centers_old.shape[0]
        int cluster_idx

        int *indices

    with nogil, parallel(num_threads=n_threads):
        indices = <int*> malloc(n_samples * sizeof(int))

        for cluster_idx in prange(n_clusters, schedule="static"):
            update_center_sparse(cluster_idx, X_data, X_indices, X_indptr,
                                 sample_weight, centers_old, centers_new,
                                 weight_sums, labels, indices)

        free(indices)


cdef void update_center_sparse(
        int cluster_idx,
        const floating[::1] X_data,          # IN
        const int[::1] X_indices,            # IN
        const int[::1] X_indptr,             # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_sums,           # INOUT
        const int[::1] labels,               # IN
        int *indices) noexcept nogil:        # TMP
    """Update of a single center for sparse MiniBatchKMeans."""
    cdef:
        int n_samples = sample_weight.shape[0]
        int n_features = centers_old.shape[1]
        floating alpha
        int n_indices
        int k, sample_idx, feature_idx

        floating wsum = 0

    # indices = np.where(labels == cluster_idx)[0]
    k = 0
    for sample_idx in range(n_samples):
        if labels[sample_idx] == cluster_idx:
            indices[k] = sample_idx
            wsum += sample_weight[sample_idx]
            k += 1
    n_indices = k

    if wsum > 0:
        # Undo the previous count-based scaling for this cluster center:
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx]

        # Update cluster with new point members
        for k in range(n_indices):
            sample_idx = indices[k]
            for feature_idx in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]):
                centers_new[cluster_idx, X_indices[feature_idx]] += X_data[feature_idx] * sample_weight[sample_idx]

        # Update the count statistics for this center
        weight_sums[cluster_idx] += wsum

        # Rescale to compute mean of all points (old and new)
        alpha = 1 / weight_sums[cluster_idx]
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] *= alpha
    else:
        # No sample was assigned to this cluster in this batch of data
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx]
File diff suppressed because it is too large
@@ -0,0 +1,579 @@
"""Mean shift clustering algorithm.

Mean shift clustering aims to discover *blobs* in a smooth density of
samples. It is a centroid based algorithm, which works by updating candidates
for centroids to be the mean of the points within a given region. These
candidates are then filtered in a post-processing stage to eliminate
near-duplicates to form the final set of centroids.

Seeding is performed using a binning technique for scalability.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from collections import defaultdict
from numbers import Integral, Real

import numpy as np

from sklearn._config import config_context
from sklearn.base import BaseEstimator, ClusterMixin, _fit_context
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array, check_random_state, gen_batches
from sklearn.utils._param_validation import Interval, validate_params
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import check_is_fitted, validate_data


@validate_params(
    {
        "X": ["array-like"],
        "quantile": [Interval(Real, 0, 1, closed="both")],
        "n_samples": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
        "n_jobs": [Integral, None],
    },
    prefer_skip_nested_validation=True,
)
def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None):
    """Estimate the bandwidth to use with the mean-shift algorithm.

    This function takes time at least quadratic in `n_samples`. For large
    datasets, it is wise to subsample by setting `n_samples`. Alternatively,
    the parameter `bandwidth` can be set to a small value without estimating
    it.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input points.

    quantile : float, default=0.3
        Should be between [0, 1]; 0.5 means that the median of all pairwise
        distances is used.

    n_samples : int, default=None
        The number of samples to use. If not given, all samples are used.

    random_state : int, RandomState instance, default=None
        The generator used to randomly select the samples from input points
        for bandwidth estimation. Use an int to make the randomness
        deterministic.
        See :term:`Glossary <random_state>`.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    bandwidth : float
        The bandwidth parameter.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import estimate_bandwidth
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> estimate_bandwidth(X, quantile=0.5)
    np.float64(1.61)
    """
    X = check_array(X)

    random_state = check_random_state(random_state)
    if n_samples is not None:
        idx = random_state.permutation(X.shape[0])[:n_samples]
        X = X[idx]
    n_neighbors = int(X.shape[0] * quantile)
    if n_neighbors < 1:  # cannot fit NearestNeighbors with n_neighbors = 0
        n_neighbors = 1
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=n_jobs)
    nbrs.fit(X)

    bandwidth = 0.0
    for batch in gen_batches(len(X), 500):
        d, _ = nbrs.kneighbors(X[batch, :], return_distance=True)
        bandwidth += np.max(d, axis=1).sum()

    return bandwidth / X.shape[0]
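Put differently, the estimate is the average distance from each (sub)sampled point to its k-th nearest neighbor, with k = quantile * n_samples. A direct NumPy restatement (a sketch; quadratic in the number of samples, unlike the batched NearestNeighbors version above):

import numpy as np

def estimate_bandwidth_reference(X, quantile=0.3):
    D = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
    k = max(int(len(X) * quantile), 1)
    # Distance to the k-th nearest neighbor, the point itself included,
    # matching n_neighbors=k in the NearestNeighbors-based version.
    kth = np.sort(D, axis=1)[:, k - 1]
    return kth.mean()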
# separate function for each seed's iterative loop
def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
    # For each seed, climb gradient until convergence or max_iter
    bandwidth = nbrs.get_params()["radius"]
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    completed_iterations = 0
    while True:
        # Find mean of points within bandwidth
        i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth, return_distance=False)[0]
        points_within = X[i_nbrs]
        if len(points_within) == 0:
            break  # Depending on seeding strategy this condition may occur
        my_old_mean = my_mean  # save the old mean
        my_mean = np.mean(points_within, axis=0)
        # If converged or at max_iter, add the cluster
        if (
            np.linalg.norm(my_mean - my_old_mean) <= stop_thresh
            or completed_iterations == max_iter
        ):
            break
        completed_iterations += 1
    return tuple(my_mean), len(points_within), completed_iterations


@validate_params(
    {"X": ["array-like"]},
    prefer_skip_nested_validation=False,
)
def mean_shift(
    X,
    *,
    bandwidth=None,
    seeds=None,
    bin_seeding=False,
    min_bin_freq=1,
    cluster_all=True,
    max_iter=300,
    n_jobs=None,
):
    """Perform mean shift clustering of data using a flat kernel.

    Read more in the :ref:`User Guide <mean_shift>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data.

    bandwidth : float, default=None
        Kernel bandwidth. If not None, must be in the range [0, +inf).

        If None, the bandwidth is determined using a heuristic based on
        the median of all pairwise distances. This will take quadratic time in
        the number of samples. The sklearn.cluster.estimate_bandwidth function
        can be used to do this more efficiently.

    seeds : array-like of shape (n_seeds, n_features) or None
        Points used as initial kernel locations. If None and bin_seeding=False,
        each data point is used as a seed. If None and bin_seeding=True,
        see bin_seeding.

    bin_seeding : bool, default=False
        If true, initial kernel locations are not locations of all
        points, but rather the location of the discretized version of
        points, where points are binned onto a grid whose coarseness
        corresponds to the bandwidth. Setting this option to True will speed
        up the algorithm because fewer seeds will be initialized.
        Ignored if seeds argument is not None.

    min_bin_freq : int, default=1
        To speed up the algorithm, accept only those bins with at least
        min_bin_freq points as seeds.

    cluster_all : bool, default=True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.

    max_iter : int, default=300
        Maximum number of iterations, per seed point, before the clustering
        operation terminates (for that seed point) if it has not converged yet.

    n_jobs : int, default=None
        The number of jobs to use for the computation. The following tasks benefit
        from the parallelization:

        - The search of nearest neighbors for bandwidth estimation and label
          assignments. See the details in the docstring of the
          ``NearestNeighbors`` class.
        - Hill-climbing optimization for all seeds.

        See :term:`Glossary <n_jobs>` for more details.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.

        .. versionadded:: 0.17
           Parallel Execution using *n_jobs*.

    Returns
    -------
    cluster_centers : ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers.

    labels : ndarray of shape (n_samples,)
        Cluster labels for each point.

    Notes
    -----
    For a usage example, see
    :ref:`sphx_glr_auto_examples_cluster_plot_mean_shift.py`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import mean_shift
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> cluster_centers, labels = mean_shift(X, bandwidth=2)
    >>> cluster_centers
    array([[3.33, 6.  ],
           [1.33, 0.66]])
    >>> labels
    array([1, 1, 1, 0, 0, 0])
    """
    model = MeanShift(
        bandwidth=bandwidth,
        seeds=seeds,
        min_bin_freq=min_bin_freq,
        bin_seeding=bin_seeding,
        cluster_all=cluster_all,
        n_jobs=n_jobs,
        max_iter=max_iter,
    ).fit(X)
    return model.cluster_centers_, model.labels_


def get_bin_seeds(X, bin_size, min_bin_freq=1):
    """Find seeds for mean_shift.

    Finds seeds by first binning data onto a grid whose lines are
    spaced bin_size apart, and then choosing those bins with at least
    min_bin_freq points.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input points, the same points that will be used in mean_shift.

    bin_size : float
        Controls the coarseness of the binning. Smaller values lead
        to more seeding (which is computationally more expensive). If you're
        not sure how to set this, set it to the value of the bandwidth used
        in clustering.mean_shift.

    min_bin_freq : int, default=1
        Only bins with at least min_bin_freq points will be selected as seeds.
        Raising this value decreases the number of seeds found, which
        makes mean_shift computationally cheaper.

    Returns
    -------
    bin_seeds : array-like of shape (n_samples, n_features)
        Points used as initial kernel positions in clustering.mean_shift.
    """
    if bin_size == 0:
        return X

    # Bin points
    bin_sizes = defaultdict(int)
    for point in X:
        binned_point = np.round(point / bin_size)
        bin_sizes[tuple(binned_point)] += 1

    # Select only those bins as seeds which have enough members
    bin_seeds = np.array(
        [point for point, freq in bin_sizes.items() if freq >= min_bin_freq],
        dtype=np.float32,
    )
    if len(bin_seeds) == len(X):
        warnings.warn(
            "Binning data failed with provided bin_size=%f, using data points as seeds."
            % bin_size
        )
        return X
    bin_seeds = bin_seeds * bin_size
    return bin_seeds
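A quick illustration of the binning performed by get_bin_seeds: with bin_size=1, points snap to the integer grid, and only bins with at least min_bin_freq members survive as seeds (toy values):

import numpy as np
from sklearn.cluster import get_bin_seeds

X = np.array([[0.9, 1.1], [1.1, 0.9], [1.0, 1.0], [5.0, 5.0]])
# Three points share bin (1, 1); the outlier sits alone in bin (5, 5).
print(get_bin_seeds(X, bin_size=1, min_bin_freq=2))  # roughly [[1. 1.]]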
class MeanShift(ClusterMixin, BaseEstimator):
    """Mean shift clustering using a flat kernel.

    Mean shift clustering aims to discover "blobs" in a smooth density of
    samples. It is a centroid-based algorithm, which works by updating
    candidates for centroids to be the mean of the points within a given
    region. These candidates are then filtered in a post-processing stage to
    eliminate near-duplicates to form the final set of centroids.

    Seeding is performed using a binning technique for scalability.

    For an example of how to use MeanShift clustering, refer to:
    :ref:`sphx_glr_auto_examples_cluster_plot_mean_shift.py`.

    Read more in the :ref:`User Guide <mean_shift>`.

    Parameters
    ----------
    bandwidth : float, default=None
        Bandwidth used in the flat kernel.

        If not given, the bandwidth is estimated using
        sklearn.cluster.estimate_bandwidth; see the documentation for that
        function for hints on scalability (see also the Notes, below).

    seeds : array-like of shape (n_samples, n_features), default=None
        Seeds used to initialize kernels. If not set,
        the seeds are calculated by clustering.get_bin_seeds
        with bandwidth as the grid size and default values for
        other parameters.

    bin_seeding : bool, default=False
        If true, initial kernel locations are not locations of all
        points, but rather the location of the discretized version of
        points, where points are binned onto a grid whose coarseness
        corresponds to the bandwidth. Setting this option to True will speed
        up the algorithm because fewer seeds will be initialized.
        Ignored if seeds argument is not None.

    min_bin_freq : int, default=1
        To speed up the algorithm, accept only those bins with at least
        min_bin_freq points as seeds.

    cluster_all : bool, default=True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.

    n_jobs : int, default=None
        The number of jobs to use for the computation. The following tasks benefit
        from the parallelization:

        - The search of nearest neighbors for bandwidth estimation and label
          assignments. See the details in the docstring of the
          ``NearestNeighbors`` class.
        - Hill-climbing optimization for all seeds.

        See :term:`Glossary <n_jobs>` for more details.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.

    max_iter : int, default=300
        Maximum number of iterations, per seed point, before the clustering
        operation terminates (for that seed point) if it has not converged yet.

        .. versionadded:: 0.22

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    n_iter_ : int
        Maximum number of iterations performed on each seed.

        .. versionadded:: 0.22

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    KMeans : K-Means clustering.

    Notes
    -----
    Scalability:

    Because this implementation uses a flat kernel and
    a Ball Tree to look up members of each kernel, the complexity will tend
    towards O(T*n*log(n)) in lower dimensions, with n the number of samples
    and T the number of points. In higher dimensions the complexity will
    tend towards O(T*n^2).

    Scalability can be boosted by using fewer seeds, for example by using
    a higher value of min_bin_freq in the get_bin_seeds function.

    Note that the estimate_bandwidth function is much less scalable than the
    mean shift algorithm and will be the bottleneck if it is used.

    References
    ----------
    Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward
    feature space analysis". IEEE Transactions on Pattern Analysis and
    Machine Intelligence. 2002. pp. 603-619.

    Examples
    --------
    >>> from sklearn.cluster import MeanShift
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = MeanShift(bandwidth=2).fit(X)
    >>> clustering.labels_
    array([1, 1, 1, 0, 0, 0])
    >>> clustering.predict([[0, 0], [5, 5]])
    array([1, 0])
    >>> clustering
    MeanShift(bandwidth=2)

    For a comparison of Mean Shift clustering with other clustering algorithms, see
    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
    """

    _parameter_constraints: dict = {
        "bandwidth": [Interval(Real, 0, None, closed="neither"), None],
        "seeds": ["array-like", None],
        "bin_seeding": ["boolean"],
        "min_bin_freq": [Interval(Integral, 1, None, closed="left")],
        "cluster_all": ["boolean"],
        "n_jobs": [Integral, None],
        "max_iter": [Interval(Integral, 0, None, closed="left")],
    }

    def __init__(
        self,
        *,
        bandwidth=None,
        seeds=None,
        bin_seeding=False,
        min_bin_freq=1,
        cluster_all=True,
        n_jobs=None,
        max_iter=300,
    ):
        self.bandwidth = bandwidth
        self.seeds = seeds
        self.bin_seeding = bin_seeding
        self.cluster_all = cluster_all
        self.min_bin_freq = min_bin_freq
        self.n_jobs = n_jobs
        self.max_iter = max_iter

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Perform clustering.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to cluster.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted instance.
        """
        X = validate_data(self, X)
        bandwidth = self.bandwidth
        if bandwidth is None:
            bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)

        seeds = self.seeds
        if seeds is None:
            if self.bin_seeding:
                seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)
            else:
                seeds = X
        n_samples, n_features = X.shape
        center_intensity_dict = {}

        # We use n_jobs=1 because this will be used in nested calls under
        # parallel calls to _mean_shift_single_seed so there is no need
        # for further parallelism.
        nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)

        # execute iterations on all seeds in parallel
        all_res = Parallel(n_jobs=self.n_jobs)(
            delayed(_mean_shift_single_seed)(seed, X, nbrs, self.max_iter)
            for seed in seeds
        )
        # copy results in a dictionary
        for i in range(len(seeds)):
            if all_res[i][1]:  # i.e. len(points_within) > 0
                center_intensity_dict[all_res[i][0]] = all_res[i][1]

        self.n_iter_ = max([x[2] for x in all_res])

        if not center_intensity_dict:
            # nothing near seeds
            raise ValueError(
                "No point was within bandwidth=%f of any seed. Try a different seeding"
                " strategy or increase the bandwidth."
                % bandwidth
            )

        # POST PROCESSING: remove near duplicate points
        # If the distance between two kernels is less than the bandwidth,
        # then we have to remove one because it is a duplicate. Remove the
        # one with fewer points.

        sorted_by_intensity = sorted(
            center_intensity_dict.items(),
            key=lambda tup: (tup[1], tup[0]),
            reverse=True,
        )
        sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
        unique = np.ones(len(sorted_centers), dtype=bool)
        nbrs = NearestNeighbors(radius=bandwidth, n_jobs=self.n_jobs).fit(
            sorted_centers
        )
        for i, center in enumerate(sorted_centers):
            if unique[i]:
                neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[
                    0
                ]
                unique[neighbor_idxs] = 0
                unique[i] = 1  # leave the current point as unique
        cluster_centers = sorted_centers[unique]

        # ASSIGN LABELS: a point belongs to the cluster that it is closest to
        nbrs = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs).fit(cluster_centers)
        labels = np.zeros(n_samples, dtype=int)
        distances, idxs = nbrs.kneighbors(X)
        if self.cluster_all:
            labels = idxs.flatten()
        else:
            labels.fill(-1)
            bool_selector = distances.flatten() <= bandwidth
            labels[bool_selector] = idxs.flatten()[bool_selector]

        self.cluster_centers_, self.labels_ = cluster_centers, labels
        return self
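The post-processing step above is a greedy suppression: candidate centers are visited in decreasing intensity, and each surviving center removes all other candidates within one bandwidth. A compact NumPy restatement of the same idea (a sketch; `centers` and `intensities` are arrays, not the estimator's internal state):

import numpy as np

def dedup_centers(centers, intensities, bandwidth):
    order = np.argsort(-intensities)      # strongest kernels first
    centers = centers[order]
    keep = np.ones(len(centers), dtype=bool)
    for i, c in enumerate(centers):
        if keep[i]:
            dist = np.linalg.norm(centers - c, axis=1)
            keep &= dist > bandwidth      # drop everything within reach...
            keep[i] = True                # ...except the current center
    return centers[keep]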
    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            New data to predict.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self)
        X = validate_data(self, X, reset=False)
        with config_context(assume_finite=True):
            return pairwise_distances_argmin(X, self.cluster_centers_)
File diff suppressed because it is too large
@@ -0,0 +1,805 @@
"""Algorithms for spectral clustering"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from numbers import Integral, Real

import numpy as np
from scipy.linalg import LinAlgError, qr, svd
from scipy.sparse import csc_matrix

from sklearn.base import BaseEstimator, ClusterMixin, _fit_context
from sklearn.cluster._kmeans import k_means
from sklearn.manifold._spectral_embedding import _spectral_embedding
from sklearn.metrics.pairwise import KERNEL_PARAMS, pairwise_kernels
from sklearn.neighbors import NearestNeighbors, kneighbors_graph
from sklearn.utils import as_float_array, check_random_state
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
from sklearn.utils.validation import validate_data


def cluster_qr(vectors):
    """Find the discrete partition closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    .. versionadded:: 1.1

    Parameters
    ----------
    vectors : array-like, shape: (n_samples, n_clusters)
        The embedding space of the samples.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The cluster labels of vectors.

    References
    ----------
    .. [1] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
        Anil Damle, Victor Minden, Lexing Ying
        <10.1093/imaiai/iay008>`
    """
    k = vectors.shape[1]
    _, _, piv = qr(vectors.T, pivoting=True)
    ut, _, v = svd(vectors[piv[:k], :].T)
    vectors = abs(np.dot(vectors, np.dot(ut, v.conj())))
    return vectors.argmax(axis=1)
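A usage sketch for cluster_qr on a toy embedding. Note that cluster_qr is not exported from sklearn.cluster, so this imports it from the private module added in this commit; the label numbering is arbitrary:

import numpy as np
from sklearn.cluster._spectral import cluster_qr

# Two well-separated directions in a 2-D embedding space.
embedding = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
print(cluster_qr(embedding))  # e.g. [0 0 1 1]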
def discretize(
    vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None
):
    """Search for a partition matrix which is closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    Parameters
    ----------
    vectors : array-like of shape (n_samples, n_clusters)
        The embedding space of the samples.

    copy : bool, default=True
        Whether to copy vectors, or perform in-place normalization.

    max_svd_restarts : int, default=30
        Maximum number of attempts to restart SVD if convergence fails.

    n_iter_max : int, default=20
        Maximum number of iterations to attempt in rotation and partition
        matrix search if machine precision convergence is not reached.

    random_state : int, RandomState instance, default=None
        Determines random number generation for rotation matrix initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------
    .. [1] `Multiclass spectral clustering, 2003
        Stella X. Yu, Jianbo Shi
        <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    Notes
    -----
    The eigenvector embedding is used to iteratively search for the
    closest discrete partition. First, the eigenvector embedding is
    normalized to the space of partition matrices. An optimal discrete
    partition matrix closest to this normalized embedding multiplied by
    an initial rotation is calculated. Fixing this discrete partition
    matrix, an optimal rotation matrix is calculated. These two
    calculations are performed until convergence. The discrete partition
    matrix is returned as the clustering solution. Used in spectral
    clustering, this method tends to be faster and more robust to random
    initialization than k-means.
    """
    random_state = check_random_state(random_state)

    vectors = as_float_array(vectors, copy=copy)

    eps = np.finfo(float).eps
    n_samples, n_components = vectors.shape

    # Normalize the eigenvectors to an equal length of a vector of ones.
    # Reorient the eigenvectors to point in the negative direction with respect
    # to the first element. This may have to do with constraining the
    # eigenvectors to lie in a specific quadrant to make the discretization
    # search easier.
    norm_ones = np.sqrt(n_samples)
    for i in range(vectors.shape[1]):
        vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) * norm_ones
        if vectors[0, i] != 0:
            vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])

    # Normalize the rows of the eigenvectors. Samples should lie on the unit
    # hypersphere centered at the origin. This transforms the samples in the
    # embedding space to the space of partition matrices.
    vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis]

    svd_restarts = 0
    has_converged = False

    # If there is an exception we try to randomize and rerun SVD again,
    # doing this at most max_svd_restarts times.
    while (svd_restarts < max_svd_restarts) and not has_converged:
        # Initialize first column of rotation matrix with a row of the
        # eigenvectors
        rotation = np.zeros((n_components, n_components))
        rotation[:, 0] = vectors[random_state.randint(n_samples), :].T

        # To initialize the rest of the rotation matrix, find the rows
        # of the eigenvectors that are as orthogonal to each other as
        # possible
        c = np.zeros(n_samples)
        for j in range(1, n_components):
            # Accumulate c to ensure row is as orthogonal as possible to
            # previous picks as well as current one
            c += np.abs(np.dot(vectors, rotation[:, j - 1]))
            rotation[:, j] = vectors[c.argmin(), :].T

        last_objective_value = 0.0
        n_iter = 0

        while not has_converged:
            n_iter += 1

            t_discrete = np.dot(vectors, rotation)

            labels = t_discrete.argmax(axis=1)
            vectors_discrete = csc_matrix(
                (np.ones(len(labels)), (np.arange(0, n_samples), labels)),
                shape=(n_samples, n_components),
            )

            t_svd = vectors_discrete.T @ vectors

            try:
                U, S, Vh = np.linalg.svd(t_svd)
            except LinAlgError:
                svd_restarts += 1
                print("SVD did not converge, randomizing and trying again")
                break

            ncut_value = 2.0 * (n_samples - S.sum())
            if (abs(ncut_value - last_objective_value) < eps) or (n_iter > n_iter_max):
                has_converged = True
            else:
                # otherwise calculate rotation and continue
                last_objective_value = ncut_value
                rotation = np.dot(Vh.T, U.T)

    if not has_converged:
        raise LinAlgError("SVD did not converge")
    return labels
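discretize alternates two closed-form steps: with the rotation fixed, the row-wise argmax gives the nearest discrete partition; with the partition fixed, the best rotation is the solution of an orthogonal Procrustes problem, which is exactly the Vh.T @ U.T update in the inner loop above. The Procrustes step in isolation (a sketch):

import numpy as np

def best_rotation(partition_onehot, vectors):
    # Orthogonal Procrustes: the rotation R minimizing
    # ||partition_onehot - vectors @ R||_F over orthogonal matrices.
    U, _, Vh = np.linalg.svd(partition_onehot.T @ vectors)
    return Vh.T @ U.T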
@validate_params(
|
||||
{"affinity": ["array-like", "sparse matrix"]},
|
||||
prefer_skip_nested_validation=False,
|
||||
)
|
||||
def spectral_clustering(
|
||||
affinity,
|
||||
*,
|
||||
n_clusters=8,
|
||||
n_components=None,
|
||||
eigen_solver=None,
|
||||
random_state=None,
|
||||
n_init=10,
|
||||
eigen_tol="auto",
|
||||
assign_labels="kmeans",
|
||||
verbose=False,
|
||||
):
|
||||
"""Apply clustering to a projection of the normalized Laplacian.
|
||||
|
||||
In practice Spectral Clustering is very useful when the structure of
|
||||
the individual clusters is highly non-convex or more generally when
|
||||
a measure of the center and spread of the cluster is not a suitable
|
||||
description of the complete cluster. For instance, when clusters are
|
||||
nested circles on the 2D plane.
|
||||
|
||||
If affinity is the adjacency matrix of a graph, this method can be
|
||||
used to find normalized graph cuts [1]_, [2]_.
|
||||
|
||||
Read more in the :ref:`User Guide <spectral_clustering>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
affinity : {array-like, sparse matrix} of shape (n_samples, n_samples)
|
||||
The affinity matrix describing the relationship of the samples to
|
||||
embed. **Must be symmetric**.
|
||||
|
||||
Possible examples:
|
||||
- adjacency matrix of a graph,
|
||||
- heat kernel of the pairwise distance matrix of the samples,
|
||||
- symmetric k-nearest neighbours connectivity matrix of the samples.
|
||||
|
||||
n_clusters : int, default=None
|
||||
Number of clusters to extract.
|
||||
|
||||
n_components : int, default=n_clusters
|
||||
Number of eigenvectors to use for the spectral embedding.
|
||||
|
||||
eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
|
||||
The eigenvalue decomposition method. If None then ``'arpack'`` is used.
|
||||
See [4]_ for more details regarding ``'lobpcg'``.
|
||||
Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional
|
||||
Algebraic MultiGrid preconditioning and requires pyamg to be installed.
|
||||
It can be faster on very large sparse problems [6]_ and [7]_.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
A pseudo random number generator used for the initialization
|
||||
of the lobpcg eigenvectors decomposition when `eigen_solver ==
|
||||
'amg'`, and for the K-Means initialization. Use an int to make
|
||||
the results deterministic across calls (See
|
||||
:term:`Glossary <random_state>`).
|
||||
|
||||
.. note::
|
||||
When using `eigen_solver == 'amg'`,
|
||||
it is necessary to also fix the global numpy seed with
|
||||
`np.random.seed(int)` to get deterministic results. See
|
||||
https://github.com/pyamg/pyamg/issues/139 for further
|
||||
information.
|
||||
|
||||
n_init : int, default=10
|
||||
Number of time the k-means algorithm will be run with different
|
||||
centroid seeds. The final results will be the best output of n_init
|
||||
consecutive runs in terms of inertia. Only used if
|
||||
        ``assign_labels='kmeans'``.

    eigen_tol : float, default="auto"
        Stopping criterion for the eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`,
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
        The strategy to use to assign labels in the embedding space. There
        are three ways to assign labels after the Laplacian embedding.
        k-means can be applied and is a popular choice, but it can also be
        sensitive to initialization. Discretization is another approach which
        is less sensitive to random initialization [3]_.
        The cluster_qr method [5]_ directly extracts clusters from the
        eigenvectors in spectral clustering. In contrast to k-means and
        discretization, cluster_qr has no tuning parameters and is not an
        iterative method, yet may outperform k-means and discretization in
        terms of both quality and speed. For a detailed comparison of
        clustering strategies, refer to the following example:
        :ref:`sphx_glr_auto_examples_cluster_plot_coin_segmentation.py`.

        .. versionchanged:: 1.1
           Added new labeling method 'cluster_qr'.

    verbose : bool, default=False
        Verbosity mode.

        .. versionadded:: 0.24

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    Notes
    -----
    The graph should contain only one connected component; otherwise
    the results make little sense.

    This algorithm solves the normalized cut for `k=2`: it is a
    normalized spectral clustering.

    References
    ----------

    .. [1] :doi:`Normalized cuts and image segmentation, 2000
           Jianbo Shi, Jitendra Malik
           <10.1109/34.868688>`

    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
           Ulrike von Luxburg
           <10.1007/s11222-007-9033-z>`

    .. [3] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
           A. V. Knyazev
           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
           <10.1137/S1064827500366124>`

    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
           Anil Damle, Victor Minden, Lexing Ying
           <10.1093/imaiai/iay008>`

    .. [6] :doi:`Multiscale Spectral Image Segmentation Multiscale preconditioning
           for computing eigenvalues of graph Laplacians in image segmentation, 2006
           Andrew Knyazev
           <10.13140/RG.2.2.35280.02565>`

    .. [7] :doi:`Preconditioned spectral clustering for stochastic block partition
           streaming graph challenge (Preliminary version at arXiv.), 2017
           David Zhuzhunashvili, Andrew Knyazev
           <10.1109/HPEC.2017.8091045>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics.pairwise import pairwise_kernels
    >>> from sklearn.cluster import spectral_clustering
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> affinity = pairwise_kernels(X, metric='rbf')
    >>> spectral_clustering(
    ...     affinity=affinity, n_clusters=2, assign_labels="discretize", random_state=0
    ... )
    array([1, 1, 1, 0, 0, 0])
    """
    clusterer = SpectralClustering(
        n_clusters=n_clusters,
        n_components=n_components,
        eigen_solver=eigen_solver,
        random_state=random_state,
        n_init=n_init,
        affinity="precomputed",
        eigen_tol=eigen_tol,
        assign_labels=assign_labels,
        verbose=verbose,
    ).fit(affinity)

    return clusterer.labels_

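# Illustrative aside, not part of the original module: the `assign_labels`
# documentation above notes that `cluster_qr` is deterministic and
# non-iterative, so unlike "kmeans" or "discretize" it needs no random_state.
# A minimal sketch on the same toy data as the doctest; names with a `_demo`
# suffix are assumptions for this example only.
import numpy as np
from sklearn.cluster import spectral_clustering
from sklearn.metrics.pairwise import pairwise_kernels

X_demo = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]])
affinity_demo = pairwise_kernels(X_demo, metric="rbf")
labels_demo = spectral_clustering(
    affinity_demo, n_clusters=2, assign_labels="cluster_qr"
)
# Expected grouping: the first three points vs. the last three.
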
class SpectralClustering(ClusterMixin, BaseEstimator):
    """Apply clustering to a projection of the normalized Laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex, or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster, such as when clusters are
    nested circles on the 2D plane.

    If the affinity matrix is the adjacency matrix of a graph, this method
    can be used to find normalized graph cuts [1]_, [2]_.

    When calling ``fit``, an affinity matrix is constructed using either
    a kernel function such as the Gaussian (aka RBF) kernel with Euclidean
    distance ``d(X, X)``::

            np.exp(-gamma * d(X,X) ** 2)

    or a k-nearest neighbors connectivity matrix.

    Alternatively, a user-provided affinity matrix can be specified by
    setting ``affinity='precomputed'``.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    ----------
    n_clusters : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used. See [4]_ for more details regarding `'lobpcg'`.

    n_components : int, default=None
        Number of eigenvectors to use for the spectral embedding. If None,
        defaults to `n_clusters`.

    random_state : int, RandomState instance, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    n_init : int, default=10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia. Only used if
        ``assign_labels='kmeans'``.

    gamma : float, default=1.0
        Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
        Ignored for ``affinity='nearest_neighbors'``, ``affinity='precomputed'``
        or ``affinity='precomputed_nearest_neighbors'``.

    affinity : str or callable, default='rbf'
        How to construct the affinity matrix.

        - 'nearest_neighbors': construct the affinity matrix by computing a
          graph of nearest neighbors.
        - 'rbf': construct the affinity matrix using a radial basis function
          (RBF) kernel.
        - 'precomputed': interpret ``X`` as a precomputed affinity matrix,
          where larger values indicate greater similarity between instances.
        - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph
          of precomputed distances, and construct a binary affinity matrix
          from the ``n_neighbors`` nearest neighbors of each instance.
        - one of the kernels supported by
          :func:`~sklearn.metrics.pairwise.pairwise_kernels`.

        Only kernels that produce similarity scores (non-negative values that
        increase with similarity) should be used. This property is not checked
        by the clustering algorithm.

    n_neighbors : int, default=10
        Number of neighbors to use when constructing the affinity matrix using
        the nearest neighbors method. Ignored for ``affinity='rbf'``.

    eigen_tol : float, default="auto"
        Stopping criterion for the eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`,
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
        The strategy for assigning labels in the embedding space. There are
        three ways to assign labels after the Laplacian embedding. k-means is
        a popular choice, but it can be sensitive to initialization.
        Discretization is another approach which is less sensitive to random
        initialization [3]_.
        The cluster_qr method [5]_ directly extracts clusters from the
        eigenvectors in spectral clustering. In contrast to k-means and
        discretization, cluster_qr has no tuning parameters and runs no
        iterations, yet may outperform k-means and discretization in terms
        of both quality and speed.

        .. versionchanged:: 1.1
           Added new labeling method 'cluster_qr'.

    degree : float, default=3
        Degree of the polynomial kernel. Ignored by other kernels.

    coef0 : float, default=1
        Zero coefficient for polynomial and sigmoid kernels.
        Ignored by other kernels.

    kernel_params : dict of str to any, default=None
        Parameters (keyword arguments) and values for a kernel passed as a
        callable object. Ignored by other kernels.

    n_jobs : int, default=None
        The number of parallel jobs to run when `affinity='nearest_neighbors'`
        or `affinity='precomputed_nearest_neighbors'`. The neighbors search
        will be done in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        Verbosity mode.

        .. versionadded:: 0.24

    Attributes
    ----------
    affinity_matrix_ : array-like of shape (n_samples, n_samples)
        Affinity matrix used for clustering. Available only after calling
        ``fit``.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.cluster.KMeans : K-Means clustering.
    sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of
        Applications with Noise.

    Notes
    -----
    A distance matrix for which 0 indicates identical elements and high values
    indicate very dissimilar elements can be transformed into an affinity /
    similarity matrix that is well-suited for the algorithm by
    applying the Gaussian (aka RBF, heat) kernel::

        np.exp(- dist_matrix ** 2 / (2. * delta ** 2))

    where ``delta`` is a free parameter representing the width of the Gaussian
    kernel.

    An alternative is to take a symmetric version of the k-nearest neighbors
    connectivity matrix of the points.

    If the pyamg package is installed, it is used: this greatly
    speeds up computation.

    References
    ----------
    .. [1] :doi:`Normalized cuts and image segmentation, 2000
           Jianbo Shi, Jitendra Malik
           <10.1109/34.868688>`

    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
           Ulrike von Luxburg
           <10.1007/s11222-007-9033-z>`

    .. [3] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
           A. V. Knyazev
           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
           <10.1137/S1064827500366124>`

    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
           Anil Damle, Victor Minden, Lexing Ying
           <10.1093/imaiai/iay008>`

    Examples
    --------
    >>> from sklearn.cluster import SpectralClustering
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = SpectralClustering(n_clusters=2,
    ...     assign_labels='discretize',
    ...     random_state=0).fit(X)
    >>> clustering.labels_
    array([1, 1, 1, 0, 0, 0])
    >>> clustering
    SpectralClustering(assign_labels='discretize', n_clusters=2,
        random_state=0)

    For a comparison of Spectral clustering with other clustering algorithms,
    see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`.
    """

    _parameter_constraints: dict = {
        "n_clusters": [Interval(Integral, 1, None, closed="left")],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "n_components": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "gamma": [Interval(Real, 0, None, closed="left")],
        "affinity": [
            callable,
            StrOptions(
                set(KERNEL_PARAMS)
                | {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"}
            ),
        ],
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "eigen_tol": [
            Interval(Real, 0.0, None, closed="left"),
            StrOptions({"auto"}),
        ],
        "assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})],
        "degree": [Interval(Real, 0, None, closed="left")],
        "coef0": [Interval(Real, None, None, closed="neither")],
        "kernel_params": [dict, None],
        "n_jobs": [Integral, None],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        n_clusters=8,
        *,
        eigen_solver=None,
        n_components=None,
        random_state=None,
        n_init=10,
        gamma=1.0,
        affinity="rbf",
        n_neighbors=10,
        eigen_tol="auto",
        assign_labels="kmeans",
        degree=3,
        coef0=1,
        kernel_params=None,
        n_jobs=None,
        verbose=False,
    ):
        self.n_clusters = n_clusters
        self.eigen_solver = eigen_solver
        self.n_components = n_components
        self.random_state = random_state
        self.n_init = n_init
        self.gamma = gamma
        self.affinity = affinity
        self.n_neighbors = n_neighbors
        self.eigen_tol = eigen_tol
        self.assign_labels = assign_labels
        self.degree = degree
        self.coef0 = coef0
        self.kernel_params = kernel_params
        self.n_jobs = n_jobs
        self.verbose = verbose

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Perform spectral clustering from features, or affinity matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, similarities / affinities between
            instances if ``affinity='precomputed'``, or distances between
            instances if ``affinity='precomputed_nearest_neighbors'``. If a
            sparse matrix is provided in a format other than ``csr_matrix``,
            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
            sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            A fitted instance of the estimator.
        """
        X = validate_data(
            self,
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=np.float64,
            ensure_min_samples=2,
        )
        allow_squared = self.affinity in [
            "precomputed",
            "precomputed_nearest_neighbors",
        ]
        if X.shape[0] == X.shape[1] and not allow_squared:
            warnings.warn(
                "The spectral clustering API has changed. ``fit`` "
                "now constructs an affinity matrix from data. To use "
                "a custom affinity matrix, "
                "set ``affinity=precomputed``."
            )

        if self.affinity == "nearest_neighbors":
            connectivity = kneighbors_graph(
                X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs
            )
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
        elif self.affinity == "precomputed_nearest_neighbors":
            estimator = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
            ).fit(X)
            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
        elif self.affinity == "precomputed":
            self.affinity_matrix_ = X
        else:
            params = self.kernel_params
            if params is None:
                params = {}
            if not callable(self.affinity):
                params["gamma"] = self.gamma
                params["degree"] = self.degree
                params["coef0"] = self.coef0
            self.affinity_matrix_ = pairwise_kernels(
                X, metric=self.affinity, filter_params=True, **params
            )

        random_state = check_random_state(self.random_state)
        n_components = (
            self.n_clusters if self.n_components is None else self.n_components
        )
        # We now obtain the real valued solution matrix to the
        # relaxed Ncut problem, solving the eigenvalue problem
        # L_sym x = lambda x and recovering u = D^-1/2 x.
        # The first eigenvector is constant only for fully connected graphs
        # and should be kept for spectral clustering (drop_first = False)
        # See spectral_embedding documentation.
        maps = _spectral_embedding(
            self.affinity_matrix_,
            n_components=n_components,
            eigen_solver=self.eigen_solver,
            random_state=random_state,
            eigen_tol=self.eigen_tol,
            drop_first=False,
        )
        if self.verbose:
            print(f"Computing label assignment using {self.assign_labels}")

        if self.assign_labels == "kmeans":
            _, self.labels_, _ = k_means(
                maps,
                self.n_clusters,
                random_state=random_state,
                n_init=self.n_init,
                verbose=self.verbose,
            )
        elif self.assign_labels == "cluster_qr":
            self.labels_ = cluster_qr(maps)
        else:
            self.labels_ = discretize(maps, random_state=random_state)

        return self

    def fit_predict(self, X, y=None):
        """Perform spectral clustering on `X` and return cluster labels.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, similarities / affinities between
            instances if ``affinity='precomputed'``, or distances between
            instances if ``affinity='precomputed_nearest_neighbors'``. If a
            sparse matrix is provided in a format other than ``csr_matrix``,
            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
            sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        return super().fit_predict(X, y)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        tags.input_tags.pairwise = self.affinity in [
            "precomputed",
            "precomputed_nearest_neighbors",
        ]
        return tags
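A hedged usage sketch for the class above (not part of the commit): the docstring's claim about highly non-convex clusters can be checked on two nested circles with the nearest-neighbors affinity. All parameter values below are illustrative.

import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.datasets import make_circles
from sklearn.metrics import adjusted_rand_score

# Two concentric rings: a centroid-based method struggles here.
X, y = make_circles(n_samples=200, factor=0.3, noise=0.05, random_state=0)
model = SpectralClustering(
    n_clusters=2, affinity="nearest_neighbors", n_neighbors=10, random_state=0
)
labels = model.fit_predict(X)
# Near-perfect recovery of the two rings is typical on this data.
print(adjusted_rand_score(y, labels))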
@@ -0,0 +1,26 @@
cluster_extension_metadata = {
  '_dbscan_inner':
    {'sources': [cython_gen_cpp.process('_dbscan_inner.pyx')]},
  '_hierarchical_fast':
    {'sources': [cython_gen_cpp.process('_hierarchical_fast.pyx'), metrics_cython_tree]},
  '_k_means_common':
    {'sources': [cython_gen.process('_k_means_common.pyx')], 'dependencies': [openmp_dep]},
  '_k_means_lloyd':
    {'sources': [cython_gen.process('_k_means_lloyd.pyx')], 'dependencies': [openmp_dep]},
  '_k_means_elkan':
    {'sources': [cython_gen.process('_k_means_elkan.pyx')], 'dependencies': [openmp_dep]},
  '_k_means_minibatch':
    {'sources': [cython_gen.process('_k_means_minibatch.pyx')], 'dependencies': [openmp_dep]},
}

foreach ext_name, ext_dict : cluster_extension_metadata
  py.extension_module(
    ext_name,
    [ext_dict.get('sources'), utils_cython_tree],
    dependencies: [np_dep] + ext_dict.get('dependencies', []),
    subdir: 'sklearn/cluster',
    install: true
  )
endforeach

subdir('_hdbscan')
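A minimal post-build smoke check, assuming an in-place build (this script is an aside, not part of the build files): each Cython extension declared in the meson metadata above should be importable.

import importlib

for mod in (
    "_dbscan_inner",
    "_hierarchical_fast",
    "_k_means_common",
    "_k_means_lloyd",
    "_k_means_elkan",
    "_k_means_minibatch",
):
    # Raises ImportError if the extension failed to compile or install.
    importlib.import_module(f"sklearn.cluster.{mod}")
    print(f"ok: sklearn.cluster.{mod}")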
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,37 @@
"""Common utilities for testing clustering."""

import numpy as np

###############################################################################
# Generate sample data


def generate_clustered_data(
    seed=0, n_clusters=3, n_features=2, n_samples_per_cluster=20, std=0.4
):
    prng = np.random.RandomState(seed)

    # the data is deliberately shifted away from zero to check the clustering
    # algorithm's robustness with regard to non-centered data
    means = (
        np.array(
            [
                [1, 1, 1, 0],
                [-1, -1, 0, 1],
                [1, -1, 1, 1],
                [-1, 1, 1, 0],
            ]
        )
        + 10
    )

    X = np.empty((0, n_features))
    for i in range(n_clusters):
        X = np.r_[
            X,
            means[i][:n_features] + std * prng.randn(n_samples_per_cluster, n_features),
        ]
    return X
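A short usage sketch for the helper above (values are the helper's own defaults): three 2-D clusters of 20 points each, shifted away from the origin.

from sklearn.cluster.tests.common import generate_clustered_data

X = generate_clustered_data(
    seed=0, n_clusters=3, n_features=2, n_samples_per_cluster=20, std=0.4
)
assert X.shape == (60, 2)
assert X.mean() > 5  # the +10 shift keeps the data away from the origin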
@@ -0,0 +1,321 @@
"""
Testing for Clustering methods

"""

import warnings

import numpy as np
import pytest

from sklearn.cluster import AffinityPropagation, affinity_propagation
from sklearn.cluster._affinity_propagation import _equal_similarities_and_preferences
from sklearn.datasets import make_blobs
from sklearn.exceptions import ConvergenceWarning, NotFittedError
from sklearn.metrics import euclidean_distances
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS

n_clusters = 3
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(
    n_samples=60,
    n_features=2,
    centers=centers,
    cluster_std=0.4,
    shuffle=True,
    random_state=0,
)

# TODO: AffinityPropagation must preserve dtype for its fitted attributes
# and a test must be created accordingly for this new behavior.
# For more details, see: https://github.com/scikit-learn/scikit-learn/issues/11000


def test_affinity_propagation(global_random_seed, global_dtype):
    """Test consistency of affinity propagation."""
    S = -euclidean_distances(X.astype(global_dtype, copy=False), squared=True)
    preference = np.median(S) * 10
    cluster_centers_indices, labels = affinity_propagation(
        S, preference=preference, random_state=global_random_seed
    )

    n_clusters_ = len(cluster_centers_indices)

    assert n_clusters == n_clusters_


def test_affinity_propagation_precomputed():
    """Check equality of precomputed affinity matrix to internally computed affinity
    matrix.
    """
    S = -euclidean_distances(X, squared=True)
    preference = np.median(S) * 10
    af = AffinityPropagation(
        preference=preference, affinity="precomputed", random_state=28
    )
    labels_precomputed = af.fit(S).labels_

    af = AffinityPropagation(preference=preference, verbose=True, random_state=37)
    labels = af.fit(X).labels_

    assert_array_equal(labels, labels_precomputed)

    cluster_centers_indices = af.cluster_centers_indices_

    n_clusters_ = len(cluster_centers_indices)
    assert np.unique(labels).size == n_clusters_
    assert n_clusters == n_clusters_


def test_affinity_propagation_no_copy():
    """Check behaviour of not copying the input data."""
    S = -euclidean_distances(X, squared=True)
    S_original = S.copy()
    preference = np.median(S) * 10
    assert not np.allclose(S.diagonal(), preference)

    # with copy=True S should not be modified
    affinity_propagation(S, preference=preference, copy=True, random_state=0)
    assert_allclose(S, S_original)
    assert not np.allclose(S.diagonal(), preference)
    assert_allclose(S.diagonal(), np.zeros(S.shape[0]))

    # with copy=False S will be modified inplace
    affinity_propagation(S, preference=preference, copy=False, random_state=0)
    assert_allclose(S.diagonal(), preference)

    # test that copy=True and copy=False lead to the same result
    S = S_original.copy()
    af = AffinityPropagation(preference=preference, verbose=True, random_state=0)

    labels = af.fit(X).labels_
    _, labels_no_copy = affinity_propagation(
        S, preference=preference, copy=False, random_state=74
    )
    assert_array_equal(labels, labels_no_copy)


def test_affinity_propagation_affinity_shape():
    """Check the shape of the affinity matrix when using `affinity_propagation`."""
    S = -euclidean_distances(X, squared=True)
    err_msg = "The matrix of similarities must be a square array"
    with pytest.raises(ValueError, match=err_msg):
        affinity_propagation(S[:, :-1])


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_affinity_propagation_precomputed_with_sparse_input(csr_container):
    err_msg = "Sparse data was passed for X, but dense data is required"
    with pytest.raises(TypeError, match=err_msg):
        AffinityPropagation(affinity="precomputed").fit(csr_container((3, 3)))


def test_affinity_propagation_predict(global_random_seed, global_dtype):
    # Test AffinityPropagation.predict
    af = AffinityPropagation(affinity="euclidean", random_state=global_random_seed)
    X_ = X.astype(global_dtype, copy=False)
    labels = af.fit_predict(X_)
    labels2 = af.predict(X_)
    assert_array_equal(labels, labels2)


def test_affinity_propagation_predict_error():
    # Test exception in AffinityPropagation.predict
    # Not fitted.
    af = AffinityPropagation(affinity="euclidean")
    with pytest.raises(NotFittedError):
        af.predict(X)

    # Predict not supported when affinity="precomputed".
    S = np.dot(X, X.T)
    af = AffinityPropagation(affinity="precomputed", random_state=57)
    af.fit(S)
    with pytest.raises(ValueError, match="expecting 60 features as input"):
        af.predict(X)


def test_affinity_propagation_fit_non_convergence(global_dtype):
    # In case of non-convergence of affinity_propagation(), the cluster
    # centers should be an empty array and training samples should be labelled
    # as noise (-1)
    X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)

    # Force non-convergence by allowing only a single iteration
    af = AffinityPropagation(preference=-10, max_iter=1, random_state=82)

    with pytest.warns(ConvergenceWarning):
        af.fit(X)
    assert_allclose(np.empty((0, 2)), af.cluster_centers_)
    assert_array_equal(np.array([-1, -1, -1]), af.labels_)


def test_affinity_propagation_equal_mutual_similarities(global_dtype):
    X = np.array([[-1, 1], [1, -1]], dtype=global_dtype)
    S = -euclidean_distances(X, squared=True)

    # setting preference > similarity
    with pytest.warns(UserWarning, match="mutually equal"):
        cluster_center_indices, labels = affinity_propagation(S, preference=0)

    # expect every sample to become an exemplar
    assert_array_equal([0, 1], cluster_center_indices)
    assert_array_equal([0, 1], labels)

    # setting preference < similarity
    with pytest.warns(UserWarning, match="mutually equal"):
        cluster_center_indices, labels = affinity_propagation(S, preference=-10)

    # expect one cluster, with arbitrary (first) sample as exemplar
    assert_array_equal([0], cluster_center_indices)
    assert_array_equal([0, 0], labels)

    # setting different preferences
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        cluster_center_indices, labels = affinity_propagation(
            S, preference=[-20, -10], random_state=37
        )

    # expect one cluster, with highest-preference sample as exemplar
    assert_array_equal([1], cluster_center_indices)
    assert_array_equal([0, 0], labels)


def test_affinity_propagation_predict_non_convergence(global_dtype):
    # In case of non-convergence of affinity_propagation(), the cluster
    # centers should be an empty array
    X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)

    # Force non-convergence by allowing only a single iteration
    with pytest.warns(ConvergenceWarning):
        af = AffinityPropagation(preference=-10, max_iter=1, random_state=75).fit(X)

    # At prediction time, consider new samples as noise since there are no
    # clusters
    to_predict = np.array([[2, 2], [3, 3], [4, 4]])
    with pytest.warns(ConvergenceWarning):
        y = af.predict(to_predict)
    assert_array_equal(np.array([-1, -1, -1]), y)


def test_affinity_propagation_non_convergence_regressiontest(global_dtype):
    X = np.array(
        [[1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 1]], dtype=global_dtype
    )
    af = AffinityPropagation(affinity="euclidean", max_iter=2, random_state=34)
    msg = (
        "Affinity propagation did not converge, this model may return degenerate"
        " cluster centers and labels."
    )
    with pytest.warns(ConvergenceWarning, match=msg):
        af.fit(X)

    assert_array_equal(np.array([0, 0, 0]), af.labels_)


def test_equal_similarities_and_preferences(global_dtype):
    # Unequal distances
    X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)
    S = -euclidean_distances(X, squared=True)

    assert not _equal_similarities_and_preferences(S, np.array(0))
    assert not _equal_similarities_and_preferences(S, np.array([0, 0]))
    assert not _equal_similarities_and_preferences(S, np.array([0, 1]))

    # Equal distances
    X = np.array([[0, 0], [1, 1]], dtype=global_dtype)
    S = -euclidean_distances(X, squared=True)

    # Different preferences
    assert not _equal_similarities_and_preferences(S, np.array([0, 1]))

    # Same preferences
    assert _equal_similarities_and_preferences(S, np.array([0, 0]))
    assert _equal_similarities_and_preferences(S, np.array(0))


def test_affinity_propagation_random_state():
    """Check that different random states lead to different initialisations
    by looking at the center locations after two iterations.
    """
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=300, centers=centers, cluster_std=0.5, random_state=0
    )
    # random_state = 0
    ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=0)
    ap.fit(X)
    centers0 = ap.cluster_centers_

    # random_state = 76
    ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=76)
    ap.fit(X)
    centers76 = ap.cluster_centers_
    # check that the centers have not yet converged to the same solution
    assert np.mean((centers0 - centers76) ** 2) > 1


@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array])
def test_affinity_propagation_convergence_warning_dense_sparse(container, global_dtype):
    """
    Check that having sparse or dense `centers` format should not
    influence the convergence.
    Non-regression test for gh-13334.
    """
    centers = container(np.zeros((1, 10)))
    rng = np.random.RandomState(42)
    X = rng.rand(40, 10).astype(global_dtype, copy=False)
    y = (4 * rng.rand(40)).astype(int)
    ap = AffinityPropagation(random_state=46)
    ap.fit(X, y)
    ap.cluster_centers_ = centers
    with warnings.catch_warnings():
        warnings.simplefilter("error", ConvergenceWarning)
        assert_array_equal(ap.predict(X), np.zeros(X.shape[0], dtype=int))


# FIXME: this test is broken with different random states, needs to be revisited
def test_correct_clusters(global_dtype):
    # Test to fix incorrect clusters due to dtype change
    # (non-regression test for issue #10832)
    X = np.array(
        [[1, 0, 0, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]], dtype=global_dtype
    )
    afp = AffinityPropagation(preference=1, affinity="precomputed", random_state=0).fit(
        X
    )
    expected = np.array([0, 1, 1, 2])
    assert_array_equal(afp.labels_, expected)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input_for_predict(csr_container):
    # Test to make sure sparse inputs are accepted for predict
    # (non-regression test for issue #20049)
    af = AffinityPropagation(affinity="euclidean", random_state=42)
    af.fit(X)
    labels = af.predict(csr_container((2, 2)))
    assert_array_equal(labels, (2, 2))


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input_for_fit_predict(csr_container):
    # Test to make sure sparse inputs are accepted for fit_predict
    # (non-regression test for issue #20049)
    af = AffinityPropagation(affinity="euclidean", random_state=42)
    rng = np.random.RandomState(42)
    X = csr_container(rng.randint(0, 2, size=(5, 5)))
    labels = af.fit_predict(X)
    assert_array_equal(labels, (0, 1, 1, 2, 3))


def test_affinity_propagation_equal_points():
    """Make sure we do not assign multiple clusters to equal points.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/pull/20043
    """
    X = np.zeros((8, 1))
    af = AffinityPropagation(affinity="euclidean", damping=0.5, random_state=42).fit(X)
    assert np.all(af.labels_ == 0)
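A hedged aside on the `preference` knob these tests lean on: more negative preferences make it harder for a point to become an exemplar, so fewer clusters typically emerge. The data and values below are illustrative, not a guarantee of exact cluster counts.

import numpy as np
from sklearn.cluster import affinity_propagation
from sklearn.datasets import make_blobs
from sklearn.metrics import euclidean_distances

X_demo, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.4, random_state=0)
S = -euclidean_distances(X_demo, squared=True)
low, _ = affinity_propagation(S, preference=np.min(S), random_state=0)
med, _ = affinity_propagation(S, preference=np.median(S), random_state=0)
# Typically len(low) <= len(med): a lower preference yields fewer exemplars.
print(len(low), len(med))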
@@ -0,0 +1,265 @@
"""Testing for Spectral Biclustering methods"""

import numpy as np
import pytest
from scipy.sparse import issparse

from sklearn.base import BaseEstimator, BiclusterMixin, clone
from sklearn.cluster import SpectralBiclustering, SpectralCoclustering
from sklearn.cluster._bicluster import (
    _bistochastic_normalize,
    _log_normalize,
    _scale_normalize,
)
from sklearn.datasets import make_biclusters, make_checkerboard
from sklearn.metrics import consensus_score, v_measure_score
from sklearn.model_selection import ParameterGrid
from sklearn.utils._testing import (
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS


class MockBiclustering(BiclusterMixin, BaseEstimator):
    # Mock object for testing get_submatrix.
    def __init__(self):
        pass

    def get_indices(self, i):
        # Overridden to reproduce old get_submatrix test.
        return (
            np.where([True, True, False, False, True])[0],
            np.where([False, False, True, True])[0],
        )


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_get_submatrix(csr_container):
    data = np.arange(20).reshape(5, 4)
    model = MockBiclustering()

    for X in (data, csr_container(data), data.tolist()):
        submatrix = model.get_submatrix(0, X)
        if issparse(submatrix):
            submatrix = submatrix.toarray()
        assert_array_equal(submatrix, [[2, 3], [6, 7], [18, 19]])
        submatrix[:] = -1
        if issparse(X):
            X = X.toarray()
        assert np.all(X != -1)


def _test_shape_indices(model):
    # Test get_shape and get_indices on a fitted model.
    for i in range(model.n_clusters):
        m, n = model.get_shape(i)
        i_ind, j_ind = model.get_indices(i)
        assert len(i_ind) == m
        assert len(j_ind) == n


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_coclustering(global_random_seed, csr_container):
    # Test Dhillon's Spectral CoClustering on a simple problem.
    param_grid = {
        "svd_method": ["randomized", "arpack"],
        "n_svd_vecs": [None, 20],
        "mini_batch": [False, True],
        "init": ["k-means++"],
        "n_init": [10],
    }
    S, rows, cols = make_biclusters(
        (30, 30), 3, noise=0.1, random_state=global_random_seed
    )
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values
    for mat in (S, csr_container(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(
                n_clusters=3, random_state=global_random_seed, **kwargs
            )
            model.fit(mat)

            assert model.rows_.shape == (3, 30)
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_biclustering(global_random_seed, csr_container):
    # Test Kluger methods on a checkerboard dataset.
    S, rows, cols = make_checkerboard(
        (30, 30), 3, noise=0.5, random_state=global_random_seed
    )

    non_default_params = {
        "method": ["scale", "log"],
        "svd_method": ["arpack"],
        "n_svd_vecs": [20],
        "mini_batch": [True],
    }

    for mat in (S, csr_container(S)):
        for param_name, param_values in non_default_params.items():
            for param_value in param_values:
                model = SpectralBiclustering(
                    n_clusters=3,
                    n_init=3,
                    init="k-means++",
                    random_state=global_random_seed,
                )
                model.set_params(**dict([(param_name, param_value)]))

                if issparse(mat) and model.get_params().get("method") == "log":
                    # cannot take log of sparse matrix
                    with pytest.raises(ValueError):
                        model.fit(mat)
                    continue
                else:
                    model.fit(mat)

                assert model.rows_.shape == (9, 30)
                assert model.columns_.shape == (9, 30)
                assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30))
                assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30))
                assert consensus_score(model.biclusters_, (rows, cols)) == 1

                _test_shape_indices(model)


def _do_scale_test(scaled):
    """Check that rows sum to one constant, and columns to another."""
    row_sum = scaled.sum(axis=1)
    col_sum = scaled.sum(axis=0)
    if issparse(scaled):
        row_sum = np.asarray(row_sum).squeeze()
        col_sum = np.asarray(col_sum).squeeze()
    assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100), decimal=1)
    assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100), decimal=1)


def _do_bistochastic_test(scaled):
    """Check that rows and columns sum to the same constant."""
    _do_scale_test(scaled)
    assert_almost_equal(scaled.sum(axis=0).mean(), scaled.sum(axis=1).mean(), decimal=1)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_scale_normalize(global_random_seed, csr_container):
    generator = np.random.RandomState(global_random_seed)
    X = generator.rand(100, 100)
    for mat in (X, csr_container(X)):
        scaled, _, _ = _scale_normalize(mat)
        _do_scale_test(scaled)
        if issparse(mat):
            assert issparse(scaled)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_bistochastic_normalize(global_random_seed, csr_container):
    generator = np.random.RandomState(global_random_seed)
    X = generator.rand(100, 100)
    for mat in (X, csr_container(X)):
        scaled = _bistochastic_normalize(mat)
        _do_bistochastic_test(scaled)
        if issparse(mat):
            assert issparse(scaled)


def test_log_normalize(global_random_seed):
    # adding any constant to a log-scaled matrix should make it
    # bistochastic
    generator = np.random.RandomState(global_random_seed)
    mat = generator.rand(100, 100)
    scaled = _log_normalize(mat) + 1
    _do_bistochastic_test(scaled)


def test_fit_best_piecewise(global_random_seed):
    model = SpectralBiclustering(random_state=global_random_seed)
    vectors = np.array([[0, 0, 0, 1, 1, 1], [2, 2, 2, 3, 3, 3], [0, 1, 2, 3, 4, 5]])
    best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)
    assert_array_equal(best, vectors[:2])


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_project_and_cluster(global_random_seed, csr_container):
    model = SpectralBiclustering(random_state=global_random_seed)
    data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]])
    vectors = np.array([[1, 0], [0, 1], [0, 0]])
    for mat in (data, csr_container(data)):
        labels = model._project_and_cluster(mat, vectors, n_clusters=2)
        assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0)


def test_perfect_checkerboard(global_random_seed):
    # XXX Previously failed on build bot (not reproducible)
    model = SpectralBiclustering(
        3, svd_method="arpack", random_state=global_random_seed
    )

    S, rows, cols = make_checkerboard(
        (30, 30), 3, noise=0, random_state=global_random_seed
    )
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard(
        (40, 30), 3, noise=0, random_state=global_random_seed
    )
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard(
        (30, 40), 3, noise=0, random_state=global_random_seed
    )
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1


@pytest.mark.parametrize(
    "params, type_err, err_msg",
    [
        (
            {"n_clusters": 6},
            ValueError,
            "n_clusters should be <= n_samples=5",
        ),
        (
            {"n_clusters": (3, 3, 3)},
            ValueError,
            "Incorrect parameter n_clusters",
        ),
        (
            {"n_clusters": (3, 6)},
            ValueError,
            "Incorrect parameter n_clusters",
        ),
        (
            {"n_components": 3, "n_best": 4},
            ValueError,
            "n_best=4 must be <= n_components=3",
        ),
    ],
)
def test_spectralbiclustering_parameter_validation(params, type_err, err_msg):
    """Check parameter validation in `SpectralBiclustering`."""
    data = np.arange(25).reshape((5, 5))
    model = SpectralBiclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        model.fit(data)


@pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering()))
def test_n_features_in_(est):
    X, _, _ = make_biclusters((3, 3), 3, random_state=0)

    est = clone(est)
    assert not hasattr(est, "n_features_in_")
    est.fit(X)
    assert est.n_features_in_ == 3
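A hedged end-to-end sketch of the pattern the biclustering tests check: plant biclusters, fit, and score the recovery with `consensus_score`. The sizes and noise level below are illustrative.

from sklearn.cluster import SpectralCoclustering
from sklearn.datasets import make_biclusters
from sklearn.metrics import consensus_score

S, rows, cols = make_biclusters((30, 30), 3, noise=0.1, random_state=0)
model = SpectralCoclustering(n_clusters=3, random_state=0).fit(S)
score = consensus_score(model.biclusters_, (rows, cols))
print(score)  # 1.0 means the planted structure was recovered exactly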
@@ -0,0 +1,242 @@
"""
Tests for the birch clustering algorithm.
"""

import numpy as np
import pytest

from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.datasets import make_blobs
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import pairwise_distances_argmin, v_measure_score
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS


def test_n_samples_leaves_roots(global_random_seed, global_dtype):
    # Sanity check for the number of samples in leaves and roots
    X, y = make_blobs(n_samples=10, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)
    brc = Birch()
    brc.fit(X)
    n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
    n_samples_leaves = sum(
        [sc.n_samples_ for leaf in brc._get_leaves() for sc in leaf.subclusters_]
    )
    assert n_samples_leaves == X.shape[0]
    assert n_samples_root == X.shape[0]


def test_partial_fit(global_random_seed, global_dtype):
    # Test that fit is equivalent to calling partial_fit multiple times
    X, y = make_blobs(n_samples=100, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)
    brc = Birch(n_clusters=3)
    brc.fit(X)
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_allclose(brc_partial.subcluster_centers_, brc.subcluster_centers_)

    # Test that same global labels are obtained after calling partial_fit
    # with None
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)


def test_birch_predict(global_random_seed, global_dtype):
    # Test that the predict method predicts the nearest centroid.
    rng = np.random.RandomState(global_random_seed)
    X = generate_clustered_data(n_clusters=3, n_features=3, n_samples_per_cluster=10)
    X = X.astype(global_dtype, copy=False)

    # n_clusters * n_samples_per_cluster
    shuffle_indices = np.arange(30)
    rng.shuffle(shuffle_indices)
    X_shuffle = X[shuffle_indices, :]
    brc = Birch(n_clusters=4, threshold=1.0)
    brc.fit(X_shuffle)

    # Birch must preserve inputs' dtype
    assert brc.subcluster_centers_.dtype == global_dtype

    assert_array_equal(brc.labels_, brc.predict(X_shuffle))
    centroids = brc.subcluster_centers_
    nearest_centroid = brc.subcluster_labels_[
        pairwise_distances_argmin(X_shuffle, centroids)
    ]
    assert_allclose(v_measure_score(nearest_centroid, brc.labels_), 1.0)


def test_n_clusters(global_random_seed, global_dtype):
    # Test that the n_clusters param works properly
    X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert len(brc1.subcluster_centers_) > 10
    assert len(np.unique(brc1.labels_)) == 10

    # Test that passing an AgglomerativeClustering instance as n_clusters
    # gives the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that a threshold producing too few subclusters raises a warning.
    brc4 = Birch(threshold=10000.0)
    with pytest.warns(ConvergenceWarning):
        brc4.fit(X)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_X(global_random_seed, global_dtype, csr_container):
    # Test that sparse and dense data give same results
    X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)
    brc = Birch(n_clusters=10)
    brc.fit(X)

    csr = csr_container(X)
    brc_sparse = Birch(n_clusters=10)
    brc_sparse.fit(csr)

    # Birch must preserve inputs' dtype
    assert brc_sparse.subcluster_centers_.dtype == global_dtype

    assert_array_equal(brc.labels_, brc_sparse.labels_)
    assert_allclose(brc.subcluster_centers_, brc_sparse.subcluster_centers_)


def test_partial_fit_second_call_error_checks():
    # second partial_fit calls will error when n_features is not consistent
    # with the first call
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.partial_fit(X, y)

    msg = "X has 1 features, but Birch is expecting 2 features"
    with pytest.raises(ValueError, match=msg):
        brc.partial_fit(X[:, [0]], y)


def check_branching_factor(node, branching_factor):
    subclusters = node.subclusters_
    assert branching_factor >= len(subclusters)
    for cluster in subclusters:
        if cluster.child_:
            check_branching_factor(cluster.child_, branching_factor)


def test_branching_factor(global_random_seed, global_dtype):
    # Test that nodes have at most branching_factor subclusters
    X, y = make_blobs(random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)
    branching_factor = 9

    # Purposefully set a low threshold to maximize the subclusters.
    brc = Birch(n_clusters=None, branching_factor=branching_factor, threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)
    brc = Birch(n_clusters=3, branching_factor=branching_factor, threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)


def check_threshold(birch_instance, threshold):
    """Use the leaf linked list for traversal"""
    current_leaf = birch_instance.dummy_leaf_.next_leaf_
    while current_leaf:
        subclusters = current_leaf.subclusters_
        for sc in subclusters:
            assert threshold >= sc.radius
        current_leaf = current_leaf.next_leaf_


def test_threshold(global_random_seed, global_dtype):
    # Test that the leaf subclusters have a radius no greater than the threshold
    X, y = make_blobs(n_samples=80, centers=4, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)
    brc = Birch(threshold=0.5, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 0.5)

    brc = Birch(threshold=5.0, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 5.0)


def test_birch_n_clusters_long_int():
    # Check that birch supports n_clusters with np.int64 dtype, for instance
    # coming from np.arange. #16484
    X, _ = make_blobs(random_state=0)
    n_clusters = np.int64(5)
    Birch(n_clusters=n_clusters).fit(X)


def test_feature_names_out():
    """Check `get_feature_names_out` for `Birch`."""
    X, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    brc = Birch(n_clusters=4)
    brc.fit(X)
    n_clusters = brc.subcluster_centers_.shape[0]

    names_out = brc.get_feature_names_out()
    assert_array_equal([f"birch{i}" for i in range(n_clusters)], names_out)


def test_transform_match_across_dtypes(global_random_seed):
    X, _ = make_blobs(n_samples=80, n_features=4, random_state=global_random_seed)
    brc = Birch(n_clusters=4, threshold=1.1)
    Y_64 = brc.fit_transform(X)
    Y_32 = brc.fit_transform(X.astype(np.float32))

    assert_allclose(Y_64, Y_32, atol=1e-6)


def test_subcluster_dtype(global_dtype):
    X = make_blobs(n_samples=80, n_features=4, random_state=0)[0].astype(
        global_dtype, copy=False
    )
    brc = Birch(n_clusters=4)
    assert brc.fit(X).subcluster_centers_.dtype == global_dtype


def test_both_subclusters_updated():
    """Check that both subclusters are updated when a node is split, even when
    there are duplicated data points. Non-regression test for #23269.
    """

    X = np.array(
        [
            [-2.6192791, -1.5053215],
            [-2.9993038, -1.6863596],
            [-2.3724914, -1.3438171],
            [-2.336792, -1.3417323],
            [-2.4089134, -1.3290224],
            [-2.3724914, -1.3438171],
            [-3.364009, -1.8846745],
            [-2.3724914, -1.3438171],
            [-2.617677, -1.5003285],
            [-2.2960556, -1.3260119],
            [-2.3724914, -1.3438171],
            [-2.5459878, -1.4533926],
            [-2.25979, -1.3003055],
            [-2.4089134, -1.3290224],
            [-2.3724914, -1.3438171],
            [-2.4089134, -1.3290224],
            [-2.5459878, -1.4533926],
            [-2.3724914, -1.3438171],
            [-2.9720619, -1.7058647],
            [-2.336792, -1.3417323],
            [-2.3724914, -1.3438171],
        ],
        dtype=np.float32,
    )

    # no error
    Birch(branching_factor=5, threshold=1e-5, n_clusters=None).fit(X)
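A hedged sketch of the streaming pattern `test_partial_fit` verifies: feed chunks through `partial_fit`, then call `partial_fit(None)` to run only the final global clustering step. Chunk and cluster counts below are illustrative.

import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, random_state=0)
brc = Birch(n_clusters=None)
for chunk in np.array_split(X, 4):
    brc.partial_fit(chunk)  # grows the CF-tree incrementally
brc.set_params(n_clusters=3)
brc.partial_fit(None)  # no new data: just applies the global clustering
print(len(np.unique(brc.subcluster_labels_)))  # 3 global labels expected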
@@ -0,0 +1,158 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.cluster import BisectingKMeans
|
||||
from sklearn.metrics import v_measure_score
|
||||
from sklearn.utils._testing import assert_allclose, assert_array_equal
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bisecting_strategy", ["biggest_inertia", "largest_cluster"])
|
||||
@pytest.mark.parametrize("init", ["k-means++", "random"])
|
||||
def test_three_clusters(bisecting_strategy, init):
|
||||
"""Tries to perform bisect k-means for three clusters to check
|
||||
if splitting data is performed correctly.
|
||||
"""
|
||||
X = np.array(
|
||||
[[1, 1], [10, 1], [3, 1], [10, 0], [2, 1], [10, 2], [10, 8], [10, 9], [10, 10]]
|
||||
)
|
||||
bisect_means = BisectingKMeans(
|
||||
n_clusters=3,
|
||||
random_state=0,
|
||||
bisecting_strategy=bisecting_strategy,
|
||||
init=init,
|
||||
)
|
||||
bisect_means.fit(X)
|
||||
|
||||
expected_centers = [[2, 1], [10, 1], [10, 9]]
|
||||
expected_labels = [0, 1, 0, 1, 0, 1, 2, 2, 2]
|
||||
|
||||
assert_allclose(
|
||||
sorted(expected_centers), sorted(bisect_means.cluster_centers_.tolist())
|
||||
)
|
||||
assert_allclose(v_measure_score(expected_labels, bisect_means.labels_), 1.0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_sparse(csr_container):
|
||||
"""Test Bisecting K-Means with sparse data.
|
||||
|
||||
Checks if labels and centers are the same between dense and sparse.
|
||||
"""
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
X = rng.rand(20, 2)
|
||||
X[X < 0.8] = 0
|
||||
X_csr = csr_container(X)
|
||||
|
||||
bisect_means = BisectingKMeans(n_clusters=3, random_state=0)
|
||||
|
||||
bisect_means.fit(X_csr)
|
||||
sparse_centers = bisect_means.cluster_centers_
|
||||
|
||||
bisect_means.fit(X)
|
||||
normal_centers = bisect_means.cluster_centers_
|
||||
|
||||
# Check if results is the same for dense and sparse data
|
||||
assert_allclose(normal_centers, sparse_centers, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_clusters", [4, 5])
|
||||
def test_n_clusters(n_clusters):
|
||||
"""Test if resulting labels are in range [0, n_clusters - 1]."""
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.rand(10, 2)
|
||||
|
||||
bisect_means = BisectingKMeans(n_clusters=n_clusters, random_state=0)
|
||||
bisect_means.fit(X)
|
||||
|
||||
assert_array_equal(np.unique(bisect_means.labels_), np.arange(n_clusters))
|
||||
|
||||
|
||||
def test_one_cluster():
|
||||
"""Test single cluster."""
|
||||
|
||||
X = np.array([[1, 2], [10, 2], [10, 8]])
|
||||
|
||||
bisect_means = BisectingKMeans(n_clusters=1, random_state=0).fit(X)
|
||||
|
||||
# All labels from fit or predict should be equal 0
|
||||
assert all(bisect_means.labels_ == 0)
|
||||
assert all(bisect_means.predict(X) == 0)
|
||||
|
||||
assert_allclose(bisect_means.cluster_centers_, X.mean(axis=0).reshape(1, -1))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
|
||||
def test_fit_predict(csr_container):
|
||||
"""Check if labels from fit(X) method are same as from fit(X).predict(X)."""
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
X = rng.rand(10, 2)
|
||||
|
||||
if csr_container is not None:
|
||||
X[X < 0.8] = 0
|
||||
X = csr_container(X)
|
||||
|
||||
bisect_means = BisectingKMeans(n_clusters=3, random_state=0)
|
||||
bisect_means.fit(X)
|
||||
|
||||
assert_array_equal(bisect_means.labels_, bisect_means.predict(X))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
|
||||
def test_dtype_preserved(csr_container, global_dtype):
|
||||
"""Check that centers dtype is the same as input data dtype."""
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.rand(10, 2).astype(global_dtype, copy=False)
|
||||
|
||||
if csr_container is not None:
|
||||
X[X < 0.8] = 0
|
||||
X = csr_container(X)
|
||||
|
||||
km = BisectingKMeans(n_clusters=3, random_state=0)
|
||||
km.fit(X)
|
||||
|
||||
assert km.cluster_centers_.dtype == global_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
|
||||
def test_float32_float64_equivalence(csr_container):
|
||||
"""Check that the results are the same between float32 and float64."""
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.rand(10, 2)
|
||||
|
||||
if csr_container is not None:
|
||||
X[X < 0.8] = 0
|
||||
X = csr_container(X)
|
||||
|
||||
km64 = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
|
||||
km32 = BisectingKMeans(n_clusters=3, random_state=0).fit(X.astype(np.float32))
|
||||
|
||||
assert_allclose(km32.cluster_centers_, km64.cluster_centers_)
|
||||
assert_array_equal(km32.labels_, km64.labels_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ("lloyd", "elkan"))
|
||||
def test_no_crash_on_empty_bisections(algorithm):
|
||||
# Non-regression test for:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/27081
|
||||
rng = np.random.RandomState(0)
|
||||
X_train = rng.rand(3000, 10)
|
||||
bkm = BisectingKMeans(n_clusters=10, algorithm=algorithm).fit(X_train)
|
||||
|
||||
# predict on scaled data to trigger pathologic case
|
||||
# where the inner mask leads to empty bisections.
|
||||
X_test = 50 * rng.rand(100, 10)
|
||||
labels = bkm.predict(X_test) # should not crash with idiv by 0
|
||||
assert np.isin(np.unique(labels), np.arange(10)).all()
|
||||
|
||||
|
||||
def test_one_feature():
    # Check that no error is raised when there is only one feature
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/27236
    X = np.random.normal(size=(128, 1))
    BisectingKMeans(bisecting_strategy="biggest_inertia", random_state=0).fit(X)
@@ -0,0 +1,434 @@
"""
Tests for DBSCAN clustering algorithm
"""

import pickle
import warnings

import numpy as np
import pytest
from scipy.spatial import distance

from sklearn.cluster import DBSCAN, dbscan
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS, LIL_CONTAINERS

n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)


def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(
        D, metric="precomputed", eps=eps, min_samples=min_samples
    )
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)

    assert n_clusters_1 == n_clusters

    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters


def test_dbscan_feature():
    # Tests the DBSCAN algorithm with a feature vector array.
    # Parameters chosen specifically for this task.
    # Different eps from the other test, because the distance is not normalised.
    eps = 0.8
    min_samples = 10
    metric = "euclidean"
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters


@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
|
||||
def test_dbscan_sparse(lil_container):
|
||||
core_sparse, labels_sparse = dbscan(lil_container(X), eps=0.8, min_samples=10)
|
||||
core_dense, labels_dense = dbscan(X, eps=0.8, min_samples=10)
|
||||
assert_array_equal(core_dense, core_sparse)
|
||||
assert_array_equal(labels_dense, labels_sparse)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("include_self", [False, True])
|
||||
def test_dbscan_sparse_precomputed(include_self):
|
||||
D = pairwise_distances(X)
|
||||
nn = NearestNeighbors(radius=0.9).fit(X)
|
||||
X_ = X if include_self else None
|
||||
D_sparse = nn.radius_neighbors_graph(X=X_, mode="distance")
|
||||
# Ensure it is sparse not merely on diagonals:
|
||||
assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
|
||||
core_sparse, labels_sparse = dbscan(
|
||||
D_sparse, eps=0.8, min_samples=10, metric="precomputed"
|
||||
)
|
||||
core_dense, labels_dense = dbscan(D, eps=0.8, min_samples=10, metric="precomputed")
|
||||
assert_array_equal(core_dense, core_sparse)
|
||||
assert_array_equal(labels_dense, labels_sparse)
|
||||
|
||||
|
||||
def test_dbscan_sparse_precomputed_different_eps():
    # test that precomputed neighbors graph is filtered if computed with
    # a radius larger than DBSCAN's eps.
    lower_eps = 0.2
    nn = NearestNeighbors(radius=lower_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode="distance")
    dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric="precomputed")

    higher_eps = lower_eps + 0.7
    nn = NearestNeighbors(radius=higher_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode="distance")
    dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric="precomputed")

    assert_array_equal(dbscan_lower[0], dbscan_higher[0])
    assert_array_equal(dbscan_lower[1], dbscan_higher[1])


@pytest.mark.parametrize("metric", ["precomputed", "minkowski"])
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
|
||||
def test_dbscan_input_not_modified(metric, csr_container):
|
||||
# test that the input is not modified by dbscan
|
||||
X = np.random.RandomState(0).rand(10, 10)
|
||||
X = csr_container(X) if csr_container is not None else X
|
||||
X_copy = X.copy()
|
||||
dbscan(X, metric=metric)
|
||||
|
||||
if csr_container is not None:
|
||||
assert_array_equal(X.toarray(), X_copy.toarray())
|
||||
else:
|
||||
assert_array_equal(X, X_copy)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container):
|
||||
"""Check that we don't modify in-place the pre-computed sparse matrix.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/27508
|
||||
"""
|
||||
X = np.random.RandomState(0).rand(10, 10)
|
||||
# Add zeros on the diagonal that will be implicit when creating
|
||||
# the sparse matrix. If `X` is modified in-place, the zeros from
|
||||
# the diagonal will be made explicit.
|
||||
np.fill_diagonal(X, 0)
|
||||
X = csr_container(X)
|
||||
assert all(row != col for row, col in zip(*X.nonzero()))
|
||||
X_copy = X.copy()
|
||||
dbscan(X, metric="precomputed")
|
||||
# Make sure that we did not modify `X` in-place even by creating
|
||||
# explicit 0s values.
|
||||
assert X.nnz == X_copy.nnz
|
||||
assert_array_equal(X.toarray(), X_copy.toarray())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_dbscan_no_core_samples(csr_container):
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.rand(40, 10)
|
||||
X[X < 0.8] = 0
|
||||
|
||||
for X_ in [X, csr_container(X)]:
|
||||
db = DBSCAN(min_samples=6).fit(X_)
|
||||
assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
|
||||
assert_array_equal(db.labels_, -1)
|
||||
assert db.core_sample_indices_.shape == (0,)
|
||||
|
||||
|
||||
def test_dbscan_callable():
    # Tests the DBSCAN algorithm with a callable metric.
    # Parameters chosen specifically for this task.
    # Different eps from the other test, because the distance is not normalised.
    eps = 0.8
    min_samples = 10
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(
        X, metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree"
    )

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters


def test_dbscan_metric_params():
    # Tests that DBSCAN works with the metric_params argument.
    eps = 0.8
    min_samples = 10
    p = 1

    # Compute DBSCAN with metric_params arg

    with warnings.catch_warnings(record=True) as warns:
        db = DBSCAN(
            metric="minkowski",
            metric_params={"p": p},
            eps=eps,
            p=None,
            min_samples=min_samples,
            algorithm="ball_tree",
        ).fit(X)
    assert not warns, warns[0].message
    core_sample_1, labels_1 = db.core_sample_indices_, db.labels_

    # Test that sample labels are the same as passing Minkowski 'p' directly
    db = DBSCAN(
        metric="minkowski", eps=eps, min_samples=min_samples, algorithm="ball_tree", p=p
    ).fit(X)
    core_sample_2, labels_2 = db.core_sample_indices_, db.labels_

    assert_array_equal(core_sample_1, core_sample_2)
    assert_array_equal(labels_1, labels_2)

    # Minkowski with p=1 should be equivalent to Manhattan distance
    db = DBSCAN(
        metric="manhattan", eps=eps, min_samples=min_samples, algorithm="ball_tree"
    ).fit(X)
    core_sample_3, labels_3 = db.core_sample_indices_, db.labels_

    assert_array_equal(core_sample_1, core_sample_3)
    assert_array_equal(labels_1, labels_3)

    with pytest.warns(
        SyntaxWarning,
        match=(
            "Parameter p is found in metric_params. "
            "The corresponding parameter from __init__ "
            "is ignored."
        ),
    ):
        # Test that checks p is ignored in favor of metric_params={'p': <val>}
        db = DBSCAN(
            metric="minkowski",
            metric_params={"p": p},
            eps=eps,
            p=p + 1,
            min_samples=min_samples,
            algorithm="ball_tree",
        ).fit(X)
    core_sample_4, labels_4 = db.core_sample_indices_, db.labels_

    assert_array_equal(core_sample_1, core_sample_4)
    assert_array_equal(labels_1, labels_4)


def test_dbscan_balltree():
    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
    eps = 0.8
    min_samples = 10

    D = pairwise_distances(X)
    core_samples, labels = dbscan(
        D, metric="precomputed", eps=eps, min_samples=min_samples
    )

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="kd_tree")
    labels = db.fit(X).labels_

    n_clusters_3 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_3 == n_clusters

    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_4 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_4 == n_clusters

    db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_5 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_5 == n_clusters


def test_input_validation():
    # DBSCAN.fit should accept a list of lists.
    X = [[1.0, 2.0], [3.0, 4.0]]
    DBSCAN().fit(X)  # must not raise exception


def test_pickle():
    # A DBSCAN estimator must round-trip through pickle.
    obj = DBSCAN()
    s = pickle.dumps(obj)
    assert type(pickle.loads(s)) is obj.__class__


def test_boundaries():
    # ensure min_samples is inclusive of core point
    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
    assert 0 in core
    # ensure eps is inclusive of circumference
    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
    assert 0 in core
    core, _ = dbscan([[0], [1], [1]], eps=0.99, min_samples=2)
    assert 0 not in core


def test_weighted_dbscan(global_random_seed):
    # ensure sample_weight is validated
    with pytest.raises(ValueError):
        dbscan([[0], [1]], sample_weight=[2])
    with pytest.raises(ValueError):
        dbscan([[0], [1]], sample_weight=[2, 3, 4])

    # ensure sample_weight has an effect
    assert_array_equal([], dbscan([[0], [1]], sample_weight=None, min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5], min_samples=6)[0])
    assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5], min_samples=6)[0])
    assert_array_equal(
        [0, 1], dbscan([[0], [1]], sample_weight=[6, 6], min_samples=6)[0]
    )

    # points within eps of each other:
    assert_array_equal(
        [0, 1], dbscan([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)[0]
    )
    # and effect of non-positive and non-integer sample_weight:
    assert_array_equal(
        [], dbscan([[0], [1]], sample_weight=[5, 0], eps=1.5, min_samples=6)[0]
    )
    assert_array_equal(
        [0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1], eps=1.5, min_samples=6)[0]
    )
    assert_array_equal(
        [0, 1], dbscan([[0], [1]], sample_weight=[6, 0], eps=1.5, min_samples=6)[0]
    )
    assert_array_equal(
        [], dbscan([[0], [1]], sample_weight=[6, -1], eps=1.5, min_samples=6)[0]
    )

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(global_random_seed)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert len(label1) == len(X)

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, label_repeated = dbscan(X_repeated)
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[core_repeated] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[core1] = True
    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(D, sample_weight=sample_weight, metric="precomputed")
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = DBSCAN().fit(X, sample_weight=sample_weight)
    core4 = est.core_sample_indices_
    label4 = est.labels_
    assert_array_equal(core1, core4)
    assert_array_equal(label1, label4)

    est = DBSCAN()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    core5 = est.core_sample_indices_
    assert_array_equal(core1, core5)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)


@pytest.mark.parametrize("algorithm", ["brute", "kd_tree", "ball_tree"])
|
||||
def test_dbscan_core_samples_toy(algorithm):
|
||||
X = [[0], [2], [3], [4], [6], [8], [10]]
|
||||
n_samples = len(X)
|
||||
|
||||
# Degenerate case: every sample is a core sample, either with its own
|
||||
# cluster or including other close core samples.
|
||||
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=1)
|
||||
assert_array_equal(core_samples, np.arange(n_samples))
|
||||
assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4])
|
||||
|
||||
# With eps=1 and min_samples=2 only the 3 samples from the denser area
|
||||
# are core samples. All other points are isolated and considered noise.
|
||||
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=2)
|
||||
assert_array_equal(core_samples, [1, 2, 3])
|
||||
assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])
|
||||
|
||||
# Only the sample in the middle of the dense area is core. Its two
|
||||
# neighbors are edge samples. Remaining samples are noise.
|
||||
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=3)
|
||||
assert_array_equal(core_samples, [2])
|
||||
assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])
|
||||
|
||||
# It's no longer possible to extract core samples with eps=1:
|
||||
# everything is noise.
|
||||
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=4)
|
||||
assert_array_equal(core_samples, [])
|
||||
assert_array_equal(labels, np.full(n_samples, -1.0))
|
||||
|
||||
|
||||
def test_dbscan_precomputed_metric_with_degenerate_input_arrays():
    # see https://github.com/scikit-learn/scikit-learn/issues/4641 for
    # more details
    X = np.eye(10)
    labels = DBSCAN(eps=0.5, metric="precomputed").fit(X).labels_
    assert len(set(labels)) == 1

    X = np.zeros((10, 10))
    labels = DBSCAN(eps=0.5, metric="precomputed").fit(X).labels_
    assert len(set(labels)) == 1


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_dbscan_precomputed_metric_with_initial_rows_zero(csr_container):
|
||||
# sample matrix with initial two row all zero
|
||||
ar = np.array(
|
||||
[
|
||||
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
|
||||
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
|
||||
[0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
|
||||
[0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
|
||||
[0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3],
|
||||
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],
|
||||
[0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0],
|
||||
]
|
||||
)
|
||||
matrix = csr_container(ar)
|
||||
labels = DBSCAN(eps=0.2, metric="precomputed", min_samples=2).fit(matrix).labels_
|
||||
assert_array_equal(labels, [-1, -1, 0, 0, 0, 1, 1])
|
||||
@@ -0,0 +1,55 @@
"""
Tests for sklearn.cluster._feature_agglomeration
"""

import numpy as np
from numpy.testing import assert_array_equal

from sklearn.cluster import FeatureAgglomeration
from sklearn.datasets import make_blobs
from sklearn.utils._testing import assert_array_almost_equal


def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.median)
    agglo_mean.fit(X)
    agglo_median.fit(X)

    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.0])
    assert Xt_median == np.array([0.0])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)


def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    X, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3)
    agglo.fit(X)
    n_clusters = agglo.n_clusters_

    names_out = agglo.get_feature_names_out()
    assert_array_equal(
        [f"featureagglomeration{i}" for i in range(n_clusters)], names_out
    )
@@ -0,0 +1,605 @@
"""
Tests for HDBSCAN clustering algorithm
Based on the DBSCAN test code
"""

import numpy as np
import pytest
from scipy import stats
from scipy.spatial import distance

from sklearn.cluster import HDBSCAN
from sklearn.cluster._hdbscan._tree import (
    CONDENSED_dtype,
    _condense_tree,
    _do_labelling,
)
from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING
from sklearn.datasets import make_blobs
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics.pairwise import _VALID_METRICS, euclidean_distances
from sklearn.neighbors import BallTree, KDTree
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS

X, y = make_blobs(n_samples=200, random_state=10)
X, y = shuffle(X, y, random_state=7)
X = StandardScaler().fit_transform(X)

ALGORITHMS = [
    "kd_tree",
    "ball_tree",
    "brute",
    "auto",
]

OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()}


def check_label_quality(labels, threshold=0.99):
    n_clusters = len(set(labels) - OUTLIER_SET)
    assert n_clusters == 3
    assert fowlkes_mallows_score(labels, y) > threshold


@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING)
|
||||
def test_outlier_data(outlier_type):
|
||||
"""
|
||||
Tests if np.inf and np.nan data are each treated as special outliers.
|
||||
"""
|
||||
outlier = {
|
||||
"infinite": np.inf,
|
||||
"missing": np.nan,
|
||||
}[outlier_type]
|
||||
prob_check = {
|
||||
"infinite": lambda x, y: x == y,
|
||||
"missing": lambda x, y: np.isnan(x),
|
||||
}[outlier_type]
|
||||
label = _OUTLIER_ENCODING[outlier_type]["label"]
|
||||
prob = _OUTLIER_ENCODING[outlier_type]["prob"]
|
||||
|
||||
X_outlier = X.copy()
|
||||
X_outlier[0] = [outlier, 1]
|
||||
X_outlier[5] = [outlier, outlier]
|
||||
model = HDBSCAN(copy=False).fit(X_outlier)
|
||||
|
||||
(missing_labels_idx,) = (model.labels_ == label).nonzero()
|
||||
assert_array_equal(missing_labels_idx, [0, 5])
|
||||
|
||||
(missing_probs_idx,) = (prob_check(model.probabilities_, prob)).nonzero()
|
||||
assert_array_equal(missing_probs_idx, [0, 5])
|
||||
|
||||
clean_indices = list(range(1, 5)) + list(range(6, 200))
|
||||
clean_model = HDBSCAN(copy=False).fit(X_outlier[clean_indices])
|
||||
assert_array_equal(clean_model.labels_, model.labels_[clean_indices])
|
||||
|
||||
|
||||
def test_hdbscan_distance_matrix():
    """
    Tests that HDBSCAN works with precomputed distance matrices, and throws the
    appropriate errors when needed.
    """
    D = euclidean_distances(X)
    D_original = D.copy()
    labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(D)

    assert_allclose(D, D_original)
    check_label_quality(labels)

    msg = r"The precomputed distance matrix.*has shape"
    with pytest.raises(ValueError, match=msg):
        HDBSCAN(metric="precomputed", copy=True).fit_predict(X)

    msg = r"The precomputed distance matrix.*values"
    # Ensure the matrix is not symmetric
    D[0, 1] = 10
    D[1, 0] = 1
    with pytest.raises(ValueError, match=msg):
        HDBSCAN(metric="precomputed", copy=False).fit_predict(D)


@pytest.mark.parametrize("sparse_constructor", [*CSR_CONTAINERS, *CSC_CONTAINERS])
|
||||
def test_hdbscan_sparse_distance_matrix(sparse_constructor):
|
||||
"""
|
||||
Tests that HDBSCAN works with sparse distance matrices.
|
||||
"""
|
||||
D = distance.squareform(distance.pdist(X))
|
||||
D /= np.max(D)
|
||||
|
||||
threshold = stats.scoreatpercentile(D.flatten(), 50)
|
||||
|
||||
D[D >= threshold] = 0.0
|
||||
D = sparse_constructor(D)
|
||||
D.eliminate_zeros()
|
||||
|
||||
labels = HDBSCAN(metric="precomputed", copy=False).fit_predict(D)
|
||||
check_label_quality(labels)
|
||||
|
||||
|
||||
def test_hdbscan_feature_array():
    """
    Tests that HDBSCAN works with a feature array, including an arbitrary
    goodness-of-fit check. Note that the check is a simple heuristic.
    """
    labels = HDBSCAN(copy=False).fit_predict(X)

    # Check that clustering is arbitrarily good
    # This is a heuristic to guard against regression
    check_label_quality(labels)


@pytest.mark.parametrize("algo", ALGORITHMS)
|
||||
@pytest.mark.parametrize("metric", _VALID_METRICS)
|
||||
def test_hdbscan_algorithms(algo, metric):
|
||||
"""
|
||||
Tests that HDBSCAN works with the expected combinations of algorithms and
|
||||
metrics, or raises the expected errors.
|
||||
"""
|
||||
labels = HDBSCAN(algorithm=algo, copy=False).fit_predict(X)
|
||||
check_label_quality(labels)
|
||||
|
||||
# Validation for brute is handled by `pairwise_distances`
|
||||
if algo in ("brute", "auto"):
|
||||
return
|
||||
|
||||
ALGOS_TREES = {
|
||||
"kd_tree": KDTree,
|
||||
"ball_tree": BallTree,
|
||||
}
|
||||
metric_params = {
|
||||
"mahalanobis": {"V": np.eye(X.shape[1])},
|
||||
"seuclidean": {"V": np.ones(X.shape[1])},
|
||||
"minkowski": {"p": 2},
|
||||
"wminkowski": {"p": 2, "w": np.ones(X.shape[1])},
|
||||
}.get(metric, None)
|
||||
|
||||
hdb = HDBSCAN(
|
||||
algorithm=algo,
|
||||
metric=metric,
|
||||
metric_params=metric_params,
|
||||
copy=False,
|
||||
)
|
||||
|
||||
if metric not in ALGOS_TREES[algo].valid_metrics:
|
||||
with pytest.raises(ValueError):
|
||||
hdb.fit(X)
|
||||
elif metric == "wminkowski":
|
||||
with pytest.warns(FutureWarning):
|
||||
hdb.fit(X)
|
||||
else:
|
||||
hdb.fit(X)
|
||||
|
||||
|
||||
def test_dbscan_clustering():
    """
    Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
    This test is more of a sanity check than a rigorous evaluation.
    """
    clusterer = HDBSCAN(copy=False).fit(X)
    labels = clusterer.dbscan_clustering(0.3)

    # We use a looser threshold due to dbscan producing a more constrained
    # clustering representation
    check_label_quality(labels, threshold=0.92)


@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1))
|
||||
def test_dbscan_clustering_outlier_data(cut_distance):
|
||||
"""
|
||||
Tests if np.inf and np.nan data are each treated as special outliers.
|
||||
"""
|
||||
missing_label = _OUTLIER_ENCODING["missing"]["label"]
|
||||
infinite_label = _OUTLIER_ENCODING["infinite"]["label"]
|
||||
|
||||
X_outlier = X.copy()
|
||||
X_outlier[0] = [np.inf, 1]
|
||||
X_outlier[2] = [1, np.nan]
|
||||
X_outlier[5] = [np.inf, np.nan]
|
||||
model = HDBSCAN(copy=False).fit(X_outlier)
|
||||
labels = model.dbscan_clustering(cut_distance=cut_distance)
|
||||
|
||||
missing_labels_idx = np.flatnonzero(labels == missing_label)
|
||||
assert_array_equal(missing_labels_idx, [2, 5])
|
||||
|
||||
infinite_labels_idx = np.flatnonzero(labels == infinite_label)
|
||||
assert_array_equal(infinite_labels_idx, [0])
|
||||
|
||||
clean_idx = list(set(range(200)) - set(missing_labels_idx + infinite_labels_idx))
|
||||
clean_model = HDBSCAN(copy=False).fit(X_outlier[clean_idx])
|
||||
clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance)
|
||||
assert_array_equal(clean_labels, labels[clean_idx])
|
||||
|
||||
|
||||
def test_hdbscan_best_balltree_metric():
    """
    Tests that HDBSCAN using `BallTree` works.
    """
    labels = HDBSCAN(
        metric="seuclidean", metric_params={"V": np.ones(X.shape[1])}, copy=False
    ).fit_predict(X)
    check_label_quality(labels)


def test_hdbscan_no_clusters():
    """
    Tests that HDBSCAN correctly does not generate a valid cluster when the
    `min_cluster_size` is too large for the data.
    """
    labels = HDBSCAN(min_cluster_size=len(X) - 1, copy=False).fit_predict(X)
    assert set(labels).issubset(OUTLIER_SET)


def test_hdbscan_min_cluster_size():
    """
    Test that the smallest non-noise cluster has at least `min_cluster_size`
    many points.
    """
    for min_cluster_size in range(2, len(X), 1):
        labels = HDBSCAN(min_cluster_size=min_cluster_size, copy=False).fit_predict(X)
        true_labels = [label for label in labels if label != -1]
        if len(true_labels) != 0:
            assert np.min(np.bincount(true_labels)) >= min_cluster_size


def test_hdbscan_callable_metric():
    """
    Tests that HDBSCAN works when passed a callable metric.
    """
    metric = distance.euclidean
    labels = HDBSCAN(metric=metric, copy=False).fit_predict(X)
    check_label_quality(labels)


@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"])
|
||||
def test_hdbscan_precomputed_non_brute(tree):
|
||||
"""
|
||||
Tests that HDBSCAN correctly raises an error when passing precomputed data
|
||||
while requesting a tree-based algorithm.
|
||||
"""
|
||||
hdb = HDBSCAN(metric="precomputed", algorithm=tree, copy=False)
|
||||
msg = "precomputed is not a valid metric for"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
hdb.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_hdbscan_sparse(csr_container):
|
||||
"""
|
||||
Tests that HDBSCAN works correctly when passing sparse feature data.
|
||||
Evaluates correctness by comparing against the same data passed as a dense
|
||||
array.
|
||||
"""
|
||||
|
||||
dense_labels = HDBSCAN(copy=False).fit(X).labels_
|
||||
check_label_quality(dense_labels)
|
||||
|
||||
_X_sparse = csr_container(X)
|
||||
X_sparse = _X_sparse.copy()
|
||||
sparse_labels = HDBSCAN(copy=False).fit(X_sparse).labels_
|
||||
assert_array_equal(dense_labels, sparse_labels)
|
||||
|
||||
# Compare that the sparse and dense non-precomputed routines return the same labels
|
||||
# where the 0th observation contains the outlier.
|
||||
for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")):
|
||||
X_dense = X.copy()
|
||||
X_dense[0, 0] = outlier_val
|
||||
dense_labels = HDBSCAN(copy=False).fit(X_dense).labels_
|
||||
check_label_quality(dense_labels)
|
||||
assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]
|
||||
|
||||
X_sparse = _X_sparse.copy()
|
||||
X_sparse[0, 0] = outlier_val
|
||||
sparse_labels = HDBSCAN(copy=False).fit(X_sparse).labels_
|
||||
assert_array_equal(dense_labels, sparse_labels)
|
||||
|
||||
msg = "Sparse data matrices only support algorithm `brute`."
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
HDBSCAN(metric="euclidean", algorithm="ball_tree", copy=False).fit(X_sparse)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ALGORITHMS)
|
||||
def test_hdbscan_centers(algorithm):
|
||||
"""
|
||||
Tests that HDBSCAN centers are calculated and stored properly, and are
|
||||
accurate to the data.
|
||||
"""
|
||||
centers = [(0.0, 0.0), (3.0, 3.0)]
|
||||
H, _ = make_blobs(n_samples=2000, random_state=0, centers=centers, cluster_std=0.5)
|
||||
hdb = HDBSCAN(store_centers="both", copy=False).fit(H)
|
||||
|
||||
for center, centroid, medoid in zip(centers, hdb.centroids_, hdb.medoids_):
|
||||
assert_allclose(center, centroid, rtol=1, atol=0.05)
|
||||
assert_allclose(center, medoid, rtol=1, atol=0.05)
|
||||
|
||||
# Ensure that nothing is done for noise
|
||||
hdb = HDBSCAN(
|
||||
algorithm=algorithm,
|
||||
store_centers="both",
|
||||
min_cluster_size=X.shape[0],
|
||||
copy=False,
|
||||
).fit(X)
|
||||
assert hdb.centroids_.shape[0] == 0
|
||||
assert hdb.medoids_.shape[0] == 0
|
||||
|
||||
|
||||
def test_hdbscan_allow_single_cluster_with_epsilon():
    """
    Tests that HDBSCAN single-cluster selection with epsilon works correctly.
    """
    rng = np.random.RandomState(0)
    no_structure = rng.rand(150, 2)
    # without epsilon we should see many noise points as children of root.
    labels = HDBSCAN(
        min_cluster_size=5,
        cluster_selection_epsilon=0.0,
        cluster_selection_method="eom",
        allow_single_cluster=True,
        copy=False,
    ).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2

    # Arbitrary heuristic. Would prefer something more precise.
    assert counts[unique_labels == -1] > 30

    # for this random seed an epsilon of 0.18 will produce exactly 2 noise
    # points at that cut in single linkage.
    labels = HDBSCAN(
        min_cluster_size=5,
        cluster_selection_epsilon=0.18,
        cluster_selection_method="eom",
        allow_single_cluster=True,
        algorithm="kd_tree",
        copy=False,
    ).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 2


def test_hdbscan_better_than_dbscan():
    """
    Validate that HDBSCAN can properly cluster this difficult synthetic
    dataset. Note that DBSCAN fails on this (see the HDBSCAN plotting
    example).
    """
    centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]]
    X, y = make_blobs(
        n_samples=750,
        centers=centers,
        cluster_std=[0.2, 0.35, 1.35, 1.35],
        random_state=0,
    )
    labels = HDBSCAN(copy=False).fit(X).labels_

    n_clusters = len(set(labels)) - int(-1 in labels)
    assert n_clusters == 4
    assert fowlkes_mallows_score(labels, y) > 0.99


@pytest.mark.parametrize(
    "kwargs, X",
    [
        ({"metric": "precomputed"}, np.array([[1, np.inf], [np.inf, 1]])),
        ({"metric": "precomputed"}, [[1, 2], [2, 1]]),
        ({}, [[1, 2], [3, 4]]),
    ],
)
def test_hdbscan_usable_inputs(X, kwargs):
    """
    Tests that HDBSCAN works correctly for array-likes and precomputed inputs
    with non-finite points.
    """
    HDBSCAN(min_samples=1, copy=False, **kwargs).fit(X)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_hdbscan_sparse_distances_too_few_nonzero(csr_container):
|
||||
"""
|
||||
Tests that HDBSCAN raises the correct error when there are too few
|
||||
non-zero distances.
|
||||
"""
|
||||
X = csr_container(np.zeros((10, 10)))
|
||||
|
||||
msg = "There exists points with fewer than"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
HDBSCAN(metric="precomputed", copy=False).fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_hdbscan_sparse_distances_disconnected_graph(csr_container):
|
||||
"""
|
||||
Tests that HDBSCAN raises the correct error when the distance matrix
|
||||
has multiple connected components.
|
||||
"""
|
||||
# Create symmetric sparse matrix with 2 connected components
|
||||
X = np.zeros((20, 20))
|
||||
X[:5, :5] = 1
|
||||
X[5:, 15:] = 1
|
||||
X = X + X.T
|
||||
X = csr_container(X)
|
||||
msg = "HDBSCAN cannot be performed on a disconnected graph"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
HDBSCAN(metric="precomputed", copy=False).fit(X)
|
||||
|
||||
|
||||
def test_hdbscan_tree_invalid_metric():
    """
    Tests that HDBSCAN correctly raises an error for invalid metric choices.
    """
    metric_callable = lambda x: x
    msg = (
        ".* is not a valid metric for a .*-based algorithm\\. Please select a different"
        " metric\\."
    )

    # Callables are not supported for either
    with pytest.raises(ValueError, match=msg):
        HDBSCAN(algorithm="kd_tree", metric=metric_callable, copy=False).fit(X)
    with pytest.raises(ValueError, match=msg):
        HDBSCAN(algorithm="ball_tree", metric=metric_callable, copy=False).fit(X)

    # The set of valid metrics for KDTree at the time of writing this test is a
    # strict subset of those supported in BallTree
    metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics))
    if len(metrics_not_kd) > 0:
        with pytest.raises(ValueError, match=msg):
            HDBSCAN(algorithm="kd_tree", metric=metrics_not_kd[0], copy=False).fit(X)


def test_hdbscan_too_many_min_samples():
    """
    Tests that HDBSCAN correctly raises an error when setting `min_samples`
    larger than the number of samples.
    """
    hdb = HDBSCAN(min_samples=len(X) + 1, copy=False)
    msg = r"min_samples (.*) must be at most"
    with pytest.raises(ValueError, match=msg):
        hdb.fit(X)


def test_hdbscan_precomputed_dense_nan():
    """
    Tests that HDBSCAN correctly raises an error when providing precomputed
    distances with `np.nan` values.
    """
    X_nan = X.copy()
    X_nan[0, 0] = np.nan
    msg = "np.nan values found in precomputed-dense"
    hdb = HDBSCAN(metric="precomputed", copy=False)
    with pytest.raises(ValueError, match=msg):
        hdb.fit(X_nan)


@pytest.mark.parametrize("allow_single_cluster", [True, False])
|
||||
@pytest.mark.parametrize("epsilon", [0, 0.1])
|
||||
def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon):
|
||||
"""
|
||||
Tests that the `_do_labelling` helper function correctly assigns labels.
|
||||
"""
|
||||
n_samples = 48
|
||||
X, y = make_blobs(
|
||||
n_samples,
|
||||
random_state=global_random_seed,
|
||||
# Ensure the clusters are distinct with no overlap
|
||||
centers=[
|
||||
[0, 0],
|
||||
[10, 0],
|
||||
[0, 10],
|
||||
],
|
||||
)
|
||||
|
||||
est = HDBSCAN(copy=False).fit(X)
|
||||
condensed_tree = _condense_tree(
|
||||
est._single_linkage_tree_, min_cluster_size=est.min_cluster_size
|
||||
)
|
||||
clusters = {n_samples + 2, n_samples + 3, n_samples + 4}
|
||||
cluster_label_map = {n_samples + 2: 0, n_samples + 3: 1, n_samples + 4: 2}
|
||||
labels = _do_labelling(
|
||||
condensed_tree=condensed_tree,
|
||||
clusters=clusters,
|
||||
cluster_label_map=cluster_label_map,
|
||||
allow_single_cluster=allow_single_cluster,
|
||||
cluster_selection_epsilon=epsilon,
|
||||
)
|
||||
|
||||
first_with_label = {_y: np.where(y == _y)[0][0] for _y in list(set(y))}
|
||||
y_to_labels = {_y: labels[first_with_label[_y]] for _y in list(set(y))}
|
||||
aligned_target = np.vectorize(y_to_labels.get)(y)
|
||||
assert_array_equal(labels, aligned_target)
|
||||
|
||||
|
||||
def test_labelling_thresholding():
    """
    Tests that the `_do_labelling` helper function correctly thresholds the
    incoming lambda values given various `cluster_selection_epsilon` values.
    """
    n_samples = 5
    MAX_LAMBDA = 1.5
    condensed_tree = np.array(
        [
            (5, 2, MAX_LAMBDA, 1),
            (5, 1, 0.1, 1),
            (5, 0, MAX_LAMBDA, 1),
            (5, 3, 0.2, 1),
            (5, 4, 0.3, 1),
        ],
        dtype=CONDENSED_dtype,
    )
    labels = _do_labelling(
        condensed_tree=condensed_tree,
        clusters={n_samples},
        cluster_label_map={n_samples: 0, n_samples + 1: 1},
        allow_single_cluster=True,
        cluster_selection_epsilon=1,
    )
    num_noise = condensed_tree["value"] < 1
    assert sum(num_noise) == sum(labels == -1)

    labels = _do_labelling(
        condensed_tree=condensed_tree,
        clusters={n_samples},
        cluster_label_map={n_samples: 0, n_samples + 1: 1},
        allow_single_cluster=True,
        cluster_selection_epsilon=0,
    )
    # The threshold should be calculated per-sample based on the largest
    # lambda of any sibling node. In this case, all points are siblings
    # and the largest value is exactly MAX_LAMBDA.
    num_noise = condensed_tree["value"] < MAX_LAMBDA
    assert sum(num_noise) == sum(labels == -1)


@pytest.mark.parametrize("store_centers", ["centroid", "medoid"])
|
||||
def test_hdbscan_error_precomputed_and_store_centers(store_centers):
|
||||
"""Check that we raise an error if the centers are requested together with
|
||||
a precomputed input matrix.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/27893
|
||||
"""
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.random((100, 2))
|
||||
X_dist = euclidean_distances(X)
|
||||
err_msg = "Cannot store centers when using a precomputed distance matrix."
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
HDBSCAN(
|
||||
metric="precomputed",
|
||||
store_centers=store_centers,
|
||||
copy=False,
|
||||
).fit(X_dist)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("valid_algo", ["auto", "brute"])
|
||||
def test_hdbscan_cosine_metric_valid_algorithm(valid_algo):
|
||||
"""Test that HDBSCAN works with the "cosine" metric when the algorithm is set
|
||||
to "brute" or "auto".
|
||||
|
||||
Non-regression test for issue #28631
|
||||
"""
|
||||
HDBSCAN(metric="cosine", algorithm=valid_algo, copy=False).fit_predict(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("invalid_algo", ["kd_tree", "ball_tree"])
|
||||
def test_hdbscan_cosine_metric_invalid_algorithm(invalid_algo):
|
||||
"""Test that HDBSCAN raises an informative error is raised when an unsupported
|
||||
algorithm is used with the "cosine" metric.
|
||||
"""
|
||||
hdbscan = HDBSCAN(metric="cosine", algorithm=invalid_algo, copy=False)
|
||||
with pytest.raises(ValueError, match="cosine is not a valid metric"):
|
||||
hdbscan.fit_predict(X)
|
||||
|
||||
|
||||
# TODO(1.10): remove this test
def test_hdbscan_default_copy_warning():
    """
    Test that HDBSCAN raises a FutureWarning when the `copy`
    parameter is not set.
    """
    X = np.random.RandomState(0).random((100, 2))
    msg = r"The default value of `copy` will change from False to True in 1.10."
    with pytest.warns(FutureWarning, match=msg):
        hdb = HDBSCAN(min_cluster_size=20)
        hdb.fit(X)
@@ -0,0 +1,889 @@
"""
Several basic tests for hierarchical clustering procedures

"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import itertools
import shutil
from functools import partial
from tempfile import mkdtemp

import numpy as np
import pytest
from scipy.cluster import hierarchy
from scipy.sparse.csgraph import connected_components

from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration, ward_tree
from sklearn.cluster._agglomerative import (
    _TREE_BUILDERS,
    _fix_connectivity,
    _hc_cut,
    linkage_tree,
)
from sklearn.cluster._hierarchical_fast import (
    average_merge,
    max_merge,
    mst_linkage_core,
)
from sklearn.datasets import make_circles, make_moons
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.metrics import DistanceMetric
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics.pairwise import (
    PAIRED_DISTANCES,
    cosine_distances,
    manhattan_distances,
    pairwise_distances,
)
from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
from sklearn.neighbors import kneighbors_graph
from sklearn.utils._fast_dict import IntFloatDict
from sklearn.utils._testing import (
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
    create_memmap_backed_data,
    ignore_warnings,
)
from sklearn.utils.fixes import LIL_CONTAINERS


def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))

    with pytest.raises(ValueError):
        linkage_tree(X, linkage="foo")

    with pytest.raises(ValueError):
        linkage_tree(X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])


def test_structured_linkage_tree():
    # Check that we obtain the correct solution for structured linkage trees.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
        children, n_components, n_leaves, parent = tree_builder(
            X.T, connectivity=connectivity
        )
        n_nodes = 2 * X.shape[1] - 1
        assert len(children) + n_leaves == n_nodes
        # Check that ward_tree raises a ValueError with a connectivity matrix
        # of the wrong shape
        with pytest.raises(ValueError):
            tree_builder(X.T, connectivity=np.ones((4, 4)))
        # Check that fitting with no samples raises an error
        with pytest.raises(ValueError):
            tree_builder(X.T[:0], connectivity=connectivity)


def test_unstructured_linkage_tree():
    # Check that we obtain the correct solution for unstructured linkage trees.
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    for this_X in (X, X[0]):
        # With a specified number of clusters just for the sake of
        # raising a warning and testing the warning code
        with ignore_warnings():
            with pytest.warns(UserWarning):
                children, n_nodes, n_leaves, parent = ward_tree(this_X.T, n_clusters=10)
        n_nodes = 2 * X.shape[1] - 1
        assert len(children) + n_leaves == n_nodes

    for tree_builder in _TREE_BUILDERS.values():
        for this_X in (X, X[0]):
            with ignore_warnings():
                with pytest.warns(UserWarning):
                    children, n_nodes, n_leaves, parent = tree_builder(
                        this_X.T, n_clusters=10
                    )
            n_nodes = 2 * X.shape[1] - 1
            assert len(children) + n_leaves == n_nodes


def test_height_linkage_tree():
    # Check that the height of the results of linkage tree is sorted.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for linkage_func in _TREE_BUILDERS.values():
        children, n_nodes, n_leaves, parent = linkage_func(
            X.T, connectivity=connectivity
        )
        n_nodes = 2 * X.shape[1] - 1
        assert len(children) + n_leaves == n_nodes


def test_zero_cosine_linkage_tree():
    # Check that zero vectors in X produce an error when
    # 'cosine' affinity is used
    X = np.array([[0, 1], [0, 0]])
    msg = "Cosine affinity cannot be used when X contains zero vectors"
    with pytest.raises(ValueError, match=msg):
        linkage_tree(X, affinity="cosine")


@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)])
|
||||
@pytest.mark.parametrize("compute_distances", [True, False])
|
||||
@pytest.mark.parametrize("linkage", ["ward", "complete", "average", "single"])
|
||||
def test_agglomerative_clustering_distances(
|
||||
n_clusters, compute_distances, distance_threshold, linkage
|
||||
):
|
||||
# Check that when `compute_distances` is True or `distance_threshold` is
|
||||
# given, the fitted model has an attribute `distances_`.
|
||||
rng = np.random.RandomState(0)
|
||||
mask = np.ones([10, 10], dtype=bool)
|
||||
n_samples = 100
|
||||
X = rng.randn(n_samples, 50)
|
||||
connectivity = grid_to_graph(*mask.shape)
|
||||
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=n_clusters,
|
||||
connectivity=connectivity,
|
||||
linkage=linkage,
|
||||
distance_threshold=distance_threshold,
|
||||
compute_distances=compute_distances,
|
||||
)
|
||||
clustering.fit(X)
|
||||
if compute_distances or (distance_threshold is not None):
|
||||
assert hasattr(clustering, "distances_")
|
||||
n_children = clustering.children_.shape[0]
|
||||
n_nodes = n_children + 1
|
||||
assert clustering.distances_.shape == (n_nodes - 1,)
|
||||
else:
|
||||
assert not hasattr(clustering, "distances_")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
|
||||
def test_agglomerative_clustering(global_random_seed, lil_container):
|
||||
# Check that we obtain the correct number of clusters with
|
||||
# agglomerative clustering.
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
mask = np.ones([10, 10], dtype=bool)
|
||||
n_samples = 100
|
||||
X = rng.randn(n_samples, 50)
|
||||
connectivity = grid_to_graph(*mask.shape)
|
||||
for linkage in ("ward", "complete", "average", "single"):
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=10, connectivity=connectivity, linkage=linkage
|
||||
)
|
||||
clustering.fit(X)
|
||||
# test caching
|
||||
try:
|
||||
tempdir = mkdtemp()
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=10,
|
||||
connectivity=connectivity,
|
||||
memory=tempdir,
|
||||
linkage=linkage,
|
||||
)
|
||||
clustering.fit(X)
|
||||
labels = clustering.labels_
|
||||
assert np.size(np.unique(labels)) == 10
|
||||
finally:
|
||||
shutil.rmtree(tempdir)
|
||||
# Turn caching off now
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=10, connectivity=connectivity, linkage=linkage
|
||||
)
|
||||
# Check that we obtain the same solution with early-stopping of the
|
||||
# tree building
|
||||
clustering.compute_full_tree = False
|
||||
clustering.fit(X)
|
||||
assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1)
|
||||
clustering.connectivity = None
|
||||
clustering.fit(X)
|
||||
assert np.size(np.unique(clustering.labels_)) == 10
|
||||
# Check that we raise a TypeError on dense matrices
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=10,
|
||||
connectivity=lil_container(connectivity.toarray()[:10, :10]),
|
||||
linkage=linkage,
|
||||
)
|
||||
with pytest.raises(ValueError):
|
||||
clustering.fit(X)
|
||||
|
||||
# Test that using ward with another metric than euclidean raises an
|
||||
# exception
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=10,
|
||||
connectivity=connectivity.toarray(),
|
||||
metric="manhattan",
|
||||
linkage="ward",
|
||||
)
|
||||
with pytest.raises(ValueError):
|
||||
clustering.fit(X)
|
||||
|
||||
# Test using another metric than euclidean works with linkage complete
|
||||
for metric in PAIRED_DISTANCES.keys():
|
||||
# Compare our (structured) implementation to scipy
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=10,
|
||||
connectivity=np.ones((n_samples, n_samples)),
|
||||
metric=metric,
|
||||
linkage="complete",
|
||||
)
|
||||
clustering.fit(X)
|
||||
clustering2 = AgglomerativeClustering(
|
||||
n_clusters=10, connectivity=None, metric=metric, linkage="complete"
|
||||
)
|
||||
clustering2.fit(X)
|
||||
assert_almost_equal(
|
||||
normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1
|
||||
)
|
||||
|
||||
# Test that using a distance matrix (affinity = 'precomputed') has same
|
||||
# results (with connectivity constraints)
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=10, connectivity=connectivity, linkage="complete"
|
||||
)
|
||||
clustering.fit(X)
|
||||
X_dist = pairwise_distances(X)
|
||||
clustering2 = AgglomerativeClustering(
|
||||
n_clusters=10,
|
||||
connectivity=connectivity,
|
||||
metric="precomputed",
|
||||
linkage="complete",
|
||||
)
|
||||
clustering2.fit(X_dist)
|
||||
assert_array_equal(clustering.labels_, clustering2.labels_)
|
||||
|
||||
|
||||
def test_agglomerative_clustering_memory_mapped():
    """AgglomerativeClustering must work on mem-mapped dataset.

    Non-regression test for issue #19875.
    """
    rng = np.random.RandomState(0)
    Xmm = create_memmap_backed_data(rng.randn(50, 100))
    AgglomerativeClustering(metric="euclidean", linkage="single").fit(Xmm)


def test_ward_agglomeration(global_random_seed):
    # Check that we obtain the correct solution in a simplistic case
    rng = np.random.RandomState(global_random_seed)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert np.size(np.unique(agglo.labels_)) == 5

    X_red = agglo.transform(X)
    assert X_red.shape[1] == 5
    X_full = agglo.inverse_transform(X_red)
    assert np.unique(X_full[0]).size == 5
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    with pytest.raises(ValueError):
        agglo.fit(X[:0])


def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage="single")
    clustering.fit(moons)
    assert_almost_equal(
        normalized_mutual_info_score(clustering.labels_, moon_labels), 1
    )

    circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage="single")
    clustering.fit(circles)
    assert_almost_equal(
        normalized_mutual_info_score(clustering.labels_, circle_labels), 1
    )


def assess_same_labelling(cut1, cut2):
    """Util for comparison with scipy"""
    co_clust = []
    for cut in [cut1, cut2]:
        n = len(cut)
        k = cut.max() + 1
        ecut = np.zeros((n, k))
        ecut[np.arange(n), cut] = 1
        co_clust.append(np.dot(ecut, ecut.T))
    assert (co_clust[0] == co_clust[1]).all()


def test_sparse_scikit_vs_scipy(global_random_seed):
    # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(global_random_seed)

    # Not using a lil_matrix here, just to check that non-sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS.keys():
        for i in range(5):
            X = 0.1 * rng.normal(size=(n, p))
            X -= 4.0 * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            children_ = out[:, :2].astype(int, copy=False)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](
                X, connectivity=connectivity
            )

            # Sort the order of child nodes per row for consistency
            children.sort(axis=1)
            assert_array_equal(
                children,
                children_,
                "linkage tree differs from scipy impl for linkage: " + linkage,
            )

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    with pytest.raises(ValueError):
        _hc_cut(n_leaves + 1, children, n_leaves)


# Make sure our custom mst_linkage_core gives
# the same results as scipy's builtin
def test_vector_scikit_single_vs_scipy_single(global_random_seed):
    n_samples, n_features, n_clusters = 10, 5, 3
    rng = np.random.RandomState(global_random_seed)
    X = 0.1 * rng.normal(size=(n_samples, n_features))
    X -= 4.0 * np.arange(n_samples)[:, np.newaxis]
    X -= X.mean(axis=1)[:, np.newaxis]

    out = hierarchy.linkage(X, method="single")
    children_scipy = out[:, :2].astype(int)

    children, _, n_leaves, _ = _TREE_BUILDERS["single"](X)

    # Sort the order of child nodes per row for consistency
    children.sort(axis=1)
    assert_array_equal(
        children,
        children_scipy,
        "linkage tree differs from scipy impl for single linkage.",
    )

    cut = _hc_cut(n_clusters, children, n_leaves)
    cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves)
    assess_same_labelling(cut, cut_scipy)


@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
|
||||
def test_mst_linkage_core_memory_mapped(metric_param_grid):
|
||||
"""The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.
|
||||
|
||||
Non-regression test for issue #19875.
|
||||
"""
|
||||
rng = np.random.RandomState(seed=1)
|
||||
X = rng.normal(size=(20, 4))
|
||||
Xmm = create_memmap_backed_data(X)
|
||||
metric, param_grid = metric_param_grid
|
||||
keys = param_grid.keys()
|
||||
for vals in itertools.product(*param_grid.values()):
|
||||
kwargs = dict(zip(keys, vals))
|
||||
distance_metric = DistanceMetric.get_metric(metric, **kwargs)
|
||||
mst = mst_linkage_core(X, distance_metric)
|
||||
mst_mm = mst_linkage_core(Xmm, distance_metric)
|
||||
np.testing.assert_equal(mst, mst_mm)
|
||||
|
||||
|
||||
def test_identical_points():
|
||||
# Ensure identical points are handled correctly when using mst with
|
||||
# a sparse connectivity matrix
|
||||
X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]])
|
||||
true_labels = np.array([0, 0, 1, 1, 2, 2])
|
||||
connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
|
||||
connectivity = 0.5 * (connectivity + connectivity.T)
|
||||
connectivity, n_components = _fix_connectivity(X, connectivity, "euclidean")
|
||||
|
||||
for linkage in ("single", "average", "average", "ward"):
|
||||
        clustering = AgglomerativeClustering(
            n_clusters=3, linkage=linkage, connectivity=connectivity
        )
        clustering.fit(X)

        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, true_labels), 1
        )


def test_connectivity_propagation():
    # Check that connectivity in the ward tree is propagated correctly during
    # merging.
    X = np.array(
        [
            (0.014, 0.120),
            (0.014, 0.099),
            (0.014, 0.097),
            (0.017, 0.153),
            (0.017, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.152),
            (0.018, 0.149),
            (0.018, 0.144),
        ]
    )
    connectivity = kneighbors_graph(X, 10, include_self=False)
    ward = AgglomerativeClustering(
        n_clusters=4, connectivity=connectivity, linkage="ward"
    )
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)


def test_ward_tree_children_order(global_random_seed):
    # Check that children are ordered in the same way for both structured and
    # unstructured versions of ward_tree.

    # test on five random datasets
    n, p = 10, 5
    rng = np.random.RandomState(global_random_seed)

    connectivity = np.ones((n, n))
    for i in range(5):
        X = 0.1 * rng.normal(size=(n, p))
        X -= 4.0 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X)
        out_structured = ward_tree(X, connectivity=connectivity)

        assert_array_equal(out_unstructured[0], out_structured[0])


def test_ward_linkage_tree_return_distance(global_random_seed):
    # Test return_distance option on linkage and ward trees

    # test that return_distance, when set to True, gives the same
    # output for both structured and unstructured clustering.
    n, p = 10, 5
    rng = np.random.RandomState(global_random_seed)

    connectivity = np.ones((n, n))
    for i in range(5):
        X = 0.1 * rng.normal(size=(n, p))
        X -= 4.0 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X, connectivity=connectivity, return_distance=True)

        # get children
        children_unstructured = out_unstructured[0]
        children_structured = out_structured[0]

        # check if we got the same clusters
        assert_array_equal(children_unstructured, children_structured)

        # check if the distances are the same
        dist_unstructured = out_unstructured[-1]
        dist_structured = out_structured[-1]

        assert_array_almost_equal(dist_unstructured, dist_structured)

        for linkage in ["average", "complete", "single"]:
            structured_items = linkage_tree(
                X, connectivity=connectivity, linkage=linkage, return_distance=True
            )[-1]
            unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)[
                -1
            ]
            structured_dist = structured_items[-1]
            unstructured_dist = unstructured_items[-1]
            structured_children = structured_items[0]
            unstructured_children = unstructured_items[0]
            assert_array_almost_equal(structured_dist, unstructured_dist)
            assert_array_almost_equal(structured_children, unstructured_children)

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array(
        [
            [1.43054825, -7.5693489],
            [6.95887839, 6.82293382],
            [2.87137846, -9.68248579],
            [7.87974764, -6.05485803],
            [8.24018364, -6.09495602],
            [7.39020262, 8.54004355],
        ]
    )
    # truth
    linkage_X_ward = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 9.10208346, 4.0],
            [7.0, 9.0, 24.7784379, 6.0],
        ]
    )

    linkage_X_complete = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 6.96742194, 4.0],
            [7.0, 9.0, 18.77445997, 6.0],
        ]
    )

    linkage_X_average = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 6.55832839, 4.0],
            [7.0, 9.0, 15.44089605, 6.0],
        ]
    )

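    # In the truth arrays above, each row follows the scipy linkage format
    # [child_1, child_2, merge_distance, n_samples_in_cluster]; the checks
    # below compare columns [:2] against the returned children and column 2
    # against the returned distances.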
    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    out_X_unstructured = ward_tree(X, return_distance=True)
    out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True)

    # check that the labels are the same
    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])

    # check that the distances are correct
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])

    linkage_options = ["complete", "average", "single"]
    X_linkage_truth = [linkage_X_complete, linkage_X_average]
    for linkage, X_truth in zip(linkage_options, X_linkage_truth):
        out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage)
        out_X_structured = linkage_tree(
            X, connectivity=connectivity_X, linkage=linkage, return_distance=True
        )

        # check that the labels are the same
        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
        assert_array_equal(X_truth[:, :2], out_X_structured[0])

        # check that the distances are correct
        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])


def test_connectivity_fixing_non_lil():
    # Non-regression check for a bug when a connectivity matrix that does not
    # support item assignment is provided with more than one component.
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = AgglomerativeClustering(connectivity=c, linkage="ward")
    with pytest.warns(UserWarning):
        w.fit(x)


def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value

    other_keys = np.arange(50, dtype=np.intp)[::2]
    other_values = np.full(50, 0.5)[::2]
    other = IntFloatDict(other_keys, other_values)
    # Complete smoke test
    max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
    average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)


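# Background (summary, not asserted by the test): IntFloatDict is the Cython
# int -> float hash map that the agglomerative merge strategies use to store
# distances to neighboring clusters; max_merge / average_merge combine two
# such maps with a maximum or a size-weighted average, which is what
# complete and average linkage need when two clusters are merged.

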
def test_connectivity_callable():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(
        connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False)
    )
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)


def test_connectivity_ignores_diagonal():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    connectivity_include_self = kneighbors_graph(X, 3, include_self=True)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self)
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)


def test_compute_full_tree():
    # Test that the full tree is computed if n_clusters is small
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    connectivity = kneighbors_graph(X, 5, include_self=False)

    # When n_clusters is small, the full tree should be built,
    # that is, the number of merges should be n_samples - 1
    agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert n_nodes == n_samples - 1

    # When n_clusters is large, greater than the max of 100 and 0.02 * n_samples,
    # we should stop once there are n_clusters clusters.
    n_clusters = 101
    X = rng.randn(200, 2)
    connectivity = kneighbors_graph(X, 10, include_self=False)
    agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert n_nodes == n_samples - n_clusters


def test_n_components():
    # Test n_components returned by linkage, average and ward tree
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Connectivity matrix having five components.
    connectivity = np.eye(5)

    for linkage_func in _TREE_BUILDERS.values():
        assert ignore_warnings(linkage_func)(X, connectivity=connectivity)[1] == 5


def test_affinity_passed_to_fix_connectivity():
    # Test that the affinity parameter is actually passed to the pairwise
    # function

    size = 2
    rng = np.random.RandomState(0)
    X = rng.randn(size, size)
    mask = np.array([True, False, False, True])

    connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)

    class FakeAffinity:
        def __init__(self):
            self.counter = 0

        def increment(self, *args, **kwargs):
            self.counter += 1
            return self.counter

    fa = FakeAffinity()

    linkage_tree(X, connectivity=connectivity, affinity=fa.increment)

    assert fa.counter == 3


@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
def test_agglomerative_clustering_with_distance_threshold(linkage, global_random_seed):
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering with distance_threshold.
    rng = np.random.RandomState(global_random_seed)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    # test when distance threshold is set to 10
    distance_threshold = 10
    for conn in [None, connectivity]:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn,
            linkage=linkage,
        )
        clustering.fit(X)
        clusters_produced = clustering.labels_
        num_clusters_produced = len(np.unique(clustering.labels_))
        # test if the clusters produced match the point in the linkage tree
        # where the distance exceeds the threshold
        tree_builder = _TREE_BUILDERS[linkage]
        children, n_components, n_leaves, parent, distances = tree_builder(
            X, connectivity=conn, n_clusters=None, return_distance=True
        )
        num_clusters_at_threshold = (
            np.count_nonzero(distances >= distance_threshold) + 1
        )
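        # Rationale: a full tree performs n_samples - 1 merges; refusing every
        # merge whose distance is >= distance_threshold leaves exactly one
        # more cluster than the number of such merges, hence the "+ 1" above.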
        # test number of clusters produced
        assert num_clusters_at_threshold == num_clusters_produced
        # test clusters produced
        clusters_at_threshold = _hc_cut(
            n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves
        )
        assert np.array_equiv(clusters_produced, clusters_at_threshold)


def test_small_distance_threshold(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 10
    X = rng.randint(-300, 300, size=(n_samples, 3))
    # this should result in each data point being in its own cluster, given
    # that their pairwise distances are bigger than .1 (which may not be the
    # case with a different random seed).
    clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=1.0, linkage="single"
    ).fit(X)
    # check that the pairwise distances are indeed all larger than .1
    all_distances = pairwise_distances(X, metric="minkowski", p=2)
    np.fill_diagonal(all_distances, np.inf)
    assert np.all(all_distances > 0.1)
    assert clustering.n_clusters_ == n_samples


def test_cluster_distances_with_distance_threshold(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 100
    X = rng.randint(-10, 10, size=(n_samples, 3))
    # check the distances within the clusters and with other clusters
    distance_threshold = 4
    clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=distance_threshold, linkage="single"
    ).fit(X)
    labels = clustering.labels_
    D = pairwise_distances(X, metric="minkowski", p=2)
    # to avoid taking the 0 diagonal in min()
    np.fill_diagonal(D, np.inf)
    for label in np.unique(labels):
        in_cluster_mask = labels == label
        max_in_cluster_distance = (
            D[in_cluster_mask][:, in_cluster_mask].min(axis=0).max()
        )
        min_out_cluster_distance = (
            D[in_cluster_mask][:, ~in_cluster_mask].min(axis=0).min()
        )
        # single data point clusters only have that inf diagonal here
        if in_cluster_mask.sum() > 1:
            assert max_in_cluster_distance < distance_threshold
        assert min_out_cluster_distance >= distance_threshold


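# The assertions above rest on a property of single linkage: every point in a
# cluster is connected to it through an edge shorter than the threshold, so
# the nearest-in-cluster distances stay below it, while any point whose
# nearest out-of-cluster distance were below the threshold would have been
# merged into that cluster instead.

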
@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
|
||||
@pytest.mark.parametrize(
|
||||
("threshold", "y_true"), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])]
|
||||
)
|
||||
def test_agglomerative_clustering_with_distance_threshold_edge_case(
|
||||
linkage, threshold, y_true
|
||||
):
|
||||
# test boundary case of distance_threshold matching the distance
|
||||
X = [[0], [1]]
|
||||
clusterer = AgglomerativeClustering(
|
||||
n_clusters=None, distance_threshold=threshold, linkage=linkage
|
||||
)
|
||||
y_pred = clusterer.fit_predict(X)
|
||||
assert adjusted_rand_score(y_true, y_pred) == 1
|
||||
|
||||
|
||||
def test_dist_threshold_invalid_parameters():
|
||||
X = [[0], [1]]
|
||||
with pytest.raises(ValueError, match="Exactly one of "):
|
||||
AgglomerativeClustering(n_clusters=None, distance_threshold=None).fit(X)
|
||||
|
||||
with pytest.raises(ValueError, match="Exactly one of "):
|
||||
AgglomerativeClustering(n_clusters=2, distance_threshold=1).fit(X)
|
||||
|
||||
X = [[0], [1]]
|
||||
with pytest.raises(ValueError, match="compute_full_tree must be True if"):
|
||||
AgglomerativeClustering(
|
||||
n_clusters=None, distance_threshold=1, compute_full_tree=False
|
||||
).fit(X)
|
||||
|
||||
|
||||
def test_invalid_shape_precomputed_dist_matrix():
|
||||
    # Check that an error is raised when affinity='precomputed'
    # and a non-square matrix is passed (PR #16257).
    rng = np.random.RandomState(0)
    X = rng.rand(5, 3)
    with pytest.raises(
        ValueError,
        match=r"Distance matrix should be square, got matrix of shape \(5, 3\)",
    ):
        AgglomerativeClustering(metric="precomputed", linkage="complete").fit(X)


def test_precomputed_connectivity_metric_with_2_connected_components():
    """Check that connecting components works when connectivity and
    affinity are both precomputed and the number of connected components is
    greater than 1. Non-regression test for #16151.
    """

    connectivity_matrix = np.array(
        [
            [0, 1, 1, 0, 0],
            [0, 0, 1, 0, 0],
            [0, 0, 0, 0, 0],
            [0, 0, 0, 0, 1],
            [0, 0, 0, 0, 0],
        ]
    )
    # ensure that connectivity_matrix has two connected components
    assert connected_components(connectivity_matrix)[0] == 2

    rng = np.random.RandomState(0)
    X = rng.randn(5, 10)

    X_dist = pairwise_distances(X)
    clusterer_precomputed = AgglomerativeClustering(
        metric="precomputed", connectivity=connectivity_matrix, linkage="complete"
    )
    msg = "Completing it to avoid stopping the tree early"
    with pytest.warns(UserWarning, match=msg):
        clusterer_precomputed.fit(X_dist)

    clusterer = AgglomerativeClustering(
        connectivity=connectivity_matrix, linkage="complete"
    )
    with pytest.warns(UserWarning, match=msg):
        clusterer.fit(X)

    assert_array_equal(clusterer.labels_, clusterer_precomputed.labels_)
    assert_array_equal(clusterer.children_, clusterer_precomputed.children_)
File diff suppressed because it is too large
@@ -0,0 +1,218 @@
"""
Testing for mean shift clustering methods
"""

import warnings

import numpy as np
import pytest

from sklearn.cluster import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift
from sklearn.datasets import make_blobs
from sklearn.metrics import v_measure_score
from sklearn.utils._testing import assert_allclose, assert_array_equal

n_clusters = 3
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(
    n_samples=300,
    n_features=2,
    centers=centers,
    cluster_std=0.4,
    shuffle=True,
    random_state=11,
)


def test_convergence_of_1d_constant_data():
    # Test convergence using 1D constant data
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/28926
    model = MeanShift()
    n_iter = model.fit(np.ones(10).reshape(-1, 1)).n_iter_
    assert n_iter < model.max_iter


def test_estimate_bandwidth():
    # Test estimate_bandwidth
    bandwidth = estimate_bandwidth(X, n_samples=200)
    assert 0.9 <= bandwidth <= 1.5


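# For context (behavior summary, not asserted here): estimate_bandwidth looks
# at up to n_samples points, takes each point's distance to its farthest
# neighbor within a quantile-sized neighborhood, and averages those
# distances, so the estimate scales with the spread of the data.

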
def test_estimate_bandwidth_1sample(global_dtype):
    # Test estimate_bandwidth when n_samples=1 and quantile<1, so that
    # n_neighbors is set to 1.
    bandwidth = estimate_bandwidth(
        X.astype(global_dtype, copy=False), n_samples=1, quantile=0.3
    )

    assert bandwidth.dtype == X.dtype
    assert bandwidth == pytest.approx(0.0, abs=1e-5)


@pytest.mark.parametrize(
    "bandwidth, cluster_all, expected, first_cluster_label",
    [(1.2, True, 3, 0), (1.2, False, 4, -1)],
)
def test_mean_shift(
    global_dtype, bandwidth, cluster_all, expected, first_cluster_label
):
    # Test MeanShift algorithm
    X_with_global_dtype = X.astype(global_dtype, copy=False)
    ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
    labels = ms.fit(X_with_global_dtype).labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    assert n_clusters_ == expected
    assert labels_unique[0] == first_cluster_label
    assert ms.cluster_centers_.dtype == global_dtype

    cluster_centers, labels_mean_shift = mean_shift(
        X_with_global_dtype, cluster_all=cluster_all
    )
    labels_mean_shift_unique = np.unique(labels_mean_shift)
    n_clusters_mean_shift = len(labels_mean_shift_unique)
    assert n_clusters_mean_shift == expected
    assert labels_mean_shift_unique[0] == first_cluster_label
    assert cluster_centers.dtype == global_dtype


# TODO: remove mark once loky bug is fixed:
# https://github.com/joblib/loky/issues/458
@pytest.mark.thread_unsafe
def test_parallel(global_dtype, global_random_seed):
    centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
    X, _ = make_blobs(
        n_samples=50,
        n_features=2,
        centers=centers,
        cluster_std=0.4,
        shuffle=True,
        random_state=global_random_seed,
    )

    X = X.astype(global_dtype, copy=False)

    ms1 = MeanShift(n_jobs=2)
    ms1.fit(X)

    ms2 = MeanShift()
    ms2.fit(X)

    assert_allclose(ms1.cluster_centers_, ms2.cluster_centers_)
    assert ms1.cluster_centers_.dtype == ms2.cluster_centers_.dtype
    assert_array_equal(ms1.labels_, ms2.labels_)


def test_meanshift_predict(global_dtype):
    # Test MeanShift.predict
    ms = MeanShift(bandwidth=1.2)
    X_with_global_dtype = X.astype(global_dtype, copy=False)
    labels = ms.fit_predict(X_with_global_dtype)
    labels2 = ms.predict(X_with_global_dtype)
    assert_array_equal(labels, labels2)


def test_meanshift_all_orphans():
    # init away from the data: fit should fail with an informative error
    ms = MeanShift(bandwidth=0.1, seeds=[[-9, -9], [-10, -10]])
    msg = "No point was within bandwidth=0.1"
    with pytest.raises(ValueError, match=msg):
        ms.fit(X)


def test_unfitted():
    # Non-regression: before fit, there should be no fitted attributes.
    ms = MeanShift()
    assert not hasattr(ms, "cluster_centers_")
    assert not hasattr(ms, "labels_")


def test_cluster_intensity_tie(global_dtype):
    X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=global_dtype)
    c1 = MeanShift(bandwidth=2).fit(X)

    X = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]], dtype=global_dtype)
    c2 = MeanShift(bandwidth=2).fit(X)
    assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0])
    assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1])


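# The two assertions above hold because, when two modes tie in intensity
# (number of points within the bandwidth), the label ordering appears to
# follow the order in which the seeds are encountered, so reordering the
# points swaps the labels but leaves the partition itself unchanged.

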
def test_bin_seeds(global_dtype):
    # Test the bin seeding technique which can be used in the mean shift
    # algorithm
    # Data is just 6 points in the plane
    X = np.array(
        [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]],
        dtype=global_dtype,
    )

    # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be
    # found
    ground_truth = {(1.0, 1.0), (2.0, 1.0), (0.0, 0.0)}
    test_bins = get_bin_seeds(X, 1, 1)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0

    # With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be
    # found
    ground_truth = {(1.0, 1.0), (2.0, 1.0)}
    test_bins = get_bin_seeds(X, 1, 2)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0

    # With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found,
    # so we bail out and use the whole dataset here.
    with warnings.catch_warnings(record=True):
        test_bins = get_bin_seeds(X, 0.01, 1)
    assert_allclose(test_bins, X)

    # tight clusters around [0, 0] and [1, 1], only get two bins
    X, _ = make_blobs(
        n_samples=100,
        n_features=2,
        centers=[[0, 0], [1, 1]],
        cluster_std=0.1,
        random_state=0,
    )
    X = X.astype(global_dtype, copy=False)
    test_bins = get_bin_seeds(X, 1)
    assert_array_equal(test_bins, [[0, 0], [1, 1]])


@pytest.mark.parametrize("max_iter", [1, 100])
def test_max_iter(max_iter):
    clusters1, _ = mean_shift(X, max_iter=max_iter)
    ms = MeanShift(max_iter=max_iter).fit(X)
    clusters2 = ms.cluster_centers_

    assert ms.n_iter_ <= ms.max_iter
    assert len(clusters1) == len(clusters2)

    for c1, c2 in zip(clusters1, clusters2):
        assert np.allclose(c1, c2)


def test_mean_shift_zero_bandwidth(global_dtype):
    # Check that mean shift works when the estimated bandwidth is 0.
    X = np.array([1, 1, 1, 2, 2, 2, 3, 3], dtype=global_dtype).reshape(-1, 1)

    # estimate_bandwidth with default args returns 0 on this dataset
    bandwidth = estimate_bandwidth(X)
    assert bandwidth == 0

    # get_bin_seeds with a 0 bin_size should return the dataset itself
    assert get_bin_seeds(X, bin_size=bandwidth) is X

    # MeanShift with binning and a 0 estimated bandwidth should be equivalent
    # to no binning.
    ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X)
    ms_nobinning = MeanShift(bin_seeding=False).fit(X)
    expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2])

    assert v_measure_score(ms_binning.labels_, expected_labels) == pytest.approx(1)
    assert v_measure_score(ms_nobinning.labels_, expected_labels) == pytest.approx(1)
    assert_allclose(ms_binning.cluster_centers_, ms_nobinning.cluster_centers_)
@@ -0,0 +1,874 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings

import numpy as np
import pytest

from sklearn.cluster import DBSCAN, OPTICS
from sklearn.cluster._optics import _extend_region, _extract_xi_labels
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.datasets import make_blobs
from sklearn.exceptions import DataConversionWarning, EfficiencyWarning
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS

rng = np.random.RandomState(0)
n_points_per_cluster = 10
C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2)
C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2)
C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))


@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        [[10, 8.9, 8.8, 8.7, 7, 10], 3],
        [[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0],
        [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
        [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
    ],
)
def test_extend_downward(r_plot, end):
    r_plot = np.array(r_plot)
    ratio = r_plot[:-1] / r_plot[1:]
    steep_downward = ratio >= 1 / 0.9
    upward = ratio < 1

    e = _extend_region(steep_downward, upward, 0, 2)
    assert e == end


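# For reference: a point i counts as "steep downward" when
# r_plot[i + 1] <= 0.9 * r_plot[i], i.e. the ratio r_plot[i] / r_plot[i + 1]
# is at least 1 / 0.9; _extend_region then grows the region from the start
# index, with the last argument (2 here) bounding how many consecutive
# non-steep points it may absorb before stopping.

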
@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        [[1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6],
        [[1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0],
        [[1, 2, 2.1, 2, np.inf], 0],
        [[1, 2, 2.1, np.inf], 2],
    ],
)
def test_extend_upward(r_plot, end):
    r_plot = np.array(r_plot)
    ratio = r_plot[:-1] / r_plot[1:]
    steep_upward = ratio <= 0.9
    downward = ratio > 1

    e = _extend_region(steep_upward, downward, 0, 2)
    assert e == end


@pytest.mark.parametrize(
    ("ordering", "clusters", "expected"),
    [
        [[0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]],
        [[0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]],
        [[0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]],
        [[3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]],
    ],
)
def test_the_extract_xi_labels(ordering, clusters, expected):
    labels = _extract_xi_labels(ordering, clusters)

    assert_array_equal(labels, expected)


def test_extract_xi(global_dtype):
    # small and easy test (no clusters around other clusters)
    # but with clear noise data.
    # global_random_seed is not used here since the expected labels
    # are hardcoded for these specific data.
    rng = np.random.RandomState(0)
    n_points_per_cluster = 5

    C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2)
    C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2)
    C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2)
    C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2)
    C5 = [3, -2] + 0.6 * rng.randn(n_points_per_cluster, 2)
    C6 = [5, 6] + 0.2 * rng.randn(n_points_per_cluster, 2)

    X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6)).astype(
        global_dtype, copy=False
    )
    expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    clust = OPTICS(
        min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4
    ).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    # check float min_samples and min_cluster_size
    clust = OPTICS(
        min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4
    ).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)).astype(
        global_dtype, copy=False
    )
    expected_labels = np.r_[
        [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5
    ]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    clust = OPTICS(
        min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3
    ).fit(X)
    # this may fail if the predecessor correction is not at work!
    assert_array_equal(clust.labels_, expected_labels)

    C1 = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]]
    C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
    C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
    X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False)
    expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    clust = OPTICS(
        min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04
    ).fit(X)
    assert_array_equal(clust.labels_, expected_labels)


def test_cluster_hierarchy(global_dtype, global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_points_per_cluster = 100
    C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2).astype(
        global_dtype, copy=False
    )
    C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2).astype(
        global_dtype, copy=False
    )
    X = np.vstack((C1, C2))
    X = shuffle(X, random_state=rng)

    clusters = OPTICS(min_samples=20, xi=0.2).fit(X).cluster_hierarchy_
    assert clusters.shape == (2, 2)

    # The first cluster should contain all points from C1 but, due to how the
    # data is generated, some points from C2 may end up in it.
    assert 100 <= np.diff(clusters[0]) + 1 <= 115
    # The second cluster should contain all points from C1 and C2.
    assert np.diff(clusters[-1]) + 1 == 200


@pytest.mark.parametrize(
    "csr_container, metric",
    [(None, "minkowski")] + [(container, "euclidean") for container in CSR_CONTAINERS],
)
def test_correct_number_of_clusters(metric, csr_container):
    # in 'auto' mode

    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    # Parameters chosen specifically for this task.
    # Compute OPTICS
    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric)
    clust.fit(csr_container(X) if csr_container is not None else X)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
    assert n_clusters_1 == n_clusters

    # check attribute types and sizes
    assert clust.labels_.shape == (len(X),)
    assert clust.labels_.dtype.kind == "i"

    assert clust.reachability_.shape == (len(X),)
    assert clust.reachability_.dtype.kind == "f"

    assert clust.core_distances_.shape == (len(X),)
    assert clust.core_distances_.dtype.kind == "f"

    assert clust.ordering_.shape == (len(X),)
    assert clust.ordering_.dtype.kind == "i"
    assert set(clust.ordering_) == set(range(len(X)))


def test_minimum_number_of_sample_check():
    # test that we check a minimum number of samples
    msg = "min_samples must be no greater than"

    # Compute OPTICS
    X = [[1, 1]]
    clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1.0)

    # Run the fit
    with pytest.raises(ValueError, match=msg):
        clust.fit(X)


def test_bad_extract():
    # Test an extraction of eps too close to original eps
    msg = "Specify an epsilon smaller than 0.15. Got 0.3."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=750, centers=centers, cluster_std=0.4, random_state=0
    )

    # Compute OPTICS
    clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10)
    with pytest.raises(ValueError, match=msg):
        clust.fit(X)


def test_bad_reachability():
    msg = "All reachability values are inf. Set a larger max_eps."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=750, centers=centers, cluster_std=0.4, random_state=0
    )

    with pytest.warns(UserWarning, match=msg):
        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
        clust.fit(X)


def test_nowarn_if_metric_bool_data_bool():
    # make sure no warning is raised if metric and data are both boolean
    # non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/18996

    pairwise_metric = "rogerstanimoto"
    X = np.random.randint(2, size=(5, 2), dtype=bool)

    with warnings.catch_warnings():
        warnings.simplefilter("error", DataConversionWarning)

        OPTICS(metric=pairwise_metric).fit(X)


def test_warn_if_metric_bool_data_no_bool():
    # make sure a *single* conversion warning is raised if metric is boolean
    # but data isn't
    # non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/18996

    pairwise_metric = "rogerstanimoto"
    X = np.random.randint(2, size=(5, 2), dtype=np.int32)
    msg = f"Data will be converted to boolean for metric {pairwise_metric}"

    with pytest.warns(DataConversionWarning, match=msg) as warn_record:
        # Silence a DeprecationWarning from joblib <= 1.5.1 in Python 3.14+.
        warnings.filterwarnings(
            "ignore",
            message="'asyncio.iscoroutinefunction' is deprecated",
            category=DeprecationWarning,
        )
        OPTICS(metric=pairwise_metric).fit(X)
    assert len(warn_record) == 1


def test_nowarn_if_metric_no_bool():
    # make sure no conversion warning is raised if
    # metric isn't boolean, no matter what the data type is
    pairwise_metric = "minkowski"
    X_bool = np.random.randint(2, size=(5, 2), dtype=bool)
    X_num = np.random.randint(2, size=(5, 2), dtype=np.int32)

    with warnings.catch_warnings():
        warnings.simplefilter("error", DataConversionWarning)

        # fit boolean data
        OPTICS(metric=pairwise_metric).fit(X_bool)
        # fit numeric data
        OPTICS(metric=pairwise_metric).fit(X_num)


def test_close_extract():
    # Test extract where extraction eps is close to scaled max_eps

    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=750, centers=centers, cluster_std=0.4, random_state=0
    )

    # Compute OPTICS
    clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10).fit(X)
    # Cluster labels start at 0, so a max cluster label of 2 means 3 clusters
    assert max(clust.labels_) == 2


@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5])
@pytest.mark.parametrize("min_samples", [3, 10, 20])
@pytest.mark.parametrize(
    "csr_container, metric",
    [(None, "minkowski"), (None, "euclidean")]
    + [(container, "euclidean") for container in CSR_CONTAINERS],
)
def test_dbscan_optics_parity(eps, min_samples, metric, global_dtype, csr_container):
    # Test that OPTICS and DBSCAN labels disagree on at most 5% of the samples

    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=150, centers=centers, cluster_std=0.4, random_state=0
    )
    X = csr_container(X) if csr_container is not None else X

    X = X.astype(global_dtype, copy=False)

    # calculate optics with the dbscan extraction at the parametrized epsilon
    op = OPTICS(
        min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric
    ).fit(X)

    # calculate dbscan labels
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    contingency = contingency_matrix(db.labels_, op.labels_)
    agree = min(
        np.sum(np.max(contingency, axis=0)), np.sum(np.max(contingency, axis=1))
    )
    disagree = X.shape[0] - agree

    percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05


def test_min_samples_edge_case(global_dtype):
    C1 = [[0, 0], [0, 0.1], [0, -0.1]]
    C2 = [[10, 10], [10, 9], [10, 11]]
    C3 = [[100, 100], [100, 96], [100, 106]]
    X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False)

    expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3]
    clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3]
    clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    expected_labels = np.r_[[-1] * 9]
    with pytest.warns(UserWarning, match="All reachability values"):
        clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
        assert_array_equal(clust.labels_, expected_labels)


# try arbitrary minimum sizes
@pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23))
def test_min_cluster_size(min_cluster_size, global_dtype):
    redX = X[::2].astype(global_dtype, copy=False)  # reduce for speed
    clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(redX)
    cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1])
    if cluster_sizes.size:
        assert min(cluster_sizes) >= min_cluster_size
    # check behaviour is the same when min_cluster_size is a fraction
    clust_frac = OPTICS(
        min_samples=9,
        min_cluster_size=min_cluster_size / redX.shape[0],
    )
    clust_frac.fit(redX)
    assert_array_equal(clust.labels_, clust_frac.labels_)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_min_cluster_size_invalid2(csr_container):
    clust = OPTICS(min_cluster_size=len(X) + 1)
    with pytest.raises(ValueError, match="must be no greater than the "):
        clust.fit(X)

    clust = OPTICS(min_cluster_size=len(X) + 1, metric="euclidean")
    with pytest.raises(ValueError, match="must be no greater than the "):
        clust.fit(csr_container(X))


def test_processing_order():
    # Ensure that we consider all unprocessed points,
    # not only direct neighbors, when picking the next point.
    Y = [[0], [10], [-10], [25]]

    clust = OPTICS(min_samples=3, max_eps=15).fit(Y)
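    # Sketch of where the expected values come from (min_samples=3, so a
    # point's core distance is the distance to its 3rd nearest neighbor,
    # counting the point itself, as the asserted values suggest): processing
    # starts at 0 with core distance 10, giving points 10 and -10 reachability
    # 10; point 25 is only reached from point 10 (core distance 15), hence
    # reachability 15. Picking the globally smallest reachability among all
    # unprocessed points processes -10 (at 10) before 25 (at 15).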
    assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15])
    assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf])
    assert_array_equal(clust.ordering_, [0, 1, 2, 3])


def test_compare_to_ELKI():
    # Expected values, computed with (future) ELKI 0.7.5 using:
    # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
    # -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
    # where the FixedDBIDsFilter gives 0-indexed ids.
    r1 = [
        np.inf, 1.0574896366427478, 0.7587934993548423,
        0.7290174038973836, 0.7290174038973836, 0.7290174038973836,
        0.6861627576116127, 0.7587934993548423, 0.9280118450166668,
        1.1748022534146194, 3.3355455741292257, 0.49618389254482587,
        0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
        0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
        0.2552805046961355, 0.3086779122185853, 4.163024452756142,
        1.623152630340929, 0.45315840475822655, 0.25468325192031926,
        0.2254004358159971, 0.18765711877083036, 0.1821471333893275,
        0.1821471333893275, 0.18765711877083036, 0.18765711877083036,
        0.2240202988740153, 1.154337614548715, 1.342604473837069,
        1.323308536402633, 0.8607514948648837, 0.27219111215810565,
        0.13260875220533205, 0.13260875220533205, 0.09890587675958984,
        0.09890587675958984, 0.13548790801634494, 0.1575483940837384,
        0.17515137170530226, 0.17575920159442388, 0.27219111215810565,
        0.6101447895405373, 1.3189208094864302, 1.323308536402633,
        2.2509184159764577, 2.4517810628594527, 3.675977064404973,
        3.8264795626020365, 2.9130735341510614, 2.9130735341510614,
        2.9130735341510614, 2.9130735341510614, 2.8459300127258036,
        2.8459300127258036, 2.8459300127258036, 3.0321982337972537,
    ]
    o1 = [
        0, 3, 6, 4, 7, 8, 2, 9, 5, 1, 31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
        44, 21, 23, 24, 22, 25, 27, 29, 26, 28, 20, 40, 45, 46, 10, 15, 11, 13,
        17, 19, 18, 12, 16, 14, 47, 49, 43, 48, 42, 41, 53, 57, 51, 52, 56, 59,
        54, 55, 58, 50,
    ]
    p1 = [
        -1, 0, 3, 6, 6, 6, 8, 3, 7, 5, 1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
        36, 44, 21, 23, 24, 22, 25, 25, 22, 22, 22, 21, 40, 45, 46, 10, 15, 15,
        13, 13, 15, 11, 19, 15, 10, 47, 12, 45, 14, 43, 42, 53, 57, 57, 57, 57,
        59, 59, 59, 58,
    ]

    # Tests against known extraction array
    # Does NOT work with metric='euclidean', because sklearn euclidean has
    # worse numeric precision. 'minkowski' is slower but more accurate.
    clust1 = OPTICS(min_samples=5).fit(X)

    assert_array_equal(clust1.ordering_, np.array(o1))
    assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1))
    assert_allclose(clust1.reachability_[clust1.ordering_], np.array(r1))
    # ELKI currently does not print the core distances (which are not used
    # much in the literature), but we can at least ensure this consistency:
    for i in clust1.ordering_[1:]:
        assert clust1.reachability_[i] >= clust1.core_distances_[clust1.predecessor_[i]]

    # Expected values, computed with (future) ELKI 0.7.5 using
    r2 = [
        np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
        np.inf, np.inf, np.inf, 0.27219111215810565, 0.13260875220533205,
        0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
        0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
        0.17575920159442388, 0.27219111215810565, 0.4928068613197889,
        np.inf, 0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
        0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
        0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
        np.inf, 0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
        0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
        0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
        np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
        np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
        np.inf, np.inf,
    ]
    o2 = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
        47, 46, 20, 22, 25, 23, 27, 29, 24, 26, 28, 21, 30, 32, 34, 33, 38, 39,
        35, 37, 36, 31, 40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 53, 54, 55,
        56, 57, 58, 59,
    ]
    p2 = [
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 15, 15, 13, 13, 15, 11,
        19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30, 30, 34,
        34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1,
    ]
    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)

    assert_array_equal(clust2.ordering_, np.array(o2))
    assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2))
    assert_allclose(clust2.reachability_[clust2.ordering_], np.array(r2))

    index = np.where(clust1.core_distances_ <= 0.5)[0]
    assert_allclose(clust1.core_distances_[index], clust2.core_distances_[index])


def test_extract_dbscan(global_dtype, global_random_seed):
    # testing an easy dbscan case. Not including clusters with different
    # densities.
    rng = np.random.RandomState(global_random_seed)
    n_points_per_cluster = 20
    C1 = [-5, -2] + 0.2 * rng.randn(n_points_per_cluster, 2)
    C2 = [4, -1] + 0.2 * rng.randn(n_points_per_cluster, 2)
    C3 = [1, 2] + 0.2 * rng.randn(n_points_per_cluster, 2)
    C4 = [-2, 3] + 0.2 * rng.randn(n_points_per_cluster, 2)
    X = np.vstack((C1, C2, C3, C4)).astype(global_dtype, copy=False)

    clust = OPTICS(cluster_method="dbscan", eps=0.5).fit(X)
    assert_array_equal(
        np.sort(np.unique(clust.labels_[clust.labels_ != -1])), [0, 1, 2, 3]
    )


@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS)
def test_precomputed_dists(global_dtype, csr_container):
    redX = X[::2].astype(global_dtype, copy=False)
    dists = pairwise_distances(redX, metric="euclidean")
    dists = csr_container(dists) if csr_container is not None else dists
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", EfficiencyWarning)
        clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(
            dists
        )
    clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX)

    assert_allclose(clust1.reachability_, clust2.reachability_)
    assert_array_equal(clust1.labels_, clust2.labels_)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_optics_input_not_modified_precomputed_sparse_nodiag(
    csr_container, global_random_seed
):
    """Check that we don't modify in-place the pre-computed sparse matrix.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27508
    """
    X = np.random.RandomState(global_random_seed).rand(6, 6)
    # Add zeros on the diagonal that will be implicit when creating
    # the sparse matrix. If `X` is modified in-place, the zeros from
    # the diagonal will be made explicit.
    np.fill_diagonal(X, 0)
    X = csr_container(X)
    assert all(row != col for row, col in zip(*X.nonzero()))
    X_copy = X.copy()
    OPTICS(metric="precomputed").fit(X)
    # Make sure that we did not modify `X` in-place even by creating
    # explicit 0s values.
    assert X.nnz == X_copy.nnz
    assert_array_equal(X.toarray(), X_copy.toarray())


def test_optics_predecessor_correction_ordering():
    """Check that cluster correction using predecessor is working as expected.

    In the following example, the predecessor correction was not working properly
    since it was not using the right indices.

    This non-regression test checks that reordering the data does not change
    the results.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/26324
    """
    X_1 = np.array([1, 2, 3, 1, 8, 8, 7, 100]).reshape(-1, 1)
    reorder = [0, 1, 2, 4, 5, 6, 7, 3]
    X_2 = X_1[reorder]

    optics_1 = OPTICS(min_samples=3, metric="euclidean").fit(X_1)
    optics_2 = OPTICS(min_samples=3, metric="euclidean").fit(X_2)

    assert_array_equal(optics_1.labels_[reorder], optics_2.labels_)
@@ -0,0 +1,335 @@
|
||||
"""Testing for Spectral Clustering methods"""
|
||||
|
||||
import pickle
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.linalg import LinAlgError
|
||||
|
||||
from sklearn.cluster import SpectralClustering, spectral_clustering
|
||||
from sklearn.cluster._spectral import cluster_qr, discretize
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.feature_extraction import img_to_graph
|
||||
from sklearn.metrics import adjusted_rand_score
|
||||
from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS
|
||||
|
||||
try:
|
||||
from pyamg import smoothed_aggregation_solver # noqa: F401
|
||||
|
||||
amg_loaded = True
|
||||
except ImportError:
|
||||
amg_loaded = False
|
||||
|
||||
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
|
||||
X, _ = make_blobs(
|
||||
n_samples=60,
|
||||
n_features=2,
|
||||
centers=centers,
|
||||
cluster_std=0.4,
|
||||
shuffle=True,
|
||||
random_state=0,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
@pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg"))
|
||||
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
|
||||
def test_spectral_clustering(
|
||||
eigen_solver, assign_labels, csr_container, global_random_seed
|
||||
):
|
||||
S = np.array(
|
||||
[
|
||||
[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
|
||||
[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
|
||||
[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
|
||||
[0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0],
|
||||
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
|
||||
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
|
||||
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
|
||||
]
|
||||
)
|
||||
|
||||
for mat in (S, csr_container(S)):
|
||||
model = SpectralClustering(
|
||||
random_state=global_random_seed,
|
||||
n_clusters=2,
|
||||
affinity="precomputed",
|
||||
eigen_solver=eigen_solver,
|
||||
assign_labels=assign_labels,
|
||||
).fit(mat)
|
||||
labels = model.labels_
|
||||
if labels[0] == 0:
|
||||
labels = 1 - labels
|
||||
|
||||
assert adjusted_rand_score(labels, [1, 1, 1, 0, 0, 0, 0]) == 1
|
||||
|
||||
model_copy = pickle.loads(pickle.dumps(model))
|
||||
assert model_copy.n_clusters == model.n_clusters
|
||||
assert model_copy.eigen_solver == model.eigen_solver
|
||||
assert_array_equal(model_copy.labels_, model.labels_)


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering_sparse(assign_labels, coo_container, global_random_seed):
    X, y = make_blobs(
        n_samples=20,
        random_state=global_random_seed,
        centers=[[1, 1], [-1, -1]],
        cluster_std=0.01,
    )

    S = rbf_kernel(X, gamma=1)
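    # Subtracting a small constant and clipping at zero discards the many
    # near-zero RBF similarities, so the sparse container below receives a
    # genuinely sparse affinity matrix.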
    S = np.maximum(S - 1e-4, 0)
    S = coo_container(S)

    labels = (
        SpectralClustering(
            random_state=global_random_seed,
            n_clusters=2,
            affinity="precomputed",
            assign_labels=assign_labels,
        )
        .fit(S)
        .labels_
    )
    assert adjusted_rand_score(y, labels) == 1


def test_precomputed_nearest_neighbors_filtering(global_random_seed):
    # Test filtering of a precomputed graph when it contains too many neighbors
    X, y = make_blobs(
        n_samples=250,
        random_state=global_random_seed,
        centers=[[1, 1, 1], [-1, -1, -1]],
        cluster_std=0.01,
    )

    n_neighbors = 2
    results = []
    for additional_neighbors in [0, 10]:
        nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(X)
        graph = nn.kneighbors_graph(X, mode="distance")
        labels = (
            SpectralClustering(
                random_state=global_random_seed,
                n_clusters=2,
                affinity="precomputed_nearest_neighbors",
                n_neighbors=n_neighbors,
            )
            .fit(graph)
            .labels_
        )
        results.append(labels)

    assert_array_equal(results[0], results[1])
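

# The test above relies on the estimator trimming an over-complete precomputed
# graph back down to n_neighbors. A minimal sketch of that idea (an
# illustration under the assumption that the estimator rebuilds a k-NN graph
# from the precomputed distances and then symmetrizes it; not a verbatim copy
# of scikit-learn internals):
def _filter_precomputed_graph_sketch(graph, n_neighbors):
    # Re-run a k-NN query on the precomputed sparse distance graph, keeping
    # only the requested number of neighbors per sample.
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric="precomputed").fit(graph)
    connectivity = nn.kneighbors_graph(graph, mode="connectivity")
    # Average with the transpose so the resulting affinity matrix is symmetric.
    return 0.5 * (connectivity + connectivity.T)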


def test_affinities(global_random_seed):
    # Note: in the following, random_state has been selected to produce
    # a dataset that yields a stable eigendecomposition on both OSX and
    # Linux.
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors", random_state=0)
    with pytest.warns(UserWarning, match="not fully connected"):
        sp.fit(X)
    assert adjusted_rand_score(y, sp.labels_) == 1

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=global_random_seed)
    labels = sp.fit(X).labels_
    assert adjusted_rand_score(y, labels) == 1

    X = check_random_state(10).rand(10, 5) * 10

    kernels_available = kernel_metrics()
    for kern in kernels_available:
        # Additive chi^2 yields a negative similarity matrix, which does
        # not make sense for spectral clustering.
        if kern != "additive_chi2":
            sp = SpectralClustering(n_clusters=2, affinity=kern, random_state=0)
            labels = sp.fit(X).labels_
            assert (X.shape[0],) == labels.shape

    sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1, random_state=0)
    labels = sp.fit(X).labels_
    assert (X.shape[0],) == labels.shape

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert kwargs == {}  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0)
    labels = sp.fit(X).labels_
    assert (X.shape[0],) == labels.shape
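

# Why "additive_chi2" is skipped in test_affinities: the additive chi^2 kernel
# equals the negated chi^2 distance, so all of its entries are <= 0 and it
# cannot serve as a non-negative affinity. A minimal illustration (a sketch,
# not part of the original test suite; assumes
# sklearn.metrics.pairwise.additive_chi2_kernel):
def _demo_additive_chi2_is_nonpositive():
    from sklearn.metrics.pairwise import additive_chi2_kernel

    rng = np.random.RandomState(0)
    K = additive_chi2_kernel(rng.rand(4, 3))
    # The diagonal is exactly zero and every off-diagonal entry is negative.
    assert np.all(K <= 0)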


def test_cluster_qr(global_random_seed):
    # cluster_qr by itself is only meant to cluster the rows of the
    # eigenvector matrix inside spectral clustering, not generic data.
    # Still, on a fixed generic input it must produce the same labels
    # for different dtypes, even if those labels are meaningless.
    random_state = np.random.RandomState(seed=global_random_seed)
    n_samples, n_components = 10, 5
    data = random_state.randn(n_samples, n_components)
    labels_float64 = cluster_qr(data.astype(np.float64))
    # Each sample is assigned a cluster identifier
    assert labels_float64.shape == (n_samples,)
    # All components should be covered by the assignment
    assert np.array_equal(np.unique(labels_float64), np.arange(n_components))
    # Single precision data should yield the same cluster assignments
    labels_float32 = cluster_qr(data.astype(np.float32))
    assert np.array_equal(labels_float64, labels_float32)


def test_cluster_qr_permutation_invariance(global_random_seed):
    # cluster_qr must be invariant to sample permutation.
    random_state = np.random.RandomState(seed=global_random_seed)
    n_samples, n_components = 100, 5
    data = random_state.randn(n_samples, n_components)
    perm = random_state.permutation(n_samples)
    assert np.array_equal(
        cluster_qr(data)[perm],
        cluster_qr(data[perm]),
    )
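

# The permutation invariance above reflects how cluster_qr assigns labels: a
# column-pivoted QR factorization of the transposed embedding picks one
# representative row per cluster, and each sample is assigned to the
# representative it aligns with best. A rough sketch of that idea (following
# Damle, Minden & Ying, 2019; an illustration, not necessarily scikit-learn's
# exact implementation):
def _cluster_qr_sketch(vectors):
    from scipy.linalg import qr, svd

    k = vectors.shape[1]
    # The first k pivots of a column-pivoted QR on the transpose identify
    # k well-separated "representative" samples.
    _, _, piv = qr(vectors.T, pivoting=True)
    # Rotate the embedding onto the representatives and assign each sample
    # to its best-matching column.
    ut, _, vh = svd(vectors[piv[:k], :].T)
    rotated = np.abs(vectors @ (ut @ vh))
    return rotated.argmax(axis=1)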


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("n_samples", [50, 100, 150, 500])
def test_discretize(n_samples, coo_container, global_random_seed):
    # Test discretize using a noisy assignment matrix
    random_state = np.random.RandomState(seed=global_random_seed)
    for n_class in range(2, 10):
        # random class labels
        y_true = random_state.randint(0, n_class + 1, n_samples)
        y_true = np.array(y_true, float)
        # noisy class assignment matrix
        y_indicator = coo_container(
            (np.ones(n_samples), (np.arange(n_samples), y_true)),
            shape=(n_samples, n_class + 1),
        )
        y_true_noisy = y_indicator.toarray() + 0.1 * random_state.randn(
            n_samples, n_class + 1
        )
        y_pred = discretize(y_true_noisy, random_state=random_state)
        assert adjusted_rand_score(y_true, y_pred) > 0.8
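

# discretize (Yu & Shi, 2003) alternates between snapping the continuous
# embedding to the closest one-hot assignment matrix and re-fitting a rotation
# via an SVD until the assignment stabilizes. A minimal sketch of the snapping
# step that the noisy-indicator test above exercises (an illustration, not
# scikit-learn's full implementation):
def _snap_to_indicator_sketch(embedding):
    # Assign each sample to its largest embedding component...
    labels = embedding.argmax(axis=1)
    # ...and rebuild the corresponding one-hot indicator matrix.
    indicator = np.zeros_like(embedding)
    indicator[np.arange(embedding.shape[0]), labels] = 1
    return labels, indicator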


def test_spectral_clustering_with_arpack_amg_solvers(global_random_seed):
    # Test that spectral_clustering gives the same result for the arpack
    # and amg solvers.
    # Based on the toy example from plot_segmentation_toy.py

    # a small two-coin image
    x, y = np.indices((40, 40))

    center1, center2 = (14, 12), (20, 25)
    radius1, radius2 = 8, 7

    circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1**2
    circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2**2

    circles = circle1 | circle2
    mask = circles.copy()
    img = circles.astype(float)

    graph = img_to_graph(img, mask=mask)
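    # Map gradient magnitudes to similarities: near-constant regions get
    # affinities close to 1, strong edges get values near 0.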
    graph.data = np.exp(-graph.data / graph.data.std())

    labels_arpack = spectral_clustering(
        graph, n_clusters=2, eigen_solver="arpack", random_state=global_random_seed
    )

    assert len(np.unique(labels_arpack)) == 2

    if amg_loaded:
        labels_amg = spectral_clustering(
            graph, n_clusters=2, eigen_solver="amg", random_state=global_random_seed
        )
        assert adjusted_rand_score(labels_arpack, labels_amg) == 1
    else:
        with pytest.raises(ValueError):
            spectral_clustering(graph, n_clusters=2, eigen_solver="amg", random_state=0)


def test_n_components(global_random_seed):
    # Test that changing n_components changes the result and that
    # n_components equals n_clusters by default.
    X, y = make_blobs(
        n_samples=20,
        random_state=global_random_seed,
        centers=[[1, 1], [-1, -1]],
        cluster_std=0.01,
    )
    sp = SpectralClustering(n_clusters=2, random_state=global_random_seed)
    labels = sp.fit(X).labels_
    # set n_components = n_clusters and test if the result is the same
    labels_same_ncomp = (
        SpectralClustering(
            n_clusters=2, n_components=2, random_state=global_random_seed
        )
        .fit(X)
        .labels_
    )
    # test that n_components=n_clusters by default
    assert_array_equal(labels, labels_same_ncomp)

    # test that n_components affects the result
    # n_clusters=8 by default, and set n_components=2
    labels_diff_ncomp = (
        SpectralClustering(n_components=2, random_state=global_random_seed)
        .fit(X)
        .labels_
    )
    assert not np.array_equal(labels, labels_diff_ncomp)


@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_verbose(assign_labels, capsys):
    # Check the verbose mode of SpectralClustering (and of the KMeans it runs
    # internally) for better coverage.
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # Pass the parametrized assign_labels through; otherwise the default
    # ("kmeans") is always used and the parametrization has no effect.
    SpectralClustering(
        n_clusters=2, random_state=42, verbose=1, assign_labels=assign_labels
    ).fit(X)

    captured = capsys.readouterr()

    assert re.search(r"Computing label assignment using", captured.out)

    if assign_labels == "kmeans":
        assert re.search(r"Initialization complete", captured.out)
        assert re.search(r"Iteration [0-9]+, inertia", captured.out)


def test_spectral_clustering_np_matrix_raises():
    """Check that spectral_clustering raises an informative error when passed
    an np.matrix. See #10993"""
    X = np.matrix([[0.0, 2.0], [2.0, 0.0]])

    msg = r"np\.matrix is not supported. Please convert to a numpy array"
    with pytest.raises(TypeError, match=msg):
        spectral_clustering(X)


def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch):
    """Check that discretize raises LinAlgError when svd never converges.

    Non-regression test for #21380
    """

    def new_svd(*args, **kwargs):
        raise LinAlgError()

    monkeypatch.setattr(np.linalg, "svd", new_svd)
    vectors = np.ones((10, 4))

    with pytest.raises(LinAlgError, match="SVD did not converge"):
        discretize(vectors)