This commit is contained in:
2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,37 @@
"""
Common utilities for testing clustering.
"""
import numpy as np
###############################################################################
# Generate sample data
def generate_clustered_data(
    seed=0, n_clusters=3, n_features=2, n_samples_per_cluster=20, std=0.4
):
    """Draw Gaussian blobs around a fixed table of cluster centers.

    Parameters
    ----------
    seed : int, default=0
        Seed passed to ``np.random.RandomState``.
    n_clusters : int, default=3
        Number of blobs to draw. The center table below has four rows, so
        asking for more raises ``IndexError``.
    n_features : int, default=2
        Number of leading coordinates of each center that are kept. The
        center table has four columns.
    n_samples_per_cluster : int, default=20
        Number of points drawn around each center.
    std : float, default=0.4
        Scale applied to the standard normal noise.

    Returns
    -------
    X : ndarray of shape (n_clusters * n_samples_per_cluster, n_features)
        The blobs stacked row-wise, in cluster order.
    """
    rng = np.random.RandomState(seed)

    # The centers are voluntarily shifted away from zero to check the
    # robustness of clustering algorithms with regard to non-centered data.
    centers = 10 + np.array(
        [
            [1, 1, 1, 0],
            [-1, -1, 0, 1],
            [1, -1, 1, 1],
            [-1, 1, 1, 0],
        ]
    )

    # Start from an empty block so that zero clusters still yields a
    # (0, n_features) array.
    blocks = [np.empty((0, n_features))]
    for cluster_idx in range(n_clusters):
        center = centers[cluster_idx][:n_features]
        noise = std * rng.randn(n_samples_per_cluster, n_features)
        blocks.append(center + noise)
    return np.concatenate(blocks, axis=0)

View File

@@ -0,0 +1,321 @@
"""
Testing for Clustering methods
"""
import warnings
import numpy as np
import pytest
from sklearn.cluster import AffinityPropagation, affinity_propagation
from sklearn.cluster._affinity_propagation import _equal_similarities_and_preferences
from sklearn.datasets import make_blobs
from sklearn.exceptions import ConvergenceWarning, NotFittedError
from sklearn.metrics import euclidean_distances
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS
# Module-level fixture shared by the tests below: 60 two-dimensional samples
# drawn around three centers that are shifted away from the origin.
n_clusters = 3
centers = 10 + np.array([[1, 1], [-1, -1], [1, -1]])
X, _ = make_blobs(
    centers=centers,
    n_samples=60,
    n_features=2,
    cluster_std=0.4,
    shuffle=True,
    random_state=0,
)
# TODO: AffinityPropagation must preserve dtype for its fitted attributes
# and test must be created accordingly to this new behavior.
# For more details, see: https://github.com/scikit-learn/scikit-learn/issues/11000
def test_affinity_propagation(global_random_seed, global_dtype):
    """Check that `affinity_propagation` finds the expected number of exemplars."""
    # Similarities are negated squared euclidean distances in the tested dtype.
    similarities = -euclidean_distances(
        X.astype(global_dtype, copy=False), squared=True
    )
    exemplar_indices, _ = affinity_propagation(
        similarities,
        preference=np.median(similarities) * 10,
        random_state=global_random_seed,
    )
    assert n_clusters == len(exemplar_indices)
def test_affinity_propagation_precomputed():
    """Check that a precomputed affinity matrix gives the same labels as the
    affinity computed internally from the features.
    """
    similarities = -euclidean_distances(X, squared=True)
    preference = np.median(similarities) * 10

    # Fit once on the precomputed similarities ...
    precomputed_model = AffinityPropagation(
        preference=preference, affinity="precomputed", random_state=28
    )
    labels_precomputed = precomputed_model.fit(similarities).labels_

    # ... and once on the raw features, then compare.
    feature_model = AffinityPropagation(
        preference=preference, verbose=True, random_state=37
    )
    labels = feature_model.fit(X).labels_
    assert_array_equal(labels, labels_precomputed)

    n_found = len(feature_model.cluster_centers_indices_)
    assert np.unique(labels).size == n_found
    assert n_clusters == n_found
def test_affinity_propagation_no_copy():
    """Check how the `copy` flag controls in-place modification of the input."""
    similarities = -euclidean_distances(X, squared=True)
    pristine = similarities.copy()
    preference = np.median(similarities) * 10
    # Sanity check: the diagonal does not already hold the preference.
    assert not np.allclose(similarities.diagonal(), preference)

    # With copy=True the caller's matrix must come back untouched.
    affinity_propagation(
        similarities, preference=preference, copy=True, random_state=0
    )
    assert_allclose(similarities, pristine)
    assert not np.allclose(similarities.diagonal(), preference)
    assert_allclose(similarities.diagonal(), np.zeros(similarities.shape[0]))

    # With copy=False the preference ends up on the diagonal of the input.
    affinity_propagation(
        similarities, preference=preference, copy=False, random_state=0
    )
    assert_allclose(similarities.diagonal(), preference)

    # The estimator fitted on X and an in-place functional call on a fresh
    # copy of the similarities must agree on the labels.
    fresh = pristine.copy()
    estimator = AffinityPropagation(
        preference=preference, verbose=True, random_state=0
    )
    labels = estimator.fit(X).labels_
    _, labels_no_copy = affinity_propagation(
        fresh, preference=preference, copy=False, random_state=74
    )
    assert_array_equal(labels, labels_no_copy)
def test_affinity_propagation_affinity_shape():
    """Check that `affinity_propagation` rejects a non-square affinity matrix."""
    square = -euclidean_distances(X, squared=True)
    # Dropping the last column makes the matrix rectangular.
    rectangular = square[:, :-1]
    with pytest.raises(
        ValueError, match="The matrix of similarities must be a square array"
    ):
        affinity_propagation(rectangular)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_affinity_propagation_precomputed_with_sparse_input(csr_container):
    """Check that fitting a precomputed sparse matrix raises a `TypeError`."""
    sparse_affinity = csr_container((3, 3))
    estimator = AffinityPropagation(affinity="precomputed")
    with pytest.raises(
        TypeError,
        match="Sparse data was passed for X, but dense data is required",
    ):
        estimator.fit(sparse_affinity)
def test_affinity_propagation_predict(global_random_seed, global_dtype):
    """Check that `predict` on the training data reproduces `fit_predict`."""
    data = X.astype(global_dtype, copy=False)
    estimator = AffinityPropagation(
        affinity="euclidean", random_state=global_random_seed
    )
    fitted_labels = estimator.fit_predict(data)
    predicted_labels = estimator.predict(data)
    assert_array_equal(fitted_labels, predicted_labels)
def test_affinity_propagation_predict_error():
    """Check the exceptions raised by `AffinityPropagation.predict`."""
    # An estimator that was never fitted cannot predict.
    unfitted = AffinityPropagation(affinity="euclidean")
    with pytest.raises(NotFittedError):
        unfitted.predict(X)

    # After fitting on a precomputed 60 x 60 matrix, predicting on the
    # two-feature X is rejected with a feature-count error.
    gram = np.dot(X, X.T)
    precomputed = AffinityPropagation(affinity="precomputed", random_state=57)
    precomputed.fit(gram)
    with pytest.raises(ValueError, match="expecting 60 features as input"):
        precomputed.predict(X)
def test_affinity_propagation_fit_non_convergence(global_dtype):
    """Check the fitted attributes when `fit` does not converge.

    The cluster centers are expected to be an empty array and every training
    sample is expected to be labelled as noise (-1).
    """
    points = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)

    # A single allowed iteration forces non-convergence.
    estimator = AffinityPropagation(preference=-10, max_iter=1, random_state=82)
    with pytest.warns(ConvergenceWarning):
        estimator.fit(points)

    assert_allclose(np.empty((0, 2)), estimator.cluster_centers_)
    assert_array_equal(np.array([-1, -1, -1]), estimator.labels_)
def test_affinity_propagation_equal_mutual_similarities(global_dtype):
    """Check the outputs for two samples whose mutual similarities are equal."""
    points = np.array([[-1, 1], [1, -1]], dtype=global_dtype)
    similarities = -euclidean_distances(points, squared=True)

    # Preference above the similarity: every sample becomes an exemplar.
    with pytest.warns(UserWarning, match="mutually equal"):
        exemplars, labels = affinity_propagation(similarities, preference=0)
    assert_array_equal([0, 1], exemplars)
    assert_array_equal([0, 1], labels)

    # Preference below the similarity: one cluster, with the arbitrary
    # (first) sample as exemplar.
    with pytest.warns(UserWarning, match="mutually equal"):
        exemplars, labels = affinity_propagation(similarities, preference=-10)
    assert_array_equal([0], exemplars)
    assert_array_equal([0, 0], labels)

    # Different preferences per sample: any UserWarning is turned into an
    # error, and the highest-preference sample is the single exemplar.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        exemplars, labels = affinity_propagation(
            similarities, preference=[-20, -10], random_state=37
        )
    assert_array_equal([1], exemplars)
    assert_array_equal([0, 0], labels)
def test_affinity_propagation_predict_non_convergence(global_dtype):
    """Check that `predict` labels everything as noise after a non-converged fit."""
    train = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)

    # A single allowed iteration forces non-convergence.
    estimator = AffinityPropagation(preference=-10, max_iter=1, random_state=75)
    with pytest.warns(ConvergenceWarning):
        estimator = estimator.fit(train)

    # There are no clusters, so new samples are expected to be noise (-1).
    unseen = np.array([[2, 2], [3, 3], [4, 4]])
    with pytest.warns(ConvergenceWarning):
        predicted = estimator.predict(unseen)
    assert_array_equal(np.array([-1, -1, -1]), predicted)
def test_affinity_propagation_non_convergence_regressiontest(global_dtype):
    """Check the warning message and labels of a fit capped at two iterations."""
    points = np.array(
        [[1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 1]],
        dtype=global_dtype,
    )
    estimator = AffinityPropagation(
        affinity="euclidean", max_iter=2, random_state=34
    )
    expected_warning = (
        "Affinity propagation did not converge, this model may return degenerate"
        " cluster centers and labels."
    )
    with pytest.warns(ConvergenceWarning, match=expected_warning):
        estimator.fit(points)

    assert_array_equal(np.array([0, 0, 0]), estimator.labels_)
def test_equal_similarities_and_preferences(global_dtype):
    """Check `_equal_similarities_and_preferences` on equal and unequal inputs."""
    # Three points with unequal pairwise distances: never "all equal",
    # whatever the preference.
    points = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)
    similarities = -euclidean_distances(points, squared=True)
    for preference in (np.array(0), np.array([0, 0]), np.array([0, 1])):
        assert not _equal_similarities_and_preferences(similarities, preference)

    # Two points, hence a single mutual distance.
    points = np.array([[0, 0], [1, 1]], dtype=global_dtype)
    similarities = -euclidean_distances(points, squared=True)

    # Differing preferences break the equality ...
    assert not _equal_similarities_and_preferences(similarities, np.array([0, 1]))

    # ... while identical preferences (array or scalar) keep it.
    assert _equal_similarities_and_preferences(similarities, np.array([0, 0]))
    assert _equal_similarities_and_preferences(similarities, np.array(0))
def test_affinity_propagation_random_state():
    """Check that different random states lead to different initialisations
    by comparing the center locations after two iterations.
    """
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=300, centers=centers, cluster_std=0.5, random_state=0
    )

    def early_centers(seed):
        # Stop after two iterations so the runs have no time to converge.
        model = AffinityPropagation(
            convergence_iter=1, max_iter=2, random_state=seed
        )
        model.fit(X)
        return model.cluster_centers_

    centers0 = early_centers(0)
    centers76 = early_centers(76)

    # The two runs must not have reached the same solution yet.
    assert np.mean((centers0 - centers76) ** 2) > 1
@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array])
def test_affinity_propagation_convergence_warning_dense_sparse(container, global_dtype):
    """
    Check that having sparse or dense `centers` format should not
    influence the convergence.
    Non-regression test for gh-13334.
    """
    rng = np.random.RandomState(42)
    data = rng.rand(40, 10).astype(global_dtype, copy=False)
    targets = (4 * rng.rand(40)).astype(int)

    estimator = AffinityPropagation(random_state=46)
    estimator.fit(data, targets)
    # Overwrite the fitted centers with a single all-zero center stored in
    # the container under test.
    estimator.cluster_centers_ = container(np.zeros((1, 10)))

    # Any ConvergenceWarning raised by predict becomes an error here.
    with warnings.catch_warnings():
        warnings.simplefilter("error", ConvergenceWarning)
        predicted = estimator.predict(data)
        assert_array_equal(predicted, np.zeros(data.shape[0], dtype=int))
# FIXME; this test is broken with different random states, needs to be revisited
def test_correct_clusters(global_dtype):
    """Check the labels found on a small precomputed matrix for each dtype.

    Non-regression test for issue #10832 (incorrect clusters due to a dtype
    change).
    """
    affinity = np.array(
        [[1, 0, 0, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]],
        dtype=global_dtype,
    )
    estimator = AffinityPropagation(
        preference=1, affinity="precomputed", random_state=0
    )
    estimator.fit(affinity)
    assert_array_equal(estimator.labels_, np.array([0, 1, 1, 2]))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input_for_predict(csr_container):
    """Check that `predict` accepts sparse input.

    Non-regression test for issue #20049.
    """
    estimator = AffinityPropagation(affinity="euclidean", random_state=42)
    estimator.fit(X)
    # An empty 2 x 2 sparse container, i.e. two all-zero samples.
    sparse_queries = csr_container((2, 2))
    predicted = estimator.predict(sparse_queries)
    assert_array_equal(predicted, (2, 2))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input_for_fit_predict(csr_container):
    """Check that `fit_predict` accepts sparse input.

    Non-regression test for issue #20049.
    """
    estimator = AffinityPropagation(affinity="euclidean", random_state=42)
    rng = np.random.RandomState(42)
    # Random 5 x 5 binary matrix stored in the sparse container under test.
    sparse_data = csr_container(rng.randint(0, 2, size=(5, 5)))
    predicted = estimator.fit_predict(sparse_data)
    assert_array_equal(predicted, (0, 1, 1, 2, 3))
def test_affinity_propagation_equal_points():
    """Make sure we do not assign multiple clusters to equal points.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/pull/20043
    """
    # Eight identical one-dimensional samples.
    identical = np.zeros((8, 1))
    estimator = AffinityPropagation(
        affinity="euclidean", damping=0.5, random_state=42
    )
    estimator.fit(identical)
    assert np.all(estimator.labels_ == 0)

View File

@@ -0,0 +1,265 @@
"""Testing for Spectral Biclustering methods"""
import numpy as np
import pytest
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, BiclusterMixin, clone
from sklearn.cluster import SpectralBiclustering, SpectralCoclustering
from sklearn.cluster._bicluster import (
_bistochastic_normalize,
_log_normalize,
_scale_normalize,
)
from sklearn.datasets import make_biclusters, make_checkerboard
from sklearn.metrics import consensus_score, v_measure_score
from sklearn.model_selection import ParameterGrid
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS
class MockBiclustering(BiclusterMixin, BaseEstimator):
    """Minimal bicluster estimator used to exercise `get_submatrix`."""

    def __init__(self):
        pass

    def get_indices(self, i):
        """Return the same fixed (row indices, column indices) pair for any `i`.

        Overridden to reproduce the old get_submatrix test.
        """
        row_mask = [True, True, False, False, True]
        col_mask = [False, False, True, True]
        return np.where(row_mask)[0], np.where(col_mask)[0]
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_get_submatrix(csr_container):
    """Check `get_submatrix` on dense, sparse and nested-list input.

    For each input type the extracted submatrix must hold the expected values,
    and overwriting it must leave the input data untouched.
    """
    data = np.arange(20).reshape(5, 4)
    model = MockBiclustering()

    for X in (data, csr_container(data), data.tolist()):
        submatrix = model.get_submatrix(0, X)
        if issparse(submatrix):
            submatrix = submatrix.toarray()
        assert_array_equal(submatrix, [[2, 3], [6, 7], [18, 19]])

        # Overwrite the result; none of the -1 values may show up in the input.
        submatrix[:] = -1
        if issparse(X):
            X = X.toarray()
        # Convert before comparing: for a plain list, `X != -1` is a single
        # True rather than an element-wise comparison, which would make the
        # check vacuous.
        assert np.all(np.asarray(X) != -1)
def _test_shape_indices(model):
    """Check that `get_shape` agrees with the lengths from `get_indices`.

    The check runs for every bicluster index in ``range(model.n_clusters)``
    of a fitted model.
    """
    for cluster_id in range(model.n_clusters):
        n_rows, n_cols = model.get_shape(cluster_id)
        row_idx, col_idx = model.get_indices(cluster_id)
        assert len(row_idx) == n_rows
        assert len(col_idx) == n_cols
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_coclustering(global_random_seed, csr_container):
    """Run Dhillon's spectral co-clustering on a simple problem.

    Every combination of the parameter grid is fitted on both the dense and
    the sparse version of the data.
    """
    S, rows, cols = make_biclusters(
        (30, 30), 3, noise=0.1, random_state=global_random_seed
    )
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values

    grid = ParameterGrid(
        {
            "svd_method": ["randomized", "arpack"],
            "n_svd_vecs": [None, 20],
            "mini_batch": [False, True],
            "init": ["k-means++"],
            "n_init": [10],
        }
    )
    for mat in (S, csr_container(S)):
        for kwargs in grid:
            model = SpectralCoclustering(
                n_clusters=3, random_state=global_random_seed, **kwargs
            )
            model.fit(mat)

            assert model.rows_.shape == (3, 30)
            # Each row and each column sums to one across the three biclusters.
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_biclustering(global_random_seed, csr_container):
    """Fit Kluger's spectral biclustering on a checkerboard dataset.

    One non-default parameter is changed at a time, on both the dense and
    the sparse version of the data.
    """
    S, rows, cols = make_checkerboard(
        (30, 30), 3, noise=0.5, random_state=global_random_seed
    )

    non_default_params = {
        "method": ["scale", "log"],
        "svd_method": ["arpack"],
        "n_svd_vecs": [20],
        "mini_batch": [True],
    }
    # Flatten into (name, value) pairs, keeping the dictionary order.
    overrides = [
        (name, value)
        for name, values in non_default_params.items()
        for value in values
    ]

    for mat in (S, csr_container(S)):
        for param_name, param_value in overrides:
            model = SpectralBiclustering(
                n_clusters=3,
                n_init=3,
                init="k-means++",
                random_state=global_random_seed,
            )
            model.set_params(**{param_name: param_value})

            if issparse(mat) and model.get_params().get("method") == "log":
                # cannot take log of sparse matrix
                with pytest.raises(ValueError):
                    model.fit(mat)
                continue

            model.fit(mat)

            assert model.rows_.shape == (9, 30)
            assert model.columns_.shape == (9, 30)
            # Each row and each column sums to three across the nine biclusters.
            assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30))
            assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
def _do_scale_test(scaled):
    """Check that rows sum to one constant, and columns to another.

    Parameters
    ----------
    scaled : ndarray or sparse matrix
        Two-dimensional matrix to check. The reference vectors are sized from
        the data, so the matrix does not have to be 100 x 100.

    Raises
    ------
    AssertionError
        If the row sums, or the column sums, differ from their own mean at
        one decimal of precision.
    """
    row_sum = scaled.sum(axis=1)
    col_sum = scaled.sum(axis=0)
    if issparse(scaled):
        # Sums of a sparse input may come back two-dimensional (e.g. as
        # np.matrix); flatten them before comparing.
        row_sum = np.asarray(row_sum).squeeze()
        col_sum = np.asarray(col_sum).squeeze()
    # Compare each sum vector against a constant vector of its own length.
    assert_array_almost_equal(
        row_sum, np.full(row_sum.shape, row_sum.mean()), decimal=1
    )
    assert_array_almost_equal(
        col_sum, np.full(col_sum.shape, col_sum.mean()), decimal=1
    )
def _do_bistochastic_test(scaled):
    """Check that rows and columns sum to the same constant."""
    # First make sure the sums are constant along each axis on their own.
    _do_scale_test(scaled)
    mean_col_sum = scaled.sum(axis=0).mean()
    mean_row_sum = scaled.sum(axis=1).mean()
    assert_almost_equal(mean_col_sum, mean_row_sum, decimal=1)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_scale_normalize(global_random_seed, csr_container):
    """Check `_scale_normalize` on dense and sparse random matrices."""
    rng = np.random.RandomState(global_random_seed)
    dense = rng.rand(100, 100)
    for mat in (dense, csr_container(dense)):
        scaled, _, _ = _scale_normalize(mat)
        _do_scale_test(scaled)
        # Sparse input must give sparse output.
        if issparse(mat):
            assert issparse(scaled)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_bistochastic_normalize(global_random_seed, csr_container):
    """Check `_bistochastic_normalize` on dense and sparse random matrices."""
    rng = np.random.RandomState(global_random_seed)
    dense = rng.rand(100, 100)
    for mat in (dense, csr_container(dense)):
        scaled = _bistochastic_normalize(mat)
        _do_bistochastic_test(scaled)
        # Sparse input must give sparse output.
        if issparse(mat):
            assert issparse(scaled)
def test_log_normalize(global_random_seed):
    """Check that a log-normalized matrix plus a constant is bistochastic."""
    rng = np.random.RandomState(global_random_seed)
    matrix = rng.rand(100, 100)
    # Adding any constant to a log-scaled matrix should make it bistochastic.
    shifted = _log_normalize(matrix) + 1
    _do_bistochastic_test(shifted)
def test_fit_best_piecewise(global_random_seed):
    """Check that `_fit_best_piecewise` keeps the first two vectors here."""
    vectors = np.array(
        [
            [0, 0, 0, 1, 1, 1],
            [2, 2, 2, 3, 3, 3],
            [0, 1, 2, 3, 4, 5],
        ]
    )
    model = SpectralBiclustering(random_state=global_random_seed)
    selected = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)
    assert_array_equal(selected, vectors[:2])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_project_and_cluster(global_random_seed, csr_container):
    """Check `_project_and_cluster` on dense and sparse data."""
    model = SpectralBiclustering(random_state=global_random_seed)
    data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]])
    vectors = np.array([[1, 0], [0, 1], [0, 0]])
    # The first two rows and the last two rows form the two expected clusters.
    expected = [0, 0, 1, 1]
    for mat in (data, csr_container(data)):
        labels = model._project_and_cluster(mat, vectors, n_clusters=2)
        assert_almost_equal(v_measure_score(labels, expected), 1.0)
def test_perfect_checkerboard(global_random_seed):
    """Check perfect recovery of noise-free checkerboards of several shapes."""
    # XXX Previously failed on build bot (not reproducible)
    model = SpectralBiclustering(
        3, svd_method="arpack", random_state=global_random_seed
    )

    # One square, one tall and one wide checkerboard; the same estimator is
    # refitted on each of them in turn.
    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(
            shape, 3, noise=0, random_state=global_random_seed
        )
        model.fit(S)
        assert consensus_score(model.biclusters_, (rows, cols)) == 1
@pytest.mark.parametrize(
    "params, type_err, err_msg",
    [
        ({"n_clusters": 6}, ValueError, "n_clusters should be <= n_samples=5"),
        ({"n_clusters": (3, 3, 3)}, ValueError, "Incorrect parameter n_clusters"),
        ({"n_clusters": (3, 6)}, ValueError, "Incorrect parameter n_clusters"),
        (
            {"n_components": 3, "n_best": 4},
            ValueError,
            "n_best=4 must be <= n_components=3",
        ),
    ],
)
def test_spectralbiclustering_parameter_validation(params, type_err, err_msg):
    """Check parameters validation in `SpectralBiClustering`"""
    # A 5 x 5 matrix: each parameter set above is expected to be rejected
    # when fitting on it.
    data = np.arange(25).reshape((5, 5))
    estimator = SpectralBiclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        estimator.fit(data)
@pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering()))
def test_n_features_in_(est):
    """Check that `n_features_in_` only exists after fitting and equals 3."""
    data, _, _ = make_biclusters((3, 3), 3, random_state=0)

    # Work on a clone so the parametrized instance is left unfitted.
    estimator = clone(est)
    assert not hasattr(estimator, "n_features_in_")

    estimator.fit(data)
    assert estimator.n_features_in_ == 3

View File

@@ -0,0 +1,242 @@
"""
Tests for the birch clustering algorithm.
"""
import numpy as np
import pytest
from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.datasets import make_blobs
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import pairwise_distances_argmin, v_measure_score
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS
def test_n_samples_leaves_roots(global_random_seed, global_dtype):
    """Sanity check for the number of samples held in leaves and in the root."""
    X, y = make_blobs(n_samples=10, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)
    brc = Birch()
    brc.fit(X)

    # Sample counts summed over the root's subclusters ...
    in_root = sum(sc.n_samples_ for sc in brc.root_.subclusters_)
    # ... and over the subclusters of every leaf.
    in_leaves = sum(
        sc.n_samples_ for leaf in brc._get_leaves() for sc in leaf.subclusters_
    )

    n_samples = X.shape[0]
    assert in_leaves == n_samples
    assert in_root == n_samples
def test_partial_fit(global_random_seed, global_dtype):
    """Check that `fit` matches two successive `partial_fit` calls."""
    X, y = make_blobs(n_samples=100, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)

    full = Birch(n_clusters=3)
    full.fit(X)

    # Feed the same data in two halves.
    incremental = Birch(n_clusters=None)
    for batch in (X[:50], X[50:]):
        incremental.partial_fit(batch)
    assert_allclose(incremental.subcluster_centers_, full.subcluster_centers_)

    # Setting n_clusters afterwards and calling partial_fit with None must
    # give the same global labels.
    incremental.set_params(n_clusters=3)
    incremental.partial_fit(None)
    assert_array_equal(incremental.subcluster_labels_, full.subcluster_labels_)
def test_birch_predict(global_random_seed, global_dtype):
    """Check that `predict` assigns each sample to its nearest centroid."""
    rng = np.random.RandomState(global_random_seed)
    X = generate_clustered_data(n_clusters=3, n_features=3, n_samples_per_cluster=10)
    X = X.astype(global_dtype, copy=False)

    # n_clusters * n_samples_per_cluster = 30 samples, shuffled.
    order = np.arange(30)
    rng.shuffle(order)
    X_shuffle = X[order, :]

    brc = Birch(n_clusters=4, threshold=1.0)
    brc.fit(X_shuffle)

    # Birch must preserve inputs' dtype
    assert brc.subcluster_centers_.dtype == global_dtype

    assert_array_equal(brc.labels_, brc.predict(X_shuffle))

    # Map every sample to the global label of its closest subcluster centroid.
    closest = pairwise_distances_argmin(X_shuffle, brc.subcluster_centers_)
    nearest_centroid = brc.subcluster_labels_[closest]
    assert_allclose(v_measure_score(nearest_centroid, brc.labels_), 1.0)
def test_n_clusters(global_random_seed, global_dtype):
    """Check the behaviour of the `n_clusters` parameter of `Birch`."""
    X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)

    # An integer n_clusters: more than 10 subclusters, exactly 10 labels.
    with_int = Birch(n_clusters=10)
    with_int.fit(X)
    assert len(with_int.subcluster_centers_) > 10
    assert len(np.unique(with_int.labels_)) == 10

    # Passing an AgglomerativeClustering instance gives the same results.
    with_estimator = Birch(n_clusters=AgglomerativeClustering(n_clusters=10))
    with_estimator.fit(X)
    assert_array_equal(
        with_int.subcluster_labels_, with_estimator.subcluster_labels_
    )
    assert_array_equal(with_int.labels_, with_estimator.labels_)

    # With a very large threshold, a ConvergenceWarning is raised because of
    # the small number of clusters.
    huge_threshold = Birch(threshold=10000.0)
    with pytest.warns(ConvergenceWarning):
        huge_threshold.fit(X)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_X(global_random_seed, global_dtype, csr_container):
    """Check that sparse and dense data give the same results."""
    X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)

    dense_model = Birch(n_clusters=10)
    dense_model.fit(X)

    sparse_model = Birch(n_clusters=10)
    sparse_model.fit(csr_container(X))

    # Birch must preserve inputs' dtype
    assert sparse_model.subcluster_centers_.dtype == global_dtype

    assert_array_equal(dense_model.labels_, sparse_model.labels_)
    assert_allclose(
        dense_model.subcluster_centers_, sparse_model.subcluster_centers_
    )
def test_partial_fit_second_call_error_checks():
    """Check that a second `partial_fit` call rejects a different feature count."""
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.partial_fit(X, y)

    # Keep only the first column for the second call.
    single_feature = X[:, [0]]
    with pytest.raises(
        ValueError, match="X has 1 features, but Birch is expecting 2 features"
    ):
        brc.partial_fit(single_feature, y)
def check_branching_factor(node, branching_factor):
    """Assert that `node` and its descendants hold at most `branching_factor`
    subclusters each.
    """
    assert branching_factor >= len(node.subclusters_)
    # Recurse into every subcluster that has a (truthy) child node.
    for child in (sc.child_ for sc in node.subclusters_ if sc.child_):
        check_branching_factor(child, branching_factor)
def test_branching_factor(global_random_seed, global_dtype):
    """Check that nodes hold at most `branching_factor` subclusters."""
    X, y = make_blobs(random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)
    branching_factor = 9

    # Purposefully set a low threshold to maximize the subclusters; check
    # both without and with a global clustering step.
    for n_clusters in (None, 3):
        brc = Birch(
            n_clusters=n_clusters,
            branching_factor=branching_factor,
            threshold=0.01,
        )
        brc.fit(X)
        check_branching_factor(brc.root_, branching_factor)
def check_threshold(birch_instance, threshold):
    """Assert that no leaf subcluster has a radius above `threshold`.

    The leaves are traversed through the linked list that starts at
    ``birch_instance.dummy_leaf_.next_leaf_``.
    """
    leaf = birch_instance.dummy_leaf_.next_leaf_
    while leaf:
        assert all(threshold >= sc.radius for sc in leaf.subclusters_)
        leaf = leaf.next_leaf_
def test_threshold(global_random_seed, global_dtype):
    """Check that leaf subclusters have a radius no larger than the threshold."""
    X, y = make_blobs(n_samples=80, centers=4, random_state=global_random_seed)
    X = X.astype(global_dtype, copy=False)

    # Try a small and a large threshold.
    for threshold in (0.5, 5.0):
        brc = Birch(threshold=threshold, n_clusters=None)
        brc.fit(X)
        check_threshold(brc, threshold)
def test_birch_n_clusters_long_int():
    """Check that `Birch` accepts `n_clusters` given as an `np.int64`.

    Such a value can come from `np.arange`, for instance. See #16484.
    """
    data, _ = make_blobs(random_state=0)
    estimator = Birch(n_clusters=np.int64(5))
    estimator.fit(data)
def test_feature_names_out():
    """Check `get_feature_names_out` for `Birch`."""
    data, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    brc = Birch(n_clusters=4)
    brc.fit(data)

    # One output name is expected per subcluster center.
    n_subclusters = brc.subcluster_centers_.shape[0]
    expected_names = [f"birch{i}" for i in range(n_subclusters)]
    assert_array_equal(expected_names, brc.get_feature_names_out())
def test_transform_match_across_dtypes(global_random_seed):
    """Check that `fit_transform` agrees between float64 and float32 input."""
    data, _ = make_blobs(n_samples=80, n_features=4, random_state=global_random_seed)
    brc = Birch(n_clusters=4, threshold=1.1)

    transformed_64 = brc.fit_transform(data)
    transformed_32 = brc.fit_transform(data.astype(np.float32))

    assert_allclose(transformed_64, transformed_32, atol=1e-6)
def test_subcluster_dtype(global_dtype):
    """Check that `subcluster_centers_` keeps the dtype of the input."""
    blobs, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    data = blobs.astype(global_dtype, copy=False)
    estimator = Birch(n_clusters=4)
    assert estimator.fit(data).subcluster_centers_.dtype == global_dtype
def test_both_subclusters_updated():
    """Check that both subclusters are updated when a node is split, even when
    there are duplicated data points. Non-regression test for #23269.
    """
    # Float32 samples containing several exact duplicates.
    points = np.array(
        [
            [-2.6192791, -1.5053215],
            [-2.9993038, -1.6863596],
            [-2.3724914, -1.3438171],
            [-2.336792, -1.3417323],
            [-2.4089134, -1.3290224],
            [-2.3724914, -1.3438171],
            [-3.364009, -1.8846745],
            [-2.3724914, -1.3438171],
            [-2.617677, -1.5003285],
            [-2.2960556, -1.3260119],
            [-2.3724914, -1.3438171],
            [-2.5459878, -1.4533926],
            [-2.25979, -1.3003055],
            [-2.4089134, -1.3290224],
            [-2.3724914, -1.3438171],
            [-2.4089134, -1.3290224],
            [-2.5459878, -1.4533926],
            [-2.3724914, -1.3438171],
            [-2.9720619, -1.7058647],
            [-2.336792, -1.3417323],
            [-2.3724914, -1.3438171],
        ],
        dtype=np.float32,
    )

    # The fit only has to complete without raising.
    estimator = Birch(branching_factor=5, threshold=1e-5, n_clusters=None)
    estimator.fit(points)

View File

@@ -0,0 +1,158 @@
import numpy as np
import pytest
from sklearn.cluster import BisectingKMeans
from sklearn.metrics import v_measure_score
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS
@pytest.mark.parametrize("bisecting_strategy", ["biggest_inertia", "largest_cluster"])
@pytest.mark.parametrize("init", ["k-means++", "random"])
def test_three_clusters(bisecting_strategy, init):
    """Run bisecting k-means for three clusters and check that the data is
    split correctly.
    """
    points = np.array(
        [[1, 1], [10, 1], [3, 1], [10, 0], [2, 1], [10, 2], [10, 8], [10, 9], [10, 10]]
    )
    estimator = BisectingKMeans(
        n_clusters=3,
        random_state=0,
        bisecting_strategy=bisecting_strategy,
        init=init,
    )
    estimator.fit(points)

    expected_centers = [[2, 1], [10, 1], [10, 9]]
    expected_labels = [0, 1, 0, 1, 0, 1, 2, 2, 2]

    # Sort both sides so the comparison ignores the order of the clusters.
    found_centers = estimator.cluster_centers_.tolist()
    assert_allclose(sorted(expected_centers), sorted(found_centers))
    assert_allclose(v_measure_score(expected_labels, estimator.labels_), 1.0)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse(csr_container):
    """Test Bisecting K-Means with sparse data.

    Checks that the cluster centers are the same between dense and sparse
    input.
    """
    rng = np.random.RandomState(0)
    dense = rng.rand(20, 2)
    # Zero out every entry below 0.8.
    dense[dense < 0.8] = 0
    sparse = csr_container(dense)

    estimator = BisectingKMeans(n_clusters=3, random_state=0)

    # The same estimator is fitted on the sparse data first, then on the
    # dense data.
    estimator.fit(sparse)
    sparse_centers = estimator.cluster_centers_

    estimator.fit(dense)
    dense_centers = estimator.cluster_centers_

    assert_allclose(dense_centers, sparse_centers, atol=1e-8)
@pytest.mark.parametrize("n_clusters", [4, 5])
def test_n_clusters(n_clusters):
    """Test if resulting labels are in range [0, n_clusters - 1]."""
    rng = np.random.RandomState(0)
    data = rng.rand(10, 2)

    estimator = BisectingKMeans(n_clusters=n_clusters, random_state=0)
    estimator.fit(data)

    # Every label from 0 to n_clusters - 1 must be used.
    used_labels = np.unique(estimator.labels_)
    assert_array_equal(used_labels, np.arange(n_clusters))
def test_one_cluster():
    """Test single cluster."""
    points = np.array([[1, 2], [10, 2], [10, 8]])
    estimator = BisectingKMeans(n_clusters=1, random_state=0).fit(points)

    # Labels from fit and from predict must all be 0.
    assert all(estimator.labels_ == 0)
    assert all(estimator.predict(points) == 0)

    # The single center is the mean of all the points.
    overall_mean = points.mean(axis=0).reshape(1, -1)
    assert_allclose(estimator.cluster_centers_, overall_mean)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
def test_fit_predict(csr_container):
    """Check if labels from fit(X) method are same as from fit(X).predict(X)."""
    rng = np.random.RandomState(0)
    data = rng.rand(10, 2)

    # A container of None means the dense array is used unchanged.
    if csr_container is not None:
        data[data < 0.8] = 0
        data = csr_container(data)

    estimator = BisectingKMeans(n_clusters=3, random_state=0)
    estimator.fit(data)

    assert_array_equal(estimator.labels_, estimator.predict(data))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
def test_dtype_preserved(csr_container, global_dtype):
    """Check that centers dtype is the same as input data dtype."""
    rng = np.random.RandomState(0)
    data = rng.rand(10, 2).astype(global_dtype, copy=False)

    # A container of None means the dense array is used unchanged.
    if csr_container is not None:
        data[data < 0.8] = 0
        data = csr_container(data)

    estimator = BisectingKMeans(n_clusters=3, random_state=0)
    estimator.fit(data)

    assert estimator.cluster_centers_.dtype == global_dtype
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
def test_float32_float64_equivalence(csr_container):
    """Check that the results are the same between float32 and float64."""
    rng = np.random.RandomState(0)
    data = rng.rand(10, 2)

    # A container of None means the dense array is used unchanged.
    if csr_container is not None:
        data[data < 0.8] = 0
        data = csr_container(data)

    def fit_fresh(samples):
        # A new estimator with the same settings for each precision.
        return BisectingKMeans(n_clusters=3, random_state=0).fit(samples)

    km64 = fit_fresh(data)
    km32 = fit_fresh(data.astype(np.float32))

    assert_allclose(km32.cluster_centers_, km64.cluster_centers_)
    assert_array_equal(km32.labels_, km64.labels_)
@pytest.mark.parametrize("algorithm", ("lloyd", "elkan"))
def test_no_crash_on_empty_bisections(algorithm):
    """Check that `predict` does not crash on empty bisections.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27081
    """
    rng = np.random.RandomState(0)
    train = rng.rand(3000, 10)
    estimator = BisectingKMeans(n_clusters=10, algorithm=algorithm).fit(train)

    # predict on scaled data to trigger pathologic case
    # where the inner mask leads to empty bisections.
    scaled_test = 50 * rng.rand(100, 10)
    predicted = estimator.predict(scaled_test)  # should not crash with idiv by 0

    # Every predicted label must be one of the ten valid cluster ids.
    valid_ids = np.arange(10)
    assert np.isin(np.unique(predicted), valid_ids).all()
def test_one_feature():
    """Check that no error is raised when there is only one feature.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27236
    """
    single_column = np.random.normal(size=(128, 1))
    estimator = BisectingKMeans(bisecting_strategy="biggest_inertia", random_state=0)
    estimator.fit(single_column)

View File

@@ -0,0 +1,434 @@
"""
Tests for DBSCAN clustering algorithm
"""
import pickle
import warnings
import numpy as np
import pytest
from scipy.spatial import distance
from sklearn.cluster import DBSCAN, dbscan
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS, LIL_CONTAINERS
# Number of clusters in the shared dataset; the tests below assert that
# DBSCAN finds exactly this many clusters (noise excluded).
n_clusters = 3
# Shared feature array used by most tests in this module.
X = generate_clustered_data(n_clusters=n_clusters)
def test_dbscan_similarity():
    """Run DBSCAN on a precomputed, max-normalised distance matrix.

    ``eps`` and ``min_samples`` are chosen specifically for this dataset.
    """
    params = {"metric": "precomputed", "eps": 0.15, "min_samples": 10}

    # Pairwise distances, divided by their maximum.
    dist = distance.squareform(distance.pdist(X))
    dist /= np.max(dist)

    # Function interface.
    _, labels = dbscan(dist, **params)
    # Number of clusters, ignoring noise if present.
    found = len(set(labels)) - (1 if -1 in labels else 0)
    assert found == n_clusters

    # Estimator interface.
    labels = DBSCAN(**params).fit(dist).labels_
    found = len(set(labels)) - int(-1 in labels)
    assert found == n_clusters
def test_dbscan_feature():
    """Run DBSCAN on the feature array with the Euclidean metric.

    ``eps`` differs from the one used with the normalised precomputed
    matrix because the distances here are not normalised.
    """
    params = {"metric": "euclidean", "eps": 0.8, "min_samples": 10}

    # Function interface.
    _, labels = dbscan(X, **params)
    # Number of clusters, ignoring noise if present.
    found = len(set(labels)) - int(-1 in labels)
    assert found == n_clusters

    # Estimator interface.
    labels = DBSCAN(**params).fit(X).labels_
    found = len(set(labels)) - int(-1 in labels)
    assert found == n_clusters
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_dbscan_sparse(lil_container):
    """Sparse (LIL) and dense feature inputs must give identical results."""
    settings = {"eps": 0.8, "min_samples": 10}

    sparse_core, sparse_labels = dbscan(lil_container(X), **settings)
    dense_core, dense_labels = dbscan(X, **settings)

    assert_array_equal(dense_core, sparse_core)
    assert_array_equal(dense_labels, sparse_labels)
@pytest.mark.parametrize("include_self", [False, True])
def test_dbscan_sparse_precomputed(include_self):
    """A sparse radius-neighbors graph must match the dense distance matrix."""
    settings = {"eps": 0.8, "min_samples": 10, "metric": "precomputed"}

    dense_dist = pairwise_distances(X)
    neighbors = NearestNeighbors(radius=0.9).fit(X)
    query = X if include_self else None
    sparse_dist = neighbors.radius_neighbors_graph(X=query, mode="distance")

    # Ensure it is sparse not merely on diagonals:
    n_samples = dense_dist.shape[0]
    assert sparse_dist.nnz < n_samples * (n_samples - 1)

    sparse_core, sparse_labels = dbscan(sparse_dist, **settings)
    dense_core, dense_labels = dbscan(dense_dist, **settings)

    assert_array_equal(dense_core, sparse_core)
    assert_array_equal(dense_labels, sparse_labels)
def test_dbscan_sparse_precomputed_different_eps():
    """A precomputed graph built with a larger radius must be filtered.

    The neighbors graph is computed once with a radius equal to DBSCAN's
    ``eps`` and once with a larger radius; both runs must agree.
    """
    lower_eps = 0.2
    higher_eps = lower_eps + 0.7

    results = []
    for radius in (lower_eps, higher_eps):
        neighbors = NearestNeighbors(radius=radius).fit(X)
        graph = neighbors.radius_neighbors_graph(X, mode="distance")
        # DBSCAN always runs with the smaller eps.
        results.append(dbscan(graph, eps=lower_eps, metric="precomputed"))

    dbscan_lower, dbscan_higher = results
    assert_array_equal(dbscan_lower[0], dbscan_higher[0])
    assert_array_equal(dbscan_lower[1], dbscan_higher[1])
@pytest.mark.parametrize("metric", ["precomputed", "minkowski"])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
def test_dbscan_input_not_modified(metric, csr_container):
    """Check that ``dbscan`` leaves its input (dense or CSR) unchanged."""
    dense = np.random.RandomState(0).rand(10, 10)
    is_sparse = csr_container is not None
    data = csr_container(dense) if is_sparse else dense
    snapshot = data.copy()

    dbscan(data, metric=metric)

    if is_sparse:
        assert_array_equal(data.toarray(), snapshot.toarray())
    else:
        assert_array_equal(data, snapshot)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container):
    """Check that the pre-computed sparse matrix is not modified in-place.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27508
    """
    dense = np.random.RandomState(0).rand(10, 10)
    # Put zeros on the diagonal; they become implicit in the sparse matrix.
    # If the matrix were modified in-place, those zeros would be made
    # explicit.
    np.fill_diagonal(dense, 0)
    sparse = csr_container(dense)

    rows, cols = sparse.nonzero()
    assert (rows != cols).all()

    snapshot = sparse.copy()
    dbscan(sparse, metric="precomputed")

    # No in-place modification, not even the creation of explicit zeros.
    assert sparse.nnz == snapshot.nnz
    assert_array_equal(sparse.toarray(), snapshot.toarray())
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_dbscan_no_core_samples(csr_container):
    """Check the fitted attributes when no sample qualifies as a core sample."""
    dense = np.random.RandomState(0).rand(40, 10)
    dense[dense < 0.8] = 0

    for data in (dense, csr_container(dense)):
        model = DBSCAN(min_samples=6).fit(data)
        n_features = data.shape[1]

        assert_array_equal(model.components_, np.empty((0, n_features)))
        assert_array_equal(model.labels_, -1)
        assert model.core_sample_indices_.shape == (0,)
def test_dbscan_callable():
    """Run DBSCAN with a callable metric instead of a metric name.

    ``eps`` differs from the one used with the normalised precomputed
    matrix because the distances here are not normalised.
    """
    params = {
        # The metric is the function reference, not the string key.
        "metric": distance.euclidean,
        "eps": 0.8,
        "min_samples": 10,
        "algorithm": "ball_tree",
    }

    # Function interface.
    _, labels = dbscan(X, **params)
    # Number of clusters, ignoring noise if present.
    found = len(set(labels)) - int(-1 in labels)
    assert found == n_clusters

    # Estimator interface.
    labels = DBSCAN(**params).fit(X).labels_
    found = len(set(labels)) - int(-1 in labels)
    assert found == n_clusters
def test_dbscan_metric_params():
    """Check that DBSCAN works with the ``metric_params`` argument."""
    p = 1
    shared = {"eps": 0.8, "min_samples": 10, "algorithm": "ball_tree"}

    def fit_dbscan(**kwargs):
        # Fit on the module-level data; return (core sample indices, labels).
        model = DBSCAN(**shared, **kwargs).fit(X)
        return model.core_sample_indices_, model.labels_

    # Passing Minkowski 'p' through metric_params must not record a warning.
    with warnings.catch_warnings(record=True) as caught:
        reference_core, reference_labels = fit_dbscan(
            metric="minkowski", metric_params={"p": p}, p=None
        )
    assert not caught, caught[0].message

    # Same labels as when passing Minkowski 'p' directly.
    core, labels = fit_dbscan(metric="minkowski", p=p)
    assert_array_equal(reference_core, core)
    assert_array_equal(reference_labels, labels)

    # Minkowski with p=1 should be equivalent to Manhattan distance.
    core, labels = fit_dbscan(metric="manhattan")
    assert_array_equal(reference_core, core)
    assert_array_equal(reference_labels, labels)

    # When both are given, `p` is ignored in favor of metric_params={'p': <val>}
    # and a warning says so.
    expected_warning = (
        "Parameter p is found in metric_params. "
        "The corresponding parameter from __init__ "
        "is ignored."
    )
    with pytest.warns(SyntaxWarning, match=expected_warning):
        core, labels = fit_dbscan(
            metric="minkowski", metric_params={"p": p}, p=p + 1
        )
    assert_array_equal(reference_core, core)
    assert_array_equal(reference_labels, labels)
def test_dbscan_balltree():
    """Run DBSCAN with tree-based neighbor searches and check the cluster count."""
    eps = 0.8
    min_samples = 10

    # Baseline: precomputed pairwise distances through the function interface.
    dist = pairwise_distances(X)
    _, labels = dbscan(dist, metric="precomputed", eps=eps, min_samples=min_samples)
    # Number of clusters, ignoring noise if present.
    assert len(set(labels)) - int(-1 in labels) == n_clusters

    # Estimator configurations, fitted in this order.
    tree_configs = (
        {"p": 2.0, "algorithm": "ball_tree"},
        {"p": 2.0, "algorithm": "kd_tree"},
        {"p": 1.0, "algorithm": "ball_tree"},
        {"leaf_size": 20, "algorithm": "ball_tree"},
    )
    for config in tree_configs:
        model = DBSCAN(eps=eps, min_samples=min_samples, **config)
        labels = model.fit(X).labels_
        assert len(set(labels)) - int(-1 in labels) == n_clusters
def test_input_validation():
    """``DBSCAN.fit`` should accept a plain list of lists."""
    samples = [[1.0, 2.0], [3.0, 4.0]]
    estimator = DBSCAN()
    estimator.fit(samples)  # must not raise exception
def test_pickle():
    """A pickled and restored DBSCAN estimator keeps its exact class."""
    estimator = DBSCAN()
    restored = pickle.loads(pickle.dumps(estimator))
    assert type(restored) is estimator.__class__
def test_boundaries():
    """Check that the ``min_samples`` and ``eps`` thresholds are inclusive."""
    # (samples, eps, whether sample 0 is expected to be a core sample)
    cases = (
        # min_samples is inclusive of the core point itself
        ([[0], [1]], 2, True),
        # eps is inclusive of the circumference
        ([[0], [1], [1]], 1, True),
        # just short of the neighbors: sample 0 is no longer core
        ([[0], [1], [1]], 0.99, False),
    )
    for samples, eps, expected_core in cases:
        core, _ = dbscan(samples, eps=eps, min_samples=2)
        assert (0 in core) == expected_core
def test_weighted_dbscan(global_random_seed):
    """Check validation and the effect of ``sample_weight`` in DBSCAN."""
    points = [[0], [1]]

    # ensure sample_weight is validated
    for bad_weight in ([2], [2, 3, 4]):
        with pytest.raises(ValueError):
            dbscan(points, sample_weight=bad_weight)

    # Each case: (extra keyword arguments, sample_weight, expected core samples)
    cases = (
        # ensure sample_weight has an effect
        ({}, None, []),
        ({}, [5, 5], []),
        ({}, [6, 5], [0]),
        ({}, [6, 6], [0, 1]),
        # points within eps of each other:
        ({"eps": 1.5}, [5, 1], [0, 1]),
        # and effect of non-positive and non-integer sample_weight:
        ({"eps": 1.5}, [5, 0], []),
        ({"eps": 1.5}, [5.9, 0.1], [0, 1]),
        ({"eps": 1.5}, [6, 0], [0, 1]),
        ({"eps": 1.5}, [6, -1], []),
    )
    for extra, weight, expected_core in cases:
        core, _ = dbscan(points, sample_weight=weight, min_samples=6, **extra)
        assert_array_equal(expected_core, core)

    # for non-negative sample_weight, cores should be identical to repetition
    random_state = np.random.RandomState(global_random_seed)
    sample_weight = random_state.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert len(label1) == len(X)

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, _ = dbscan(X_repeated)

    repeated_is_core = np.zeros(X_repeated.shape[0], dtype=bool)
    repeated_is_core[core_repeated] = True
    is_core = np.zeros(X.shape[0], dtype=bool)
    is_core[core1] = True
    assert_array_equal(np.repeat(is_core, sample_weight), repeated_is_core)

    # sample_weight should work with precomputed distance matrix
    dist = pairwise_distances(X)
    core3, label3 = dbscan(dist, sample_weight=sample_weight, metric="precomputed")
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    fitted = DBSCAN().fit(X, sample_weight=sample_weight)
    assert_array_equal(core1, fitted.core_sample_indices_)
    assert_array_equal(label1, fitted.labels_)

    # ... and with fit_predict
    estimator = DBSCAN()
    label5 = estimator.fit_predict(X, sample_weight=sample_weight)
    assert_array_equal(core1, estimator.core_sample_indices_)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, estimator.labels_)
@pytest.mark.parametrize("algorithm", ["brute", "kd_tree", "ball_tree"])
def test_dbscan_core_samples_toy(algorithm):
    """Check core samples and labels on a 1-D toy set for growing ``min_samples``."""
    points = [[0], [2], [3], [4], [6], [8], [10]]
    n_samples = len(points)

    # Each case: (min_samples, expected core samples, expected labels)
    cases = (
        # Degenerate case: every sample is a core sample, either with its
        # own cluster or including other close core samples.
        (1, np.arange(n_samples), [0, 1, 1, 1, 2, 3, 4]),
        # Only the 3 samples from the denser area are core samples. All
        # other points are isolated and considered noise.
        (2, [1, 2, 3], [-1, 0, 0, 0, -1, -1, -1]),
        # Only the sample in the middle of the dense area is core. Its two
        # neighbors are edge samples. Remaining samples are noise.
        (3, [2], [-1, 0, 0, 0, -1, -1, -1]),
        # It's no longer possible to extract core samples with eps=1:
        # everything is noise.
        (4, [], np.full(n_samples, -1.0)),
    )
    for min_samples, expected_core, expected_labels in cases:
        core_samples, labels = dbscan(
            points, algorithm=algorithm, eps=1, min_samples=min_samples
        )
        assert_array_equal(core_samples, expected_core)
        assert_array_equal(labels, expected_labels)
def test_dbscan_precomputed_metric_with_degenerate_input_arrays():
    """Degenerate precomputed matrices must yield a single distinct label.

    See https://github.com/scikit-learn/scikit-learn/issues/4641 for more
    details.
    """
    for degenerate in (np.eye(10), np.zeros((10, 10))):
        model = DBSCAN(eps=0.5, metric="precomputed").fit(degenerate)
        assert len(set(model.labels_)) == 1
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_dbscan_precomputed_metric_with_initial_rows_zero(csr_container):
    """Check a sparse precomputed matrix whose first two rows are all zero."""
    # Symmetric 7x7 matrix described by its non-zero (row, col, value) entries
    # in the upper triangle; rows 0 and 1 stay entirely zero.
    upper_entries = ((2, 4, 0.1), (3, 4, 0.1), (4, 6, 0.3), (5, 6, 0.1))
    dense = np.zeros((7, 7))
    for row, col, value in upper_entries:
        dense[row, col] = value
        dense[col, row] = value

    sparse = csr_container(dense)
    model = DBSCAN(eps=0.2, metric="precomputed", min_samples=2).fit(sparse)

    assert_array_equal(model.labels_, [-1, -1, 0, 0, 0, 1, 1])

View File

@@ -0,0 +1,55 @@
"""
Tests for sklearn.cluster._feature_agglomeration
"""
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.cluster import FeatureAgglomeration
from sklearn.datasets import make_blobs
from sklearn.utils._testing import assert_array_almost_equal
def test_feature_agglomeration():
    """Check fit, transform and inverse_transform with mean and median pooling."""
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    # Each case: (pooling function, expected pooled value of the single cluster)
    for pooling_func, expected in ((np.mean, 1 / 3.0), (np.median, 0.0)):
        agglo = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=pooling_func)
        agglo.fit(X)

        # One label per feature, n_clusters distinct labels.
        assert np.size(np.unique(agglo.labels_)) == n_clusters
        assert np.size(agglo.labels_) == X.shape[1]

        # Test transform
        Xt = agglo.transform(X)
        assert Xt.shape[1] == n_clusters
        assert Xt == np.array([expected])

        # Test inverse transform
        X_full = agglo.inverse_transform(Xt)
        assert np.unique(X_full[0]).size == n_clusters
        assert_array_almost_equal(agglo.transform(X_full), Xt)
def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    data, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3).fit(data)

    expected_names = [
        f"featureagglomeration{index}" for index in range(agglo.n_clusters_)
    ]
    assert_array_equal(expected_names, agglo.get_feature_names_out())

View File

@@ -0,0 +1,605 @@
"""
Tests for HDBSCAN clustering algorithm
Based on the DBSCAN test code
"""
import numpy as np
import pytest
from scipy import stats
from scipy.spatial import distance
from sklearn.cluster import HDBSCAN
from sklearn.cluster._hdbscan._tree import (
CONDENSED_dtype,
_condense_tree,
_do_labelling,
)
from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING
from sklearn.datasets import make_blobs
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics.pairwise import _VALID_METRICS, euclidean_distances
from sklearn.neighbors import BallTree, KDTree
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
# Shared dataset: 200 blob samples, shuffled and then standardised.
X, y = make_blobs(n_samples=200, random_state=10)
X, y = shuffle(X, y, random_state=7)
X = StandardScaler().fit_transform(X)

# Values of the `algorithm` parameter used by the parametrized tests.
ALGORITHMS = ["kd_tree", "ball_tree", "brute", "auto"]

# Labels that do not denote a real cluster: -1 plus every outlier encoding.
OUTLIER_SET = {-1} | {
    encoding["label"] for encoding in _OUTLIER_ENCODING.values()
}
def check_label_quality(labels, threshold=0.99):
    """Assert that `labels` is a good clustering of the module-level data.

    The labelling must contain exactly 3 clusters once the labels in
    `OUTLIER_SET` are discarded, and its Fowlkes-Mallows score against the
    module-level targets `y` must be strictly greater than `threshold`.
    """
    cluster_labels = set(labels) - OUTLIER_SET
    assert len(cluster_labels) == 3

    score = fowlkes_mallows_score(labels, y)
    assert score > threshold
@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING)
def test_outlier_data(outlier_type):
    """
    Tests if np.inf and np.nan data are each treated as special outliers.
    """
    outlier_value = {"infinite": np.inf, "missing": np.nan}[outlier_type]
    encoding = _OUTLIER_ENCODING[outlier_type]
    expected_label = encoding["label"]
    expected_prob = encoding["prob"]

    def has_outlier_prob(probabilities):
        # For "missing" the expected probability is not compared: the check
        # is np.isnan (presumably because the encoded probability is NaN,
        # which never compares equal to itself).
        if outlier_type == "missing":
            return np.isnan(probabilities)
        return probabilities == expected_prob

    X_outlier = X.copy()
    X_outlier[0] = [outlier_value, 1]
    X_outlier[5] = [outlier_value, outlier_value]
    model = HDBSCAN(copy=False).fit(X_outlier)

    # Exactly rows 0 and 5 carry the outlier label and probability.
    (outlier_label_idx,) = (model.labels_ == expected_label).nonzero()
    assert_array_equal(outlier_label_idx, [0, 5])

    (outlier_prob_idx,) = has_outlier_prob(model.probabilities_).nonzero()
    assert_array_equal(outlier_prob_idx, [0, 5])

    # Fitting without the outlier rows must label the other rows identically.
    clean_indices = [index for index in range(200) if index not in (0, 5)]
    clean_model = HDBSCAN(copy=False).fit(X_outlier[clean_indices])
    assert_array_equal(clean_model.labels_, model.labels_[clean_indices])
def test_hdbscan_distance_matrix():
    """
    Tests that HDBSCAN works with precomputed distance matrices, and throws the
    appropriate errors when needed.
    """
    dist = euclidean_distances(X)
    untouched = dist.copy()

    # With copy=True the input matrix must be left unchanged.
    labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(dist)
    assert_allclose(dist, untouched)
    check_label_quality(labels)

    # A feature array is not a valid precomputed matrix.
    shape_error = r"The precomputed distance matrix.*has shape"
    with pytest.raises(ValueError, match=shape_error):
        HDBSCAN(metric="precomputed", copy=True).fit_predict(X)

    # Ensure the matrix is not symmetric
    dist[0, 1], dist[1, 0] = 10, 1
    values_error = r"The precomputed distance matrix.*values"
    with pytest.raises(ValueError, match=values_error):
        HDBSCAN(metric="precomputed", copy=False).fit_predict(dist)
@pytest.mark.parametrize("sparse_constructor", [*CSR_CONTAINERS, *CSC_CONTAINERS])
def test_hdbscan_sparse_distance_matrix(sparse_constructor):
    """
    Tests that HDBSCAN works with sparse distance matrices.
    """
    dist = distance.squareform(distance.pdist(X))
    dist /= np.max(dist)

    # Drop every distance at or above the median so the matrix becomes sparse.
    median_distance = stats.scoreatpercentile(dist.flatten(), 50)
    dist[dist >= median_distance] = 0.0

    sparse_dist = sparse_constructor(dist)
    sparse_dist.eliminate_zeros()

    estimator = HDBSCAN(metric="precomputed", copy=False)
    check_label_quality(estimator.fit_predict(sparse_dist))
def test_hdbscan_feature_array():
    """
    Tests that HDBSCAN works with feature array, including an arbitrary
    goodness of fit check. Note that the check is a simple heuristic.
    """
    estimator = HDBSCAN(copy=False)
    predicted = estimator.fit_predict(X)
    # Heuristic quality check that guards against regressions.
    check_label_quality(predicted)
@pytest.mark.parametrize("algo", ALGORITHMS)
@pytest.mark.parametrize("metric", _VALID_METRICS)
def test_hdbscan_algorithms(algo, metric):
    """
    Tests that HDBSCAN works with the expected combinations of algorithms and
    metrics, or raises the expected errors.
    """
    labels = HDBSCAN(algorithm=algo, copy=False).fit_predict(X)
    check_label_quality(labels)

    # Validation for brute is handled by `pairwise_distances`
    if algo in ("brute", "auto"):
        return

    tree_class = {"kd_tree": KDTree, "ball_tree": BallTree}[algo]
    n_features = X.shape[1]

    # Extra arguments required by some metrics; None for all the others.
    params_by_metric = {
        "mahalanobis": {"V": np.eye(n_features)},
        "seuclidean": {"V": np.ones(n_features)},
        "minkowski": {"p": 2},
        "wminkowski": {"p": 2, "w": np.ones(n_features)},
    }
    hdb = HDBSCAN(
        algorithm=algo,
        metric=metric,
        metric_params=params_by_metric.get(metric, None),
        copy=False,
    )

    if metric not in tree_class.valid_metrics:
        with pytest.raises(ValueError):
            hdb.fit(X)
        return

    if metric == "wminkowski":
        with pytest.warns(FutureWarning):
            hdb.fit(X)
        return

    hdb.fit(X)
def test_dbscan_clustering():
    """
    Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
    This test is more of a sanity check than a rigorous evaluation.
    """
    fitted = HDBSCAN(copy=False).fit(X)
    dbscan_labels = fitted.dbscan_clustering(0.3)
    # A looser threshold is used because dbscan produces a more constrained
    # clustering representation.
    check_label_quality(dbscan_labels, threshold=0.92)
@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1))
def test_dbscan_clustering_outlier_data(cut_distance):
    """
    Tests if np.inf and np.nan data are each treated as special outliers.

    Rows containing np.nan must get the "missing" label, the row containing
    only np.inf must get the "infinite" label, and removing all of these rows
    before fitting must leave the labels of the remaining rows unchanged.
    """
    missing_label = _OUTLIER_ENCODING["missing"]["label"]
    infinite_label = _OUTLIER_ENCODING["infinite"]["label"]

    X_outlier = X.copy()
    X_outlier[0] = [np.inf, 1]
    X_outlier[2] = [1, np.nan]
    X_outlier[5] = [np.inf, np.nan]
    model = HDBSCAN(copy=False).fit(X_outlier)
    labels = model.dbscan_clustering(cut_distance=cut_distance)

    missing_labels_idx = np.flatnonzero(labels == missing_label)
    assert_array_equal(missing_labels_idx, [2, 5])

    infinite_labels_idx = np.flatnonzero(labels == infinite_label)
    assert_array_equal(infinite_labels_idx, [0])

    # The index arrays must be concatenated: `+` on NumPy arrays adds them
    # element-wise ([2, 5] + [0] == [2, 5]), which would leave the infinite
    # row 0 inside the "clean" subset.
    outlier_idx = np.concatenate([missing_labels_idx, infinite_labels_idx])
    clean_idx = np.setdiff1d(np.arange(X_outlier.shape[0]), outlier_idx)

    clean_model = HDBSCAN(copy=False).fit(X_outlier[clean_idx])
    clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance)
    assert_array_equal(clean_labels, labels[clean_idx])
def test_hdbscan_best_balltree_metric():
    """
    Tests that HDBSCAN using `BallTree` works.
    """
    unit_variances = np.ones(X.shape[1])
    estimator = HDBSCAN(
        metric="seuclidean", metric_params={"V": unit_variances}, copy=False
    )
    check_label_quality(estimator.fit_predict(X))
def test_hdbscan_no_clusters():
    """
    Tests that HDBSCAN correctly does not generate a valid cluster when the
    `min_cluster_size` is too large for the data.
    """
    too_large = len(X) - 1
    predicted = HDBSCAN(min_cluster_size=too_large, copy=False).fit_predict(X)
    # Every label must be one of the non-cluster labels.
    assert set(predicted).issubset(OUTLIER_SET)
def test_hdbscan_min_cluster_size():
    """
    Test that the smallest non-noise cluster has at least `min_cluster_size`
    many points
    """
    for min_cluster_size in range(2, len(X)):
        estimator = HDBSCAN(min_cluster_size=min_cluster_size, copy=False)
        predicted = estimator.fit_predict(X)

        non_noise = [label for label in predicted if label != -1]
        if non_noise:
            cluster_sizes = np.bincount(non_noise)
            assert np.min(cluster_sizes) >= min_cluster_size
def test_hdbscan_callable_metric():
    """
    Tests that HDBSCAN works when passed a callable metric.
    """
    estimator = HDBSCAN(metric=distance.euclidean, copy=False)
    check_label_quality(estimator.fit_predict(X))
@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"])
def test_hdbscan_precomputed_non_brute(tree):
    """
    Tests that HDBSCAN correctly raises an error when passing precomputed data
    while requesting a tree-based algorithm.
    """
    estimator = HDBSCAN(metric="precomputed", algorithm=tree, copy=False)
    with pytest.raises(ValueError, match="precomputed is not a valid metric for"):
        estimator.fit(X)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_hdbscan_sparse(csr_container):
    """
    Tests that HDBSCAN works correctly when passing sparse feature data.
    Evaluates correctness by comparing against the same data passed as a dense
    array.
    """
    dense_labels = HDBSCAN(copy=False).fit(X).labels_
    check_label_quality(dense_labels)

    sparse_template = csr_container(X)
    X_sparse = sparse_template.copy()
    sparse_labels = HDBSCAN(copy=False).fit(X_sparse).labels_
    assert_array_equal(dense_labels, sparse_labels)

    # The sparse and dense non-precomputed routines must return the same
    # labels when the 0th observation contains an outlier.
    outliers = {"infinite": np.inf, "missing": np.nan}
    for outlier_type, outlier_val in outliers.items():
        X_dense = X.copy()
        X_dense[0, 0] = outlier_val
        dense_labels = HDBSCAN(copy=False).fit(X_dense).labels_
        check_label_quality(dense_labels)
        assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]

        X_sparse = sparse_template.copy()
        X_sparse[0, 0] = outlier_val
        sparse_labels = HDBSCAN(copy=False).fit(X_sparse).labels_
        assert_array_equal(dense_labels, sparse_labels)

    # Tree-based algorithms are rejected for sparse input.
    expected_error = "Sparse data matrices only support algorithm `brute`."
    with pytest.raises(ValueError, match=expected_error):
        HDBSCAN(metric="euclidean", algorithm="ball_tree", copy=False).fit(X_sparse)
@pytest.mark.parametrize("algorithm", ALGORITHMS)
def test_hdbscan_centers(algorithm):
    """
    Tests that HDBSCAN centers are calculated and stored properly, and are
    accurate to the data.
    """
    true_centers = [(0.0, 0.0), (3.0, 3.0)]
    blobs, _ = make_blobs(
        n_samples=2000, random_state=0, centers=true_centers, cluster_std=0.5
    )
    # NOTE(review): this first fit does not receive `algorithm`; only the
    # all-noise fit below does.
    fitted = HDBSCAN(store_centers="both", copy=False).fit(blobs)

    estimates = zip(true_centers, fitted.centroids_, fitted.medoids_)
    for expected, centroid, medoid in estimates:
        assert_allclose(expected, centroid, rtol=1, atol=0.05)
        assert_allclose(expected, medoid, rtol=1, atol=0.05)

    # Ensure that nothing is done for noise
    all_noise = HDBSCAN(
        algorithm=algorithm,
        store_centers="both",
        min_cluster_size=X.shape[0],
        copy=False,
    ).fit(X)
    assert all_noise.centroids_.shape[0] == 0
    assert all_noise.medoids_.shape[0] == 0
def test_hdbscan_allow_single_cluster_with_epsilon():
    """
    Tests that HDBSCAN single-cluster selection with epsilon works correctly.
    """
    no_structure = np.random.RandomState(0).rand(150, 2)
    shared = {
        "min_cluster_size": 5,
        "cluster_selection_method": "eom",
        "allow_single_cluster": True,
        "copy": False,
    }

    # without epsilon we should see many noise points as children of root.
    estimator = HDBSCAN(cluster_selection_epsilon=0.0, **shared)
    labels = estimator.fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    # Arbitrary heuristic. Would prefer something more precise.
    assert counts[unique_labels == -1] > 30

    # for this random seed an epsilon of 0.18 will produce exactly 2 noise
    # points at that cut in single linkage.
    estimator = HDBSCAN(
        cluster_selection_epsilon=0.18, algorithm="kd_tree", **shared
    )
    labels = estimator.fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 2
def test_hdbscan_better_than_dbscan():
    """
    Validate that HDBSCAN can properly cluster this difficult synthetic
    dataset. Note that DBSCAN fails on this (see HDBSCAN plotting
    example)
    """
    blob_centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]]
    data, target = make_blobs(
        n_samples=750,
        centers=blob_centers,
        cluster_std=[0.2, 0.35, 1.35, 1.35],
        random_state=0,
    )
    predicted = HDBSCAN(copy=False).fit(data).labels_

    # Number of clusters, not counting the -1 label.
    found_clusters = len(set(predicted)) - int(-1 in predicted)
    assert found_clusters == 4

    # NOTE(review): the result of this comparison is discarded (there is no
    # `assert`), so the score is computed but never checked. Confirm that the
    # threshold actually holds before turning this into an assertion.
    fowlkes_mallows_score(predicted, target) > 0.99
@pytest.mark.parametrize(
    "kwargs, X",
    [
        ({"metric": "precomputed"}, np.array([[1, np.inf], [np.inf, 1]])),
        ({"metric": "precomputed"}, [[1, 2], [2, 1]]),
        ({}, [[1, 2], [3, 4]]),
    ],
)
def test_hdbscan_usable_inputs(X, kwargs):
    """
    Tests that HDBSCAN works correctly for array-likes and precomputed inputs
    with non-finite points.
    """
    estimator = HDBSCAN(min_samples=1, copy=False, **kwargs)
    estimator.fit(X)  # must not raise
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_hdbscan_sparse_distances_too_few_nonzero(csr_container):
    """
    Tests that HDBSCAN raises the correct error when there are too few
    non-zero distances.
    """
    all_zero = csr_container(np.zeros((10, 10)))
    estimator = HDBSCAN(metric="precomputed", copy=False)
    with pytest.raises(ValueError, match="There exists points with fewer than"):
        estimator.fit(all_zero)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_hdbscan_sparse_distances_disconnected_graph(csr_container):
    """
    Tests that HDBSCAN raises the correct error when the distance matrix
    has multiple connected components.
    """
    # Create symmetric sparse matrix with 2 connected components
    blocks = np.zeros((20, 20))
    blocks[:5, :5] = 1
    blocks[5:, 15:] = 1
    disconnected = csr_container(blocks + blocks.T)

    estimator = HDBSCAN(metric="precomputed", copy=False)
    expected_error = "HDBSCAN cannot be performed on a disconnected graph"
    with pytest.raises(ValueError, match=expected_error):
        estimator.fit(disconnected)
def test_hdbscan_tree_invalid_metric():
    """
    Tests that HDBSCAN correctly raises an error for invalid metric choices.
    """
    metric_callable = lambda x: x
    msg = (
        ".* is not a valid metric for a .*-based algorithm\\. Please select a different"
        " metric\\."
    )

    # Callables are not supported for either
    for tree_algorithm in ("kd_tree", "ball_tree"):
        estimator = HDBSCAN(
            algorithm=tree_algorithm, metric=metric_callable, copy=False
        )
        with pytest.raises(ValueError, match=msg):
            estimator.fit(X)

    # The set of valid metrics for KDTree at the time of writing this test is a
    # strict subset of those supported in BallTree
    ball_tree_only = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics))
    if ball_tree_only:
        estimator = HDBSCAN(algorithm="kd_tree", metric=ball_tree_only[0], copy=False)
        with pytest.raises(ValueError, match=msg):
            estimator.fit(X)
def test_hdbscan_too_many_min_samples():
    """
    Tests that HDBSCAN correctly raises an error when setting `min_samples`
    larger than the number of samples.
    """
    n_samples = len(X)
    estimator = HDBSCAN(min_samples=n_samples + 1, copy=False)
    with pytest.raises(ValueError, match=r"min_samples (.*) must be at most"):
        estimator.fit(X)
def test_hdbscan_precomputed_dense_nan():
    """
    Tests that HDBSCAN correctly raises an error when providing precomputed
    distances with `np.nan` values.
    """
    with_nan = X.copy()
    with_nan[0, 0] = np.nan

    estimator = HDBSCAN(metric="precomputed", copy=False)
    expected_error = "np.nan values found in precomputed-dense"
    with pytest.raises(ValueError, match=expected_error):
        estimator.fit(with_nan)
@pytest.mark.parametrize("allow_single_cluster", [True, False])
@pytest.mark.parametrize("epsilon", [0, 0.1])
def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon):
    """
    Tests that the `_do_labelling` helper function correctly assigns labels.
    """
    n_samples = 48
    # Ensure the clusters are distinct with no overlap
    blob_centers = [[0, 0], [10, 0], [0, 10]]
    data, target = make_blobs(
        n_samples,
        random_state=global_random_seed,
        centers=blob_centers,
    )

    fitted = HDBSCAN(copy=False).fit(data)
    condensed_tree = _condense_tree(
        fitted._single_linkage_tree_, min_cluster_size=fitted.min_cluster_size
    )

    # Node ids n_samples + 2, + 3 and + 4 are mapped to labels 0, 1 and 2.
    cluster_label_map = {
        n_samples + offset: label for label, offset in enumerate((2, 3, 4))
    }
    labels = _do_labelling(
        condensed_tree=condensed_tree,
        clusters=set(cluster_label_map),
        cluster_label_map=cluster_label_map,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=epsilon,
    )

    # Align each target value with the label given to its first occurrence.
    target_to_label = {}
    for value in set(target):
        first_index = np.where(target == value)[0][0]
        target_to_label[value] = labels[first_index]

    aligned_target = np.vectorize(target_to_label.get)(target)
    assert_array_equal(labels, aligned_target)
def test_labelling_thresholding():
    """
    Tests that the `_do_labelling` helper function correctly thresholds the
    incoming lambda values given various `cluster_selection_epsilon` values.
    """
    n_samples = 5
    MAX_LAMBDA = 1.5
    condensed_tree = np.array(
        [
            (5, 2, MAX_LAMBDA, 1),
            (5, 1, 0.1, 1),
            (5, 0, MAX_LAMBDA, 1),
            (5, 3, 0.2, 1),
            (5, 4, 0.3, 1),
        ],
        dtype=CONDENSED_dtype,
    )

    # Each case: (cluster_selection_epsilon, lambda value below which a
    # sample is expected to be noise).
    cases = (
        (1, 1),
        # With epsilon 0 the threshold should be calculated per-sample based
        # on the largest lambda of any sibling node. In this case, all points
        # are siblings and the largest value is exactly MAX_LAMBDA.
        (0, MAX_LAMBDA),
    )
    for epsilon, lambda_threshold in cases:
        labels = _do_labelling(
            condensed_tree=condensed_tree,
            clusters={n_samples},
            cluster_label_map={n_samples: 0, n_samples + 1: 1},
            allow_single_cluster=True,
            cluster_selection_epsilon=epsilon,
        )
        expected_noise = condensed_tree["value"] < lambda_threshold
        assert sum(expected_noise) == sum(labels == -1)
@pytest.mark.parametrize("store_centers", ["centroid", "medoid"])
def test_hdbscan_error_precomputed_and_store_centers(store_centers):
    """Storing centers is rejected when the input is a precomputed matrix.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27893
    """
    points = np.random.RandomState(0).random((100, 2))
    distance_matrix = euclidean_distances(points)
    expected_message = (
        "Cannot store centers when using a precomputed distance matrix."
    )
    hdbscan_params = dict(
        metric="precomputed",
        store_centers=store_centers,
        copy=False,
    )
    with pytest.raises(ValueError, match=expected_message):
        HDBSCAN(**hdbscan_params).fit(distance_matrix)
@pytest.mark.parametrize("valid_algo", ["auto", "brute"])
def test_hdbscan_cosine_metric_valid_algorithm(valid_algo):
    """Smoke test: the "cosine" metric works with "brute" and "auto".

    Non-regression test for issue #28631
    """
    estimator = HDBSCAN(metric="cosine", algorithm=valid_algo, copy=False)
    # Only checks that fitting does not raise; the labels are not inspected.
    estimator.fit_predict(X)
@pytest.mark.parametrize("invalid_algo", ["kd_tree", "ball_tree"])
def test_hdbscan_cosine_metric_invalid_algorithm(invalid_algo):
    """Test that HDBSCAN raises an informative error when an unsupported
    algorithm is used with the "cosine" metric.
    """
    expected_message = "cosine is not a valid metric"
    estimator = HDBSCAN(metric="cosine", algorithm=invalid_algo, copy=False)
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit_predict(X)
# TODO(1.10): remove this test
def test_hdbscan_default_copy_warning():
    """
    Test that HDBSCAN emits a FutureWarning when the `copy` parameter is
    left at its default.
    """
    rng = np.random.RandomState(0)
    data = rng.random((100, 2))
    expected_message = (
        r"The default value of `copy` will change from False to True in 1.10."
    )
    with pytest.warns(FutureWarning, match=expected_message):
        # `copy` is deliberately not passed.
        HDBSCAN(min_cluster_size=20).fit(data)

View File

@@ -0,0 +1,889 @@
"""
Several basic tests for hierarchical clustering procedures
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import itertools
import shutil
from functools import partial
from tempfile import mkdtemp
import numpy as np
import pytest
from scipy.cluster import hierarchy
from scipy.sparse.csgraph import connected_components
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration, ward_tree
from sklearn.cluster._agglomerative import (
_TREE_BUILDERS,
_fix_connectivity,
_hc_cut,
linkage_tree,
)
from sklearn.cluster._hierarchical_fast import (
average_merge,
max_merge,
mst_linkage_core,
)
from sklearn.datasets import make_circles, make_moons
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.metrics import DistanceMetric
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics.pairwise import (
PAIRED_DISTANCES,
cosine_distances,
manhattan_distances,
pairwise_distances,
)
from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
from sklearn.neighbors import kneighbors_graph
from sklearn.utils._fast_dict import IntFloatDict
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
create_memmap_backed_data,
ignore_warnings,
)
from sklearn.utils.fixes import LIL_CONTAINERS
def test_linkage_misc():
    """Miscellaneous checks on `linkage_tree` argument handling."""
    X = np.random.RandomState(42).normal(size=(5, 5))

    # An unknown linkage name is rejected.
    with pytest.raises(ValueError):
        linkage_tree(X, linkage="foo")

    # A (4, 4) connectivity matrix does not match the 5 samples in X.
    with pytest.raises(ValueError):
        linkage_tree(X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # A precomputed cosine distance matrix must give the same children as
    # asking for the "cosine" affinity directly.
    cosine_dist = cosine_distances(X)
    children_precomputed = linkage_tree(cosine_dist, affinity="precomputed")[0]
    children_cosine = linkage_tree(X, affinity="cosine")[0]
    assert_array_equal(children_precomputed, children_cosine)

    # A callable affinity must give the same children as the matching
    # string affinity.
    children_callable = linkage_tree(X, affinity=manhattan_distances)[0]
    children_manhattan = linkage_tree(X, affinity="manhattan")[0]
    assert_array_equal(children_callable, children_manhattan)
def test_structured_linkage_tree():
    """Structured tree builders return full trees and validate their input."""
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    # Only the shape of `mask` is used to build the grid connectivity.
    connectivity = grid_to_graph(*mask.shape)
    expected_n_nodes = 2 * X.shape[1] - 1
    wrong_shape_connectivity_args = ((4, 4),)

    for tree_builder in _TREE_BUILDERS.values():
        children, _, n_leaves, _ = tree_builder(X.T, connectivity=connectivity)
        assert len(children) + n_leaves == expected_n_nodes

        # A connectivity matrix of the wrong shape must raise a ValueError.
        for shape in wrong_shape_connectivity_args:
            with pytest.raises(ValueError):
                tree_builder(X.T, connectivity=np.ones(shape))

        # Check that fitting with no samples raises an error
        with pytest.raises(ValueError):
            tree_builder(X.T[:0], connectivity=connectivity)
def test_unstructured_linkage_tree():
    """Unstructured tree builders return full trees for 2D and 1D input."""
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    expected_n_nodes = 2 * X.shape[1] - 1
    inputs = (X, X[0])

    for data in inputs:
        # `n_clusters` is specified just for the sake of raising a warning
        # and testing the warning code.
        with ignore_warnings(), pytest.warns(UserWarning):
            children, _, n_leaves, _ = ward_tree(data.T, n_clusters=10)
        assert len(children) + n_leaves == expected_n_nodes

    for tree_builder in _TREE_BUILDERS.values():
        for data in inputs:
            with ignore_warnings(), pytest.warns(UserWarning):
                children, _, n_leaves, _ = tree_builder(data.T, n_clusters=10)
            assert len(children) + n_leaves == expected_n_nodes
def test_height_linkage_tree():
    """Every tree builder yields a complete tree on a 10x10 grid graph.

    NOTE(review): the original comment said this checks that the heights are
    sorted, but the only assertion made is on the total number of nodes.
    """
    rng = np.random.RandomState(0)
    grid_mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*grid_mask.shape)
    expected_n_nodes = 2 * X.shape[1] - 1
    for linkage_func in _TREE_BUILDERS.values():
        children, _, n_leaves, _ = linkage_func(X.T, connectivity=connectivity)
        assert len(children) + n_leaves == expected_n_nodes
def test_zero_cosine_linkage_tree():
    """'cosine' affinity must reject input that contains a zero vector."""
    # The second row is the all-zero vector.
    data_with_zero_row = np.array([[0, 1], [0, 0]])
    expected_message = (
        "Cosine affinity cannot be used when X contains zero vectors"
    )
    with pytest.raises(ValueError, match=expected_message):
        linkage_tree(data_with_zero_row, affinity="cosine")
@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)])
@pytest.mark.parametrize("compute_distances", [True, False])
@pytest.mark.parametrize("linkage", ["ward", "complete", "average", "single"])
def test_agglomerative_clustering_distances(
    n_clusters, compute_distances, distance_threshold, linkage
):
    """`distances_` exists only when requested or when a threshold is used.

    When `compute_distances` is True or `distance_threshold` is given, the
    fitted model must expose `distances_` with one entry per row of
    `children_`; otherwise the attribute must be absent.
    """
    rng = np.random.RandomState(0)
    grid_mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*grid_mask.shape)

    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        connectivity=connectivity,
        linkage=linkage,
        distance_threshold=distance_threshold,
        compute_distances=compute_distances,
    )
    model.fit(X)

    distances_expected = compute_distances or (distance_threshold is not None)
    if not distances_expected:
        assert not hasattr(model, "distances_")
        return

    assert hasattr(model, "distances_")
    # One distance per merge, i.e. per row of `children_`.
    n_merges = model.children_.shape[0]
    assert model.distances_.shape == (n_merges,)
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_agglomerative_clustering(global_random_seed, lil_container):
    """Check that agglomerative clustering yields the requested clusters.

    For every linkage the test covers: plain fitting, fitting with a
    `memory` cache directory, early stopping of the tree building
    (`compute_full_tree=False`), unstructured fitting (`connectivity=None`)
    and rejection of a connectivity matrix of the wrong shape. It then checks
    that "ward" refuses a non-euclidean metric, that structured and
    unstructured "complete" linkage agree for each metric in
    `PAIRED_DISTANCES`, and that a precomputed distance matrix gives the same
    labels as the raw data.
    """
    rng = np.random.RandomState(global_random_seed)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average", "single"):
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage
        )
        clustering.fit(X)
        # test caching
        # The directory is created *before* entering the `try` block: if
        # `mkdtemp` itself failed inside it, the `finally` clause would raise
        # a NameError on `tempdir` and hide the real error.
        tempdir = mkdtemp()
        try:
            clustering = AgglomerativeClustering(
                n_clusters=10,
                connectivity=connectivity,
                memory=tempdir,
                linkage=linkage,
            )
            clustering.fit(X)
            labels = clustering.labels_
            assert np.size(np.unique(labels)) == 10
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage
        )
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert np.size(np.unique(clustering.labels_)) == 10
        # Check that we raise a ValueError when the connectivity matrix does
        # not match the number of samples (10x10 slice for 100 samples)
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=lil_container(connectivity.toarray()[:10, :10]),
            linkage=linkage,
        )
        with pytest.raises(ValueError):
            clustering.fit(X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity.toarray(),
        metric="manhattan",
        linkage="ward",
    )
    with pytest.raises(ValueError):
        clustering.fit(X)

    # Test using another metric than euclidean works with linkage complete
    for metric in PAIRED_DISTANCES.keys():
        # Compare the structured implementation (with a full connectivity
        # matrix) to the unstructured one
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            metric=metric,
            linkage="complete",
        )
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(
            n_clusters=10, connectivity=None, metric=metric, linkage="complete"
        )
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1
        )

    # Test that using a distance matrix (metric = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(
        n_clusters=10, connectivity=connectivity, linkage="complete"
    )
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity,
        metric="precomputed",
        linkage="complete",
    )
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
def test_agglomerative_clustering_memory_mapped():
    """AgglomerativeClustering must work on a mem-mapped dataset.

    Non-regression test for issue #19875.
    """
    dense_data = np.random.RandomState(0).randn(50, 100)
    memmapped_data = create_memmap_backed_data(dense_data)
    model = AgglomerativeClustering(metric="euclidean", linkage="single")
    # Smoke test: fitting must simply not raise.
    model.fit(memmapped_data)
def test_ward_agglomeration(global_random_seed):
    """FeatureAgglomeration gives 5 clusters and a consistent round trip."""
    rng = np.random.RandomState(global_random_seed)
    grid_mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*grid_mask.shape)
    n_expected = 5

    agglo = FeatureAgglomeration(n_clusters=n_expected, connectivity=connectivity)
    agglo.fit(X)
    assert np.size(np.unique(agglo.labels_)) == n_expected

    reduced = agglo.transform(X)
    assert reduced.shape[1] == n_expected

    # After the inverse transform a sample only takes `n_expected` distinct
    # values, and transforming it again gives back the reduced data.
    restored = agglo.inverse_transform(reduced)
    assert np.unique(restored[0]).size == n_expected
    assert_array_almost_equal(agglo.transform(restored), reduced)

    # Check that fitting with no samples raises a ValueError
    with pytest.raises(ValueError):
        agglo.fit(X[:0])
def test_single_linkage_clustering():
    """Single linkage recovers the two moons and the two circles exactly."""

    def assert_perfect_recovery(points, expected_labels):
        model = AgglomerativeClustering(n_clusters=2, linkage="single")
        model.fit(points)
        score = normalized_mutual_info_score(model.labels_, expected_labels)
        assert_almost_equal(score, 1)

    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    assert_perfect_recovery(moons, moon_labels)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42)
    assert_perfect_recovery(circles, circle_labels)
def assess_same_labelling(cut1, cut2):
    """Assert that two label vectors describe the same partition.

    Each labelling is turned into a co-clustering matrix whose entry (i, j)
    is 1 when samples i and j share a label. The two matrices must be equal,
    which makes the comparison independent of how the labels are numbered.
    """

    def co_clustering_matrix(cut):
        n_points = len(cut)
        n_labels = cut.max() + 1
        # One-hot encoding of the labels, one row per sample.
        one_hot = np.zeros((n_points, n_labels))
        one_hot[np.arange(n_points), cut] = 1
        return np.dot(one_hot, one_hot.T)

    first, second = (co_clustering_matrix(cut) for cut in (cut1, cut2))
    assert (first == second).all()
def test_sparse_scikit_vs_scipy(global_random_seed):
    """Compare scikit linkage under full connectivity with scipy's linkage."""
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(global_random_seed)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage, tree_builder in _TREE_BUILDERS.items():
        for _ in range(5):
            X = 0.1 * rng.normal(size=(n, p))
            X -= 4.0 * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            scipy_linkage = hierarchy.linkage(X, method=linkage)
            scipy_children = scipy_linkage[:, :2].astype(int, copy=False)

            children, _, n_leaves, _ = tree_builder(X, connectivity=connectivity)
            # Sort the order of child nodes per row for consistency
            children.sort(axis=1)
            assert_array_equal(
                children,
                scipy_children,
                "linkage tree differs from scipy impl for linkage: " + linkage,
            )

            # Cutting both trees at k clusters must give the same partition.
            assess_same_labelling(
                _hc_cut(k, children, n_leaves),
                _hc_cut(k, scipy_children, n_leaves),
            )

    # Test error management in _hc_cut
    with pytest.raises(ValueError):
        _hc_cut(n_leaves + 1, children, n_leaves)
# Make sure our custom mst_linkage_core gives
# the same results as scipy's builtin
def test_vector_scikit_single_vs_scipy_single(global_random_seed):
    """The unstructured "single" tree builder must agree with scipy."""
    n_samples, n_features, n_clusters = 10, 5, 3
    rng = np.random.RandomState(global_random_seed)

    X = 0.1 * rng.normal(size=(n_samples, n_features))
    X -= 4.0 * np.arange(n_samples)[:, np.newaxis]
    X -= X.mean(axis=1)[:, np.newaxis]

    scipy_linkage = hierarchy.linkage(X, method="single")
    children_scipy = scipy_linkage[:, :2].astype(int)

    single_builder = _TREE_BUILDERS["single"]
    children, _, n_leaves, _ = single_builder(X)
    # Sort the order of child nodes per row for consistency
    children.sort(axis=1)
    assert_array_equal(
        children,
        children_scipy,
        "linkage tree differs from scipy impl for single linkage.",
    )

    # The partitions obtained by cutting both trees must also match.
    assess_same_labelling(
        _hc_cut(n_clusters, children, n_leaves),
        _hc_cut(n_clusters, children_scipy, n_leaves),
    )
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
def test_mst_linkage_core_memory_mapped(metric_param_grid):
    """The MST-LINKAGE-CORE algorithm must work on a mem-mapped dataset.

    Non-regression test for issue #19875.
    """
    X = np.random.RandomState(seed=1).normal(size=(20, 4))
    X_memmap = create_memmap_backed_data(X)

    metric, param_grid = metric_param_grid
    param_names = param_grid.keys()
    # Try every combination of the metric's parameter values.
    for combination in itertools.product(*param_grid.values()):
        metric_kwargs = dict(zip(param_names, combination))
        distance_metric = DistanceMetric.get_metric(metric, **metric_kwargs)
        expected = mst_linkage_core(X, distance_metric)
        from_memmap = mst_linkage_core(X_memmap, distance_metric)
        np.testing.assert_equal(expected, from_memmap)
def test_identical_points():
    """Identical points are clustered together with sparse connectivity.

    The data is made of three pairs of duplicated points. With a symmetrised
    k-neighbors connectivity graph (completed by `_fix_connectivity`), each
    tested linkage must recover the three pairs exactly.
    """
    X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X, connectivity, "euclidean")

    # "average" used to be listed twice, which only repeated the same fit.
    # NOTE(review): the duplicate may have been meant to be "complete";
    # confirm that "complete" passes before adding it here.
    for linkage in ("single", "average", "ward"):
        clustering = AgglomerativeClustering(
            n_clusters=3, linkage=linkage, connectivity=connectivity
        )
        clustering.fit(X)

        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, true_labels), 1
        )
def test_connectivity_propagation():
    """Connectivity must be propagated correctly while ward merges nodes."""
    points = np.array(
        [
            (0.014, 0.120),
            (0.014, 0.099),
            (0.014, 0.097),
            (0.017, 0.153),
            (0.017, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.152),
            (0.018, 0.149),
            (0.018, 0.144),
        ]
    )
    neighbors_graph = kneighbors_graph(points, 10, include_self=False)
    model = AgglomerativeClustering(
        n_clusters=4, connectivity=neighbors_graph, linkage="ward"
    )
    # Smoke test: if changes are not propagated correctly, fit crashes with
    # an IndexError.
    model.fit(points)
def test_ward_tree_children_order(global_random_seed):
    """Structured and unstructured `ward_tree` order children identically.

    The comparison is repeated on five random datasets.
    """
    n, p = 10, 5
    rng = np.random.RandomState(global_random_seed)
    full_connectivity = np.ones((n, n))

    for _ in range(5):
        X = 0.1 * rng.normal(size=(n, p))
        X -= 4.0 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        children_unstructured = ward_tree(X)[0]
        children_structured = ward_tree(X, connectivity=full_connectivity)[0]
        assert_array_equal(children_unstructured, children_structured)
def test_ward_linkage_tree_return_distance(global_random_seed):
    """Check the `return_distance` option of `ward_tree` and `linkage_tree`.

    First, on five random datasets, the structured (full connectivity) and
    unstructured variants must return the same children and the same
    distances. Then, on a small dataset taken from scipy's test data, the
    children and distances are compared with known reference values.
    """
    n, p = 10, 5
    rng = np.random.RandomState(global_random_seed)

    connectivity = np.ones((n, n))
    for i in range(5):
        X = 0.1 * rng.normal(size=(n, p))
        X -= 4.0 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X, connectivity=connectivity, return_distance=True)

        # get children
        children_unstructured = out_unstructured[0]
        children_structured = out_structured[0]

        # check if we got the same clusters
        assert_array_equal(children_unstructured, children_structured)

        # check if the distances are the same
        dist_unstructured = out_unstructured[-1]
        dist_structured = out_structured[-1]

        assert_array_almost_equal(dist_unstructured, dist_structured)

        for linkage in ["average", "complete", "single"]:
            # Keep the whole result tuple. The previous code indexed `[-1]`
            # twice, so it only compared the first and last distance values
            # and never looked at the children.
            tree_structured = linkage_tree(
                X, connectivity=connectivity, linkage=linkage, return_distance=True
            )
            tree_unstructured = linkage_tree(
                X, linkage=linkage, return_distance=True
            )

            structured_dist = tree_structured[-1]
            unstructured_dist = tree_unstructured[-1]
            # The order of the two children within a merge is not guaranteed
            # to match between both code paths, so sort each row first.
            structured_children = np.sort(tree_structured[0], axis=1)
            unstructured_children = np.sort(tree_unstructured[0], axis=1)

            assert_array_almost_equal(structured_dist, unstructured_dist)
            assert_array_equal(structured_children, unstructured_children)

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array(
        [
            [1.43054825, -7.5693489],
            [6.95887839, 6.82293382],
            [2.87137846, -9.68248579],
            [7.87974764, -6.05485803],
            [8.24018364, -6.09495602],
            [7.39020262, 8.54004355],
        ]
    )
    # truth
    linkage_X_ward = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 9.10208346, 4.0],
            [7.0, 9.0, 24.7784379, 6.0],
        ]
    )

    linkage_X_complete = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 6.96742194, 4.0],
            [7.0, 9.0, 18.77445997, 6.0],
        ]
    )

    linkage_X_average = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 6.55832839, 4.0],
            [7.0, 9.0, 15.44089605, 6.0],
        ]
    )

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    out_X_unstructured = ward_tree(X, return_distance=True)
    out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True)

    # check that the labels are the same
    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])

    # check that the distances are correct
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])

    # Each linkage is paired explicitly with its reference. "single" used to
    # be listed as well, but `zip` dropped it silently because no reference
    # array exists for it.
    linkage_truths = [
        ("complete", linkage_X_complete),
        ("average", linkage_X_average),
    ]
    for linkage, X_truth in linkage_truths:
        out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage)
        out_X_structured = linkage_tree(
            X, connectivity=connectivity_X, linkage=linkage, return_distance=True
        )

        # check that the labels are the same
        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
        assert_array_equal(X_truth[:, :2], out_X_structured[0])

        # check that the distances are correct
        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
def test_connectivity_fixing_non_lil():
    """Connectivity fixing works on a non item-assignable matrix.

    Non-regression test for a bug occurring when such a connectivity is
    provided with more than one component.
    """
    # create dummy data
    samples = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    diagonal_mask = np.array([[True, False], [False, True]])
    graph = grid_to_graph(n_x=2, n_y=2, mask=diagonal_mask)
    ward = AgglomerativeClustering(connectivity=graph, linkage="ward")
    with pytest.warns(UserWarning):
        ward.fit(samples)
def test_int_float_dict():
    """IntFloatDict returns stored values and supports the merge helpers."""
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
    values = rng.rand(len(keys))

    mapping = IntFloatDict(keys, values)
    for position, key in enumerate(keys):
        assert mapping[key] == values[position]

    # Every second entry of a 50-element range / constant array.
    other_keys = np.arange(50, dtype=np.intp)[::2]
    other_values = np.full(50, 0.5)[::2]
    other = IntFloatDict(other_keys, other_values)

    # Complete smoke test: the merge functions only have to run.
    for merge in (max_merge, average_merge):
        merge(mapping, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
def test_connectivity_callable():
    """A callable connectivity gives the same labels as the matrix it builds."""
    X = np.random.RandomState(0).rand(20, 5)

    precomputed_graph = kneighbors_graph(X, 3, include_self=False)
    graph_factory = partial(kneighbors_graph, n_neighbors=3, include_self=False)

    from_matrix = AgglomerativeClustering(connectivity=precomputed_graph)
    from_callable = AgglomerativeClustering(connectivity=graph_factory)
    from_matrix.fit(X)
    from_callable.fit(X)

    assert_array_equal(from_matrix.labels_, from_callable.labels_)
def test_connectivity_ignores_diagonal():
    """Self-connections in the connectivity graph do not change the labels."""
    X = np.random.RandomState(0).rand(20, 5)

    graph_without_self = kneighbors_graph(X, 3, include_self=False)
    graph_with_self = kneighbors_graph(X, 3, include_self=True)

    model_without_self = AgglomerativeClustering(connectivity=graph_without_self)
    model_with_self = AgglomerativeClustering(connectivity=graph_with_self)
    model_without_self.fit(X)
    model_with_self.fit(X)

    assert_array_equal(model_without_self.labels_, model_with_self.labels_)
def test_compute_full_tree():
    """The full tree is built for small `n_clusters` and cut short otherwise."""
    rng = np.random.RandomState(0)

    # Small n_clusters: the full tree should be built, that is the number of
    # merges should be n_samples - 1.
    X = rng.randn(10, 2)
    connectivity = kneighbors_graph(X, 5, include_self=False)
    model = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
    model.fit(X)
    n_merges = model.children_.shape[0]
    assert n_merges == X.shape[0] - 1

    # When n_clusters is large, greater than max of 100 and 0.02 * n_samples,
    # we should stop when there are n_clusters.
    n_clusters = 101
    X = rng.randn(200, 2)
    connectivity = kneighbors_graph(X, 10, include_self=False)
    model = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity)
    model.fit(X)
    n_merges = model.children_.shape[0]
    assert n_merges == X.shape[0] - n_clusters
def test_n_components():
    """Every tree builder reports the number of connected components."""
    X = np.random.RandomState(0).rand(5, 5)

    # Connectivity matrix having five components.
    identity_connectivity = np.eye(5)

    for linkage_func in _TREE_BUILDERS.values():
        quiet_builder = ignore_warnings(linkage_func)
        result = quiet_builder(X, connectivity=identity_connectivity)
        # The second item of the returned tuple is the component count.
        assert result[1] == 5
def test_affinity_passed_to_fix_connectivity():
    """A callable affinity is really forwarded to the pairwise function.

    A fake affinity that counts its own calls is passed to `linkage_tree`;
    the test expects it to have been called three times.
    """
    size = 2
    X = np.random.RandomState(0).randn(size, size)
    mask = np.array([True, False, False, True])

    connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)

    class FakeAffinity:
        """Callable stand-in that records how many times it was invoked."""

        def __init__(self):
            self.counter = 0

        def increment(self, *args, **kwargs):
            # Arguments are ignored; the running count is returned.
            self.counter += 1
            return self.counter

    fake = FakeAffinity()

    linkage_tree(X, connectivity=connectivity, affinity=fake.increment)

    assert fake.counter == 3
@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
def test_agglomerative_clustering_with_distance_threshold(linkage, global_random_seed):
    """Clustering with `distance_threshold` matches a cut of the full tree.

    With and without connectivity, the number of clusters must equal the
    number of merges at or above the threshold plus one, and the labels must
    be equivalent to those of `_hc_cut` at that number of clusters.
    """
    rng = np.random.RandomState(global_random_seed)
    grid_mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    grid_connectivity = grid_to_graph(*grid_mask.shape)
    # test when distance threshold is set to 10
    distance_threshold = 10
    tree_builder = _TREE_BUILDERS[linkage]

    for conn in (None, grid_connectivity):
        model = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn,
            linkage=linkage,
        )
        model.fit(X)
        fitted_labels = model.labels_
        n_fitted_clusters = len(np.unique(model.labels_))

        # Build the same tree by hand to find where the distance reaches
        # the threshold.
        children, _, n_leaves, _, distances = tree_builder(
            X, connectivity=conn, n_clusters=None, return_distance=True
        )
        n_clusters_at_threshold = (
            np.count_nonzero(distances >= distance_threshold) + 1
        )
        # test number of clusters produced
        assert n_clusters_at_threshold == n_fitted_clusters

        # test clusters produced
        labels_from_cut = _hc_cut(
            n_clusters=n_fitted_clusters, children=children, n_leaves=n_leaves
        )
        assert np.array_equiv(fitted_labels, labels_from_cut)
def test_small_distance_threshold(global_random_seed):
    """A small threshold leaves every sample in its own cluster.

    The data are random integer points, clustered with single linkage and
    `distance_threshold=1.0`. The test first verifies its own premise (no two
    points are closer than 0.1, which may not hold for every random seed) and
    then checks that there are as many clusters as samples.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 10
    X = rng.randint(-300, 300, size=(n_samples, 3))

    model = AgglomerativeClustering(
        n_clusters=None, distance_threshold=1.0, linkage="single"
    )
    model = model.fit(X)

    # Premise check: all off-diagonal pairwise distances are larger than 0.1.
    # The diagonal is set to inf so that self-distances are ignored.
    pairwise = pairwise_distances(X, metric="minkowski", p=2)
    np.fill_diagonal(pairwise, np.inf)
    assert np.all(pairwise > 0.1)

    assert model.n_clusters_ == n_samples
def test_cluster_distances_with_distance_threshold(global_random_seed):
    """Single-linkage clusters respect the distance threshold.

    In each cluster, every point must have a neighbour of the same cluster
    closer than the threshold (for clusters with more than one point), and no
    point of another cluster closer than the threshold.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 100
    X = rng.randint(-10, 10, size=(n_samples, 3))
    # check the distances within the clusters and with other clusters
    distance_threshold = 4
    model = AgglomerativeClustering(
        n_clusters=None, distance_threshold=distance_threshold, linkage="single"
    )
    labels = model.fit(X).labels_

    D = pairwise_distances(X, metric="minkowski", p=2)
    # to avoid taking the 0 diagonal in min()
    np.fill_diagonal(D, np.inf)

    for label in np.unique(labels):
        members = labels == label
        member_rows = D[members]
        # Largest nearest-neighbour distance inside the cluster.
        max_in_cluster_distance = member_rows[:, members].min(axis=0).max()
        # Smallest distance from the cluster to any outside point.
        min_out_cluster_distance = member_rows[:, ~members].min(axis=0).min()
        # single data point clusters only have that inf diagonal here
        if members.sum() > 1:
            assert max_in_cluster_distance < distance_threshold
        assert min_out_cluster_distance >= distance_threshold
@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
@pytest.mark.parametrize(
    ("threshold", "y_true"), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])]
)
def test_agglomerative_clustering_with_distance_threshold_edge_case(
    linkage, threshold, y_true
):
    """Boundary case where `distance_threshold` matches the distance.

    The two samples are at distance 1: thresholds 0.5 and 1.0 are expected to
    keep them apart, while 1.5 is expected to merge them.
    """
    two_points = [[0], [1]]
    model = AgglomerativeClustering(
        n_clusters=None, distance_threshold=threshold, linkage=linkage
    )
    predicted = model.fit_predict(two_points)
    assert adjusted_rand_score(y_true, predicted) == 1
def test_dist_threshold_invalid_parameters():
    """Invalid `n_clusters` / `distance_threshold` combinations are rejected."""
    X = [[0], [1]]
    exactly_one_msg = "Exactly one of "

    # Neither of the two parameters is set.
    with pytest.raises(ValueError, match=exactly_one_msg):
        AgglomerativeClustering(n_clusters=None, distance_threshold=None).fit(X)

    # Both parameters are set at the same time.
    with pytest.raises(ValueError, match=exactly_one_msg):
        AgglomerativeClustering(n_clusters=2, distance_threshold=1).fit(X)

    # A distance threshold needs the full tree.
    with pytest.raises(ValueError, match="compute_full_tree must be True if"):
        AgglomerativeClustering(
            n_clusters=None, distance_threshold=1, compute_full_tree=False
        ).fit(X)
def test_invalid_shape_precomputed_dist_matrix():
    """A non-square matrix is rejected when metric='precomputed' (PR #16257)."""
    non_square = np.random.RandomState(0).rand(5, 3)
    expected_message = (
        r"Distance matrix should be square, got matrix of shape \(5, 3\)"
    )
    with pytest.raises(ValueError, match=expected_message):
        AgglomerativeClustering(metric="precomputed", linkage="complete").fit(
            non_square
        )
def test_precomputed_connectivity_metric_with_2_connected_components():
    """Components are connected when connectivity and metric are precomputed.

    The connectivity matrix has two connected components. Fitting on a
    precomputed distance matrix and fitting on the raw data must both warn
    that the connectivity is being completed, and must give the same labels
    and children. Non-regression test for #16151.
    """
    connectivity_matrix = np.array(
        [
            [0, 1, 1, 0, 0],
            [0, 0, 1, 0, 0],
            [0, 0, 0, 0, 0],
            [0, 0, 0, 0, 1],
            [0, 0, 0, 0, 0],
        ]
    )
    # ensure that connectivity_matrix has two connected components
    n_connected, _ = connected_components(connectivity_matrix)
    assert n_connected == 2

    X = np.random.RandomState(0).randn(5, 10)
    X_dist = pairwise_distances(X)
    warning_message = "Completing it to avoid stopping the tree early"

    model_precomputed = AgglomerativeClustering(
        metric="precomputed", connectivity=connectivity_matrix, linkage="complete"
    )
    with pytest.warns(UserWarning, match=warning_message):
        model_precomputed.fit(X_dist)

    model_raw = AgglomerativeClustering(
        connectivity=connectivity_matrix, linkage="complete"
    )
    with pytest.warns(UserWarning, match=warning_message):
        model_raw.fit(X)

    assert_array_equal(model_raw.labels_, model_precomputed.labels_)
    assert_array_equal(model_raw.children_, model_precomputed.children_)

View File

@@ -0,0 +1,218 @@
"""
Testing for mean shift clustering methods
"""
import warnings
import numpy as np
import pytest
from sklearn.cluster import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift
from sklearn.datasets import make_blobs
from sklearn.metrics import v_measure_score
from sklearn.utils._testing import assert_allclose, assert_array_equal
# Module-level fixture shared by the tests below: 300 two-dimensional samples
# drawn around three centers (shifted by +10) with a fixed random state.
n_clusters = 3
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(
    n_samples=300,
    n_features=2,
    centers=centers,
    cluster_std=0.4,
    shuffle=True,
    random_state=11,
)
def test_convergence_of_1d_constant_data():
    """MeanShift converges before `max_iter` on 1D constant data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28926
    """
    constant_column = np.ones(10).reshape(-1, 1)
    model = MeanShift()
    model.fit(constant_column)
    assert model.n_iter_ < model.max_iter
def test_estimate_bandwidth():
    """The bandwidth estimated from 200 samples lies within [0.9, 1.5]."""
    lower_bound, upper_bound = 0.9, 1.5
    estimated = estimate_bandwidth(X, n_samples=200)
    assert lower_bound <= estimated <= upper_bound
def test_estimate_bandwidth_1sample(global_dtype):
    """`estimate_bandwidth` with n_samples=1 and quantile<1 returns ~0.

    In that configuration n_neighbors is set to 1.
    """
    data = X.astype(global_dtype, copy=False)
    bandwidth = estimate_bandwidth(data, n_samples=1, quantile=0.3)

    # NOTE(review): the dtype is compared with the module-level `X`, not with
    # `global_dtype` -- confirm that this is the intended expectation.
    assert bandwidth.dtype == X.dtype
    assert bandwidth == pytest.approx(0.0, abs=1e-5)
@pytest.mark.parametrize(
    "bandwidth, cluster_all, expected, first_cluster_label",
    [(1.2, True, 3, 0), (1.2, False, 4, -1)],
)
def test_mean_shift(
    global_dtype, bandwidth, cluster_all, expected, first_cluster_label
):
    """Check the MeanShift estimator and the `mean_shift` function.

    Both must find the expected number of distinct labels, start at the
    expected smallest label and keep the input dtype for the centers.
    """
    data = X.astype(global_dtype, copy=False)

    # Estimator API, with an explicit bandwidth.
    estimator = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
    estimator_labels = np.unique(estimator.fit(data).labels_)
    assert len(estimator_labels) == expected
    assert estimator_labels[0] == first_cluster_label
    assert estimator.cluster_centers_.dtype == global_dtype

    # Function API; no bandwidth is passed here.
    cluster_centers, raw_labels = mean_shift(data, cluster_all=cluster_all)
    function_labels = np.unique(raw_labels)
    assert len(function_labels) == expected
    assert function_labels[0] == first_cluster_label
    assert cluster_centers.dtype == global_dtype
# TODO: remove mark once loky bug is fixed:
# https://github.com/joblib/loky/issues/458
@pytest.mark.thread_unsafe
def test_parallel(global_dtype, global_random_seed):
    """MeanShift gives the same result with `n_jobs=2` as with the default."""
    blob_centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
    samples, _ = make_blobs(
        n_samples=50,
        n_features=2,
        centers=blob_centers,
        cluster_std=0.4,
        shuffle=True,
        random_state=global_random_seed,
    )
    samples = samples.astype(global_dtype, copy=False)

    parallel_model = MeanShift(n_jobs=2)
    parallel_model.fit(samples)

    sequential_model = MeanShift()
    sequential_model.fit(samples)

    assert_allclose(
        parallel_model.cluster_centers_, sequential_model.cluster_centers_
    )
    assert (
        parallel_model.cluster_centers_.dtype
        == sequential_model.cluster_centers_.dtype
    )
    assert_array_equal(parallel_model.labels_, sequential_model.labels_)
def test_meanshift_predict(global_dtype):
    """Check that `MeanShift.predict` agrees with `fit_predict` on the same data."""
    estimator = MeanShift(bandwidth=1.2)
    data = X.astype(global_dtype, copy=False)

    fitted_labels = estimator.fit_predict(data)
    predicted_labels = estimator.predict(data)

    assert_array_equal(fitted_labels, predicted_labels)
def test_meanshift_all_orphans():
    """Seeds placed away from the data must make `fit` raise a clear error."""
    far_seeds = [[-9, -9], [-10, -10]]
    estimator = MeanShift(bandwidth=0.1, seeds=far_seeds)

    expected_message = "No point was within bandwidth=0.1"
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(X)
def test_unfitted():
    """Non-regression: no fitted attribute may exist before `fit` is called."""
    estimator = MeanShift()
    assert not hasattr(estimator, "cluster_centers_")
    assert not hasattr(estimator, "labels_")
def test_cluster_intensity_tie(global_dtype):
    """Check the labelling of two 3-point groups for both sample orderings.

    Whichever group comes first in the input, the group made of
    (4, 7), (3, 5), (3, 6) is expected to receive label 0.
    """
    low_first = np.array(
        [[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=global_dtype
    )
    fit_low_first = MeanShift(bandwidth=2).fit(low_first)

    high_first = np.array(
        [[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]], dtype=global_dtype
    )
    fit_high_first = MeanShift(bandwidth=2).fit(high_first)

    assert_array_equal(fit_low_first.labels_, [1, 1, 1, 0, 0, 0])
    assert_array_equal(fit_high_first.labels_, [0, 0, 0, 1, 1, 1])
def test_bin_seeds(global_dtype):
    """Exercise `get_bin_seeds`, the bin seeding usable by mean shift."""
    # The data is just 6 points in the plane.
    points = np.array(
        [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]],
        dtype=global_dtype,
    )

    # Bin coarseness 1.0 and min_bin_freq 1: 3 bins should be found.
    expected_seeds = {(1.0, 1.0), (2.0, 1.0), (0.0, 0.0)}
    found_seeds = {tuple(seed) for seed in get_bin_seeds(points, 1, 1)}
    assert found_seeds == expected_seeds

    # Bin coarseness 1.0 and min_bin_freq 2: 2 bins should be found.
    expected_seeds = {(1.0, 1.0), (2.0, 1.0)}
    found_seeds = {tuple(seed) for seed in get_bin_seeds(points, 1, 2)}
    assert found_seeds == expected_seeds

    # Bin size 0.01 and min_bin_freq 1: 6 bins would be found, so the
    # function bails out and uses the whole data instead.
    with warnings.catch_warnings(record=True):
        seeds = get_bin_seeds(points, 0.01, 1)
    assert_allclose(seeds, points)

    # Tight clusters around [0, 0] and [1, 1]: only two bins are obtained.
    blobs, _ = make_blobs(
        n_samples=100,
        n_features=2,
        centers=[[0, 0], [1, 1]],
        cluster_std=0.1,
        random_state=0,
    )
    blobs = blobs.astype(global_dtype, copy=False)
    seeds = get_bin_seeds(blobs, 1)
    assert_array_equal(seeds, [[0, 0], [1, 1]])
@pytest.mark.parametrize("max_iter", [1, 100])
def test_max_iter(max_iter):
    """Check `max_iter` is honoured and both APIs return the same centers."""
    function_centers, _ = mean_shift(X, max_iter=max_iter)

    estimator = MeanShift(max_iter=max_iter).fit(X)
    estimator_centers = estimator.cluster_centers_

    assert estimator.n_iter_ <= estimator.max_iter
    assert len(function_centers) == len(estimator_centers)
    for from_function, from_estimator in zip(function_centers, estimator_centers):
        assert np.allclose(from_function, from_estimator)
def test_mean_shift_zero_bandwidth(global_dtype):
    """Check that mean shift works when the estimated bandwidth is 0."""
    column = np.array([1, 1, 1, 2, 2, 2, 3, 3], dtype=global_dtype)
    X = column.reshape(-1, 1)

    # With default arguments, estimate_bandwidth returns 0 on this dataset.
    bandwidth = estimate_bandwidth(X)
    assert bandwidth == 0

    # A bin_size of 0 makes get_bin_seeds return the dataset itself.
    seeds = get_bin_seeds(X, bin_size=bandwidth)
    assert seeds is X

    # Binning with a 0 estimated bandwidth should be equivalent to no binning.
    with_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X)
    without_binning = MeanShift(bin_seeding=False).fit(X)
    expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2])

    assert v_measure_score(with_binning.labels_, expected_labels) == pytest.approx(1)
    assert v_measure_score(without_binning.labels_, expected_labels) == pytest.approx(
        1
    )
    assert_allclose(with_binning.cluster_centers_, without_binning.cluster_centers_)

View File

@@ -0,0 +1,874 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
import numpy as np
import pytest
from sklearn.cluster import DBSCAN, OPTICS
from sklearn.cluster._optics import _extend_region, _extract_xi_labels
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.datasets import make_blobs
from sklearn.exceptions import DataConversionWarning, EfficiencyWarning
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS
# Shared dataset for this module: six Gaussian blobs of 10 points each, with
# different spreads, drawn from a fixed-seed generator. The draws must stay in
# this exact order, since some tests hard-code expectations for these samples.
rng = np.random.RandomState(0)
n_points_per_cluster = 10

C1 = rng.randn(n_points_per_cluster, 2) * 0.8 + [-5, -2]
C2 = rng.randn(n_points_per_cluster, 2) * 0.1 + [4, -1]
C3 = rng.randn(n_points_per_cluster, 2) * 0.2 + [1, -2]
C4 = rng.randn(n_points_per_cluster, 2) * 0.3 + [-2, 3]
C5 = rng.randn(n_points_per_cluster, 2) * 1.6 + [3, -2]
C6 = rng.randn(n_points_per_cluster, 2) * 2 + [5, 6]

X = np.concatenate([C1, C2, C3, C4, C5, C6], axis=0)
@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        [[10, 8.9, 8.8, 8.7, 7, 10], 3],
        [[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0],
        [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
    ],
)
def test_extend_downward(r_plot, end):
    """Check where `_extend_region` stops when extending a downward region.

    Parameters
    ----------
    r_plot : list of float
        Reachability values; consecutive ratios define the steep-downward
        and upward masks passed to `_extend_region`.
    end : int
        Expected index returned by `_extend_region` when starting at 0 with
        a limit of 2 (the last positional argument).
    """
    r_plot = np.array(r_plot)
    # ratio[i] compares point i with its successor i + 1.
    ratio = r_plot[:-1] / r_plot[1:]
    steep_downward = ratio >= 1 / 0.9
    upward = ratio < 1

    e = _extend_region(steep_downward, upward, 0, 2)
    assert e == end
@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        [[1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6],
        [[1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0],
        [[1, 2, 2.1, 2, np.inf], 0],
        [[1, 2, 2.1, np.inf], 2],
    ],
)
def test_extend_upward(r_plot, end):
    """Check where `_extend_region` stops when extending an upward region."""
    reachability = np.array(r_plot)
    # Each ratio compares a point with its successor.
    successive_ratio = reachability[:-1] / reachability[1:]
    is_steep_upward = successive_ratio <= 0.9
    is_downward = successive_ratio > 1

    assert _extend_region(is_steep_upward, is_downward, 0, 2) == end
@pytest.mark.parametrize(
    ("ordering", "clusters", "expected"),
    [
        [[0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]],
        [[0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]],
        [[0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]],
        [[3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]],
    ],
)
def test_the_extract_xi_labels(ordering, clusters, expected):
    """Check the labels `_extract_xi_labels` derives from cluster intervals."""
    computed = _extract_xi_labels(ordering, clusters)
    assert_array_equal(computed, expected)
def test_extract_xi(global_dtype):
    """Check xi cluster extraction on easy data containing clear noise.

    This is a small and easy case (no clusters around other clusters).
    `global_random_seed` is not used here since the expected labels are
    hard-coded for these specific data.
    """
    rng = np.random.RandomState(0)
    n_points_per_cluster = 5

    # (center, scale) of every blob; the draws must stay in this order.
    blob_specs = [
        ([-5, -2], 0.8),
        ([4, -1], 0.1),
        ([1, -2], 0.2),
        ([-2, 3], 0.3),
        ([3, -2], 0.6),
        ([5, 6], 0.2),
    ]
    C1, C2, C3, C4, C5, C6 = (
        center + scale * rng.randn(n_points_per_cluster, 2)
        for center, scale in blob_specs
    )

    # One isolated point acts as noise.
    single_outlier = np.array([[100, 100]])
    X = np.vstack((C1, C2, C3, C4, C5, single_outlier, C6)).astype(
        global_dtype, copy=False
    )
    expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    # Integer sizes first, then float min_samples and min_cluster_size.
    for min_samples, min_cluster_size in [(3, 2), (0.1, 0.08)]:
        clust = OPTICS(
            min_samples=min_samples,
            min_cluster_size=min_cluster_size,
            max_eps=20,
            cluster_method="xi",
            xi=0.4,
        ).fit(X)
        assert_array_equal(clust.labels_, expected_labels)

    # Two isolated points, and a larger minimum cluster size.
    double_outlier = np.array([[100, 100]] * 2)
    X = np.vstack((C1, C2, C3, C4, C5, double_outlier, C6)).astype(
        global_dtype, copy=False
    )
    expected_labels = np.r_[
        [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5
    ]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
    clust = OPTICS(
        min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3
    ).fit(X)
    # this may fail if the predecessor correction is not at work!
    assert_array_equal(clust.labels_, expected_labels)

    # Three hand-written groups at very different scales.
    C1 = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]]
    C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
    C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
    X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False)
    expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
    clust = OPTICS(
        min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04
    ).fit(X)
    assert_array_equal(clust.labels_, expected_labels)
def test_cluster_hierarchy(global_dtype, global_random_seed):
    """Check `cluster_hierarchy_` for a tight blob nested inside a wide one."""
    rng = np.random.RandomState(global_random_seed)
    n_points_per_cluster = 100

    tight = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2).astype(
        global_dtype, copy=False
    )
    wide = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2).astype(
        global_dtype, copy=False
    )
    X = shuffle(np.vstack((tight, wide)), random_state=rng)

    hierarchy = OPTICS(min_samples=20, xi=0.2).fit(X).cluster_hierarchy_
    assert hierarchy.shape == (2, 2)

    # The first cluster should contain all points from the tight blob, but due
    # to how the data is generated, some points from the wide blob may end up
    # in it.
    first_size = np.diff(hierarchy[0]) + 1
    assert 100 <= first_size <= 115

    # The second cluster should contain all points from both blobs.
    last_size = np.diff(hierarchy[-1]) + 1
    assert last_size == 200
@pytest.mark.parametrize(
    "csr_container, metric",
    [(None, "minkowski")] + [(container, "euclidean") for container in CSR_CONTAINERS],
)
def test_correct_number_of_clusters(metric, csr_container):
    """Check the number of clusters found and the fitted attribute layout."""
    # in 'auto' mode
    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    n_samples = len(X)

    # Parameters chosen specifically for this task.
    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric)
    if csr_container is not None:
        clust.fit(csr_container(X))
    else:
        clust.fit(X)

    # Number of clusters, ignoring noise if present.
    found = len(set(clust.labels_)) - int(-1 in clust.labels_)
    assert found == n_clusters

    # Check attribute types and sizes.
    assert clust.labels_.shape == (n_samples,)
    assert clust.labels_.dtype.kind == "i"

    assert clust.reachability_.shape == (n_samples,)
    assert clust.reachability_.dtype.kind == "f"

    assert clust.core_distances_.shape == (n_samples,)
    assert clust.core_distances_.dtype.kind == "f"

    assert clust.ordering_.shape == (n_samples,)
    assert clust.ordering_.dtype.kind == "i"
    # The ordering must be a permutation of all sample indices.
    assert set(clust.ordering_) == set(range(n_samples))
def test_minimum_number_of_sample_check():
    """Check that `fit` rejects `min_samples` above the number of samples."""
    single_sample = [[1, 1]]
    estimator = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1.0)

    # Fitting is what triggers the validation error.
    with pytest.raises(ValueError, match="min_samples must be no greater than"):
        estimator.fit(single_sample)
def test_bad_extract():
    """Check the error raised when the extraction eps exceeds `max_eps`."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    # eps=0.3 is requested while max_eps is only 5.0 * 0.03.
    estimator = OPTICS(
        max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10
    )
    expected_message = "Specify an epsilon smaller than 0.15. Got 0.3."
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(X)
def test_bad_reachability():
    """Check the warning emitted when every reachability value is inf."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    expected_message = "All reachability values are inf. Set a larger max_eps."
    with pytest.warns(UserWarning, match=expected_message):
        estimator = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
        estimator.fit(X)
def test_nowarn_if_metric_bool_data_bool():
    """Check no conversion warning is raised for a boolean metric on bool data.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/18996
    """
    boolean_metric = "rogerstanimoto"
    bool_data = np.random.randint(2, size=(5, 2), dtype=bool)

    with warnings.catch_warnings():
        # Any DataConversionWarning is turned into a failure.
        warnings.simplefilter("error", DataConversionWarning)
        OPTICS(metric=boolean_metric).fit(bool_data)
def test_warn_if_metric_bool_data_no_bool():
    """Check a *single* conversion warning for a boolean metric on int data.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/18996
    """
    pairwise_metric = "rogerstanimoto"
    int_data = np.random.randint(2, size=(5, 2), dtype=np.int32)
    expected_message = (
        f"Data will be converted to boolean for metric {pairwise_metric}"
    )

    with pytest.warns(DataConversionWarning, match=expected_message) as warn_record:
        # Silence a DeprecationWarning from joblib <= 1.5.1 in Python 3.14+.
        warnings.filterwarnings(
            "ignore",
            message="'asyncio.iscoroutinefunction' is deprecated",
            category=DeprecationWarning,
        )
        OPTICS(metric=pairwise_metric).fit(int_data)
        assert len(warn_record) == 1
def test_nowarn_if_metric_no_bool():
    """Check no conversion warning is raised when the metric is not boolean.

    This must hold no matter what the data type is.
    """
    pairwise_metric = "minkowski"
    X_bool = np.random.randint(2, size=(5, 2), dtype=bool)
    X_num = np.random.randint(2, size=(5, 2), dtype=np.int32)

    with warnings.catch_warnings():
        warnings.simplefilter("error", DataConversionWarning)
        # Fit the boolean data first, then the numeric data.
        for data in (X_bool, X_num):
            OPTICS(metric=pairwise_metric).fit(data)
def test_close_extract():
    """Check extraction when the extraction eps is close to scaled max_eps."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    estimator = OPTICS(
        max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10
    )
    estimator.fit(X)

    # Cluster labels start at 0, so a maximum label of 2 means 3 clusters.
    assert max(estimator.labels_) == 2
@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5])
@pytest.mark.parametrize("min_samples", [3, 10, 20])
@pytest.mark.parametrize(
    "csr_container, metric",
    [(None, "minkowski"), (None, "euclidean")]
    + [(container, "euclidean") for container in CSR_CONTAINERS],
)
def test_dbscan_optics_parity(eps, min_samples, metric, global_dtype, csr_container):
    """Check OPTICS (dbscan extraction) labels differ from DBSCAN by <= 5%."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=150, centers=blob_centers, cluster_std=0.4, random_state=0
    )
    if csr_container is not None:
        X = csr_container(X)
    X = X.astype(global_dtype, copy=False)

    # OPTICS with a dbscan extraction at the parametrized epsilon.
    optics = OPTICS(
        min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric
    ).fit(X)

    # Reference DBSCAN labels for the same epsilon and min_samples.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    contingency = contingency_matrix(dbscan.labels_, optics.labels_)
    best_per_column = np.sum(np.max(contingency, axis=0))
    best_per_row = np.sum(np.max(contingency, axis=1))
    agree = min(best_per_column, best_per_row)
    disagree = X.shape[0] - agree

    percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
def test_min_samples_edge_case(global_dtype):
    """Check xi extraction on 3-point groups as max_eps and min_samples vary."""
    near = [[0, 0], [0, 0.1], [0, -0.1]]
    middle = [[10, 10], [10, 9], [10, 11]]
    far = [[100, 100], [100, 96], [100, 106]]
    X = np.vstack((near, middle, far)).astype(global_dtype, copy=False)

    # max_eps=7: all three groups are expected to be found.
    expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3]
    clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    # max_eps=3: the last group is expected to be labelled as noise.
    expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3]
    clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    # min_samples=4: everything is noise, and a warning is expected.
    expected_labels = np.r_[[-1] * 9]
    with pytest.warns(UserWarning, match="All reachability values"):
        clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
        assert_array_equal(clust.labels_, expected_labels)
# try arbitrary minimum sizes
@pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23))
def test_min_cluster_size(min_cluster_size, global_dtype):
    """Check `min_cluster_size` is respected, as a count and as a fraction."""
    # Keep every other sample only, for speed.
    redX = X[::2].astype(global_dtype, copy=False)

    by_count = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(redX)
    non_noise = by_count.labels_[by_count.labels_ != -1]
    cluster_sizes = np.bincount(non_noise)
    if cluster_sizes.size:
        assert min(cluster_sizes) >= min_cluster_size

    # The behaviour must be the same when min_cluster_size is a fraction.
    by_fraction = OPTICS(
        min_samples=9,
        min_cluster_size=min_cluster_size / redX.shape[0],
    )
    by_fraction.fit(redX)
    assert_array_equal(by_count.labels_, by_fraction.labels_)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_min_cluster_size_invalid2(csr_container):
    """Check `min_cluster_size` above n_samples is rejected (dense and sparse)."""
    too_large = len(X) + 1
    expected_message = "must be no greater than the "

    # Dense input.
    dense_estimator = OPTICS(min_cluster_size=too_large)
    with pytest.raises(ValueError, match=expected_message):
        dense_estimator.fit(X)

    # Sparse input, with the euclidean metric.
    sparse_estimator = OPTICS(min_cluster_size=too_large, metric="euclidean")
    with pytest.raises(ValueError, match=expected_message):
        sparse_estimator.fit(csr_container(X))
def test_processing_order():
    """Check that all unprocessed points are candidates for the next point.

    When picking the next point, every unprocessed point must be considered,
    not only the direct neighbors.
    """
    points_1d = [[0], [10], [-10], [25]]
    fitted = OPTICS(min_samples=3, max_eps=15).fit(points_1d)

    assert_array_equal(fitted.reachability_, [np.inf, 10, 10, 15])
    assert_array_equal(fitted.core_distances_, [10, 15, np.inf, np.inf])
    assert_array_equal(fitted.ordering_, [0, 1, 2, 3])
def test_compare_to_ELKI():
    """Compare OPTICS on the module-level `X` with ELKI reference output.

    Ordering, predecessors and reachabilities are checked against hard-coded
    reference arrays, first with the default `max_eps` and then with
    `max_eps=0.5`.
    """
    # Expected values, computed with (future) ELKI 0.7.5 using:
    # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
    #   -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
    # where the FixedDBIDsFilter gives 0-indexed ids.
    # fmt: off
    r1 = [
        np.inf, 1.0574896366427478, 0.7587934993548423,
        0.7290174038973836, 0.7290174038973836, 0.7290174038973836,
        0.6861627576116127, 0.7587934993548423, 0.9280118450166668,
        1.1748022534146194, 3.3355455741292257, 0.49618389254482587,
        0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
        0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
        0.2552805046961355, 0.3086779122185853, 4.163024452756142,
        1.623152630340929, 0.45315840475822655, 0.25468325192031926,
        0.2254004358159971, 0.18765711877083036, 0.1821471333893275,
        0.1821471333893275, 0.18765711877083036, 0.18765711877083036,
        0.2240202988740153, 1.154337614548715, 1.342604473837069,
        1.323308536402633, 0.8607514948648837, 0.27219111215810565,
        0.13260875220533205, 0.13260875220533205, 0.09890587675958984,
        0.09890587675958984, 0.13548790801634494, 0.1575483940837384,
        0.17515137170530226, 0.17575920159442388, 0.27219111215810565,
        0.6101447895405373, 1.3189208094864302, 1.323308536402633,
        2.2509184159764577, 2.4517810628594527, 3.675977064404973,
        3.8264795626020365, 2.9130735341510614, 2.9130735341510614,
        2.9130735341510614, 2.9130735341510614, 2.8459300127258036,
        2.8459300127258036, 2.8459300127258036, 3.0321982337972537,
    ]
    o1 = [
        0, 3, 6, 4, 7, 8, 2, 9, 5, 1,
        31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
        44, 21, 23, 24, 22, 25, 27, 29, 26, 28,
        20, 40, 45, 46, 10, 15, 11, 13, 17, 19,
        18, 12, 16, 14, 47, 49, 43, 48, 42, 41,
        53, 57, 51, 52, 56, 59, 54, 55, 58, 50,
    ]
    p1 = [
        -1, 0, 3, 6, 6, 6, 8, 3, 7, 5,
        1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
        36, 44, 21, 23, 24, 22, 25, 25, 22, 22,
        22, 21, 40, 45, 46, 10, 15, 15, 13, 13,
        15, 11, 19, 15, 10, 47, 12, 45, 14, 43,
        42, 53, 57, 57, 57, 57, 59, 59, 59, 58,
    ]
    # fmt: on

    # Tests against known extraction array
    # Does NOT work with metric='euclidean', because sklearn euclidean has
    # worse numeric precision. 'minkowski' is slower but more accurate.
    clust1 = OPTICS(min_samples=5).fit(X)
    order1 = clust1.ordering_

    assert_array_equal(order1, np.array(o1))
    assert_array_equal(clust1.predecessor_[order1], np.array(p1))
    assert_allclose(clust1.reachability_[order1], np.array(r1))

    # ELKI currently does not print the core distances (which are not used
    # much in literature), but we can at least ensure this consistency: the
    # reachability of a point is never below its predecessor's core distance.
    for sample in order1[1:]:
        predecessor = clust1.predecessor_[sample]
        assert clust1.reachability_[sample] >= clust1.core_distances_[predecessor]

    # Expected values for max_eps=0.5, computed with (future) ELKI 0.7.5.
    # fmt: off
    r2 = (
        [np.inf] * 11
        + [
            0.27219111215810565, 0.13260875220533205, 0.13260875220533205,
            0.09890587675958984, 0.09890587675958984, 0.13548790801634494,
            0.1575483940837384, 0.17515137170530226, 0.17575920159442388,
            0.27219111215810565, 0.4928068613197889,
            np.inf,
            0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
            0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
            0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
            np.inf,
            0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
            0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
            0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
        ]
        + [np.inf] * 18
    )
    o2 = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
        10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
        47, 46, 20, 22, 25, 23, 27, 29, 24, 26,
        28, 21, 30, 32, 34, 33, 38, 39, 35, 37,
        36, 31, 40, 41, 42, 43, 44, 45, 48, 49,
        50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    ]
    p2 = (
        [-1] * 11
        + [10, 15, 15, 13, 13, 15, 11, 19, 15, 10, 47]
        + [-1, 20, 22, 25, 25, 25, 25, 22, 22, 23]
        + [-1, 30, 30, 34, 34, 34, 32, 32, 37, 38]
        + [-1] * 18
    )
    # fmt: on

    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)
    order2 = clust2.ordering_

    assert_array_equal(order2, np.array(o2))
    assert_array_equal(clust2.predecessor_[order2], np.array(p2))
    assert_allclose(clust2.reachability_[order2], np.array(r2))

    # Core distances not larger than 0.5 must agree between both fits.
    index = np.where(clust1.core_distances_ <= 0.5)[0]
    assert_allclose(clust1.core_distances_[index], clust2.core_distances_[index])
def test_extract_dbscan(global_dtype, global_random_seed):
    """Check dbscan extraction on an easy case: four blobs of equal spread.

    Clusters with different densities are deliberately not included.
    """
    rng = np.random.RandomState(global_random_seed)
    n_points_per_cluster = 20

    # The draws must stay in this order to keep the generated data unchanged.
    blob_centers = ([-5, -2], [4, -1], [1, 2], [-2, 3])
    blobs = [
        center + 0.2 * rng.randn(n_points_per_cluster, 2)
        for center in blob_centers
    ]
    X = np.vstack(blobs).astype(global_dtype, copy=False)

    clust = OPTICS(cluster_method="dbscan", eps=0.5).fit(X)

    # Ignoring noise, exactly the labels 0..3 must be present.
    non_noise = clust.labels_[clust.labels_ != -1]
    assert_array_equal(np.sort(np.unique(non_noise)), [0, 1, 2, 3])
@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS)
def test_precomputed_dists(global_dtype, csr_container):
    """Check precomputed distances give the same result as the raw data."""
    redX = X[::2].astype(global_dtype, copy=False)
    dists = pairwise_distances(redX, metric="euclidean")
    if csr_container is not None:
        dists = csr_container(dists)

    with warnings.catch_warnings():
        # EfficiencyWarning is ignored for the precomputed fit.
        warnings.simplefilter("ignore", EfficiencyWarning)
        from_distances = OPTICS(
            min_samples=10, algorithm="brute", metric="precomputed"
        ).fit(dists)

    from_samples = OPTICS(
        min_samples=10, algorithm="brute", metric="euclidean"
    ).fit(redX)

    assert_allclose(from_distances.reachability_, from_samples.reachability_)
    assert_array_equal(from_distances.labels_, from_samples.labels_)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_optics_input_not_modified_precomputed_sparse_nodiag(
    csr_container, global_random_seed
):
    """Check that we don't modify in-place the pre-computed sparse matrix.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27508
    """
    dense = np.random.RandomState(global_random_seed).rand(6, 6)
    # Put zeros on the diagonal; they become implicit once the sparse matrix
    # is created. If the input were modified in-place, these diagonal zeros
    # would be made explicit.
    np.fill_diagonal(dense, 0)
    X = csr_container(dense)
    rows, cols = X.nonzero()
    assert all(row != col for row, col in zip(rows, cols))

    X_copy = X.copy()
    OPTICS(metric="precomputed").fit(X)

    # The input must be untouched, including no newly explicit 0 values.
    assert X.nnz == X_copy.nnz
    assert_array_equal(X.toarray(), X_copy.toarray())
def test_optics_predecessor_correction_ordering():
    """Check that cluster correction using predecessor is working as expected.

    In the following example, the predecessor correction was not working
    properly since it was not using the right indices.

    This non-regression test checks that reordering the data does not change
    the results.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/26324
    """
    original = np.array([1, 2, 3, 1, 8, 8, 7, 100]).reshape(-1, 1)
    reorder = [0, 1, 2, 4, 5, 6, 7, 3]
    reordered = original[reorder]

    fit_original = OPTICS(min_samples=3, metric="euclidean").fit(original)
    fit_reordered = OPTICS(min_samples=3, metric="euclidean").fit(reordered)

    assert_array_equal(fit_original.labels_[reorder], fit_reordered.labels_)

View File

@@ -0,0 +1,335 @@
"""Testing for Spectral Clustering methods"""
import pickle
import re
import numpy as np
import pytest
from scipy.linalg import LinAlgError
from sklearn.cluster import SpectralClustering, spectral_clustering
from sklearn.cluster._spectral import cluster_qr, discretize
from sklearn.datasets import make_blobs
from sklearn.feature_extraction import img_to_graph
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS
# Record whether the optional pyamg dependency can be imported.
try:
    from pyamg import smoothed_aggregation_solver  # noqa: F401
except ImportError:
    amg_loaded = False
else:
    amg_loaded = True
# Shared dataset for this module: 60 two-dimensional samples drawn around
# three centers that are shifted by +10.
centers = 10 + np.array([[1, 1], [-1, -1], [1, -1]])
X, _ = make_blobs(
    centers=centers,
    n_samples=60,
    n_features=2,
    cluster_std=0.4,
    shuffle=True,
    random_state=0,
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg"))
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering(
    eigen_solver, assign_labels, csr_container, global_random_seed
):
    """Check clustering of a precomputed affinity and pickling of the model.

    The affinity has two blocks (samples 0-2 and samples 3-6) that are only
    weakly linked through sample 3; it is fitted as dense and as sparse input.
    """
    affinity = np.array(
        [
            [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
            [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
        ]
    )
    expected_partition = [1, 1, 1, 0, 0, 0, 0]

    for matrix in (affinity, csr_container(affinity)):
        model = SpectralClustering(
            random_state=global_random_seed,
            n_clusters=2,
            affinity="precomputed",
            eigen_solver=eigen_solver,
            assign_labels=assign_labels,
        ).fit(matrix)

        # Flip the two labels so that the first sample is labelled 1.
        labels = model.labels_
        if labels[0] == 0:
            labels = 1 - labels
        assert adjusted_rand_score(labels, expected_partition) == 1

        # A pickle round trip must keep parameters and labels.
        restored = pickle.loads(pickle.dumps(model))
        assert restored.n_clusters == model.n_clusters
        assert restored.eigen_solver == model.eigen_solver
        assert_array_equal(restored.labels_, model.labels_)
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering_sparse(assign_labels, coo_container, global_random_seed):
    """Check spectral clustering on a sparse (COO) precomputed affinity."""
    X, y = make_blobs(
        n_samples=20,
        random_state=global_random_seed,
        centers=[[1, 1], [-1, -1]],
        cluster_std=0.01,
    )

    # RBF affinity, shifted down by 1e-4 and clipped at zero.
    kernel = rbf_kernel(X, gamma=1)
    kernel = np.maximum(kernel - 1e-4, 0)
    sparse_affinity = coo_container(kernel)

    model = SpectralClustering(
        random_state=global_random_seed,
        n_clusters=2,
        affinity="precomputed",
        assign_labels=assign_labels,
    )
    labels = model.fit(sparse_affinity).labels_

    assert adjusted_rand_score(y, labels) == 1
def test_precomputed_nearest_neighbors_filtering(global_random_seed):
    """Check precomputed graph filtering when it contains too many neighbors."""
    X, y = make_blobs(
        n_samples=250,
        random_state=global_random_seed,
        centers=[[1, 1, 1], [-1, -1, -1]],
        cluster_std=0.01,
    )
    n_neighbors = 2

    def cluster_with_extra_neighbors(extra):
        # Build a distance graph with `n_neighbors + extra` neighbors, then
        # cluster it while asking for `n_neighbors` only.
        nn = NearestNeighbors(n_neighbors=n_neighbors + extra).fit(X)
        graph = nn.kneighbors_graph(X, mode="distance")
        model = SpectralClustering(
            random_state=global_random_seed,
            n_clusters=2,
            affinity="precomputed_nearest_neighbors",
            n_neighbors=n_neighbors,
        )
        return model.fit(graph).labels_

    results = [cluster_with_extra_neighbors(extra) for extra in [0, 10]]

    assert_array_equal(results[0], results[1])
def test_affinities(global_random_seed):
    """Check SpectralClustering with several kinds of affinity."""
    # Note: in the following, random_state has been selected to have
    # a dataset that yields a stable eigen decomposition both when built
    # on OSX and Linux
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # Nearest neighbors affinity: a connectivity warning is expected.
    sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors", random_state=0)
    with pytest.warns(UserWarning, match="not fully connected"):
        sp.fit(X)
    assert adjusted_rand_score(y, sp.labels_) == 1

    # Default affinity with an explicit gamma.
    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=global_random_seed)
    labels = sp.fit(X).labels_
    assert adjusted_rand_score(y, labels) == 1

    X = check_random_state(10).rand(10, 5) * 10
    expected_shape = (X.shape[0],)

    for kern in kernel_metrics():
        # Additive chi^2 gives a negative similarity matrix which
        # doesn't make sense for spectral clustering
        if kern == "additive_chi2":
            continue
        sp = SpectralClustering(n_clusters=2, affinity=kern, random_state=0)
        labels = sp.fit(X).labels_
        assert labels.shape == expected_shape

    # Callable affinity that returns a constant.
    sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1, random_state=0)
    labels = sp.fit(X).labels_
    assert labels.shape == expected_shape

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert kwargs == {}  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0)
    labels = sp.fit(X).labels_
    assert labels.shape == expected_shape
def test_cluster_qr(global_random_seed):
    """Check that `cluster_qr` gives the same labels for float64 and float32.

    cluster_qr by itself should not be used for clustering generic data
    other than the rows of the eigenvectors within spectral clustering,
    but cluster_qr must still preserve the labels for different dtypes
    of the generic fixed input even if the labels may be meaningless.
    """
    rng = np.random.RandomState(seed=global_random_seed)
    n_samples, n_components = 10, 5
    data = rng.randn(n_samples, n_components)

    double_labels = cluster_qr(data.astype(np.float64))
    # Each sample is assigned a cluster identifier.
    assert double_labels.shape == (n_samples,)
    # All components should be covered by the assignment.
    assert np.array_equal(np.unique(double_labels), np.arange(n_components))

    # Single precision data should yield the same cluster assignments.
    single_labels = cluster_qr(data.astype(np.float32))
    assert np.array_equal(double_labels, single_labels)
def test_cluster_qr_permutation_invariance(global_random_seed):
    """Check that `cluster_qr` is invariant to a permutation of the samples."""
    rng = np.random.RandomState(seed=global_random_seed)
    n_samples, n_components = 100, 5
    data = rng.randn(n_samples, n_components)
    perm = rng.permutation(n_samples)

    # Permuting the labels must match labelling the permuted data.
    labels_then_permuted = cluster_qr(data)[perm]
    permuted_then_labelled = cluster_qr(data[perm])
    assert np.array_equal(labels_then_permuted, permuted_then_labelled)
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("n_samples", [50, 100, 150, 500])
def test_discretize(n_samples, coo_container, global_random_seed):
    """Check `discretize` recovers labels from a noisy assignment matrix."""
    rng = np.random.RandomState(seed=global_random_seed)

    for n_class in range(2, 10):
        n_columns = n_class + 1

        # Random class labels, stored as floats.
        y_true = np.array(rng.randint(0, n_columns, n_samples), float)

        # One-hot class assignment matrix, to which noise is then added.
        one_hot = coo_container(
            (np.ones(n_samples), (np.arange(n_samples), y_true)),
            shape=(n_samples, n_columns),
        )
        noisy_assignment = one_hot.toarray() + 0.1 * rng.randn(n_samples, n_columns)

        y_pred = discretize(noisy_assignment, random_state=rng)
        assert adjusted_rand_score(y_true, y_pred) > 0.8
def test_spectral_clustering_with_arpack_amg_solvers(global_random_seed):
    """Check that the arpack and amg solvers give the same clustering.

    Based on the toy example from plot_segmentation_toy.py, using a small
    image of two coins. Without pyamg, requesting the amg solver must raise.
    """
    rows, cols = np.indices((40, 40))

    def disk(center, radius):
        # Boolean mask of the pixels strictly inside the given circle.
        return (rows - center[0]) ** 2 + (cols - center[1]) ** 2 < radius**2

    circles = disk((14, 12), 8) | disk((20, 25), 7)
    mask = circles.copy()
    img = circles.astype(float)

    graph = img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    labels_arpack = spectral_clustering(
        graph, n_clusters=2, eigen_solver="arpack", random_state=global_random_seed
    )
    assert len(np.unique(labels_arpack)) == 2

    if not amg_loaded:
        with pytest.raises(ValueError):
            spectral_clustering(graph, n_clusters=2, eigen_solver="amg", random_state=0)
    else:
        labels_amg = spectral_clustering(
            graph, n_clusters=2, eigen_solver="amg", random_state=global_random_seed
        )
        assert adjusted_rand_score(labels_arpack, labels_amg) == 1
def test_n_components(global_random_seed):
    """Check the effect of ``n_components`` on ``SpectralClustering``.

    Leaving ``n_components`` unset must give the same labels as setting
    it equal to ``n_clusters``, while combining ``n_components=2`` with
    the default number of clusters must give different labels.
    """
    X, y = make_blobs(
        n_samples=20,
        random_state=global_random_seed,
        centers=[[1, 1], [-1, -1]],
        cluster_std=0.01,
    )

    # Reference fit: n_components left unset.
    reference = SpectralClustering(n_clusters=2, random_state=global_random_seed)
    reference.fit(X)
    labels = reference.labels_

    # Explicitly set n_components equal to n_clusters.
    same_ncomp = SpectralClustering(
        n_clusters=2, n_components=2, random_state=global_random_seed
    )
    same_ncomp.fit(X)
    labels_same_ncomp = same_ncomp.labels_

    # n_components defaults to n_clusters, so both fits must agree.
    assert_array_equal(labels, labels_same_ncomp)

    # n_components must affect the result:
    # n_clusters=8 by default, and n_components is set to 2.
    diff_ncomp = SpectralClustering(n_components=2, random_state=global_random_seed)
    diff_ncomp.fit(X)
    labels_diff_ncomp = diff_ncomp.labels_

    assert not np.array_equal(labels, labels_diff_ncomp)
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_verbose(assign_labels, capsys):
    """Check the verbose output of ``SpectralClustering`` for each strategy.

    Parameters
    ----------
    assign_labels : str
        Label assignment strategy forwarded to ``SpectralClustering``.
    capsys : fixture
        Pytest fixture used to capture what the estimator prints.
    """
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # Forward the parametrized strategy: without it every parametrization
    # would fit the default estimator and the non-kmeans cases would test
    # nothing new.
    SpectralClustering(
        n_clusters=2, random_state=42, verbose=1, assign_labels=assign_labels
    ).fit(X)

    captured = capsys.readouterr()

    assert re.search(r"Computing label assignment using", captured.out)

    if assign_labels == "kmeans":
        # The extra k-means progress messages are only checked for kmeans.
        assert re.search(r"Initialization complete", captured.out)
        assert re.search(r"Iteration [0-9]+, inertia", captured.out)
def test_spectral_clustering_np_matrix_raises():
    """Check that passing an ``np.matrix`` to spectral_clustering raises a
    ``TypeError`` with an informative message. See #10993"""
    affinity = np.matrix([[0.0, 2.0], [2.0, 0.0]])

    with pytest.raises(
        TypeError,
        match=r"np\.matrix is not supported. Please convert to a numpy array",
    ):
        spectral_clustering(affinity)
def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch):
    """Check that discretize raises LinAlgError when svd never converges.

    ``np.linalg.svd`` is replaced by a stub that always raises, and
    ``discretize`` is expected to raise ``LinAlgError`` with the message
    "SVD did not converge".

    Non-regression test for #21380
    """

    def _always_failing_svd(*args, **kwargs):
        # Stand-in for an SVD that never converges.
        raise LinAlgError()

    monkeypatch.setattr(np.linalg, "svd", _always_failing_svd)

    with pytest.raises(LinAlgError, match="SVD did not converge"):
        discretize(np.ones((10, 4)))