Videre
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,68 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.manifold import ClassicalMDS
|
||||
from sklearn.metrics import euclidean_distances
|
||||
|
||||
|
||||
def test_classical_mds_equivalent_to_pca():
|
||||
X, _ = load_iris(return_X_y=True)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="euclidean")
|
||||
pca = PCA(n_components=2)
|
||||
|
||||
Z1 = cmds.fit_transform(X)
|
||||
Z2 = pca.fit_transform(X)
|
||||
|
||||
# Swap the signs if necessary
|
||||
for comp in range(2):
|
||||
if Z1[0, comp] < 0 and Z2[0, comp] > 0:
|
||||
Z2[:, comp] *= -1
|
||||
|
||||
assert_allclose(Z1, Z2)
|
||||
|
||||
assert_allclose(np.sqrt(cmds.eigenvalues_), pca.singular_values_)
|
||||
|
||||
|
||||
def test_classical_mds_equivalent_on_data_and_distances():
|
||||
X, _ = load_iris(return_X_y=True)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="euclidean")
|
||||
Z1 = cmds.fit_transform(X)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="precomputed")
|
||||
Z2 = cmds.fit_transform(euclidean_distances(X))
|
||||
|
||||
assert_allclose(Z1, Z2)
|
||||
|
||||
|
||||
def test_classical_mds_wrong_inputs():
|
||||
# Non-symmetric input
|
||||
dissim = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
|
||||
with pytest.raises(ValueError, match="Array must be symmetric"):
|
||||
ClassicalMDS(metric="precomputed").fit(dissim)
|
||||
|
||||
# Non-square input
|
||||
dissim = np.array([[0, 1, 2], [3, 4, 5]])
|
||||
with pytest.raises(ValueError, match="array must be 2-dimensional and square"):
|
||||
ClassicalMDS(metric="precomputed").fit(dissim)
|
||||
|
||||
|
||||
def test_classical_mds_metric_params():
|
||||
X, _ = load_iris(return_X_y=True)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="euclidean")
|
||||
Z1 = cmds.fit_transform(X)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="minkowski", metric_params={"p": 2})
|
||||
Z2 = cmds.fit_transform(X)
|
||||
|
||||
assert_allclose(Z1, Z2)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="minkowski", metric_params={"p": 1})
|
||||
Z3 = cmds.fit_transform(X)
|
||||
|
||||
assert not np.allclose(Z1, Z3)
|
||||
@@ -0,0 +1,348 @@
|
||||
import math
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import rand as sparse_rand
|
||||
|
||||
from sklearn import clone, datasets, manifold, neighbors, pipeline, preprocessing
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
eigen_solvers = ["auto", "dense", "arpack"]
|
||||
path_methods = ["auto", "FW", "D"]
|
||||
|
||||
|
||||
def create_sample_data(dtype, n_pts=25, add_noise=False):
|
||||
# grid of equidistant points in 2D, n_components = n_dim
|
||||
n_per_side = int(math.sqrt(n_pts))
|
||||
X = np.array(list(product(range(n_per_side), repeat=2))).astype(dtype, copy=False)
|
||||
if add_noise:
|
||||
# add noise in a third dimension
|
||||
rng = np.random.RandomState(0)
|
||||
noise = 0.1 * rng.randn(n_pts, 1).astype(dtype, copy=False)
|
||||
X = np.concatenate((X, noise), 1)
|
||||
return X
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
|
||||
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
|
||||
@pytest.mark.parametrize("path_method", path_methods)
|
||||
def test_isomap_simple_grid(
|
||||
global_dtype, n_neighbors, radius, eigen_solver, path_method
|
||||
):
|
||||
# Isomap should preserve distances when all neighbors are used
|
||||
n_pts = 25
|
||||
X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=False)
|
||||
|
||||
# distances from each point to all others
|
||||
if n_neighbors is not None:
|
||||
G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance")
|
||||
else:
|
||||
G = neighbors.radius_neighbors_graph(X, radius, mode="distance")
|
||||
|
||||
clf = manifold.Isomap(
|
||||
n_neighbors=n_neighbors,
|
||||
radius=radius,
|
||||
n_components=2,
|
||||
eigen_solver=eigen_solver,
|
||||
path_method=path_method,
|
||||
)
|
||||
clf.fit(X)
|
||||
|
||||
if n_neighbors is not None:
|
||||
G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
|
||||
else:
|
||||
G_iso = neighbors.radius_neighbors_graph(
|
||||
clf.embedding_, radius, mode="distance"
|
||||
)
|
||||
atol = 1e-5 if global_dtype == np.float32 else 0
|
||||
assert_allclose_dense_sparse(G, G_iso, atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
|
||||
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
|
||||
@pytest.mark.parametrize("path_method", path_methods)
|
||||
def test_isomap_reconstruction_error(
|
||||
global_dtype, n_neighbors, radius, eigen_solver, path_method
|
||||
):
|
||||
if global_dtype is np.float32:
|
||||
pytest.skip(
|
||||
"Skipping test due to numerical instabilities on float32 data"
|
||||
"from KernelCenterer used in the reconstruction_error method"
|
||||
)
|
||||
|
||||
# Same setup as in test_isomap_simple_grid, with an added dimension
|
||||
n_pts = 25
|
||||
X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=True)
|
||||
|
||||
# compute input kernel
|
||||
if n_neighbors is not None:
|
||||
G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray()
|
||||
else:
|
||||
G = neighbors.radius_neighbors_graph(X, radius, mode="distance").toarray()
|
||||
centerer = preprocessing.KernelCenterer()
|
||||
K = centerer.fit_transform(-0.5 * G**2)
|
||||
|
||||
clf = manifold.Isomap(
|
||||
n_neighbors=n_neighbors,
|
||||
radius=radius,
|
||||
n_components=2,
|
||||
eigen_solver=eigen_solver,
|
||||
path_method=path_method,
|
||||
)
|
||||
clf.fit(X)
|
||||
|
||||
# compute output kernel
|
||||
if n_neighbors is not None:
|
||||
G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
|
||||
else:
|
||||
G_iso = neighbors.radius_neighbors_graph(
|
||||
clf.embedding_, radius, mode="distance"
|
||||
)
|
||||
G_iso = G_iso.toarray()
|
||||
K_iso = centerer.fit_transform(-0.5 * G_iso**2)
|
||||
|
||||
# make sure error agrees
|
||||
reconstruction_error = np.linalg.norm(K - K_iso) / n_pts
|
||||
atol = 1e-5 if global_dtype == np.float32 else 0
|
||||
assert_allclose(reconstruction_error, clf.reconstruction_error(), atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)])
|
||||
def test_transform(global_dtype, n_neighbors, radius):
|
||||
n_samples = 200
|
||||
n_components = 10
|
||||
noise_scale = 0.01
|
||||
|
||||
# Create S-curve dataset
|
||||
X, y = datasets.make_s_curve(n_samples, random_state=0)
|
||||
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
|
||||
# Compute isomap embedding
|
||||
iso = manifold.Isomap(
|
||||
n_components=n_components, n_neighbors=n_neighbors, radius=radius
|
||||
)
|
||||
X_iso = iso.fit_transform(X)
|
||||
|
||||
# Re-embed a noisy version of the points
|
||||
rng = np.random.RandomState(0)
|
||||
noise = noise_scale * rng.randn(*X.shape)
|
||||
X_iso2 = iso.transform(X + noise)
|
||||
|
||||
# Make sure the rms error on re-embedding is comparable to noise_scale
|
||||
assert np.sqrt(np.mean((X_iso - X_iso2) ** 2)) < 2 * noise_scale
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)])
|
||||
def test_pipeline(n_neighbors, radius, global_dtype):
|
||||
# check that Isomap works fine as a transformer in a Pipeline
|
||||
# only checks that no error is raised.
|
||||
# TODO check that it actually does something useful
|
||||
X, y = datasets.make_blobs(random_state=0)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
clf = pipeline.Pipeline(
|
||||
[
|
||||
("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)),
|
||||
("clf", neighbors.KNeighborsClassifier()),
|
||||
]
|
||||
)
|
||||
clf.fit(X, y)
|
||||
assert 0.9 < clf.score(X, y)
|
||||
|
||||
|
||||
def test_pipeline_with_nearest_neighbors_transformer(global_dtype):
|
||||
# Test chaining NearestNeighborsTransformer and Isomap with
|
||||
# neighbors_algorithm='precomputed'
|
||||
algorithm = "auto"
|
||||
n_neighbors = 10
|
||||
|
||||
X, _ = datasets.make_blobs(random_state=0)
|
||||
X2, _ = datasets.make_blobs(random_state=1)
|
||||
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
X2 = X2.astype(global_dtype, copy=False)
|
||||
|
||||
# compare the chained version and the compact version
|
||||
est_chain = pipeline.make_pipeline(
|
||||
neighbors.KNeighborsTransformer(
|
||||
n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
|
||||
),
|
||||
manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed"),
|
||||
)
|
||||
est_compact = manifold.Isomap(
|
||||
n_neighbors=n_neighbors, neighbors_algorithm=algorithm
|
||||
)
|
||||
|
||||
Xt_chain = est_chain.fit_transform(X)
|
||||
Xt_compact = est_compact.fit_transform(X)
|
||||
assert_allclose(Xt_chain, Xt_compact)
|
||||
|
||||
Xt_chain = est_chain.transform(X2)
|
||||
Xt_compact = est_compact.transform(X2)
|
||||
assert_allclose(Xt_chain, Xt_compact)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"metric, p, is_euclidean",
|
||||
[
|
||||
("euclidean", 2, True),
|
||||
("manhattan", 1, False),
|
||||
("minkowski", 1, False),
|
||||
("minkowski", 2, True),
|
||||
(lambda x1, x2: np.sqrt(np.sum(x1**2 + x2**2)), 2, False),
|
||||
],
|
||||
)
|
||||
def test_different_metric(global_dtype, metric, p, is_euclidean):
|
||||
# Isomap must work on various metric parameters work correctly
|
||||
# and must default to euclidean.
|
||||
X, _ = datasets.make_blobs(random_state=0)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
|
||||
reference = manifold.Isomap().fit_transform(X)
|
||||
embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)
|
||||
|
||||
if is_euclidean:
|
||||
assert_allclose(embedding, reference)
|
||||
else:
|
||||
with pytest.raises(AssertionError, match="Not equal to tolerance"):
|
||||
assert_allclose(embedding, reference)
|
||||
|
||||
|
||||
def test_isomap_clone_bug():
|
||||
# regression test for bug reported in #6062
|
||||
model = manifold.Isomap()
|
||||
for n_neighbors in [10, 15, 20]:
|
||||
model.set_params(n_neighbors=n_neighbors)
|
||||
model.fit(np.random.rand(50, 2))
|
||||
assert model.nbrs_.n_neighbors == n_neighbors
|
||||
|
||||
|
||||
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
|
||||
@pytest.mark.parametrize("path_method", path_methods)
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_sparse_input(
|
||||
global_dtype, eigen_solver, path_method, global_random_seed, csr_container
|
||||
):
|
||||
# TODO: compare results on dense and sparse data as proposed in:
|
||||
# https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
|
||||
X = csr_container(
|
||||
sparse_rand(
|
||||
100,
|
||||
3,
|
||||
density=0.1,
|
||||
format="csr",
|
||||
dtype=global_dtype,
|
||||
random_state=global_random_seed,
|
||||
)
|
||||
)
|
||||
|
||||
iso_dense = manifold.Isomap(
|
||||
n_components=2,
|
||||
eigen_solver=eigen_solver,
|
||||
path_method=path_method,
|
||||
n_neighbors=8,
|
||||
)
|
||||
iso_sparse = clone(iso_dense)
|
||||
|
||||
X_trans_dense = iso_dense.fit_transform(X.toarray())
|
||||
X_trans_sparse = iso_sparse.fit_transform(X)
|
||||
|
||||
assert_allclose(X_trans_sparse, X_trans_dense, rtol=1e-4, atol=1e-4)
|
||||
|
||||
|
||||
def test_isomap_fit_precomputed_radius_graph(global_dtype):
|
||||
# Isomap.fit_transform must yield similar result when using
|
||||
# a precomputed distance matrix.
|
||||
|
||||
X, y = datasets.make_s_curve(200, random_state=0)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
radius = 10
|
||||
|
||||
g = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance")
|
||||
isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="precomputed")
|
||||
isomap.fit(g)
|
||||
precomputed_result = isomap.embedding_
|
||||
|
||||
isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="minkowski")
|
||||
result = isomap.fit_transform(X)
|
||||
atol = 1e-5 if global_dtype == np.float32 else 0
|
||||
assert_allclose(precomputed_result, result, atol=atol)
|
||||
|
||||
|
||||
def test_isomap_fitted_attributes_dtype(global_dtype):
|
||||
"""Check that the fitted attributes are stored accordingly to the
|
||||
data type of X."""
|
||||
iso = manifold.Isomap(n_neighbors=2)
|
||||
|
||||
X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)
|
||||
|
||||
iso.fit(X)
|
||||
|
||||
assert iso.dist_matrix_.dtype == global_dtype
|
||||
assert iso.embedding_.dtype == global_dtype
|
||||
|
||||
|
||||
def test_isomap_dtype_equivalence():
|
||||
"""Check the equivalence of the results with 32 and 64 bits input."""
|
||||
iso_32 = manifold.Isomap(n_neighbors=2)
|
||||
X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
|
||||
iso_32.fit(X_32)
|
||||
|
||||
iso_64 = manifold.Isomap(n_neighbors=2)
|
||||
X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
|
||||
iso_64.fit(X_64)
|
||||
|
||||
assert_allclose(iso_32.dist_matrix_, iso_64.dist_matrix_)
|
||||
|
||||
|
||||
def test_isomap_raise_error_when_neighbor_and_radius_both_set():
|
||||
# Isomap.fit_transform must raise a ValueError if
|
||||
# radius and n_neighbors are provided.
|
||||
|
||||
X, _ = datasets.load_digits(return_X_y=True)
|
||||
isomap = manifold.Isomap(n_neighbors=3, radius=5.5)
|
||||
msg = "Both n_neighbors and radius are provided"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
isomap.fit_transform(X)
|
||||
|
||||
|
||||
def test_multiple_connected_components():
|
||||
# Test that a warning is raised when the graph has multiple components
|
||||
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
|
||||
with pytest.warns(UserWarning, match="number of connected components"):
|
||||
manifold.Isomap(n_neighbors=2).fit(X)
|
||||
|
||||
|
||||
def test_multiple_connected_components_metric_precomputed(global_dtype):
|
||||
# Test that an error is raised when the graph has multiple components
|
||||
# and when X is a precomputed neighbors graph.
|
||||
X = np.array([0, 1, 2, 5, 6, 7])[:, None].astype(global_dtype, copy=False)
|
||||
|
||||
# works with a precomputed distance matrix (dense)
|
||||
X_distances = pairwise_distances(X)
|
||||
with pytest.warns(UserWarning, match="number of connected components"):
|
||||
manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_distances)
|
||||
|
||||
# does not work with a precomputed neighbors graph (sparse)
|
||||
X_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance")
|
||||
with pytest.raises(RuntimeError, match="number of connected components"):
|
||||
manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_graph)
|
||||
|
||||
|
||||
def test_get_feature_names_out():
|
||||
"""Check get_feature_names_out for Isomap."""
|
||||
X, y = make_blobs(random_state=0, n_features=4)
|
||||
n_components = 2
|
||||
|
||||
iso = manifold.Isomap(n_components=n_components)
|
||||
iso.fit_transform(X)
|
||||
names = iso.get_feature_names_out()
|
||||
assert_array_equal([f"isomap{i}" for i in range(n_components)], names)
|
||||
@@ -0,0 +1,171 @@
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import linalg
|
||||
|
||||
from sklearn import manifold, neighbors
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_array_equal,
|
||||
ignore_warnings,
|
||||
)
|
||||
|
||||
eigen_solvers = ["dense", "arpack"]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Test utility routines
|
||||
def test_barycenter_kneighbors_graph(global_dtype):
|
||||
X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype)
|
||||
|
||||
graph = barycenter_kneighbors_graph(X, 1)
|
||||
expected_graph = np.array(
|
||||
[[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype
|
||||
)
|
||||
|
||||
assert graph.dtype == global_dtype
|
||||
|
||||
assert_allclose(graph.toarray(), expected_graph)
|
||||
|
||||
graph = barycenter_kneighbors_graph(X, 2)
|
||||
# check that columns sum to one
|
||||
assert_allclose(np.sum(graph.toarray(), axis=1), np.ones(3))
|
||||
pred = np.dot(graph.toarray(), X)
|
||||
assert linalg.norm(pred - X) / X.shape[0] < 1
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Test LLE by computing the reconstruction error on some manifolds.
|
||||
|
||||
|
||||
def test_lle_simple_grid(global_dtype):
|
||||
# note: ARPACK is numerically unstable, so this test will fail for
|
||||
# some random seeds. We choose 42 because the tests pass.
|
||||
# for arm64 platforms 2 makes the test fail.
|
||||
# TODO: rewrite this test to make less sensitive to the random seed,
|
||||
# irrespective of the platform.
|
||||
rng = np.random.RandomState(42)
|
||||
|
||||
# grid of equidistant points in 2D, n_components = n_dim
|
||||
X = np.array(list(product(range(5), repeat=2)))
|
||||
X = X + 1e-10 * rng.uniform(size=X.shape)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
|
||||
n_components = 2
|
||||
clf = manifold.LocallyLinearEmbedding(
|
||||
n_neighbors=5, n_components=n_components, random_state=rng
|
||||
)
|
||||
tol = 0.1
|
||||
|
||||
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
|
||||
reconstruction_error = linalg.norm(np.dot(N, X) - X, "fro")
|
||||
assert reconstruction_error < tol
|
||||
|
||||
for solver in eigen_solvers:
|
||||
clf.set_params(eigen_solver=solver)
|
||||
clf.fit(X)
|
||||
assert clf.embedding_.shape[1] == n_components
|
||||
reconstruction_error = (
|
||||
linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
|
||||
)
|
||||
|
||||
assert reconstruction_error < tol
|
||||
assert_allclose(clf.reconstruction_error_, reconstruction_error, atol=1e-1)
|
||||
|
||||
# re-embed a noisy version of X using the transform method
|
||||
noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100
|
||||
X_reembedded = clf.transform(X + noise)
|
||||
assert linalg.norm(X_reembedded - clf.embedding_) < tol
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"])
|
||||
@pytest.mark.parametrize("solver", eigen_solvers)
|
||||
def test_lle_manifold(global_dtype, method, solver):
|
||||
rng = np.random.RandomState(0)
|
||||
# similar test on a slightly more complex manifold
|
||||
X = np.array(list(product(np.arange(18), repeat=2)))
|
||||
X = np.c_[X, X[:, 0] ** 2 / 18]
|
||||
X = X + 1e-10 * rng.uniform(size=X.shape)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
n_components = 2
|
||||
|
||||
clf = manifold.LocallyLinearEmbedding(
|
||||
n_neighbors=6, n_components=n_components, method=method, random_state=0
|
||||
)
|
||||
tol = 1.5 if method == "standard" else 3
|
||||
|
||||
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
|
||||
reconstruction_error = linalg.norm(np.dot(N, X) - X)
|
||||
assert reconstruction_error < tol
|
||||
|
||||
clf.set_params(eigen_solver=solver)
|
||||
clf.fit(X)
|
||||
assert clf.embedding_.shape[1] == n_components
|
||||
reconstruction_error = (
|
||||
linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
|
||||
)
|
||||
details = "solver: %s, method: %s" % (solver, method)
|
||||
assert reconstruction_error < tol, details
|
||||
assert (
|
||||
np.abs(clf.reconstruction_error_ - reconstruction_error)
|
||||
< tol * reconstruction_error
|
||||
), details
|
||||
|
||||
|
||||
def test_pipeline():
|
||||
# check that LocallyLinearEmbedding works fine as a Pipeline
|
||||
# only checks that no error is raised.
|
||||
# TODO check that it actually does something useful
|
||||
from sklearn import datasets, pipeline
|
||||
|
||||
X, y = datasets.make_blobs(random_state=0)
|
||||
clf = pipeline.Pipeline(
|
||||
[
|
||||
("filter", manifold.LocallyLinearEmbedding(random_state=0)),
|
||||
("clf", neighbors.KNeighborsClassifier()),
|
||||
]
|
||||
)
|
||||
clf.fit(X, y)
|
||||
assert 0.9 < clf.score(X, y)
|
||||
|
||||
|
||||
# Test the error raised when the weight matrix is singular
|
||||
def test_singular_matrix():
|
||||
M = np.ones((200, 3))
|
||||
f = ignore_warnings
|
||||
with pytest.raises(ValueError, match="Error in determining null-space with ARPACK"):
|
||||
f(
|
||||
manifold.locally_linear_embedding(
|
||||
M,
|
||||
n_neighbors=2,
|
||||
n_components=1,
|
||||
method="standard",
|
||||
eigen_solver="arpack",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# regression test for #6033
|
||||
def test_integer_input():
|
||||
rand = np.random.RandomState(0)
|
||||
X = rand.randint(0, 100, size=(20, 3))
|
||||
|
||||
for method in ["standard", "hessian", "modified", "ltsa"]:
|
||||
clf = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10)
|
||||
clf.fit(X) # this previously raised a TypeError
|
||||
|
||||
|
||||
def test_get_feature_names_out():
|
||||
"""Check get_feature_names_out for LocallyLinearEmbedding."""
|
||||
X, y = make_blobs(random_state=0, n_features=4)
|
||||
n_components = 2
|
||||
|
||||
iso = manifold.LocallyLinearEmbedding(n_components=n_components)
|
||||
iso.fit(X)
|
||||
names = iso.get_feature_names_out()
|
||||
assert_array_equal(
|
||||
[f"locallylinearembedding{i}" for i in range(n_components)], names
|
||||
)
|
||||
@@ -0,0 +1,305 @@
|
||||
from unittest.mock import Mock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal
|
||||
|
||||
from sklearn.datasets import load_digits, load_iris
|
||||
from sklearn.manifold import ClassicalMDS
|
||||
from sklearn.manifold import _mds as mds
|
||||
from sklearn.metrics import euclidean_distances
|
||||
|
||||
|
||||
def test_smacof():
|
||||
# test metric smacof using the data of "Modern Multidimensional Scaling",
|
||||
# Borg & Groenen, p 154
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]])
|
||||
X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1)
|
||||
X_true = np.array(
|
||||
[[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]]
|
||||
)
|
||||
assert_array_almost_equal(X, X_true, decimal=3)
|
||||
|
||||
|
||||
def test_nonmetric_lower_normalized_stress():
|
||||
# Testing that nonmetric MDS results in lower normalized stress compared
|
||||
# compared to metric MDS (non-regression test for issue 27028)
|
||||
X, _ = load_iris(return_X_y=True)
|
||||
sim = euclidean_distances(X)
|
||||
np.random.seed(42)
|
||||
Z = np.random.normal(size=(X.shape[0], 2))
|
||||
|
||||
_, stress1 = mds.smacof(
|
||||
sim, init=Z, n_components=2, max_iter=1000, n_init=1, normalized_stress=True
|
||||
)
|
||||
|
||||
_, stress2 = mds.smacof(
|
||||
sim,
|
||||
init=Z,
|
||||
n_components=2,
|
||||
max_iter=1000,
|
||||
n_init=1,
|
||||
normalized_stress=True,
|
||||
metric=False,
|
||||
)
|
||||
|
||||
assert stress1 > stress2
|
||||
|
||||
# A metric MDS solution (local minimum of the raw stress) can be rescaled to
|
||||
# decrease the stress-1 (which is returned with normalized_stress=True).
|
||||
# The optimal rescaling can be computed analytically, see Borg & Groenen,
|
||||
# Modern Multidimensional Scaling, Chapter 11.1. After rescaling, stress-1
|
||||
# becomes sqrt(s^2 / (1 + s^2)), where s is the value of stress-1 before
|
||||
# rescaling.
|
||||
stress1_rescaled = np.sqrt(stress1**2 / (1 + stress1**2))
|
||||
assert stress1_rescaled > stress2
|
||||
|
||||
|
||||
def test_nonmetric_mds_optimization():
|
||||
# Test that stress is decreasing during nonmetric MDS optimization
|
||||
# (non-regression test for issue 27028)
|
||||
X, _ = load_digits(return_X_y=True)
|
||||
rng = np.random.default_rng(seed=42)
|
||||
ind_subset = rng.choice(len(X), size=200, replace=False)
|
||||
X = X[ind_subset]
|
||||
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
n_init=1,
|
||||
max_iter=2,
|
||||
metric_mds=False,
|
||||
init="random",
|
||||
random_state=42,
|
||||
).fit(X)
|
||||
stress_after_2_iter = mds_est.stress_
|
||||
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
n_init=1,
|
||||
max_iter=3,
|
||||
metric_mds=False,
|
||||
init="random",
|
||||
random_state=42,
|
||||
).fit(X)
|
||||
stress_after_3_iter = mds_est.stress_
|
||||
|
||||
assert stress_after_2_iter > stress_after_3_iter
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric_mds", [True, False])
|
||||
def test_mds_recovers_true_data(metric_mds):
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
n_init=1,
|
||||
eps=1e-15,
|
||||
max_iter=1000,
|
||||
metric_mds=metric_mds,
|
||||
init="random",
|
||||
random_state=42,
|
||||
).fit(X)
|
||||
stress = mds_est.stress_
|
||||
assert_allclose(stress, 0, atol=1e-6)
|
||||
|
||||
|
||||
def test_smacof_error():
|
||||
# Not symmetric similarity matrix:
|
||||
sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
mds.smacof(sim, n_init=1)
|
||||
|
||||
# Not squared similarity matrix:
|
||||
sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
mds.smacof(sim, n_init=1)
|
||||
|
||||
# init not None and not correct format:
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]])
|
||||
with pytest.raises(ValueError):
|
||||
mds.smacof(sim, init=Z, n_init=1)
|
||||
|
||||
|
||||
# TODO: remove mark once loky bug is fixed:
|
||||
# https://github.com/joblib/loky/issues/458
|
||||
@pytest.mark.thread_unsafe
|
||||
def test_MDS():
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
mds_clf = mds.MDS(
|
||||
metric_mds=False,
|
||||
n_jobs=3,
|
||||
n_init=3,
|
||||
metric="precomputed",
|
||||
init="random",
|
||||
)
|
||||
mds_clf.fit(sim)
|
||||
|
||||
|
||||
# TODO(1.10): remove warning filter
|
||||
@pytest.mark.filterwarnings("ignore::FutureWarning")
|
||||
@pytest.mark.parametrize("k", [0.5, 1.5, 2])
|
||||
def test_normed_stress(k):
|
||||
"""Test that non-metric MDS normalized stress is scale-invariant."""
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
X1, stress1 = mds.smacof(sim, metric=False, max_iter=5, random_state=0)
|
||||
X2, stress2 = mds.smacof(k * sim, metric=False, max_iter=5, random_state=0)
|
||||
|
||||
assert_allclose(stress1, stress2, rtol=1e-5)
|
||||
assert_allclose(X1, X2, rtol=1e-5)
|
||||
|
||||
|
||||
# TODO(1.10): remove warning filter
|
||||
@pytest.mark.filterwarnings("ignore::FutureWarning")
|
||||
@pytest.mark.parametrize("metric", [True, False])
|
||||
def test_normalized_stress_auto(metric, monkeypatch):
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(4, 3)
|
||||
dist = euclidean_distances(X)
|
||||
|
||||
mock = Mock(side_effect=mds._smacof_single)
|
||||
monkeypatch.setattr("sklearn.manifold._mds._smacof_single", mock)
|
||||
|
||||
est = mds.MDS(metric=metric, normalized_stress="auto", random_state=rng)
|
||||
est.fit_transform(X)
|
||||
assert mock.call_args[1]["normalized_stress"] != metric
|
||||
|
||||
mds.smacof(dist, metric=metric, normalized_stress="auto", random_state=rng)
|
||||
assert mock.call_args[1]["normalized_stress"] != metric
|
||||
|
||||
|
||||
def test_isotonic_outofbounds():
|
||||
# This particular configuration can trigger out of bounds error
|
||||
# in the isotonic regression (non-regression test for issue 26999)
|
||||
dis = np.array(
|
||||
[
|
||||
[0.0, 1.732050807568877, 1.7320508075688772],
|
||||
[1.732050807568877, 0.0, 6.661338147750939e-16],
|
||||
[1.7320508075688772, 6.661338147750939e-16, 0.0],
|
||||
]
|
||||
)
|
||||
init = np.array(
|
||||
[
|
||||
[0.08665881585055124, 0.7939114643387546],
|
||||
[0.9959834154297658, 0.7555546025640025],
|
||||
[0.8766008278401566, 0.4227358815811242],
|
||||
]
|
||||
)
|
||||
mds.smacof(dis, init=init, metric=False, n_init=1)
|
||||
|
||||
|
||||
# TODO(1.10): remove warning filter
|
||||
@pytest.mark.filterwarnings("ignore::FutureWarning")
|
||||
@pytest.mark.parametrize("normalized_stress", [True, False])
|
||||
def test_returned_stress(normalized_stress):
|
||||
# Test that the final stress corresponds to the final embedding
|
||||
# (non-regression test for issue 16846)
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
D = euclidean_distances(X)
|
||||
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
random_state=42,
|
||||
normalized_stress=normalized_stress,
|
||||
).fit(X)
|
||||
|
||||
Z = mds_est.embedding_
|
||||
stress = mds_est.stress_
|
||||
|
||||
D_mds = euclidean_distances(Z)
|
||||
stress_Z = ((D_mds.ravel() - D.ravel()) ** 2).sum() / 2
|
||||
|
||||
if normalized_stress:
|
||||
stress_Z = np.sqrt(stress_Z / ((D_mds.ravel() ** 2).sum() / 2))
|
||||
|
||||
assert_allclose(stress, stress_Z)
|
||||
|
||||
|
||||
# TODO(1.10): remove warning filter
|
||||
@pytest.mark.filterwarnings("ignore::FutureWarning")
|
||||
@pytest.mark.parametrize("metric_mds", [True, False])
|
||||
def test_convergence_does_not_depend_on_scale(metric_mds):
|
||||
# Test that the number of iterations until convergence does not depend on
|
||||
# the scale of the input data
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
random_state=42,
|
||||
metric_mds=metric_mds,
|
||||
)
|
||||
|
||||
mds_est.fit(X * 100)
|
||||
n_iter1 = mds_est.n_iter_
|
||||
|
||||
mds_est.fit(X / 100)
|
||||
n_iter2 = mds_est.n_iter_
|
||||
|
||||
assert_equal(n_iter1, n_iter2)
|
||||
|
||||
|
||||
# TODO(1.9): delete this test
|
||||
def test_future_warning_n_init():
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
with pytest.warns(FutureWarning):
|
||||
mds.smacof(sim)
|
||||
|
||||
with pytest.warns(FutureWarning):
|
||||
mds.MDS(init="random").fit(X)
|
||||
|
||||
|
||||
# TODO(1.9): delete the n_init warning check
|
||||
# TODO(1.10): delete this test
|
||||
def test_future_warning_init_and_metric():
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
# dissimilarity argument deprecated
|
||||
with pytest.warns(FutureWarning, match="`dissimilarity` parameter is"):
|
||||
mds.MDS(dissimilarity="precomputed", init="random", n_init=1).fit(sim)
|
||||
|
||||
# metric=True deprecated
|
||||
with pytest.warns(FutureWarning, match="Use metric_mds"):
|
||||
mds.MDS(metric=True, init="random", n_init=1).fit(X)
|
||||
|
||||
# metric=False deprecated
|
||||
with pytest.warns(FutureWarning, match="Use metric_mds"):
|
||||
mds.MDS(metric=False, init="random", n_init=1).fit(X)
|
||||
|
||||
# default init will become classical_mds in the future
|
||||
with pytest.warns(FutureWarning, match="The default value of `init`"):
|
||||
mds.MDS(metric="euclidean", n_init=1).fit(X)
|
||||
|
||||
# TODO (1.9): delete this check
|
||||
# n_init=1 will become default in the future
|
||||
with pytest.warns(FutureWarning, match="The default value of `n_init`"):
|
||||
mds.MDS(metric="euclidean", init="random").fit(X)
|
||||
|
||||
# providing both metric and dissimilarity raises an error
|
||||
with pytest.raises(ValueError, match="provided both `dissimilarity`"):
|
||||
mds.MDS(
|
||||
metric="cosine", dissimilarity="euclidean", init="random", n_init=1
|
||||
).fit(X)
|
||||
|
||||
|
||||
# TODO(1.9): remove warning filter
@pytest.mark.filterwarnings("ignore::FutureWarning")
def test_classical_mds_init_to_mds():
    """MDS(init="classical_mds") equals MDS seeded with an explicit classical init.

    Fitting ``MDS(init="classical_mds")`` must give the same embedding as
    fitting an ``MDS(init="random")`` instance while passing the classical
    MDS coordinates explicitly via ``fit_transform(..., init=...)``.
    """
    X, _ = load_iris(return_X_y=True)

    cmds = ClassicalMDS()
    Z_classical = cmds.fit_transform(X)

    mds1 = mds.MDS(init="classical_mds")
    Z1 = mds1.fit_transform(X)

    mds2 = mds.MDS(init="random")
    # Bug fix: previously this reused `mds1` and left `mds2` unused, so the
    # explicit-init path on a differently-configured estimator was never
    # exercised. The explicit `init` must override `init="random"`.
    Z2 = mds2.fit_transform(X, init=Z_classical)

    assert_allclose(Z1, Z2)
|
||||
@@ -0,0 +1,503 @@
|
||||
import itertools
|
||||
from unittest.mock import Mock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import sparse
|
||||
from scipy.linalg import eigh
|
||||
from scipy.sparse.linalg import eigsh, lobpcg
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.manifold import SpectralEmbedding, _spectral_embedding, spectral_embedding
|
||||
from sklearn.manifold._spectral_embedding import (
|
||||
_graph_connected_component,
|
||||
_graph_is_connected,
|
||||
)
|
||||
from sklearn.metrics import normalized_mutual_info_score, pairwise_distances
|
||||
from sklearn.metrics.pairwise import rbf_kernel
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
|
||||
from sklearn.utils.extmath import _deterministic_vector_sign_flip
|
||||
from sklearn.utils.fixes import (
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
parse_version,
|
||||
sp_version,
|
||||
)
|
||||
from sklearn.utils.fixes import laplacian as csgraph_laplacian
|
||||
|
||||
# pyamg is an optional dependency: detect it once at import time so the
# "amg" eigen_solver tests can be skipped cleanly when it is missing.
try:
    from pyamg import smoothed_aggregation_solver  # noqa: F401

    pyamg_available = True
except ImportError:
    pyamg_available = False
# Reusable skip marker for tests that require the "amg" eigen solver.
skip_if_no_pyamg = pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
|
||||
|
||||
# Non-centered cluster centers with mostly-zero (sparse-like) coordinates,
# used as shared fixture data by the embedding tests below.
centers = np.array(
    [
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ]
)
n_samples = 1000
n_clusters, n_features = centers.shape
# S: (n_samples, n_features) blob data; true_labels: ground-truth cluster ids.
S, true_labels = make_blobs(
    n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
)
|
||||
|
||||
|
||||
def _assert_equal_with_sign_flipping(A, B, tol=0.0):
|
||||
"""Check array A and B are equal with possible sign flipping on
|
||||
each column"""
|
||||
tol_squared = tol**2
|
||||
for A_col, B_col in zip(A.T, B.T):
|
||||
assert (
|
||||
np.max((A_col - B_col) ** 2) <= tol_squared
|
||||
or np.max((A_col + B_col) ** 2) <= tol_squared
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_sparse_graph_connected_component(coo_container):
    # Build a random sparse symmetric affinity matrix made of 4 disjoint
    # groups of nodes and check that _graph_connected_component recovers
    # exactly the group of the node it starts from.
    rng = np.random.RandomState(42)
    n_samples = 300
    # Consecutive [start, stop) slices of the permutation define the groups.
    boundaries = [0, 42, 121, 200, n_samples]
    p = rng.permutation(n_samples)
    connections = []

    for start, stop in itertools.pairwise(boundaries):
        group = p[start:stop]
        # Connect all elements within the group at least once via an
        # arbitrary path that spans the group.
        for i in range(len(group) - 1):
            connections.append((group[i], group[i + 1]))

        # Add some more random connections within the group
        min_idx, max_idx = 0, len(group) - 1
        n_random_connections = 1000
        source = rng.randint(min_idx, max_idx, size=n_random_connections)
        target = rng.randint(min_idx, max_idx, size=n_random_connections)
        connections.extend(zip(group[source], group[target]))

    # Build a symmetric affinity matrix
    row_idx, column_idx = tuple(np.array(connections).T)
    data = rng.uniform(0.1, 42, size=len(connections))
    affinity = coo_container((data, (row_idx, column_idx)))
    affinity = 0.5 * (affinity + affinity.T)

    for start, stop in itertools.pairwise(boundaries):
        # Starting from any node of a group, the connected-component mask
        # should select exactly the nodes of that group.
        component_1 = _graph_connected_component(affinity, p[start])
        component_size = stop - start
        assert component_1.sum() == component_size

        # We should retrieve the same component mask by starting by both ends
        # of the group
        component_2 = _graph_connected_component(affinity, p[stop - 1])
        assert component_2.sum() == component_size
        assert_array_equal(component_1, component_2)
|
||||
|
||||
|
||||
# TODO: investigate why this test is seed-sensitive on 32-bit Python
# runtimes. Is this revealing a numerical stability problem ? Or is it
# expected from the test numerical design ? In the latter case the test
# should be made less seed-sensitive instead.
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_two_components(eigen_solver, dtype, seed=0):
    # Test spectral embedding with two components
    random_state = np.random.RandomState(seed)
    n_sample = 100
    # Block-diagonal affinity: two dense blocks that are initially disjoint.
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
    # first component
    affinity[0:n_sample, 0:n_sample] = (
        np.abs(random_state.randn(n_sample, n_sample)) + 2
    )
    # second component
    affinity[n_sample::, n_sample::] = (
        np.abs(random_state.randn(n_sample, n_sample)) + 2
    )

    # Test of internal _graph_connected_component before connection:
    # each starting node should only reach its own block.
    component = _graph_connected_component(affinity, 0)
    assert component[:n_sample].all()
    assert not component[n_sample:].any()
    component = _graph_connected_component(affinity, -1)
    assert not component[:n_sample].any()
    assert component[n_sample:].all()

    # connection: add a single weak bridge between the two blocks, zero the
    # diagonal and re-symmetrize so the graph becomes connected.
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[:: 2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    se_precomp = SpectralEmbedding(
        n_components=1,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )

    embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype))
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64)
    # The sign split of the first embedding coordinate should recover the
    # two blocks perfectly (NMI == 1).
    assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_precomputed_affinity(
    sparse_container, eigen_solver, dtype, seed=36
):
    """Embedding a precomputed RBF kernel matches affinity="rbf" on raw data."""
    gamma = 1.0
    data = S if sparse_container is None else sparse_container(S)
    data = data.astype(dtype)

    model_precomp = SpectralEmbedding(
        n_components=2,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    model_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )

    # Feed the kernel explicitly to one model; let the other compute it.
    embedding_precomp = model_precomp.fit_transform(rbf_kernel(data, gamma=gamma))
    embedding_rbf = model_rbf.fit_transform(data)

    assert_array_almost_equal(
        model_precomp.affinity_matrix_, model_rbf.affinity_matrix_
    )
    _assert_equal_with_sign_flipping(embedding_precomp, embedding_rbf, 0.05)
|
||||
|
||||
|
||||
def test_precomputed_nearest_neighbors_filtering():
    """A precomputed kNN graph with extra neighbors is filtered to n_neighbors.

    Building the graph with more neighbors than requested must yield the
    same embedding, because SpectralEmbedding filters it down.
    """
    n_neighbors = 2
    embeddings = []
    for extra_neighbors in [0, 10]:
        knn = NearestNeighbors(n_neighbors=n_neighbors + extra_neighbors).fit(S)
        graph = knn.kneighbors_graph(S, mode="connectivity")
        model = SpectralEmbedding(
            random_state=0,
            n_components=2,
            affinity="precomputed_nearest_neighbors",
            n_neighbors=n_neighbors,
        )
        embeddings.append(model.fit(graph).embedding_)

    assert_array_equal(embeddings[0], embeddings[1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
def test_spectral_embedding_callable_affinity(sparse_container, seed=36):
    """A callable affinity must behave exactly like the equivalent "rbf"."""
    gamma = 0.9
    expected_kernel = rbf_kernel(S, gamma=gamma)
    data = S if sparse_container is None else sparse_container(S)

    def rbf_affinity(x):
        # Same kernel as affinity="rbf" with the same gamma.
        return rbf_kernel(x, gamma=gamma)

    model_callable = SpectralEmbedding(
        n_components=2,
        affinity=rbf_affinity,
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    model_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    embedding_rbf = model_rbf.fit_transform(data)
    embedding_callable = model_callable.fit_transform(data)

    # Both models must build the same affinity matrix (the RBF kernel)...
    assert_array_almost_equal(
        model_callable.affinity_matrix_, model_rbf.affinity_matrix_
    )
    assert_array_almost_equal(expected_kernel, model_rbf.affinity_matrix_)
    # ... and produce the same embedding up to column sign flips.
    _assert_equal_with_sign_flipping(embedding_rbf, embedding_callable, 0.05)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36):
    # The "amg" solver should produce (up to sign flips) the same embedding
    # as the "arpack" solver on the same nearest-neighbors graph.
    se_amg = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="amg",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    se_arpack = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="arpack",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    embed_amg = se_amg.fit_transform(S.astype(dtype))
    embed_arpack = se_arpack.fit_transform(S.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # same with special case in which amg is not actually used
    # regression test for #10715
    # affinity between nodes: a tiny 6-node symmetric graph built from an
    # explicit edge list (each edge duplicated in both directions).
    row = np.array([0, 0, 1, 2, 3, 3, 4], dtype=np.int32)
    col = np.array([1, 2, 2, 3, 4, 5, 5], dtype=np.int32)
    val = np.array([100, 100, 100, 1, 100, 100, 100], dtype=np.int64)

    affinity = coo_container(
        (np.hstack([val, val]), (np.hstack([row, col]), np.hstack([col, row]))),
        shape=(6, 6),
    )
    # Reuse the fitted estimators but switch them to precomputed affinities.
    se_amg.affinity = "precomputed"
    se_arpack.affinity = "precomputed"
    embed_amg = se_amg.fit_transform(affinity.astype(dtype))
    embed_arpack = se_arpack.fit_transform(affinity.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # Check that passing a sparse matrix with `np.int64` indices dtype raises an error
    # or is successful based on the version of SciPy which is installed.
    # Use a CSR matrix to avoid any conversion during the validation
    affinity = affinity.tocsr()
    affinity.indptr = affinity.indptr.astype(np.int64)
    affinity.indices = affinity.indices.astype(np.int64)

    # PR: https://github.com/scipy/scipy/pull/18913
    # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
    scipy_graph_traversal_supports_int64_index = sp_version >= parse_version("1.11.3")
    if scipy_graph_traversal_supports_int64_index:
        # Recent SciPy accepts 64-bit indices: fitting should just succeed.
        se_amg.fit_transform(affinity)
    else:
        err_msg = "Only sparse matrices with 32-bit integer indices are accepted"
        with pytest.raises(ValueError, match=err_msg):
            se_amg.fit_transform(affinity)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
    # Non-regression test for amg solver failure (issue #13393 on github)
    num_nodes = 100
    rand_graph = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
    rand_graph = rand_graph.astype(dtype)
    # Symmetrize: keep the strict upper triangle and mirror it.
    strict_upper = sparse.triu(rand_graph) - sparse.diags(rand_graph.diagonal())
    sym_matrix = strict_upper + strict_upper.T
    reference = spectral_embedding(
        sym_matrix, n_components=10, eigen_solver="amg", random_state=0
    )

    # Check that the learned embedding is stable w.r.t. random solver init:
    for solver_seed in (1, 2, 3):
        candidate = spectral_embedding(
            sym_matrix, n_components=10, eigen_solver="amg", random_state=solver_seed
        )
        _assert_equal_with_sign_flipping(reference, candidate, tol=0.05)
|
||||
|
||||
|
||||
def test_pipeline_spectral_clustering(seed=36):
    """KMeans on top of a spectral embedding recovers the blob clusters."""
    random_state = np.random.RandomState(seed)
    embedder_rbf = SpectralEmbedding(
        n_components=n_clusters, affinity="rbf", random_state=random_state
    )
    embedder_knn = SpectralEmbedding(
        n_components=n_clusters,
        affinity="nearest_neighbors",
        n_neighbors=5,
        random_state=random_state,
    )
    for embedder in [embedder_rbf, embedder_knn]:
        clusterer = KMeans(
            n_clusters=n_clusters, random_state=random_state, n_init=10
        )
        clusterer.fit(embedder.fit_transform(S))
        # Near-perfect agreement with the ground-truth labels.
        assert_array_almost_equal(
            normalized_mutual_info_score(clusterer.labels_, true_labels), 1.0, 2
        )
|
||||
|
||||
|
||||
def test_connectivity(seed=36):
    """_graph_is_connected agrees across dense, CSR and CSC representations."""
    # Node 0 is isolated from the rest: the graph is disconnected.
    disconnected = np.array(
        [
            [1, 0, 0, 0, 0],
            [0, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    # A chain linking all five nodes: the graph is connected.
    connected = np.array(
        [
            [1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )

    for graph, expected in ((disconnected, False), (connected, True)):
        assert bool(_graph_is_connected(graph)) == expected
        for container in (*CSR_CONTAINERS, *CSC_CONTAINERS):
            assert bool(_graph_is_connected(container(graph))) == expected
|
||||
|
||||
|
||||
def test_spectral_embedding_deterministic():
    """Two runs of spectral_embedding on the same affinity are identical."""
    rng = np.random.RandomState(36)
    sims = rbf_kernel(rng.randn(10, 30))
    first_run = spectral_embedding(sims)
    second_run = spectral_embedding(sims)
    assert_array_almost_equal(first_run, second_run)
|
||||
|
||||
|
||||
def test_spectral_embedding_unnormalized():
    """spectral_embedding handles the unnormalized laplacian correctly."""
    rng = np.random.RandomState(36)
    sims = rbf_kernel(rng.randn(10, 30))
    n_components = 8
    embedding = spectral_embedding(
        sims, norm_laplacian=False, n_components=n_components, drop_first=False
    )

    # Reference computation: dense eigendecomposition of the unnormalized
    # laplacian, keeping the first n_components eigenvectors with a
    # deterministic sign convention.
    laplacian, _ = csgraph_laplacian(sims, normed=False, return_diag=True)
    _, diffusion_map = eigh(laplacian)
    reference = diffusion_map.T[:n_components]
    reference = _deterministic_vector_sign_flip(reference).T

    assert_array_almost_equal(embedding, reference)
|
||||
|
||||
|
||||
def test_spectral_embedding_first_eigen_vector():
    """First eigenvector is constant, second is not, for a connected graph."""
    rng = np.random.RandomState(36)
    sims = rbf_kernel(rng.randn(10, 30))
    n_components = 2

    for solver_seed in range(10):
        embedding = spectral_embedding(
            sims,
            norm_laplacian=False,
            n_components=n_components,
            drop_first=False,
            random_state=solver_seed,
        )

        # Constant first component, non-degenerate second component.
        assert np.std(embedding[:, 0]) == pytest.approx(0)
        assert np.std(embedding[:, 1]) > 1e-3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_preserves_dtype(eigen_solver, dtype):
    """Check that `SpectralEmbedding` preserves the input dtype in its fitted
    attributes and in the transformed data.

    Ideally, this test should be covered by the common test
    `check_transformer_preserve_dtypes`. However, that test only runs for
    transformers implementing `transform`, while `SpectralEmbedding`
    implements only `fit_transform`.
    """
    data = S.astype(dtype)
    model = SpectralEmbedding(
        n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0
    )
    transformed = model.fit_transform(data)

    for array in (transformed, model.embedding_, model.affinity_matrix_):
        assert array.dtype == dtype
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    pyamg_available,
    reason="PyAMG is installed and we should not test for an error.",
)
def test_error_pyamg_not_available():
    """Requesting the "amg" solver without pyamg installed raises ValueError."""
    model = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        eigen_solver="amg",
    )
    err_msg = "The eigen_solver was set to 'amg', but pyamg is not available."
    with pytest.raises(ValueError, match=err_msg):
        model.fit_transform(S)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_eigen_tol_auto(monkeypatch, solver, csr_container):
    """Test that `eigen_tol="auto"` is resolved correctly"""
    if solver == "amg" and not pyamg_available:
        pytest.skip("PyAMG is not available.")
    X, _ = make_blobs(
        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )
    D = pairwise_distances(X)  # Distance matrix
    # NOTE: local S deliberately shadows the module-level fixture S here.
    S = np.max(D) - D  # Similarity matrix

    # Underlying eigen solver function and the expected resolved tolerance
    # for eigen_tol="auto" (0 for arpack, None for lobpcg/amg).
    solver_func = eigsh if solver == "arpack" else lobpcg
    default_value = 0 if solver == "arpack" else None
    if solver == "amg":
        S = csr_container(S)

    # Wrap the real solver so the computation still runs while the call
    # arguments are recorded for inspection.
    mocked_solver = Mock(side_effect=solver_func)

    monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__, mocked_solver)

    spectral_embedding(S, random_state=42, eigen_solver=solver, eigen_tol="auto")
    mocked_solver.assert_called()

    # The "auto" tolerance must have been resolved to the solver's default.
    _, kwargs = mocked_solver.call_args
    assert kwargs["tol"] == default_value
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user