2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions


@@ -0,0 +1,27 @@
"""Data embedding techniques."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from sklearn.manifold._classical_mds import ClassicalMDS
from sklearn.manifold._isomap import Isomap
from sklearn.manifold._locally_linear import (
LocallyLinearEmbedding,
locally_linear_embedding,
)
from sklearn.manifold._mds import MDS, smacof
from sklearn.manifold._spectral_embedding import SpectralEmbedding, spectral_embedding
from sklearn.manifold._t_sne import TSNE, trustworthiness
__all__ = [
"MDS",
"TSNE",
"ClassicalMDS",
"Isomap",
"LocallyLinearEmbedding",
"SpectralEmbedding",
"locally_linear_embedding",
"smacof",
"spectral_embedding",
"trustworthiness",
]
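# Example usage of the re-exported API above (an illustrative sketch;
# the values shown are indicative):
#
#     >>> from sklearn.datasets import load_digits
#     >>> from sklearn.manifold import TSNE, trustworthiness
#     >>> X, _ = load_digits(return_X_y=True)
#     >>> X_emb = TSNE(n_components=2, random_state=0).fit_transform(X[:100])
#     >>> X_emb.shape
#     (100, 2)
#     >>> trustworthiness(X[:100], X_emb)  # closer to 1.0 is better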


@@ -0,0 +1,295 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# See http://homepage.tudelft.nl/19j49/t-SNE.html for reference
# implementations and papers describing the technique
import numpy as np
cimport numpy as cnp
from libc.stdio cimport printf
from libc.math cimport log
from libc.stdlib cimport malloc, free
from libc.time cimport clock, clock_t
from cython.parallel cimport prange, parallel
from sklearn.neighbors._quad_tree cimport _QuadTree
cnp.import_array()
cdef char* EMPTY_STRING = ""
# Smallest strictly positive value that can be represented by floating
# point numbers for different precision levels. This is useful to avoid
# taking the log of zero when computing the KL divergence.
cdef float FLOAT32_TINY = np.finfo(np.float32).tiny
# Useful to avoid division by zero or divergence to +inf.
cdef float FLOAT64_EPS = np.finfo(np.float64).eps
# This is effectively an ifdef statement in Cython
# It allows us to write printf debugging lines
# and remove them at compile time
cdef enum:
DEBUGFLAG = 0
cdef float compute_gradient(float[:] val_P,
float[:, :] pos_reference,
cnp.int64_t[:] neighbors,
cnp.int64_t[:] indptr,
float[:, :] tot_force,
_QuadTree qt,
float theta,
int dof,
long start,
bint compute_error,
int num_threads) noexcept nogil:
# Having created the tree, calculate the gradient
# in two components, the positive and negative forces
cdef:
long i, coord
int ax
long n_samples = pos_reference.shape[0]
int n_dimensions = qt.n_dimensions
clock_t t1 = 0, t2 = 0
double sQ
float error
int take_timing = 1 if qt.verbose > 15 else 0
if qt.verbose > 11:
printf("[t-SNE] Allocating %li elements in force arrays\n",
n_samples * n_dimensions * 2)
cdef float* neg_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)
cdef float* pos_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)
if take_timing:
t1 = clock()
sQ = compute_gradient_negative(pos_reference, neg_f, qt, dof, theta, start,
num_threads)
if take_timing:
t2 = clock()
printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1)))
if take_timing:
t1 = clock()
error = compute_gradient_positive(val_P, pos_reference, neighbors, indptr,
pos_f, n_dimensions, dof, sQ, start,
qt.verbose, compute_error, num_threads)
if take_timing:
t2 = clock()
printf("[t-SNE] Computing positive gradient: %e ticks\n",
((float) (t2 - t1)))
for i in prange(start, n_samples, nogil=True, num_threads=num_threads,
schedule='static'):
for ax in range(n_dimensions):
coord = i * n_dimensions + ax
tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sQ)
free(neg_f)
free(pos_f)
return error
cdef float compute_gradient_positive(float[:] val_P,
float[:, :] pos_reference,
cnp.int64_t[:] neighbors,
cnp.int64_t[:] indptr,
float* pos_f,
int n_dimensions,
int dof,
double sum_Q,
cnp.int64_t start,
int verbose,
bint compute_error,
int num_threads) noexcept nogil:
# Sum over the following expression for i not equal to j
# grad_i = p_ij (1 + ||y_i - y_j||^2)^-1 (y_i - y_j)
# This is equivalent to compute_edge_forces in the authors' code
# It just goes over the nearest neighbors instead of all the data points
# (unlike the non-nearest neighbors version of `compute_gradient_positive')
cdef:
int ax
long i, j, k
long n_samples = indptr.shape[0] - 1
float C = 0.0
float dij, qij, pij
float exponent = (dof + 1.0) / 2.0
float float_dof = (float) (dof)
float* buff
clock_t t1 = 0, t2 = 0
float dt
if verbose > 10:
t1 = clock()
with nogil, parallel(num_threads=num_threads):
# Define private buffer variables
buff = <float *> malloc(sizeof(float) * n_dimensions)
for i in prange(start, n_samples, schedule='static'):
# Init the gradient vector
for ax in range(n_dimensions):
pos_f[i * n_dimensions + ax] = 0.0
# Compute the positive interaction for the nearest neighbors
for k in range(indptr[i], indptr[i+1]):
j = neighbors[k]
dij = 0.0
pij = val_P[k]
for ax in range(n_dimensions):
buff[ax] = pos_reference[i, ax] - pos_reference[j, ax]
dij += buff[ax] * buff[ax]
qij = float_dof / (float_dof + dij)
if dof != 1: # i.e. exponent != 1
qij = qij ** exponent
dij = pij * qij
# only compute the error when needed
if compute_error:
qij = qij / sum_Q
C += pij * log(max(pij, FLOAT32_TINY) / max(qij, FLOAT32_TINY))
for ax in range(n_dimensions):
pos_f[i * n_dimensions + ax] += dij * buff[ax]
free(buff)
if verbose > 10:
t2 = clock()
dt = ((float) (t2 - t1))
printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
return C
cdef double compute_gradient_negative(float[:, :] pos_reference,
float* neg_f,
_QuadTree qt,
int dof,
float theta,
long start,
int num_threads) noexcept nogil:
cdef:
int ax
int n_dimensions = qt.n_dimensions
int offset = n_dimensions + 2
long i, j, idx
long n_samples = pos_reference.shape[0]
long n = n_samples - start
long dta = 0
long dtb = 0
float size, dist2s, mult
float exponent = (dof + 1.0) / 2.0
float float_dof = (float) (dof)
double qijZ, sum_Q = 0.0
float* force
float* neg_force
float* pos
float* summary
clock_t t1 = 0, t2 = 0, t3 = 0
int take_timing = 1 if qt.verbose > 20 else 0
with nogil, parallel(num_threads=num_threads):
# Define thread-local buffers
summary = <float*> malloc(sizeof(float) * n * offset)
pos = <float *> malloc(sizeof(float) * n_dimensions)
force = <float *> malloc(sizeof(float) * n_dimensions)
neg_force = <float *> malloc(sizeof(float) * n_dimensions)
for i in prange(start, n_samples, schedule='static'):
# Clear the arrays
for ax in range(n_dimensions):
force[ax] = 0.0
neg_force[ax] = 0.0
pos[ax] = pos_reference[i, ax]
# Find which nodes are summarizing and collect their centers of mass
# deltas, and sizes, into vectorized arrays
if take_timing:
t1 = clock()
idx = qt.summarize(pos, summary, theta*theta)
if take_timing:
t2 = clock()
# Compute the t-SNE negative force
# for the digits dataset, walking the tree
# is about 10-15x more expensive than the
# following for loop
for j in range(idx // offset):
dist2s = summary[j * offset + n_dimensions]
size = summary[j * offset + n_dimensions + 1]
qijZ = float_dof / (float_dof + dist2s) # 1/(1+dist)
if dof != 1: # i.e. exponent != 1
qijZ = qijZ ** exponent
sum_Q += size * qijZ # size of the node * q
mult = size * qijZ * qijZ
for ax in range(n_dimensions):
neg_force[ax] += mult * summary[j * offset + ax]
if take_timing:
t3 = clock()
for ax in range(n_dimensions):
neg_f[i * n_dimensions + ax] = neg_force[ax]
if take_timing:
dta += t2 - t1
dtb += t3 - t2
free(pos)
free(force)
free(neg_force)
free(summary)
if take_timing:
printf("[t-SNE] Tree: %li clock ticks | ", dta)
printf("Force computation: %li clock ticks\n", dtb)
# Clamp sum_Q to machine epsilon to avoid division by 0
sum_Q = max(sum_Q, FLOAT64_EPS)
return sum_Q
def gradient(float[:] val_P,
float[:, :] pos_output,
cnp.int64_t[:] neighbors,
cnp.int64_t[:] indptr,
float[:, :] forces,
float theta,
int n_dimensions,
int verbose,
int dof=1,
long skip_num_points=0,
bint compute_error=1,
int num_threads=1):
# This function is designed to be called from external Python code.
# It passes the 'forces' array by reference and fills that array
# in-place.
cdef float C
cdef int n
n = pos_output.shape[0]
assert val_P.itemsize == 4
assert pos_output.itemsize == 4
assert forces.itemsize == 4
m = "Forces array and pos_output shapes are incompatible"
assert n == forces.shape[0], m
m = "Pij and pos_output shapes are incompatible"
assert n == indptr.shape[0] - 1, m
if verbose > 10:
printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions)
cdef _QuadTree qt = _QuadTree(pos_output.shape[1], verbose)
if verbose > 10:
printf("[t-SNE] Inserting %li points\n", pos_output.shape[0])
qt.build_tree(pos_output)
if verbose > 10:
# XXX: format hack to workaround lack of `const char *` type
# in the generated C code that triggers error with gcc 4.9
# and -Werror=format-security
printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING)
C = compute_gradient(val_P, pos_output, neighbors, indptr, forces,
qt, theta, dof, skip_num_points, compute_error,
num_threads)
if verbose > 10:
# XXX: format hack to workaround lack of `const char *` type
# in the generated C code
# and -Werror=format-security
printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING)
m = "Tree consistency failed: unexpected number of points on the tree"
assert qt.cells[0].cumulative_size == qt.n_points, m
if not compute_error:
C = np.nan
return C
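# For intuition, a dense NumPy reference (an illustrative sketch, not the
# shipped implementation) of the decomposition computed by `gradient` for
# dof=1; the quad tree above replaces the O(n^2) repulsive term with an
# approximation.
def _tsne_gradient_dense(P, Y):
    # P: (n, n) symmetric affinities with zero diagonal; Y: (n, d) embedding.
    diff = Y[:, None, :] - Y[None, :, :]                 # y_i - y_j
    qij = 1.0 / (1.0 + (diff ** 2).sum(-1))              # Student-t kernel, dof=1
    np.fill_diagonal(qij, 0.0)
    sum_Q = qij.sum()                                    # the normalizer sQ above
    pos_f = ((P * qij)[:, :, None] * diff).sum(axis=1)   # attractive forces
    neg_f = ((qij ** 2)[:, :, None] * diff).sum(axis=1)  # repulsive forces
    return pos_f - neg_f / sum_Q                         # cf. tot_force above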


@@ -0,0 +1,198 @@
"""
Classical multi-dimensional scaling (classical MDS).
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from numbers import Integral
import numpy as np
from scipy import linalg
from sklearn.base import BaseEstimator, _fit_context
from sklearn.metrics import pairwise_distances
from sklearn.utils import check_symmetric
from sklearn.utils._param_validation import Interval
from sklearn.utils.extmath import svd_flip
from sklearn.utils.validation import validate_data
class ClassicalMDS(BaseEstimator):
"""Classical multidimensional scaling (MDS).
This is also known as principal coordinates analysis (PCoA) or
Torgerson's scaling. It is a version of MDS with an exact solution
in terms of an eigendecomposition. If the input dissimilarity matrix
consists of the pairwise Euclidean distances between some vectors,
then classical MDS is equivalent to PCA applied to this set of vectors.
Read more in the :ref:`User Guide <multidimensional_scaling>`.
Parameters
----------
n_components : int, default=2
Number of embedding dimensions.
metric : str or callable, default='euclidean'
Metric to use for dissimilarity computation. Default is "euclidean".
If metric is a string, it must be one of the options allowed by
`scipy.spatial.distance.pdist` for its metric parameter, or a metric
listed in :func:`sklearn.metrics.pairwise.distance_metrics`.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit.
If metric is a callable function, it takes two arrays representing 1D
vectors as inputs and must return one value indicating the distance
between those vectors. This works for Scipy's metrics, but is less
efficient than passing the metric name as a string.
metric_params : dict, default=None
Additional keyword arguments for the dissimilarity computation.
Attributes
----------
embedding_ : ndarray of shape (n_samples, n_components)
Stores the position of the dataset in the embedding space.
dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
Pairwise dissimilarities between the points.
eigenvalues_ : ndarray of shape (n_components,)
Eigenvalues of the double-centered dissimilarity matrix, corresponding
to each of the selected components. They are equal to the squared 2-norms
of the `n_components` variables in the embedding space.
n_features_in_ : int
Number of features seen during :term:`fit`.
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
See Also
--------
sklearn.decomposition.PCA : Principal component analysis.
MDS : Metric and non-metric MDS.
References
----------
.. [1] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
Groenen P. Springer Series in Statistics (1997)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import ClassicalMDS
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> cmds = ClassicalMDS(n_components=2)
>>> X_emb = cmds.fit_transform(X[:100])
>>> X_emb.shape
(100, 2)
"""
_parameter_constraints: dict = {
"n_components": [Interval(Integral, 1, None, closed="left")],
"metric": [str, callable],
"metric_params": [dict, None],
}
def __init__(
self,
n_components=2,
*,
metric="euclidean",
metric_params=None,
):
self.n_components = n_components
self.metric = metric
self.metric_params = metric_params
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.pairwise = self.metric == "precomputed"
return tags
def fit(self, X, y=None):
"""
Compute the embedding positions.
Parameters
----------
X : array-like of shape (n_samples, n_features) or \
(n_samples, n_samples)
Input data. If ``metric=='precomputed'``, the input should
be the dissimilarity matrix.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
Fitted estimator.
"""
self.fit_transform(X)
return self
@_fit_context(prefer_skip_nested_validation=True)
def fit_transform(self, X, y=None):
"""
Compute and return the embedding positions.
Parameters
----------
X : array-like of shape (n_samples, n_features) or \
(n_samples, n_samples)
Input data. If ``metric=='precomputed'``, the input should
be the dissimilarity matrix.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
X_new : ndarray of shape (n_samples, n_components)
The embedding coordinates.
"""
X = validate_data(self, X)
if self.metric == "precomputed":
self.dissimilarity_matrix_ = X
self.dissimilarity_matrix_ = check_symmetric(
self.dissimilarity_matrix_, raise_exception=True
)
else:
self.dissimilarity_matrix_ = pairwise_distances(
X,
metric=self.metric,
**(self.metric_params if self.metric_params is not None else {}),
)
# Double centering
B = self.dissimilarity_matrix_**2
B = B.astype(np.float64)
B -= np.mean(B, axis=0)
B -= np.mean(B, axis=1, keepdims=True)
B *= -0.5
# Eigendecomposition
w, U = linalg.eigh(B)
# Reversing the order of the eigenvalues/eigenvectors to put
# the eigenvalues in decreasing order
w = w[::-1][: self.n_components]
U = U[:, ::-1][:, : self.n_components]
# Set the signs of eigenvectors to enforce deterministic output
U, _ = svd_flip(U, None)
self.embedding_ = np.sqrt(w) * U
self.eigenvalues_ = w
return self.embedding_
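# A quick numerical check of the PCA equivalence stated in the docstring
# (an illustrative sketch, not part of the class; for data with distinct
# leading eigenvalues the two embeddings agree up to per-column sign flips).
def _classical_mds_vs_pca_demo():
    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA

    X, _ = load_iris(return_X_y=True)
    emb_mds = ClassicalMDS(n_components=2).fit_transform(X)
    emb_pca = PCA(n_components=2).fit_transform(X)
    print(np.allclose(np.abs(emb_mds), np.abs(emb_pca), atol=1e-6))  # True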


@@ -0,0 +1,442 @@
"""Isomap for manifold learning"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from numbers import Integral, Real
import numpy as np
from scipy.sparse import issparse
from scipy.sparse.csgraph import connected_components, shortest_path
from sklearn.base import (
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
TransformerMixin,
_fit_context,
)
from sklearn.decomposition import KernelPCA
from sklearn.metrics.pairwise import _VALID_METRICS
from sklearn.neighbors import NearestNeighbors, kneighbors_graph, radius_neighbors_graph
from sklearn.preprocessing import KernelCenterer
from sklearn.utils._param_validation import Interval, StrOptions
from sklearn.utils.graph import _fix_connected_components
from sklearn.utils.validation import check_is_fitted
class Isomap(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
"""Isomap Embedding.
Non-linear dimensionality reduction through Isometric Mapping.
Read more in the :ref:`User Guide <isomap>`.
Parameters
----------
n_neighbors : int or None, default=5
Number of neighbors to consider for each point. If `n_neighbors` is an int,
then `radius` must be `None`.
radius : float or None, default=None
Limiting distance of neighbors to return. If `radius` is a float,
then `n_neighbors` must be set to `None`.
.. versionadded:: 1.1
n_components : int, default=2
Number of coordinates for the manifold.
eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
'auto' : Attempt to choose the most efficient solver
for the given problem.
'arpack' : Use Arnoldi decomposition to find the eigenvalues
and eigenvectors.
'dense' : Use a direct solver (i.e. LAPACK)
for the eigenvalue decomposition.
tol : float, default=0
Convergence tolerance passed to arpack or lobpcg.
Not used if eigen_solver == 'dense'.
max_iter : int, default=None
Maximum number of iterations for the arpack solver.
Not used if eigen_solver == 'dense'.
path_method : {'auto', 'FW', 'D'}, default='auto'
Method to use in finding shortest path.
'auto' : attempt to choose the best algorithm automatically.
'FW' : Floyd-Warshall algorithm.
'D' : Dijkstra's algorithm.
neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
default='auto'
Algorithm to use for nearest neighbors search,
passed to neighbors.NearestNeighbors instance.
n_jobs : int or None, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
metric : str, or callable, default="minkowski"
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square. X may be a :term:`sparse graph <sparse graph>`.
.. versionadded:: 0.22
p : float, default=2
Parameter for the Minkowski metric from
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
.. versionadded:: 0.22
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.22
Attributes
----------
embedding_ : array-like, shape (n_samples, n_components)
Stores the embedding vectors.
kernel_pca_ : object
:class:`~sklearn.decomposition.KernelPCA` object used to implement the
embedding.
nbrs_ : sklearn.neighbors.NearestNeighbors instance
Stores nearest neighbors instance, including BallTree or KDTree
if applicable.
dist_matrix_ : array-like, shape (n_samples, n_samples)
Stores the geodesic distance matrix of training data.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
sklearn.decomposition.PCA : Principal component analysis that is a linear
dimensionality reduction method.
sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
kernels and PCA.
MDS : Manifold learning using multidimensional scaling.
TSNE : T-distributed Stochastic Neighbor Embedding.
LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
SpectralEmbedding : Spectral embedding for non-linear dimensionality.
References
----------
.. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
framework for nonlinear dimensionality reduction. Science 290 (5500)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import Isomap
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = Isomap(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
"""
_parameter_constraints: dict = {
"n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
"radius": [Interval(Real, 0, None, closed="both"), None],
"n_components": [Interval(Integral, 1, None, closed="left")],
"eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
"tol": [Interval(Real, 0, None, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left"), None],
"path_method": [StrOptions({"auto", "FW", "D"})],
"neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})],
"n_jobs": [Integral, None],
"p": [Interval(Real, 1, None, closed="left")],
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
"metric_params": [dict, None],
}
def __init__(
self,
*,
n_neighbors=5,
radius=None,
n_components=2,
eigen_solver="auto",
tol=0,
max_iter=None,
path_method="auto",
neighbors_algorithm="auto",
n_jobs=None,
metric="minkowski",
p=2,
metric_params=None,
):
self.n_neighbors = n_neighbors
self.radius = radius
self.n_components = n_components
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.path_method = path_method
self.neighbors_algorithm = neighbors_algorithm
self.n_jobs = n_jobs
self.metric = metric
self.p = p
self.metric_params = metric_params
def _fit_transform(self, X):
if self.n_neighbors is not None and self.radius is not None:
raise ValueError(
"Both n_neighbors and radius are provided. Use"
f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use"
" radius-based neighbors"
)
self.nbrs_ = NearestNeighbors(
n_neighbors=self.n_neighbors,
radius=self.radius,
algorithm=self.neighbors_algorithm,
metric=self.metric,
p=self.p,
metric_params=self.metric_params,
n_jobs=self.n_jobs,
)
self.nbrs_.fit(X)
self.n_features_in_ = self.nbrs_.n_features_in_
if hasattr(self.nbrs_, "feature_names_in_"):
self.feature_names_in_ = self.nbrs_.feature_names_in_
self.kernel_pca_ = KernelPCA(
n_components=self.n_components,
kernel="precomputed",
eigen_solver=self.eigen_solver,
tol=self.tol,
max_iter=self.max_iter,
n_jobs=self.n_jobs,
).set_output(transform="default")
if self.n_neighbors is not None:
nbg = kneighbors_graph(
self.nbrs_,
self.n_neighbors,
metric=self.metric,
p=self.p,
metric_params=self.metric_params,
mode="distance",
n_jobs=self.n_jobs,
)
else:
nbg = radius_neighbors_graph(
self.nbrs_,
radius=self.radius,
metric=self.metric,
p=self.p,
metric_params=self.metric_params,
mode="distance",
n_jobs=self.n_jobs,
)
# Compute the number of connected components, and connect the different
# components to be able to compute a shortest path between all pairs
# of samples in the graph.
# Similar fix to cluster._agglomerative._fix_connectivity.
n_connected_components, labels = connected_components(nbg)
if n_connected_components > 1:
if self.metric == "precomputed" and issparse(X):
raise RuntimeError(
"The number of connected components of the neighbors graph"
f" is {n_connected_components} > 1. The graph cannot be "
"completed with metric='precomputed', and Isomap cannot be"
"fitted. Increase the number of neighbors to avoid this "
"issue, or precompute the full distance matrix instead "
"of passing a sparse neighbors graph."
)
warnings.warn(
(
"The number of connected components of the neighbors graph "
f"is {n_connected_components} > 1. Completing the graph to fit"
" Isomap might be slow. Increase the number of neighbors to "
"avoid this issue."
),
stacklevel=2,
)
# use array validated by NearestNeighbors
nbg = _fix_connected_components(
X=self.nbrs_._fit_X,
graph=nbg,
n_connected_components=n_connected_components,
component_labels=labels,
mode="distance",
metric=self.nbrs_.effective_metric_,
**self.nbrs_.effective_metric_params_,
)
self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False)
if self.nbrs_._fit_X.dtype == np.float32:
self.dist_matrix_ = self.dist_matrix_.astype(
self.nbrs_._fit_X.dtype, copy=False
)
G = self.dist_matrix_**2
G *= -0.5
self.embedding_ = self.kernel_pca_.fit_transform(G)
self._n_features_out = self.embedding_.shape[1]
def reconstruction_error(self):
"""Compute the reconstruction error for the embedding.
Returns
-------
reconstruction_error : float
Reconstruction error.
Notes
-----
The cost function of an isomap embedding is
``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``
Where D is the matrix of distances for the input data X,
D_fit is the matrix of distances for the output embedding X_fit,
and K is the isomap kernel:
``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
"""
G = -0.5 * self.dist_matrix_**2
G_center = KernelCenterer().fit_transform(G)
evals = self.kernel_pca_.eigenvalues_
return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]
@_fit_context(
# Isomap.metric is not validated yet
prefer_skip_nested_validation=False
)
def fit(self, X, y=None):
"""Compute the embedding vectors for data X.
Parameters
----------
X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array, sparse matrix, precomputed tree, or NearestNeighbors
object.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
Returns a fitted instance of self.
"""
self._fit_transform(X)
return self
@_fit_context(
# Isomap.metric is not validated yet
prefer_skip_nested_validation=False
)
def fit_transform(self, X, y=None):
"""Fit the model from data in X and transform X.
Parameters
----------
X : {array-like, sparse matrix, BallTree, KDTree}
Training vector, where `n_samples` is the number of samples
and `n_features` is the number of features.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
X_new : array-like, shape (n_samples, n_components)
X transformed in the new space.
"""
self._fit_transform(X)
return self.embedding_
def transform(self, X):
"""Transform X.
This is implemented by linking the points X into the graph of geodesic
distances of the training data. First the `n_neighbors` nearest
neighbors of X are found in the training data, and from these the
shortest geodesic distances from each point in X to each point in
the training data are computed in order to construct the kernel.
The embedding of X is the projection of this kernel onto the
embedding vectors of the training set.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_queries, n_features)
If metric='precomputed', X is assumed to be a
distance matrix or a sparse graph of shape
(n_queries, n_samples_fit).
Returns
-------
X_new : array-like, shape (n_queries, n_components)
X transformed in the new space.
"""
check_is_fitted(self)
if self.n_neighbors is not None:
distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
else:
distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True)
# Create the graph of shortest distances from X to
# training data via the nearest neighbors of X.
# This can be done as a single array operation, but it potentially
# takes a lot of memory. To avoid that, use a loop:
n_samples_fit = self.nbrs_.n_samples_fit_
n_queries = distances.shape[0]
if hasattr(X, "dtype") and X.dtype == np.float32:
dtype = np.float32
else:
dtype = np.float64
G_X = np.zeros((n_queries, n_samples_fit), dtype)
for i in range(n_queries):
G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)
G_X **= 2
G_X *= -0.5
return self.kernel_pca_.transform(G_X)
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.transformer_tags.preserves_dtype = ["float64", "float32"]
tags.input_tags.sparse = True
return tags
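# The geodesic-kernel pipeline implemented by `_fit_transform`, condensed
# into a reference sketch (illustrative only; it omits the radius mode,
# the connected-components repair, and the dtype handling above).
def _isomap_sketch(X, n_neighbors=5, n_components=2):
    nbg = kneighbors_graph(X, n_neighbors, mode="distance")
    dist = shortest_path(nbg, directed=False)  # geodesic distances
    G = -0.5 * dist**2                         # isomap kernel; KernelPCA centers it
    return KernelPCA(n_components=n_components,
                     kernel="precomputed").fit_transform(G)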


@@ -0,0 +1,878 @@
"""Locally Linear Embedding"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from numbers import Integral, Real
import numpy as np
from scipy.linalg import eigh, qr, solve, svd
from scipy.sparse import csr_matrix, eye, lil_matrix
from scipy.sparse.linalg import eigsh
from sklearn.base import (
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
TransformerMixin,
_fit_context,
_UnstableArchMixin,
)
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array, check_random_state
from sklearn.utils._arpack import _init_arpack_v0
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, validate_data
def barycenter_weights(X, Y, indices, reg=1e-3):
"""Compute barycenter weights of X from Y along the first axis
We estimate the weights to assign to each point in Y[indices] to recover
the point X[i]. The barycenter weights sum to 1.
Parameters
----------
X : array-like, shape (n_samples, n_dim)
Y : array-like, shape (n_samples, n_dim)
indices : array-like, shape (n_samples, n_neighbors)
Indices of the points in Y used to compute the barycenter
reg : float, default=1e-3
Amount of regularization to add for the problem to be
well-posed in the case of n_neighbors > n_dim
Returns
-------
B : array-like, shape (n_samples, n_neighbors)
Notes
-----
See developers note for more information.
"""
X = check_array(X, dtype=FLOAT_DTYPES)
Y = check_array(Y, dtype=FLOAT_DTYPES)
indices = check_array(indices, dtype=int)
n_samples, n_neighbors = indices.shape
assert X.shape[0] == n_samples
B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
v = np.ones(n_neighbors, dtype=X.dtype)
# this might raise a LinalgError if G is singular and has trace
# zero
for i, ind in enumerate(indices):
A = Y[ind]
C = A - X[i] # broadcasting
G = np.dot(C, C.T)
trace = np.trace(G)
if trace > 0:
R = reg * trace
else:
R = reg
G.flat[:: n_neighbors + 1] += R
w = solve(G, v, assume_a="pos")
B[i, :] = w / np.sum(w)
return B
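# A small self-check of the invariants documented above (an illustrative
# sketch, not part of the library): the weights sum to one, and each point
# is approximately the barycenter of its nearest neighbors.
def _barycenter_weights_demo():
    rng = np.random.default_rng(0)
    Z = rng.normal(size=(20, 3))
    ind = NearestNeighbors(n_neighbors=6).fit(Z).kneighbors(
        Z, return_distance=False)[:, 1:]  # drop the self-neighbor
    W = barycenter_weights(Z, Z, ind)
    print(np.allclose(W.sum(axis=1), 1.0))  # True: weights sum to one
    # Residual of reconstructing each point from its neighbors is small:
    print(np.linalg.norm(Z - np.einsum("ij,ijk->ik", W, Z[ind])))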
def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None):
"""Computes the barycenter weighted graph of k-Neighbors for points in X
Parameters
----------
X : {array-like, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array or a NearestNeighbors object.
n_neighbors : int
Number of neighbors for each sample.
reg : float, default=1e-3
Amount of regularization when solving the least-squares
problem.
n_jobs : int or None, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
A : sparse matrix in CSR format, shape = [n_samples, n_samples]
A[i, j] is assigned the weight of edge that connects i to j.
See Also
--------
sklearn.neighbors.kneighbors_graph
sklearn.neighbors.radius_neighbors_graph
"""
knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X)
X = knn._fit_X
n_samples = knn.n_samples_fit_
ind = knn.kneighbors(X, return_distance=False)[:, 1:]
data = barycenter_weights(X, X, ind, reg=reg)
indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors)
return csr_matrix((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples))
def null_space(
M, k, k_skip=1, eigen_solver="arpack", tol=1e-6, max_iter=100, random_state=None
):
"""
Find the null space of a matrix M.
Parameters
----------
M : {array, matrix, sparse matrix, LinearOperator}
Input covariance matrix: should be symmetric positive semi-definite
k : int
Number of eigenvalues/vectors to return
k_skip : int, default=1
Number of low eigenvalues to skip.
eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack'
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems. It is
best to try several random seeds in order to check results.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, default=1e-6
Tolerance for 'arpack' method.
Not used if eigen_solver=='dense'.
max_iter : int, default=100
Maximum number of iterations for 'arpack' method.
Not used if eigen_solver=='dense'.
random_state : int, RandomState instance, default=None
Determines the random number generator when ``eigen_solver`` == 'arpack'.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
"""
if eigen_solver == "auto":
if M.shape[0] > 200 and k + k_skip < 10:
eigen_solver = "arpack"
else:
eigen_solver = "dense"
if eigen_solver == "arpack":
v0 = _init_arpack_v0(M.shape[0], random_state)
try:
eigen_values, eigen_vectors = eigsh(
M, k + k_skip, sigma=0.0, tol=tol, maxiter=max_iter, v0=v0
)
except RuntimeError as e:
raise ValueError(
"Error in determining null-space with ARPACK. Error message: "
"'%s'. Note that eigen_solver='arpack' can fail when the "
"weight matrix is singular or otherwise ill-behaved. In that "
"case, eigen_solver='dense' is recommended. See online "
"documentation for more information." % e
) from e
return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])
elif eigen_solver == "dense":
if hasattr(M, "toarray"):
M = M.toarray()
eigen_values, eigen_vectors = eigh(
M, subset_by_index=(k_skip, k + k_skip - 1), overwrite_a=True
)
index = np.argsort(np.abs(eigen_values))
return eigen_vectors[:, index], np.sum(eigen_values)
else:
raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)
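# To see what `null_space` returns, a tiny dense example (an illustrative
# sketch): for a weight matrix W whose rows sum to one, M = (I - W)'(I - W)
# annihilates the constant vector, which is exactly the trivial eigenvector
# that `k_skip=1` discards.
def _null_space_demo():
    W = np.full((6, 6), 1.0 / 6.0)       # rows sum to one
    M = np.eye(6) - W - W.T + W.T @ W    # (I - W)'(I - W); M @ ones(6) == 0
    vecs, err = null_space(M, k=2, k_skip=1, eigen_solver="dense")
    print(vecs.shape)                    # (6, 2)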
def _locally_linear_embedding(
X,
*,
n_neighbors,
n_components,
reg=1e-3,
eigen_solver="auto",
tol=1e-6,
max_iter=100,
method="standard",
hessian_tol=1e-4,
modified_tol=1e-12,
random_state=None,
n_jobs=None,
):
nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
nbrs.fit(X)
X = nbrs._fit_X
N, d_in = X.shape
if n_components > d_in:
raise ValueError(
"output dimension must be less than or equal to input dimension"
)
if n_neighbors >= N:
raise ValueError(
"Expected n_neighbors < n_samples, but n_samples = %d, n_neighbors = %d"
% (N, n_neighbors)
)
M_sparse = eigen_solver != "dense"
M_container_constructor = lil_matrix if M_sparse else np.zeros
if method == "standard":
W = barycenter_kneighbors_graph(
nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs
)
# we'll compute M = (I-W)'(I-W)
# depending on the solver, we'll do this differently
if M_sparse:
M = eye(*W.shape, format=W.format) - W
M = M.T @ M
else:
M = (W.T @ W - W.T - W).toarray()
M.flat[:: M.shape[0] + 1] += 1 # M = W' W - W' - W + I
elif method == "hessian":
dp = n_components * (n_components + 1) // 2
if n_neighbors <= n_components + dp:
raise ValueError(
"for method='hessian', n_neighbors must be "
"greater than "
"[n_components * (n_components + 3) / 2]"
)
neighbors = nbrs.kneighbors(
X, n_neighbors=n_neighbors + 1, return_distance=False
)
neighbors = neighbors[:, 1:]
Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)
Yi[:, 0] = 1
M = M_container_constructor((N, N), dtype=np.float64)
use_svd = n_neighbors > d_in
for i in range(N):
Gi = X[neighbors[i]]
Gi -= Gi.mean(0)
# build Hessian estimator
if use_svd:
U = svd(Gi, full_matrices=0)[0]
else:
Ci = np.dot(Gi, Gi.T)
U = eigh(Ci)[1][:, ::-1]
Yi[:, 1 : 1 + n_components] = U[:, :n_components]
j = 1 + n_components
for k in range(n_components):
Yi[:, j : j + n_components - k] = U[:, k : k + 1] * U[:, k:n_components]
j += n_components - k
Q, R = qr(Yi)
w = Q[:, n_components + 1 :]
S = w.sum(0)
S[np.where(abs(S) < hessian_tol)] = 1
w /= S
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] += np.dot(w, w.T)
elif method == "modified":
if n_neighbors < n_components:
raise ValueError("modified LLE requires n_neighbors >= n_components")
neighbors = nbrs.kneighbors(
X, n_neighbors=n_neighbors + 1, return_distance=False
)
neighbors = neighbors[:, 1:]
# find the eigenvectors and eigenvalues of each local covariance
# matrix. We want V[i] to be a [n_neighbors x n_neighbors] matrix,
# where the columns are eigenvectors
V = np.zeros((N, n_neighbors, n_neighbors))
nev = min(d_in, n_neighbors)
evals = np.zeros([N, nev])
# choose the most efficient way to find the eigenvectors
use_svd = n_neighbors > d_in
if use_svd:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
V[i], evals[i], _ = svd(X_nbrs, full_matrices=True)
evals **= 2
else:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
C_nbrs = np.dot(X_nbrs, X_nbrs.T)
evi, vi = eigh(C_nbrs)
evals[i] = evi[::-1]
V[i] = vi[:, ::-1]
# find regularized weights: this is like normal LLE.
# because we've already computed the SVD of each covariance matrix,
# it's faster to use this rather than np.linalg.solve
reg = 1e-3 * evals.sum(1)
tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors))
tmp[:, :nev] /= evals + reg[:, None]
tmp[:, nev:] /= reg[:, None]
w_reg = np.zeros((N, n_neighbors))
for i in range(N):
w_reg[i] = np.dot(V[i], tmp[i])
w_reg /= w_reg.sum(1)[:, None]
# calculate eta: the median of the ratio of small to large eigenvalues
# across the points. This is used to determine s_i, below
rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1)
eta = np.median(rho)
# find s_i, the size of the "almost null space" for each point:
# this is the size of the largest set of eigenvalues
# such that Sum[v; v in set]/Sum[v; v not in set] < eta
s_range = np.zeros(N, dtype=int)
evals_cumsum = np.cumsum(evals, 1)
eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
for i in range(N):
s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)
s_range += n_neighbors - nev # number of zero eigenvalues
# Now calculate M.
# This is the [N x N] matrix whose null space is the desired embedding
M = M_container_constructor((N, N), dtype=np.float64)
for i in range(N):
s_i = s_range[i]
# select bottom s_i eigenvectors and calculate alpha
Vi = V[i, :, n_neighbors - s_i :]
alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
# compute Householder matrix which satisfies
# Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s)
# using prescription from paper
h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors))
norm_h = np.linalg.norm(h)
if norm_h < modified_tol:
h *= 0
else:
h /= norm_h
# Householder matrix is
# >> Hi = np.identity(s_i) - 2*np.outer(h,h)
# Then the weight matrix is
# >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None]
# We do this much more efficiently:
Wi = Vi - 2 * np.outer(np.dot(Vi, h), h) + (1 - alpha_i) * w_reg[i, :, None]
# Update M as follows:
# >> W_hat = np.zeros( (N,s_i) )
# >> W_hat[neighbors[i],:] = Wi
# >> W_hat[i] -= 1
# >> M += np.dot(W_hat,W_hat.T)
# We can do this much more efficiently:
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)
Wi_sum1 = Wi.sum(1)
M[i, neighbors[i]] -= Wi_sum1
M[neighbors[i], [i]] -= Wi_sum1
M[i, i] += s_i
elif method == "ltsa":
neighbors = nbrs.kneighbors(
X, n_neighbors=n_neighbors + 1, return_distance=False
)
neighbors = neighbors[:, 1:]
M = M_container_constructor((N, N), dtype=np.float64)
use_svd = n_neighbors > d_in
for i in range(N):
Xi = X[neighbors[i]]
Xi -= Xi.mean(0)
# compute n_components largest eigenvalues of Xi @ Xi^T
if use_svd:
v = svd(Xi, full_matrices=True)[0]
else:
Ci = np.dot(Xi, Xi.T)
v = eigh(Ci)[1][:, ::-1]
Gi = np.zeros((n_neighbors, n_components + 1))
Gi[:, 1:] = v[:, :n_components]
Gi[:, 0] = 1.0 / np.sqrt(n_neighbors)
GiGiT = np.dot(Gi, Gi.T)
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] -= GiGiT
M[neighbors[i], neighbors[i]] += np.ones(shape=n_neighbors)
if M_sparse:
M = M.tocsr()
return null_space(
M,
n_components,
k_skip=1,
eigen_solver=eigen_solver,
tol=tol,
max_iter=max_iter,
random_state=random_state,
)
@validate_params(
{
"X": ["array-like", NearestNeighbors],
"n_neighbors": [Interval(Integral, 1, None, closed="left")],
"n_components": [Interval(Integral, 1, None, closed="left")],
"reg": [Interval(Real, 0, None, closed="left")],
"eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
"tol": [Interval(Real, 0, None, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"method": [StrOptions({"standard", "hessian", "modified", "ltsa"})],
"hessian_tol": [Interval(Real, 0, None, closed="left")],
"modified_tol": [Interval(Real, 0, None, closed="left")],
"random_state": ["random_state"],
"n_jobs": [None, Integral],
},
prefer_skip_nested_validation=True,
)
def locally_linear_embedding(
X,
*,
n_neighbors,
n_components,
reg=1e-3,
eigen_solver="auto",
tol=1e-6,
max_iter=100,
method="standard",
hessian_tol=1e-4,
modified_tol=1e-12,
random_state=None,
n_jobs=None,
):
"""Perform a Locally Linear Embedding analysis on the data.
Read more in the :ref:`User Guide <locally_linear_embedding>`.
Parameters
----------
X : {array-like, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array or a NearestNeighbors object.
n_neighbors : int
Number of neighbors to consider for each point.
n_components : int
Number of coordinates for the manifold.
reg : float, default=1e-3
Regularization constant, multiplies the trace of the local covariance
matrix of the distances.
eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems. It is
best to try several random seeds in order to check results.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, default=1e-6
Tolerance for 'arpack' method.
Not used if eigen_solver=='dense'.
max_iter : int, default=100
Maximum number of iterations for the arpack solver.
method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
standard : use the standard locally linear embedding algorithm.
see reference [1]_
hessian : use the Hessian eigenmap method. This method requires
n_neighbors > n_components * (1 + (n_components + 1) / 2).
see reference [2]_
modified : use the modified locally linear embedding algorithm.
see reference [3]_
ltsa : use local tangent space alignment algorithm
see reference [4]_
hessian_tol : float, default=1e-4
Tolerance for Hessian eigenmapping method.
Only used if method == 'hessian'.
modified_tol : float, default=1e-12
Tolerance for modified LLE method.
Only used if method == 'modified'.
random_state : int, RandomState instance, default=None
Determines the random number generator when ``eigen_solver`` == 'arpack'.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
n_jobs : int or None, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
Y : ndarray of shape (n_samples, n_components)
Embedding vectors.
squared_error : float
Reconstruction error for the embedding vectors. Equivalent to
``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.
References
----------
.. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
by locally linear embedding. Science 290:2323 (2000).
.. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
linear embedding techniques for high-dimensional data.
Proc Natl Acad Sci U S A. 100:5591 (2003).
.. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
Embedding Using Multiple Weights.
<https://citeseerx.ist.psu.edu/doc_view/pid/0b060fdbd92cbcc66b383bcaa9ba5e5e624d7ee3>`_
.. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
dimensionality reduction via tangent space alignment.
Journal of Shanghai Univ. 8:406 (2004)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import locally_linear_embedding
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding, _ = locally_linear_embedding(X[:100], n_neighbors=5, n_components=2)
>>> embedding.shape
(100, 2)
"""
return _locally_linear_embedding(
X=X,
n_neighbors=n_neighbors,
n_components=n_components,
reg=reg,
eigen_solver=eigen_solver,
tol=tol,
max_iter=max_iter,
method=method,
hessian_tol=hessian_tol,
modified_tol=modified_tol,
random_state=random_state,
n_jobs=n_jobs,
)
class LocallyLinearEmbedding(
ClassNamePrefixFeaturesOutMixin,
TransformerMixin,
_UnstableArchMixin,
BaseEstimator,
):
"""Locally Linear Embedding.
Read more in the :ref:`User Guide <locally_linear_embedding>`.
Parameters
----------
n_neighbors : int, default=5
Number of neighbors to consider for each point.
n_components : int, default=2
Number of coordinates for the manifold.
reg : float, default=1e-3
Regularization constant, multiplies the trace of the local covariance
matrix of the distances.
eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
The solver used to compute the eigenvectors. The available options are:
- `'auto'` : algorithm will attempt to choose the best method for input
data.
- `'arpack'` : use arnoldi iteration in shift-invert mode. For this
method, M may be a dense matrix, sparse matrix, or general linear
operator.
- `'dense'` : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array or matrix type.
This method should be avoided for large problems.
.. warning::
ARPACK can be unstable for some problems. It is best to try several
random seeds in order to check results.
tol : float, default=1e-6
Tolerance for 'arpack' method.
Not used if eigen_solver=='dense'.
max_iter : int, default=100
Maximum number of iterations for the arpack solver.
Not used if eigen_solver=='dense'.
method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
- `standard`: use the standard locally linear embedding algorithm. see
reference [1]_
- `hessian`: use the Hessian eigenmap method. This method requires
``n_neighbors > n_components * (1 + (n_components + 1) / 2)``. see
reference [2]_
- `modified`: use the modified locally linear embedding algorithm.
see reference [3]_
- `ltsa`: use local tangent space alignment algorithm. see
reference [4]_
hessian_tol : float, default=1e-4
Tolerance for Hessian eigenmapping method.
Only used if ``method == 'hessian'``.
modified_tol : float, default=1e-12
Tolerance for modified LLE method.
Only used if ``method == 'modified'``.
neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
default='auto'
Algorithm to use for nearest neighbors search, passed to
:class:`~sklearn.neighbors.NearestNeighbors` instance.
random_state : int, RandomState instance, default=None
Determines the random number generator when
``eigen_solver`` == 'arpack'. Pass an int for reproducible results
across multiple function calls. See :term:`Glossary <random_state>`.
n_jobs : int or None, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
embedding_ : array-like, shape [n_samples, n_components]
Stores the embedding vectors.
reconstruction_error_ : float
Reconstruction error associated with `embedding_`.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
nbrs_ : NearestNeighbors object
Stores nearest neighbors instance, including BallTree or KDTree
if applicable.
See Also
--------
SpectralEmbedding : Spectral embedding for non-linear dimensionality
reduction.
TSNE : T-distributed Stochastic Neighbor Embedding.
References
----------
.. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
by locally linear embedding. Science 290:2323 (2000).
.. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
linear embedding techniques for high-dimensional data.
Proc Natl Acad Sci U S A. 100:5591 (2003).
.. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
Embedding Using Multiple Weights.
<https://citeseerx.ist.psu.edu/doc_view/pid/0b060fdbd92cbcc66b383bcaa9ba5e5e624d7ee3>`_
.. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
dimensionality reduction via tangent space alignment.
Journal of Shanghai Univ. 8:406 (2004)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import LocallyLinearEmbedding
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = LocallyLinearEmbedding(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
"""
_parameter_constraints: dict = {
"n_neighbors": [Interval(Integral, 1, None, closed="left")],
"n_components": [Interval(Integral, 1, None, closed="left")],
"reg": [Interval(Real, 0, None, closed="left")],
"eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
"tol": [Interval(Real, 0, None, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"method": [StrOptions({"standard", "hessian", "modified", "ltsa"})],
"hessian_tol": [Interval(Real, 0, None, closed="left")],
"modified_tol": [Interval(Real, 0, None, closed="left")],
"neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})],
"random_state": ["random_state"],
"n_jobs": [None, Integral],
}
def __init__(
self,
*,
n_neighbors=5,
n_components=2,
reg=1e-3,
eigen_solver="auto",
tol=1e-6,
max_iter=100,
method="standard",
hessian_tol=1e-4,
modified_tol=1e-12,
neighbors_algorithm="auto",
random_state=None,
n_jobs=None,
):
self.n_neighbors = n_neighbors
self.n_components = n_components
self.reg = reg
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.method = method
self.hessian_tol = hessian_tol
self.modified_tol = modified_tol
self.random_state = random_state
self.neighbors_algorithm = neighbors_algorithm
self.n_jobs = n_jobs
def _fit_transform(self, X):
self.nbrs_ = NearestNeighbors(
n_neighbors=self.n_neighbors,
algorithm=self.neighbors_algorithm,
n_jobs=self.n_jobs,
)
random_state = check_random_state(self.random_state)
X = validate_data(self, X, dtype=float)
self.nbrs_.fit(X)
self.embedding_, self.reconstruction_error_ = _locally_linear_embedding(
X=self.nbrs_,
n_neighbors=self.n_neighbors,
n_components=self.n_components,
eigen_solver=self.eigen_solver,
tol=self.tol,
max_iter=self.max_iter,
method=self.method,
hessian_tol=self.hessian_tol,
modified_tol=self.modified_tol,
random_state=random_state,
reg=self.reg,
n_jobs=self.n_jobs,
)
self._n_features_out = self.embedding_.shape[1]
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Compute the embedding vectors for data X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training set.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : object
Fitted `LocallyLinearEmbedding` class instance.
"""
self._fit_transform(X)
return self
@_fit_context(prefer_skip_nested_validation=True)
def fit_transform(self, X, y=None):
"""Compute the embedding vectors for data X and transform X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training set.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
X_new : array-like, shape (n_samples, n_components)
Embedding of the training data in the low-dimensional space.
"""
self._fit_transform(X)
return self.embedding_
def transform(self, X):
"""
Transform new points into embedding space.
Parameters
----------
X : array-like of shape (n_samples, n_features)
New points to transform into the embedding space.
Returns
-------
X_new : ndarray of shape (n_samples, n_components)
Embedding of the new points in the low-dimensional space.
Notes
-----
Because of the scaling performed by this method, avoid using it
together with methods that are not scale-invariant (like SVMs).
"""
check_is_fitted(self)
X = validate_data(self, X, reset=False)
ind = self.nbrs_.kneighbors(
X, n_neighbors=self.n_neighbors, return_distance=False
)
weights = barycenter_weights(X, self.nbrs_._fit_X, ind, reg=self.reg)
X_new = np.empty((X.shape[0], self.n_components))
for i in range(X.shape[0]):
X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i])
return X_new
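# End-to-end sketch of the estimator, including the out-of-sample
# `transform` path described above (illustrative; values are indicative).
def _lle_demo():
    from sklearn.datasets import make_swiss_roll

    X, _ = make_swiss_roll(n_samples=300, random_state=0)
    lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2,
                                 random_state=0).fit(X[:250])
    X_new = lle.transform(X[250:])  # barycenter-weight projection
    print(lle.embedding_.shape, X_new.shape)  # (250, 2) (50, 2)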


@@ -0,0 +1,836 @@
"""
Multi-dimensional Scaling (MDS).
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from numbers import Integral, Real
import numpy as np
from joblib import effective_n_jobs
from sklearn.base import BaseEstimator, _fit_context
from sklearn.isotonic import IsotonicRegression
from sklearn.manifold import ClassicalMDS
from sklearn.metrics import euclidean_distances, pairwise_distances
from sklearn.utils import check_array, check_random_state, check_symmetric
from sklearn.utils._param_validation import (
Hidden,
Interval,
StrOptions,
validate_params,
)
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import validate_data
def _smacof_single(
dissimilarities,
metric=True,
n_components=2,
init=None,
max_iter=300,
verbose=0,
eps=1e-6,
random_state=None,
normalized_stress=False,
):
"""Computes multidimensional scaling using SMACOF algorithm.
Parameters
----------
dissimilarities : ndarray of shape (n_samples, n_samples)
Pairwise dissimilarities between the points. Must be symmetric.
metric : bool, default=True
Compute metric or nonmetric SMACOF algorithm.
When ``False`` (i.e. non-metric MDS), dissimilarities equal to 0 are
treated as missing values.
n_components : int, default=2
Number of dimensions in which to immerse the dissimilarities. If an
``init`` array is provided, this option is overridden and the shape of
``init`` is used to determine the dimensionality of the embedding
space.
init : ndarray of shape (n_samples, n_components), default=None
Starting configuration of the embedding to initialize the algorithm. By
default, the algorithm is initialized with a randomly chosen array.
max_iter : int, default=300
Maximum number of iterations of the SMACOF algorithm for a single run.
verbose : int, default=0
Level of verbosity.
eps : float, default=1e-6
The tolerance with respect to stress (normalized by the sum of squared
embedding distances) at which to declare convergence.
.. versionchanged:: 1.7
The default value for `eps` has changed from 1e-3 to 1e-6, as a result
of a bugfix in the computation of the convergence criterion.
random_state : int, RandomState instance or None, default=None
Determines the random number generator used to initialize the centers.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
normalized_stress : bool, default=False
Whether to return normalized stress value (Stress-1) instead of raw
stress.
.. versionadded:: 1.2
.. versionchanged:: 1.7
Normalized stress is now supported for metric MDS as well.
Returns
-------
X : ndarray of shape (n_samples, n_components)
Coordinates of the points in a ``n_components``-space.
stress : float
The final value of the stress (sum of squared distance of the
disparities and the distances for all constrained points).
If `normalized_stress=True`, returns Stress-1.
A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
0.1 fair, and 0.2 poor [1]_.
n_iter : int
The number of iterations corresponding to the best stress.
References
----------
.. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
Psychometrika, 29 (1964)
.. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
.. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
Groenen P. Springer Series in Statistics (1997)
"""
dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
n_samples = dissimilarities.shape[0]
random_state = check_random_state(random_state)
dissimilarities_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
dissimilarities_flat_w = dissimilarities_flat[dissimilarities_flat != 0]
if init is None:
# Randomly choose initial configuration
X = random_state.uniform(size=n_samples * n_components)
X = X.reshape((n_samples, n_components))
else:
# the shape of init overrides n_components
n_components = init.shape[1]
if n_samples != init.shape[0]:
raise ValueError(
"init matrix should be of shape (%d, %d)" % (n_samples, n_components)
)
X = init
distances = euclidean_distances(X)
# Out of bounds condition cannot happen because we are transforming
# the training set here, but does sometimes get triggered in
# practice due to machine precision issues. Hence "clip".
ir = IsotonicRegression(out_of_bounds="clip")
old_stress = None
for it in range(max_iter):
# Compute distance and monotonic regression
if metric:
disparities = dissimilarities
else:
distances_flat = distances.ravel()
# dissimilarities with 0 are considered as missing values
distances_flat_w = distances_flat[dissimilarities_flat != 0]
# Compute the disparities using isotonic regression.
# For the first SMACOF iteration, use scaled original dissimilarities.
# (This choice follows the R implementation described in this paper:
# https://www.jstatsoft.org/article/view/v102i10)
if it < 1:
disparities_flat = dissimilarities_flat_w
else:
disparities_flat = ir.fit_transform(
dissimilarities_flat_w, distances_flat_w
)
disparities = np.zeros_like(distances_flat)
disparities[dissimilarities_flat != 0] = disparities_flat
disparities = disparities.reshape((n_samples, n_samples))
disparities *= np.sqrt(
(n_samples * (n_samples - 1) / 2) / (disparities**2).sum()
)
disparities = disparities + disparities.T
# Update X using the Guttman transform
distances[distances == 0] = 1e-5
ratio = disparities / distances
B = -ratio
B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
X = 1.0 / n_samples * np.dot(B, X)
# Compute stress
distances = euclidean_distances(X)
stress = ((distances.ravel() - disparities.ravel()) ** 2).sum() / 2
if verbose >= 2: # pragma: no cover
print(f"Iteration {it}, stress {stress:.4f}")
if old_stress is not None:
sum_squared_distances = (distances.ravel() ** 2).sum()
if ((old_stress - stress) / (sum_squared_distances / 2)) < eps:
if verbose: # pragma: no cover
print(f"Convergence criterion reached (iteration {it}).")
break
old_stress = stress
if normalized_stress:
sum_squared_distances = (distances.ravel() ** 2).sum()
stress = np.sqrt(stress / (sum_squared_distances / 2))
return X, stress, it + 1
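# Hedged illustration (not part of the library): stress majorization
# guarantees that a single Guttman transform, X <- B(X) @ X / n, does not
# increase the metric stress. A minimal self-contained sketch of the update
# performed in the loop above, assuming only numpy; the helper name is
# hypothetical.
def _demo_guttman_transform():  # pragma: no cover
    import numpy as np

    rng = np.random.default_rng(0)
    n = 6
    target = rng.random((n, 2))
    diss = np.linalg.norm(target[:, None] - target[None, :], axis=-1)
    X = rng.random((n, 2))  # arbitrary starting configuration

    def stress(Y):
        dist = np.linalg.norm(Y[:, None] - Y[None, :], axis=-1)
        return ((dist - diss) ** 2).sum() / 2

    dist = np.linalg.norm(X[:, None] - X[None, :], axis=-1)
    dist[dist == 0] = 1e-5  # avoid division by zero, as above
    ratio = diss / dist
    B = -ratio
    B[np.arange(n), np.arange(n)] += ratio.sum(axis=1)
    X_new = B @ X / n
    assert stress(X_new) <= stress(X)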
# TODO(1.9): change default `n_init` to 1, see PR #31117
@validate_params(
{
"dissimilarities": ["array-like"],
"metric": ["boolean"],
"n_components": [Interval(Integral, 1, None, closed="left")],
"init": ["array-like", None],
"n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})],
"n_jobs": [Integral, None],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"verbose": ["verbose"],
"eps": [Interval(Real, 0, None, closed="left")],
"random_state": ["random_state"],
"return_n_iter": ["boolean"],
"normalized_stress": ["boolean", StrOptions({"auto"})],
},
prefer_skip_nested_validation=True,
)
def smacof(
dissimilarities,
*,
metric=True,
n_components=2,
init=None,
n_init="warn",
n_jobs=None,
max_iter=300,
verbose=0,
eps=1e-6,
random_state=None,
return_n_iter=False,
normalized_stress="auto",
):
"""Compute multidimensional scaling using the SMACOF algorithm.
The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
multidimensional scaling algorithm which minimizes an objective function
(the *stress*) using a majorization technique. Stress majorization, also
known as the Guttman Transform, guarantees a monotone convergence of
stress, and is more powerful than traditional techniques such as gradient
descent.
The SMACOF algorithm for metric MDS can be summarized by the following
steps:
1. Set an initial start configuration, randomly or not.
2. Compute the stress
3. Compute the Guttman Transform
4. Iterate 2 and 3 until convergence.
The nonmetric algorithm adds a monotonic regression step before computing
the stress.
Parameters
----------
dissimilarities : array-like of shape (n_samples, n_samples)
Pairwise dissimilarities between the points. Must be symmetric.
metric : bool, default=True
Compute metric or nonmetric SMACOF algorithm.
When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
missing values.
n_components : int, default=2
Number of dimensions in which to immerse the dissimilarities. If an
``init`` array is provided, this option is overridden and the shape of
``init`` is used to determine the dimensionality of the embedding
space.
init : array-like of shape (n_samples, n_components), default=None
Starting configuration of the embedding to initialize the algorithm. By
default, the algorithm is initialized with a randomly chosen array.
n_init : int, default=8
Number of times the SMACOF algorithm will be run with different
initializations. The final results will be the best output of the runs,
determined by the run with the smallest final stress. If ``init`` is
provided, this option is overridden and a single run is performed.
.. versionchanged:: 1.9
The default value for `n_init` will change from 8 to 1 in version 1.9.
n_jobs : int, default=None
The number of jobs to use for the computation. If multiple
initializations are used (``n_init``), each run of the algorithm is
computed in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_iter : int, default=300
Maximum number of iterations of the SMACOF algorithm for a single run.
verbose : int, default=0
Level of verbosity.
eps : float, default=1e-6
The tolerance with respect to stress (normalized by the sum of squared
embedding distances) at which to declare convergence.
.. versionchanged:: 1.7
The default value for `eps` has changed from 1e-3 to 1e-6, as a result
of a bugfix in the computation of the convergence criterion.
random_state : int, RandomState instance or None, default=None
Determines the random number generator used to initialize the centers.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
return_n_iter : bool, default=False
Whether or not to return the number of iterations.
normalized_stress : bool or "auto", default="auto"
Whether to return normalized stress value (Stress-1) instead of raw
stress. By default, metric MDS returns raw stress while non-metric MDS
returns normalized stress.
.. versionadded:: 1.2
.. versionchanged:: 1.4
The default value changed from `False` to `"auto"` in version 1.4.
.. versionchanged:: 1.7
Normalized stress is now supported for metric MDS as well.
Returns
-------
X : ndarray of shape (n_samples, n_components)
Coordinates of the points in a ``n_components``-space.
stress : float
The final value of the stress (sum of squared differences between the
disparities and the distances for all constrained points).
If `normalized_stress=True`, returns Stress-1.
A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
0.1 fair, and 0.2 poor [1]_.
n_iter : int
The number of iterations corresponding to the best stress. Returned
only if ``return_n_iter`` is set to ``True``.
References
----------
.. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
Psychometrika, 29 (1964)
.. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
.. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
Groenen P. Springer Series in Statistics (1997)
Examples
--------
>>> import numpy as np
>>> from sklearn.manifold import smacof
>>> from sklearn.metrics import euclidean_distances
>>> X = np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]])
>>> dissimilarities = euclidean_distances(X)
>>> Z, stress = smacof(
... dissimilarities, n_components=2, n_init=1, eps=1e-6, random_state=42
... )
>>> Z.shape
(3, 2)
>>> np.round(stress, 6).item()
3.2e-05
"""
if n_init == "warn":
warnings.warn(
"The default value of `n_init` will change from 8 to 1 in 1.9.",
FutureWarning,
)
n_init = 8
dissimilarities = check_array(dissimilarities)
random_state = check_random_state(random_state)
if normalized_stress == "auto":
normalized_stress = not metric
if hasattr(init, "__array__"):
init = np.asarray(init).copy()
if n_init != 1:
warnings.warn(
"Explicit initial positions passed: "
"performing only one init of the MDS instead of %d" % n_init
)
n_init = 1
best_pos, best_stress = None, None
if effective_n_jobs(n_jobs) == 1:
for it in range(n_init):
pos, stress, n_iter_ = _smacof_single(
dissimilarities,
metric=metric,
n_components=n_components,
init=init,
max_iter=max_iter,
verbose=verbose,
eps=eps,
random_state=random_state,
normalized_stress=normalized_stress,
)
if best_stress is None or stress < best_stress:
best_stress = stress
best_pos = pos.copy()
best_iter = n_iter_
else:
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
delayed(_smacof_single)(
dissimilarities,
metric=metric,
n_components=n_components,
init=init,
max_iter=max_iter,
verbose=verbose,
eps=eps,
random_state=seed,
normalized_stress=normalized_stress,
)
for seed in seeds
)
positions, stress, n_iters = zip(*results)
best = np.argmin(stress)
best_stress = stress[best]
best_pos = positions[best]
best_iter = n_iters[best]
if return_n_iter:
return best_pos, best_stress, best_iter
else:
return best_pos, best_stress
# TODO(1.9): change default `n_init` to 1, see PR #31117
# TODO(1.10): change default `init` to "classical_mds", see PR #32229
# TODO(1.10): drop support for boolean `metric`, see PR #32229
# TODO(1.10): drop support for `dissimilarity`, see PR #32229
class MDS(BaseEstimator):
"""Multidimensional scaling.
Read more in the :ref:`User Guide <multidimensional_scaling>`.
Parameters
----------
n_components : int, default=2
Number of dimensions in which to immerse the dissimilarities.
metric_mds : bool, default=True
If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
missing values.
.. versionchanged:: 1.8
The parameter `metric` was renamed into `metric_mds`.
n_init : int, default=4
Number of times the SMACOF algorithm will be run with different
initializations. The final results will be the best output of the runs,
determined by the run with the smallest final stress.
.. versionchanged:: 1.9
The default value for `n_init` will change from 4 to 1 in version 1.9.
init : {'random', 'classical_mds'}, default='random'
The initialization approach. If `random`, random initialization is used.
If `classical_mds`, then classical MDS is run and used as initialization
for MDS (in this case, the value of `n_init` is ignored).
.. versionadded:: 1.8
.. versionchanged:: 1.10
The default value for `init` will change to `classical_mds`.
max_iter : int, default=300
Maximum number of iterations of the SMACOF algorithm for a single run.
verbose : int, default=0
Level of verbosity.
eps : float, default=1e-6
The tolerance with respect to stress (normalized by the sum of squared
embedding distances) at which to declare convergence.
.. versionchanged:: 1.7
The default value for `eps` has changed from 1e-3 to 1e-6, as a result
of a bugfix in the computation of the convergence criterion.
n_jobs : int, default=None
The number of jobs to use for the computation. If multiple
initializations are used (``n_init``), each run of the algorithm is
computed in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
random_state : int, RandomState instance or None, default=None
Determines the random number generator used to initialize the centers.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
dissimilarity : {'euclidean', 'precomputed'}
Dissimilarity measure to use:
- 'euclidean':
Pairwise Euclidean distances between points in the dataset.
- 'precomputed':
Pre-computed dissimilarities are passed directly to ``fit`` and
``fit_transform``.
.. deprecated:: 1.8
`dissimilarity` was renamed `metric` in 1.8 and will be removed in 1.10.
metric : str or callable, default='euclidean'
Metric to use for dissimilarity computation. Default is "euclidean".
If metric is a string, it must be one of the options allowed by
`scipy.spatial.distance.pdist` for its metric parameter, or a metric
listed in :func:`sklearn.metrics.pairwise.distance_metrics`
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit.
If metric is a callable function, it takes two arrays representing 1D
vectors as inputs and must return one value indicating the distance
between those vectors. This works for Scipy's metrics, but is less
efficient than passing the metric name as a string.
.. versionchanged:: 1.8
Prior to 1.8, `metric=True/False` was used to select metric/non-metric
MDS, which is now the role of `metric_mds`. The support for ``True``
and ``False`` will be dropped in version 1.10, use `metric_mds` instead.
metric_params : dict, default=None
Additional keyword arguments for the dissimilarity computation.
.. versionadded:: 1.8
normalized_stress : bool or "auto", default="auto"
Whether to return normalized stress value (Stress-1) instead of raw
stress. By default, metric MDS returns raw stress while non-metric MDS
returns normalized stress.
.. versionadded:: 1.2
.. versionchanged:: 1.4
The default value changed from `False` to `"auto"` in version 1.4.
.. versionchanged:: 1.7
Normalized stress is now supported for metric MDS as well.
Attributes
----------
embedding_ : ndarray of shape (n_samples, n_components)
Stores the position of the dataset in the embedding space.
stress_ : float
The final value of the stress (sum of squared differences between the
disparities and the distances for all constrained points).
If `normalized_stress=True`, returns Stress-1.
A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
0.1 fair, and 0.2 poor [1]_.
dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
Pairwise dissimilarities between the points. Symmetric matrix that:
- either uses a custom dissimilarity matrix by setting `dissimilarity`
to 'precomputed';
- or constructs a dissimilarity matrix from data using
Euclidean distances.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_iter_ : int
The number of iterations corresponding to the best stress.
See Also
--------
sklearn.decomposition.PCA : Principal component analysis that is a linear
dimensionality reduction method.
sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
kernels and PCA.
TSNE : T-distributed Stochastic Neighbor Embedding.
Isomap : Manifold learning based on Isometric Mapping.
LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
SpectralEmbedding : Spectral embedding for non-linear dimensionality.
References
----------
.. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
Psychometrika, 29 (1964)
.. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
.. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
Groenen P. Springer Series in Statistics (1997)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import MDS
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = MDS(n_components=2, n_init=1, init="random")
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
For a more detailed example of usage, see
:ref:`sphx_glr_auto_examples_manifold_plot_mds.py`.
For a comparison of manifold learning techniques, see
:ref:`sphx_glr_auto_examples_manifold_plot_compare_methods.py`.
"""
_parameter_constraints: dict = {
"n_components": [Interval(Integral, 1, None, closed="left")],
"metric_mds": ["boolean"],
"n_init": [
Interval(Integral, 1, None, closed="left"),
Hidden(StrOptions({"warn"})),
],
"init": [StrOptions({"random", "classical_mds"}), Hidden(StrOptions({"warn"}))],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"verbose": ["verbose"],
"eps": [Interval(Real, 0.0, None, closed="left")],
"n_jobs": [None, Integral],
"random_state": ["random_state"],
"dissimilarity": [
StrOptions({"euclidean", "precomputed"}),
Hidden(StrOptions({"deprecated"})),
],
"metric": [str, callable, Hidden("boolean")],
"metric_params": [dict, None],
"normalized_stress": ["boolean", StrOptions({"auto"})],
}
def __init__(
self,
n_components=2,
*,
metric_mds=True,
n_init="warn",
init="warn",
max_iter=300,
verbose=0,
eps=1e-6,
n_jobs=None,
random_state=None,
dissimilarity="deprecated",
metric="euclidean",
metric_params=None,
normalized_stress="auto",
):
self.n_components = n_components
self.dissimilarity = dissimilarity
self.metric = metric
self.metric_params = metric_params
self.metric_mds = metric_mds
self.n_init = n_init
self.init = init
self.max_iter = max_iter
self.eps = eps
self.verbose = verbose
self.n_jobs = n_jobs
self.random_state = random_state
self.normalized_stress = normalized_stress
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.pairwise = (self.dissimilarity == "precomputed") | (
self.metric == "precomputed"
)
return tags
def fit(self, X, y=None, init=None):
"""
Compute the position of the points in the embedding space.
Parameters
----------
X : array-like of shape (n_samples, n_features) or \
(n_samples, n_samples)
Input data. If ``metric=='precomputed'``, the input should
be the dissimilarity matrix.
y : Ignored
Not used, present for API consistency by convention.
init : ndarray of shape (n_samples, n_components), default=None
Starting configuration of the embedding to initialize the SMACOF
algorithm. By default, the algorithm is initialized with a randomly
chosen array.
Returns
-------
self : object
Fitted estimator.
"""
self.fit_transform(X, init=init)
return self
@_fit_context(prefer_skip_nested_validation=True)
def fit_transform(self, X, y=None, init=None):
"""
Fit the data from `X`, and returns the embedded coordinates.
Parameters
----------
X : array-like of shape (n_samples, n_features) or \
(n_samples, n_samples)
Input data. If ``metric=='precomputed'``, the input should
be the dissimilarity matrix.
y : Ignored
Not used, present for API consistency by convention.
init : ndarray of shape (n_samples, n_components), default=None
Starting configuration of the embedding to initialize the SMACOF
algorithm. By default, the algorithm is initialized with a randomly
chosen array.
Returns
-------
X_new : ndarray of shape (n_samples, n_components)
X transformed in the new space.
"""
if self.n_init == "warn":
warnings.warn(
"The default value of `n_init` will change from 4 to 1 in 1.9. "
"To suppress this warning, provide some value of `n_init`.",
FutureWarning,
)
self._n_init = 4
else:
self._n_init = self.n_init
if self.init == "warn":
warnings.warn(
"The default value of `init` will change from 'random' to "
"'classical_mds' in 1.10. To suppress this warning, provide "
"some value of `init`.",
FutureWarning,
)
self._init = "random"
else:
self._init = self.init
if self.dissimilarity != "deprecated":
if not isinstance(self.metric, bool) and self.metric != "euclidean":
raise ValueError(
"You provided both `dissimilarity` and `metric`. Please use "
"only `metric`."
)
else:
warnings.warn(
"The `dissimilarity` parameter is deprecated and will be "
"removed in 1.10. Use `metric` instead.",
FutureWarning,
)
self._metric = self.dissimilarity
if isinstance(self.metric, bool):
warnings.warn(
f"Use metric_mds={self.metric} instead of metric={self.metric}. The "
"support for metric={True/False} will be dropped in 1.10.",
FutureWarning,
)
if self.dissimilarity == "deprecated":
self._metric = "euclidean"
self._metric_mds = self.metric
else:
if self.dissimilarity == "deprecated":
self._metric = self.metric
self._metric_mds = self.metric_mds
X = validate_data(self, X)
if X.shape[0] == X.shape[1] and self._metric != "precomputed":
warnings.warn(
"The provided input is a square matrix. Note that ``fit`` constructs "
"a dissimilarity matrix from data and will treat rows as samples "
"and columns as features. To use a pre-computed dissimilarity matrix, "
"set ``metric='precomputed'``."
)
if self._metric == "precomputed":
self.dissimilarity_matrix_ = X
self.dissimilarity_matrix_ = check_symmetric(
self.dissimilarity_matrix_, raise_exception=True
)
else:
self.dissimilarity_matrix_ = pairwise_distances(
X,
metric=self._metric,
**(self.metric_params if self.metric_params is not None else {}),
)
if init is not None:
init_array = init
elif self._init == "classical_mds":
cmds = ClassicalMDS(metric="precomputed")
init_array = cmds.fit_transform(self.dissimilarity_matrix_)
else:
init_array = None
self.embedding_, self.stress_, self.n_iter_ = smacof(
self.dissimilarity_matrix_,
metric=self._metric_mds,
n_components=self.n_components,
init=init_array,
n_init=self._n_init,
n_jobs=self.n_jobs,
max_iter=self.max_iter,
verbose=self.verbose,
eps=self.eps,
random_state=self.random_state,
return_n_iter=True,
normalized_stress=self.normalized_stress,
)
return self.embedding_
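# Hedged usage sketch (not part of the library's test suite): fitting this
# estimator on a precomputed dissimilarity matrix via `metric="precomputed"`.
# The helper name is hypothetical; explicit `n_init`/`init` values avoid the
# deprecation warnings handled above.
def _demo_mds_precomputed():  # pragma: no cover
    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.random((20, 5))
    # pairwise Euclidean dissimilarities, computed by hand
    D = np.linalg.norm(X[:, None] - X[None, :], axis=-1)
    mds = MDS(n_components=2, metric="precomputed", n_init=1, init="random")
    Z = mds.fit_transform(D)
    assert Z.shape == (20, 2)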

View File

@@ -0,0 +1,772 @@
"""Spectral Embedding."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from numbers import Integral, Real
import numpy as np
from scipy import sparse
from scipy.linalg import eigh
from scipy.sparse.csgraph import connected_components
from scipy.sparse.linalg import eigsh, lobpcg
from sklearn.base import BaseEstimator, _fit_context
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors, kneighbors_graph
from sklearn.utils import check_array, check_random_state, check_symmetric
from sklearn.utils._arpack import _init_arpack_v0
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils.fixes import laplacian as csgraph_laplacian
from sklearn.utils.fixes import parse_version, sp_version
from sklearn.utils.validation import validate_data
def _graph_connected_component(graph, node_id):
"""Find the largest graph connected components that contains one
given node.
Parameters
----------
graph : array-like of shape (n_samples, n_samples)
Adjacency matrix of the graph, non-zero weight means an edge
between the nodes.
node_id : int
The index of the query node of the graph.
Returns
-------
connected_components_matrix : array-like of shape (n_samples,)
An array of bool values indicating which nodes belong to the
connected component containing the given query node.
"""
n_node = graph.shape[0]
if sparse.issparse(graph):
# speed up row-wise access to boolean connection mask
graph = graph.tocsr()
connected_nodes = np.zeros(n_node, dtype=bool)
nodes_to_explore = np.zeros(n_node, dtype=bool)
nodes_to_explore[node_id] = True
for _ in range(n_node):
last_num_component = connected_nodes.sum()
np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)
if last_num_component >= connected_nodes.sum():
break
indices = np.where(nodes_to_explore)[0]
nodes_to_explore.fill(False)
for i in indices:
if sparse.issparse(graph):
# scipy not yet implemented 1D sparse slices; can be changed back to
# `neighbors = graph[i].toarray().ravel()` once implemented
neighbors = graph[[i], :].toarray().ravel()
else:
neighbors = graph[i]
np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)
return connected_nodes
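# Hedged illustration (not part of the library): on a graph with two
# components, the frontier expansion above returns exactly the component
# of the query node. The helper name is hypothetical.
def _demo_graph_connected_component():  # pragma: no cover
    import numpy as np

    graph = np.array(
        [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]], dtype=float
    )
    assert _graph_connected_component(graph, 0).tolist() == [True, True, False, False]
    assert _graph_connected_component(graph, 2).tolist() == [False, False, True, True]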
def _graph_is_connected(graph):
"""Return whether the graph is connected (True) or Not (False).
Parameters
----------
graph : {array-like, sparse matrix} of shape (n_samples, n_samples)
Adjacency matrix of the graph, non-zero weight means an edge
between the nodes.
Returns
-------
is_connected : bool
True means the graph is fully connected and False means not.
"""
if sparse.issparse(graph):
# Before Scipy 1.11.3, `connected_components` only supports 32-bit indices.
# PR: https://github.com/scipy/scipy/pull/18913
# First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
# TODO(jjerphan): Once SciPy 1.11.3 is the minimum supported version, use
# `accept_large_sparse=True`.
accept_large_sparse = sp_version >= parse_version("1.11.3")
graph = check_array(
graph, accept_sparse=True, accept_large_sparse=accept_large_sparse
)
# sparse graph, find all the connected components
n_connected_components, _ = connected_components(graph)
return n_connected_components == 1
else:
# dense graph, find all connected components start from node 0
return _graph_connected_component(graph, 0).sum() == graph.shape[0]
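# Hedged illustration (not part of the library): connectivity check on a
# 3-node path graph versus the same graph with one edge removed. The helper
# name is hypothetical.
def _demo_graph_is_connected():  # pragma: no cover
    import numpy as np

    path = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype=float)
    assert _graph_is_connected(path)
    broken = path.copy()
    broken[1, 2] = broken[2, 1] = 0  # node 2 becomes unreachable
    assert not _graph_is_connected(broken)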
def _set_diag(laplacian, value, norm_laplacian):
"""Set the diagonal of the laplacian matrix and convert it to a
sparse format well suited for eigenvalue decomposition.
Parameters
----------
laplacian : {ndarray, sparse matrix}
The graph laplacian.
value : float
The value of the diagonal.
norm_laplacian : bool
Whether the value of the diagonal should be changed or not.
Returns
-------
laplacian : {array, sparse matrix}
An array or sparse matrix in a form that is well suited to fast
eigenvalue decomposition, depending on the bandwidth of the matrix.
"""
n_nodes = laplacian.shape[0]
# We need to set all entries on the diagonal to `value`
if not sparse.issparse(laplacian):
if norm_laplacian:
laplacian.flat[:: n_nodes + 1] = value
else:
laplacian = laplacian.tocoo()
if norm_laplacian:
diag_idx = laplacian.row == laplacian.col
laplacian.data[diag_idx] = value
# If the matrix has a small number of diagonals (as in the
# case of structured matrices coming from images), the
# dia format might be best suited for matvec products:
n_diags = np.unique(laplacian.row - laplacian.col).size
if n_diags <= 7:
# 3 or less outer diagonals on each side
laplacian = laplacian.todia()
else:
# csr has the fastest matvec and is thus best suited to
# arpack
laplacian = laplacian.tocsr()
return laplacian
@validate_params(
{
"adjacency": ["array-like", "sparse matrix"],
"n_components": [Interval(Integral, 1, None, closed="left")],
"eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
"random_state": ["random_state"],
"eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
"norm_laplacian": ["boolean"],
"drop_first": ["boolean"],
},
prefer_skip_nested_validation=True,
)
def spectral_embedding(
adjacency,
*,
n_components=8,
eigen_solver=None,
random_state=None,
eigen_tol="auto",
norm_laplacian=True,
drop_first=True,
):
"""Project the sample on the first eigenvectors of the graph Laplacian.
The adjacency matrix is used to compute a normalized graph Laplacian
whose spectrum (especially the eigenvectors associated to the
smallest eigenvalues) has an interpretation in terms of minimal
number of cuts necessary to split the graph into comparably sized
components.
This embedding can also 'work' even if the ``adjacency`` variable is
not strictly the adjacency matrix of a graph but more generally
an affinity or similarity matrix between samples (for instance the
heat kernel of a euclidean distance matrix or a k-NN matrix).
However, care must be taken to always make the affinity matrix symmetric
so that the eigenvector decomposition works as expected.
Note: Laplacian Eigenmaps is the actual algorithm implemented here.
Read more in the :ref:`User Guide <spectral_embedding>`.
Parameters
----------
adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)
The adjacency matrix of the graph to embed.
n_components : int, default=8
The dimension of the projection subspace.
eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems,
but may also lead to instabilities. If None, then ``'arpack'`` is
used.
random_state : int, RandomState instance or None, default=None
A pseudo random number generator used for the initialization
of the lobpcg eigen vectors decomposition when `eigen_solver ==
'amg'`, and for the K-Means initialization. Use an int to make
the results deterministic across calls (See
:term:`Glossary <random_state>`).
.. note::
When using `eigen_solver == 'amg'`,
it is necessary to also fix the global numpy seed with
`np.random.seed(int)` to get deterministic results. See
https://github.com/pyamg/pyamg/issues/139 for further
information.
eigen_tol : float, default="auto"
Stopping criterion for eigendecomposition of the Laplacian matrix.
If `eigen_tol="auto"` then the passed tolerance will depend on the
`eigen_solver`:
- If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
- If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
`eigen_tol=None` which configures the underlying `lobpcg` solver to
automatically resolve the value according to their heuristics. See,
:func:`scipy.sparse.linalg.lobpcg` for details.
Note that when using `eigen_solver="amg"` values of `tol<1e-5` may lead
to convergence issues and should be avoided.
.. versionadded:: 1.2
Added 'auto' option.
norm_laplacian : bool, default=True
If True, then compute symmetric normalized Laplacian.
drop_first : bool, default=True
Whether to drop the first eigenvector. For spectral embedding, this
should be True as the first eigenvector should be constant vector for
connected graph, but for spectral clustering, this should be kept as
False to retain the first eigenvector.
Returns
-------
embedding : ndarray of shape (n_samples, n_components)
The reduced samples.
Notes
-----
Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
has one connected component. If the graph has many components, the first
few eigenvectors will simply uncover the connected components of the graph.
References
----------
* https://en.wikipedia.org/wiki/LOBPCG
* :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal
Block Preconditioned Conjugate Gradient Method",
Andrew V. Knyazev
<10.1137/S1064827500366124>`
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.neighbors import kneighbors_graph
>>> from sklearn.manifold import spectral_embedding
>>> X, _ = load_digits(return_X_y=True)
>>> X = X[:100]
>>> affinity_matrix = kneighbors_graph(
... X, n_neighbors=int(X.shape[0] / 10), include_self=True
... )
>>> # make the matrix symmetric
>>> affinity_matrix = 0.5 * (affinity_matrix + affinity_matrix.T)
>>> embedding = spectral_embedding(affinity_matrix, n_components=2, random_state=42)
>>> embedding.shape
(100, 2)
"""
random_state = check_random_state(random_state)
return _spectral_embedding(
adjacency,
n_components=n_components,
eigen_solver=eigen_solver,
random_state=random_state,
eigen_tol=eigen_tol,
norm_laplacian=norm_laplacian,
drop_first=drop_first,
)
def _spectral_embedding(
adjacency,
*,
n_components=8,
eigen_solver=None,
random_state=None,
eigen_tol="auto",
norm_laplacian=True,
drop_first=True,
):
adjacency = check_symmetric(adjacency)
if eigen_solver == "amg":
try:
from pyamg import smoothed_aggregation_solver
except ImportError as e:
raise ValueError(
"The eigen_solver was set to 'amg', but pyamg is not available."
) from e
if eigen_solver is None:
eigen_solver = "arpack"
n_nodes = adjacency.shape[0]
# Whether to drop the first eigenvector
if drop_first:
n_components = n_components + 1
if not _graph_is_connected(adjacency):
warnings.warn(
"Graph is not fully connected, spectral embedding may not work as expected."
)
laplacian, dd = csgraph_laplacian(
adjacency, normed=norm_laplacian, return_diag=True
)
if eigen_solver == "arpack" or (
eigen_solver != "lobpcg"
and (not sparse.issparse(laplacian) or n_nodes < 5 * n_components)
):
# lobpcg used with eigen_solver='amg' has bugs for low number of nodes
# for details see the source code in scipy:
# https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
# /lobpcg/lobpcg.py#L237
# or matlab:
# https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
laplacian = _set_diag(laplacian, 1, norm_laplacian)
# Here we'll use shift-invert mode for fast eigenvalues
# (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
# for a short explanation of what this means)
# Because the normalized Laplacian has eigenvalues between 0 and 2,
# I - L has eigenvalues between -1 and 1. ARPACK is most efficient
# when finding eigenvalues of largest magnitude (keyword which='LM')
# and when these eigenvalues are very large compared to the rest.
# For very large, very sparse graphs, I - L can have many, many
# eigenvalues very near 1.0. This leads to slow convergence. So
# instead, we'll use ARPACK's shift-invert mode, asking for the
# eigenvalues near 1.0. This effectively spreads out the spectrum
# near 1.0 and leads to much faster convergence: potentially an
# orders-of-magnitude speedup over simply using keyword which='LA'
# in standard mode.
try:
# We are computing the opposite of the laplacian inplace so as
# to spare a memory allocation of a possibly very large array
tol = 0 if eigen_tol == "auto" else eigen_tol
laplacian *= -1
v0 = _init_arpack_v0(laplacian.shape[0], random_state)
laplacian = check_array(
laplacian, accept_sparse="csr", accept_large_sparse=False
)
_, diffusion_map = eigsh(
laplacian, k=n_components, sigma=1.0, which="LM", tol=tol, v0=v0
)
embedding = diffusion_map.T[n_components::-1]
if norm_laplacian:
# recover u = D^-1/2 x from the eigenvector output x
embedding = embedding / dd
except RuntimeError:
# When submatrices are exactly singular, an LU decomposition
# in arpack fails. We fallback to lobpcg
eigen_solver = "lobpcg"
# Revert the laplacian to its opposite to have lobpcg work
laplacian *= -1
elif eigen_solver == "amg":
# Use AMG to get a preconditioner and speed up the eigenvalue
# problem.
if not sparse.issparse(laplacian):
warnings.warn("AMG works better for sparse matrices")
laplacian = check_array(
laplacian, dtype=[np.float64, np.float32], accept_sparse=True
)
laplacian = _set_diag(laplacian, 1, norm_laplacian)
# The Laplacian matrix is always singular, having at least one zero
# eigenvalue, corresponding to the trivial eigenvector, which is a
# constant. Using a singular matrix for preconditioning may result in
# random failures in LOBPCG and is not supported by the existing
# theory:
# see https://doi.org/10.1007/s10208-015-9297-1
# Shift the Laplacian so its diagonal is not all ones. The shift
# does change the eigenpairs however, so we'll feed the shifted
# matrix to the solver and afterward set it back to the original.
diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
laplacian += diag_shift
if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array):
# `pyamg` does not work with `csr_array` and we need to convert it to a
# `csr_matrix` object.
laplacian = sparse.csr_matrix(laplacian)
ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr"))
laplacian -= diag_shift
M = ml.aspreconditioner()
# Create initial approximation X to eigenvectors
X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1))
X[:, 0] = dd.ravel()
X = X.astype(laplacian.dtype)
tol = None if eigen_tol == "auto" else eigen_tol
_, diffusion_map = lobpcg(laplacian, X, M=M, tol=tol, largest=False)
embedding = diffusion_map.T
if norm_laplacian:
# recover u = D^-1/2 x from the eigenvector output x
embedding = embedding / dd
if embedding.shape[0] == 1:
    raise ValueError("lobpcg failed to compute a valid spectral embedding.")
if eigen_solver == "lobpcg":
laplacian = check_array(
laplacian, dtype=[np.float64, np.float32], accept_sparse=True
)
if n_nodes < 5 * n_components + 1:
# see note above under arpack why lobpcg has problems with small
# number of nodes
# lobpcg will fallback to eigh, so we short circuit it
if sparse.issparse(laplacian):
laplacian = laplacian.toarray()
_, diffusion_map = eigh(laplacian, check_finite=False)
embedding = diffusion_map.T[:n_components]
if norm_laplacian:
# recover u = D^-1/2 x from the eigenvector output x
embedding = embedding / dd
else:
laplacian = _set_diag(laplacian, 1, norm_laplacian)
# We increase the number of eigenvectors requested, as lobpcg
# doesn't behave well in low dimension, and create the initial
# approximation X to the eigenvectors
X = random_state.standard_normal(
size=(laplacian.shape[0], n_components + 1)
)
X[:, 0] = dd.ravel()
X = X.astype(laplacian.dtype)
tol = None if eigen_tol == "auto" else eigen_tol
_, diffusion_map = lobpcg(
laplacian, X, tol=tol, largest=False, maxiter=2000
)
embedding = diffusion_map.T[:n_components]
if norm_laplacian:
# recover u = D^-1/2 x from the eigenvector output x
embedding = embedding / dd
if embedding.shape[0] == 1:
    raise ValueError("lobpcg failed to compute a valid spectral embedding.")
embedding = _deterministic_vector_sign_flip(embedding)
if drop_first:
return embedding[1:n_components].T
else:
return embedding[:n_components].T
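# Hedged illustration (not part of the library): the ARPACK shift-invert
# trick described in the comments above. For a normalized Laplacian L with
# spectrum in [0, 2], asking eigsh for the eigenvalues of -L closest to
# sigma=1.0 returns the smallest eigenvalues of L. A minimal sketch on a
# path graph, assuming only scipy; the helper name is hypothetical.
def _demo_shift_invert():  # pragma: no cover
    import numpy as np
    from scipy import sparse as sp
    from scipy.sparse.csgraph import laplacian
    from scipy.sparse.linalg import eigsh as sp_eigsh

    n = 200
    ones = np.ones(n - 1)
    adjacency = sp.diags([ones, ones], offsets=[-1, 1], format="csr")
    lap = sp.csr_matrix(laplacian(adjacency, normed=True))
    w_ref, _ = sp_eigsh(lap, k=3, which="SM")  # reference: slow standard mode
    w_si, _ = sp_eigsh(-lap, k=3, sigma=1.0, which="LM")  # shift-invert
    assert np.allclose(np.sort(w_ref), np.sort(-w_si), atol=1e-6)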
class SpectralEmbedding(BaseEstimator):
"""Spectral embedding for non-linear dimensionality reduction.
Forms an affinity matrix given by the specified function and
applies spectral decomposition to the corresponding graph laplacian.
The resulting transformation is given by the value of the
eigenvectors for each data point.
Note: Laplacian Eigenmaps is the actual algorithm implemented here.
Read more in the :ref:`User Guide <spectral_embedding>`.
Parameters
----------
n_components : int, default=2
The dimension of the projected subspace.
affinity : {'nearest_neighbors', 'rbf', 'precomputed', \
'precomputed_nearest_neighbors'} or callable, \
default='nearest_neighbors'
How to construct the affinity matrix.
- 'nearest_neighbors' : construct the affinity matrix by computing a
graph of nearest neighbors.
- 'rbf' : construct the affinity matrix by computing a radial basis
function (RBF) kernel.
- 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
- 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
of precomputed nearest neighbors, and constructs the affinity matrix
by selecting the ``n_neighbors`` nearest neighbors.
- callable : use the passed-in function as the affinity;
the function takes in a data matrix (n_samples, n_features)
and returns an affinity matrix (n_samples, n_samples).
gamma : float, default=None
Kernel coefficient for rbf kernel. If None, gamma will be set to
1/n_features.
random_state : int, RandomState instance or None, default=None
A pseudo random number generator used for the initialization
of the lobpcg eigen vectors decomposition when `eigen_solver ==
'amg'`, and for the K-Means initialization. Use an int to make
the results deterministic across calls (See
:term:`Glossary <random_state>`).
.. note::
When using `eigen_solver == 'amg'`,
it is necessary to also fix the global numpy seed with
`np.random.seed(int)` to get deterministic results. See
https://github.com/pyamg/pyamg/issues/139 for further
information.
eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems.
If None, then ``'arpack'`` is used.
eigen_tol : float, default="auto"
Stopping criterion for eigendecomposition of the Laplacian matrix.
If `eigen_tol="auto"` then the passed tolerance will depend on the
`eigen_solver`:
- If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
- If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
`eigen_tol=None` which configures the underlying `lobpcg` solver to
automatically resolve the value according to their heuristics. See,
:func:`scipy.sparse.linalg.lobpcg` for details.
Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
values of `tol<1e-5` may lead to convergence issues and should be
avoided.
.. versionadded:: 1.2
n_neighbors : int, default=None
Number of nearest neighbors for nearest_neighbors graph building.
If None, n_neighbors will be set to max(n_samples/10, 1).
n_jobs : int, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
embedding_ : ndarray of shape (n_samples, n_components)
Spectral embedding of the training matrix.
affinity_matrix_ : ndarray of shape (n_samples, n_samples)
Affinity_matrix constructed from samples or precomputed.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_neighbors_ : int
Number of nearest neighbors effectively used.
See Also
--------
Isomap : Non-linear dimensionality reduction through Isometric Mapping.
References
----------
- :doi:`A Tutorial on Spectral Clustering, 2007
Ulrike von Luxburg
<10.1007/s11222-007-9033-z>`
- `On Spectral Clustering: Analysis and an algorithm, 2001
Andrew Y. Ng, Michael I. Jordan, Yair Weiss
<https://citeseerx.ist.psu.edu/doc_view/pid/796c5d6336fc52aa84db575fb821c78918b65f58>`_
- :doi:`Normalized cuts and image segmentation, 2000
Jianbo Shi, Jitendra Malik
<10.1109/34.868688>`
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import SpectralEmbedding
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = SpectralEmbedding(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
"""
_parameter_constraints: dict = {
"n_components": [Interval(Integral, 1, None, closed="left")],
"affinity": [
StrOptions(
{
"nearest_neighbors",
"rbf",
"precomputed",
"precomputed_nearest_neighbors",
},
),
callable,
],
"gamma": [Interval(Real, 0, None, closed="left"), None],
"random_state": ["random_state"],
"eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
"eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
"n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
"n_jobs": [None, Integral],
}
def __init__(
self,
n_components=2,
*,
affinity="nearest_neighbors",
gamma=None,
random_state=None,
eigen_solver=None,
eigen_tol="auto",
n_neighbors=None,
n_jobs=None,
):
self.n_components = n_components
self.affinity = affinity
self.gamma = gamma
self.random_state = random_state
self.eigen_solver = eigen_solver
self.eigen_tol = eigen_tol
self.n_neighbors = n_neighbors
self.n_jobs = n_jobs
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.sparse = True
tags.input_tags.pairwise = self.affinity in [
"precomputed",
"precomputed_nearest_neighbors",
]
return tags
def _get_affinity_matrix(self, X, Y=None):
"""Calculate the affinity matrix from data
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples
and `n_features` is the number of features.
If affinity is "precomputed"
X : array-like of shape (n_samples, n_samples),
Interpret X as precomputed adjacency graph computed from
samples.
Y: Ignored
Returns
-------
affinity_matrix : ndarray of shape (n_samples, n_samples)
"""
if self.affinity == "precomputed":
self.affinity_matrix_ = X
return self.affinity_matrix_
if self.affinity == "precomputed_nearest_neighbors":
estimator = NearestNeighbors(
n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
).fit(X)
connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
return self.affinity_matrix_
if self.affinity == "nearest_neighbors":
if sparse.issparse(X):
warnings.warn(
"Nearest neighbors affinity currently does "
"not support sparse input, falling back to "
"rbf affinity"
)
self.affinity = "rbf"
else:
self.n_neighbors_ = (
self.n_neighbors
if self.n_neighbors is not None
else max(int(X.shape[0] / 10), 1)
)
self.affinity_matrix_ = kneighbors_graph(
X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs
)
# currently only symmetric affinity_matrix supported
self.affinity_matrix_ = 0.5 * (
self.affinity_matrix_ + self.affinity_matrix_.T
)
return self.affinity_matrix_
if self.affinity == "rbf":
self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]
self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
return self.affinity_matrix_
self.affinity_matrix_ = self.affinity(X)
return self.affinity_matrix_
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Fit the model from data in X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples
and `n_features` is the number of features.
If affinity is "precomputed"
X : {array-like, sparse matrix}, shape (n_samples, n_samples),
Interpret X as precomputed adjacency graph computed from
samples.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
Returns the instance itself.
"""
X = validate_data(self, X, accept_sparse="csr", ensure_min_samples=2)
random_state = check_random_state(self.random_state)
affinity_matrix = self._get_affinity_matrix(X)
self.embedding_ = _spectral_embedding(
affinity_matrix,
n_components=self.n_components,
eigen_solver=self.eigen_solver,
eigen_tol=self.eigen_tol,
random_state=random_state,
)
return self
def fit_transform(self, X, y=None):
"""Fit the model from data in X and transform X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples
and `n_features` is the number of features.
If affinity is "precomputed"
X : {array-like, sparse matrix} of shape (n_samples, n_samples),
Interpret X as precomputed adjacency graph computed from
samples.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
X_new : array-like of shape (n_samples, n_components)
Spectral embedding of the training matrix.
"""
self.fit(X)
return self.embedding_
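# Hedged usage sketch (not part of the library's test suite): with
# affinity="precomputed_nearest_neighbors", a KNeighborsTransformer can
# supply the sparse distance graph that the estimator would otherwise build
# itself. The helper name is hypothetical.
def _demo_precomputed_nearest_neighbors():  # pragma: no cover
    from sklearn.datasets import load_digits
    from sklearn.neighbors import KNeighborsTransformer

    X, _ = load_digits(return_X_y=True)
    X = X[:100]
    graph = KNeighborsTransformer(n_neighbors=10, mode="distance").fit_transform(X)
    embedder = SpectralEmbedding(
        n_components=2, affinity="precomputed_nearest_neighbors", n_neighbors=5
    )
    assert embedder.fit_transform(graph).shape == (100, 2)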

File diff suppressed because it is too large

View File

@@ -0,0 +1,120 @@
import numpy as np
from libc cimport math
from libc.math cimport INFINITY
from sklearn.utils._typedefs cimport float32_t, float64_t
cdef float EPSILON_DBL = 1e-8
cdef float PERPLEXITY_TOLERANCE = 1e-5
# TODO: have this function support float32 and float64 and preserve inputs' dtypes.
def _binary_search_perplexity(
const float32_t[:, :] sqdistances,
float desired_perplexity,
int verbose):
"""Binary search for sigmas of conditional Gaussians.
This approximation reduces the computational complexity from O(N^2) to
O(uN).
Parameters
----------
sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
Distances between training samples and their k nearest neighbors.
When using the exact method, this is a square (n_samples, n_samples)
distance matrix. The TSNE default metric is "euclidean" which is
interpreted as squared euclidean distance.
desired_perplexity : float
Desired perplexity (2^entropy) of the conditional Gaussians.
verbose : int
Verbosity level.
Returns
-------
P : ndarray of shape (n_samples, n_neighbors), dtype=np.float64
Probabilities of conditional Gaussian distributions p_i|j.
"""
# Maximum number of binary search steps
cdef long n_steps = 100
cdef long n_samples = sqdistances.shape[0]
cdef long n_neighbors = sqdistances.shape[1]
cdef int using_neighbors = n_neighbors < n_samples
# Precisions of conditional Gaussian distributions
cdef double beta
cdef double beta_min
cdef double beta_max
cdef double beta_sum = 0.0
# Use log scale
cdef double desired_entropy = math.log(desired_perplexity)
cdef double entropy_diff
cdef double entropy
cdef double sum_Pi
cdef double sum_disti_Pi
cdef long i, j, l
# This array is later used as a 32bit array. It has multiple intermediate
# floating point additions that benefit from the extra precision
cdef float64_t[:, :] P = np.zeros(
(n_samples, n_neighbors), dtype=np.float64)
for i in range(n_samples):
beta_min = -INFINITY
beta_max = INFINITY
beta = 1.0
# Binary search of precision for i-th conditional distribution
for l in range(n_steps):
# Compute current entropy and corresponding probabilities
# computed just over the nearest neighbors or over all data
# if we're not using neighbors
sum_Pi = 0.0
for j in range(n_neighbors):
if j != i or using_neighbors:
P[i, j] = math.exp(-sqdistances[i, j] * beta)
sum_Pi += P[i, j]
if sum_Pi == 0.0:
sum_Pi = EPSILON_DBL
sum_disti_Pi = 0.0
for j in range(n_neighbors):
P[i, j] /= sum_Pi
sum_disti_Pi += sqdistances[i, j] * P[i, j]
entropy = math.log(sum_Pi) + beta * sum_disti_Pi
entropy_diff = entropy - desired_entropy
if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
break
if entropy_diff > 0.0:
beta_min = beta
if beta_max == INFINITY:
beta *= 2.0
else:
beta = (beta + beta_max) / 2.0
else:
beta_max = beta
if beta_min == -INFINITY:
beta /= 2.0
else:
beta = (beta + beta_min) / 2.0
beta_sum += beta
if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
print("[t-SNE] Computed conditional probabilities for sample "
"%d / %d" % (i + 1, n_samples))
if verbose:
print("[t-SNE] Mean sigma: %f"
% np.mean(math.sqrt(n_samples / beta_sum)))
return np.asarray(P)
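# Hedged pure-NumPy sketch (not part of the library) of the bisection above:
# for one row of squared distances, find beta so that the conditional
# Gaussian's perplexity exp(H) matches a desired value. All names are
# hypothetical; the update rules mirror the Cython loop.
def _demo_binary_search_beta():  # pragma: no cover
    rng = np.random.RandomState(0)
    d2 = rng.rand(30)  # squared distances to 30 neighbors
    desired_entropy = np.log(10.0)  # i.e. a perplexity of 10
    beta, beta_min, beta_max = 1.0, -np.inf, np.inf
    for _ in range(100):
        P = np.exp(-d2 * beta)
        sum_P = max(P.sum(), 1e-8)
        entropy = np.log(sum_P) + beta * np.dot(d2, P) / sum_P
        entropy_diff = entropy - desired_entropy
        if abs(entropy_diff) <= 1e-5:
            break
        if entropy_diff > 0.0:  # entropy too high -> sharpen (raise beta)
            beta_min = beta
            beta = beta * 2.0 if beta_max == np.inf else (beta + beta_max) / 2.0
        else:  # entropy too low -> flatten (lower beta)
            beta_max = beta
            beta = beta / 2.0 if beta_min == -np.inf else (beta + beta_min) / 2.0
    assert abs(np.exp(entropy) - 10.0) < 1e-3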

View File

@@ -0,0 +1,14 @@
py.extension_module(
'_utils',
[cython_gen.process('_utils.pyx'), utils_cython_tree],
subdir: 'sklearn/manifold',
install: true
)
py.extension_module(
'_barnes_hut_tsne',
cython_gen.process('_barnes_hut_tsne.pyx'),
dependencies: [np_dep, openmp_dep],
subdir: 'sklearn/manifold',
install: true
)

View File

@@ -0,0 +1,68 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.manifold import ClassicalMDS
from sklearn.metrics import euclidean_distances
def test_classical_mds_equivalent_to_pca():
X, _ = load_iris(return_X_y=True)
cmds = ClassicalMDS(n_components=2, metric="euclidean")
pca = PCA(n_components=2)
Z1 = cmds.fit_transform(X)
Z2 = pca.fit_transform(X)
# Swap the signs if necessary
for comp in range(2):
if Z1[0, comp] < 0 and Z2[0, comp] > 0:
Z2[:, comp] *= -1
assert_allclose(Z1, Z2)
assert_allclose(np.sqrt(cmds.eigenvalues_), pca.singular_values_)
def test_classical_mds_equivalent_on_data_and_distances():
X, _ = load_iris(return_X_y=True)
cmds = ClassicalMDS(n_components=2, metric="euclidean")
Z1 = cmds.fit_transform(X)
cmds = ClassicalMDS(n_components=2, metric="precomputed")
Z2 = cmds.fit_transform(euclidean_distances(X))
assert_allclose(Z1, Z2)
def test_classical_mds_wrong_inputs():
# Non-symmetric input
dissim = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
with pytest.raises(ValueError, match="Array must be symmetric"):
ClassicalMDS(metric="precomputed").fit(dissim)
# Non-square input
dissim = np.array([[0, 1, 2], [3, 4, 5]])
with pytest.raises(ValueError, match="array must be 2-dimensional and square"):
ClassicalMDS(metric="precomputed").fit(dissim)
def test_classical_mds_metric_params():
X, _ = load_iris(return_X_y=True)
cmds = ClassicalMDS(n_components=2, metric="euclidean")
Z1 = cmds.fit_transform(X)
cmds = ClassicalMDS(n_components=2, metric="minkowski", metric_params={"p": 2})
Z2 = cmds.fit_transform(X)
assert_allclose(Z1, Z2)
cmds = ClassicalMDS(n_components=2, metric="minkowski", metric_params={"p": 1})
Z3 = cmds.fit_transform(X)
assert not np.allclose(Z1, Z3)
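# Hedged sketch (not an actual test in the suite): classical MDS rests on
# double centering, B = -0.5 * J @ D**2 @ J, whose eigenvectors, scaled by
# the square roots of the eigenvalues, reproduce the original pairwise
# distances. Reuses this module's imports; the helper name is hypothetical.
def _demo_double_centering():
    X, _ = load_iris(return_X_y=True)
    D = euclidean_distances(X)
    n = D.shape[0]
    J = np.eye(n) - np.ones((n, n)) / n
    B = -0.5 * J @ (D**2) @ J  # doubly centered squared distances
    w, V = np.linalg.eigh(B)  # ascending eigenvalues
    k = 4  # iris is 4-dimensional, so 4 components reconstruct D exactly
    Z = V[:, ::-1][:, :k] * np.sqrt(np.clip(w[::-1][:k], 0, None))
    assert_allclose(euclidean_distances(Z), D, atol=1e-5)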

View File

@@ -0,0 +1,348 @@
import math
from itertools import product
import numpy as np
import pytest
from scipy.sparse import rand as sparse_rand
from sklearn import clone, datasets, manifold, neighbors, pipeline, preprocessing
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils._testing import (
assert_allclose,
assert_allclose_dense_sparse,
assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS
eigen_solvers = ["auto", "dense", "arpack"]
path_methods = ["auto", "FW", "D"]
def create_sample_data(dtype, n_pts=25, add_noise=False):
# grid of equidistant points in 2D, n_components = n_dim
n_per_side = int(math.sqrt(n_pts))
X = np.array(list(product(range(n_per_side), repeat=2))).astype(dtype, copy=False)
if add_noise:
# add noise in a third dimension
rng = np.random.RandomState(0)
noise = 0.1 * rng.randn(n_pts, 1).astype(dtype, copy=False)
X = np.concatenate((X, noise), 1)
return X
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
def test_isomap_simple_grid(
global_dtype, n_neighbors, radius, eigen_solver, path_method
):
# Isomap should preserve distances when all neighbors are used
n_pts = 25
X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=False)
# distances from each point to all others
if n_neighbors is not None:
G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance")
else:
G = neighbors.radius_neighbors_graph(X, radius, mode="distance")
clf = manifold.Isomap(
n_neighbors=n_neighbors,
radius=radius,
n_components=2,
eigen_solver=eigen_solver,
path_method=path_method,
)
clf.fit(X)
if n_neighbors is not None:
G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
else:
G_iso = neighbors.radius_neighbors_graph(
clf.embedding_, radius, mode="distance"
)
atol = 1e-5 if global_dtype == np.float32 else 0
assert_allclose_dense_sparse(G, G_iso, atol=atol)
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
def test_isomap_reconstruction_error(
global_dtype, n_neighbors, radius, eigen_solver, path_method
):
if global_dtype is np.float32:
pytest.skip(
"Skipping test due to numerical instabilities on float32 data"
"from KernelCenterer used in the reconstruction_error method"
)
# Same setup as in test_isomap_simple_grid, with an added dimension
n_pts = 25
X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=True)
# compute input kernel
if n_neighbors is not None:
G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray()
else:
G = neighbors.radius_neighbors_graph(X, radius, mode="distance").toarray()
centerer = preprocessing.KernelCenterer()
K = centerer.fit_transform(-0.5 * G**2)
clf = manifold.Isomap(
n_neighbors=n_neighbors,
radius=radius,
n_components=2,
eigen_solver=eigen_solver,
path_method=path_method,
)
clf.fit(X)
# compute output kernel
if n_neighbors is not None:
G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
else:
G_iso = neighbors.radius_neighbors_graph(
clf.embedding_, radius, mode="distance"
)
G_iso = G_iso.toarray()
K_iso = centerer.fit_transform(-0.5 * G_iso**2)
# make sure error agrees
reconstruction_error = np.linalg.norm(K - K_iso) / n_pts
atol = 1e-5 if global_dtype == np.float32 else 0
assert_allclose(reconstruction_error, clf.reconstruction_error(), atol=atol)
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)])
def test_transform(global_dtype, n_neighbors, radius):
n_samples = 200
n_components = 10
noise_scale = 0.01
# Create S-curve dataset
X, y = datasets.make_s_curve(n_samples, random_state=0)
X = X.astype(global_dtype, copy=False)
# Compute isomap embedding
iso = manifold.Isomap(
n_components=n_components, n_neighbors=n_neighbors, radius=radius
)
X_iso = iso.fit_transform(X)
# Re-embed a noisy version of the points
rng = np.random.RandomState(0)
noise = noise_scale * rng.randn(*X.shape)
X_iso2 = iso.transform(X + noise)
# Make sure the rms error on re-embedding is comparable to noise_scale
assert np.sqrt(np.mean((X_iso - X_iso2) ** 2)) < 2 * noise_scale
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)])
def test_pipeline(n_neighbors, radius, global_dtype):
# check that Isomap works fine as a transformer in a Pipeline
# only checks that no error is raised.
# TODO check that it actually does something useful
X, y = datasets.make_blobs(random_state=0)
X = X.astype(global_dtype, copy=False)
clf = pipeline.Pipeline(
[
("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)),
("clf", neighbors.KNeighborsClassifier()),
]
)
clf.fit(X, y)
assert 0.9 < clf.score(X, y)
def test_pipeline_with_nearest_neighbors_transformer(global_dtype):
# Test chaining NearestNeighborsTransformer and Isomap with
# neighbors_algorithm='precomputed'
algorithm = "auto"
n_neighbors = 10
X, _ = datasets.make_blobs(random_state=0)
X2, _ = datasets.make_blobs(random_state=1)
X = X.astype(global_dtype, copy=False)
X2 = X2.astype(global_dtype, copy=False)
# compare the chained version and the compact version
est_chain = pipeline.make_pipeline(
neighbors.KNeighborsTransformer(
n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
),
manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed"),
)
est_compact = manifold.Isomap(
n_neighbors=n_neighbors, neighbors_algorithm=algorithm
)
Xt_chain = est_chain.fit_transform(X)
Xt_compact = est_compact.fit_transform(X)
assert_allclose(Xt_chain, Xt_compact)
Xt_chain = est_chain.transform(X2)
Xt_compact = est_compact.transform(X2)
assert_allclose(Xt_chain, Xt_compact)
@pytest.mark.parametrize(
"metric, p, is_euclidean",
[
("euclidean", 2, True),
("manhattan", 1, False),
("minkowski", 1, False),
("minkowski", 2, True),
(lambda x1, x2: np.sqrt(np.sum(x1**2 + x2**2)), 2, False),
],
)
def test_different_metric(global_dtype, metric, p, is_euclidean):
# Isomap must work correctly with various metric parameters
# and must default to euclidean.
X, _ = datasets.make_blobs(random_state=0)
X = X.astype(global_dtype, copy=False)
reference = manifold.Isomap().fit_transform(X)
embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)
if is_euclidean:
assert_allclose(embedding, reference)
else:
with pytest.raises(AssertionError, match="Not equal to tolerance"):
assert_allclose(embedding, reference)
def test_isomap_clone_bug():
# regression test for bug reported in #6062
model = manifold.Isomap()
for n_neighbors in [10, 15, 20]:
model.set_params(n_neighbors=n_neighbors)
model.fit(np.random.rand(50, 2))
assert model.nbrs_.n_neighbors == n_neighbors
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input(
global_dtype, eigen_solver, path_method, global_random_seed, csr_container
):
# TODO: compare results on dense and sparse data as proposed in:
# https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
X = csr_container(
sparse_rand(
100,
3,
density=0.1,
format="csr",
dtype=global_dtype,
random_state=global_random_seed,
)
)
iso_dense = manifold.Isomap(
n_components=2,
eigen_solver=eigen_solver,
path_method=path_method,
n_neighbors=8,
)
iso_sparse = clone(iso_dense)
X_trans_dense = iso_dense.fit_transform(X.toarray())
X_trans_sparse = iso_sparse.fit_transform(X)
assert_allclose(X_trans_sparse, X_trans_dense, rtol=1e-4, atol=1e-4)
def test_isomap_fit_precomputed_radius_graph(global_dtype):
    # Isomap.fit_transform must yield similar results when using
    # a precomputed distance matrix.
X, y = datasets.make_s_curve(200, random_state=0)
X = X.astype(global_dtype, copy=False)
radius = 10
g = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance")
isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="precomputed")
isomap.fit(g)
precomputed_result = isomap.embedding_
isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="minkowski")
result = isomap.fit_transform(X)
atol = 1e-5 if global_dtype == np.float32 else 0
assert_allclose(precomputed_result, result, atol=atol)
def test_isomap_fitted_attributes_dtype(global_dtype):
"""Check that the fitted attributes are stored accordingly to the
data type of X."""
iso = manifold.Isomap(n_neighbors=2)
X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)
iso.fit(X)
assert iso.dist_matrix_.dtype == global_dtype
assert iso.embedding_.dtype == global_dtype
def test_isomap_dtype_equivalence():
"""Check the equivalence of the results with 32 and 64 bits input."""
iso_32 = manifold.Isomap(n_neighbors=2)
X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
iso_32.fit(X_32)
iso_64 = manifold.Isomap(n_neighbors=2)
X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
iso_64.fit(X_64)
assert_allclose(iso_32.dist_matrix_, iso_64.dist_matrix_)
def test_isomap_raise_error_when_neighbor_and_radius_both_set():
    # Isomap.fit_transform must raise a ValueError if both
    # n_neighbors and radius are provided.
X, _ = datasets.load_digits(return_X_y=True)
isomap = manifold.Isomap(n_neighbors=3, radius=5.5)
msg = "Both n_neighbors and radius are provided"
with pytest.raises(ValueError, match=msg):
isomap.fit_transform(X)
def test_multiple_connected_components():
# Test that a warning is raised when the graph has multiple components
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
with pytest.warns(UserWarning, match="number of connected components"):
manifold.Isomap(n_neighbors=2).fit(X)
def test_multiple_connected_components_metric_precomputed(global_dtype):
# Test that an error is raised when the graph has multiple components
# and when X is a precomputed neighbors graph.
X = np.array([0, 1, 2, 5, 6, 7])[:, None].astype(global_dtype, copy=False)
# works with a precomputed distance matrix (dense)
X_distances = pairwise_distances(X)
with pytest.warns(UserWarning, match="number of connected components"):
manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_distances)
# does not work with a precomputed neighbors graph (sparse)
X_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance")
with pytest.raises(RuntimeError, match="number of connected components"):
manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_graph)
def test_get_feature_names_out():
"""Check get_feature_names_out for Isomap."""
X, y = make_blobs(random_state=0, n_features=4)
n_components = 2
iso = manifold.Isomap(n_components=n_components)
iso.fit_transform(X)
names = iso.get_feature_names_out()
assert_array_equal([f"isomap{i}" for i in range(n_components)], names)

View File

@@ -0,0 +1,171 @@
from itertools import product
import numpy as np
import pytest
from scipy import linalg
from sklearn import manifold, neighbors
from sklearn.datasets import make_blobs
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
from sklearn.utils._testing import (
assert_allclose,
assert_array_equal,
ignore_warnings,
)
eigen_solvers = ["dense", "arpack"]
# ----------------------------------------------------------------------
# Test utility routines
def test_barycenter_kneighbors_graph(global_dtype):
X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype)
graph = barycenter_kneighbors_graph(X, 1)
expected_graph = np.array(
[[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype
)
assert graph.dtype == global_dtype
assert_allclose(graph.toarray(), expected_graph)
graph = barycenter_kneighbors_graph(X, 2)
# check that columns sum to one
assert_allclose(np.sum(graph.toarray(), axis=1), np.ones(3))
pred = np.dot(graph.toarray(), X)
assert linalg.norm(pred - X) / X.shape[0] < 1
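# Illustrative sketch (not part of the original tests): the weights asserted
# above solve, for each sample x with neighbor matrix Z (neighbors as rows),
# the constrained least squares problem min ||x - w @ Z||^2 subject to
# sum(w) == 1, via the local Gram system G @ w = 1 followed by normalization
# (Saul & Roweis, 2000). The regularization term is an assumption mirroring
# common practice, not the library's exact scheme.
def _barycenter_weights_single(x, Z, reg=1e-3):
    G = (Z - x) @ (Z - x).T  # Gram matrix of the centered neighbors
    G.flat[:: G.shape[0] + 1] += reg * np.trace(G)  # stabilize the solve
    w = np.linalg.solve(G, np.ones(Z.shape[0]))
    return w / w.sum()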
# ----------------------------------------------------------------------
# Test LLE by computing the reconstruction error on some manifolds.
def test_lle_simple_grid(global_dtype):
    # note: ARPACK is numerically unstable, so this test will fail for
    # some random seeds. We choose 42 because the tests pass.
    # On arm64 platforms, seed 2 makes the test fail.
    # TODO: rewrite this test to make it less sensitive to the random
    # seed, irrespective of the platform.
rng = np.random.RandomState(42)
# grid of equidistant points in 2D, n_components = n_dim
X = np.array(list(product(range(5), repeat=2)))
X = X + 1e-10 * rng.uniform(size=X.shape)
X = X.astype(global_dtype, copy=False)
n_components = 2
clf = manifold.LocallyLinearEmbedding(
n_neighbors=5, n_components=n_components, random_state=rng
)
tol = 0.1
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
reconstruction_error = linalg.norm(np.dot(N, X) - X, "fro")
assert reconstruction_error < tol
for solver in eigen_solvers:
clf.set_params(eigen_solver=solver)
clf.fit(X)
assert clf.embedding_.shape[1] == n_components
reconstruction_error = (
linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
)
assert reconstruction_error < tol
assert_allclose(clf.reconstruction_error_, reconstruction_error, atol=1e-1)
# re-embed a noisy version of X using the transform method
noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100
X_reembedded = clf.transform(X + noise)
assert linalg.norm(X_reembedded - clf.embedding_) < tol
@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"])
@pytest.mark.parametrize("solver", eigen_solvers)
def test_lle_manifold(global_dtype, method, solver):
rng = np.random.RandomState(0)
# similar test on a slightly more complex manifold
X = np.array(list(product(np.arange(18), repeat=2)))
X = np.c_[X, X[:, 0] ** 2 / 18]
X = X + 1e-10 * rng.uniform(size=X.shape)
X = X.astype(global_dtype, copy=False)
n_components = 2
clf = manifold.LocallyLinearEmbedding(
n_neighbors=6, n_components=n_components, method=method, random_state=0
)
tol = 1.5 if method == "standard" else 3
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
reconstruction_error = linalg.norm(np.dot(N, X) - X)
assert reconstruction_error < tol
clf.set_params(eigen_solver=solver)
clf.fit(X)
assert clf.embedding_.shape[1] == n_components
reconstruction_error = (
linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
)
details = "solver: %s, method: %s" % (solver, method)
assert reconstruction_error < tol, details
assert (
np.abs(clf.reconstruction_error_ - reconstruction_error)
< tol * reconstruction_error
), details
def test_pipeline():
    # check that LocallyLinearEmbedding works fine as a transformer in a Pipeline
# only checks that no error is raised.
# TODO check that it actually does something useful
from sklearn import datasets, pipeline
X, y = datasets.make_blobs(random_state=0)
clf = pipeline.Pipeline(
[
("filter", manifold.LocallyLinearEmbedding(random_state=0)),
("clf", neighbors.KNeighborsClassifier()),
]
)
clf.fit(X, y)
assert 0.9 < clf.score(X, y)
# Test the error raised when the weight matrix is singular
def test_singular_matrix():
M = np.ones((200, 3))
f = ignore_warnings
with pytest.raises(ValueError, match="Error in determining null-space with ARPACK"):
f(
manifold.locally_linear_embedding(
M,
n_neighbors=2,
n_components=1,
method="standard",
eigen_solver="arpack",
)
)
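# Why the all-ones input above is degenerate (sketch): every sample coincides
# with its neighbors, so each local Gram matrix (Z - x) @ (Z - x).T is
# identically zero and the barycenter weight system is singular; ARPACK then
# cannot determine the null space.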
# regression test for #6033
def test_integer_input():
rand = np.random.RandomState(0)
X = rand.randint(0, 100, size=(20, 3))
for method in ["standard", "hessian", "modified", "ltsa"]:
clf = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10)
clf.fit(X) # this previously raised a TypeError
def test_get_feature_names_out():
"""Check get_feature_names_out for LocallyLinearEmbedding."""
X, y = make_blobs(random_state=0, n_features=4)
n_components = 2
iso = manifold.LocallyLinearEmbedding(n_components=n_components)
iso.fit(X)
names = iso.get_feature_names_out()
assert_array_equal(
[f"locallylinearembedding{i}" for i in range(n_components)], names
)

View File

@@ -0,0 +1,305 @@
from unittest.mock import Mock
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal
from sklearn.datasets import load_digits, load_iris
from sklearn.manifold import ClassicalMDS
from sklearn.manifold import _mds as mds
from sklearn.metrics import euclidean_distances
def test_smacof():
    # test metric smacof using the data of "Modern Multidimensional Scaling",
    # Borg & Groenen, p. 154
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]])
X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1)
X_true = np.array(
[[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]]
)
assert_array_almost_equal(X, X_true, decimal=3)
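# Illustrative sketch (not part of the original tests): a single SMACOF
# iteration is the Guttman transform X_new = B(X) @ X / n, where B(X) has
# off-diagonal entries -delta_ij / d_ij(X) and zero row sums. Minimal dense
# implementation under those assumptions:
def _guttman_transform(delta, X):
    d = euclidean_distances(X)
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = np.where(d > 0, delta / d, 0.0)
    B = -ratio
    B[np.diag_indices_from(B)] += ratio.sum(axis=1)  # zero row sums
    return B @ X / X.shape[0]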
def test_nonmetric_lower_normalized_stress():
    # Testing that nonmetric MDS results in lower normalized stress
    # compared to metric MDS (non-regression test for issue 27028)
X, _ = load_iris(return_X_y=True)
sim = euclidean_distances(X)
np.random.seed(42)
Z = np.random.normal(size=(X.shape[0], 2))
_, stress1 = mds.smacof(
sim, init=Z, n_components=2, max_iter=1000, n_init=1, normalized_stress=True
)
_, stress2 = mds.smacof(
sim,
init=Z,
n_components=2,
max_iter=1000,
n_init=1,
normalized_stress=True,
metric=False,
)
assert stress1 > stress2
# A metric MDS solution (local minimum of the raw stress) can be rescaled to
# decrease the stress-1 (which is returned with normalized_stress=True).
# The optimal rescaling can be computed analytically, see Borg & Groenen,
# Modern Multidimensional Scaling, Chapter 11.1. After rescaling, stress-1
# becomes sqrt(s^2 / (1 + s^2)), where s is the value of stress-1 before
# rescaling.
stress1_rescaled = np.sqrt(stress1**2 / (1 + stress1**2))
assert stress1_rescaled > stress2
def test_nonmetric_mds_optimization():
# Test that stress is decreasing during nonmetric MDS optimization
# (non-regression test for issue 27028)
X, _ = load_digits(return_X_y=True)
rng = np.random.default_rng(seed=42)
ind_subset = rng.choice(len(X), size=200, replace=False)
X = X[ind_subset]
mds_est = mds.MDS(
n_components=2,
n_init=1,
max_iter=2,
metric_mds=False,
init="random",
random_state=42,
).fit(X)
stress_after_2_iter = mds_est.stress_
mds_est = mds.MDS(
n_components=2,
n_init=1,
max_iter=3,
metric_mds=False,
init="random",
random_state=42,
).fit(X)
stress_after_3_iter = mds_est.stress_
assert stress_after_2_iter > stress_after_3_iter
@pytest.mark.parametrize("metric_mds", [True, False])
def test_mds_recovers_true_data(metric_mds):
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
mds_est = mds.MDS(
n_components=2,
n_init=1,
eps=1e-15,
max_iter=1000,
metric_mds=metric_mds,
init="random",
random_state=42,
).fit(X)
stress = mds_est.stress_
assert_allclose(stress, 0, atol=1e-6)
def test_smacof_error():
    # Non-symmetric similarity matrix:
sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
with pytest.raises(ValueError):
mds.smacof(sim, n_init=1)
    # Non-square similarity matrix:
sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]])
with pytest.raises(ValueError):
mds.smacof(sim, n_init=1)
    # init is not None and does not have the correct shape:
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]])
with pytest.raises(ValueError):
mds.smacof(sim, init=Z, n_init=1)
# TODO: remove mark once loky bug is fixed:
# https://github.com/joblib/loky/issues/458
@pytest.mark.thread_unsafe
def test_MDS():
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
mds_clf = mds.MDS(
metric_mds=False,
n_jobs=3,
n_init=3,
metric="precomputed",
init="random",
)
mds_clf.fit(sim)
# TODO(1.10): remove warning filter
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("k", [0.5, 1.5, 2])
def test_normed_stress(k):
"""Test that non-metric MDS normalized stress is scale-invariant."""
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
X1, stress1 = mds.smacof(sim, metric=False, max_iter=5, random_state=0)
X2, stress2 = mds.smacof(k * sim, metric=False, max_iter=5, random_state=0)
assert_allclose(stress1, stress2, rtol=1e-5)
assert_allclose(X1, X2, rtol=1e-5)
# TODO(1.10): remove warning filter
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("metric", [True, False])
def test_normalized_stress_auto(metric, monkeypatch):
rng = np.random.RandomState(0)
X = rng.randn(4, 3)
dist = euclidean_distances(X)
mock = Mock(side_effect=mds._smacof_single)
monkeypatch.setattr("sklearn.manifold._mds._smacof_single", mock)
est = mds.MDS(metric=metric, normalized_stress="auto", random_state=rng)
est.fit_transform(X)
assert mock.call_args[1]["normalized_stress"] != metric
mds.smacof(dist, metric=metric, normalized_stress="auto", random_state=rng)
assert mock.call_args[1]["normalized_stress"] != metric
def test_isotonic_outofbounds():
    # This particular configuration can trigger an out-of-bounds error
    # in the isotonic regression (non-regression test for issue 26999)
dis = np.array(
[
[0.0, 1.732050807568877, 1.7320508075688772],
[1.732050807568877, 0.0, 6.661338147750939e-16],
[1.7320508075688772, 6.661338147750939e-16, 0.0],
]
)
init = np.array(
[
[0.08665881585055124, 0.7939114643387546],
[0.9959834154297658, 0.7555546025640025],
[0.8766008278401566, 0.4227358815811242],
]
)
mds.smacof(dis, init=init, metric=False, n_init=1)
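# Sketch of the non-metric step exercised above (assumption: this mirrors, in
# simplified dense form, what `_smacof_single` does with metric=False): the
# embedding distances are replaced by disparities, an isotonic regression of
# the distances onto the rank order of the input dissimilarities.
def _disparities(dissimilarities, distances):
    from sklearn.isotonic import IsotonicRegression

    flat_dis = dissimilarities.ravel()
    flat_d = distances.ravel()
    return IsotonicRegression().fit_transform(flat_dis, flat_d).reshape(
        distances.shape
    )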
# TODO(1.10): remove warning filter
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("normalized_stress", [True, False])
def test_returned_stress(normalized_stress):
# Test that the final stress corresponds to the final embedding
# (non-regression test for issue 16846)
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
D = euclidean_distances(X)
mds_est = mds.MDS(
n_components=2,
random_state=42,
normalized_stress=normalized_stress,
).fit(X)
Z = mds_est.embedding_
stress = mds_est.stress_
D_mds = euclidean_distances(Z)
stress_Z = ((D_mds.ravel() - D.ravel()) ** 2).sum() / 2
if normalized_stress:
stress_Z = np.sqrt(stress_Z / ((D_mds.ravel() ** 2).sum() / 2))
assert_allclose(stress, stress_Z)
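# Helper sketch mirroring the computation above (assumptions: D holds the
# input distances and D_mds the embedding distances, both as full square
# matrices, hence the division by 2 so each pair is counted once):
def _stress(D, D_mds, normalized):
    raw = ((D_mds - D) ** 2).sum() / 2
    if normalized:
        return np.sqrt(raw / ((D_mds**2).sum() / 2))
    return raw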
# TODO(1.10): remove warning filter
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("metric_mds", [True, False])
def test_convergence_does_not_depend_on_scale(metric_mds):
# Test that the number of iterations until convergence does not depend on
# the scale of the input data
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
mds_est = mds.MDS(
n_components=2,
random_state=42,
metric_mds=metric_mds,
)
mds_est.fit(X * 100)
n_iter1 = mds_est.n_iter_
mds_est.fit(X / 100)
n_iter2 = mds_est.n_iter_
assert_equal(n_iter1, n_iter2)
# TODO(1.9): delete this test
def test_future_warning_n_init():
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
with pytest.warns(FutureWarning):
mds.smacof(sim)
with pytest.warns(FutureWarning):
mds.MDS(init="random").fit(X)
# TODO(1.9): delete the n_init warning check
# TODO(1.10): delete this test
def test_future_warning_init_and_metric():
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
# dissimilarity argument deprecated
with pytest.warns(FutureWarning, match="`dissimilarity` parameter is"):
mds.MDS(dissimilarity="precomputed", init="random", n_init=1).fit(sim)
# metric=True deprecated
with pytest.warns(FutureWarning, match="Use metric_mds"):
mds.MDS(metric=True, init="random", n_init=1).fit(X)
# metric=False deprecated
with pytest.warns(FutureWarning, match="Use metric_mds"):
mds.MDS(metric=False, init="random", n_init=1).fit(X)
# default init will become classical_mds in the future
with pytest.warns(FutureWarning, match="The default value of `init`"):
mds.MDS(metric="euclidean", n_init=1).fit(X)
    # TODO(1.9): delete this check
# n_init=1 will become default in the future
with pytest.warns(FutureWarning, match="The default value of `n_init`"):
mds.MDS(metric="euclidean", init="random").fit(X)
# providing both metric and dissimilarity raises an error
with pytest.raises(ValueError, match="provided both `dissimilarity`"):
mds.MDS(
metric="cosine", dissimilarity="euclidean", init="random", n_init=1
).fit(X)
# TODO(1.9): remove warning filter
@pytest.mark.filterwarnings("ignore::FutureWarning")
def test_classical_mds_init_to_mds():
X, _ = load_iris(return_X_y=True)
cmds = ClassicalMDS()
Z_classical = cmds.fit_transform(X)
mds1 = mds.MDS(init="classical_mds")
Z1 = mds1.fit_transform(X)
    mds2 = mds.MDS(init="random")
    Z2 = mds2.fit_transform(X, init=Z_classical)
assert_allclose(Z1, Z2)

View File

@@ -0,0 +1,503 @@
import itertools
from unittest.mock import Mock
import numpy as np
import pytest
from scipy import sparse
from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh, lobpcg
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.manifold import SpectralEmbedding, _spectral_embedding, spectral_embedding
from sklearn.manifold._spectral_embedding import (
_graph_connected_component,
_graph_is_connected,
)
from sklearn.metrics import normalized_mutual_info_score, pairwise_distances
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
parse_version,
sp_version,
)
from sklearn.utils.fixes import laplacian as csgraph_laplacian
try:
from pyamg import smoothed_aggregation_solver # noqa: F401
pyamg_available = True
except ImportError:
pyamg_available = False
skip_if_no_pyamg = pytest.mark.skipif(
not pyamg_available, reason="PyAMG is required for the tests in this function."
)
# non-centered, sparse centers
centers = np.array(
[
[0.0, 5.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 4.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 5.0, 1.0],
]
)
n_samples = 1000
n_clusters, n_features = centers.shape
S, true_labels = make_blobs(
n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
)
def _assert_equal_with_sign_flipping(A, B, tol=0.0):
"""Check array A and B are equal with possible sign flipping on
each column"""
tol_squared = tol**2
for A_col, B_col in zip(A.T, B.T):
assert (
np.max((A_col - B_col) ** 2) <= tol_squared
or np.max((A_col + B_col) ** 2) <= tol_squared
)
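# Usage sketch: eigenvectors are defined only up to sign, so two solvers may
# return embeddings whose columns differ by a factor of -1; the helper above
# accepts either orientation independently per column.
def _sign_flip_demo():
    rng = np.random.RandomState(0)
    emb = rng.randn(5, 2)
    flipped = emb * np.array([1.0, -1.0])  # negate the second column only
    _assert_equal_with_sign_flipping(emb, flipped, tol=1e-12)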
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_sparse_graph_connected_component(coo_container):
rng = np.random.RandomState(42)
n_samples = 300
boundaries = [0, 42, 121, 200, n_samples]
p = rng.permutation(n_samples)
connections = []
for start, stop in itertools.pairwise(boundaries):
group = p[start:stop]
# Connect all elements within the group at least once via an
# arbitrary path that spans the group.
for i in range(len(group) - 1):
connections.append((group[i], group[i + 1]))
# Add some more random connections within the group
min_idx, max_idx = 0, len(group) - 1
n_random_connections = 1000
source = rng.randint(min_idx, max_idx, size=n_random_connections)
target = rng.randint(min_idx, max_idx, size=n_random_connections)
connections.extend(zip(group[source], group[target]))
# Build a symmetric affinity matrix
row_idx, column_idx = tuple(np.array(connections).T)
data = rng.uniform(0.1, 42, size=len(connections))
affinity = coo_container((data, (row_idx, column_idx)))
affinity = 0.5 * (affinity + affinity.T)
for start, stop in itertools.pairwise(boundaries):
component_1 = _graph_connected_component(affinity, p[start])
component_size = stop - start
assert component_1.sum() == component_size
# We should retrieve the same component mask by starting by both ends
# of the group
component_2 = _graph_connected_component(affinity, p[stop - 1])
assert component_2.sum() == component_size
assert_array_equal(component_1, component_2)
# TODO: investigate why this test is seed-sensitive on 32-bit Python
# runtimes. Is this revealing a numerical stability problem? Or is it
# expected from the test's numerical design? In the latter case the test
# should be made less seed-sensitive instead.
@pytest.mark.parametrize(
"eigen_solver",
[
"arpack",
"lobpcg",
pytest.param("amg", marks=skip_if_no_pyamg),
],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_two_components(eigen_solver, dtype, seed=0):
# Test spectral embedding with two components
random_state = np.random.RandomState(seed)
n_sample = 100
affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
# first component
affinity[0:n_sample, 0:n_sample] = (
np.abs(random_state.randn(n_sample, n_sample)) + 2
)
# second component
affinity[n_sample::, n_sample::] = (
np.abs(random_state.randn(n_sample, n_sample)) + 2
)
# Test of internal _graph_connected_component before connection
component = _graph_connected_component(affinity, 0)
assert component[:n_sample].all()
assert not component[n_sample:].any()
component = _graph_connected_component(affinity, -1)
assert not component[:n_sample].any()
assert component[n_sample:].all()
# connection
affinity[0, n_sample + 1] = 1
affinity[n_sample + 1, 0] = 1
affinity.flat[:: 2 * n_sample + 1] = 0
affinity = 0.5 * (affinity + affinity.T)
true_label = np.zeros(shape=2 * n_sample)
true_label[0:n_sample] = 1
se_precomp = SpectralEmbedding(
n_components=1,
affinity="precomputed",
random_state=np.random.RandomState(seed),
eigen_solver=eigen_solver,
)
embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype))
    # threshold the first component at 0.
label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64)
assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0)
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
@pytest.mark.parametrize(
"eigen_solver",
[
"arpack",
"lobpcg",
pytest.param("amg", marks=skip_if_no_pyamg),
],
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_precomputed_affinity(
sparse_container, eigen_solver, dtype, seed=36
):
# Test spectral embedding with precomputed kernel
gamma = 1.0
X = S if sparse_container is None else sparse_container(S)
se_precomp = SpectralEmbedding(
n_components=2,
affinity="precomputed",
random_state=np.random.RandomState(seed),
eigen_solver=eigen_solver,
)
se_rbf = SpectralEmbedding(
n_components=2,
affinity="rbf",
gamma=gamma,
random_state=np.random.RandomState(seed),
eigen_solver=eigen_solver,
)
embed_precomp = se_precomp.fit_transform(rbf_kernel(X.astype(dtype), gamma=gamma))
embed_rbf = se_rbf.fit_transform(X.astype(dtype))
assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
_assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)
def test_precomputed_nearest_neighbors_filtering():
    # Test precomputed graph filtering when the graph contains too many neighbors
n_neighbors = 2
results = []
for additional_neighbors in [0, 10]:
nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(S)
graph = nn.kneighbors_graph(S, mode="connectivity")
embedding = (
SpectralEmbedding(
random_state=0,
n_components=2,
affinity="precomputed_nearest_neighbors",
n_neighbors=n_neighbors,
)
.fit(graph)
.embedding_
)
results.append(embedding)
assert_array_equal(results[0], results[1])
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
def test_spectral_embedding_callable_affinity(sparse_container, seed=36):
# Test spectral embedding with callable affinity
gamma = 0.9
kern = rbf_kernel(S, gamma=gamma)
X = S if sparse_container is None else sparse_container(S)
se_callable = SpectralEmbedding(
n_components=2,
affinity=(lambda x: rbf_kernel(x, gamma=gamma)),
gamma=gamma,
random_state=np.random.RandomState(seed),
)
se_rbf = SpectralEmbedding(
n_components=2,
affinity="rbf",
gamma=gamma,
random_state=np.random.RandomState(seed),
)
embed_rbf = se_rbf.fit_transform(X)
embed_callable = se_callable.fit_transform(X)
assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
_assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)
@pytest.mark.skipif(
not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36):
se_amg = SpectralEmbedding(
n_components=2,
affinity="nearest_neighbors",
eigen_solver="amg",
n_neighbors=5,
random_state=np.random.RandomState(seed),
)
se_arpack = SpectralEmbedding(
n_components=2,
affinity="nearest_neighbors",
eigen_solver="arpack",
n_neighbors=5,
random_state=np.random.RandomState(seed),
)
embed_amg = se_amg.fit_transform(S.astype(dtype))
embed_arpack = se_arpack.fit_transform(S.astype(dtype))
_assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)
    # same check with a special case in which AMG is not actually used
# regression test for #10715
# affinity between nodes
row = np.array([0, 0, 1, 2, 3, 3, 4], dtype=np.int32)
col = np.array([1, 2, 2, 3, 4, 5, 5], dtype=np.int32)
val = np.array([100, 100, 100, 1, 100, 100, 100], dtype=np.int64)
affinity = coo_container(
(np.hstack([val, val]), (np.hstack([row, col]), np.hstack([col, row]))),
shape=(6, 6),
)
se_amg.affinity = "precomputed"
se_arpack.affinity = "precomputed"
embed_amg = se_amg.fit_transform(affinity.astype(dtype))
embed_arpack = se_arpack.fit_transform(affinity.astype(dtype))
_assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)
# Check that passing a sparse matrix with `np.int64` indices dtype raises an error
# or is successful based on the version of SciPy which is installed.
# Use a CSR matrix to avoid any conversion during the validation
affinity = affinity.tocsr()
affinity.indptr = affinity.indptr.astype(np.int64)
affinity.indices = affinity.indices.astype(np.int64)
# PR: https://github.com/scipy/scipy/pull/18913
# First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
scipy_graph_traversal_supports_int64_index = sp_version >= parse_version("1.11.3")
if scipy_graph_traversal_supports_int64_index:
se_amg.fit_transform(affinity)
else:
err_msg = "Only sparse matrices with 32-bit integer indices are accepted"
with pytest.raises(ValueError, match=err_msg):
se_amg.fit_transform(affinity)
@pytest.mark.skipif(
not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
    # Non-regression test for AMG solver failure (issue #13393 on GitHub)
num_nodes = 100
X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
X = X.astype(dtype)
upper = sparse.triu(X) - sparse.diags(X.diagonal())
sym_matrix = upper + upper.T
embedding = spectral_embedding(
sym_matrix, n_components=10, eigen_solver="amg", random_state=0
)
# Check that the learned embedding is stable w.r.t. random solver init:
for i in range(3):
new_embedding = spectral_embedding(
sym_matrix, n_components=10, eigen_solver="amg", random_state=i + 1
)
_assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)
def test_pipeline_spectral_clustering(seed=36):
# Test using pipeline to do spectral clustering
random_state = np.random.RandomState(seed)
se_rbf = SpectralEmbedding(
n_components=n_clusters, affinity="rbf", random_state=random_state
)
se_knn = SpectralEmbedding(
n_components=n_clusters,
affinity="nearest_neighbors",
n_neighbors=5,
random_state=random_state,
)
for se in [se_rbf, se_knn]:
km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
km.fit(se.fit_transform(S))
assert_array_almost_equal(
normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2
)
def test_connectivity(seed=36):
# Test that graph connectivity test works as expected
graph = np.array(
[
[1, 0, 0, 0, 0],
[0, 1, 1, 0, 0],
[0, 1, 1, 1, 0],
[0, 0, 1, 1, 1],
[0, 0, 0, 1, 1],
]
)
assert not _graph_is_connected(graph)
for csr_container in CSR_CONTAINERS:
assert not _graph_is_connected(csr_container(graph))
for csc_container in CSC_CONTAINERS:
assert not _graph_is_connected(csc_container(graph))
graph = np.array(
[
[1, 1, 0, 0, 0],
[1, 1, 1, 0, 0],
[0, 1, 1, 1, 0],
[0, 0, 1, 1, 1],
[0, 0, 0, 1, 1],
]
)
assert _graph_is_connected(graph)
for csr_container in CSR_CONTAINERS:
assert _graph_is_connected(csr_container(graph))
for csc_container in CSC_CONTAINERS:
assert _graph_is_connected(csc_container(graph))
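# Equivalent connectivity check via SciPy (sketch; assumes a dense or sparse
# adjacency matrix): the graph is connected iff a single connected component
# is found.
def _is_connected_via_scipy(graph):
    from scipy.sparse.csgraph import connected_components

    n_components, _ = connected_components(graph, directed=False)
    return n_components == 1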
def test_spectral_embedding_deterministic():
# Test that Spectral Embedding is deterministic
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
embedding_1 = spectral_embedding(sims)
embedding_2 = spectral_embedding(sims)
assert_array_almost_equal(embedding_1, embedding_2)
def test_spectral_embedding_unnormalized():
    # Test that spectral_embedding also processes the unnormalized
    # Laplacian correctly
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
n_components = 8
embedding_1 = spectral_embedding(
sims, norm_laplacian=False, n_components=n_components, drop_first=False
)
# Verify using manual computation with dense eigh
laplacian, dd = csgraph_laplacian(sims, normed=False, return_diag=True)
_, diffusion_map = eigh(laplacian)
embedding_2 = diffusion_map.T[:n_components]
embedding_2 = _deterministic_vector_sign_flip(embedding_2).T
assert_array_almost_equal(embedding_1, embedding_2)
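# Sketch of the operator eigendecomposed above (assumption: W is a dense
# symmetric affinity matrix): the unnormalized graph Laplacian is L = D - W
# with D the diagonal degree matrix, matching csgraph_laplacian(W, normed=False).
def _unnormalized_laplacian(W):
    return np.diag(W.sum(axis=1)) - W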
def test_spectral_embedding_first_eigen_vector():
# Test that the first eigenvector of spectral_embedding
# is constant and that the second is not (for a connected graph)
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
n_components = 2
for seed in range(10):
embedding = spectral_embedding(
sims,
norm_laplacian=False,
n_components=n_components,
drop_first=False,
random_state=seed,
)
assert np.std(embedding[:, 0]) == pytest.approx(0)
assert np.std(embedding[:, 1]) > 1e-3
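# Why the first column is constant (sketch): every row of L = D - W sums to
# zero, so the all-ones vector lies in the null space of the unnormalized
# Laplacian; for a connected graph the eigenvalue 0 is simple, hence the
# first eigenvector is the constant one. Quick check under those assumptions:
def _check_ones_in_nullspace():
    rng = np.random.RandomState(0)
    W = rbf_kernel(rng.randn(6, 3))
    L = np.diag(W.sum(axis=1)) - W
    assert_array_almost_equal(L @ np.ones(6), np.zeros(6))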
@pytest.mark.parametrize(
"eigen_solver",
[
"arpack",
"lobpcg",
pytest.param("amg", marks=skip_if_no_pyamg),
],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_preserves_dtype(eigen_solver, dtype):
"""Check that `SpectralEmbedding is preserving the dtype of the fitted
attribute and transformed data.
Ideally, this test should be covered by the common test
`check_transformer_preserve_dtypes`. However, this test only run
with transformers implementing `transform` while `SpectralEmbedding`
implements only `fit_transform`.
"""
X = S.astype(dtype)
se = SpectralEmbedding(
n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0
)
X_trans = se.fit_transform(X)
assert X_trans.dtype == dtype
assert se.embedding_.dtype == dtype
assert se.affinity_matrix_.dtype == dtype
@pytest.mark.skipif(
pyamg_available,
reason="PyAMG is installed and we should not test for an error.",
)
def test_error_pyamg_not_available():
se_precomp = SpectralEmbedding(
n_components=2,
affinity="rbf",
eigen_solver="amg",
)
err_msg = "The eigen_solver was set to 'amg', but pyamg is not available."
with pytest.raises(ValueError, match=err_msg):
se_precomp.fit_transform(S)
@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_eigen_tol_auto(monkeypatch, solver, csr_container):
"""Test that `eigen_tol="auto"` is resolved correctly"""
if solver == "amg" and not pyamg_available:
pytest.skip("PyAMG is not available.")
X, _ = make_blobs(
n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
solver_func = eigsh if solver == "arpack" else lobpcg
default_value = 0 if solver == "arpack" else None
if solver == "amg":
S = csr_container(S)
mocked_solver = Mock(side_effect=solver_func)
monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__, mocked_solver)
spectral_embedding(S, random_state=42, eigen_solver=solver, eigen_tol="auto")
mocked_solver.assert_called()
_, kwargs = mocked_solver.call_args
assert kwargs["tol"] == default_value