Videre
@@ -0,0 +1,27 @@
"""Data embedding techniques."""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from sklearn.manifold._classical_mds import ClassicalMDS
from sklearn.manifold._isomap import Isomap
from sklearn.manifold._locally_linear import (
    LocallyLinearEmbedding,
    locally_linear_embedding,
)
from sklearn.manifold._mds import MDS, smacof
from sklearn.manifold._spectral_embedding import SpectralEmbedding, spectral_embedding
from sklearn.manifold._t_sne import TSNE, trustworthiness

__all__ = [
    "MDS",
    "TSNE",
    "ClassicalMDS",
    "Isomap",
    "LocallyLinearEmbedding",
    "SpectralEmbedding",
    "locally_linear_embedding",
    "smacof",
    "spectral_embedding",
    "trustworthiness",
]
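The block above only re-exports the public manifold API. A minimal usage sketch of that surface (the dataset and the parameter values are illustrative assumptions, not part of this commit):

# Hedged sketch: exercising the estimators re-exported above.
from sklearn.datasets import make_s_curve
from sklearn.manifold import Isomap, TSNE

X, _ = make_s_curve(n_samples=200, random_state=0)
X_iso = Isomap(n_neighbors=10, n_components=2).fit_transform(X)
X_tsne = TSNE(n_components=2, perplexity=30.0, random_state=0).fit_transform(X)
print(X_iso.shape, X_tsne.shape)  # (200, 2) (200, 2)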
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,295 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# See http://homepage.tudelft.nl/19j49/t-SNE.html for reference
# implementations and papers describing the technique


import numpy as np

cimport numpy as cnp
from libc.stdio cimport printf
from libc.math cimport log
from libc.stdlib cimport malloc, free
from libc.time cimport clock, clock_t
from cython.parallel cimport prange, parallel

from sklearn.neighbors._quad_tree cimport _QuadTree

cnp.import_array()


cdef char* EMPTY_STRING = ""

# Smallest strictly positive value that can be represented by floating
# point numbers for different precision levels. This is useful to avoid
# taking the log of zero when computing the KL divergence.
cdef float FLOAT32_TINY = np.finfo(np.float32).tiny

# Useful to avoid division by zero or divergence to +inf.
cdef float FLOAT64_EPS = np.finfo(np.float64).eps

# This is effectively an ifdef statement in Cython
# It allows us to write printf debugging lines
# and remove them at compile time
cdef enum:
    DEBUGFLAG = 0


cdef float compute_gradient(float[:] val_P,
                            float[:, :] pos_reference,
                            cnp.int64_t[:] neighbors,
                            cnp.int64_t[:] indptr,
                            float[:, :] tot_force,
                            _QuadTree qt,
                            float theta,
                            int dof,
                            long start,
                            bint compute_error,
                            int num_threads) noexcept nogil:
    # Having created the tree, calculate the gradient
    # in two components, the positive and negative forces
    cdef:
        long i, coord
        int ax
        long n_samples = pos_reference.shape[0]
        int n_dimensions = qt.n_dimensions
        clock_t t1 = 0, t2 = 0
        double sQ
        float error
        int take_timing = 1 if qt.verbose > 15 else 0

    if qt.verbose > 11:
        printf("[t-SNE] Allocating %li elements in force arrays\n",
               n_samples * n_dimensions * 2)
    cdef float* neg_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)
    cdef float* pos_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)

    if take_timing:
        t1 = clock()
    sQ = compute_gradient_negative(pos_reference, neg_f, qt, dof, theta, start,
                                   num_threads)
    if take_timing:
        t2 = clock()
        printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1)))

    if take_timing:
        t1 = clock()
    error = compute_gradient_positive(val_P, pos_reference, neighbors, indptr,
                                      pos_f, n_dimensions, dof, sQ, start,
                                      qt.verbose, compute_error, num_threads)
    if take_timing:
        t2 = clock()
        printf("[t-SNE] Computing positive gradient: %e ticks\n",
               ((float) (t2 - t1)))
    for i in prange(start, n_samples, nogil=True, num_threads=num_threads,
                    schedule='static'):
        for ax in range(n_dimensions):
            coord = i * n_dimensions + ax
            tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sQ)

    free(neg_f)
    free(pos_f)
    return error


cdef float compute_gradient_positive(float[:] val_P,
                                     float[:, :] pos_reference,
                                     cnp.int64_t[:] neighbors,
                                     cnp.int64_t[:] indptr,
                                     float* pos_f,
                                     int n_dimensions,
                                     int dof,
                                     double sum_Q,
                                     cnp.int64_t start,
                                     int verbose,
                                     bint compute_error,
                                     int num_threads) noexcept nogil:
    # Sum over the following expression for i not equal to j
    # grad_i = p_ij (1 + ||y_i - y_j||^2)^-1 (y_i - y_j)
    # This is equivalent to compute_edge_forces in the authors' code
    # It just goes over the nearest neighbors instead of all the data points
    # (unlike the non-nearest neighbors version of `compute_gradient_positive`)
    cdef:
        int ax
        long i, j, k
        long n_samples = indptr.shape[0] - 1
        float C = 0.0
        float dij, qij, pij
        float exponent = (dof + 1.0) / 2.0
        float float_dof = (float) (dof)
        float* buff
        clock_t t1 = 0, t2 = 0
        float dt

    if verbose > 10:
        t1 = clock()

    with nogil, parallel(num_threads=num_threads):
        # Define private buffer variables
        buff = <float *> malloc(sizeof(float) * n_dimensions)

        for i in prange(start, n_samples, schedule='static'):
            # Init the gradient vector
            for ax in range(n_dimensions):
                pos_f[i * n_dimensions + ax] = 0.0
            # Compute the positive interaction for the nearest neighbors
            for k in range(indptr[i], indptr[i+1]):
                j = neighbors[k]
                dij = 0.0
                pij = val_P[k]
                for ax in range(n_dimensions):
                    buff[ax] = pos_reference[i, ax] - pos_reference[j, ax]
                    dij += buff[ax] * buff[ax]
                qij = float_dof / (float_dof + dij)
                if dof != 1:  # i.e. exponent != 1
                    qij = qij ** exponent
                dij = pij * qij

                # only compute the error when needed
                if compute_error:
                    qij = qij / sum_Q
                    C += pij * log(max(pij, FLOAT32_TINY) / max(qij, FLOAT32_TINY))
                for ax in range(n_dimensions):
                    pos_f[i * n_dimensions + ax] += dij * buff[ax]

        free(buff)
    if verbose > 10:
        t2 = clock()
        dt = ((float) (t2 - t1))
        printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
    return C


cdef double compute_gradient_negative(float[:, :] pos_reference,
                                      float* neg_f,
                                      _QuadTree qt,
                                      int dof,
                                      float theta,
                                      long start,
                                      int num_threads) noexcept nogil:
    cdef:
        int ax
        int n_dimensions = qt.n_dimensions
        int offset = n_dimensions + 2
        long i, j, idx
        long n_samples = pos_reference.shape[0]
        long n = n_samples - start
        long dta = 0
        long dtb = 0
        float size, dist2s, mult
        float exponent = (dof + 1.0) / 2.0
        float float_dof = (float) (dof)
        double qijZ, sum_Q = 0.0
        float* force
        float* neg_force
        float* pos
        float* summary
        clock_t t1 = 0, t2 = 0, t3 = 0
        int take_timing = 1 if qt.verbose > 20 else 0

    with nogil, parallel(num_threads=num_threads):
        # Define thread-local buffers
        summary = <float*> malloc(sizeof(float) * n * offset)
        pos = <float *> malloc(sizeof(float) * n_dimensions)
        force = <float *> malloc(sizeof(float) * n_dimensions)
        neg_force = <float *> malloc(sizeof(float) * n_dimensions)

        for i in prange(start, n_samples, schedule='static'):
            # Clear the arrays
            for ax in range(n_dimensions):
                force[ax] = 0.0
                neg_force[ax] = 0.0
                pos[ax] = pos_reference[i, ax]

            # Find which nodes are summarizing and collect their centers of mass
            # deltas, and sizes, into vectorized arrays
            if take_timing:
                t1 = clock()
            idx = qt.summarize(pos, summary, theta*theta)
            if take_timing:
                t2 = clock()
            # Compute the t-SNE negative force
            # for the digits dataset, walking the tree
            # is about 10-15x more expensive than the
            # following for loop
            for j in range(idx // offset):

                dist2s = summary[j * offset + n_dimensions]
                size = summary[j * offset + n_dimensions + 1]
                qijZ = float_dof / (float_dof + dist2s)  # 1/(1+dist)
                if dof != 1:  # i.e. exponent != 1
                    qijZ = qijZ ** exponent

                sum_Q += size * qijZ   # size of the node * q
                mult = size * qijZ * qijZ
                for ax in range(n_dimensions):
                    neg_force[ax] += mult * summary[j * offset + ax]
            if take_timing:
                t3 = clock()
            for ax in range(n_dimensions):
                neg_f[i * n_dimensions + ax] = neg_force[ax]
            if take_timing:
                dta += t2 - t1
                dtb += t3 - t2
        free(pos)
        free(force)
        free(neg_force)
        free(summary)
    if take_timing:
        printf("[t-SNE] Tree: %li clock ticks | ", dta)
        printf("Force computation: %li clock ticks\n", dtb)

    # Put sum_Q to machine EPSILON to avoid divisions by 0
    sum_Q = max(sum_Q, FLOAT64_EPS)
    return sum_Q


def gradient(float[:] val_P,
             float[:, :] pos_output,
             cnp.int64_t[:] neighbors,
             cnp.int64_t[:] indptr,
             float[:, :] forces,
             float theta,
             int n_dimensions,
             int verbose,
             int dof=1,
             long skip_num_points=0,
             bint compute_error=1,
             int num_threads=1):
    # This function is designed to be called from external Python
    # it passes the 'forces' array by reference and fills that array
    # up in-place
    cdef float C
    cdef int n
    n = pos_output.shape[0]
    assert val_P.itemsize == 4
    assert pos_output.itemsize == 4
    assert forces.itemsize == 4
    m = "Forces array and pos_output shapes are incompatible"
    assert n == forces.shape[0], m
    m = "Pij and pos_output shapes are incompatible"
    assert n == indptr.shape[0] - 1, m
    if verbose > 10:
        printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions)
    cdef _QuadTree qt = _QuadTree(pos_output.shape[1], verbose)
    if verbose > 10:
        printf("[t-SNE] Inserting %li points\n", pos_output.shape[0])
    qt.build_tree(pos_output)
    if verbose > 10:
        # XXX: format hack to workaround lack of `const char *` type
        # in the generated C code that triggers error with gcc 4.9
        # and -Werror=format-security
        printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING)

    C = compute_gradient(val_P, pos_output, neighbors, indptr, forces,
                         qt, theta, dof, skip_num_points, compute_error,
                         num_threads)

    if verbose > 10:
        # XXX: format hack to workaround lack of `const char *` type
        # in the generated C code
        # and -Werror=format-security
        printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING)
    m = "Tree consistency failed: unexpected number of points on the tree"
    assert qt.cells[0].cumulative_size == qt.n_points, m
    if not compute_error:
        C = np.nan
    return C
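The Cython kernel above splits the t-SNE gradient into an attractive term accumulated over nearest neighbors (`compute_gradient_positive`) and a repulsive term approximated by quad-tree summaries (`compute_gradient_negative`), then combines them as `pos_f - neg_f / sum_Q`. A dense O(n^2) NumPy sketch of the same decomposition, without the Barnes-Hut approximation (toy inputs; `dof=1` assumed):

# Hedged sketch: dense t-SNE forces with the same positive/negative split
# as compute_gradient above, minus the quad-tree approximation.
import numpy as np

def dense_tsne_forces(P, Y, dof=1.0):
    diff = Y[:, None, :] - Y[None, :, :]        # (n, n, d) pairwise y_i - y_j
    dist2 = (diff ** 2).sum(-1)                 # squared embedding distances
    qij = dof / (dof + dist2)                   # unnormalized Student-t kernel
    np.fill_diagonal(qij, 0.0)
    sum_Q = qij.sum()                           # the sQ normalizer above
    # attractive: p_ij * q_ij * (y_i - y_j), summed over j
    pos_f = ((P * qij)[:, :, None] * diff).sum(1)
    # repulsive: q_ij^2 * (y_i - y_j), summed over j, normalized by sum_Q
    neg_f = ((qij ** 2)[:, :, None] * diff).sum(1)
    return pos_f - neg_f / sum_Q                # matches tot_force above

rng = np.random.default_rng(0)
Y = rng.normal(size=(5, 2))
P = rng.random((5, 5)); P = (P + P.T) / 2; np.fill_diagonal(P, 0); P /= P.sum()
print(dense_tsne_forces(P, Y).shape)  # (5, 2)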
@@ -0,0 +1,198 @@
"""
Classical multi-dimensional scaling (classical MDS).
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from numbers import Integral

import numpy as np
from scipy import linalg

from sklearn.base import BaseEstimator, _fit_context
from sklearn.metrics import pairwise_distances
from sklearn.utils import check_symmetric
from sklearn.utils._param_validation import Interval
from sklearn.utils.extmath import svd_flip
from sklearn.utils.validation import validate_data


class ClassicalMDS(BaseEstimator):
    """Classical multidimensional scaling (MDS).

    This is also known as principal coordinates analysis (PCoA) or
    Torgerson's scaling. It is a version of MDS that has an exact solution
    in terms of an eigendecomposition. If the input dissimilarity matrix
    consists of the pairwise Euclidean distances between some vectors,
    then classical MDS is equivalent to PCA applied to this set of vectors.

    Read more in the :ref:`User Guide <multidimensional_scaling>`.

    Parameters
    ----------
    n_components : int, default=2
        Number of embedding dimensions.

    metric : str or callable, default='euclidean'
        Metric to use for dissimilarity computation. Default is "euclidean".

        If metric is a string, it must be one of the options allowed by
        `scipy.spatial.distance.pdist` for its metric parameter, or a metric
        listed in :func:`sklearn.metrics.pairwise.distance_metrics`.

        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

    metric_params : dict, default=None
        Additional keyword arguments for the dissimilarity computation.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Stores the position of the dataset in the embedding space.

    dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points.

    eigenvalues_ : ndarray of shape (n_components,)
        Eigenvalues of the double-centered dissimilarity matrix, corresponding
        to each of the selected components. They are equal to the squared 2-norms
        of the `n_components` variables in the embedding space.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis.
    MDS : Metric and non-metric MDS.

    References
    ----------
    .. [1] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
           Groenen P. Springer Series in Statistics (1997)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import ClassicalMDS
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> cmds = ClassicalMDS(n_components=2)
    >>> X_emb = cmds.fit_transform(X[:100])
    >>> X_emb.shape
    (100, 2)
    """

    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "metric": [str, callable],
        "metric_params": [dict, None],
    }

    def __init__(
        self,
        n_components=2,
        *,
        metric="euclidean",
        metric_params=None,
    ):
        self.n_components = n_components
        self.metric = metric
        self.metric_params = metric_params

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.pairwise = self.metric == "precomputed"
        return tags

    def fit(self, X, y=None):
        """
        Compute the embedding positions.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``metric=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self.fit_transform(X)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y=None):
        """
        Compute and return the embedding positions.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``metric=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            The embedding coordinates.
        """
        X = validate_data(self, X)

        if self.metric == "precomputed":
            self.dissimilarity_matrix_ = X
            self.dissimilarity_matrix_ = check_symmetric(
                self.dissimilarity_matrix_, raise_exception=True
            )
        else:
            self.dissimilarity_matrix_ = pairwise_distances(
                X,
                metric=self.metric,
                **(self.metric_params if self.metric_params is not None else {}),
            )

        # Double centering
        B = self.dissimilarity_matrix_**2
        B = B.astype(np.float64)
        B -= np.mean(B, axis=0)
        B -= np.mean(B, axis=1, keepdims=True)
        B *= -0.5

        # Eigendecomposition
        w, U = linalg.eigh(B)

        # Reversing the order of the eigenvalues/eigenvectors to put
        # the eigenvalues in decreasing order
        w = w[::-1][: self.n_components]
        U = U[:, ::-1][:, : self.n_components]

        # Set the signs of eigenvectors to enforce deterministic output
        U, _ = svd_flip(U, None)

        self.embedding_ = np.sqrt(w) * U
        self.eigenvalues_ = w

        return self.embedding_
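The core of `fit_transform` above is double centering of the squared dissimilarities followed by an eigendecomposition. A plain NumPy sketch of those steps that also checks the docstring's claim that classical MDS on Euclidean distances matches PCA, up to per-column sign (random toy data, everything here is illustrative):

# Hedged sketch: double centering + eigendecomposition as in fit_transform.
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
X = rng.normal(size=(30, 5))

B = squareform(pdist(X)) ** 2        # squared Euclidean dissimilarities
B -= B.mean(axis=0)                  # subtract column means ...
B -= B.mean(axis=1, keepdims=True)   # ... then row means (double centering)
B *= -0.5

w, U = np.linalg.eigh(B)             # ascending eigenvalues
w, U = w[::-1][:2], U[:, ::-1][:, :2]
Y = np.sqrt(w) * U                   # classical MDS coordinates

Xc = X - X.mean(axis=0)
_, s, Vt = np.linalg.svd(Xc, full_matrices=False)
Y_pca = Xc @ Vt[:2].T                # PCA scores of the same data
print(np.allclose(np.abs(Y), np.abs(Y_pca)))  # True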
@@ -0,0 +1,442 @@
"""Isomap for manifold learning"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from numbers import Integral, Real

import numpy as np
from scipy.sparse import issparse
from scipy.sparse.csgraph import connected_components, shortest_path

from sklearn.base import (
    BaseEstimator,
    ClassNamePrefixFeaturesOutMixin,
    TransformerMixin,
    _fit_context,
)
from sklearn.decomposition import KernelPCA
from sklearn.metrics.pairwise import _VALID_METRICS
from sklearn.neighbors import NearestNeighbors, kneighbors_graph, radius_neighbors_graph
from sklearn.preprocessing import KernelCenterer
from sklearn.utils._param_validation import Interval, StrOptions
from sklearn.utils.graph import _fix_connected_components
from sklearn.utils.validation import check_is_fitted


class Isomap(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Isomap Embedding.

    Non-linear dimensionality reduction through Isometric Mapping.

    Read more in the :ref:`User Guide <isomap>`.

    Parameters
    ----------
    n_neighbors : int or None, default=5
        Number of neighbors to consider for each point. If `n_neighbors` is an int,
        then `radius` must be `None`.

    radius : float or None, default=None
        Limiting distance of neighbors to return. If `radius` is a float,
        then `n_neighbors` must be set to `None`.

        .. versionadded:: 1.1

    n_components : int, default=2
        Number of coordinates for the manifold.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        'auto' : Attempt to choose the most efficient solver
            for the given problem.

        'arpack' : Use Arnoldi decomposition to find the eigenvalues
            and eigenvectors.

        'dense' : Use a direct solver (i.e. LAPACK)
            for the eigenvalue decomposition.

    tol : float, default=0
        Convergence tolerance passed to arpack or lobpcg.
        Not used if eigen_solver == 'dense'.

    max_iter : int, default=None
        Maximum number of iterations for the arpack solver.
        Not used if eigen_solver == 'dense'.

    path_method : {'auto', 'FW', 'D'}, default='auto'
        Method to use in finding shortest path.

        'auto' : attempt to choose the best algorithm automatically.

        'FW' : Floyd-Warshall algorithm.

        'D' : Dijkstra's algorithm.

    neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
                          default='auto'
        Algorithm to use for nearest neighbors search,
        passed to the neighbors.NearestNeighbors instance.

    n_jobs : int or None, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    metric : str, or callable, default="minkowski"
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a :term:`sparse graph <sparse graph>`.

        .. versionadded:: 0.22

    p : float, default=2
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

        .. versionadded:: 0.22

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.22

    Attributes
    ----------
    embedding_ : array-like, shape (n_samples, n_components)
        Stores the embedding vectors.

    kernel_pca_ : object
        :class:`~sklearn.decomposition.KernelPCA` object used to implement the
        embedding.

    nbrs_ : sklearn.neighbors.NearestNeighbors instance
        Stores nearest neighbors instance, including BallTree or KDTree
        if applicable.

    dist_matrix_ : array-like, shape (n_samples, n_samples)
        Stores the geodesic distance matrix of training data.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis that is a linear
        dimensionality reduction method.
    sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
        kernels and PCA.
    MDS : Manifold learning using multidimensional scaling.
    TSNE : T-distributed Stochastic Neighbor Embedding.
    LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
    SpectralEmbedding : Spectral embedding for non-linear dimensionality.

    References
    ----------

    .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
           framework for nonlinear dimensionality reduction. Science 290 (5500)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import Isomap
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = Isomap(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    _parameter_constraints: dict = {
        "n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
        "radius": [Interval(Real, 0, None, closed="both"), None],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
        "tol": [Interval(Real, 0, None, closed="left")],
        "max_iter": [Interval(Integral, 1, None, closed="left"), None],
        "path_method": [StrOptions({"auto", "FW", "D"})],
        "neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})],
        "n_jobs": [Integral, None],
        "p": [Interval(Real, 1, None, closed="left")],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
        "metric_params": [dict, None],
    }

    def __init__(
        self,
        *,
        n_neighbors=5,
        radius=None,
        n_components=2,
        eigen_solver="auto",
        tol=0,
        max_iter=None,
        path_method="auto",
        neighbors_algorithm="auto",
        n_jobs=None,
        metric="minkowski",
        p=2,
        metric_params=None,
    ):
        self.n_neighbors = n_neighbors
        self.radius = radius
        self.n_components = n_components
        self.eigen_solver = eigen_solver
        self.tol = tol
        self.max_iter = max_iter
        self.path_method = path_method
        self.neighbors_algorithm = neighbors_algorithm
        self.n_jobs = n_jobs
        self.metric = metric
        self.p = p
        self.metric_params = metric_params

    def _fit_transform(self, X):
        if self.n_neighbors is not None and self.radius is not None:
            raise ValueError(
                "Both n_neighbors and radius are provided. Use"
                f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use"
                " radius-based neighbors"
            )

        self.nbrs_ = NearestNeighbors(
            n_neighbors=self.n_neighbors,
            radius=self.radius,
            algorithm=self.neighbors_algorithm,
            metric=self.metric,
            p=self.p,
            metric_params=self.metric_params,
            n_jobs=self.n_jobs,
        )
        self.nbrs_.fit(X)
        self.n_features_in_ = self.nbrs_.n_features_in_
        if hasattr(self.nbrs_, "feature_names_in_"):
            self.feature_names_in_ = self.nbrs_.feature_names_in_

        self.kernel_pca_ = KernelPCA(
            n_components=self.n_components,
            kernel="precomputed",
            eigen_solver=self.eigen_solver,
            tol=self.tol,
            max_iter=self.max_iter,
            n_jobs=self.n_jobs,
        ).set_output(transform="default")

        if self.n_neighbors is not None:
            nbg = kneighbors_graph(
                self.nbrs_,
                self.n_neighbors,
                metric=self.metric,
                p=self.p,
                metric_params=self.metric_params,
                mode="distance",
                n_jobs=self.n_jobs,
            )
        else:
            nbg = radius_neighbors_graph(
                self.nbrs_,
                radius=self.radius,
                metric=self.metric,
                p=self.p,
                metric_params=self.metric_params,
                mode="distance",
                n_jobs=self.n_jobs,
            )

        # Compute the number of connected components, and connect the different
        # components to be able to compute a shortest path between all pairs
        # of samples in the graph.
        # Similar fix to cluster._agglomerative._fix_connectivity.
        n_connected_components, labels = connected_components(nbg)
        if n_connected_components > 1:
            if self.metric == "precomputed" and issparse(X):
                raise RuntimeError(
                    "The number of connected components of the neighbors graph"
                    f" is {n_connected_components} > 1. The graph cannot be "
                    "completed with metric='precomputed', and Isomap cannot be "
                    "fitted. Increase the number of neighbors to avoid this "
                    "issue, or precompute the full distance matrix instead "
                    "of passing a sparse neighbors graph."
                )
            warnings.warn(
                (
                    "The number of connected components of the neighbors graph "
                    f"is {n_connected_components} > 1. Completing the graph to fit"
                    " Isomap might be slow. Increase the number of neighbors to "
                    "avoid this issue."
                ),
                stacklevel=2,
            )

            # use array validated by NearestNeighbors
            nbg = _fix_connected_components(
                X=self.nbrs_._fit_X,
                graph=nbg,
                n_connected_components=n_connected_components,
                component_labels=labels,
                mode="distance",
                metric=self.nbrs_.effective_metric_,
                **self.nbrs_.effective_metric_params_,
            )

        self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False)

        if self.nbrs_._fit_X.dtype == np.float32:
            self.dist_matrix_ = self.dist_matrix_.astype(
                self.nbrs_._fit_X.dtype, copy=False
            )

        G = self.dist_matrix_**2
        G *= -0.5

        self.embedding_ = self.kernel_pca_.fit_transform(G)
        self._n_features_out = self.embedding_.shape[1]

    def reconstruction_error(self):
        """Compute the reconstruction error for the embedding.

        Returns
        -------
        reconstruction_error : float
            Reconstruction error.

        Notes
        -----
        The cost function of an isomap embedding is

        ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``

        where D is the matrix of distances for the input data X,
        D_fit is the matrix of distances for the output embedding X_fit,
        and K is the isomap kernel:

        ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
        """
        G = -0.5 * self.dist_matrix_**2
        G_center = KernelCenterer().fit_transform(G)
        evals = self.kernel_pca_.eigenvalues_
        return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]

    @_fit_context(
        # Isomap.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None):
        """Compute the embedding vectors for data X.

        Parameters
        ----------
        X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
            Sample data, shape = (n_samples, n_features), in the form of a
            numpy array, sparse matrix, precomputed tree, or NearestNeighbors
            object.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        self._fit_transform(X)
        return self

    @_fit_context(
        # Isomap.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : {array-like, sparse matrix, BallTree, KDTree}
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
            X transformed in the new space.
        """
        self._fit_transform(X)
        return self.embedding_

    def transform(self, X):
        """Transform X.

        This is implemented by linking the points X into the graph of geodesic
        distances of the training data. First the `n_neighbors` nearest
        neighbors of X are found in the training data, and from these the
        shortest geodesic distances from each point in X to each point in
        the training data are computed in order to construct the kernel.
        The embedding of X is the projection of this kernel onto the
        embedding vectors of the training set.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_queries, n_features)
            If neighbors_algorithm='precomputed', X is assumed to be a
            distance matrix or a sparse graph of shape
            (n_queries, n_samples_fit).

        Returns
        -------
        X_new : array-like, shape (n_queries, n_components)
            X transformed in the new space.
        """
        check_is_fitted(self)
        if self.n_neighbors is not None:
            distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
        else:
            distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True)

        # Create the graph of shortest distances from X to
        # training data via the nearest neighbors of X.
        # This can be done as a single array operation, but it potentially
        # takes a lot of memory. To avoid that, use a loop:

        n_samples_fit = self.nbrs_.n_samples_fit_
        n_queries = distances.shape[0]

        if hasattr(X, "dtype") and X.dtype == np.float32:
            dtype = np.float32
        else:
            dtype = np.float64

        G_X = np.zeros((n_queries, n_samples_fit), dtype)
        for i in range(n_queries):
            G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)

        G_X **= 2
        G_X *= -0.5

        return self.kernel_pca_.transform(G_X)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.transformer_tags.preserves_dtype = ["float64", "float32"]
        tags.input_tags.sparse = True
        return tags
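`Isomap.transform` above links each query point into the training geodesic graph with a per-query loop, taking a minimum over its anchor neighbors rather than materializing one large array. A self-contained sketch of that min trick on made-up distances, showing both the loop and the single-array form the comment alludes to:

# Hedged sketch: the "min over anchor neighbors" step from transform above.
# dist_matrix, distances, and indices are toy stand-ins, not real geodesics.
import numpy as np

rng = np.random.default_rng(0)
n_train, n_queries, k = 6, 3, 2

dist_matrix = rng.random((n_train, n_train))   # train-to-train geodesics
distances = rng.random((n_queries, k))         # query-to-anchor distances
indices = rng.integers(0, n_train, size=(n_queries, k))

# Loop form (memory-friendly, as in transform):
G_X = np.zeros((n_queries, n_train))
for i in range(n_queries):
    # shortest route through any anchor j:
    # min_j ( d(query_i, anchor_j) + geodesic(anchor_j, train) )
    G_X[i] = np.min(dist_matrix[indices[i]] + distances[i][:, None], axis=0)

# Single-array form (same result, k times the memory):
G_all = np.min(dist_matrix[indices] + distances[:, :, None], axis=1)
print(np.allclose(G_X, G_all))  # True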
@@ -0,0 +1,878 @@
"""Locally Linear Embedding"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from numbers import Integral, Real

import numpy as np
from scipy.linalg import eigh, qr, solve, svd
from scipy.sparse import csr_matrix, eye, lil_matrix
from scipy.sparse.linalg import eigsh

from sklearn.base import (
    BaseEstimator,
    ClassNamePrefixFeaturesOutMixin,
    TransformerMixin,
    _fit_context,
    _UnstableArchMixin,
)
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array, check_random_state
from sklearn.utils._arpack import _init_arpack_v0
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, validate_data


def barycenter_weights(X, Y, indices, reg=1e-3):
    """Compute barycenter weights of X from Y along the first axis.

    We estimate the weights to assign to each point in Y[indices] to recover
    the point X[i]. The barycenter weights sum to 1.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_dim)

    Y : array-like, shape (n_samples, n_dim)

    indices : array-like, shape (n_samples, n_neighbors)
        Indices of the points in Y used to compute the barycenter.

    reg : float, default=1e-3
        Amount of regularization to add for the problem to be
        well-posed in the case of n_neighbors > n_dim.

    Returns
    -------
    B : array-like, shape (n_samples, n_neighbors)

    Notes
    -----
    See developers note for more information.
    """
    X = check_array(X, dtype=FLOAT_DTYPES)
    Y = check_array(Y, dtype=FLOAT_DTYPES)
    indices = check_array(indices, dtype=int)

    n_samples, n_neighbors = indices.shape
    assert X.shape[0] == n_samples

    B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
    v = np.ones(n_neighbors, dtype=X.dtype)

    # this might raise a LinalgError if G is singular and has trace
    # zero
    for i, ind in enumerate(indices):
        A = Y[ind]
        C = A - X[i]  # broadcasting
        G = np.dot(C, C.T)
        trace = np.trace(G)
        if trace > 0:
            R = reg * trace
        else:
            R = reg
        G.flat[:: n_neighbors + 1] += R
        w = solve(G, v, assume_a="pos")
        B[i, :] = w / np.sum(w)
    return B
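`barycenter_weights` solves, per point, the regularized local Gram system G w = 1 and normalizes so the weights sum to one. A single-point sketch of that solve (toy neighbors; the point is placed at their mean, so the recovered weights come out near-uniform and the reconstruction is essentially exact):

# Hedged sketch: the per-point solve inside barycenter_weights above.
import numpy as np
from scipy.linalg import solve

rng = np.random.default_rng(0)
neighbors = rng.normal(size=(4, 3))   # 4 neighbors in 3 dimensions
x = neighbors.mean(axis=0)            # a point inside their hull

C = neighbors - x                     # centered neighbors
G = C @ C.T                           # local Gram matrix
G.flat[:: G.shape[0] + 1] += 1e-3 * np.trace(G)  # same regularization as above
w = solve(G, np.ones(4), assume_a="pos")
w /= w.sum()                          # barycenter weights sum to 1

print(np.round(w, 3), np.allclose(w @ neighbors, x, atol=1e-6))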
def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None):
    """Computes the barycenter weighted graph of k-Neighbors for points in X.

    Parameters
    ----------
    X : {array-like, NearestNeighbors}
        Sample data, shape = (n_samples, n_features), in the form of a
        numpy array or a NearestNeighbors object.

    n_neighbors : int
        Number of neighbors for each sample.

    reg : float, default=1e-3
        Amount of regularization when solving the least-squares
        problem. Only relevant if mode='barycenter'. If None, use the
        default.

    n_jobs : int or None, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    A : sparse matrix in CSR format, shape = [n_samples, n_samples]
        A[i, j] is assigned the weight of the edge that connects i to j.

    See Also
    --------
    sklearn.neighbors.kneighbors_graph
    sklearn.neighbors.radius_neighbors_graph
    """
    knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X)
    X = knn._fit_X
    n_samples = knn.n_samples_fit_
    ind = knn.kneighbors(X, return_distance=False)[:, 1:]
    data = barycenter_weights(X, X, ind, reg=reg)
    indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors)
    return csr_matrix((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples))


def null_space(
    M, k, k_skip=1, eigen_solver="arpack", tol=1e-6, max_iter=100, random_state=None
):
    """
    Find the null space of a matrix M.

    Parameters
    ----------
    M : {array, matrix, sparse matrix, LinearOperator}
        Input covariance matrix: should be symmetric positive semi-definite.

    k : int
        Number of eigenvalues/vectors to return.

    k_skip : int, default=1
        Number of low eigenvalues to skip.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack'
        auto : algorithm will attempt to choose the best method for input data
        arpack : use arnoldi iteration in shift-invert mode.
            For this method, M may be a dense matrix, sparse matrix,
            or general linear operator.
            Warning: ARPACK can be unstable for some problems. It is
            best to try several random seeds in order to check results.
        dense : use standard dense matrix operations for the eigenvalue
            decomposition. For this method, M must be an array
            or matrix type. This method should be avoided for
            large problems.

    tol : float, default=1e-6
        Tolerance for 'arpack' method.
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        Maximum number of iterations for 'arpack' method.
        Not used if eigen_solver=='dense'.

    random_state : int, RandomState instance, default=None
        Determines the random number generator when ``solver`` == 'arpack'.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.
    """
    if eigen_solver == "auto":
        if M.shape[0] > 200 and k + k_skip < 10:
            eigen_solver = "arpack"
        else:
            eigen_solver = "dense"

    if eigen_solver == "arpack":
        v0 = _init_arpack_v0(M.shape[0], random_state)
        try:
            eigen_values, eigen_vectors = eigsh(
                M, k + k_skip, sigma=0.0, tol=tol, maxiter=max_iter, v0=v0
            )
        except RuntimeError as e:
            raise ValueError(
                "Error in determining null-space with ARPACK. Error message: "
                "'%s'. Note that eigen_solver='arpack' can fail when the "
                "weight matrix is singular or otherwise ill-behaved. In that "
                "case, eigen_solver='dense' is recommended. See online "
                "documentation for more information." % e
            ) from e

        return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])
    elif eigen_solver == "dense":
        if hasattr(M, "toarray"):
            M = M.toarray()
        eigen_values, eigen_vectors = eigh(
            M, subset_by_index=(k_skip, k + k_skip - 1), overwrite_a=True
        )
        index = np.argsort(np.abs(eigen_values))
        return eigen_vectors[:, index], np.sum(eigen_values)
    else:
        raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)
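`null_space` with `eigen_solver='arpack'` relies on `eigsh` in shift-invert mode (`sigma=0.0`) to pull out the eigenvectors whose eigenvalues are closest to zero, then drops the `k_skip` bottom vectors (in LLE, the near-constant one). A toy sketch on a generic symmetric positive semi-definite matrix (not an actual LLE weight matrix):

# Hedged sketch: shift-invert extraction of the "almost null" space.
import numpy as np
from scipy.sparse.linalg import eigsh

rng = np.random.default_rng(0)
A = rng.normal(size=(8, 8))
M = A.T @ A                       # symmetric positive semi-definite toy matrix

k, k_skip = 2, 1
# sigma=0.0 asks ARPACK for the eigenvalues closest to zero
vals, vecs = eigsh(M, k + k_skip, sigma=0.0)
embedding = vecs[:, k_skip:]      # drop the bottom eigenvector, keep k
print(vals.shape, embedding.shape)  # (3,) (8, 2)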
def _locally_linear_embedding(
    X,
    *,
    n_neighbors,
    n_components,
    reg=1e-3,
    eigen_solver="auto",
    tol=1e-6,
    max_iter=100,
    method="standard",
    hessian_tol=1e-4,
    modified_tol=1e-12,
    random_state=None,
    n_jobs=None,
):
    nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
    nbrs.fit(X)
    X = nbrs._fit_X

    N, d_in = X.shape

    if n_components > d_in:
        raise ValueError(
            "output dimension must be less than or equal to input dimension"
        )
    if n_neighbors >= N:
        raise ValueError(
            "Expected n_neighbors < n_samples, but n_samples = %d, n_neighbors = %d"
            % (N, n_neighbors)
        )

    M_sparse = eigen_solver != "dense"
    M_container_constructor = lil_matrix if M_sparse else np.zeros

    if method == "standard":
        W = barycenter_kneighbors_graph(
            nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs
        )

        # we'll compute M = (I-W)'(I-W)
        # depending on the solver, we'll do this differently
        if M_sparse:
            M = eye(*W.shape, format=W.format) - W
            M = M.T @ M
        else:
            M = (W.T @ W - W.T - W).toarray()
            M.flat[:: M.shape[0] + 1] += 1  # M = W' W - W' - W + I

    elif method == "hessian":
        dp = n_components * (n_components + 1) // 2

        if n_neighbors <= n_components + dp:
            raise ValueError(
                "for method='hessian', n_neighbors must be "
                "greater than "
                "[n_components * (n_components + 3) / 2]"
            )

        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        neighbors = neighbors[:, 1:]

        Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)
        Yi[:, 0] = 1

        M = M_container_constructor((N, N), dtype=np.float64)

        use_svd = n_neighbors > d_in

        for i in range(N):
            Gi = X[neighbors[i]]
            Gi -= Gi.mean(0)

            # build Hessian estimator
            if use_svd:
                U = svd(Gi, full_matrices=0)[0]
            else:
                Ci = np.dot(Gi, Gi.T)
                U = eigh(Ci)[1][:, ::-1]

            Yi[:, 1 : 1 + n_components] = U[:, :n_components]

            j = 1 + n_components
            for k in range(n_components):
                Yi[:, j : j + n_components - k] = U[:, k : k + 1] * U[:, k:n_components]
                j += n_components - k

            Q, R = qr(Yi)

            w = Q[:, n_components + 1 :]
            S = w.sum(0)

            S[np.where(abs(S) < hessian_tol)] = 1
            w /= S

            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] += np.dot(w, w.T)

    elif method == "modified":
        if n_neighbors < n_components:
            raise ValueError("modified LLE requires n_neighbors >= n_components")

        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        neighbors = neighbors[:, 1:]

        # find the eigenvectors and eigenvalues of each local covariance
        # matrix. We want V[i] to be a [n_neighbors x n_neighbors] matrix,
        # where the columns are eigenvectors
        V = np.zeros((N, n_neighbors, n_neighbors))
        nev = min(d_in, n_neighbors)
        evals = np.zeros([N, nev])

        # choose the most efficient way to find the eigenvectors
        use_svd = n_neighbors > d_in

        if use_svd:
            for i in range(N):
                X_nbrs = X[neighbors[i]] - X[i]
                V[i], evals[i], _ = svd(X_nbrs, full_matrices=True)
            evals **= 2
        else:
            for i in range(N):
                X_nbrs = X[neighbors[i]] - X[i]
                C_nbrs = np.dot(X_nbrs, X_nbrs.T)
                evi, vi = eigh(C_nbrs)
                evals[i] = evi[::-1]
                V[i] = vi[:, ::-1]

        # find regularized weights: this is like normal LLE.
        # because we've already computed the SVD of each covariance matrix,
        # it's faster to use this rather than np.linalg.solve
        reg = 1e-3 * evals.sum(1)

        tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors))
        tmp[:, :nev] /= evals + reg[:, None]
        tmp[:, nev:] /= reg[:, None]

        w_reg = np.zeros((N, n_neighbors))
        for i in range(N):
            w_reg[i] = np.dot(V[i], tmp[i])
        w_reg /= w_reg.sum(1)[:, None]

        # calculate eta: the median of the ratio of small to large eigenvalues
        # across the points. This is used to determine s_i, below
        rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1)
        eta = np.median(rho)

        # find s_i, the size of the "almost null space" for each point:
        # this is the size of the largest set of eigenvalues
        # such that Sum[v; v in set]/Sum[v; v not in set] < eta
        s_range = np.zeros(N, dtype=int)
        evals_cumsum = np.cumsum(evals, 1)
        eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
        for i in range(N):
            s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)
        s_range += n_neighbors - nev  # number of zero eigenvalues

        # Now calculate M.
        # This is the [N x N] matrix whose null space is the desired embedding
        M = M_container_constructor((N, N), dtype=np.float64)

        for i in range(N):
            s_i = s_range[i]

            # select bottom s_i eigenvectors and calculate alpha
            Vi = V[i, :, n_neighbors - s_i :]
            alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)

            # compute Householder matrix which satisfies
            #  Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s)
            # using prescription from paper
            h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors))

            norm_h = np.linalg.norm(h)
            if norm_h < modified_tol:
                h *= 0
            else:
                h /= norm_h

            # Householder matrix is
            #  >> Hi = np.identity(s_i) - 2*np.outer(h,h)
            # Then the weight matrix is
            #  >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None]
            # We do this much more efficiently:
            Wi = Vi - 2 * np.outer(np.dot(Vi, h), h) + (1 - alpha_i) * w_reg[i, :, None]

            # Update M as follows:
            # >> W_hat = np.zeros( (N,s_i) )
            # >> W_hat[neighbors[i],:] = Wi
            # >> W_hat[i] -= 1
            # >> M += np.dot(W_hat,W_hat.T)
            # We can do this much more efficiently:
            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)
            Wi_sum1 = Wi.sum(1)
            M[i, neighbors[i]] -= Wi_sum1
            M[neighbors[i], [i]] -= Wi_sum1
            M[i, i] += s_i

    elif method == "ltsa":
        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        neighbors = neighbors[:, 1:]

        M = M_container_constructor((N, N), dtype=np.float64)

        use_svd = n_neighbors > d_in

        for i in range(N):
            Xi = X[neighbors[i]]
            Xi -= Xi.mean(0)

            # compute n_components largest eigenvalues of Xi @ Xi^T
            if use_svd:
                v = svd(Xi, full_matrices=True)[0]
            else:
                Ci = np.dot(Xi, Xi.T)
                v = eigh(Ci)[1][:, ::-1]

            Gi = np.zeros((n_neighbors, n_components + 1))
            Gi[:, 1:] = v[:, :n_components]
            Gi[:, 0] = 1.0 / np.sqrt(n_neighbors)

            GiGiT = np.dot(Gi, Gi.T)

            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] -= GiGiT

            M[neighbors[i], neighbors[i]] += np.ones(shape=n_neighbors)

    if M_sparse:
        M = M.tocsr()

    return null_space(
        M,
        n_components,
        k_skip=1,
        eigen_solver=eigen_solver,
        tol=tol,
        max_iter=max_iter,
        random_state=random_state,
    )


@validate_params(
    {
        "X": ["array-like", NearestNeighbors],
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "reg": [Interval(Real, 0, None, closed="left")],
        "eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
        "tol": [Interval(Real, 0, None, closed="left")],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "method": [StrOptions({"standard", "hessian", "modified", "ltsa"})],
        "hessian_tol": [Interval(Real, 0, None, closed="left")],
        "modified_tol": [Interval(Real, 0, None, closed="left")],
        "random_state": ["random_state"],
        "n_jobs": [None, Integral],
    },
    prefer_skip_nested_validation=True,
)
def locally_linear_embedding(
    X,
    *,
    n_neighbors,
    n_components,
    reg=1e-3,
    eigen_solver="auto",
    tol=1e-6,
    max_iter=100,
    method="standard",
    hessian_tol=1e-4,
    modified_tol=1e-12,
    random_state=None,
    n_jobs=None,
):
    """Perform a Locally Linear Embedding analysis on the data.

    Read more in the :ref:`User Guide <locally_linear_embedding>`.

    Parameters
    ----------
    X : {array-like, NearestNeighbors}
        Sample data, shape = (n_samples, n_features), in the form of a
        numpy array or a NearestNeighbors object.

    n_neighbors : int
        Number of neighbors to consider for each point.

    n_components : int
        Number of coordinates for the manifold.

    reg : float, default=1e-3
        Regularization constant, multiplies the trace of the local covariance
        matrix of the distances.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        auto : algorithm will attempt to choose the best method for input data

        arpack : use arnoldi iteration in shift-invert mode.
            For this method, M may be a dense matrix, sparse matrix,
            or general linear operator.
            Warning: ARPACK can be unstable for some problems. It is
            best to try several random seeds in order to check results.

        dense : use standard dense matrix operations for the eigenvalue
            decomposition. For this method, M must be an array
            or matrix type. This method should be avoided for
            large problems.

    tol : float, default=1e-6
        Tolerance for 'arpack' method.
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        Maximum number of iterations for the arpack solver.

    method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
        standard : use the standard locally linear embedding algorithm.
            see reference [1]_
        hessian : use the Hessian eigenmap method. This method requires
            n_neighbors > n_components * (n_components + 3) / 2.
            see reference [2]_
        modified : use the modified locally linear embedding algorithm.
            see reference [3]_
        ltsa : use local tangent space alignment algorithm.
            see reference [4]_

    hessian_tol : float, default=1e-4
        Tolerance for Hessian eigenmapping method.
        Only used if method == 'hessian'.

    modified_tol : float, default=1e-12
        Tolerance for modified LLE method.
        Only used if method == 'modified'.

    random_state : int, RandomState instance, default=None
        Determines the random number generator when ``solver`` == 'arpack'.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_jobs : int or None, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    Y : ndarray of shape (n_samples, n_components)
        Embedding vectors.

    squared_error : float
        Reconstruction error for the embedding vectors. Equivalent to
        ``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.

    References
    ----------

    .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
        by locally linear embedding. Science 290:2323 (2000).
    .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
        linear embedding techniques for high-dimensional data.
        Proc Natl Acad Sci U S A. 100:5591 (2003).
    .. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
        Embedding Using Multiple Weights.
        <https://citeseerx.ist.psu.edu/doc_view/pid/0b060fdbd92cbcc66b383bcaa9ba5e5e624d7ee3>`_
    .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
        dimensionality reduction via tangent space alignment.
        Journal of Shanghai Univ. 8:406 (2004)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import locally_linear_embedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding, _ = locally_linear_embedding(X[:100], n_neighbors=5, n_components=2)
    >>> embedding.shape
    (100, 2)
    """
    return _locally_linear_embedding(
        X=X,
        n_neighbors=n_neighbors,
        n_components=n_components,
        reg=reg,
        eigen_solver=eigen_solver,
        tol=tol,
        max_iter=max_iter,
        method=method,
        hessian_tol=hessian_tol,
        modified_tol=modified_tol,
        random_state=random_state,
        n_jobs=n_jobs,
    )


class LocallyLinearEmbedding(
    ClassNamePrefixFeaturesOutMixin,
    TransformerMixin,
    _UnstableArchMixin,
    BaseEstimator,
):
    """Locally Linear Embedding.

    Read more in the :ref:`User Guide <locally_linear_embedding>`.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors to consider for each point.

    n_components : int, default=2
        Number of coordinates for the manifold.

    reg : float, default=1e-3
        Regularization constant, multiplies the trace of the local covariance
        matrix of the distances.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        The solver used to compute the eigenvectors. The available options are:

        - `'auto'` : algorithm will attempt to choose the best method for input
          data.
        - `'arpack'` : use arnoldi iteration in shift-invert mode. For this
          method, M may be a dense matrix, sparse matrix, or general linear
          operator.
        - `'dense'` : use standard dense matrix operations for the eigenvalue
          decomposition. For this method, M must be an array or matrix type.
          This method should be avoided for large problems.

        .. warning::
           ARPACK can be unstable for some problems. It is best to try several
           random seeds in order to check results.

    tol : float, default=1e-6
        Tolerance for 'arpack' method.
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        Maximum number of iterations for the arpack solver.
        Not used if eigen_solver=='dense'.

    method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
        - `standard`: use the standard locally linear embedding algorithm. see
          reference [1]_
        - `hessian`: use the Hessian eigenmap method. This method requires
          ``n_neighbors > n_components * (n_components + 3) / 2``. see
          reference [2]_
        - `modified`: use the modified locally linear embedding algorithm.
          see reference [3]_
        - `ltsa`: use local tangent space alignment algorithm. see
          reference [4]_

    hessian_tol : float, default=1e-4
        Tolerance for Hessian eigenmapping method.
        Only used if ``method == 'hessian'``.

    modified_tol : float, default=1e-12
        Tolerance for modified LLE method.
        Only used if ``method == 'modified'``.

    neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
                          default='auto'
        Algorithm to use for nearest neighbors search, passed to
        :class:`~sklearn.neighbors.NearestNeighbors` instance.

    random_state : int, RandomState instance, default=None
        Determines the random number generator when
        ``eigen_solver`` == 'arpack'. Pass an int for reproducible results
        across multiple function calls. See :term:`Glossary <random_state>`.

    n_jobs : int or None, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    embedding_ : array-like, shape [n_samples, n_components]
        Stores the embedding vectors.

    reconstruction_error_ : float
        Reconstruction error associated with `embedding_`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    nbrs_ : NearestNeighbors object
        Stores nearest neighbors instance, including BallTree or KDTree
        if applicable.

    See Also
    --------
    SpectralEmbedding : Spectral embedding for non-linear dimensionality
        reduction.
    TSNE : T-distributed Stochastic Neighbor Embedding.

    References
    ----------

    .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
        by locally linear embedding. Science 290:2323 (2000).
    .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
        linear embedding techniques for high-dimensional data.
        Proc Natl Acad Sci U S A. 100:5591 (2003).
    .. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
        Embedding Using Multiple Weights.
        <https://citeseerx.ist.psu.edu/doc_view/pid/0b060fdbd92cbcc66b383bcaa9ba5e5e624d7ee3>`_
    .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
        dimensionality reduction via tangent space alignment.
        Journal of Shanghai Univ. 8:406 (2004)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import LocallyLinearEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = LocallyLinearEmbedding(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    _parameter_constraints: dict = {
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "reg": [Interval(Real, 0, None, closed="left")],
|
||||
"eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
|
||||
"tol": [Interval(Real, 0, None, closed="left")],
|
||||
"max_iter": [Interval(Integral, 1, None, closed="left")],
|
||||
"method": [StrOptions({"standard", "hessian", "modified", "ltsa"})],
|
||||
"hessian_tol": [Interval(Real, 0, None, closed="left")],
|
||||
"modified_tol": [Interval(Real, 0, None, closed="left")],
|
||||
"neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})],
|
||||
"random_state": ["random_state"],
|
||||
"n_jobs": [None, Integral],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
n_neighbors=5,
|
||||
n_components=2,
|
||||
reg=1e-3,
|
||||
eigen_solver="auto",
|
||||
tol=1e-6,
|
||||
max_iter=100,
|
||||
method="standard",
|
||||
hessian_tol=1e-4,
|
||||
modified_tol=1e-12,
|
||||
neighbors_algorithm="auto",
|
||||
random_state=None,
|
||||
n_jobs=None,
|
||||
):
|
||||
self.n_neighbors = n_neighbors
|
||||
self.n_components = n_components
|
||||
self.reg = reg
|
||||
self.eigen_solver = eigen_solver
|
||||
self.tol = tol
|
||||
self.max_iter = max_iter
|
||||
self.method = method
|
||||
self.hessian_tol = hessian_tol
|
||||
self.modified_tol = modified_tol
|
||||
self.random_state = random_state
|
||||
self.neighbors_algorithm = neighbors_algorithm
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def _fit_transform(self, X):
|
||||
self.nbrs_ = NearestNeighbors(
|
||||
n_neighbors=self.n_neighbors,
|
||||
algorithm=self.neighbors_algorithm,
|
||||
n_jobs=self.n_jobs,
|
||||
)
|
||||
|
||||
random_state = check_random_state(self.random_state)
|
||||
X = validate_data(self, X, dtype=float)
|
||||
self.nbrs_.fit(X)
|
||||
self.embedding_, self.reconstruction_error_ = _locally_linear_embedding(
|
||||
X=self.nbrs_,
|
||||
n_neighbors=self.n_neighbors,
|
||||
n_components=self.n_components,
|
||||
eigen_solver=self.eigen_solver,
|
||||
tol=self.tol,
|
||||
max_iter=self.max_iter,
|
||||
method=self.method,
|
||||
hessian_tol=self.hessian_tol,
|
||||
modified_tol=self.modified_tol,
|
||||
random_state=random_state,
|
||||
reg=self.reg,
|
||||
n_jobs=self.n_jobs,
|
||||
)
|
||||
self._n_features_out = self.embedding_.shape[1]
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X, y=None):
|
||||
"""Compute the embedding vectors for data X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training set.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
Fitted `LocallyLinearEmbedding` class instance.
|
||||
"""
|
||||
self._fit_transform(X)
|
||||
return self
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit_transform(self, X, y=None):
|
||||
"""Compute the embedding vectors for data X and transform X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training set.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X_new : array-like, shape (n_samples, n_components)
|
||||
Returns the instance itself.
|
||||
"""
|
||||
self._fit_transform(X)
|
||||
return self.embedding_
|
||||
|
||||
def transform(self, X):
|
||||
"""
|
||||
Transform new points into embedding space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training set.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X_new : ndarray of shape (n_samples, n_components)
|
||||
Returns the instance itself.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Because of scaling performed by this method, it is discouraged to use
|
||||
it together with methods that are not scale-invariant (like SVMs).
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
X = validate_data(self, X, reset=False)
|
||||
ind = self.nbrs_.kneighbors(
|
||||
X, n_neighbors=self.n_neighbors, return_distance=False
|
||||
)
|
||||
weights = barycenter_weights(X, self.nbrs_._fit_X, ind, reg=self.reg)
|
||||
X_new = np.empty((X.shape[0], self.n_components))
|
||||
for i in range(X.shape[0]):
|
||||
X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i])
|
||||
return X_new
|
||||
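

if __name__ == "__main__":
    # Hedged usage sketch, not part of the estimator API: fit on a subset of
    # digits and map held-out points through ``transform``, which implements
    # the barycentric out-of-sample extension above. Variable names here are
    # hypothetical demo names.
    from sklearn.datasets import load_digits

    X_demo, _ = load_digits(return_X_y=True)
    lle_demo = LocallyLinearEmbedding(n_neighbors=5, n_components=2)
    Y_train = lle_demo.fit_transform(X_demo[:100])
    Y_new = lle_demo.transform(X_demo[100:110])  # out-of-sample points
    print(Y_train.shape, Y_new.shape)  # (100, 2) (10, 2)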
@@ -0,0 +1,836 @@
"""
Multi-dimensional Scaling (MDS).
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from numbers import Integral, Real

import numpy as np
from joblib import effective_n_jobs

from sklearn.base import BaseEstimator, _fit_context
from sklearn.isotonic import IsotonicRegression
from sklearn.manifold import ClassicalMDS
from sklearn.metrics import euclidean_distances, pairwise_distances
from sklearn.utils import check_array, check_random_state, check_symmetric
from sklearn.utils._param_validation import (
    Hidden,
    Interval,
    StrOptions,
    validate_params,
)
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import validate_data


def _smacof_single(
    dissimilarities,
    metric=True,
    n_components=2,
    init=None,
    max_iter=300,
    verbose=0,
    eps=1e-6,
    random_state=None,
    normalized_stress=False,
):
    """Compute multidimensional scaling using the SMACOF algorithm.

    Parameters
    ----------
    dissimilarities : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : bool, default=True
        Compute metric or nonmetric SMACOF algorithm.
        When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
        missing values.

    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : ndarray of shape (n_samples, n_components), default=None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-6
        The tolerance with respect to stress (normalized by the sum of squared
        embedding distances) at which to declare convergence.

        .. versionchanged:: 1.7
           The default value for `eps` has changed from 1e-3 to 1e-6, as a result
           of a bugfix in the computation of the convergence criterion.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    normalized_stress : bool, default=False
        Whether to return normalized stress value (Stress-1) instead of raw
        stress.

        .. versionadded:: 1.2

        .. versionchanged:: 1.7
           Normalized stress is now supported for metric MDS as well.

    Returns
    -------
    X : ndarray of shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared differences between the
        disparities and the distances for all constrained points).
        If `normalized_stress=True`, returns Stress-1.
        A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
        0.1 fair, and 0.2 poor [1]_.

    n_iter : int
        The number of iterations corresponding to the best stress.

    References
    ----------
    .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
       Psychometrika, 29 (1964)

    .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
       hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
       Groenen P. Springer Series in Statistics (1997)
    """
    dissimilarities = check_symmetric(dissimilarities, raise_exception=True)

    n_samples = dissimilarities.shape[0]
    random_state = check_random_state(random_state)

    dissimilarities_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
    dissimilarities_flat_w = dissimilarities_flat[dissimilarities_flat != 0]
    if init is None:
        # Randomly choose initial configuration
        X = random_state.uniform(size=n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # the shape of init overrides the n_components parameter
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError(
                "init matrix should be of shape (%d, %d)" % (n_samples, n_components)
            )
        X = init
    distances = euclidean_distances(X)

    # Out of bounds condition cannot happen because we are transforming
    # the training set here, but does sometimes get triggered in
    # practice due to machine precision issues. Hence "clip".
    ir = IsotonicRegression(out_of_bounds="clip")

    old_stress = None
    for it in range(max_iter):
        # Compute distance and monotonic regression
        if metric:
            disparities = dissimilarities
        else:
            distances_flat = distances.ravel()
            # dissimilarities with 0 are considered as missing values
            distances_flat_w = distances_flat[dissimilarities_flat != 0]

            # Compute the disparities using isotonic regression.
            # For the first SMACOF iteration, use scaled original dissimilarities.
            # (This choice follows the R implementation described in this paper:
            # https://www.jstatsoft.org/article/view/v102i10)
            if it < 1:
                disparities_flat = dissimilarities_flat_w
            else:
                disparities_flat = ir.fit_transform(
                    dissimilarities_flat_w, distances_flat_w
                )
            disparities = np.zeros_like(distances_flat)
            disparities[dissimilarities_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt(
                (n_samples * (n_samples - 1) / 2) / (disparities**2).sum()
            )
            disparities = disparities + disparities.T

        # Update X using the Guttman transform
        distances[distances == 0] = 1e-5
        ratio = disparities / distances
        B = -ratio
        B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        X = 1.0 / n_samples * np.dot(B, X)

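        # Note on the update above: with unit weights, the majorizing
        # "Guttman transform" is X <- (1/n) * B(X) X, where B(X) has
        # off-diagonal entries -d_hat_ij / d_ij(X) (disparity over current
        # embedding distance) and diagonal entries chosen so that each row
        # sums to zero. Each such update is guaranteed not to increase the
        # stress.
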
        # Compute stress
        distances = euclidean_distances(X)
        stress = ((distances.ravel() - disparities.ravel()) ** 2).sum() / 2

        if verbose >= 2:  # pragma: no cover
            print(f"Iteration {it}, stress {stress:.4f}")
        if old_stress is not None:
            sum_squared_distances = (distances.ravel() ** 2).sum()
            if ((old_stress - stress) / (sum_squared_distances / 2)) < eps:
                if verbose:  # pragma: no cover
                    print(f"Convergence criterion reached (iteration {it}).")
                break
        old_stress = stress

    if normalized_stress:
        sum_squared_distances = (distances.ravel() ** 2).sum()
        stress = np.sqrt(stress / (sum_squared_distances / 2))

    return X, stress, it + 1


# TODO(1.9): change default `n_init` to 1, see PR #31117
@validate_params(
    {
        "dissimilarities": ["array-like"],
        "metric": ["boolean"],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "init": ["array-like", None],
        "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})],
        "n_jobs": [Integral, None],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "verbose": ["verbose"],
        "eps": [Interval(Real, 0, None, closed="left")],
        "random_state": ["random_state"],
        "return_n_iter": ["boolean"],
        "normalized_stress": ["boolean", StrOptions({"auto"})],
    },
    prefer_skip_nested_validation=True,
)
def smacof(
    dissimilarities,
    *,
    metric=True,
    n_components=2,
    init=None,
    n_init="warn",
    n_jobs=None,
    max_iter=300,
    verbose=0,
    eps=1e-6,
    random_state=None,
    return_n_iter=False,
    normalized_stress="auto",
):
    """Compute multidimensional scaling using the SMACOF algorithm.

    The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
    multidimensional scaling algorithm which minimizes an objective function
    (the *stress*) using a majorization technique. Stress majorization, also
    known as the Guttman Transform, guarantees a monotone convergence of
    stress, and is more powerful than traditional techniques such as gradient
    descent.

    The SMACOF algorithm for metric MDS can be summarized by the following
    steps:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress.
    3. Compute the Guttman Transform.
    4. Iterate 2 and 3 until convergence.

    The nonmetric algorithm adds a monotonic regression step before computing
    the stress.

    Parameters
    ----------
    dissimilarities : array-like of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : bool, default=True
        Compute metric or nonmetric SMACOF algorithm.
        When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
        missing values.

    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : array-like of shape (n_samples, n_components), default=None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.

    n_init : int, default=8
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress. If ``init`` is
        provided, this option is overridden and a single run is performed.

        .. versionchanged:: 1.9
           The default value for `n_init` will change from 8 to 1 in version 1.9.

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-6
        The tolerance with respect to stress (normalized by the sum of squared
        embedding distances) at which to declare convergence.

        .. versionchanged:: 1.7
           The default value for `eps` has changed from 1e-3 to 1e-6, as a result
           of a bugfix in the computation of the convergence criterion.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    return_n_iter : bool, default=False
        Whether or not to return the number of iterations.

    normalized_stress : bool or "auto", default="auto"
        Whether to return normalized stress value (Stress-1) instead of raw
        stress. By default, metric MDS returns raw stress while non-metric MDS
        returns normalized stress.

        .. versionadded:: 1.2

        .. versionchanged:: 1.4
           The default value changed from `False` to `"auto"` in version 1.4.

        .. versionchanged:: 1.7
           Normalized stress is now supported for metric MDS as well.

    Returns
    -------
    X : ndarray of shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared differences between the
        disparities and the distances for all constrained points).
        If `normalized_stress=True`, returns Stress-1.
        A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
        0.1 fair, and 0.2 poor [1]_.

    n_iter : int
        The number of iterations corresponding to the best stress. Returned
        only if ``return_n_iter`` is set to ``True``.

    References
    ----------
    .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
       Psychometrika, 29 (1964)

    .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
       hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
       Groenen P. Springer Series in Statistics (1997)

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.manifold import smacof
    >>> from sklearn.metrics import euclidean_distances
    >>> X = np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]])
    >>> dissimilarities = euclidean_distances(X)
    >>> Z, stress = smacof(
    ...     dissimilarities, n_components=2, n_init=1, eps=1e-6, random_state=42
    ... )
    >>> Z.shape
    (3, 2)
    >>> np.round(stress, 6).item()
    3.2e-05
    """

    if n_init == "warn":
        warnings.warn(
            "The default value of `n_init` will change from 8 to 1 in 1.9.",
            FutureWarning,
        )
        n_init = 8

    dissimilarities = check_array(dissimilarities)
    random_state = check_random_state(random_state)

    if normalized_stress == "auto":
        normalized_stress = not metric

    if hasattr(init, "__array__"):
        init = np.asarray(init).copy()
        if not n_init == 1:
            warnings.warn(
                "Explicit initial positions passed: "
                "performing only one init of the MDS instead of %d" % n_init
            )
            n_init = 1

    best_pos, best_stress = None, None

    if effective_n_jobs(n_jobs) == 1:
        for it in range(n_init):
            pos, stress, n_iter_ = _smacof_single(
                dissimilarities,
                metric=metric,
                n_components=n_components,
                init=init,
                max_iter=max_iter,
                verbose=verbose,
                eps=eps,
                random_state=random_state,
                normalized_stress=normalized_stress,
            )
            if best_stress is None or stress < best_stress:
                best_stress = stress
                best_pos = pos.copy()
                best_iter = n_iter_
    else:
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
            delayed(_smacof_single)(
                dissimilarities,
                metric=metric,
                n_components=n_components,
                init=init,
                max_iter=max_iter,
                verbose=verbose,
                eps=eps,
                random_state=seed,
                normalized_stress=normalized_stress,
            )
            for seed in seeds
        )
        positions, stress, n_iters = zip(*results)
        best = np.argmin(stress)
        best_stress = stress[best]
        best_pos = positions[best]
        best_iter = n_iters[best]

    if return_n_iter:
        return best_pos, best_stress, best_iter
    else:
        return best_pos, best_stress


# TODO(1.9): change default `n_init` to 1, see PR #31117
# TODO(1.10): change default `init` to "classical_mds", see PR #32229
# TODO(1.10): drop support for boolean `metric`, see PR #32229
# TODO(1.10): drop support for `dissimilarity`, see PR #32229
class MDS(BaseEstimator):
    """Multidimensional scaling.

    Read more in the :ref:`User Guide <multidimensional_scaling>`.

    Parameters
    ----------
    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities.

    metric_mds : bool, default=True
        If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
        When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
        missing values.

        .. versionchanged:: 1.8
           The parameter `metric` was renamed to `metric_mds`.

    n_init : int, default=4
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress.

        .. versionchanged:: 1.9
           The default value for `n_init` will change from 4 to 1 in version 1.9.

    init : {'random', 'classical_mds'}, default='random'
        The initialization approach. If `random`, random initialization is used.
        If `classical_mds`, then classical MDS is run and used as initialization
        for MDS (in this case, the value of `n_init` is ignored).

        .. versionadded:: 1.8

        .. versionchanged:: 1.10
           The default value for `init` will change to `classical_mds`.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-6
        The tolerance with respect to stress (normalized by the sum of squared
        embedding distances) at which to declare convergence.

        .. versionchanged:: 1.7
           The default value for `eps` has changed from 1e-3 to 1e-6, as a result
           of a bugfix in the computation of the convergence criterion.

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    dissimilarity : {'euclidean', 'precomputed'}
        Dissimilarity measure to use:

        - 'euclidean':
          Pairwise Euclidean distances between points in the dataset.

        - 'precomputed':
          Pre-computed dissimilarities are passed directly to ``fit`` and
          ``fit_transform``.

        .. deprecated:: 1.8
           `dissimilarity` was renamed `metric` in 1.8 and will be removed in 1.10.

    metric : str or callable, default='euclidean'
        Metric to use for dissimilarity computation. Default is "euclidean".

        If metric is a string, it must be one of the options allowed by
        `scipy.spatial.distance.pdist` for its metric parameter, or a metric
        listed in :func:`sklearn.metrics.pairwise.distance_metrics`.

        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        .. versionchanged:: 1.8
           Prior to 1.8, `metric=True/False` was used to select metric/non-metric
           MDS, which is now the role of `metric_mds`. The support for ``True``
           and ``False`` will be dropped in version 1.10, use `metric_mds` instead.

    metric_params : dict, default=None
        Additional keyword arguments for the dissimilarity computation.

        .. versionadded:: 1.8

    normalized_stress : bool or "auto", default="auto"
        Whether to return normalized stress value (Stress-1) instead of raw
        stress. By default, metric MDS returns raw stress while non-metric MDS
        returns normalized stress.

        .. versionadded:: 1.2

        .. versionchanged:: 1.4
           The default value changed from `False` to `"auto"` in version 1.4.

        .. versionchanged:: 1.7
           Normalized stress is now supported for metric MDS as well.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Stores the position of the dataset in the embedding space.

    stress_ : float
        The final value of the stress (sum of squared differences between the
        disparities and the distances for all constrained points).
        If `normalized_stress=True`, returns Stress-1.
        A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
        0.1 fair, and 0.2 poor [1]_.

    dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Symmetric matrix that:

        - either uses a custom dissimilarity matrix by setting `dissimilarity`
          to 'precomputed';
        - or constructs a dissimilarity matrix from data using
          Euclidean distances.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The number of iterations corresponding to the best stress.

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis that is a linear
        dimensionality reduction method.
    sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
        kernels and PCA.
    TSNE : T-distributed Stochastic Neighbor Embedding.
    Isomap : Manifold learning based on Isometric Mapping.
    LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
    SpectralEmbedding : Spectral embedding for non-linear dimensionality
        reduction.

    References
    ----------
    .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
       Psychometrika, 29 (1964)

    .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
       hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
       Groenen P. Springer Series in Statistics (1997)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import MDS
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = MDS(n_components=2, n_init=1, init="random")
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)

    For a more detailed example of usage, see
    :ref:`sphx_glr_auto_examples_manifold_plot_mds.py`.

    For a comparison of manifold learning techniques, see
    :ref:`sphx_glr_auto_examples_manifold_plot_compare_methods.py`.
    """

    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "metric_mds": ["boolean"],
        "n_init": [
            Interval(Integral, 1, None, closed="left"),
            Hidden(StrOptions({"warn"})),
        ],
        "init": [StrOptions({"random", "classical_mds"}), Hidden(StrOptions({"warn"}))],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "verbose": ["verbose"],
        "eps": [Interval(Real, 0.0, None, closed="left")],
        "n_jobs": [None, Integral],
        "random_state": ["random_state"],
        "dissimilarity": [
            StrOptions({"euclidean", "precomputed"}),
            Hidden(StrOptions({"deprecated"})),
        ],
        "metric": [str, callable, Hidden("boolean")],
        "metric_params": [dict, None],
        "normalized_stress": ["boolean", StrOptions({"auto"})],
    }

    def __init__(
        self,
        n_components=2,
        *,
        metric_mds=True,
        n_init="warn",
        init="warn",
        max_iter=300,
        verbose=0,
        eps=1e-6,
        n_jobs=None,
        random_state=None,
        dissimilarity="deprecated",
        metric="euclidean",
        metric_params=None,
        normalized_stress="auto",
    ):
        self.n_components = n_components
        self.dissimilarity = dissimilarity
        self.metric = metric
        self.metric_params = metric_params
        self.metric_mds = metric_mds
        self.n_init = n_init
        self.init = init
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.normalized_stress = normalized_stress

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.pairwise = (self.dissimilarity == "precomputed") or (
            self.metric == "precomputed"
        )
        return tags

    def fit(self, X, y=None, init=None):
        """
        Compute the position of the points in the embedding space.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``metric=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self.fit_transform(X, init=init)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y=None, init=None):
        """
        Fit the data from `X`, and returns the embedded coordinates.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``metric=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            X transformed in the new space.
        """

        if self.n_init == "warn":
            warnings.warn(
                "The default value of `n_init` will change from 4 to 1 in 1.9. "
                "To suppress this warning, provide some value of `n_init`.",
                FutureWarning,
            )
            self._n_init = 4
        else:
            self._n_init = self.n_init

        if self.init == "warn":
            warnings.warn(
                "The default value of `init` will change from 'random' to "
                "'classical_mds' in 1.10. To suppress this warning, provide "
                "some value of `init`.",
                FutureWarning,
            )
            self._init = "random"
        else:
            self._init = self.init

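        # Resolve the deprecated spellings below: the old `dissimilarity`
        # parameter (deprecated in 1.8) and a boolean `metric` are both mapped
        # onto the new pair of attributes, where `self._metric` is the
        # dissimilarity measure passed to `pairwise_distances` and
        # `self._metric_mds` selects metric vs. non-metric MDS.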
        if self.dissimilarity != "deprecated":
            if not isinstance(self.metric, bool) and self.metric != "euclidean":
                raise ValueError(
                    "You provided both `dissimilarity` and `metric`. Please use "
                    "only `metric`."
                )
            else:
                warnings.warn(
                    "The `dissimilarity` parameter is deprecated and will be "
                    "removed in 1.10. Use `metric` instead.",
                    FutureWarning,
                )
                self._metric = self.dissimilarity

        if isinstance(self.metric, bool):
            warnings.warn(
                f"Use metric_mds={self.metric} instead of metric={self.metric}. The "
                "support for metric={True/False} will be dropped in 1.10.",
                FutureWarning,
            )
            if self.dissimilarity == "deprecated":
                self._metric = "euclidean"
            self._metric_mds = self.metric
        else:
            if self.dissimilarity == "deprecated":
                self._metric = self.metric
            self._metric_mds = self.metric_mds

        X = validate_data(self, X)
        if X.shape[0] == X.shape[1] and self._metric != "precomputed":
            warnings.warn(
                "The provided input is a square matrix. Note that ``fit`` constructs "
                "a dissimilarity matrix from data and will treat rows as samples "
                "and columns as features. To use a pre-computed dissimilarity matrix, "
                "set ``metric='precomputed'``."
            )

        if self._metric == "precomputed":
            self.dissimilarity_matrix_ = X
            self.dissimilarity_matrix_ = check_symmetric(
                self.dissimilarity_matrix_, raise_exception=True
            )
        else:
            self.dissimilarity_matrix_ = pairwise_distances(
                X,
                metric=self._metric,
                **(self.metric_params if self.metric_params is not None else {}),
            )

        if init is not None:
            init_array = init
        elif self._init == "classical_mds":
            cmds = ClassicalMDS(metric="precomputed")
            init_array = cmds.fit_transform(self.dissimilarity_matrix_)
        else:
            init_array = None

        self.embedding_, self.stress_, self.n_iter_ = smacof(
            self.dissimilarity_matrix_,
            metric=self._metric_mds,
            n_components=self.n_components,
            init=init_array,
            n_init=self._n_init,
            n_jobs=self.n_jobs,
            max_iter=self.max_iter,
            verbose=self.verbose,
            eps=self.eps,
            random_state=self.random_state,
            return_n_iter=True,
            normalized_stress=self.normalized_stress,
        )

        return self.embedding_
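

if __name__ == "__main__":
    # Hedged usage sketch, not part of the library code: embed a small
    # precomputed dissimilarity matrix with MDS. Variable names are
    # hypothetical demo names.
    from sklearn.datasets import load_iris

    X_demo, _ = load_iris(return_X_y=True)
    D = pairwise_distances(X_demo[:50])  # symmetric dissimilarity matrix
    mds_demo = MDS(
        n_components=2, metric="precomputed", n_init=1, init="random", random_state=0
    )
    Y = mds_demo.fit_transform(D)
    print(Y.shape, float(mds_demo.stress_))  # (50, 2) and the final stress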
@@ -0,0 +1,772 @@
"""Spectral Embedding."""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from numbers import Integral, Real

import numpy as np
from scipy import sparse
from scipy.linalg import eigh
from scipy.sparse.csgraph import connected_components
from scipy.sparse.linalg import eigsh, lobpcg

from sklearn.base import BaseEstimator, _fit_context
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors, kneighbors_graph
from sklearn.utils import check_array, check_random_state, check_symmetric
from sklearn.utils._arpack import _init_arpack_v0
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils.fixes import laplacian as csgraph_laplacian
from sklearn.utils.fixes import parse_version, sp_version
from sklearn.utils.validation import validate_data


def _graph_connected_component(graph, node_id):
    """Find the connected component of the graph that contains the given node.

    Parameters
    ----------
    graph : array-like of shape (n_samples, n_samples)
        Adjacency matrix of the graph, non-zero weight means an edge
        between the nodes.

    node_id : int
        The index of the query node of the graph.

    Returns
    -------
    connected_components_matrix : array-like of shape (n_samples,)
        An array of boolean values indicating the indices of the nodes
        belonging to the connected component of the given query node.
    """
    n_node = graph.shape[0]
    if sparse.issparse(graph):
        # speed up row-wise access to boolean connection mask
        graph = graph.tocsr()
    connected_nodes = np.zeros(n_node, dtype=bool)
    nodes_to_explore = np.zeros(n_node, dtype=bool)
    nodes_to_explore[node_id] = True
    for _ in range(n_node):
        last_num_component = connected_nodes.sum()
        np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)
        if last_num_component >= connected_nodes.sum():
            break
        indices = np.where(nodes_to_explore)[0]
        nodes_to_explore.fill(False)
        for i in indices:
            if sparse.issparse(graph):
                # scipy has not yet implemented 1D sparse slices; this can be
                # changed back to `neighbors = graph[i].toarray().ravel()`
                # once it is implemented
                neighbors = graph[[i], :].toarray().ravel()
            else:
                neighbors = graph[i]
            np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)
    return connected_nodes


def _graph_is_connected(graph):
    """Return whether the graph is connected (True) or not (False).

    Parameters
    ----------
    graph : {array-like, sparse matrix} of shape (n_samples, n_samples)
        Adjacency matrix of the graph, non-zero weight means an edge
        between the nodes.

    Returns
    -------
    is_connected : bool
        True means the graph is fully connected and False means not.
    """
    if sparse.issparse(graph):
        # Before Scipy 1.11.3, `connected_components` only supports 32-bit indices.
        # PR: https://github.com/scipy/scipy/pull/18913
        # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
        # TODO(jjerphan): Once SciPy 1.11.3 is the minimum supported version, use
        # `accept_large_sparse=True`.
        accept_large_sparse = sp_version >= parse_version("1.11.3")
        graph = check_array(
            graph, accept_sparse=True, accept_large_sparse=accept_large_sparse
        )
        # sparse graph, find all the connected components
        n_connected_components, _ = connected_components(graph)
        return n_connected_components == 1
    else:
        # dense graph, find all connected components starting from node 0
        return _graph_connected_component(graph, 0).sum() == graph.shape[0]


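# Illustrative note (hedged, not part of the library code): for the
# two-component adjacency matrix below, `_graph_is_connected` returns False,
# since no edge joins nodes {0, 1} to nodes {2, 3}.
#
#     adj = np.array([[0, 1, 0, 0],
#                     [1, 0, 0, 0],
#                     [0, 0, 0, 1],
#                     [0, 0, 1, 0]])
#     _graph_is_connected(adj)  # -> False

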
def _set_diag(laplacian, value, norm_laplacian):
    """Set the diagonal of the laplacian matrix and convert it to a
    sparse format well suited for eigenvalue decomposition.

    Parameters
    ----------
    laplacian : {ndarray, sparse matrix}
        The graph laplacian.

    value : float
        The value of the diagonal.

    norm_laplacian : bool
        Whether the value of the diagonal should be changed or not.

    Returns
    -------
    laplacian : {array, sparse matrix}
        An array or matrix in a form that is well suited to fast
        eigenvalue decomposition, depending on the bandwidth of the
        matrix.
    """
    n_nodes = laplacian.shape[0]
    # We need to set all diagonal entries to `value`
    if not sparse.issparse(laplacian):
        if norm_laplacian:
            laplacian.flat[:: n_nodes + 1] = value
    else:
        laplacian = laplacian.tocoo()
        if norm_laplacian:
            diag_idx = laplacian.row == laplacian.col
            laplacian.data[diag_idx] = value
        # If the matrix has a small number of diagonals (as in the
        # case of structured matrices coming from images), the
        # dia format might be best suited for matvec products:
        n_diags = np.unique(laplacian.row - laplacian.col).size
        if n_diags <= 7:
            # 3 or less outer diagonals on each side
            laplacian = laplacian.todia()
        else:
            # csr has the fastest matvec and is thus best suited to
            # arpack
            laplacian = laplacian.tocsr()
    return laplacian


@validate_params(
    {
        "adjacency": ["array-like", "sparse matrix"],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "random_state": ["random_state"],
        "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
        "norm_laplacian": ["boolean"],
        "drop_first": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def spectral_embedding(
    adjacency,
    *,
    n_components=8,
    eigen_solver=None,
    random_state=None,
    eigen_tol="auto",
    norm_laplacian=True,
    drop_first=True,
):
    """Project the sample on the first eigenvectors of the graph Laplacian.

    The adjacency matrix is used to compute a normalized graph Laplacian
    whose spectrum (especially the eigenvectors associated to the
    smallest eigenvalues) has an interpretation in terms of minimal
    number of cuts necessary to split the graph into comparably sized
    components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    However, care must be taken to always make the affinity matrix symmetric
    so that the eigenvector decomposition works as expected.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="amg"` values of `tol<1e-5` may lead
        to convergence issues and should be avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    norm_laplacian : bool, default=True
        If True, then compute symmetric normalized Laplacian.

    drop_first : bool, default=True
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be a constant vector
        for a connected graph, but for spectral clustering, this should be
        kept as False to retain the first eigenvector.

    Returns
    -------
    embedding : ndarray of shape (n_samples, n_components)
        The reduced samples.

    Notes
    -----
    Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
    has one connected component. If the graph has many components, the first
    few eigenvectors will simply uncover the connected components of the graph.

    References
    ----------
    * https://en.wikipedia.org/wiki/LOBPCG

    * :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method",
      Andrew V. Knyazev
      <10.1137/S1064827500366124>`

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.neighbors import kneighbors_graph
    >>> from sklearn.manifold import spectral_embedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X = X[:100]
    >>> affinity_matrix = kneighbors_graph(
    ...     X, n_neighbors=int(X.shape[0] / 10), include_self=True
    ... )
    >>> # make the matrix symmetric
    >>> affinity_matrix = 0.5 * (affinity_matrix + affinity_matrix.T)
    >>> embedding = spectral_embedding(affinity_matrix, n_components=2, random_state=42)
    >>> embedding.shape
    (100, 2)
    """
    random_state = check_random_state(random_state)

    return _spectral_embedding(
        adjacency,
        n_components=n_components,
        eigen_solver=eigen_solver,
        random_state=random_state,
        eigen_tol=eigen_tol,
        norm_laplacian=norm_laplacian,
        drop_first=drop_first,
    )


def _spectral_embedding(
    adjacency,
    *,
    n_components=8,
    eigen_solver=None,
    random_state=None,
    eigen_tol="auto",
    norm_laplacian=True,
    drop_first=True,
):
    adjacency = check_symmetric(adjacency)

    if eigen_solver == "amg":
        try:
            from pyamg import smoothed_aggregation_solver
        except ImportError as e:
            raise ValueError(
                "The eigen_solver was set to 'amg', but pyamg is not available."
            ) from e

    if eigen_solver is None:
        eigen_solver = "arpack"

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn(
            "Graph is not fully connected, spectral embedding may not work as expected."
        )

    laplacian, dd = csgraph_laplacian(
        adjacency, normed=norm_laplacian, return_diag=True
    )
    if eigen_solver == "arpack" or (
        eigen_solver != "lobpcg"
        and (not sparse.issparse(laplacian) or n_nodes < 5 * n_components)
    ):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        # for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1. ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0. This leads to slow convergence. So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0. This effectively spreads out the spectrum
        # near 1.0 and leads to much faster convergence: potentially a
        # speedup of several orders of magnitude over simply using keyword
        # which='LA' in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            tol = 0 if eigen_tol == "auto" else eigen_tol
            laplacian *= -1
            v0 = _init_arpack_v0(laplacian.shape[0], random_state)
            laplacian = check_array(
                laplacian, accept_sparse="csr", accept_large_sparse=False
            )
            _, diffusion_map = eigsh(
                laplacian, k=n_components, sigma=1.0, which="LM", tol=tol, v0=v0
            )
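            # `eigsh` returns eigenvectors ordered by ascending eigenvalue of
            # the matrix it was given (here -L); reversing the rows below
            # orders the embedding from the smallest eigenvalues of the
            # Laplacian upwards.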
embedding = diffusion_map.T[n_components::-1]
|
||||
if norm_laplacian:
|
||||
# recover u = D^-1/2 x from the eigenvector output x
|
||||
embedding = embedding / dd
|
||||
except RuntimeError:
|
||||
# When submatrices are exactly singular, an LU decomposition
|
||||
# in arpack fails. We fallback to lobpcg
|
||||
eigen_solver = "lobpcg"
|
||||
# Revert the laplacian to its opposite to have lobpcg work
|
||||
laplacian *= -1
|
||||
|
||||
elif eigen_solver == "amg":
|
||||
# Use AMG to get a preconditioner and speed up the eigenvalue
|
||||
# problem.
|
||||
if not sparse.issparse(laplacian):
|
||||
warnings.warn("AMG works better for sparse matrices")
|
||||
laplacian = check_array(
|
||||
laplacian, dtype=[np.float64, np.float32], accept_sparse=True
|
||||
)
|
||||
laplacian = _set_diag(laplacian, 1, norm_laplacian)
|
||||
|
||||
# The Laplacian matrix is always singular, having at least one zero
|
||||
# eigenvalue, corresponding to the trivial eigenvector, which is a
|
||||
# constant. Using a singular matrix for preconditioning may result in
|
||||
# random failures in LOBPCG and is not supported by the existing
|
||||
# theory:
|
||||
# see https://doi.org/10.1007/s10208-015-9297-1
|
||||
# Shift the Laplacian so its diagononal is not all ones. The shift
|
||||
# does change the eigenpairs however, so we'll feed the shifted
|
||||
# matrix to the solver and afterward set it back to the original.
|
||||
diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
|
||||
laplacian += diag_shift
|
||||
if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array):
|
||||
# `pyamg` does not work with `csr_array` and we need to convert it to a
|
||||
# `csr_matrix` object.
|
||||
laplacian = sparse.csr_matrix(laplacian)
|
||||
ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr"))
|
||||
laplacian -= diag_shift
|
||||
|
||||
M = ml.aspreconditioner()
|
||||
# Create initial approximation X to eigenvectors
|
||||
X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1))
|
||||
X[:, 0] = dd.ravel()
|
||||
X = X.astype(laplacian.dtype)
|
||||
|
||||
tol = None if eigen_tol == "auto" else eigen_tol
|
||||
_, diffusion_map = lobpcg(laplacian, X, M=M, tol=tol, largest=False)
|
||||
embedding = diffusion_map.T
|
||||
if norm_laplacian:
|
||||
# recover u = D^-1/2 x from the eigenvector output x
|
||||
embedding = embedding / dd
|
||||
if embedding.shape[0] == 1:
|
||||
raise ValueError
|
||||
|
||||
if eigen_solver == "lobpcg":
|
||||
laplacian = check_array(
|
||||
laplacian, dtype=[np.float64, np.float32], accept_sparse=True
|
||||
)
|
||||
if n_nodes < 5 * n_components + 1:
|
||||
# see note above under arpack why lobpcg has problems with small
|
||||
# number of nodes
|
||||
# lobpcg will fallback to eigh, so we short circuit it
|
||||
if sparse.issparse(laplacian):
|
||||
laplacian = laplacian.toarray()
|
||||
_, diffusion_map = eigh(laplacian, check_finite=False)
|
||||
embedding = diffusion_map.T[:n_components]
|
||||
if norm_laplacian:
|
||||
# recover u = D^-1/2 x from the eigenvector output x
|
||||
embedding = embedding / dd
|
||||
else:
|
||||
laplacian = _set_diag(laplacian, 1, norm_laplacian)
|
||||
# We increase the number of eigenvectors requested, as lobpcg
|
||||
# doesn't behave well in low dimension and create initial
|
||||
# approximation X to eigenvectors
|
||||
X = random_state.standard_normal(
|
||||
size=(laplacian.shape[0], n_components + 1)
|
||||
)
|
||||
X[:, 0] = dd.ravel()
|
||||
X = X.astype(laplacian.dtype)
|
||||
tol = None if eigen_tol == "auto" else eigen_tol
|
||||
_, diffusion_map = lobpcg(
|
||||
laplacian, X, tol=tol, largest=False, maxiter=2000
|
||||
)
|
||||
embedding = diffusion_map.T[:n_components]
|
||||
if norm_laplacian:
|
||||
# recover u = D^-1/2 x from the eigenvector output x
|
||||
embedding = embedding / dd
|
||||
if embedding.shape[0] == 1:
|
||||
raise ValueError
|
||||
|
||||
embedding = _deterministic_vector_sign_flip(embedding)
|
||||
if drop_first:
|
||||
return embedding[1:n_components].T
|
||||
else:
|
||||
return embedding[:n_components].T
|
||||
|
||||
|
||||
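# A minimal, self-contained sketch (illustrative only, not part of the
# library's code path; the helper name is made up) of the recovery step that
# recurs above: the solvers return eigenvectors x of the symmetrically
# normalized Laplacian D^-1/2 (D - W) D^-1/2, and dividing by dd = sqrt(deg)
# recovers u = D^-1/2 x, a solution of (D - W) u = lambda * D * u.
def _toy_normalized_laplacian_recovery():
    import numpy as np
    from scipy.linalg import eigh
    from scipy.sparse.csgraph import laplacian as csgraph_laplacian

    rng = np.random.RandomState(0)
    W = rng.rand(6, 6)
    W = 0.5 * (W + W.T)  # symmetric toy affinity
    np.fill_diagonal(W, 0.0)
    lap, dd = csgraph_laplacian(W, normed=True, return_diag=True)
    _, vecs = eigh(lap)  # eigenvalues in ascending order
    # Skip the trivial constant eigenvector and keep the next two, then
    # recover u = D^-1/2 x exactly as in the branches above.
    embedding = vecs[:, 1:3].T / dd
    return embedding.T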
class SpectralEmbedding(BaseEstimator):
|
||||
"""Spectral embedding for non-linear dimensionality reduction.
|
||||
|
||||
Forms an affinity matrix given by the specified function and
|
||||
applies spectral decomposition to the corresponding graph laplacian.
|
||||
The resulting transformation is given by the value of the
|
||||
eigenvectors for each data point.
|
||||
|
||||
Note: Laplacian Eigenmaps is the actual algorithm implemented here.
|
||||
|
||||
Read more in the :ref:`User Guide <spectral_embedding>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int, default=2
|
||||
The dimension of the projected subspace.
|
||||
|
||||
affinity : {'nearest_neighbors', 'rbf', 'precomputed', \
|
||||
'precomputed_nearest_neighbors'} or callable, \
|
||||
default='nearest_neighbors'
|
||||
How to construct the affinity matrix.
|
||||
- 'nearest_neighbors' : construct the affinity matrix by computing a
|
||||
graph of nearest neighbors.
|
||||
- 'rbf' : construct the affinity matrix by computing a radial basis
|
||||
function (RBF) kernel.
|
||||
- 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
|
||||
- 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
|
||||
of precomputed nearest neighbors, and constructs the affinity matrix
|
||||
by selecting the ``n_neighbors`` nearest neighbors.
|
||||
- callable : use the passed-in function as the affinity;
|
||||
the function takes in a data matrix (n_samples, n_features)
|
||||
and returns an affinity matrix (n_samples, n_samples).
|
||||
|
||||
gamma : float, default=None
|
||||
Kernel coefficient for rbf kernel. If None, gamma will be set to
|
||||
1/n_features.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
A pseudo random number generator used for the initialization
|
||||
of the lobpcg eigenvector decomposition when `eigen_solver ==
|
||||
'amg'`, and for the K-Means initialization. Use an int to make
|
||||
the results deterministic across calls (See
|
||||
:term:`Glossary <random_state>`).
|
||||
|
||||
.. note::
|
||||
When using `eigen_solver == 'amg'`,
|
||||
it is necessary to also fix the global numpy seed with
|
||||
`np.random.seed(int)` to get deterministic results. See
|
||||
https://github.com/pyamg/pyamg/issues/139 for further
|
||||
information.
|
||||
|
||||
eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
|
||||
The eigenvalue decomposition strategy to use. AMG requires pyamg
|
||||
to be installed. It can be faster on very large, sparse problems.
|
||||
If None, then ``'arpack'`` is used.
|
||||
|
||||
eigen_tol : float, default="auto"
|
||||
Stopping criterion for eigendecomposition of the Laplacian matrix.
|
||||
If `eigen_tol="auto"` then the passed tolerance will depend on the
|
||||
`eigen_solver`:
|
||||
|
||||
- If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
|
||||
- If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
|
||||
`eigen_tol=None` which configures the underlying `lobpcg` solver to
|
||||
automatically resolve the value according to its heuristics. See
|
||||
:func:`scipy.sparse.linalg.lobpcg` for details.
|
||||
|
||||
Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
|
||||
values of `tol<1e-5` may lead to convergence issues and should be
|
||||
avoided.
|
||||
|
||||
.. versionadded:: 1.2
|
||||
|
||||
n_neighbors : int, default=None
|
||||
Number of nearest neighbors for nearest_neighbors graph building.
|
||||
If None, n_neighbors will be set to max(n_samples/10, 1).
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
embedding_ : ndarray of shape (n_samples, n_components)
|
||||
Spectral embedding of the training matrix.
|
||||
|
||||
affinity_matrix_ : ndarray of shape (n_samples, n_samples)
|
||||
Affinity matrix constructed from samples or precomputed.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_neighbors_ : int
|
||||
Number of nearest neighbors effectively used.
|
||||
|
||||
See Also
|
||||
--------
|
||||
Isomap : Non-linear dimensionality reduction through Isometric Mapping.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
- :doi:`A Tutorial on Spectral Clustering, 2007
|
||||
Ulrike von Luxburg
|
||||
<10.1007/s11222-007-9033-z>`
|
||||
|
||||
- `On Spectral Clustering: Analysis and an algorithm, 2001
|
||||
Andrew Y. Ng, Michael I. Jordan, Yair Weiss
|
||||
<https://citeseerx.ist.psu.edu/doc_view/pid/796c5d6336fc52aa84db575fb821c78918b65f58>`_
|
||||
|
||||
- :doi:`Normalized cuts and image segmentation, 2000
|
||||
Jianbo Shi, Jitendra Malik
|
||||
<10.1109/34.868688>`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.datasets import load_digits
|
||||
>>> from sklearn.manifold import SpectralEmbedding
|
||||
>>> X, _ = load_digits(return_X_y=True)
|
||||
>>> X.shape
|
||||
(1797, 64)
|
||||
>>> embedding = SpectralEmbedding(n_components=2)
|
||||
>>> X_transformed = embedding.fit_transform(X[:100])
|
||||
>>> X_transformed.shape
|
||||
(100, 2)
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
"n_components": [Interval(Integral, 1, None, closed="left")],
|
||||
"affinity": [
|
||||
StrOptions(
|
||||
{
|
||||
"nearest_neighbors",
|
||||
"rbf",
|
||||
"precomputed",
|
||||
"precomputed_nearest_neighbors",
|
||||
},
|
||||
),
|
||||
callable,
|
||||
],
|
||||
"gamma": [Interval(Real, 0, None, closed="left"), None],
|
||||
"random_state": ["random_state"],
|
||||
"eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
|
||||
"eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
|
||||
"n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
|
||||
"n_jobs": [None, Integral],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
n_components=2,
|
||||
*,
|
||||
affinity="nearest_neighbors",
|
||||
gamma=None,
|
||||
random_state=None,
|
||||
eigen_solver=None,
|
||||
eigen_tol="auto",
|
||||
n_neighbors=None,
|
||||
n_jobs=None,
|
||||
):
|
||||
self.n_components = n_components
|
||||
self.affinity = affinity
|
||||
self.gamma = gamma
|
||||
self.random_state = random_state
|
||||
self.eigen_solver = eigen_solver
|
||||
self.eigen_tol = eigen_tol
|
||||
self.n_neighbors = n_neighbors
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def __sklearn_tags__(self):
|
||||
tags = super().__sklearn_tags__()
|
||||
tags.input_tags.sparse = True
|
||||
tags.input_tags.pairwise = self.affinity in [
|
||||
"precomputed",
|
||||
"precomputed_nearest_neighbors",
|
||||
]
|
||||
return tags
|
||||
|
||||
def _get_affinity_matrix(self, X, Y=None):
|
||||
"""Calculate the affinity matrix from data
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training vector, where `n_samples` is the number of samples
|
||||
and `n_features` is the number of features.
|
||||
|
||||
If affinity is "precomputed"
|
||||
X : array-like of shape (n_samples, n_samples),
|
||||
Interpret X as precomputed adjacency graph computed from
|
||||
samples.
|
||||
|
||||
Y : Ignored
|
||||
|
||||
Returns
|
||||
-------
|
||||
affinity_matrix : ndarray of shape (n_samples, n_samples)
|
||||
"""
|
||||
if self.affinity == "precomputed":
|
||||
self.affinity_matrix_ = X
|
||||
return self.affinity_matrix_
|
||||
if self.affinity == "precomputed_nearest_neighbors":
|
||||
estimator = NearestNeighbors(
|
||||
n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
|
||||
).fit(X)
|
||||
connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
|
||||
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
|
||||
return self.affinity_matrix_
|
||||
if self.affinity == "nearest_neighbors":
|
||||
if sparse.issparse(X):
|
||||
warnings.warn(
|
||||
"Nearest neighbors affinity currently does "
|
||||
"not support sparse input, falling back to "
|
||||
"rbf affinity"
|
||||
)
|
||||
self.affinity = "rbf"
|
||||
else:
|
||||
self.n_neighbors_ = (
|
||||
self.n_neighbors
|
||||
if self.n_neighbors is not None
|
||||
else max(int(X.shape[0] / 10), 1)
|
||||
)
|
||||
self.affinity_matrix_ = kneighbors_graph(
|
||||
X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs
|
||||
)
|
||||
# currently only symmetric affinity_matrix supported
|
||||
self.affinity_matrix_ = 0.5 * (
|
||||
self.affinity_matrix_ + self.affinity_matrix_.T
|
||||
)
|
||||
return self.affinity_matrix_
|
||||
if self.affinity == "rbf":
|
||||
self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]
|
||||
self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
|
||||
return self.affinity_matrix_
|
||||
self.affinity_matrix_ = self.affinity(X)
|
||||
return self.affinity_matrix_
|
||||
|
||||
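# Illustrative usage (an assumption drawn from the branches above, not a
# documented snippet): any callable returning a symmetric
# (n_samples, n_samples) matrix can be passed, e.g.
#     SpectralEmbedding(affinity=lambda X: rbf_kernel(X, gamma=0.5))
# which behaves like affinity="rbf" with gamma fixed to 0.5.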
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X, y=None):
|
||||
"""Fit the model from data in X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Training vector, where `n_samples` is the number of samples
|
||||
and `n_features` is the number of features.
|
||||
|
||||
If affinity is "precomputed"
|
||||
X : {array-like, sparse matrix}, shape (n_samples, n_samples),
|
||||
Interpret X as precomputed adjacency graph computed from
|
||||
samples.
|
||||
|
||||
y : Ignored
|
||||
Not used, present for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
Returns the instance itself.
|
||||
"""
|
||||
X = validate_data(self, X, accept_sparse="csr", ensure_min_samples=2)
|
||||
|
||||
random_state = check_random_state(self.random_state)
|
||||
|
||||
affinity_matrix = self._get_affinity_matrix(X)
|
||||
self.embedding_ = _spectral_embedding(
|
||||
affinity_matrix,
|
||||
n_components=self.n_components,
|
||||
eigen_solver=self.eigen_solver,
|
||||
eigen_tol=self.eigen_tol,
|
||||
random_state=random_state,
|
||||
)
|
||||
return self
|
||||
|
||||
def fit_transform(self, X, y=None):
|
||||
"""Fit the model from data in X and transform X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Training vector, where `n_samples` is the number of samples
|
||||
and `n_features` is the number of features.
|
||||
|
||||
If affinity is "precomputed"
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_samples),
|
||||
Interpret X as precomputed adjacency graph computed from
|
||||
samples.
|
||||
|
||||
y : Ignored
|
||||
Not used, present for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X_new : array-like of shape (n_samples, n_components)
|
||||
Spectral embedding of the training matrix.
|
||||
"""
|
||||
self.fit(X)
|
||||
return self.embedding_
|
||||
@@ -0,0 +1,120 @@
|
||||
import numpy as np
|
||||
|
||||
from libc cimport math
|
||||
from libc.math cimport INFINITY
|
||||
|
||||
from sklearn.utils._typedefs cimport float32_t, float64_t
|
||||
|
||||
|
||||
cdef float EPSILON_DBL = 1e-8
|
||||
cdef float PERPLEXITY_TOLERANCE = 1e-5
|
||||
|
||||
|
||||
# TODO: have this function support float32 and float64 and preserve inputs' dtypes.
|
||||
def _binary_search_perplexity(
|
||||
const float32_t[:, :] sqdistances,
|
||||
float desired_perplexity,
|
||||
int verbose):
|
||||
"""Binary search for sigmas of conditional Gaussians.
|
||||
|
||||
This approximation reduces the computational complexity from O(N^2) to
|
||||
O(uN).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
|
||||
Distances between training samples and their k nearest neighbors.
|
||||
When using the exact method, this is a square (n_samples, n_samples)
|
||||
distance matrix. The TSNE default metric is "euclidean" which is
|
||||
interpreted as squared euclidean distance.
|
||||
|
||||
desired_perplexity : float
|
||||
Desired perplexity (2^entropy) of the conditional Gaussians.
|
||||
|
||||
verbose : int
|
||||
Verbosity level.
|
||||
|
||||
Returns
|
||||
-------
|
||||
P : ndarray of shape (n_samples, n_neighbors), dtype=np.float64
|
||||
Probabilities of conditional Gaussian distributions p_i|j.
|
||||
"""
|
||||
# Maximum number of binary search steps
|
||||
cdef long n_steps = 100
|
||||
|
||||
cdef long n_samples = sqdistances.shape[0]
|
||||
cdef long n_neighbors = sqdistances.shape[1]
|
||||
cdef int using_neighbors = n_neighbors < n_samples
|
||||
# Precisions of conditional Gaussian distributions
|
||||
cdef double beta
|
||||
cdef double beta_min
|
||||
cdef double beta_max
|
||||
cdef double beta_sum = 0.0
|
||||
|
||||
# Use log scale
|
||||
cdef double desired_entropy = math.log(desired_perplexity)
|
||||
cdef double entropy_diff
|
||||
|
||||
cdef double entropy
|
||||
cdef double sum_Pi
|
||||
cdef double sum_disti_Pi
|
||||
cdef long i, j, l
|
||||
|
||||
# This array is later used as a 32bit array. It has multiple intermediate
|
||||
# floating point additions that benefit from the extra precision
|
||||
cdef float64_t[:, :] P = np.zeros(
|
||||
(n_samples, n_neighbors), dtype=np.float64)
|
||||
|
||||
for i in range(n_samples):
|
||||
beta_min = -INFINITY
|
||||
beta_max = INFINITY
|
||||
beta = 1.0
|
||||
|
||||
# Binary search of precision for i-th conditional distribution
|
||||
for l in range(n_steps):
|
||||
# Compute current entropy and corresponding probabilities
|
||||
# computed just over the nearest neighbors or over all data
|
||||
# if we're not using neighbors
|
||||
sum_Pi = 0.0
|
||||
for j in range(n_neighbors):
|
||||
if j != i or using_neighbors:
|
||||
P[i, j] = math.exp(-sqdistances[i, j] * beta)
|
||||
sum_Pi += P[i, j]
|
||||
|
||||
if sum_Pi == 0.0:
|
||||
sum_Pi = EPSILON_DBL
|
||||
sum_disti_Pi = 0.0
|
||||
|
||||
for j in range(n_neighbors):
|
||||
P[i, j] /= sum_Pi
|
||||
sum_disti_Pi += sqdistances[i, j] * P[i, j]
|
||||
|
||||
entropy = math.log(sum_Pi) + beta * sum_disti_Pi
|
||||
entropy_diff = entropy - desired_entropy
|
||||
|
||||
if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
|
||||
break
|
||||
|
||||
if entropy_diff > 0.0:
|
||||
beta_min = beta
|
||||
if beta_max == INFINITY:
|
||||
beta *= 2.0
|
||||
else:
|
||||
beta = (beta + beta_max) / 2.0
|
||||
else:
|
||||
beta_max = beta
|
||||
if beta_min == -INFINITY:
|
||||
beta /= 2.0
|
||||
else:
|
||||
beta = (beta + beta_min) / 2.0
|
||||
|
||||
beta_sum += beta
|
||||
|
||||
if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
|
||||
print("[t-SNE] Computed conditional probabilities for sample "
|
||||
"%d / %d" % (i + 1, n_samples))
|
||||
|
||||
if verbose:
|
||||
print("[t-SNE] Mean sigma: %f"
|
||||
% np.mean(math.sqrt(n_samples / beta_sum)))
|
||||
return np.asarray(P)
|
||||
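# A NumPy-only sketch of the same bisection for a single sample (the helper
# name is made up for illustration, and the exact method's skipping of the
# self term j == i is omitted): find beta such that the entropy of
# p_j = exp(-d_j * beta) / sum_k exp(-d_k * beta) matches log(perplexity).
def _perplexity_bisection_sketch(sqdist_row, desired_perplexity, n_steps=100):
    import numpy as np

    desired_entropy = np.log(desired_perplexity)
    beta, beta_min, beta_max = 1.0, -np.inf, np.inf
    for _ in range(n_steps):
        P = np.exp(-sqdist_row * beta)
        sum_P = max(P.sum(), 1e-8)  # guard against all-zero rows
        P = P / sum_P
        entropy = np.log(sum_P) + beta * np.sum(sqdist_row * P)
        diff = entropy - desired_entropy
        if abs(diff) <= 1e-5:
            break
        if diff > 0.0:
            # entropy too high: the distribution is too flat, increase beta
            beta_min = beta
            beta = beta * 2.0 if beta_max == np.inf else (beta + beta_max) / 2.0
        else:
            beta_max = beta
            beta = beta / 2.0 if beta_min == -np.inf else (beta + beta_min) / 2.0
    return P, beta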
@@ -0,0 +1,14 @@
|
||||
py.extension_module(
|
||||
'_utils',
|
||||
[cython_gen.process('_utils.pyx'), utils_cython_tree],
|
||||
subdir: 'sklearn/manifold',
|
||||
install: true
|
||||
)
|
||||
|
||||
py.extension_module(
|
||||
'_barnes_hut_tsne',
|
||||
cython_gen.process('_barnes_hut_tsne.pyx'),
|
||||
dependencies: [np_dep, openmp_dep],
|
||||
subdir: 'sklearn/manifold',
|
||||
install: true
|
||||
)
|
||||
@@ -0,0 +1,68 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.manifold import ClassicalMDS
|
||||
from sklearn.metrics import euclidean_distances
|
||||
|
||||
|
||||
def test_classical_mds_equivalent_to_pca():
|
||||
X, _ = load_iris(return_X_y=True)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="euclidean")
|
||||
pca = PCA(n_components=2)
|
||||
|
||||
Z1 = cmds.fit_transform(X)
|
||||
Z2 = pca.fit_transform(X)
|
||||
|
||||
# Swap the signs if necessary
|
||||
for comp in range(2):
|
||||
if Z1[0, comp] < 0 and Z2[0, comp] > 0:
|
||||
Z2[:, comp] *= -1
|
||||
|
||||
assert_allclose(Z1, Z2)
|
||||
|
||||
assert_allclose(np.sqrt(cmds.eigenvalues_), pca.singular_values_)
|
||||
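# Background for the equivalence above (a sketch of the argument, not test
# code): classical MDS eigendecomposes the double-centered matrix
# B = -0.5 * J @ (D ** 2) @ J with J = I - ones((n, n)) / n; for Euclidean D,
# B equals X_c @ X_c.T where X_c is the centered data, so its leading
# eigenvectors scaled by sqrt(eigenvalues) coincide with the PCA scores up to
# per-column sign flips, which the loop above normalizes away.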
|
||||
|
||||
def test_classical_mds_equivalent_on_data_and_distances():
|
||||
X, _ = load_iris(return_X_y=True)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="euclidean")
|
||||
Z1 = cmds.fit_transform(X)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="precomputed")
|
||||
Z2 = cmds.fit_transform(euclidean_distances(X))
|
||||
|
||||
assert_allclose(Z1, Z2)
|
||||
|
||||
|
||||
def test_classical_mds_wrong_inputs():
|
||||
# Non-symmetric input
|
||||
dissim = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
|
||||
with pytest.raises(ValueError, match="Array must be symmetric"):
|
||||
ClassicalMDS(metric="precomputed").fit(dissim)
|
||||
|
||||
# Non-square input
|
||||
dissim = np.array([[0, 1, 2], [3, 4, 5]])
|
||||
with pytest.raises(ValueError, match="array must be 2-dimensional and square"):
|
||||
ClassicalMDS(metric="precomputed").fit(dissim)
|
||||
|
||||
|
||||
def test_classical_mds_metric_params():
|
||||
X, _ = load_iris(return_X_y=True)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="euclidean")
|
||||
Z1 = cmds.fit_transform(X)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="minkowski", metric_params={"p": 2})
|
||||
Z2 = cmds.fit_transform(X)
|
||||
|
||||
assert_allclose(Z1, Z2)
|
||||
|
||||
cmds = ClassicalMDS(n_components=2, metric="minkowski", metric_params={"p": 1})
|
||||
Z3 = cmds.fit_transform(X)
|
||||
|
||||
assert not np.allclose(Z1, Z3)
|
||||
@@ -0,0 +1,348 @@
|
||||
import math
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import rand as sparse_rand
|
||||
|
||||
from sklearn import clone, datasets, manifold, neighbors, pipeline, preprocessing
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
eigen_solvers = ["auto", "dense", "arpack"]
|
||||
path_methods = ["auto", "FW", "D"]
|
||||
|
||||
|
||||
def create_sample_data(dtype, n_pts=25, add_noise=False):
|
||||
# grid of equidistant points in 2D, n_components = n_dim
|
||||
n_per_side = int(math.sqrt(n_pts))
|
||||
X = np.array(list(product(range(n_per_side), repeat=2))).astype(dtype, copy=False)
|
||||
if add_noise:
|
||||
# add noise in a third dimension
|
||||
rng = np.random.RandomState(0)
|
||||
noise = 0.1 * rng.randn(n_pts, 1).astype(dtype, copy=False)
|
||||
X = np.concatenate((X, noise), 1)
|
||||
return X
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
|
||||
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
|
||||
@pytest.mark.parametrize("path_method", path_methods)
|
||||
def test_isomap_simple_grid(
|
||||
global_dtype, n_neighbors, radius, eigen_solver, path_method
|
||||
):
|
||||
# Isomap should preserve distances when all neighbors are used
|
||||
n_pts = 25
|
||||
X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=False)
|
||||
|
||||
# distances from each point to all others
|
||||
if n_neighbors is not None:
|
||||
G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance")
|
||||
else:
|
||||
G = neighbors.radius_neighbors_graph(X, radius, mode="distance")
|
||||
|
||||
clf = manifold.Isomap(
|
||||
n_neighbors=n_neighbors,
|
||||
radius=radius,
|
||||
n_components=2,
|
||||
eigen_solver=eigen_solver,
|
||||
path_method=path_method,
|
||||
)
|
||||
clf.fit(X)
|
||||
|
||||
if n_neighbors is not None:
|
||||
G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
|
||||
else:
|
||||
G_iso = neighbors.radius_neighbors_graph(
|
||||
clf.embedding_, radius, mode="distance"
|
||||
)
|
||||
atol = 1e-5 if global_dtype == np.float32 else 0
|
||||
assert_allclose_dense_sparse(G, G_iso, atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
|
||||
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
|
||||
@pytest.mark.parametrize("path_method", path_methods)
|
||||
def test_isomap_reconstruction_error(
|
||||
global_dtype, n_neighbors, radius, eigen_solver, path_method
|
||||
):
|
||||
if global_dtype is np.float32:
|
||||
pytest.skip(
|
||||
"Skipping test due to numerical instabilities on float32 data"
|
||||
"from KernelCenterer used in the reconstruction_error method"
|
||||
)
|
||||
|
||||
# Same setup as in test_isomap_simple_grid, with an added dimension
|
||||
n_pts = 25
|
||||
X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=True)
|
||||
|
||||
# compute input kernel
|
||||
if n_neighbors is not None:
|
||||
G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray()
|
||||
else:
|
||||
G = neighbors.radius_neighbors_graph(X, radius, mode="distance").toarray()
|
||||
centerer = preprocessing.KernelCenterer()
|
||||
K = centerer.fit_transform(-0.5 * G**2)
|
||||
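# (Context note: double-centering -0.5 * G**2 converts squared geodesic
# distances into the Gram/kernel matrix that Isomap eigendecomposes, which
# is what makes K directly comparable to reconstruction_error below.)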
|
||||
clf = manifold.Isomap(
|
||||
n_neighbors=n_neighbors,
|
||||
radius=radius,
|
||||
n_components=2,
|
||||
eigen_solver=eigen_solver,
|
||||
path_method=path_method,
|
||||
)
|
||||
clf.fit(X)
|
||||
|
||||
# compute output kernel
|
||||
if n_neighbors is not None:
|
||||
G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
|
||||
else:
|
||||
G_iso = neighbors.radius_neighbors_graph(
|
||||
clf.embedding_, radius, mode="distance"
|
||||
)
|
||||
G_iso = G_iso.toarray()
|
||||
K_iso = centerer.fit_transform(-0.5 * G_iso**2)
|
||||
|
||||
# make sure error agrees
|
||||
reconstruction_error = np.linalg.norm(K - K_iso) / n_pts
|
||||
atol = 1e-5 if global_dtype == np.float32 else 0
|
||||
assert_allclose(reconstruction_error, clf.reconstruction_error(), atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)])
|
||||
def test_transform(global_dtype, n_neighbors, radius):
|
||||
n_samples = 200
|
||||
n_components = 10
|
||||
noise_scale = 0.01
|
||||
|
||||
# Create S-curve dataset
|
||||
X, y = datasets.make_s_curve(n_samples, random_state=0)
|
||||
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
|
||||
# Compute isomap embedding
|
||||
iso = manifold.Isomap(
|
||||
n_components=n_components, n_neighbors=n_neighbors, radius=radius
|
||||
)
|
||||
X_iso = iso.fit_transform(X)
|
||||
|
||||
# Re-embed a noisy version of the points
|
||||
rng = np.random.RandomState(0)
|
||||
noise = noise_scale * rng.randn(*X.shape)
|
||||
X_iso2 = iso.transform(X + noise)
|
||||
|
||||
# Make sure the rms error on re-embedding is comparable to noise_scale
|
||||
assert np.sqrt(np.mean((X_iso - X_iso2) ** 2)) < 2 * noise_scale
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)])
|
||||
def test_pipeline(n_neighbors, radius, global_dtype):
|
||||
# check that Isomap works fine as a transformer in a Pipeline
|
||||
# only checks that no error is raised.
|
||||
# TODO check that it actually does something useful
|
||||
X, y = datasets.make_blobs(random_state=0)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
clf = pipeline.Pipeline(
|
||||
[
|
||||
("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)),
|
||||
("clf", neighbors.KNeighborsClassifier()),
|
||||
]
|
||||
)
|
||||
clf.fit(X, y)
|
||||
assert 0.9 < clf.score(X, y)
|
||||
|
||||
|
||||
def test_pipeline_with_nearest_neighbors_transformer(global_dtype):
|
||||
# Test chaining NearestNeighborsTransformer and Isomap with
|
||||
# neighbors_algorithm='precomputed'
|
||||
algorithm = "auto"
|
||||
n_neighbors = 10
|
||||
|
||||
X, _ = datasets.make_blobs(random_state=0)
|
||||
X2, _ = datasets.make_blobs(random_state=1)
|
||||
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
X2 = X2.astype(global_dtype, copy=False)
|
||||
|
||||
# compare the chained version and the compact version
|
||||
est_chain = pipeline.make_pipeline(
|
||||
neighbors.KNeighborsTransformer(
|
||||
n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
|
||||
),
|
||||
manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed"),
|
||||
)
|
||||
est_compact = manifold.Isomap(
|
||||
n_neighbors=n_neighbors, neighbors_algorithm=algorithm
|
||||
)
|
||||
|
||||
Xt_chain = est_chain.fit_transform(X)
|
||||
Xt_compact = est_compact.fit_transform(X)
|
||||
assert_allclose(Xt_chain, Xt_compact)
|
||||
|
||||
Xt_chain = est_chain.transform(X2)
|
||||
Xt_compact = est_compact.transform(X2)
|
||||
assert_allclose(Xt_chain, Xt_compact)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"metric, p, is_euclidean",
|
||||
[
|
||||
("euclidean", 2, True),
|
||||
("manhattan", 1, False),
|
||||
("minkowski", 1, False),
|
||||
("minkowski", 2, True),
|
||||
(lambda x1, x2: np.sqrt(np.sum(x1**2 + x2**2)), 2, False),
|
||||
],
|
||||
)
|
||||
def test_different_metric(global_dtype, metric, p, is_euclidean):
|
||||
# Isomap must work correctly with various metric parameters
|
||||
# and must default to euclidean.
|
||||
X, _ = datasets.make_blobs(random_state=0)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
|
||||
reference = manifold.Isomap().fit_transform(X)
|
||||
embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)
|
||||
|
||||
if is_euclidean:
|
||||
assert_allclose(embedding, reference)
|
||||
else:
|
||||
with pytest.raises(AssertionError, match="Not equal to tolerance"):
|
||||
assert_allclose(embedding, reference)
|
||||
|
||||
|
||||
def test_isomap_clone_bug():
|
||||
# regression test for bug reported in #6062
|
||||
model = manifold.Isomap()
|
||||
for n_neighbors in [10, 15, 20]:
|
||||
model.set_params(n_neighbors=n_neighbors)
|
||||
model.fit(np.random.rand(50, 2))
|
||||
assert model.nbrs_.n_neighbors == n_neighbors
|
||||
|
||||
|
||||
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
|
||||
@pytest.mark.parametrize("path_method", path_methods)
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_sparse_input(
|
||||
global_dtype, eigen_solver, path_method, global_random_seed, csr_container
|
||||
):
|
||||
# TODO: compare results on dense and sparse data as proposed in:
|
||||
# https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
|
||||
X = csr_container(
|
||||
sparse_rand(
|
||||
100,
|
||||
3,
|
||||
density=0.1,
|
||||
format="csr",
|
||||
dtype=global_dtype,
|
||||
random_state=global_random_seed,
|
||||
)
|
||||
)
|
||||
|
||||
iso_dense = manifold.Isomap(
|
||||
n_components=2,
|
||||
eigen_solver=eigen_solver,
|
||||
path_method=path_method,
|
||||
n_neighbors=8,
|
||||
)
|
||||
iso_sparse = clone(iso_dense)
|
||||
|
||||
X_trans_dense = iso_dense.fit_transform(X.toarray())
|
||||
X_trans_sparse = iso_sparse.fit_transform(X)
|
||||
|
||||
assert_allclose(X_trans_sparse, X_trans_dense, rtol=1e-4, atol=1e-4)
|
||||
|
||||
|
||||
def test_isomap_fit_precomputed_radius_graph(global_dtype):
|
||||
# Isomap.fit_transform must yield a similar result when using
|
||||
# a precomputed distance matrix.
|
||||
|
||||
X, y = datasets.make_s_curve(200, random_state=0)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
radius = 10
|
||||
|
||||
g = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance")
|
||||
isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="precomputed")
|
||||
isomap.fit(g)
|
||||
precomputed_result = isomap.embedding_
|
||||
|
||||
isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="minkowski")
|
||||
result = isomap.fit_transform(X)
|
||||
atol = 1e-5 if global_dtype == np.float32 else 0
|
||||
assert_allclose(precomputed_result, result, atol=atol)
|
||||
|
||||
|
||||
def test_isomap_fitted_attributes_dtype(global_dtype):
|
||||
"""Check that the fitted attributes are stored accordingly to the
|
||||
data type of X."""
|
||||
iso = manifold.Isomap(n_neighbors=2)
|
||||
|
||||
X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)
|
||||
|
||||
iso.fit(X)
|
||||
|
||||
assert iso.dist_matrix_.dtype == global_dtype
|
||||
assert iso.embedding_.dtype == global_dtype
|
||||
|
||||
|
||||
def test_isomap_dtype_equivalence():
|
||||
"""Check the equivalence of the results with 32 and 64 bits input."""
|
||||
iso_32 = manifold.Isomap(n_neighbors=2)
|
||||
X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
|
||||
iso_32.fit(X_32)
|
||||
|
||||
iso_64 = manifold.Isomap(n_neighbors=2)
|
||||
X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
|
||||
iso_64.fit(X_64)
|
||||
|
||||
assert_allclose(iso_32.dist_matrix_, iso_64.dist_matrix_)
|
||||
|
||||
|
||||
def test_isomap_raise_error_when_neighbor_and_radius_both_set():
|
||||
# Isomap.fit_transform must raise a ValueError if
|
||||
# radius and n_neighbors are provided.
|
||||
|
||||
X, _ = datasets.load_digits(return_X_y=True)
|
||||
isomap = manifold.Isomap(n_neighbors=3, radius=5.5)
|
||||
msg = "Both n_neighbors and radius are provided"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
isomap.fit_transform(X)
|
||||
|
||||
|
||||
def test_multiple_connected_components():
|
||||
# Test that a warning is raised when the graph has multiple components
|
||||
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
|
||||
with pytest.warns(UserWarning, match="number of connected components"):
|
||||
manifold.Isomap(n_neighbors=2).fit(X)
|
||||
|
||||
|
||||
def test_multiple_connected_components_metric_precomputed(global_dtype):
|
||||
# Test that an error is raised when the graph has multiple components
|
||||
# and when X is a precomputed neighbors graph.
|
||||
X = np.array([0, 1, 2, 5, 6, 7])[:, None].astype(global_dtype, copy=False)
|
||||
|
||||
# works with a precomputed distance matrix (dense)
|
||||
X_distances = pairwise_distances(X)
|
||||
with pytest.warns(UserWarning, match="number of connected components"):
|
||||
manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_distances)
|
||||
|
||||
# does not work with a precomputed neighbors graph (sparse)
|
||||
X_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance")
|
||||
with pytest.raises(RuntimeError, match="number of connected components"):
|
||||
manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_graph)
|
||||
|
||||
|
||||
def test_get_feature_names_out():
|
||||
"""Check get_feature_names_out for Isomap."""
|
||||
X, y = make_blobs(random_state=0, n_features=4)
|
||||
n_components = 2
|
||||
|
||||
iso = manifold.Isomap(n_components=n_components)
|
||||
iso.fit_transform(X)
|
||||
names = iso.get_feature_names_out()
|
||||
assert_array_equal([f"isomap{i}" for i in range(n_components)], names)
|
||||
@@ -0,0 +1,171 @@
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import linalg
|
||||
|
||||
from sklearn import manifold, neighbors
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_array_equal,
|
||||
ignore_warnings,
|
||||
)
|
||||
|
||||
eigen_solvers = ["dense", "arpack"]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Test utility routines
|
||||
def test_barycenter_kneighbors_graph(global_dtype):
|
||||
X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype)
|
||||
|
||||
graph = barycenter_kneighbors_graph(X, 1)
|
||||
expected_graph = np.array(
|
||||
[[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype
|
||||
)
|
||||
|
||||
assert graph.dtype == global_dtype
|
||||
|
||||
assert_allclose(graph.toarray(), expected_graph)
|
||||
|
||||
graph = barycenter_kneighbors_graph(X, 2)
|
||||
# check that the rows (barycenter weights) sum to one
|
||||
assert_allclose(np.sum(graph.toarray(), axis=1), np.ones(3))
|
||||
pred = np.dot(graph.toarray(), X)
|
||||
assert linalg.norm(pred - X) / X.shape[0] < 1
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Test LLE by computing the reconstruction error on some manifolds.
|
||||
|
||||
|
||||
def test_lle_simple_grid(global_dtype):
|
||||
# note: ARPACK is numerically unstable, so this test will fail for
|
||||
# some random seeds. We choose 42 because the tests pass.
|
||||
# On arm64 platforms, seed 2 makes the test fail.
|
||||
# TODO: rewrite this test to make less sensitive to the random seed,
|
||||
# irrespective of the platform.
|
||||
rng = np.random.RandomState(42)
|
||||
|
||||
# grid of equidistant points in 2D, n_components = n_dim
|
||||
X = np.array(list(product(range(5), repeat=2)))
|
||||
X = X + 1e-10 * rng.uniform(size=X.shape)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
|
||||
n_components = 2
|
||||
clf = manifold.LocallyLinearEmbedding(
|
||||
n_neighbors=5, n_components=n_components, random_state=rng
|
||||
)
|
||||
tol = 0.1
|
||||
|
||||
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
|
||||
reconstruction_error = linalg.norm(np.dot(N, X) - X, "fro")
|
||||
assert reconstruction_error < tol
|
||||
|
||||
for solver in eigen_solvers:
|
||||
clf.set_params(eigen_solver=solver)
|
||||
clf.fit(X)
|
||||
assert clf.embedding_.shape[1] == n_components
|
||||
reconstruction_error = (
|
||||
linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
|
||||
)
|
||||
|
||||
assert reconstruction_error < tol
|
||||
assert_allclose(clf.reconstruction_error_, reconstruction_error, atol=1e-1)
|
||||
|
||||
# re-embed a noisy version of X using the transform method
|
||||
noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100
|
||||
X_reembedded = clf.transform(X + noise)
|
||||
assert linalg.norm(X_reembedded - clf.embedding_) < tol
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"])
|
||||
@pytest.mark.parametrize("solver", eigen_solvers)
|
||||
def test_lle_manifold(global_dtype, method, solver):
|
||||
rng = np.random.RandomState(0)
|
||||
# similar test on a slightly more complex manifold
|
||||
X = np.array(list(product(np.arange(18), repeat=2)))
|
||||
X = np.c_[X, X[:, 0] ** 2 / 18]
|
||||
X = X + 1e-10 * rng.uniform(size=X.shape)
|
||||
X = X.astype(global_dtype, copy=False)
|
||||
n_components = 2
|
||||
|
||||
clf = manifold.LocallyLinearEmbedding(
|
||||
n_neighbors=6, n_components=n_components, method=method, random_state=0
|
||||
)
|
||||
tol = 1.5 if method == "standard" else 3
|
||||
|
||||
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
|
||||
reconstruction_error = linalg.norm(np.dot(N, X) - X)
|
||||
assert reconstruction_error < tol
|
||||
|
||||
clf.set_params(eigen_solver=solver)
|
||||
clf.fit(X)
|
||||
assert clf.embedding_.shape[1] == n_components
|
||||
reconstruction_error = (
|
||||
linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
|
||||
)
|
||||
details = "solver: %s, method: %s" % (solver, method)
|
||||
assert reconstruction_error < tol, details
|
||||
assert (
|
||||
np.abs(clf.reconstruction_error_ - reconstruction_error)
|
||||
< tol * reconstruction_error
|
||||
), details
|
||||
|
||||
|
||||
def test_pipeline():
|
||||
# check that LocallyLinearEmbedding works fine as a Pipeline
|
||||
# only checks that no error is raised.
|
||||
# TODO check that it actually does something useful
|
||||
from sklearn import datasets, pipeline
|
||||
|
||||
X, y = datasets.make_blobs(random_state=0)
|
||||
clf = pipeline.Pipeline(
|
||||
[
|
||||
("filter", manifold.LocallyLinearEmbedding(random_state=0)),
|
||||
("clf", neighbors.KNeighborsClassifier()),
|
||||
]
|
||||
)
|
||||
clf.fit(X, y)
|
||||
assert 0.9 < clf.score(X, y)
|
||||
|
||||
|
||||
# Test the error raised when the weight matrix is singular
|
||||
def test_singular_matrix():
|
||||
M = np.ones((200, 3))
|
||||
f = ignore_warnings
|
||||
with pytest.raises(ValueError, match="Error in determining null-space with ARPACK"):
|
||||
f(
|
||||
manifold.locally_linear_embedding(
|
||||
M,
|
||||
n_neighbors=2,
|
||||
n_components=1,
|
||||
method="standard",
|
||||
eigen_solver="arpack",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# regression test for #6033
|
||||
def test_integer_input():
|
||||
rand = np.random.RandomState(0)
|
||||
X = rand.randint(0, 100, size=(20, 3))
|
||||
|
||||
for method in ["standard", "hessian", "modified", "ltsa"]:
|
||||
clf = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10)
|
||||
clf.fit(X) # this previously raised a TypeError
|
||||
|
||||
|
||||
def test_get_feature_names_out():
|
||||
"""Check get_feature_names_out for LocallyLinearEmbedding."""
|
||||
X, y = make_blobs(random_state=0, n_features=4)
|
||||
n_components = 2
|
||||
|
||||
iso = manifold.LocallyLinearEmbedding(n_components=n_components)
|
||||
iso.fit(X)
|
||||
names = iso.get_feature_names_out()
|
||||
assert_array_equal(
|
||||
[f"locallylinearembedding{i}" for i in range(n_components)], names
|
||||
)
|
||||
@@ -0,0 +1,305 @@
|
||||
from unittest.mock import Mock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal
|
||||
|
||||
from sklearn.datasets import load_digits, load_iris
|
||||
from sklearn.manifold import ClassicalMDS
|
||||
from sklearn.manifold import _mds as mds
|
||||
from sklearn.metrics import euclidean_distances
|
||||
|
||||
|
||||
def test_smacof():
|
||||
# test metric smacof using the data of "Modern Multidimensional Scaling",
|
||||
# Borg & Groenen, p 154
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]])
|
||||
X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1)
|
||||
X_true = np.array(
|
||||
[[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]]
|
||||
)
|
||||
assert_array_almost_equal(X, X_true, decimal=3)
|
||||
|
||||
|
||||
def test_nonmetric_lower_normalized_stress():
|
||||
# Testing that nonmetric MDS results in lower normalized stress compared
|
||||
# to metric MDS (non-regression test for issue 27028)
|
||||
X, _ = load_iris(return_X_y=True)
|
||||
sim = euclidean_distances(X)
|
||||
np.random.seed(42)
|
||||
Z = np.random.normal(size=(X.shape[0], 2))
|
||||
|
||||
_, stress1 = mds.smacof(
|
||||
sim, init=Z, n_components=2, max_iter=1000, n_init=1, normalized_stress=True
|
||||
)
|
||||
|
||||
_, stress2 = mds.smacof(
|
||||
sim,
|
||||
init=Z,
|
||||
n_components=2,
|
||||
max_iter=1000,
|
||||
n_init=1,
|
||||
normalized_stress=True,
|
||||
metric=False,
|
||||
)
|
||||
|
||||
assert stress1 > stress2
|
||||
|
||||
# A metric MDS solution (local minimum of the raw stress) can be rescaled to
|
||||
# decrease the stress-1 (which is returned with normalized_stress=True).
|
||||
# The optimal rescaling can be computed analytically, see Borg & Groenen,
|
||||
# Modern Multidimensional Scaling, Chapter 11.1. After rescaling, stress-1
|
||||
# becomes sqrt(s^2 / (1 + s^2)), where s is the value of stress-1 before
|
||||
# rescaling.
|
||||
stress1_rescaled = np.sqrt(stress1**2 / (1 + stress1**2))
|
||||
assert stress1_rescaled > stress2
|
||||
|
||||
|
||||
def test_nonmetric_mds_optimization():
|
||||
# Test that stress is decreasing during nonmetric MDS optimization
|
||||
# (non-regression test for issue 27028)
|
||||
X, _ = load_digits(return_X_y=True)
|
||||
rng = np.random.default_rng(seed=42)
|
||||
ind_subset = rng.choice(len(X), size=200, replace=False)
|
||||
X = X[ind_subset]
|
||||
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
n_init=1,
|
||||
max_iter=2,
|
||||
metric_mds=False,
|
||||
init="random",
|
||||
random_state=42,
|
||||
).fit(X)
|
||||
stress_after_2_iter = mds_est.stress_
|
||||
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
n_init=1,
|
||||
max_iter=3,
|
||||
metric_mds=False,
|
||||
init="random",
|
||||
random_state=42,
|
||||
).fit(X)
|
||||
stress_after_3_iter = mds_est.stress_
|
||||
|
||||
assert stress_after_2_iter > stress_after_3_iter
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric_mds", [True, False])
|
||||
def test_mds_recovers_true_data(metric_mds):
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
n_init=1,
|
||||
eps=1e-15,
|
||||
max_iter=1000,
|
||||
metric_mds=metric_mds,
|
||||
init="random",
|
||||
random_state=42,
|
||||
).fit(X)
|
||||
stress = mds_est.stress_
|
||||
assert_allclose(stress, 0, atol=1e-6)
|
||||
|
||||
|
||||
def test_smacof_error():
|
||||
# Non-symmetric similarity matrix:
|
||||
sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
mds.smacof(sim, n_init=1)
|
||||
|
||||
# Non-square similarity matrix:
|
||||
sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
mds.smacof(sim, n_init=1)
|
||||
|
||||
# init is not None but has the wrong shape:
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]])
|
||||
with pytest.raises(ValueError):
|
||||
mds.smacof(sim, init=Z, n_init=1)
|
||||
|
||||
|
||||
# TODO: remove mark once loky bug is fixed:
|
||||
# https://github.com/joblib/loky/issues/458
|
||||
@pytest.mark.thread_unsafe
|
||||
def test_MDS():
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
mds_clf = mds.MDS(
|
||||
metric_mds=False,
|
||||
n_jobs=3,
|
||||
n_init=3,
|
||||
metric="precomputed",
|
||||
init="random",
|
||||
)
|
||||
mds_clf.fit(sim)
|
||||
|
||||
|
||||
# TODO(1.10): remove warning filter
|
||||
@pytest.mark.filterwarnings("ignore::FutureWarning")
|
||||
@pytest.mark.parametrize("k", [0.5, 1.5, 2])
|
||||
def test_normed_stress(k):
|
||||
"""Test that non-metric MDS normalized stress is scale-invariant."""
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
X1, stress1 = mds.smacof(sim, metric=False, max_iter=5, random_state=0)
|
||||
X2, stress2 = mds.smacof(k * sim, metric=False, max_iter=5, random_state=0)
|
||||
|
||||
assert_allclose(stress1, stress2, rtol=1e-5)
|
||||
assert_allclose(X1, X2, rtol=1e-5)
|
||||
|
||||
|
||||
# TODO(1.10): remove warning filter
|
||||
@pytest.mark.filterwarnings("ignore::FutureWarning")
|
||||
@pytest.mark.parametrize("metric", [True, False])
|
||||
def test_normalized_stress_auto(metric, monkeypatch):
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(4, 3)
|
||||
dist = euclidean_distances(X)
|
||||
|
||||
mock = Mock(side_effect=mds._smacof_single)
|
||||
monkeypatch.setattr("sklearn.manifold._mds._smacof_single", mock)
|
||||
|
||||
est = mds.MDS(metric=metric, normalized_stress="auto", random_state=rng)
|
||||
est.fit_transform(X)
|
||||
assert mock.call_args[1]["normalized_stress"] != metric
|
||||
|
||||
mds.smacof(dist, metric=metric, normalized_stress="auto", random_state=rng)
|
||||
assert mock.call_args[1]["normalized_stress"] != metric
|
||||
|
||||
|
||||
def test_isotonic_outofbounds():
|
||||
# This particular configuration can trigger out of bounds error
|
||||
# in the isotonic regression (non-regression test for issue 26999)
|
||||
dis = np.array(
|
||||
[
|
||||
[0.0, 1.732050807568877, 1.7320508075688772],
|
||||
[1.732050807568877, 0.0, 6.661338147750939e-16],
|
||||
[1.7320508075688772, 6.661338147750939e-16, 0.0],
|
||||
]
|
||||
)
|
||||
init = np.array(
|
||||
[
|
||||
[0.08665881585055124, 0.7939114643387546],
|
||||
[0.9959834154297658, 0.7555546025640025],
|
||||
[0.8766008278401566, 0.4227358815811242],
|
||||
]
|
||||
)
|
||||
mds.smacof(dis, init=init, metric=False, n_init=1)
|
||||
|
||||
|
||||
# TODO(1.10): remove warning filter
|
||||
@pytest.mark.filterwarnings("ignore::FutureWarning")
|
||||
@pytest.mark.parametrize("normalized_stress", [True, False])
|
||||
def test_returned_stress(normalized_stress):
|
||||
# Test that the final stress corresponds to the final embedding
|
||||
# (non-regression test for issue 16846)
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
D = euclidean_distances(X)
|
||||
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
random_state=42,
|
||||
normalized_stress=normalized_stress,
|
||||
).fit(X)
|
||||
|
||||
Z = mds_est.embedding_
|
||||
stress = mds_est.stress_
|
||||
|
||||
D_mds = euclidean_distances(Z)
|
||||
stress_Z = ((D_mds.ravel() - D.ravel()) ** 2).sum() / 2
|
||||
|
||||
if normalized_stress:
|
||||
stress_Z = np.sqrt(stress_Z / ((D_mds.ravel() ** 2).sum() / 2))
|
||||
|
||||
assert_allclose(stress, stress_Z)
|
||||
|
||||
|
||||
# TODO(1.10): remove warning filter
|
||||
@pytest.mark.filterwarnings("ignore::FutureWarning")
|
||||
@pytest.mark.parametrize("metric_mds", [True, False])
|
||||
def test_convergence_does_not_depend_on_scale(metric_mds):
|
||||
# Test that the number of iterations until convergence does not depend on
|
||||
# the scale of the input data
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
|
||||
mds_est = mds.MDS(
|
||||
n_components=2,
|
||||
random_state=42,
|
||||
metric_mds=metric_mds,
|
||||
)
|
||||
|
||||
mds_est.fit(X * 100)
|
||||
n_iter1 = mds_est.n_iter_
|
||||
|
||||
mds_est.fit(X / 100)
|
||||
n_iter2 = mds_est.n_iter_
|
||||
|
||||
assert_equal(n_iter1, n_iter2)
|
||||
|
||||
|
||||
# TODO(1.9): delete this test
|
||||
def test_future_warning_n_init():
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
with pytest.warns(FutureWarning):
|
||||
mds.smacof(sim)
|
||||
|
||||
with pytest.warns(FutureWarning):
|
||||
mds.MDS(init="random").fit(X)
|
||||
|
||||
|
||||
# TODO(1.9): delete the n_init warning check
|
||||
# TODO(1.10): delete this test
|
||||
def test_future_warning_init_and_metric():
|
||||
X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]])
|
||||
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
|
||||
|
||||
# dissimilarity argument deprecated
|
||||
with pytest.warns(FutureWarning, match="`dissimilarity` parameter is"):
|
||||
mds.MDS(dissimilarity="precomputed", init="random", n_init=1).fit(sim)
|
||||
|
||||
# metric=True deprecated
|
||||
with pytest.warns(FutureWarning, match="Use metric_mds"):
|
||||
mds.MDS(metric=True, init="random", n_init=1).fit(X)
|
||||
|
||||
# metric=False deprecated
|
||||
with pytest.warns(FutureWarning, match="Use metric_mds"):
|
||||
mds.MDS(metric=False, init="random", n_init=1).fit(X)
|
||||
|
||||
# default init will become classical_mds in the future
|
||||
with pytest.warns(FutureWarning, match="The default value of `init`"):
|
||||
mds.MDS(metric="euclidean", n_init=1).fit(X)
|
||||
|
||||
# TODO(1.9): delete this check
|
||||
# n_init=1 will become default in the future
|
||||
with pytest.warns(FutureWarning, match="The default value of `n_init`"):
|
||||
mds.MDS(metric="euclidean", init="random").fit(X)
|
||||
|
||||
# providing both metric and dissimilarity raises an error
|
||||
with pytest.raises(ValueError, match="provided both `dissimilarity`"):
|
||||
mds.MDS(
|
||||
metric="cosine", dissimilarity="euclidean", init="random", n_init=1
|
||||
).fit(X)
|
||||
|
||||
|
||||
# TODO(1.9): remove warning filter
|
||||
@pytest.mark.filterwarnings("ignore::FutureWarning")
|
||||
def test_classical_mds_init_to_mds():
|
||||
X, _ = load_iris(return_X_y=True)
|
||||
|
||||
cmds = ClassicalMDS()
|
||||
Z_classical = cmds.fit_transform(X)
|
||||
|
||||
mds1 = mds.MDS(init="classical_mds")
|
||||
Z1 = mds1.fit_transform(X)
|
||||
|
||||
mds2 = mds.MDS(init="random")
|
||||
Z2 = mds2.fit_transform(X, init=Z_classical)
|
||||
|
||||
assert_allclose(Z1, Z2)
|
||||
@@ -0,0 +1,503 @@
|
||||
import itertools
|
||||
from unittest.mock import Mock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import sparse
|
||||
from scipy.linalg import eigh
|
||||
from scipy.sparse.linalg import eigsh, lobpcg
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.manifold import SpectralEmbedding, _spectral_embedding, spectral_embedding
|
||||
from sklearn.manifold._spectral_embedding import (
|
||||
_graph_connected_component,
|
||||
_graph_is_connected,
|
||||
)
|
||||
from sklearn.metrics import normalized_mutual_info_score, pairwise_distances
|
||||
from sklearn.metrics.pairwise import rbf_kernel
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
|
||||
from sklearn.utils.extmath import _deterministic_vector_sign_flip
|
||||
from sklearn.utils.fixes import (
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
parse_version,
|
||||
sp_version,
|
||||
)
|
||||
from sklearn.utils.fixes import laplacian as csgraph_laplacian
|
||||
|
||||
try:
|
||||
from pyamg import smoothed_aggregation_solver # noqa: F401
|
||||
|
||||
pyamg_available = True
|
||||
except ImportError:
|
||||
pyamg_available = False
|
||||
skip_if_no_pyamg = pytest.mark.skipif(
|
||||
not pyamg_available, reason="PyAMG is required for the tests in this function."
|
||||
)
|
||||
|
||||
# non centered, sparse centers to check the
|
||||
centers = np.array(
|
||||
[
|
||||
[0.0, 5.0, 0.0, 0.0, 0.0],
|
||||
[0.0, 0.0, 4.0, 0.0, 0.0],
|
||||
[1.0, 0.0, 0.0, 5.0, 1.0],
|
||||
]
|
||||
)
|
||||
n_samples = 1000
|
||||
n_clusters, n_features = centers.shape
|
||||
S, true_labels = make_blobs(
|
||||
n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
|
||||
)
|
||||
|
||||
|
||||
def _assert_equal_with_sign_flipping(A, B, tol=0.0):
|
||||
"""Check array A and B are equal with possible sign flipping on
|
||||
each column"""
|
||||
tol_squared = tol**2
|
||||
for A_col, B_col in zip(A.T, B.T):
|
||||
assert (
|
||||
np.max((A_col - B_col) ** 2) <= tol_squared
|
||||
or np.max((A_col + B_col) ** 2) <= tol_squared
|
||||
)
|
||||
|
||||
|
||||
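# Context for the helper above: eigenvectors are defined only up to sign and
# different solvers (or BLAS builds) may return either orientation, hence the
# per-column sign-tolerant comparison; within a single estimator the
# ambiguity is resolved by _deterministic_vector_sign_flip.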
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
|
||||
def test_sparse_graph_connected_component(coo_container):
|
||||
rng = np.random.RandomState(42)
|
||||
n_samples = 300
|
||||
boundaries = [0, 42, 121, 200, n_samples]
|
||||
p = rng.permutation(n_samples)
|
||||
connections = []
|
||||
|
||||
for start, stop in itertools.pairwise(boundaries):
|
||||
group = p[start:stop]
|
||||
# Connect all elements within the group at least once via an
|
||||
# arbitrary path that spans the group.
|
||||
for i in range(len(group) - 1):
|
||||
connections.append((group[i], group[i + 1]))
|
||||
|
||||
# Add some more random connections within the group
|
||||
min_idx, max_idx = 0, len(group) - 1
|
||||
n_random_connections = 1000
|
||||
source = rng.randint(min_idx, max_idx, size=n_random_connections)
|
||||
target = rng.randint(min_idx, max_idx, size=n_random_connections)
|
||||
connections.extend(zip(group[source], group[target]))
|
||||
|
||||
# Build a symmetric affinity matrix
|
||||
row_idx, column_idx = tuple(np.array(connections).T)
|
||||
data = rng.uniform(0.1, 42, size=len(connections))
|
||||
affinity = coo_container((data, (row_idx, column_idx)))
|
||||
affinity = 0.5 * (affinity + affinity.T)
|
||||
|
||||
for start, stop in itertools.pairwise(boundaries):
|
||||
component_1 = _graph_connected_component(affinity, p[start])
|
||||
component_size = stop - start
|
||||
assert component_1.sum() == component_size
|
||||
|
||||
# We should retrieve the same component mask by starting by both ends
|
||||
# of the group
|
||||
component_2 = _graph_connected_component(affinity, p[stop - 1])
|
||||
assert component_2.sum() == component_size
|
||||
assert_array_equal(component_1, component_2)
|
||||
|
||||
|
||||
# TODO: investigate why this test is seed-sensitive on 32-bit Python
|
||||
# runtimes. Is this revealing a numerical stability problem? Or is it
|
||||
# expected from the test's numerical design? In the latter case the test
|
||||
# should be made less seed-sensitive instead.
|
||||
@pytest.mark.parametrize(
|
||||
"eigen_solver",
|
||||
[
|
||||
"arpack",
|
||||
"lobpcg",
|
||||
pytest.param("amg", marks=skip_if_no_pyamg),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
def test_spectral_embedding_two_components(eigen_solver, dtype, seed=0):
|
||||
# Test spectral embedding with two components
|
||||
random_state = np.random.RandomState(seed)
|
||||
n_sample = 100
|
||||
affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
|
||||
# first component
|
||||
affinity[0:n_sample, 0:n_sample] = (
|
||||
np.abs(random_state.randn(n_sample, n_sample)) + 2
|
||||
)
|
||||
# second component
|
||||
affinity[n_sample::, n_sample::] = (
|
||||
np.abs(random_state.randn(n_sample, n_sample)) + 2
|
||||
)
|
||||
|
||||
# Test of internal _graph_connected_component before connection
|
||||
component = _graph_connected_component(affinity, 0)
|
||||
assert component[:n_sample].all()
|
||||
assert not component[n_sample:].any()
|
||||
component = _graph_connected_component(affinity, -1)
|
||||
assert not component[:n_sample].any()
|
||||
assert component[n_sample:].all()
|
||||
|
||||
# connection
|
||||
affinity[0, n_sample + 1] = 1
|
||||
affinity[n_sample + 1, 0] = 1
|
||||
affinity.flat[:: 2 * n_sample + 1] = 0
|
||||
affinity = 0.5 * (affinity + affinity.T)
|
||||
|
||||
true_label = np.zeros(shape=2 * n_sample)
|
||||
true_label[0:n_sample] = 1
|
||||
|
||||
se_precomp = SpectralEmbedding(
|
||||
n_components=1,
|
||||
affinity="precomputed",
|
||||
random_state=np.random.RandomState(seed),
|
||||
eigen_solver=eigen_solver,
|
||||
)
|
||||
|
||||
embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype))
|
||||
# thresholding on the first components using 0.
|
||||
label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64)
|
||||
assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0)
|
||||
|
||||
|
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_precomputed_affinity(
    sparse_container, eigen_solver, dtype, seed=36
):
    # Test spectral embedding with precomputed kernel
    gamma = 1.0
    X = S if sparse_container is None else sparse_container(S)

    se_precomp = SpectralEmbedding(
        n_components=2,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    embed_precomp = se_precomp.fit_transform(rbf_kernel(X.astype(dtype), gamma=gamma))
    embed_rbf = se_rbf.fit_transform(X.astype(dtype))
    assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
    _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)


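# Illustrative sketch, not part of the scikit-learn test suite: eigenvectors,
# and hence spectral embedding coordinates, are only defined up to a sign
# flip of each column, which is why these tests compare embeddings through
# `_assert_equal_with_sign_flipping` instead of directly. A minimal,
# hypothetical version of such a comparison could look like this:
def _demo_equal_up_to_sign(a, b, tol=1e-8):
    import numpy as np

    a, b = np.asarray(a), np.asarray(b)
    assert a.shape == b.shape
    # Each column must match either as-is or with its sign flipped.
    return all(
        np.allclose(a[:, i], b[:, i], atol=tol)
        or np.allclose(a[:, i], -b[:, i], atol=tol)
        for i in range(a.shape[1])
    )

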
def test_precomputed_nearest_neighbors_filtering():
    # Test that a precomputed graph containing too many neighbors is filtered
    n_neighbors = 2
    results = []
    for additional_neighbors in [0, 10]:
        nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(S)
        graph = nn.kneighbors_graph(S, mode="connectivity")
        embedding = (
            SpectralEmbedding(
                random_state=0,
                n_components=2,
                affinity="precomputed_nearest_neighbors",
                n_neighbors=n_neighbors,
            )
            .fit(graph)
            .embedding_
        )
        results.append(embedding)

    assert_array_equal(results[0], results[1])


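# Illustrative sketch, not part of the scikit-learn test suite: with
# affinity="precomputed_nearest_neighbors", the estimator re-filters the
# provided graph down to its own `n_neighbors`, which is why a graph built
# with extra neighbors yields an identical embedding above. The hypothetical
# helper below mimics that filtering step on a precomputed distance graph:
def _demo_filter_precomputed_graph(distance_graph, n_neighbors):
    from sklearn.neighbors import NearestNeighbors

    # Keep only the `n_neighbors` closest entries of each row of the
    # (sparse) precomputed distance graph.
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric="precomputed")
    nn.fit(distance_graph)
    return nn.kneighbors_graph(mode="connectivity")

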
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
def test_spectral_embedding_callable_affinity(sparse_container, seed=36):
    # Test spectral embedding with callable affinity
    gamma = 0.9
    kern = rbf_kernel(S, gamma=gamma)
    X = S if sparse_container is None else sparse_container(S)

    se_callable = SpectralEmbedding(
        n_components=2,
        affinity=(lambda x: rbf_kernel(x, gamma=gamma)),
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    embed_rbf = se_rbf.fit_transform(X)
    embed_callable = se_callable.fit_transform(X)
    assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
    assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
    _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)


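# Illustrative sketch, not part of the scikit-learn test suite: a callable
# affinity receives the raw input X and must return a symmetric
# (n_samples, n_samples) affinity matrix. The lambda used above is therefore
# equivalent to passing affinity="rbf" with the same gamma, as this
# hypothetical callable makes explicit:
def _demo_rbf_affinity(X, gamma=0.9):
    from sklearn.metrics.pairwise import rbf_kernel

    return rbf_kernel(X, gamma=gamma)

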
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36):
    se_amg = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="amg",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    se_arpack = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="arpack",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    embed_amg = se_amg.fit_transform(S.astype(dtype))
    embed_arpack = se_arpack.fit_transform(S.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # Same for the special case in which amg is not actually used:
    # regression test for #10715.
    # Affinity between nodes:
    row = np.array([0, 0, 1, 2, 3, 3, 4], dtype=np.int32)
    col = np.array([1, 2, 2, 3, 4, 5, 5], dtype=np.int32)
    val = np.array([100, 100, 100, 1, 100, 100, 100], dtype=np.int64)

    affinity = coo_container(
        (np.hstack([val, val]), (np.hstack([row, col]), np.hstack([col, row]))),
        shape=(6, 6),
    )
    se_amg.affinity = "precomputed"
    se_arpack.affinity = "precomputed"
    embed_amg = se_amg.fit_transform(affinity.astype(dtype))
    embed_arpack = se_arpack.fit_transform(affinity.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # Check that passing a sparse matrix with `np.int64` index dtype either
    # raises an error or succeeds, depending on the installed SciPy version.
    # Use a CSR matrix to avoid any conversion during the validation.
    affinity = affinity.tocsr()
    affinity.indptr = affinity.indptr.astype(np.int64)
    affinity.indices = affinity.indices.astype(np.int64)

    # PR: https://github.com/scipy/scipy/pull/18913
    # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
    scipy_graph_traversal_supports_int64_index = sp_version >= parse_version("1.11.3")
    if scipy_graph_traversal_supports_int64_index:
        se_amg.fit_transform(affinity)
    else:
        err_msg = "Only sparse matrices with 32-bit integer indices are accepted"
        with pytest.raises(ValueError, match=err_msg):
            se_amg.fit_transform(affinity)


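# Illustrative sketch, not part of the scikit-learn test suite: the SciPy
# version gate above exists because older SciPy graph traversal only accepts
# 32-bit sparse indices. A hypothetical workaround for older versions is to
# downcast the CSR index arrays when the number of stored values allows it:
def _demo_downcast_csr_indices(A):
    import numpy as np

    # Assumes `A` is a scipy.sparse CSR matrix with int64 index arrays.
    if A.indptr.dtype == np.int64 and A.nnz < np.iinfo(np.int32).max:
        A = A.copy()
        A.indptr = A.indptr.astype(np.int32)
        A.indices = A.indices.astype(np.int32)
    return A

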
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
    # Non-regression test for amg solver failure (issue #13393 on GitHub)
    num_nodes = 100
    X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
    X = X.astype(dtype)
    upper = sparse.triu(X) - sparse.diags(X.diagonal())
    sym_matrix = upper + upper.T
    embedding = spectral_embedding(
        sym_matrix, n_components=10, eigen_solver="amg", random_state=0
    )

    # Check that the learned embedding is stable w.r.t. random solver init:
    for i in range(3):
        new_embedding = spectral_embedding(
            sym_matrix, n_components=10, eigen_solver="amg", random_state=i + 1
        )
        _assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)


def test_pipeline_spectral_clustering(seed=36):
    # Test spectral clustering as a pipeline: spectral embedding followed by
    # KMeans
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(
        n_components=n_clusters, affinity="rbf", random_state=random_state
    )
    se_knn = SpectralEmbedding(
        n_components=n_clusters,
        affinity="nearest_neighbors",
        n_neighbors=5,
        random_state=random_state,
    )
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2
        )


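# Illustrative sketch, not part of the scikit-learn test suite: the pipeline
# exercised above is spectral clustering decomposed into its two stages,
# first a spectral embedding and then k-means on the embedded coordinates.
# The hypothetical helper below shows the same composition on arbitrary data:
def _demo_spectral_clustering_pipeline(X, n_clusters, seed=0):
    from sklearn.cluster import KMeans
    from sklearn.manifold import SpectralEmbedding

    maps = SpectralEmbedding(
        n_components=n_clusters, affinity="rbf", random_state=seed
    ).fit_transform(X)
    return KMeans(n_clusters=n_clusters, n_init=10, random_state=seed).fit_predict(
        maps
    )

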
def test_connectivity(seed=36):
    # Test that the graph connectivity check works as expected
    graph = np.array(
        [
            [1, 0, 0, 0, 0],
            [0, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    assert not _graph_is_connected(graph)
    for csr_container in CSR_CONTAINERS:
        assert not _graph_is_connected(csr_container(graph))
    for csc_container in CSC_CONTAINERS:
        assert not _graph_is_connected(csc_container(graph))

    graph = np.array(
        [
            [1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    assert _graph_is_connected(graph)
    for csr_container in CSR_CONTAINERS:
        assert _graph_is_connected(csr_container(graph))
    for csc_container in CSC_CONTAINERS:
        assert _graph_is_connected(csc_container(graph))


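# Illustrative sketch, not part of the scikit-learn test suite: graph
# connectivity can be checked independently of `_graph_is_connected` with
# SciPy's connected-components routine, as in this hypothetical helper:
def _demo_is_connected(adjacency):
    from scipy.sparse.csgraph import connected_components

    n_components, _ = connected_components(adjacency, directed=False)
    return n_components == 1

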
def test_spectral_embedding_deterministic():
    # Test that Spectral Embedding is deterministic
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    embedding_1 = spectral_embedding(sims)
    embedding_2 = spectral_embedding(sims)
    assert_array_almost_equal(embedding_1, embedding_2)


def test_spectral_embedding_unnormalized():
    # Test that spectral_embedding also handles the unnormalized Laplacian
    # correctly
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 8
    embedding_1 = spectral_embedding(
        sims, norm_laplacian=False, n_components=n_components, drop_first=False
    )

    # Verify using manual computation with dense eigh
    laplacian, dd = csgraph_laplacian(sims, normed=False, return_diag=True)
    _, diffusion_map = eigh(laplacian)
    embedding_2 = diffusion_map.T[:n_components]
    embedding_2 = _deterministic_vector_sign_flip(embedding_2).T

    assert_array_almost_equal(embedding_1, embedding_2)


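# Illustrative sketch, not part of the scikit-learn test suite: the
# unnormalized graph Laplacian verified above is simply L = D - W, with D the
# diagonal matrix of node degrees. This hypothetical dense construction
# matches csgraph_laplacian(W, normed=False), which ignores self-loops:
def _demo_unnormalized_laplacian(W):
    import numpy as np

    W = np.array(W, dtype=float)
    np.fill_diagonal(W, 0)  # self-loops do not contribute to the Laplacian
    return np.diag(W.sum(axis=1)) - W

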
def test_spectral_embedding_first_eigen_vector():
    # Test that the first eigenvector of spectral_embedding
    # is constant and that the second is not (for a connected graph)
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 2

    for seed in range(10):
        embedding = spectral_embedding(
            sims,
            norm_laplacian=False,
            n_components=n_components,
            drop_first=False,
            random_state=seed,
        )

        assert np.std(embedding[:, 0]) == pytest.approx(0)
        assert np.std(embedding[:, 1]) > 1e-3


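# Illustrative sketch, not part of the scikit-learn test suite: the first
# coordinate is constant because the unnormalized Laplacian always satisfies
# L @ 1 = 0 (each row sums to zero), and for a connected graph that null
# space is one-dimensional, so the second eigenvector cannot be constant.
# A hypothetical check of the first fact:
def _demo_constant_vector_in_null_space(W):
    import numpy as np

    W = np.array(W, dtype=float)
    np.fill_diagonal(W, 0)
    laplacian = np.diag(W.sum(axis=1)) - W
    assert np.allclose(laplacian @ np.ones(W.shape[0]), 0.0)

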
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_preserves_dtype(eigen_solver, dtype):
    """Check that `SpectralEmbedding` preserves the dtype of the fitted
    attributes and of the transformed data.

    Ideally, this test should be covered by the common test
    `check_transformer_preserve_dtypes`. However, that test only runs
    with transformers implementing `transform`, while `SpectralEmbedding`
    implements only `fit_transform`.
    """
    X = S.astype(dtype)
    se = SpectralEmbedding(
        n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0
    )
    X_trans = se.fit_transform(X)

    assert X_trans.dtype == dtype
    assert se.embedding_.dtype == dtype
    assert se.affinity_matrix_.dtype == dtype


@pytest.mark.skipif(
    pyamg_available,
    reason="PyAMG is installed and we should not test for an error.",
)
def test_error_pyamg_not_available():
    se_precomp = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        eigen_solver="amg",
    )
    err_msg = "The eigen_solver was set to 'amg', but pyamg is not available."
    with pytest.raises(ValueError, match=err_msg):
        se_precomp.fit_transform(S)


@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_eigen_tol_auto(monkeypatch, solver, csr_container):
    """Test that `eigen_tol="auto"` is resolved correctly."""
    if solver == "amg" and not pyamg_available:
        pytest.skip("PyAMG is not available.")
    X, _ = make_blobs(
        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix

    solver_func = eigsh if solver == "arpack" else lobpcg
    default_value = 0 if solver == "arpack" else None
    if solver == "amg":
        S = csr_container(S)

    mocked_solver = Mock(side_effect=solver_func)

    monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__, mocked_solver)

    spectral_embedding(S, random_state=42, eigen_solver=solver, eigen_tol="auto")
    mocked_solver.assert_called()

    _, kwargs = mocked_solver.call_args
    assert kwargs["tol"] == default_value


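# Illustrative sketch, not part of the scikit-learn test suite:
# Mock(side_effect=f) wraps `f` so that calls still execute normally while
# their arguments are recorded, which is how the test above reads back the
# `tol` value that spectral_embedding actually passed to the solver:
def _demo_mock_side_effect():
    from unittest.mock import Mock

    spy = Mock(side_effect=max)
    assert spy(3, 5) == 5  # behaves like the wrapped callable...
    args, _ = spy.call_args  # ...while recording how it was called
    assert args == (3, 5)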