Videre
This commit is contained in:
@@ -0,0 +1,49 @@
|
||||
"""The k-nearest neighbors algorithms."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from sklearn.neighbors._ball_tree import BallTree
|
||||
from sklearn.neighbors._base import (
|
||||
VALID_METRICS,
|
||||
VALID_METRICS_SPARSE,
|
||||
sort_graph_by_row_values,
|
||||
)
|
||||
from sklearn.neighbors._classification import (
|
||||
KNeighborsClassifier,
|
||||
RadiusNeighborsClassifier,
|
||||
)
|
||||
from sklearn.neighbors._graph import (
|
||||
KNeighborsTransformer,
|
||||
RadiusNeighborsTransformer,
|
||||
kneighbors_graph,
|
||||
radius_neighbors_graph,
|
||||
)
|
||||
from sklearn.neighbors._kd_tree import KDTree
|
||||
from sklearn.neighbors._kde import KernelDensity
|
||||
from sklearn.neighbors._lof import LocalOutlierFactor
|
||||
from sklearn.neighbors._nca import NeighborhoodComponentsAnalysis
|
||||
from sklearn.neighbors._nearest_centroid import NearestCentroid
|
||||
from sklearn.neighbors._regression import KNeighborsRegressor, RadiusNeighborsRegressor
|
||||
from sklearn.neighbors._unsupervised import NearestNeighbors
|
||||
|
||||
# Names re-exported as the public API of the ``sklearn.neighbors`` package.
__all__ = [
    "VALID_METRICS",
    "VALID_METRICS_SPARSE",
    "BallTree",
    "KDTree",
    "KNeighborsClassifier",
    "KNeighborsRegressor",
    "KNeighborsTransformer",
    "KernelDensity",
    "LocalOutlierFactor",
    "NearestCentroid",
    "NearestNeighbors",
    "NeighborhoodComponentsAnalysis",
    "RadiusNeighborsClassifier",
    "RadiusNeighborsRegressor",
    "RadiusNeighborsTransformer",
    "kneighbors_graph",
    "radius_neighbors_graph",
    "sort_graph_by_row_values",
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,284 @@
|
||||
{{py:
|
||||
|
||||
# Generated file: _ball_tree.pyx
|
||||
|
||||
implementation_specific_values = [
|
||||
# The values are arranged as follows:
|
||||
#
|
||||
# name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
|
||||
#
|
||||
('64', 'float64_t', 'np.float64'),
|
||||
('32', 'float32_t', 'np.float32')
|
||||
]
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
}}
|
||||
|
||||
|
||||
__all__ = ['BallTree', 'BallTree64', 'BallTree32']
|
||||
|
||||
{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}

# Substitutions used to render the shared class docstring (CLASS_DOC, defined
# in the included _binary_tree.pxi) for this dtype specialization.
DOC_DICT{{name_suffix}} = {
    'BinaryTree': 'BallTree{{name_suffix}}',
    'binary_tree': 'ball_tree{{name_suffix}}',
}

# Distance-metric implementation class names accepted by this ball-tree
# specialization; the `{{name_suffix}}` suffix selects the matching dtype
# variant of each metric.
VALID_METRICS{{name_suffix}} = [
    'BrayCurtisDistance{{name_suffix}}',
    'CanberraDistance{{name_suffix}}',
    'ChebyshevDistance{{name_suffix}}',
    'DiceDistance{{name_suffix}}',
    'EuclideanDistance{{name_suffix}}',
    'HammingDistance{{name_suffix}}',
    'HaversineDistance{{name_suffix}}',
    'JaccardDistance{{name_suffix}}',
    'MahalanobisDistance{{name_suffix}}',
    'ManhattanDistance{{name_suffix}}',
    'MinkowskiDistance{{name_suffix}}',
    'PyFuncDistance{{name_suffix}}',
    'RogersTanimotoDistance{{name_suffix}}',
    'RussellRaoDistance{{name_suffix}}',
    'SEuclideanDistance{{name_suffix}}',
    'SokalMichenerDistance{{name_suffix}}',
    'SokalSneathDistance{{name_suffix}}',
    'WMinkowskiDistance{{name_suffix}}',
]

{{endfor}}
|
||||
|
||||
include "_binary_tree.pxi"
|
||||
|
||||
{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}

# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}}
cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}):
    # All behavior comes from the shared BinaryTree implementation
    # (_binary_tree.pxi); only the rendered docstring is specific here.
    __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}})
    pass

{{endfor}}
|
||||
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# The functions below specialized the Binary Tree as a Ball Tree
|
||||
#
|
||||
# Note that these functions use the concept of "reduced distance".
|
||||
# The reduced distance, defined for some metrics, is a quantity which
|
||||
# is more efficient to compute than the distance, but preserves the
|
||||
# relative rankings of the true distance. For example, the reduced
|
||||
# distance for the Euclidean metric is the squared-euclidean distance.
|
||||
# For some metrics, the reduced distance is simply the distance.
|
||||
|
||||
{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
|
||||
|
||||
cdef int allocate_data{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t n_nodes,
    intp_t n_features,
) except -1:
    """Allocate the arrays needed for the Ball Tree.

    Sets ``tree.node_bounds`` to a zeroed array holding one centroid
    (of length ``n_features``) per node.
    """
    # The leading axis of size 1 matches the node_bounds layout shared with
    # the KD tree specialization, which uses two bound rows per node.
    tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}})
    return 0
|
||||
|
||||
|
||||
cdef int init_node{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    NodeData_t[::1] node_data,
    intp_t i_node,
    intp_t idx_start,
    intp_t idx_end,
) except -1:
    """Initialize node ``i_node`` for the dataset stored in ``tree.data``.

    Computes the node centroid (weighted by ``tree.sample_weight`` when
    present) over points ``idx_start:idx_end`` of ``tree.idx_array`` and
    stores it in ``tree.node_bounds``; the node radius is the largest
    distance from the centroid to any of those points.
    """
    cdef intp_t n_features = tree.data.shape[1]
    cdef intp_t n_points = idx_end - idx_start

    cdef intp_t i, j
    cdef float64_t radius
    cdef const {{INPUT_DTYPE_t}} *this_pt

    cdef const intp_t* idx_array = &tree.idx_array[0]
    cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0]
    cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0]

    cdef bint with_sample_weight = tree.sample_weight is not None
    cdef const {{INPUT_DTYPE_t}}* sample_weight
    cdef float64_t sum_weight_node
    if with_sample_weight:
        sample_weight = &tree.sample_weight[0]

    # determine Node centroid
    for j in range(n_features):
        centroid[j] = 0

    if with_sample_weight:
        sum_weight_node = 0
        for i in range(idx_start, idx_end):
            sum_weight_node += sample_weight[idx_array[i]]
            this_pt = data + n_features * idx_array[i]
            # NOTE: rewritten from the deprecated Cython `for j from
            # 0 <= j < n_features` loop to the equivalent range() form.
            for j in range(n_features):
                centroid[j] += this_pt[j] * sample_weight[idx_array[i]]

        for j in range(n_features):
            centroid[j] /= sum_weight_node
    else:
        for i in range(idx_start, idx_end):
            this_pt = data + n_features * idx_array[i]
            for j in range(n_features):
                centroid[j] += this_pt[j]

        for j in range(n_features):
            centroid[j] /= n_points

    # determine Node radius
    radius = 0
    for i in range(idx_start, idx_end):
        # rdist is the metric's "reduced distance" (e.g. squared euclidean
        # for the euclidean metric): cheaper to compute and rank-preserving,
        # so the max over rdist identifies the farthest point.
        radius = fmax(radius,
                      tree.rdist(centroid,
                                 data + n_features * idx_array[i],
                                 n_features))

    # Convert the max reduced distance back to a true distance for storage.
    node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius)
    node_data[i_node].idx_start = idx_start
    node_data[i_node].idx_end = idx_end
    return 0
|
||||
|
||||
|
||||
cdef inline float64_t min_dist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
) except -1 nogil:
    """Compute the minimum distance between a point and a node"""
    # Distance from the point to the node centroid minus the node radius,
    # clamped at 0 for points lying inside the ball.
    cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0],
                                       tree.data.shape[1])
    return fmax(0, dist_pt - tree.node_data[i_node].radius)
|
||||
|
||||
|
||||
cdef inline float64_t max_dist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
) except -1:
    """Compute the maximum distance between a point and a node"""
    # Distance from the point to the node centroid plus the node radius.
    # NOTE(review): unlike min_dist this is not declared `nogil` — confirm
    # whether that asymmetry is intended.
    cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0],
                                       tree.data.shape[1])
    return dist_pt + tree.node_data[i_node].radius
|
||||
|
||||
|
||||
cdef inline int min_max_dist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
    float64_t* min_dist,
    float64_t* max_dist,
) except -1 nogil:
    """Compute the minimum and maximum distance between a point and a node"""
    # A single centroid-distance evaluation yields both bounds:
    # min = max(0, d - r) and max = d + r, written through the out-pointers.
    cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0],
                                       tree.data.shape[1])
    cdef float64_t rad = tree.node_data[i_node].radius
    min_dist[0] = fmax(0, dist_pt - rad)
    max_dist[0] = dist_pt + rad
    return 0
|
||||
|
||||
|
||||
cdef inline float64_t min_rdist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
) except -1 nogil:
    """Compute the minimum reduced-distance between a point and a node"""
    if tree.euclidean:
        # Fast path: specialized inline dist -> rdist conversion for the
        # euclidean metric (rdist is the squared distance there).
        return euclidean_dist_to_rdist{{name_suffix}}(
            min_dist{{name_suffix}}(tree, i_node, pt)
        )
    else:
        # Generic path: delegate the conversion to the metric object.
        return tree.dist_metric._dist_to_rdist(
            min_dist{{name_suffix}}(tree, i_node, pt)
        )
|
||||
|
||||
|
||||
cdef inline float64_t max_rdist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
) except -1:
    """Compute the maximum reduced-distance between a point and a node"""
    if tree.euclidean:
        # Fast path: specialized inline dist -> rdist conversion for the
        # euclidean metric.
        return euclidean_dist_to_rdist{{name_suffix}}(
            max_dist{{name_suffix}}(tree, i_node, pt)
        )
    else:
        # Generic path: delegate the conversion to the metric object.
        return tree.dist_metric._dist_to_rdist(
            max_dist{{name_suffix}}(tree, i_node, pt)
        )
|
||||
|
||||
|
||||
cdef inline float64_t min_dist_dual{{name_suffix}}(
    BinaryTree{{name_suffix}} tree1,
    intp_t i_node1,
    BinaryTree{{name_suffix}} tree2,
    intp_t i_node2,
) except -1:
    """compute the minimum distance between two nodes"""
    # Centroid-to-centroid distance minus both radii; clamped at 0 when
    # the two balls overlap.
    cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0],
                                        &tree1.node_bounds[0, i_node1, 0],
                                        tree1.data.shape[1])
    return fmax(0, (dist_pt - tree1.node_data[i_node1].radius
                    - tree2.node_data[i_node2].radius))
|
||||
|
||||
|
||||
cdef inline float64_t max_dist_dual{{name_suffix}}(
    BinaryTree{{name_suffix}} tree1,
    intp_t i_node1,
    BinaryTree{{name_suffix}} tree2,
    intp_t i_node2,
) except -1:
    """compute the maximum distance between two nodes"""
    # Centroid-to-centroid distance plus both radii.
    cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0],
                                        &tree1.node_bounds[0, i_node1, 0],
                                        tree1.data.shape[1])
    return (dist_pt + tree1.node_data[i_node1].radius
            + tree2.node_data[i_node2].radius)
|
||||
|
||||
|
||||
cdef inline float64_t min_rdist_dual{{name_suffix}}(
    BinaryTree{{name_suffix}} tree1,
    intp_t i_node1,
    BinaryTree{{name_suffix}} tree2,
    intp_t i_node2,
) except -1:
    """compute the minimum reduced distance between two nodes"""
    if tree1.euclidean:
        # Fast path: specialized inline dist -> rdist conversion for the
        # euclidean metric.
        return euclidean_dist_to_rdist{{name_suffix}}(
            min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2)
        )
    else:
        # Generic path: delegate the conversion to tree1's metric object.
        return tree1.dist_metric._dist_to_rdist(
            min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2)
        )
|
||||
|
||||
|
||||
cdef inline float64_t max_rdist_dual{{name_suffix}}(
    BinaryTree{{name_suffix}} tree1,
    intp_t i_node1,
    BinaryTree{{name_suffix}} tree2,
    intp_t i_node2,
) except -1:
    """compute the maximum reduced distance between two nodes"""
    if tree1.euclidean:
        # Fast path: specialized inline dist -> rdist conversion for the
        # euclidean metric.
        return euclidean_dist_to_rdist{{name_suffix}}(
            max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2)
        )
    else:
        # Generic path: delegate the conversion to tree1's metric object.
        return tree1.dist_metric._dist_to_rdist(
            max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2)
        )
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
class BallTree(BallTree64):
    # Public, dtype-generic name for the ball tree: an alias of the
    # float64 specialization, with the shared docstring rendered for it.
    __doc__ = CLASS_DOC.format(BinaryTree="BallTree")
    pass
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,923 @@
|
||||
"""Nearest Neighbor Classification"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import warnings
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.base import ClassifierMixin, _fit_context
|
||||
from sklearn.metrics._pairwise_distances_reduction import (
|
||||
ArgKminClassMode,
|
||||
RadiusNeighborsClassMode,
|
||||
)
|
||||
from sklearn.neighbors._base import (
|
||||
KNeighborsMixin,
|
||||
NeighborsBase,
|
||||
RadiusNeighborsMixin,
|
||||
_check_precomputed,
|
||||
_get_weights,
|
||||
)
|
||||
from sklearn.utils._param_validation import StrOptions
|
||||
from sklearn.utils.arrayfuncs import _all_with_any_reduction_axis_1
|
||||
from sklearn.utils.extmath import weighted_mode
|
||||
from sklearn.utils.fixes import _mode
|
||||
from sklearn.utils.validation import (
|
||||
_is_arraylike,
|
||||
_num_samples,
|
||||
check_is_fitted,
|
||||
validate_data,
|
||||
)
|
||||
|
||||
|
||||
def _adjusted_metric(metric, metric_kwargs, p=None):
|
||||
metric_kwargs = metric_kwargs or {}
|
||||
if metric == "minkowski":
|
||||
metric_kwargs["p"] = p
|
||||
if p == 2:
|
||||
metric = "euclidean"
|
||||
return metric, metric_kwargs
|
||||
|
||||
|
||||
class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase):
|
||||
"""Classifier implementing the k-nearest neighbors vote.
|
||||
|
||||
Read more in the :ref:`User Guide <classification>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_neighbors : int, default=5
|
||||
Number of neighbors to use by default for :meth:`kneighbors` queries.
|
||||
|
||||
weights : {'uniform', 'distance'}, callable or None, default='uniform'
|
||||
Weight function used in prediction. Possible values:
|
||||
|
||||
- 'uniform' : uniform weights. All points in each neighborhood
|
||||
are weighted equally.
|
||||
- 'distance' : weight points by the inverse of their distance.
|
||||
in this case, closer neighbors of a query point will have a
|
||||
greater influence than neighbors which are further away.
|
||||
- [callable] : a user-defined function which accepts an
|
||||
array of distances, and returns an array of the same shape
|
||||
containing the weights.
|
||||
|
||||
Refer to the example entitled
|
||||
:ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`
|
||||
showing the impact of the `weights` parameter on the decision
|
||||
boundary.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
p : float, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is equivalent
|
||||
to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
|
||||
For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
|
||||
to be positive.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
Metric to use for distance computation. Default is "minkowski", which
|
||||
results in the standard Euclidean distance when p = 2. See the
|
||||
documentation of `scipy.spatial.distance
|
||||
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
|
||||
the metrics listed in
|
||||
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
|
||||
values.
|
||||
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`, in which
|
||||
case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
If metric is a callable function, it takes two arrays representing 1D
|
||||
vectors as inputs and must return one value indicating the distance
|
||||
between those vectors. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
Doesn't affect :meth:`fit` method.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
classes_ : array of shape (n_classes,)
|
||||
Class labels known to the classifier
|
||||
|
||||
effective_metric_ : str or callable
|
||||
The distance metric used. It will be same as the `metric` parameter
|
||||
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
|
||||
'minkowski' and `p` parameter set to 2.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Additional keyword arguments for the metric function. For most metrics
|
||||
will be same with `metric_params` parameter, but may also contain the
|
||||
`p` parameter value if the `effective_metric_` attribute is set to
|
||||
'minkowski'.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_samples_fit_ : int
|
||||
Number of samples in the fitted data.
|
||||
|
||||
outputs_2d_ : bool
|
||||
False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit
|
||||
otherwise True.
|
||||
|
||||
See Also
|
||||
--------
|
||||
RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius.
|
||||
KNeighborsRegressor: Regression based on k-nearest neighbors.
|
||||
RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius.
|
||||
NearestNeighbors: Unsupervised learner for implementing neighbor searches.
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
.. warning::
|
||||
|
||||
Regarding the Nearest Neighbors algorithms, if it is found that two
|
||||
neighbors, neighbor `k+1` and `k`, have identical distances
|
||||
but different labels, the results will depend on the ordering of the
|
||||
training data.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [1], [2], [3]]
|
||||
>>> y = [0, 0, 1, 1]
|
||||
>>> from sklearn.neighbors import KNeighborsClassifier
|
||||
>>> neigh = KNeighborsClassifier(n_neighbors=3)
|
||||
>>> neigh.fit(X, y)
|
||||
KNeighborsClassifier(...)
|
||||
>>> print(neigh.predict([[1.1]]))
|
||||
[0]
|
||||
>>> print(neigh.predict_proba([[0.9]]))
|
||||
[[0.666 0.333]]
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {**NeighborsBase._parameter_constraints}
|
||||
_parameter_constraints.pop("radius")
|
||||
_parameter_constraints.update(
|
||||
{"weights": [StrOptions({"uniform", "distance"}), callable, None]}
|
||||
)
|
||||
|
||||
def __init__(
    self,
    n_neighbors=5,
    *,
    weights="uniform",
    algorithm="auto",
    leaf_size=30,
    p=2,
    metric="minkowski",
    metric_params=None,
    n_jobs=None,
):
    # All neighbor-search configuration is handled by NeighborsBase;
    # only `weights` is specific to the classifier.
    super().__init__(
        n_neighbors=n_neighbors,
        algorithm=algorithm,
        leaf_size=leaf_size,
        metric=metric,
        p=p,
        metric_params=metric_params,
        n_jobs=n_jobs,
    )
    self.weights = weights
|
||||
|
||||
@_fit_context(
    # KNeighborsClassifier.metric is not validated yet
    prefer_skip_nested_validation=False
)
def fit(self, X, y):
    """Fit the k-nearest neighbors classifier from the training dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples) if metric='precomputed'
        Training data.

    y : {array-like, sparse matrix} of shape (n_samples,) or \
            (n_samples, n_outputs)
        Target values.

    Returns
    -------
    self : KNeighborsClassifier
        The fitted k-nearest neighbors classifier.
    """
    # Index building and target encoding are shared with the other
    # neighbors estimators via NeighborsBase._fit.
    return self._fit(X, y)
|
||||
|
||||
def predict(self, X):
    """Predict the class labels for the provided data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_queries, n_features), \
            or (n_queries, n_indexed) if metric == 'precomputed', or None
        Test samples. If `None`, predictions for all indexed points are
        returned; in this case, points are not considered their own
        neighbors.

    Returns
    -------
    y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
        Class labels for each data sample.
    """
    check_is_fitted(self, "_fit_method")
    if self.weights == "uniform":
        if self._fit_method == "brute" and ArgKminClassMode.is_usable_for(
            X, self._fit_X, self.metric
        ):
            # Delegate to predict_proba, which can use the fused
            # ArgKminClassMode brute-force path, and take the argmax
            # per output.
            probabilities = self.predict_proba(X)
            if self.outputs_2d_:
                return np.stack(
                    [
                        self.classes_[idx][np.argmax(probas, axis=1)]
                        for idx, probas in enumerate(probabilities)
                    ],
                    axis=1,
                )
            return self.classes_[np.argmax(probabilities, axis=1)]
        # In that case, we do not need the distances to perform
        # the weighting so we do not compute them.
        neigh_ind = self.kneighbors(X, return_distance=False)
        neigh_dist = None
    else:
        neigh_dist, neigh_ind = self.kneighbors(X)

    # Normalize the single-output case to the multi-output layout so the
    # loop below handles both uniformly.
    classes_ = self.classes_
    _y = self._y
    if not self.outputs_2d_:
        _y = self._y.reshape((-1, 1))
        classes_ = [self.classes_]

    n_outputs = len(classes_)
    # X is None means "predict for the indexed points themselves".
    n_queries = _num_samples(self._fit_X if X is None else X)
    weights = _get_weights(neigh_dist, self.weights)
    if weights is not None and _all_with_any_reduction_axis_1(weights, value=0):
        raise ValueError(
            "All neighbors of some sample is getting zero weights. "
            "Please modify 'weights' to avoid this case if you are "
            "using a user-defined function."
        )

    y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
    for k, classes_k in enumerate(classes_):
        # Majority vote (optionally weighted) among the neighbors'
        # encoded labels.
        if weights is None:
            mode, _ = _mode(_y[neigh_ind, k], axis=1)
        else:
            mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)

        mode = np.asarray(mode.ravel(), dtype=np.intp)
        # Map encoded label indices back to the original class labels.
        y_pred[:, k] = classes_k.take(mode)

    if not self.outputs_2d_:
        y_pred = y_pred.ravel()

    return y_pred
|
||||
|
||||
def predict_proba(self, X):
    """Return probability estimates for the test data X.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_queries, n_features), \
            or (n_queries, n_indexed) if metric == 'precomputed', or None
        Test samples. If `None`, predictions for all indexed points are
        returned; in this case, points are not considered their own
        neighbors.

    Returns
    -------
    p : ndarray of shape (n_queries, n_classes), or a list of n_outputs \
            of such arrays if n_outputs > 1.
        The class probabilities of the input samples. Classes are ordered
        by lexicographic order.
    """
    check_is_fitted(self, "_fit_method")
    if self.weights == "uniform":
        # TODO: systematize this mapping of metric for
        # PairwiseDistancesReductions.
        metric, metric_kwargs = _adjusted_metric(
            metric=self.metric, metric_kwargs=self.metric_params, p=self.p
        )
        if (
            self._fit_method == "brute"
            and ArgKminClassMode.is_usable_for(X, self._fit_X, metric)
            # TODO: Implement efficient multi-output solution
            and not self.outputs_2d_
        ):
            if self.metric == "precomputed":
                X = _check_precomputed(X)
            else:
                X = validate_data(
                    self, X, accept_sparse="csr", reset=False, order="C"
                )

            # Fused brute-force path: neighbor search and class-mode
            # accumulation happen in a single reduction.
            probabilities = ArgKminClassMode.compute(
                X,
                self._fit_X,
                k=self.n_neighbors,
                weights=self.weights,
                Y_labels=self._y,
                unique_Y_labels=self.classes_,
                metric=metric,
                metric_kwargs=metric_kwargs,
                # `strategy="parallel_on_X"` has in practice been shown
                # to be more efficient than `strategy="parallel_on_Y"`
                # on many combinations of datasets.
                # Hence, we choose to enforce it here.
                # For more information, see:
                # https://github.com/scikit-learn/scikit-learn/pull/24076#issuecomment-1445258342
                # TODO: adapt the heuristic for `strategy="auto"` for
                # `ArgKminClassMode` and use `strategy="auto"`.
                strategy="parallel_on_X",
            )
            return probabilities

        # In that case, we do not need the distances to perform
        # the weighting so we do not compute them.
        neigh_ind = self.kneighbors(X, return_distance=False)
        neigh_dist = None
    else:
        neigh_dist, neigh_ind = self.kneighbors(X)

    # Normalize the single-output case to the multi-output layout so the
    # loop below handles both uniformly.
    classes_ = self.classes_
    _y = self._y
    if not self.outputs_2d_:
        _y = self._y.reshape((-1, 1))
        classes_ = [self.classes_]

    # X is None means "predict for the indexed points themselves".
    n_queries = _num_samples(self._fit_X if X is None else X)

    weights = _get_weights(neigh_dist, self.weights)
    if weights is None:
        # Uniform weighting: every neighbor contributes one vote.
        weights = np.ones_like(neigh_ind)
    elif _all_with_any_reduction_axis_1(weights, value=0):
        raise ValueError(
            "All neighbors of some sample is getting zero weights. "
            "Please modify 'weights' to avoid this case if you are "
            "using a user-defined function."
        )

    all_rows = np.arange(n_queries)
    probabilities = []
    for k, classes_k in enumerate(classes_):
        pred_labels = _y[:, k][neigh_ind]
        proba_k = np.zeros((n_queries, classes_k.size))

        # a simple ':' index doesn't work right
        for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
            proba_k[all_rows, idx] += weights[:, i]

        # normalize 'votes' into real [0,1] probabilities
        normalizer = proba_k.sum(axis=1)[:, np.newaxis]
        proba_k /= normalizer

        probabilities.append(proba_k)

    if not self.outputs_2d_:
        probabilities = probabilities[0]

    return probabilities
|
||||
|
||||
# This function is defined here only to modify the parent docstring
|
||||
# and add information about X=None
|
||||
# This function is defined here only to modify the parent docstring
# and add information about X=None
def score(self, X, y, sample_weight=None):
    """
    Return the mean accuracy on the given test data and labels.

    In multi-label classification, this is the subset accuracy
    which is a harsh metric since you require for each sample that
    each label set be correctly predicted.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features), or None
        Test samples. If `None`, predictions for all indexed points are
        used; in this case, points are not considered their own
        neighbors. This means that `knn.fit(X, y).score(None, y)`
        implicitly performs a leave-one-out cross-validation procedure
        and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())`
        but typically much faster.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        True labels for `X`.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    score : float
        Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
    """
    # Behavior is entirely inherited from ClassifierMixin.score.
    return super().score(X, y, sample_weight)
|
||||
|
||||
def __sklearn_tags__(self):
    # Extend the inherited tags with classifier-specific capabilities.
    tags = super().__sklearn_tags__()
    tags.classifier_tags.multi_label = True
    # With a precomputed metric, X is a (query, indexed) distance matrix,
    # i.e. pairwise input.
    tags.input_tags.pairwise = self.metric == "precomputed"
    return tags
|
||||
|
||||
|
||||
class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase):
|
||||
"""Classifier implementing a vote among neighbors within a given radius.
|
||||
|
||||
Read more in the :ref:`User Guide <classification>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
radius : float, default=1.0
|
||||
Range of parameter space to use by default for :meth:`radius_neighbors`
|
||||
queries.
|
||||
|
||||
weights : {'uniform', 'distance'}, callable or None, default='uniform'
|
||||
Weight function used in prediction. Possible values:
|
||||
|
||||
- 'uniform' : uniform weights. All points in each neighborhood
|
||||
are weighted equally.
|
||||
- 'distance' : weight points by the inverse of their distance.
|
||||
in this case, closer neighbors of a query point will have a
|
||||
greater influence than neighbors which are further away.
|
||||
- [callable] : a user-defined function which accepts an
|
||||
array of distances, and returns an array of the same shape
|
||||
containing the weights.
|
||||
|
||||
Uniform weights are used by default.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
p : float, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
This parameter is expected to be positive.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
Metric to use for distance computation. Default is "minkowski", which
|
||||
results in the standard Euclidean distance when p = 2. See the
|
||||
documentation of `scipy.spatial.distance
|
||||
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
|
||||
the metrics listed in
|
||||
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
|
||||
values.
|
||||
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`, in which
|
||||
case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
If metric is a callable function, it takes two arrays representing 1D
|
||||
vectors as inputs and must return one value indicating the distance
|
||||
between those vectors. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
outlier_label : {manual label, 'most_frequent'}, default=None
|
||||
Label for outlier samples (samples with no neighbors in given radius).
|
||||
|
||||
- manual label: str or int label (should be the same type as y)
|
||||
or list of manual labels if multi-output is used.
|
||||
- 'most_frequent' : assign the most frequent label of y to outliers.
|
||||
- None : when any outlier is detected, ValueError will be raised.
|
||||
|
||||
The outlier label should be selected from among the unique 'Y' labels.
|
||||
If it is specified with a different value a warning will be raised and
|
||||
all class probabilities of outliers will be assigned to be 0.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
classes_ : ndarray of shape (n_classes,)
|
||||
Class labels known to the classifier.
|
||||
|
||||
effective_metric_ : str or callable
|
||||
The distance metric used. It will be same as the `metric` parameter
|
||||
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
|
||||
'minkowski' and `p` parameter set to 2.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Additional keyword arguments for the metric function. For most metrics
|
||||
will be same with `metric_params` parameter, but may also contain the
|
||||
`p` parameter value if the `effective_metric_` attribute is set to
|
||||
'minkowski'.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_samples_fit_ : int
|
||||
Number of samples in the fitted data.
|
||||
|
||||
outlier_label_ : int or array-like of shape (n_class,)
|
||||
Label which is given for outlier samples (samples with no neighbors
|
||||
on given radius).
|
||||
|
||||
outputs_2d_ : bool
|
||||
False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit
|
||||
otherwise True.
|
||||
|
||||
See Also
|
||||
--------
|
||||
KNeighborsClassifier : Classifier implementing the k-nearest neighbors
|
||||
vote.
|
||||
RadiusNeighborsRegressor : Regression based on neighbors within a
|
||||
fixed radius.
|
||||
KNeighborsRegressor : Regression based on k-nearest neighbors.
|
||||
NearestNeighbors : Unsupervised learner for implementing neighbor
|
||||
searches.
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [1], [2], [3]]
|
||||
>>> y = [0, 0, 1, 1]
|
||||
>>> from sklearn.neighbors import RadiusNeighborsClassifier
|
||||
>>> neigh = RadiusNeighborsClassifier(radius=1.0)
|
||||
>>> neigh.fit(X, y)
|
||||
RadiusNeighborsClassifier(...)
|
||||
>>> print(neigh.predict([[1.5]]))
|
||||
[0]
|
||||
>>> print(neigh.predict_proba([[1.0]]))
|
||||
[[0.66666667 0.33333333]]
|
||||
"""
|
||||
|
||||
# Constraints consumed by scikit-learn's parameter-validation machinery:
# start from the shared neighbors constraints, then add the
# classifier-specific parameters.
_parameter_constraints: dict = {
    **NeighborsBase._parameter_constraints,
    "weights": [StrOptions({"uniform", "distance"}), callable, None],
    "outlier_label": [Integral, str, "array-like", None],
}
# Radius-based estimators do not expose `n_neighbors`.
_parameter_constraints.pop("n_neighbors")
|
||||
|
||||
def __init__(
    self,
    radius=1.0,
    *,
    weights="uniform",
    algorithm="auto",
    leaf_size=30,
    p=2,
    metric="minkowski",
    outlier_label=None,
    metric_params=None,
    n_jobs=None,
):
    # Neighbor-search configuration is handled by the base class; only the
    # classification-specific parameters are stored here.
    super().__init__(
        radius=radius,
        algorithm=algorithm,
        leaf_size=leaf_size,
        metric=metric,
        p=p,
        metric_params=metric_params,
        n_jobs=n_jobs,
    )
    # Stored unvalidated, per scikit-learn convention; validation happens
    # at fit time.
    self.weights = weights
    self.outlier_label = outlier_label
|
||||
|
||||
@_fit_context(
    # RadiusNeighborsClassifier.metric is not validated yet
    prefer_skip_nested_validation=False
)
def fit(self, X, y):
    """Fit the radius neighbors classifier from the training dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples) if metric='precomputed'
        Training data.

    y : {array-like, sparse matrix} of shape (n_samples,) or \
            (n_samples, n_outputs)
        Target values.

    Returns
    -------
    self : RadiusNeighborsClassifier
        The fitted radius neighbors classifier.
    """
    # Index the training data; this sets self.classes_, self._y and
    # self.outputs_2d_ among others.
    self._fit(X, y)

    classes_ = self.classes_
    _y = self._y
    if not self.outputs_2d_:
        # View single-output targets as (n_samples, 1) so the
        # multi-output code below can be shared.
        _y = self._y.reshape((-1, 1))
        classes_ = [self.classes_]

    if self.outlier_label is None:
        # No label configured: predict-time outliers will raise an error.
        outlier_label_ = None

    elif self.outlier_label == "most_frequent":
        outlier_label_ = []
        # iterate over multi-output, get the most frequent label for each
        # output.
        for k, classes_k in enumerate(classes_):
            label_count = np.bincount(_y[:, k])
            outlier_label_.append(classes_k[label_count.argmax()])

    else:
        if _is_arraylike(self.outlier_label) and not isinstance(
            self.outlier_label, str
        ):
            # One manual label per output was provided.
            if len(self.outlier_label) != len(classes_):
                raise ValueError(
                    "The length of outlier_label: {} is "
                    "inconsistent with the output "
                    "length: {}".format(self.outlier_label, len(classes_))
                )
            outlier_label_ = self.outlier_label
        else:
            # A single scalar label is replicated across all outputs.
            outlier_label_ = [self.outlier_label] * len(classes_)

        for classes, label in zip(classes_, outlier_label_):
            if _is_arraylike(label) and not isinstance(label, str):
                # ensure the outlier label for each output is a scalar.
                raise TypeError(
                    "The outlier_label of classes {} is "
                    "supposed to be a scalar, got "
                    "{}.".format(classes, label)
                )
            if np.append(classes, label).dtype != classes.dtype:
                # ensure the dtype of outlier label is consistent with y.
                raise TypeError(
                    "The dtype of outlier_label {} is "
                    "inconsistent with classes {} in "
                    "y.".format(label, classes)
                )

    self.outlier_label_ = outlier_label_

    return self
|
||||
|
||||
def predict(self, X):
    """Predict the class labels for the provided data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_queries, n_features), \
            or (n_queries, n_indexed) if metric == 'precomputed', or None
        Test samples. If `None`, predictions for all indexed points are
        returned; in this case, points are not considered their own
        neighbors.

    Returns
    -------
    y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
        Class labels for each data sample.
    """

    probs = self.predict_proba(X)
    classes_ = self.classes_

    if not self.outputs_2d_:
        # Wrap single-output results so the loop below handles both cases.
        probs = [probs]
        classes_ = [self.classes_]

    n_outputs = len(classes_)
    n_queries = probs[0].shape[0]
    y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)

    for k, prob in enumerate(probs):
        # iterate over multi-output, assign labels based on probabilities
        # of each output.
        max_prob_index = prob.argmax(axis=1)
        y_pred[:, k] = classes_[k].take(max_prob_index)

        # All-zero probability rows come from outliers whose configured
        # label is not among the training classes (see predict_proba);
        # assign them the stored outlier label directly.
        outlier_zero_probs = (prob == 0).all(axis=1)
        if outlier_zero_probs.any():
            zero_prob_index = np.flatnonzero(outlier_zero_probs)
            y_pred[zero_prob_index, k] = self.outlier_label_[k]

    if not self.outputs_2d_:
        # Flatten back to (n_queries,) for single-output targets.
        y_pred = y_pred.ravel()

    return y_pred
|
||||
|
||||
def predict_proba(self, X):
    """Return probability estimates for the test data X.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_queries, n_features), \
            or (n_queries, n_indexed) if metric == 'precomputed', or None
        Test samples. If `None`, predictions for all indexed points are
        returned; in this case, points are not considered their own
        neighbors.

    Returns
    -------
    p : ndarray of shape (n_queries, n_classes), or a list of \
            n_outputs of such arrays if n_outputs > 1.
        The class probabilities of the input samples. Classes are ordered
        by lexicographic order.
    """
    check_is_fitted(self, "_fit_method")
    n_queries = _num_samples(self._fit_X if X is None else X)

    metric, metric_kwargs = _adjusted_metric(
        metric=self.metric, metric_kwargs=self.metric_params, p=self.p
    )

    # Fast path: delegate to the dedicated backend when it supports this
    # combination of weights, fit method, output shape and metric.
    if (
        self.weights == "uniform"
        and self._fit_method == "brute"
        and not self.outputs_2d_
        and RadiusNeighborsClassMode.is_usable_for(X, self._fit_X, metric)
    ):
        probabilities = RadiusNeighborsClassMode.compute(
            X=X,
            Y=self._fit_X,
            radius=self.radius,
            weights=self.weights,
            Y_labels=self._y,
            unique_Y_labels=self.classes_,
            outlier_label=self.outlier_label,
            metric=metric,
            metric_kwargs=metric_kwargs,
            strategy="parallel_on_X",
            # `strategy="parallel_on_X"` has in practice be shown
            # to be more efficient than `strategy="parallel_on_Y``
            # on many combination of datasets.
            # Hence, we choose to enforce it here.
            # For more information, see:
            # https://github.com/scikit-learn/scikit-learn/pull/26828/files#r1282398471
        )
        return probabilities

    neigh_dist, neigh_ind = self.radius_neighbors(X)
    # Queries with no neighbor inside the radius are outliers.
    outlier_mask = np.zeros(n_queries, dtype=bool)
    outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]
    outliers = np.flatnonzero(outlier_mask)
    inliers = np.flatnonzero(~outlier_mask)

    classes_ = self.classes_
    _y = self._y
    if not self.outputs_2d_:
        # View single-output targets as (n_samples, 1) so the
        # multi-output code below can be shared.
        _y = self._y.reshape((-1, 1))
        classes_ = [self.classes_]

    if self.outlier_label_ is None and outliers.size > 0:
        raise ValueError(
            "No neighbors found for test samples %r, "
            "you can try using larger radius, "
            "giving a label for outliers, "
            "or considering removing them from your dataset." % outliers
        )

    weights = _get_weights(neigh_dist, self.weights)
    if weights is not None:
        # Only inlier rows take part in the weighted vote below.
        weights = weights[inliers]

    probabilities = []
    # iterate over multi-output, measure probabilities of the k-th output.
    for k, classes_k in enumerate(classes_):
        # Object array: each entry holds the (variable-length) neighbor
        # labels of one query point.
        pred_labels = np.zeros(len(neigh_ind), dtype=object)
        pred_labels[:] = [_y[ind, k] for ind in neigh_ind]

        proba_k = np.zeros((n_queries, classes_k.size))
        proba_inl = np.zeros((len(inliers), classes_k.size))

        # samples have different size of neighbors within the same radius
        if weights is None:
            for i, idx in enumerate(pred_labels[inliers]):
                proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size)
        else:
            for i, idx in enumerate(pred_labels[inliers]):
                proba_inl[i, :] = np.bincount(
                    idx, weights[i], minlength=classes_k.size
                )
        proba_k[inliers, :] = proba_inl

        if outliers.size > 0:
            # Give outliers full probability on the configured label when
            # it is a known class; otherwise warn and leave all zeros.
            _outlier_label = self.outlier_label_[k]
            label_index = np.flatnonzero(classes_k == _outlier_label)
            if label_index.size == 1:
                proba_k[outliers, label_index[0]] = 1.0
            else:
                warnings.warn(
                    "Outlier label {} is not in training "
                    "classes. All class probabilities of "
                    "outliers will be assigned with 0."
                    "".format(self.outlier_label_[k])
                )

        # normalize 'votes' into real [0,1] probabilities
        normalizer = proba_k.sum(axis=1)[:, np.newaxis]
        # Avoid division by zero for all-zero rows (unlabeled outliers).
        normalizer[normalizer == 0.0] = 1.0
        proba_k /= normalizer

        probabilities.append(proba_k)

    if not self.outputs_2d_:
        probabilities = probabilities[0]

    return probabilities
|
||||
|
||||
# Overridden solely so the inherited docstring can be extended with the
# semantics of calling score with X=None.
def score(self, X, y, sample_weight=None):
    """Compute the mean accuracy of ``self.predict(X)`` with respect to `y`.

    For multi-label targets this is the subset accuracy, a strict metric
    that counts a sample as correct only when its entire label set is
    predicted correctly.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features), or None
        Test samples. Passing `None` scores the predictions made for the
        indexed (training) points themselves, excluding each point from
        its own neighborhood. `knn.fit(X, y).score(None, y)` is therefore
        equivalent to a leave-one-out cross-validation,
        `cross_val_score(knn, X, y, cv=LeaveOneOut())`, but typically much
        faster.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground-truth labels for `X`.

    sample_weight : array-like of shape (n_samples,), default=None
        Per-sample weights.

    Returns
    -------
    score : float
        The mean accuracy.
    """
    return super().score(X, y, sample_weight=sample_weight)
|
||||
|
||||
def __sklearn_tags__(self):
    """Declare estimator capabilities for scikit-learn's tag system."""
    tags = super().__sklearn_tags__()
    # Radius-based voting handles multi-label targets natively.
    tags.classifier_tags.multi_label = True
    return tags
|
||||
@@ -0,0 +1,709 @@
|
||||
"""Nearest Neighbors graph functions"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import itertools
|
||||
|
||||
from sklearn.base import ClassNamePrefixFeaturesOutMixin, TransformerMixin, _fit_context
|
||||
from sklearn.neighbors._base import (
|
||||
VALID_METRICS,
|
||||
KNeighborsMixin,
|
||||
NeighborsBase,
|
||||
RadiusNeighborsMixin,
|
||||
)
|
||||
from sklearn.neighbors._unsupervised import NearestNeighbors
|
||||
from sklearn.utils._param_validation import (
|
||||
Integral,
|
||||
Interval,
|
||||
Real,
|
||||
StrOptions,
|
||||
validate_params,
|
||||
)
|
||||
from sklearn.utils.validation import check_is_fitted
|
||||
|
||||
|
||||
def _check_params(X, metric, p, metric_params):
|
||||
"""Check the validity of the input parameters"""
|
||||
params = zip(["metric", "p", "metric_params"], [metric, p, metric_params])
|
||||
est_params = X.get_params()
|
||||
for param_name, func_param in params:
|
||||
if func_param != est_params[param_name]:
|
||||
raise ValueError(
|
||||
"Got %s for %s, while the estimator has %s for the same parameter."
|
||||
% (func_param, param_name, est_params[param_name])
|
||||
)
|
||||
|
||||
|
||||
def _query_include_self(X, include_self, mode):
|
||||
"""Return the query based on include_self param"""
|
||||
if include_self == "auto":
|
||||
include_self = mode == "connectivity"
|
||||
|
||||
# it does not include each sample as its own neighbors
|
||||
if not include_self:
|
||||
X = None
|
||||
|
||||
return X
|
||||
|
||||
|
||||
@validate_params(
    {
        "X": ["array-like", "sparse matrix", KNeighborsMixin],
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "mode": [StrOptions({"connectivity", "distance"})],
        "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable],
        "p": [Interval(Real, 0, None, closed="right"), None],
        "metric_params": [dict, None],
        "include_self": ["boolean", StrOptions({"auto"})],
        "n_jobs": [Integral, None],
    },
    prefer_skip_nested_validation=False,  # metric is not validated yet
)
def kneighbors_graph(
    X,
    n_neighbors,
    *,
    mode="connectivity",
    metric="minkowski",
    p=2,
    metric_params=None,
    include_self=False,
    n_jobs=None,
):
    """Compute the (weighted) graph of k-Neighbors for points in X.

    Read more in the :ref:`User Guide <unsupervised_neighbors>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Sample data.

    n_neighbors : int
        Number of neighbors for each sample.

    mode : {'connectivity', 'distance'}, default='connectivity'
        Type of returned matrix: 'connectivity' will return the connectivity
        matrix with ones and zeros, and 'distance' will return the distances
        between neighbors according to the given metric.

    metric : str, default='minkowski'
        Metric to use for distance computation. Default is "minkowski", which
        results in the standard Euclidean distance when p = 2. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

    p : float, default=2
        Power parameter for the Minkowski metric. When p = 1, this is equivalent
        to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
        For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
        to be positive.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    include_self : bool or 'auto', default=False
        Whether or not to mark each sample as the first nearest neighbor to
        itself. If 'auto', then True is used for mode='connectivity' and False
        for mode='distance'.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    A : sparse matrix of shape (n_samples, n_samples)
        Graph where A[i, j] is assigned the weight of edge that
        connects i to j. The matrix is of CSR format.

    See Also
    --------
    radius_neighbors_graph: Compute the (weighted) graph of Neighbors for points in X.

    Examples
    --------
    >>> X = [[0], [3], [1]]
    >>> from sklearn.neighbors import kneighbors_graph
    >>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)
    >>> A.toarray()
    array([[1., 0., 1.],
           [0., 1., 1.],
           [1., 0., 1.]])
    """
    if not isinstance(X, KNeighborsMixin):
        # X is raw data: fit a NearestNeighbors estimator on it first.
        X = NearestNeighbors(
            n_neighbors=n_neighbors,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
        ).fit(X)
    else:
        # X is a pre-fitted estimator: its stored parameters must agree
        # with the ones requested here.
        _check_params(X, metric, p, metric_params)

    query = _query_include_self(X._fit_X, include_self, mode)
    return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode)
|
||||
|
||||
|
||||
@validate_params(
    {
        "X": ["array-like", "sparse matrix", RadiusNeighborsMixin],
        "radius": [Interval(Real, 0, None, closed="both")],
        "mode": [StrOptions({"connectivity", "distance"})],
        "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable],
        "p": [Interval(Real, 0, None, closed="right"), None],
        "metric_params": [dict, None],
        "include_self": ["boolean", StrOptions({"auto"})],
        "n_jobs": [Integral, None],
    },
    prefer_skip_nested_validation=False,  # metric is not validated yet
)
def radius_neighbors_graph(
    X,
    radius,
    *,
    mode="connectivity",
    metric="minkowski",
    p=2,
    metric_params=None,
    include_self=False,
    n_jobs=None,
):
    """Compute the (weighted) graph of Neighbors for points in X.

    Neighborhoods are restricted to the points at a distance lower than
    radius.

    Read more in the :ref:`User Guide <unsupervised_neighbors>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Sample data.

    radius : float
        Radius of neighborhoods.

    mode : {'connectivity', 'distance'}, default='connectivity'
        Type of returned matrix: 'connectivity' will return the connectivity
        matrix with ones and zeros, and 'distance' will return the distances
        between neighbors according to the given metric.

    metric : str, default='minkowski'
        Metric to use for distance computation. Default is "minkowski", which
        results in the standard Euclidean distance when p = 2. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

    p : float, default=2
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    include_self : bool or 'auto', default=False
        Whether or not to mark each sample as the first nearest neighbor to
        itself. If 'auto', then True is used for mode='connectivity' and False
        for mode='distance'.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    A : sparse matrix of shape (n_samples, n_samples)
        Graph where A[i, j] is assigned the weight of edge that connects
        i to j. The matrix is of CSR format.

    See Also
    --------
    kneighbors_graph: Compute the weighted graph of k-neighbors for points in X.

    Examples
    --------
    >>> X = [[0], [3], [1]]
    >>> from sklearn.neighbors import radius_neighbors_graph
    >>> A = radius_neighbors_graph(X, 1.5, mode='connectivity',
    ...                            include_self=True)
    >>> A.toarray()
    array([[1., 0., 1.],
           [0., 1., 0.],
           [1., 0., 1.]])
    """
    if not isinstance(X, RadiusNeighborsMixin):
        # X is raw data: fit a NearestNeighbors estimator on it first.
        X = NearestNeighbors(
            radius=radius,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
        ).fit(X)
    else:
        # X is a pre-fitted estimator: its stored parameters must agree
        # with the ones requested here.
        _check_params(X, metric, p, metric_params)

    query = _query_include_self(X._fit_X, include_self, mode)
    return X.radius_neighbors_graph(query, radius, mode)
|
||||
|
||||
|
||||
class KNeighborsTransformer(
|
||||
ClassNamePrefixFeaturesOutMixin, KNeighborsMixin, TransformerMixin, NeighborsBase
|
||||
):
|
||||
"""Transform X into a (weighted) graph of k nearest neighbors.
|
||||
|
||||
The transformed data is a sparse graph as returned by kneighbors_graph.
|
||||
|
||||
Read more in the :ref:`User Guide <neighbors_transformer>`.
|
||||
|
||||
.. versionadded:: 0.22
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mode : {'distance', 'connectivity'}, default='distance'
|
||||
Type of returned matrix: 'connectivity' will return the connectivity
|
||||
matrix with ones and zeros, and 'distance' will return the distances
|
||||
between neighbors according to the given metric.
|
||||
|
||||
n_neighbors : int, default=5
|
||||
Number of neighbors for each sample in the transformed sparse graph.
|
||||
For compatibility reasons, as each sample is considered as its own
|
||||
neighbor, one extra neighbor will be computed when mode == 'distance'.
|
||||
In this case, the sparse graph contains (n_neighbors + 1) neighbors.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
Metric to use for distance computation. Default is "minkowski", which
|
||||
results in the standard Euclidean distance when p = 2. See the
|
||||
documentation of `scipy.spatial.distance
|
||||
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
|
||||
the metrics listed in
|
||||
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
|
||||
values.
|
||||
|
||||
If metric is a callable function, it takes two arrays representing 1D
|
||||
vectors as inputs and must return one value indicating the distance
|
||||
between those vectors. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
Distance matrices are not supported.
|
||||
|
||||
p : float, default=2
|
||||
Parameter for the Minkowski metric from
|
||||
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
This parameter is expected to be positive.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
If ``-1``, then the number of jobs is set to the number of CPU cores.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
effective_metric_ : str or callable
|
||||
The distance metric used. It will be same as the `metric` parameter
|
||||
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
|
||||
'minkowski' and `p` parameter set to 2.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Additional keyword arguments for the metric function. For most metrics
|
||||
will be same with `metric_params` parameter, but may also contain the
|
||||
`p` parameter value if the `effective_metric_` attribute is set to
|
||||
'minkowski'.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_samples_fit_ : int
|
||||
Number of samples in the fitted data.
|
||||
|
||||
See Also
|
||||
--------
|
||||
kneighbors_graph : Compute the weighted graph of k-neighbors for
|
||||
points in X.
|
||||
RadiusNeighborsTransformer : Transform X into a weighted graph of
|
||||
neighbors nearer than a radius.
|
||||
|
||||
Notes
|
||||
-----
|
||||
For an example of using :class:`~sklearn.neighbors.KNeighborsTransformer`
|
||||
in combination with :class:`~sklearn.manifold.TSNE` see
|
||||
:ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.datasets import load_wine
|
||||
>>> from sklearn.neighbors import KNeighborsTransformer
|
||||
>>> X, _ = load_wine(return_X_y=True)
|
||||
>>> X.shape
|
||||
(178, 13)
|
||||
>>> transformer = KNeighborsTransformer(n_neighbors=5, mode='distance')
|
||||
>>> X_dist_graph = transformer.fit_transform(X)
|
||||
>>> X_dist_graph.shape
|
||||
(178, 178)
|
||||
"""
|
||||
|
||||
# Constraints consumed by scikit-learn's parameter-validation machinery:
# the shared neighbors constraints plus the transformer-specific `mode`.
_parameter_constraints: dict = {
    **NeighborsBase._parameter_constraints,
    "mode": [StrOptions({"distance", "connectivity"})],
}
# The k-neighbors transformer does not expose `radius`.
_parameter_constraints.pop("radius")
|
||||
|
||||
def __init__(
    self,
    *,
    mode="distance",
    n_neighbors=5,
    algorithm="auto",
    leaf_size=30,
    metric="minkowski",
    p=2,
    metric_params=None,
    n_jobs=None,
):
    """Store hyperparameters; the shared neighbor-search parameters are
    forwarded to ``NeighborsBase``.

    No validation or computation happens here (scikit-learn convention);
    everything is deferred to :meth:`fit`.
    """
    self.mode = mode
    # `radius` is fixed to None: this transformer is purely k-neighbors based.
    super().__init__(
        n_neighbors=n_neighbors,
        radius=None,
        algorithm=algorithm,
        leaf_size=leaf_size,
        metric=metric,
        p=p,
        metric_params=metric_params,
        n_jobs=n_jobs,
    )
|
||||
|
||||
@_fit_context(
    # KNeighborsTransformer.metric is not validated yet
    prefer_skip_nested_validation=False
)
def fit(self, X, y=None):
    """Fit the k-nearest neighbors transformer from the training dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples) if metric='precomputed'
        Training data.

    y : Ignored
        Not used, present for API consistency by convention.

    Returns
    -------
    self : KNeighborsTransformer
        The fitted k-nearest neighbors transformer.
    """
    self._fit(X)
    # transform() produces one output column per fitted sample, so the
    # number of output features equals the fitted sample count.
    self._n_features_out = self.n_samples_fit_
    return self
|
||||
|
||||
def transform(self, X):
    """Compute the (weighted) graph of Neighbors for points in X.

    Parameters
    ----------
    X : array-like of shape (n_samples_transform, n_features)
        Sample data.

    Returns
    -------
    Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)
        Xt[i, j] is assigned the weight of edge that connects i to j.
        Only the neighbors have an explicit value.
        The diagonal is always explicit.
        The matrix is of CSR format.
    """
    check_is_fitted(self)
    if self.mode == "distance":
        # Each query point is its own nearest neighbor at distance 0, so
        # request one extra neighbor to keep the diagonal explicit while
        # still returning `n_neighbors` informative edges.
        n_neighbors = self.n_neighbors + 1
    else:
        n_neighbors = self.n_neighbors
    return self.kneighbors_graph(X, mode=self.mode, n_neighbors=n_neighbors)
|
||||
|
||||
def fit_transform(self, X, y=None):
    """Fit to data, then transform it.

    Fits transformer to X and y with optional parameters fit_params
    and returns a transformed version of X.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training set.

    y : Ignored
        Not used, present for API consistency by convention.

    Returns
    -------
    Xt : sparse matrix of shape (n_samples, n_samples)
        Xt[i, j] is assigned the weight of edge that connects i to j.
        Only the neighbors have an explicit value.
        The diagonal is always explicit.
        The matrix is of CSR format.
    """
    # Equivalent to TransformerMixin.fit_transform but avoids validating
    # and indexing X twice.
    self.fit(X)
    return self.transform(X)
|
||||
|
||||
|
||||
class RadiusNeighborsTransformer(
    ClassNamePrefixFeaturesOutMixin,
    RadiusNeighborsMixin,
    TransformerMixin,
    NeighborsBase,
):
    """Transform X into a (weighted) graph of neighbors nearer than a radius.

    The transformed data is a sparse graph as returned by
    `radius_neighbors_graph`.

    Read more in the :ref:`User Guide <neighbors_transformer>`.

    .. versionadded:: 0.22

    Parameters
    ----------
    mode : {'distance', 'connectivity'}, default='distance'
        Type of returned matrix: 'connectivity' will return the connectivity
        matrix with ones and zeros, and 'distance' will return the distances
        between neighbors according to the given metric.

    radius : float, default=1.0
        Radius of neighborhood in the transformed sparse graph.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to BallTree or KDTree.  This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree.  The optimal value depends on the
        nature of the problem.

    metric : str or callable, default='minkowski'
        Metric to use for distance computation. Default is "minkowski", which
        results in the standard Euclidean distance when p = 2. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Distance matrices are not supported.

    p : float, default=2
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        This parameter is expected to be positive.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    Attributes
    ----------
    effective_metric_ : str or callable
        The distance metric used. It will be same as the `metric` parameter
        or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
        'minkowski' and `p` parameter set to 2.

    effective_metric_params_ : dict
        Additional keyword arguments for the metric function. For most metrics
        will be same with `metric_params` parameter, but may also contain the
        `p` parameter value if the `effective_metric_` attribute is set to
        'minkowski'.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_samples_fit_ : int
        Number of samples in the fitted data.

    See Also
    --------
    kneighbors_graph : Compute the weighted graph of k-neighbors for
        points in X.
    KNeighborsTransformer : Transform X into a weighted graph of k
        nearest neighbors.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.datasets import load_wine
    >>> from sklearn.cluster import DBSCAN
    >>> from sklearn.neighbors import RadiusNeighborsTransformer
    >>> from sklearn.pipeline import make_pipeline
    >>> X, _ = load_wine(return_X_y=True)
    >>> estimator = make_pipeline(
    ...     RadiusNeighborsTransformer(radius=42.0, mode='distance'),
    ...     DBSCAN(eps=25.0, metric='precomputed'))
    >>> X_clustered = estimator.fit_predict(X)
    >>> clusters, counts = np.unique(X_clustered, return_counts=True)
    >>> print(counts)
    [ 29  15 111  11  12]
    """

    # Reuse the shared neighbor-search constraints; this estimator adds
    # `mode` and has no `n_neighbors` parameter (radius-based only).
    _parameter_constraints: dict = {
        **NeighborsBase._parameter_constraints,
        "mode": [StrOptions({"distance", "connectivity"})],
    }
    _parameter_constraints.pop("n_neighbors")

    def __init__(
        self,
        *,
        mode="distance",
        radius=1.0,
        algorithm="auto",
        leaf_size=30,
        metric="minkowski",
        p=2,
        metric_params=None,
        n_jobs=None,
    ):
        """Store hyperparameters; no validation happens until :meth:`fit`."""
        self.mode = mode
        # `n_neighbors` is fixed to None: this transformer is radius based.
        super().__init__(
            n_neighbors=None,
            radius=radius,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
        )

    @_fit_context(
        # RadiusNeighborsTransformer.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None):
        """Fit the radius neighbors transformer from the training dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples) if metric='precomputed'
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : RadiusNeighborsTransformer
            The fitted radius neighbors transformer.
        """
        self._fit(X)
        # transform() yields one output column per fitted sample.
        self._n_features_out = self.n_samples_fit_
        return self

    def transform(self, X):
        """Compute the (weighted) graph of Neighbors for points in X.

        Parameters
        ----------
        X : array-like of shape (n_samples_transform, n_features)
            Sample data.

        Returns
        -------
        Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)
            Xt[i, j] is assigned the weight of edge that connects i to j.
            Only the neighbors have an explicit value.
            The diagonal is always explicit.
            The matrix is of CSR format.
        """
        check_is_fitted(self)
        # sort_results=True guarantees row-sorted distances, which
        # downstream consumers of the precomputed graph can rely on.
        return self.radius_neighbors_graph(X, mode=self.mode, sort_results=True)

    def fit_transform(self, X, y=None):
        """Fit to data, then transform it.

        Fits transformer to X and y with optional parameters fit_params
        and returns a transformed version of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training set.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        Xt : sparse matrix of shape (n_samples, n_samples)
            Xt[i, j] is assigned the weight of edge that connects i to j.
            Only the neighbors have an explicit value.
            The diagonal is always explicit.
            The matrix is of CSR format.
        """
        # Avoids validating X twice compared to TransformerMixin's default.
        self.fit(X)
        return self.transform(X)
|
||||
Binary file not shown.
@@ -0,0 +1,336 @@
|
||||
{{py:

# Generated file: _kd_tree.pyx

# Tempita template: the code below is expanded once per
# (name_suffix, input dtype) pair to produce 64-bit and 32-bit variants.
implementation_specific_values = [
    # The values are arranged as follows:
    #
    # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
    #
    ('64', 'float64_t', 'np.float64'),
    ('32', 'float32_t', 'np.float32')
]

# By Jake Vanderplas (2013) <jakevdp@cs.washington.edu>
# written for the scikit-learn project
# SPDX-License-Identifier: BSD-3-Clause

}}


__all__ = ['KDTree', 'KDTree64', 'KDTree32']

{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}

# Substitutions applied to the shared CLASS_DOC template in _binary_tree.pxi.
DOC_DICT{{name_suffix}} = {
    'BinaryTree': 'KDTree{{name_suffix}}',
    'binary_tree': 'kd_tree{{name_suffix}}',
}

# Axis-aligned rectangular bounds only support these metrics.
VALID_METRICS{{name_suffix}} = [
    'EuclideanDistance{{name_suffix}}',
    'ManhattanDistance{{name_suffix}}',
    'ChebyshevDistance{{name_suffix}}',
    'MinkowskiDistance{{name_suffix}}'
]

{{endfor}}

# Shared BinaryTree implementation; specialized by the functions below.
include "_binary_tree.pxi"
|
||||
|
||||
{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}

# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}}
# The KD-tree behavior comes entirely from the module-level specialization
# functions below (allocate_data, init_node, min/max distance helpers).
cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}):
    __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}})
    pass

{{endfor}}
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# The functions below specialized the Binary Tree as a KD Tree
|
||||
#
|
||||
# Note that these functions use the concept of "reduced distance".
|
||||
# The reduced distance, defined for some metrics, is a quantity which
|
||||
# is more efficient to compute than the distance, but preserves the
|
||||
# relative rankings of the true distance. For example, the reduced
|
||||
# distance for the Euclidean metric is the squared-euclidean distance.
|
||||
# For some metrics, the reduced distance is simply the distance.
|
||||
|
||||
{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
|
||||
|
||||
cdef int allocate_data{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t n_nodes,
    intp_t n_features,
) except -1:
    """Allocate arrays needed for the KD Tree"""
    # node_bounds[0] holds per-node lower bounds, node_bounds[1] upper
    # bounds, one row per node and one column per feature.
    tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}})
    return 0
|
||||
|
||||
|
||||
cdef int init_node{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    NodeData_t[::1] node_data,
    intp_t i_node,
    intp_t idx_start,
    intp_t idx_end,
) except -1:
    """Initialize the node for the dataset stored in tree.data

    Computes the axis-aligned bounding box of the points whose (permuted)
    indices lie in [idx_start, idx_end), and the node radius under the
    tree's Minkowski-type metric.
    """
    cdef intp_t n_features = tree.data.shape[1]
    cdef intp_t i, j
    cdef float64_t rad = 0

    # Raw pointers into this node's rows of the bounds array and into the
    # (row-major) data / index arrays, to avoid per-access bounds checks.
    cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0]
    cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0]
    cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0]
    cdef const intp_t* idx_array = &tree.idx_array[0]

    cdef const {{INPUT_DTYPE_t}}* data_row

    # determine Node bounds
    for j in range(n_features):
        lower_bounds[j] = INF
        upper_bounds[j] = -INF

    # Compute the actual data range. At build time, this is slightly
    # slower than using the previously-computed bounds of the parent node,
    # but leads to more compact trees and thus faster queries.
    for i in range(idx_start, idx_end):
        data_row = data + idx_array[i] * n_features
        for j in range(n_features):
            lower_bounds[j] = fmin(lower_bounds[j], data_row[j])
            upper_bounds[j] = fmax(upper_bounds[j], data_row[j])

    # Accumulate the half-widths of the box: max for p=inf (Chebyshev),
    # sum of |.|**p otherwise (reduced Minkowski form).
    for j in range(n_features):
        if tree.dist_metric.p == INF:
            rad = fmax(rad, 0.5 * (upper_bounds[j] - lower_bounds[j]))
        else:
            rad += pow(0.5 * abs(upper_bounds[j] - lower_bounds[j]),
                       tree.dist_metric.p)

    node_data[i_node].idx_start = idx_start
    node_data[i_node].idx_end = idx_end

    # The radius will hold the size of the circumscribed hypersphere measured
    # with the specified metric: in querying, this is used as a measure of the
    # size of each node when deciding which nodes to split.
    node_data[i_node].radius = pow(rad, 1. / tree.dist_metric.p)
    return 0
|
||||
|
||||
|
||||
cdef float64_t min_rdist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
) except -1 nogil:
    """Compute the minimum reduced-distance between a point and a node

    The reduced distance is e.g. the squared Euclidean distance for p=2:
    cheaper than the true distance but preserving the ordering.
    """
    cdef intp_t n_features = tree.data.shape[1]
    cdef float64_t d, d_lo, d_hi, rdist=0.0
    cdef intp_t j

    if tree.dist_metric.p == INF:
        for j in range(n_features):
            # d_lo > 0 iff pt is below the box; d_hi > 0 iff above it.
            d_lo = tree.node_bounds[0, i_node, j] - pt[j]
            d_hi = pt[j] - tree.node_bounds[1, i_node, j]
            d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi))
            rdist = fmax(rdist, 0.5 * d)
    else:
        # here we'll use the fact that x + abs(x) = 2 * max(x, 0)
        for j in range(n_features):
            d_lo = tree.node_bounds[0, i_node, j] - pt[j]
            d_hi = pt[j] - tree.node_bounds[1, i_node, j]
            d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi))
            rdist += pow(0.5 * d, tree.dist_metric.p)

    return rdist
|
||||
|
||||
|
||||
cdef float64_t min_dist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
) except -1:
    """Compute the minimum distance between a point and a node"""
    if tree.dist_metric.p == INF:
        # For Chebyshev the reduced distance IS the distance.
        return min_rdist{{name_suffix}}(tree, i_node, pt)
    else:
        # Undo the Minkowski power to recover the true distance.
        return pow(
            min_rdist{{name_suffix}}(tree, i_node, pt),
            1. / tree.dist_metric.p
        )
|
||||
|
||||
|
||||
cdef float64_t max_rdist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
) except -1:
    """Compute the maximum reduced-distance between a point and a node"""
    cdef intp_t n_features = tree.data.shape[1]

    cdef float64_t d_lo, d_hi, rdist=0.0
    cdef intp_t j

    if tree.dist_metric.p == INF:
        for j in range(n_features):
            # The farthest box point lies at one of the two corners per axis.
            rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[0, i_node, j]))
            rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[1, i_node, j]))
    else:
        for j in range(n_features):
            d_lo = fabs(pt[j] - tree.node_bounds[0, i_node, j])
            d_hi = fabs(pt[j] - tree.node_bounds[1, i_node, j])
            rdist += pow(fmax(d_lo, d_hi), tree.dist_metric.p)

    return rdist
|
||||
|
||||
|
||||
cdef float64_t max_dist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
) except -1:
    """Compute the maximum distance between a point and a node"""
    if tree.dist_metric.p == INF:
        # Chebyshev: reduced distance equals the distance.
        return max_rdist{{name_suffix}}(tree, i_node, pt)
    else:
        # Undo the Minkowski power to recover the true distance.
        return pow(
            max_rdist{{name_suffix}}(tree, i_node, pt),
            1. / tree.dist_metric.p
        )
|
||||
|
||||
|
||||
cdef inline int min_max_dist{{name_suffix}}(
    BinaryTree{{name_suffix}} tree,
    intp_t i_node,
    const {{INPUT_DTYPE_t}}* pt,
    float64_t* min_dist,
    float64_t* max_dist,
) except -1 nogil:
    """Compute the minimum and maximum distance between a point and a node

    Results are written through the `min_dist` / `max_dist` out-pointers;
    one pass over the features computes both, saving a second traversal.
    """
    cdef intp_t n_features = tree.data.shape[1]

    cdef float64_t d, d_lo, d_hi
    cdef intp_t j

    min_dist[0] = 0.0
    max_dist[0] = 0.0

    if tree.dist_metric.p == INF:
        for j in range(n_features):
            d_lo = tree.node_bounds[0, i_node, j] - pt[j]
            d_hi = pt[j] - tree.node_bounds[1, i_node, j]
            d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi))
            min_dist[0] = fmax(min_dist[0], 0.5 * d)
            max_dist[0] = fmax(max_dist[0], fabs(d_lo))
            max_dist[0] = fmax(max_dist[0], fabs(d_hi))
    else:
        # as above, use the fact that x + abs(x) = 2 * max(x, 0)
        for j in range(n_features):
            d_lo = tree.node_bounds[0, i_node, j] - pt[j]
            d_hi = pt[j] - tree.node_bounds[1, i_node, j]
            d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi))
            min_dist[0] += pow(0.5 * d, tree.dist_metric.p)
            max_dist[0] += pow(fmax(fabs(d_lo), fabs(d_hi)),
                               tree.dist_metric.p)

        # Convert the accumulated reduced distances to true distances.
        min_dist[0] = pow(min_dist[0], 1. / tree.dist_metric.p)
        max_dist[0] = pow(max_dist[0], 1. / tree.dist_metric.p)

    return 0
|
||||
|
||||
|
||||
cdef inline float64_t min_rdist_dual{{name_suffix}}(
    BinaryTree{{name_suffix}} tree1,
    intp_t i_node1,
    BinaryTree{{name_suffix}} tree2,
    intp_t i_node2,
) except -1:
    """Compute the minimum reduced distance between two nodes

    Used by dual-tree traversals; measures the gap between the two
    bounding boxes along each axis (0 when they overlap on that axis).
    """
    cdef intp_t n_features = tree1.data.shape[1]

    cdef float64_t d, d1, d2, rdist=0.0
    cdef intp_t j

    if tree1.dist_metric.p == INF:
        for j in range(n_features):
            d1 = (tree1.node_bounds[0, i_node1, j]
                  - tree2.node_bounds[1, i_node2, j])
            d2 = (tree2.node_bounds[0, i_node2, j]
                  - tree1.node_bounds[1, i_node1, j])
            d = (d1 + fabs(d1)) + (d2 + fabs(d2))

            rdist = fmax(rdist, 0.5 * d)
    else:
        # here we'll use the fact that x + abs(x) = 2 * max(x, 0)
        for j in range(n_features):
            d1 = (tree1.node_bounds[0, i_node1, j]
                  - tree2.node_bounds[1, i_node2, j])
            d2 = (tree2.node_bounds[0, i_node2, j]
                  - tree1.node_bounds[1, i_node1, j])
            d = (d1 + fabs(d1)) + (d2 + fabs(d2))

            rdist += pow(0.5 * d, tree1.dist_metric.p)

    return rdist
|
||||
|
||||
|
||||
cdef inline float64_t min_dist_dual{{name_suffix}}(
    BinaryTree{{name_suffix}} tree1,
    intp_t i_node1,
    BinaryTree{{name_suffix}} tree2,
    intp_t i_node2,
) except -1:
    """Compute the minimum distance between two nodes"""
    # Delegate the reduced->true conversion to the metric object.
    return tree1.dist_metric._rdist_to_dist(
        min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2)
    )
|
||||
|
||||
|
||||
cdef inline float64_t max_rdist_dual{{name_suffix}}(
    BinaryTree{{name_suffix}} tree1,
    intp_t i_node1,
    BinaryTree{{name_suffix}} tree2,
    intp_t i_node2,
) except -1:
    """Compute the maximum reduced distance between two nodes

    Per axis, the farthest pair of box points is realized at opposite
    corners of the two bounding boxes.
    """
    cdef intp_t n_features = tree1.data.shape[1]

    cdef float64_t d1, d2, rdist=0.0
    cdef intp_t j

    if tree1.dist_metric.p == INF:
        for j in range(n_features):
            rdist = fmax(rdist, fabs(tree1.node_bounds[0, i_node1, j]
                                     - tree2.node_bounds[1, i_node2, j]))
            rdist = fmax(rdist, fabs(tree1.node_bounds[1, i_node1, j]
                                     - tree2.node_bounds[0, i_node2, j]))
    else:
        for j in range(n_features):
            d1 = fabs(tree1.node_bounds[0, i_node1, j]
                      - tree2.node_bounds[1, i_node2, j])
            d2 = fabs(tree1.node_bounds[1, i_node1, j]
                      - tree2.node_bounds[0, i_node2, j])
            rdist += pow(fmax(d1, d2), tree1.dist_metric.p)

    return rdist
|
||||
|
||||
|
||||
cdef inline float64_t max_dist_dual{{name_suffix}}(
    BinaryTree{{name_suffix}} tree1,
    intp_t i_node1,
    BinaryTree{{name_suffix}} tree2,
    intp_t i_node2,
) except -1:
    """Compute the maximum distance between two nodes"""
    # Delegate the reduced->true conversion to the metric object.
    return tree1.dist_metric._rdist_to_dist(
        max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2)
    )
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
# Public alias: the 64-bit specialization is the default KDTree exposed
# to users; KDTree32 remains available for float32 data.
class KDTree(KDTree64):
    __doc__ = CLASS_DOC.format(BinaryTree="KDTree")
    pass
|
||||
@@ -0,0 +1,363 @@
|
||||
"""
|
||||
Kernel Density Estimation
|
||||
-------------------------
|
||||
"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import itertools
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy.special import gammainc
|
||||
|
||||
from sklearn.base import BaseEstimator, _fit_context
|
||||
from sklearn.neighbors._ball_tree import BallTree
|
||||
from sklearn.neighbors._base import VALID_METRICS
|
||||
from sklearn.neighbors._kd_tree import KDTree
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._param_validation import Interval, StrOptions
|
||||
from sklearn.utils.extmath import row_norms
|
||||
from sklearn.utils.validation import (
|
||||
_check_sample_weight,
|
||||
check_is_fitted,
|
||||
validate_data,
|
||||
)
|
||||
|
||||
# Kernel names accepted by KernelDensity (enforced through the
# StrOptions constraint on the `kernel` parameter).
VALID_KERNELS = [
    "gaussian",
    "tophat",
    "epanechnikov",
    "exponential",
    "linear",
    "cosine",
]

# Maps the `algorithm` parameter to the corresponding binary-tree class.
TREE_DICT = {"ball_tree": BallTree, "kd_tree": KDTree}
|
||||
|
||||
|
||||
# TODO: implement a brute force version for testing purposes
|
||||
# TODO: create a density estimation base class?
|
||||
class KernelDensity(BaseEstimator):
|
||||
"""Kernel Density Estimation.
|
||||
|
||||
Read more in the :ref:`User Guide <kernel_density>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bandwidth : float or {"scott", "silverman"}, default=1.0
|
||||
The bandwidth of the kernel. If bandwidth is a float, it defines the
|
||||
bandwidth of the kernel. If bandwidth is a string, one of the estimation
|
||||
methods is implemented.
|
||||
|
||||
algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto'
|
||||
The tree algorithm to use.
|
||||
|
||||
kernel : {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', \
|
||||
'cosine'}, default='gaussian'
|
||||
The kernel to use.
|
||||
|
||||
metric : str, default='euclidean'
|
||||
Metric to use for distance computation. See the
|
||||
documentation of `scipy.spatial.distance
|
||||
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
|
||||
the metrics listed in
|
||||
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
|
||||
values.
|
||||
|
||||
Not all metrics are valid with all algorithms: refer to the
|
||||
documentation of :class:`BallTree` and :class:`KDTree`. Note that the
|
||||
normalization of the density output is correct only for the Euclidean
|
||||
distance metric.
|
||||
|
||||
atol : float, default=0
|
||||
The desired absolute tolerance of the result. A larger tolerance will
|
||||
generally lead to faster execution.
|
||||
|
||||
rtol : float, default=0
|
||||
The desired relative tolerance of the result. A larger tolerance will
|
||||
generally lead to faster execution.
|
||||
|
||||
breadth_first : bool, default=True
|
||||
If true (default), use a breadth-first approach to the problem.
|
||||
Otherwise use a depth-first approach.
|
||||
|
||||
leaf_size : int, default=40
|
||||
Specify the leaf size of the underlying tree. See :class:`BallTree`
|
||||
or :class:`KDTree` for details.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional parameters to be passed to the tree for use with the
|
||||
metric. For more information, see the documentation of
|
||||
:class:`BallTree` or :class:`KDTree`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
tree_ : ``BinaryTree`` instance
|
||||
The tree algorithm for fast generalized N-point problems.
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
bandwidth_ : float
|
||||
Value of the bandwidth, given directly by the bandwidth parameter or
|
||||
estimated using the 'scott' or 'silverman' method.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
sklearn.neighbors.KDTree : K-dimensional tree for fast generalized N-point
|
||||
problems.
|
||||
sklearn.neighbors.BallTree : Ball tree for fast generalized N-point
|
||||
problems.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Compute a gaussian kernel density estimate with a fixed bandwidth.
|
||||
|
||||
>>> from sklearn.neighbors import KernelDensity
|
||||
>>> import numpy as np
|
||||
>>> rng = np.random.RandomState(42)
|
||||
>>> X = rng.random_sample((100, 3))
|
||||
>>> kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)
|
||||
>>> log_density = kde.score_samples(X[:3])
|
||||
>>> log_density
|
||||
array([-1.52955942, -1.51462041, -1.60244657])
|
||||
"""
|
||||
|
||||
# Declarative parameter constraints consumed by scikit-learn's
# validation machinery (checked by the @_fit_context decorator on fit).
_parameter_constraints: dict = {
    # Either a positive float or one of the rule-of-thumb estimators.
    "bandwidth": [
        Interval(Real, 0, None, closed="neither"),
        StrOptions({"scott", "silverman"}),
    ],
    "algorithm": [StrOptions(set(TREE_DICT.keys()) | {"auto"})],
    "kernel": [StrOptions(set(VALID_KERNELS))],
    # Union of the metrics supported by any of the available trees; the
    # per-algorithm check happens later in _choose_algorithm.
    "metric": [
        StrOptions(
            set(itertools.chain(*[VALID_METRICS[alg] for alg in TREE_DICT.keys()]))
        )
    ],
    "atol": [Interval(Real, 0, None, closed="left")],
    "rtol": [Interval(Real, 0, None, closed="left")],
    "breadth_first": ["boolean"],
    "leaf_size": [Interval(Integral, 1, None, closed="left")],
    "metric_params": [None, dict],
}
|
||||
|
||||
def __init__(
    self,
    *,
    bandwidth=1.0,
    algorithm="auto",
    kernel="gaussian",
    metric="euclidean",
    atol=0,
    rtol=0,
    breadth_first=True,
    leaf_size=40,
    metric_params=None,
):
    """Store hyperparameters verbatim.

    Per scikit-learn convention, no validation or computation happens
    here; everything is deferred to :meth:`fit`.
    """
    self.bandwidth = bandwidth
    self.algorithm = algorithm
    self.kernel = kernel
    self.metric = metric
    self.atol = atol
    self.rtol = rtol
    self.breadth_first = breadth_first
    self.leaf_size = leaf_size
    self.metric_params = metric_params
|
||||
|
||||
def _choose_algorithm(self, algorithm, metric):
    """Resolve the `algorithm` parameter to a concrete tree name.

    For an explicit choice, verify the metric is supported by that tree;
    for 'auto', prefer the KD tree whenever the metric allows it.
    """
    if algorithm != "auto":  # kd_tree or ball_tree
        if metric not in TREE_DICT[algorithm].valid_metrics:
            raise ValueError(
                "invalid metric for {0}: '{1}'".format(TREE_DICT[algorithm], metric)
            )
        return algorithm
    # 'auto': use KD Tree if possible, otherwise fall back to Ball Tree.
    if metric in KDTree.valid_metrics:
        return "kd_tree"
    if metric in BallTree.valid_metrics:
        return "ball_tree"
    # Unreachable in practice: parameter validation restricts `metric`
    # to the union of the trees' valid metrics (returns None, as before).
|
||||
|
||||
@_fit_context(
    # KernelDensity.metric is not validated yet
    prefer_skip_nested_validation=False
)
def fit(self, X, y=None, sample_weight=None):
    """Fit the Kernel Density model on the data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.

    y : None
        Ignored. This parameter exists only for compatibility with
        :class:`~sklearn.pipeline.Pipeline`.

    sample_weight : array-like of shape (n_samples,), default=None
        List of sample weights attached to the data X.

        .. versionadded:: 0.20

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    algorithm = self._choose_algorithm(self.algorithm, self.metric)

    # Validate/convert X *before* reading its shape: previously `X.shape`
    # was accessed first, so a plain Python list combined with a string
    # bandwidth ("scott"/"silverman") raised AttributeError.
    X = validate_data(self, X, order="C", dtype=np.float64)
    n_samples, n_features = X.shape

    if isinstance(self.bandwidth, str):
        # Rule-of-thumb bandwidth estimators.
        if self.bandwidth == "scott":
            self.bandwidth_ = n_samples ** (-1 / (n_features + 4))
        elif self.bandwidth == "silverman":
            self.bandwidth_ = (n_samples * (n_features + 2) / 4) ** (
                -1 / (n_features + 4)
            )
    else:
        self.bandwidth_ = self.bandwidth

    if sample_weight is not None:
        # Weights must be non-negative; normalization is handled by the tree.
        sample_weight = _check_sample_weight(
            sample_weight, X, dtype=np.float64, ensure_non_negative=True
        )

    kwargs = self.metric_params if self.metric_params is not None else {}
    # Build the search tree once; all queries go through self.tree_.
    self.tree_ = TREE_DICT[algorithm](
        X,
        metric=self.metric,
        leaf_size=self.leaf_size,
        sample_weight=sample_weight,
        **kwargs,
    )
    return self
|
||||
|
||||
def score_samples(self, X):
    """Compute the log-likelihood of each sample under the model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        An array of points to query. Last dimension should match dimension
        of training data (n_features).

    Returns
    -------
    density : ndarray of shape (n_samples,)
        Log-likelihood of each sample in `X`. These are normalized to be
        probability densities, so values will be low for high-dimensional
        data.
    """
    check_is_fitted(self)
    X = validate_data(self, X, order="C", dtype=np.float64, reset=False)

    # The tree returns densities normalized to the number of points (or
    # to the total sample weight). To turn them into probabilities we
    # divide by N — and therefore scale atol by N as well.
    if self.tree_.sample_weight is not None:
        N = self.tree_.sum_weight
    else:
        N = self.tree_.data.shape[0]

    log_density = self.tree_.kernel_density(
        X,
        h=self.bandwidth_,
        kernel=self.kernel,
        atol=self.atol * N,
        rtol=self.rtol,
        breadth_first=self.breadth_first,
        return_log=True,
    )
    return log_density - np.log(N)
|
||||
|
||||
def score(self, X, y=None):
|
||||
"""Compute the total log-likelihood under the model.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
List of n_features-dimensional data points. Each row
|
||||
corresponds to a single data point.
|
||||
|
||||
y : None
|
||||
Ignored. This parameter exists only for compatibility with
|
||||
:class:`~sklearn.pipeline.Pipeline`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
logprob : float
|
||||
Total log-likelihood of the data in X. This is normalized to be a
|
||||
probability density, so the value will be low for high-dimensional
|
||||
data.
|
||||
"""
|
||||
return np.sum(self.score_samples(X))
|
||||
|
||||
    def sample(self, n_samples=1, random_state=None):
        """Generate random samples from the model.

        Currently, this is implemented only for gaussian and tophat kernels.

        Parameters
        ----------
        n_samples : int, default=1
            Number of samples to generate.

        random_state : int, RandomState instance or None, default=None
            Determines random number generation used to generate
            random samples. Pass an int for reproducible results
            across multiple function calls.
            See :term:`Glossary <random_state>`.

        Returns
        -------
        X : array-like of shape (n_samples, n_features)
            List of samples.
        """
        check_is_fitted(self)
        # TODO: implement sampling for other valid kernel shapes
        if self.kernel not in ["gaussian", "tophat"]:
            raise NotImplementedError()

        data = np.asarray(self.tree_.data)

        rng = check_random_state(random_state)
        # Pick the kernel centers: one training point per requested sample.
        u = rng.uniform(0, 1, size=n_samples)
        if self.tree_.sample_weight is None:
            # Unweighted data: uniform choice of a training point index.
            i = (u * data.shape[0]).astype(np.int64)
        else:
            # Weighted data: inverse-CDF sampling on the cumulative weights.
            cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight))
            sum_weight = cumsum_weight[-1]
            i = np.searchsorted(cumsum_weight, u * sum_weight)
        if self.kernel == "gaussian":
            # A gaussian KDE sample is the chosen center plus gaussian noise
            # with standard deviation equal to the fitted bandwidth.
            return np.atleast_2d(rng.normal(data[i], self.bandwidth_))

        elif self.kernel == "tophat":
            # we first draw points from a d-dimensional normal distribution,
            # then use an incomplete gamma function to map them to a uniform
            # d-dimensional tophat distribution.
            dim = data.shape[1]
            X = rng.normal(size=(n_samples, dim))
            s_sq = row_norms(X, squared=True)
            # gammainc maps the squared radius of a gaussian draw to the CDF
            # of a uniform ball; the result rescales each point onto a
            # uniform distribution within radius `bandwidth_`.
            correction = (
                gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim)
                * self.bandwidth_
                / np.sqrt(s_sq)
            )
            return data[i] + X * correction[:, np.newaxis]
|
||||
@@ -0,0 +1,521 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import warnings
|
||||
from numbers import Real
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.base import OutlierMixin, _fit_context
|
||||
from sklearn.neighbors._base import KNeighborsMixin, NeighborsBase
|
||||
from sklearn.utils import check_array
|
||||
from sklearn.utils._param_validation import Interval, StrOptions
|
||||
from sklearn.utils.metaestimators import available_if
|
||||
from sklearn.utils.validation import check_is_fitted
|
||||
|
||||
__all__ = ["LocalOutlierFactor"]
|
||||
|
||||
|
||||
class LocalOutlierFactor(KNeighborsMixin, OutlierMixin, NeighborsBase):
|
||||
"""Unsupervised Outlier Detection using the Local Outlier Factor (LOF).
|
||||
|
||||
The anomaly score of each sample is called the Local Outlier Factor.
|
||||
It measures the local deviation of the density of a given sample with respect
|
||||
to its neighbors.
|
||||
It is local in that the anomaly score depends on how isolated the object
|
||||
is with respect to the surrounding neighborhood.
|
||||
More precisely, locality is given by k-nearest neighbors, whose distance
|
||||
is used to estimate the local density.
|
||||
By comparing the local density of a sample to the local densities of its
|
||||
neighbors, one can identify samples that have a substantially lower density
|
||||
than their neighbors. These are considered outliers.
|
||||
|
||||
.. versionadded:: 0.19
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_neighbors : int, default=20
|
||||
Number of neighbors to use by default for :meth:`kneighbors` queries.
|
||||
If n_neighbors is larger than the number of samples provided,
|
||||
all samples will be used.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf is size passed to :class:`BallTree` or :class:`KDTree`. This can
|
||||
affect the speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
Metric to use for distance computation. Default is "minkowski", which
|
||||
results in the standard Euclidean distance when p = 2. See the
|
||||
documentation of `scipy.spatial.distance
|
||||
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
|
||||
the metrics listed in
|
||||
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
|
||||
values.
|
||||
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`, in which
|
||||
case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
If metric is a callable function, it takes two arrays representing 1D
|
||||
vectors as inputs and must return one value indicating the distance
|
||||
between those vectors. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
p : float, default=2
|
||||
Parameter for the Minkowski metric from
|
||||
:func:`sklearn.metrics.pairwise_distances`. When p = 1, this
|
||||
is equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
contamination : 'auto' or float, default='auto'
|
||||
The amount of contamination of the data set, i.e. the proportion
|
||||
of outliers in the data set. When fitting this is used to define the
|
||||
threshold on the scores of the samples.
|
||||
|
||||
- if 'auto', the threshold is determined as in the
|
||||
original paper,
|
||||
- if a float, the contamination should be in the range (0, 0.5].
|
||||
|
||||
.. versionchanged:: 0.22
|
||||
The default value of ``contamination`` changed from 0.1
|
||||
to ``'auto'``.
|
||||
|
||||
novelty : bool, default=False
|
||||
By default, LocalOutlierFactor is only meant to be used for outlier
|
||||
detection (novelty=False). Set novelty to True if you want to use
|
||||
LocalOutlierFactor for novelty detection. In this case be aware that
|
||||
you should only use predict, decision_function and score_samples
|
||||
on new unseen data and not on the training set; and note that the
|
||||
results obtained this way may differ from the standard LOF results.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
negative_outlier_factor_ : ndarray of shape (n_samples,)
|
||||
The opposite LOF of the training samples. The higher, the more normal.
|
||||
Inliers tend to have a LOF score close to 1
|
||||
(``negative_outlier_factor_`` close to -1), while outliers tend to have
|
||||
a larger LOF score.
|
||||
|
||||
The local outlier factor (LOF) of a sample captures its
|
||||
supposed 'degree of abnormality'.
|
||||
It is the average of the ratio of the local reachability density of
|
||||
a sample and those of its k-nearest neighbors.
|
||||
|
||||
n_neighbors_ : int
|
||||
The actual number of neighbors used for :meth:`kneighbors` queries.
|
||||
|
||||
offset_ : float
|
||||
Offset used to obtain binary labels from the raw scores.
|
||||
Observations having a negative_outlier_factor smaller than `offset_`
|
||||
are detected as abnormal.
|
||||
The offset is set to -1.5 (inliers score around -1), except when a
|
||||
contamination parameter different than "auto" is provided. In that
|
||||
case, the offset is defined in such a way we obtain the expected
|
||||
number of outliers in training.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
effective_metric_ : str
|
||||
The effective metric used for the distance computation.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
The effective additional keyword arguments for the metric function.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_samples_fit_ : int
|
||||
It is the number of samples in the fitted data.
|
||||
|
||||
See Also
|
||||
--------
|
||||
sklearn.svm.OneClassSVM: Unsupervised Outlier Detection using
|
||||
Support Vector Machine.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May).
|
||||
`LOF: identifying density-based local outliers.
|
||||
<https://dl.acm.org/doi/pdf/10.1145/342009.335388>`_
|
||||
In Proceedings of the 2000 ACM SIGMOD International Conference on
|
||||
Management of Data, pp. 93-104.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.neighbors import LocalOutlierFactor
|
||||
>>> X = [[-1.1], [0.2], [101.1], [0.3]]
|
||||
>>> clf = LocalOutlierFactor(n_neighbors=2)
|
||||
>>> clf.fit_predict(X)
|
||||
array([ 1, 1, -1, 1])
|
||||
>>> clf.negative_outlier_factor_
|
||||
array([ -0.9821, -1.0370, -73.3697, -0.9821])
|
||||
"""
|
||||
|
||||
    # Declarative parameter validation: inherit the k-NN constraints from
    # NeighborsBase and add the LOF-specific parameters.
    _parameter_constraints: dict = {
        **NeighborsBase._parameter_constraints,
        # 'auto' threshold per the original LOF paper, or an expected
        # outlier fraction in (0, 0.5].
        "contamination": [
            StrOptions({"auto"}),
            Interval(Real, 0, 0.5, closed="right"),
        ],
        "novelty": ["boolean"],
    }
    # LOF is purely k-NN based; the radius-neighbors parameter inherited
    # from NeighborsBase is unused, so drop its constraint.
    _parameter_constraints.pop("radius")
|
||||
|
||||
    def __init__(
        self,
        n_neighbors=20,
        *,
        algorithm="auto",
        leaf_size=30,
        metric="minkowski",
        p=2,
        metric_params=None,
        contamination="auto",
        novelty=False,
        n_jobs=None,
    ):
        """Store hyper-parameters unchanged; validation happens in `fit`.

        The neighbors-search parameters are forwarded to NeighborsBase;
        only `contamination` and `novelty` are specific to LOF.
        """
        super().__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
        )
        self.contamination = contamination
        self.novelty = novelty
|
||||
|
||||
def _check_novelty_fit_predict(self):
|
||||
if self.novelty:
|
||||
msg = (
|
||||
"fit_predict is not available when novelty=True. Use "
|
||||
"novelty=False if you want to predict on the training set."
|
||||
)
|
||||
raise AttributeError(msg)
|
||||
return True
|
||||
|
||||
@available_if(_check_novelty_fit_predict)
|
||||
def fit_predict(self, X, y=None):
|
||||
"""Fit the model to the training set X and return the labels.
|
||||
|
||||
**Not available for novelty detection (when novelty is set to True).**
|
||||
Label is 1 for an inlier and -1 for an outlier according to the LOF
|
||||
score and the contamination parameter.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. the training samples.
|
||||
|
||||
y : Ignored
|
||||
Not used, present for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_inlier : ndarray of shape (n_samples,)
|
||||
Returns -1 for anomalies/outliers and 1 for inliers.
|
||||
"""
|
||||
|
||||
# As fit_predict would be different from fit.predict, fit_predict is
|
||||
# only available for outlier detection (novelty=False)
|
||||
|
||||
return self.fit(X)._predict()
|
||||
|
||||
    @_fit_context(
        # LocalOutlierFactor.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None):
        """Fit the local outlier factor detector from the training dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples) if metric='precomputed'
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : LocalOutlierFactor
            The fitted local outlier factor detector.
        """
        # Build the neighbors index (tree or brute-force) on X.
        self._fit(X)

        n_samples = self.n_samples_fit_
        if self.n_neighbors > n_samples:
            warnings.warn(
                "n_neighbors (%s) is greater than the "
                "total number of samples (%s). n_neighbors "
                "will be set to (n_samples - 1) for estimation."
                % (self.n_neighbors, n_samples)
            )
        # Clip the effective neighborhood size to [1, n_samples - 1]: a
        # training point is never its own neighbor.
        self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))

        # k-NN distances and indices of every training point w.r.t. the
        # rest of the training set.
        self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors(
            n_neighbors=self.n_neighbors_
        )

        # Keep distances in the input precision to avoid an implicit
        # float32 -> float64 upcast of the stored arrays.
        if self._fit_X.dtype == np.float32:
            self._distances_fit_X_ = self._distances_fit_X_.astype(
                self._fit_X.dtype,
                copy=False,
            )

        # Local reachability density of each training sample.
        self._lrd = self._local_reachability_density(
            self._distances_fit_X_, _neighbors_indices_fit_X_
        )

        # Compute lof score over training samples to define offset_:
        lrd_ratios_array = (
            self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]
        )

        # negative LOF: close to -1 for inliers, much smaller for outliers.
        self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)

        if self.contamination == "auto":
            # inliers score around -1 (the higher, the less abnormal).
            self.offset_ = -1.5
        else:
            # Threshold chosen so the expected fraction of training points
            # falls below it.
            self.offset_ = np.percentile(
                self.negative_outlier_factor_, 100.0 * self.contamination
            )

        # Verify if negative_outlier_factor_ values are within acceptable range.
        # Novelty must also be false to detect outliers
        if np.min(self.negative_outlier_factor_) < -1e7 and not self.novelty:
            warnings.warn(
                "Duplicate values are leading to incorrect results. "
                "Increase the number of neighbors for more accurate results."
            )

        return self
|
||||
|
||||
def _check_novelty_predict(self):
|
||||
if not self.novelty:
|
||||
msg = (
|
||||
"predict is not available when novelty=False, use "
|
||||
"fit_predict if you want to predict on training data. Use "
|
||||
"novelty=True if you want to use LOF for novelty detection "
|
||||
"and predict on new unseen data."
|
||||
)
|
||||
raise AttributeError(msg)
|
||||
return True
|
||||
|
||||
@available_if(_check_novelty_predict)
|
||||
def predict(self, X=None):
|
||||
"""Predict the labels (1 inlier, -1 outlier) of X according to LOF.
|
||||
|
||||
**Only available for novelty detection (when novelty is set to True).**
|
||||
This method allows to generalize prediction to *new observations* (not
|
||||
in the training set). Note that the result of ``clf.fit(X)`` then
|
||||
``clf.predict(X)`` with ``novelty=True`` may differ from the result
|
||||
obtained by ``clf.fit_predict(X)`` with ``novelty=False``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. the training samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_inlier : ndarray of shape (n_samples,)
|
||||
Returns -1 for anomalies/outliers and +1 for inliers.
|
||||
"""
|
||||
return self._predict(X)
|
||||
|
||||
def _predict(self, X=None):
|
||||
"""Predict the labels (1 inlier, -1 outlier) of X according to LOF.
|
||||
|
||||
If X is None, returns the same as fit_predict(X_train).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. the training samples. If None, makes prediction on the
|
||||
training data without considering them as their own neighbors.
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_inlier : ndarray of shape (n_samples,)
|
||||
Returns -1 for anomalies/outliers and +1 for inliers.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
if X is not None:
|
||||
shifted_opposite_lof_scores = self.decision_function(X)
|
||||
is_inlier = np.ones(shifted_opposite_lof_scores.shape[0], dtype=int)
|
||||
is_inlier[shifted_opposite_lof_scores < 0] = -1
|
||||
else:
|
||||
is_inlier = np.ones(self.n_samples_fit_, dtype=int)
|
||||
is_inlier[self.negative_outlier_factor_ < self.offset_] = -1
|
||||
|
||||
return is_inlier
|
||||
|
||||
def _check_novelty_decision_function(self):
|
||||
if not self.novelty:
|
||||
msg = (
|
||||
"decision_function is not available when novelty=False. "
|
||||
"Use novelty=True if you want to use LOF for novelty "
|
||||
"detection and compute decision_function for new unseen "
|
||||
"data. Note that the opposite LOF of the training samples "
|
||||
"is always available by considering the "
|
||||
"negative_outlier_factor_ attribute."
|
||||
)
|
||||
raise AttributeError(msg)
|
||||
return True
|
||||
|
||||
@available_if(_check_novelty_decision_function)
|
||||
def decision_function(self, X):
|
||||
"""Shifted opposite of the Local Outlier Factor of X.
|
||||
|
||||
Bigger is better, i.e. large values correspond to inliers.
|
||||
|
||||
**Only available for novelty detection (when novelty is set to True).**
|
||||
The shift offset allows a zero threshold for being an outlier.
|
||||
The argument X is supposed to contain *new data*: if X contains a
|
||||
point from training, it considers the later in its own neighborhood.
|
||||
Also, the samples in X are not considered in the neighborhood of any
|
||||
point.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. the training samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
shifted_opposite_lof_scores : ndarray of shape (n_samples,)
|
||||
The shifted opposite of the Local Outlier Factor of each input
|
||||
samples. The lower, the more abnormal. Negative scores represent
|
||||
outliers, positive scores represent inliers.
|
||||
"""
|
||||
return self.score_samples(X) - self.offset_
|
||||
|
||||
def _check_novelty_score_samples(self):
|
||||
if not self.novelty:
|
||||
msg = (
|
||||
"score_samples is not available when novelty=False. The "
|
||||
"scores of the training samples are always available "
|
||||
"through the negative_outlier_factor_ attribute. Use "
|
||||
"novelty=True if you want to use LOF for novelty detection "
|
||||
"and compute score_samples for new unseen data."
|
||||
)
|
||||
raise AttributeError(msg)
|
||||
return True
|
||||
|
||||
@available_if(_check_novelty_score_samples)
|
||||
def score_samples(self, X):
|
||||
"""Opposite of the Local Outlier Factor of X.
|
||||
|
||||
It is the opposite as bigger is better, i.e. large values correspond
|
||||
to inliers.
|
||||
|
||||
**Only available for novelty detection (when novelty is set to True).**
|
||||
The argument X is supposed to contain *new data*: if X contains a
|
||||
point from training, it considers the later in its own neighborhood.
|
||||
Also, the samples in X are not considered in the neighborhood of any
|
||||
point. Because of this, the scores obtained via ``score_samples`` may
|
||||
differ from the standard LOF scores.
|
||||
The standard LOF scores for the training data is available via the
|
||||
``negative_outlier_factor_`` attribute.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. the training samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
opposite_lof_scores : ndarray of shape (n_samples,)
|
||||
The opposite of the Local Outlier Factor of each input samples.
|
||||
The lower, the more abnormal.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
X = check_array(X, accept_sparse="csr")
|
||||
|
||||
distances_X, neighbors_indices_X = self.kneighbors(
|
||||
X, n_neighbors=self.n_neighbors_
|
||||
)
|
||||
|
||||
if X.dtype == np.float32:
|
||||
distances_X = distances_X.astype(X.dtype, copy=False)
|
||||
|
||||
X_lrd = self._local_reachability_density(
|
||||
distances_X,
|
||||
neighbors_indices_X,
|
||||
)
|
||||
|
||||
lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis]
|
||||
|
||||
# as bigger is better:
|
||||
return -np.mean(lrd_ratios_array, axis=1)
|
||||
|
||||
def _local_reachability_density(self, distances_X, neighbors_indices):
|
||||
"""The local reachability density (LRD)
|
||||
|
||||
The LRD of a sample is the inverse of the average reachability
|
||||
distance of its k-nearest neighbors.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
distances_X : ndarray of shape (n_queries, self.n_neighbors)
|
||||
Distances to the neighbors (in the training samples `self._fit_X`)
|
||||
of each query point to compute the LRD.
|
||||
|
||||
neighbors_indices : ndarray of shape (n_queries, self.n_neighbors)
|
||||
Neighbors indices (of each query point) among training samples
|
||||
self._fit_X.
|
||||
|
||||
Returns
|
||||
-------
|
||||
local_reachability_density : ndarray of shape (n_queries,)
|
||||
The local reachability density of each sample.
|
||||
"""
|
||||
dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1]
|
||||
reach_dist_array = np.maximum(distances_X, dist_k)
|
||||
|
||||
# 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_:
|
||||
return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10)
|
||||
@@ -0,0 +1,534 @@
|
||||
"""
|
||||
Neighborhood Component Analysis
|
||||
"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import sys
|
||||
import time
|
||||
from numbers import Integral, Real
|
||||
from warnings import warn
|
||||
|
||||
import numpy as np
|
||||
from scipy.optimize import minimize
|
||||
|
||||
from sklearn.base import (
|
||||
BaseEstimator,
|
||||
ClassNamePrefixFeaturesOutMixin,
|
||||
TransformerMixin,
|
||||
_fit_context,
|
||||
)
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.metrics import pairwise_distances
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.utils._param_validation import Interval, StrOptions
|
||||
from sklearn.utils.extmath import softmax
|
||||
from sklearn.utils.fixes import _get_additional_lbfgs_options_dict
|
||||
from sklearn.utils.multiclass import check_classification_targets
|
||||
from sklearn.utils.random import check_random_state
|
||||
from sklearn.utils.validation import check_array, check_is_fitted, validate_data
|
||||
|
||||
|
||||
class NeighborhoodComponentsAnalysis(
|
||||
ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator
|
||||
):
|
||||
"""Neighborhood Components Analysis.
|
||||
|
||||
Neighborhood Component Analysis (NCA) is a machine learning algorithm for
|
||||
metric learning. It learns a linear transformation in a supervised fashion
|
||||
to improve the classification accuracy of a stochastic nearest neighbors
|
||||
rule in the transformed space.
|
||||
|
||||
Read more in the :ref:`User Guide <nca>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int, default=None
|
||||
Preferred dimensionality of the projected space.
|
||||
If None it will be set to `n_features`.
|
||||
|
||||
init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape \
|
||||
(n_features_a, n_features_b), default='auto'
|
||||
Initialization of the linear transformation. Possible options are
|
||||
`'auto'`, `'pca'`, `'lda'`, `'identity'`, `'random'`, and a numpy
|
||||
array of shape `(n_features_a, n_features_b)`.
|
||||
|
||||
- `'auto'`
|
||||
Depending on `n_components`, the most reasonable initialization
|
||||
is chosen. If `n_components <= min(n_features, n_classes - 1)`
|
||||
we use `'lda'`, as it uses labels information. If not, but
|
||||
`n_components < min(n_features, n_samples)`, we use `'pca'`, as
|
||||
it projects data in meaningful directions (those of higher
|
||||
variance). Otherwise, we just use `'identity'`.
|
||||
|
||||
- `'pca'`
|
||||
`n_components` principal components of the inputs passed
|
||||
to :meth:`fit` will be used to initialize the transformation.
|
||||
(See :class:`~sklearn.decomposition.PCA`)
|
||||
|
||||
- `'lda'`
|
||||
`min(n_components, n_classes)` most discriminative
|
||||
components of the inputs passed to :meth:`fit` will be used to
|
||||
initialize the transformation. (If `n_components > n_classes`,
|
||||
the rest of the components will be zero.) (See
|
||||
:class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)
|
||||
|
||||
- `'identity'`
|
||||
If `n_components` is strictly smaller than the
|
||||
dimensionality of the inputs passed to :meth:`fit`, the identity
|
||||
matrix will be truncated to the first `n_components` rows.
|
||||
|
||||
- `'random'`
|
||||
The initial transformation will be a random array of shape
|
||||
`(n_components, n_features)`. Each value is sampled from the
|
||||
standard normal distribution.
|
||||
|
||||
- numpy array
|
||||
`n_features_b` must match the dimensionality of the inputs passed
|
||||
to :meth:`fit` and n_features_a must be less than or equal to that.
|
||||
If `n_components` is not `None`, `n_features_a` must match it.
|
||||
|
||||
warm_start : bool, default=False
|
||||
If `True` and :meth:`fit` has been called before, the solution of the
|
||||
previous call to :meth:`fit` is used as the initial linear
|
||||
transformation (`n_components` and `init` will be ignored).
|
||||
|
||||
max_iter : int, default=50
|
||||
Maximum number of iterations in the optimization.
|
||||
|
||||
tol : float, default=1e-5
|
||||
Convergence tolerance for the optimization.
|
||||
|
||||
callback : callable, default=None
|
||||
If not `None`, this function is called after every iteration of the
|
||||
optimizer, taking as arguments the current solution (flattened
|
||||
transformation matrix) and the number of iterations. This might be
|
||||
useful in case one wants to examine or store the transformation
|
||||
found after each iteration.
|
||||
|
||||
verbose : int, default=0
|
||||
If 0, no progress messages will be printed.
|
||||
If 1, progress messages will be printed to stdout.
|
||||
If > 1, progress messages will be printed and the `disp`
|
||||
parameter of :func:`scipy.optimize.minimize` will be set to
|
||||
`verbose - 2`.
|
||||
|
||||
random_state : int or numpy.RandomState, default=None
|
||||
A pseudo random number generator object or a seed for it if int. If
|
||||
`init='random'`, `random_state` is used to initialize the random
|
||||
transformation. If `init='pca'`, `random_state` is passed as an
|
||||
argument to PCA when initializing the transformation. Pass an int
|
||||
for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
components_ : ndarray of shape (n_components, n_features)
|
||||
The linear transformation learned during fitting.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
n_iter_ : int
|
||||
Counts the number of iterations performed by the optimizer.
|
||||
|
||||
random_state_ : numpy.RandomState
|
||||
Pseudo random number generator object used during initialization.
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
sklearn.discriminant_analysis.LinearDiscriminantAnalysis : Linear
|
||||
Discriminant Analysis.
|
||||
sklearn.decomposition.PCA : Principal component analysis (PCA).
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov.
|
||||
"Neighbourhood Components Analysis". Advances in Neural Information
|
||||
Processing Systems. 17, 513-520, 2005.
|
||||
https://www.cs.toronto.edu/~rsalakhu/papers/ncanips.pdf
|
||||
|
||||
.. [2] Wikipedia entry on Neighborhood Components Analysis
|
||||
https://en.wikipedia.org/wiki/Neighbourhood_components_analysis
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.neighbors import NeighborhoodComponentsAnalysis
|
||||
>>> from sklearn.neighbors import KNeighborsClassifier
|
||||
>>> from sklearn.datasets import load_iris
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> X, y = load_iris(return_X_y=True)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(X, y,
|
||||
... stratify=y, test_size=0.7, random_state=42)
|
||||
>>> nca = NeighborhoodComponentsAnalysis(random_state=42)
|
||||
>>> nca.fit(X_train, y_train)
|
||||
NeighborhoodComponentsAnalysis(...)
|
||||
>>> knn = KNeighborsClassifier(n_neighbors=3)
|
||||
>>> knn.fit(X_train, y_train)
|
||||
KNeighborsClassifier(...)
|
||||
>>> print(knn.score(X_test, y_test))
|
||||
0.933333...
|
||||
>>> knn.fit(nca.transform(X_train), y_train)
|
||||
KNeighborsClassifier(...)
|
||||
>>> print(knn.score(nca.transform(X_test), y_test))
|
||||
0.961904...
|
||||
"""
|
||||
|
||||
    # Declarative parameter validation consumed by `_fit_context`; each
    # entry lists the accepted types/values for one constructor parameter.
    _parameter_constraints: dict = {
        "n_components": [
            Interval(Integral, 1, None, closed="left"),
            None,
        ],
        # `init` is either a named strategy or an explicit transformation
        # matrix of shape (n_features_a, n_features_b).
        "init": [
            StrOptions({"auto", "pca", "lda", "identity", "random"}),
            np.ndarray,
        ],
        "warm_start": ["boolean"],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "tol": [Interval(Real, 0, None, closed="left")],
        "callback": [callable, None],
        "verbose": ["verbose"],
        "random_state": ["random_state"],
    }
|
||||
|
||||
    def __init__(
        self,
        n_components=None,
        *,
        init="auto",
        warm_start=False,
        max_iter=50,
        tol=1e-5,
        callback=None,
        verbose=0,
        random_state=None,
    ):
        """Store hyper-parameters unchanged.

        Per scikit-learn convention, no validation or computation happens
        here; parameters are checked and consumed in `fit`.
        """
        self.n_components = n_components
        self.init = init
        self.warm_start = warm_start
        self.max_iter = max_iter
        self.tol = tol
        self.callback = callback
        self.verbose = verbose
        self.random_state = random_state
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y):
    """Fit the model according to the given training data.

    Learns a linear transformation `components_` of shape
    (n_components, n_features) by maximizing the NCA objective with
    scipy's L-BFGS-B minimizer.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The training samples.

    y : array-like of shape (n_samples,)
        The corresponding training labels.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    # Validate the inputs X and y, and converts y to numerical classes.
    X, y = validate_data(self, X, y, ensure_min_samples=2)
    check_classification_targets(y)
    y = LabelEncoder().fit_transform(y)

    # Check the preferred dimensionality of the projected space
    if self.n_components is not None and self.n_components > X.shape[1]:
        raise ValueError(
            "The preferred dimensionality of the "
            f"projected space `n_components` ({self.n_components}) cannot "
            "be greater than the given data "
            f"dimensionality ({X.shape[1]})!"
        )

    # If warm_start is enabled, check that the inputs are consistent
    if (
        self.warm_start
        and hasattr(self, "components_")
        and self.components_.shape[1] != X.shape[1]
    ):
        raise ValueError(
            f"The new inputs dimensionality ({X.shape[1]}) does not "
            "match the input dimensionality of the "
            f"previously learned transformation ({self.components_.shape[1]})."
        )

    # Check how the linear transformation should be initialized
    init = self.init

    if isinstance(init, np.ndarray):
        init = check_array(init)

        # Assert that init.shape[1] = X.shape[1]
        if init.shape[1] != X.shape[1]:
            raise ValueError(
                f"The input dimensionality ({init.shape[1]}) of the given "
                "linear transformation `init` must match the "
                f"dimensionality of the given inputs `X` ({X.shape[1]})."
            )

        # Assert that init.shape[0] <= init.shape[1]
        if init.shape[0] > init.shape[1]:
            raise ValueError(
                f"The output dimensionality ({init.shape[0]}) of the given "
                "linear transformation `init` cannot be "
                f"greater than its input dimensionality ({init.shape[1]})."
            )

        # Assert that self.n_components = init.shape[0]
        if self.n_components is not None and self.n_components != init.shape[0]:
            raise ValueError(
                "The preferred dimensionality of the "
                f"projected space `n_components` ({self.n_components}) does"
                " not match the output dimensionality of "
                "the given linear transformation "
                f"`init` ({init.shape[0]})!"
            )

    # Initialize the random generator
    self.random_state_ = check_random_state(self.random_state)

    # Measure the total training time
    t_train = time.time()

    # Compute a mask that stays fixed during optimization:
    # same_class_mask[i, j] is True iff samples i and j share a label.
    same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]
    # (n_samples, n_samples)

    # Initialize the transformation, flattened for the optimizer.
    transformation = np.ravel(self._initialize(X, y, init))

    # Create a dictionary of parameters to be passed to the optimizer
    disp = self.verbose - 2 if self.verbose > 1 else -1
    optimizer_params = {
        "method": "L-BFGS-B",
        "fun": self._loss_grad_lbfgs,
        # sign=-1.0: the minimizer maximizes the NCA objective by
        # minimizing its negation (see `_loss_grad_lbfgs`).
        "args": (X, same_class_mask, -1.0),
        "jac": True,
        "x0": transformation,
        "tol": self.tol,
        "options": dict(
            maxiter=self.max_iter,
            **_get_additional_lbfgs_options_dict("disp", disp),
        ),
        "callback": self._callback,
    }

    # Call the optimizer
    self.n_iter_ = 0
    opt_result = minimize(**optimizer_params)

    # Reshape the solution found by the optimizer
    self.components_ = opt_result.x.reshape(-1, X.shape[1])

    # Stop timer
    t_train = time.time() - t_train
    if self.verbose:
        cls_name = self.__class__.__name__

        # Warn the user if the algorithm did not converge
        if not opt_result.success:
            warn(
                "[{}] NCA did not converge: {}".format(
                    cls_name, opt_result.message
                ),
                ConvergenceWarning,
            )

        print("[{}] Training took {:8.2f}s.".format(cls_name, t_train))

    return self
|
||||
|
||||
def transform(self, X):
    """Apply the learned transformation to the given data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data samples.

    Returns
    -------
    X_embedded: ndarray of shape (n_samples, n_components)
        The data samples transformed.

    Raises
    ------
    NotFittedError
        If :meth:`fit` has not been called before.
    """
    check_is_fitted(self)
    X_checked = validate_data(self, X, reset=False)
    # Project the validated samples onto the learned components.
    return np.dot(X_checked, self.components_.T)
|
||||
|
||||
def _initialize(self, X, y, init):
    """Initialize the transformation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The training samples.

    y : array-like of shape (n_samples,)
        The training labels.

    init : str or ndarray of shape (n_features_a, n_features_b)
        The validated initialization of the linear transformation.

    Returns
    -------
    transformation : ndarray of shape (n_components, n_features)
        The initialized linear transformation.

    """

    transformation = init
    if self.warm_start and hasattr(self, "components_"):
        # Resume from the transformation learned by a previous `fit`.
        transformation = self.components_
    elif isinstance(init, np.ndarray):
        # User-provided matrix, already validated in `fit`; use as-is.
        pass
    else:
        n_samples, n_features = X.shape
        n_components = self.n_components or n_features
        if init == "auto":
            # Choose an initialization suited to the requested output
            # dimensionality: LDA when it fits in n_classes - 1 dims,
            # else PCA when reducing, else identity.
            n_classes = len(np.unique(y))
            if n_components <= min(n_features, n_classes - 1):
                init = "lda"
            elif n_components < min(n_features, n_samples):
                init = "pca"
            else:
                init = "identity"
        if init == "identity":
            transformation = np.eye(n_components, X.shape[1])
        elif init == "random":
            transformation = self.random_state_.standard_normal(
                size=(n_components, X.shape[1])
            )
        elif init in {"pca", "lda"}:
            init_time = time.time()
            if init == "pca":
                pca = PCA(
                    n_components=n_components, random_state=self.random_state_
                )
                if self.verbose:
                    print("Finding principal components... ", end="")
                    sys.stdout.flush()
                pca.fit(X)
                transformation = pca.components_
            elif init == "lda":
                # Imported here rather than at module top level.
                from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

                lda = LinearDiscriminantAnalysis(n_components=n_components)
                if self.verbose:
                    print("Finding most discriminative components... ", end="")
                    sys.stdout.flush()
                lda.fit(X, y)
                # Keep only the first n_components discriminant directions.
                transformation = lda.scalings_.T[:n_components]
            if self.verbose:
                print("done in {:5.2f}s".format(time.time() - init_time))
    return transformation
|
||||
|
||||
def _callback(self, transformation):
    """Hook invoked by the optimizer after each iteration.

    Forwards the current solution to the user-supplied callback (when one
    was given) and advances the iteration counter.

    Parameters
    ----------
    transformation : ndarray of shape (n_components * n_features,)
        The solution computed by the optimizer in this iteration.
    """
    user_callback = self.callback
    if user_callback is not None:
        user_callback(transformation, self.n_iter_)
    self.n_iter_ += 1
|
||||
|
||||
def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0):
    """Compute the loss and the loss gradient w.r.t. `transformation`.

    Parameters
    ----------
    transformation : ndarray of shape (n_components * n_features,)
        The raveled linear transformation on which to compute loss and
        evaluate gradient.

    X : ndarray of shape (n_samples, n_features)
        The training samples.

    same_class_mask : ndarray of shape (n_samples, n_samples)
        A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong
        to the same class, and `0` otherwise.

    sign : float, default=1.0
        Multiplier applied to both loss and gradient. `fit` passes
        ``-1.0`` so that scipy's minimizer maximizes the objective.

    Returns
    -------
    loss : float
        The loss computed for the given transformation.

    gradient : ndarray of shape (n_components * n_features,)
        The new (flattened) gradient of the loss.
    """

    if self.n_iter_ == 0:
        self.n_iter_ += 1
        if self.verbose:
            # Print the progress-table header once, on the first call.
            header_fields = ["Iteration", "Objective Value", "Time(s)"]
            header_fmt = "{:>10} {:>20} {:>10}"
            header = header_fmt.format(*header_fields)
            cls_name = self.__class__.__name__
            print("[{}]".format(cls_name))
            print(
                "[{}] {}\n[{}] {}".format(
                    cls_name, header, cls_name, "-" * len(header)
                )
            )

    t_funcall = time.time()

    transformation = transformation.reshape(-1, X.shape[1])
    X_embedded = np.dot(X, transformation.T)  # (n_samples, n_components)

    # Compute softmax distances
    p_ij = pairwise_distances(X_embedded, squared=True)
    # +inf on the diagonal so softmax gives a point zero probability of
    # being its own neighbor.
    np.fill_diagonal(p_ij, np.inf)
    p_ij = softmax(-p_ij)  # (n_samples, n_samples)

    # Compute loss: total probability of same-class neighbor assignments.
    masked_p_ij = p_ij * same_class_mask
    p = np.sum(masked_p_ij, axis=1, keepdims=True)  # (n_samples, 1)
    loss = np.sum(p)

    # Compute gradient of loss w.r.t. `transform`
    weighted_p_ij = masked_p_ij - p_ij * p
    weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T
    np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0))
    gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X)
    # time complexity of the gradient: O(n_components x n_samples x (
    # n_samples + n_features))

    if self.verbose:
        t_funcall = time.time() - t_funcall
        values_fmt = "[{}] {:>10} {:>20.6e} {:>10.2f}"
        print(
            values_fmt.format(
                self.__class__.__name__, self.n_iter_, loss, t_funcall
            )
        )
        sys.stdout.flush()

    return sign * loss, sign * gradient.ravel()
|
||||
|
||||
def __sklearn_tags__(self):
    tags = super().__sklearn_tags__()
    # `y` is mandatory: NCA is supervised even though it is used as a
    # transformer.
    tags.target_tags.required = True
    return tags
|
||||
|
||||
@property
def _n_features_out(self):
    """Number of transformed output features (rows of `components_`)."""
    return self.components_.shape[0]
|
||||
@@ -0,0 +1,356 @@
|
||||
"""
|
||||
Nearest Centroid Classification
|
||||
"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import warnings
|
||||
from numbers import Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse as sp
|
||||
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin, _fit_context
|
||||
from sklearn.discriminant_analysis import DiscriminantAnalysisPredictionMixin
|
||||
from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.utils import get_tags
|
||||
from sklearn.utils._available_if import available_if
|
||||
from sklearn.utils._param_validation import Interval, StrOptions
|
||||
from sklearn.utils.multiclass import check_classification_targets
|
||||
from sklearn.utils.sparsefuncs import csc_median_axis_0
|
||||
from sklearn.utils.validation import check_is_fitted, validate_data
|
||||
|
||||
|
||||
class NearestCentroid(
    DiscriminantAnalysisPredictionMixin, ClassifierMixin, BaseEstimator
):
    """Nearest centroid classifier.

    Each class is represented by its centroid, with test samples classified to
    the class with the nearest centroid.

    Read more in the :ref:`User Guide <nearest_centroid_classifier>`.

    Parameters
    ----------
    metric : {"euclidean", "manhattan"}, default="euclidean"
        Metric to use for distance computation.

        If `metric="euclidean"`, the centroid for the samples corresponding to each
        class is the arithmetic mean, which minimizes the sum of squared L2
        (Euclidean) distances.
        If `metric="manhattan"`, the centroid is the feature-wise median, which
        minimizes the sum of L1 distances.

        .. versionchanged:: 1.5
            All metrics but `"euclidean"` and `"manhattan"` were deprecated and
            now raise an error.

        .. versionchanged:: 0.19
            `metric='precomputed'` was deprecated and now raises an error

    shrink_threshold : float, default=None
        Threshold for shrinking centroids to remove features.

    priors : {"uniform", "empirical"} or array-like of shape (n_classes,), \
            default="uniform"
        The class prior probabilities. By default, the class proportions are
        inferred from the training data.

        .. versionadded:: 1.6

    Attributes
    ----------
    centroids_ : array-like of shape (n_classes, n_features)
        Centroid of each class.

    classes_ : array of shape (n_classes,)
        The unique classes labels.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    deviations_ : ndarray of shape (n_classes, n_features)
        Deviations (or shrinkages) of the centroids of each class from the
        overall centroid. Equal to eq. (18.4) if `shrink_threshold=None`,
        else (18.5) p. 653 of [2]. Can be used to identify features used
        for classification.

        .. versionadded:: 1.6

    within_class_std_dev_ : ndarray of shape (n_features,)
        Pooled or within-class standard deviation of input data.

        .. versionadded:: 1.6

    class_prior_ : ndarray of shape (n_classes,)
        The class prior probabilities.

        .. versionadded:: 1.6

    See Also
    --------
    KNeighborsClassifier : Nearest neighbors classifier.

    Notes
    -----
    When used for text classification with tf-idf vectors, this classifier is
    also known as the Rocchio classifier.

    References
    ----------
    [1] Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of
    multiple cancer types by shrunken centroids of gene expression. Proceedings
    of the National Academy of Sciences of the United States of America,
    99(10), 6567-6572. The National Academy of Sciences.

    [2] Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical
    Learning Data Mining, Inference, and Prediction. 2nd Edition. New York, Springer.

    Examples
    --------
    >>> from sklearn.neighbors import NearestCentroid
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> clf = NearestCentroid()
    >>> clf.fit(X, y)
    NearestCentroid()
    >>> print(clf.predict([[-0.8, -1]]))
    [1]
    """

    _parameter_constraints: dict = {
        "metric": [StrOptions({"manhattan", "euclidean"})],
        "shrink_threshold": [Interval(Real, 0, None, closed="neither"), None],
        "priors": ["array-like", StrOptions({"empirical", "uniform"})],
    }

    def __init__(
        self,
        metric="euclidean",
        *,
        shrink_threshold=None,
        priors="uniform",
    ):
        self.metric = metric
        self.shrink_threshold = shrink_threshold
        self.priors = priors

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """
        Fit the NearestCentroid model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # If X is sparse and the metric is "manhattan", store it in csc
        # format, as it makes it easier to calculate the median.
        if self.metric == "manhattan":
            X, y = validate_data(self, X, y, accept_sparse=["csc"])
        else:
            ensure_all_finite = (
                "allow-nan" if get_tags(self).input_tags.allow_nan else True
            )
            X, y = validate_data(
                self,
                X,
                y,
                ensure_all_finite=ensure_all_finite,
                accept_sparse=["csr", "csc"],
            )
        is_X_sparse = sp.issparse(X)
        check_classification_targets(y)

        n_samples, n_features = X.shape
        le = LabelEncoder()
        y_ind = le.fit_transform(y)
        self.classes_ = classes = le.classes_
        n_classes = classes.size
        if n_classes < 2:
            raise ValueError(
                "The number of classes has to be greater than one; got %d class"
                % (n_classes)
            )

        if self.priors == "empirical":  # estimate priors from sample
            _, class_counts = np.unique(y, return_inverse=True)  # non-negative ints
            self.class_prior_ = np.bincount(class_counts) / float(len(y))
        elif self.priors == "uniform":
            self.class_prior_ = np.asarray([1 / n_classes] * n_classes)
        else:
            self.class_prior_ = np.asarray(self.priors)

        if (self.class_prior_ < 0).any():
            raise ValueError("priors must be non-negative")
        if not np.isclose(self.class_prior_.sum(), 1.0):
            warnings.warn(
                "The priors do not sum to 1. Normalizing such that it sums to one.",
                UserWarning,
            )
            self.class_prior_ = self.class_prior_ / self.class_prior_.sum()

        # Mask mapping each class to its members.
        self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)

        # Number of clusters in each class.
        nk = np.zeros(n_classes)

        for cur_class in range(n_classes):
            center_mask = y_ind == cur_class
            nk[cur_class] = np.sum(center_mask)
            if is_X_sparse:
                center_mask = np.where(center_mask)[0]

            if self.metric == "manhattan":
                # NumPy does not calculate median of sparse matrices.
                if not is_X_sparse:
                    self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
                else:
                    self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
            else:  # metric == "euclidean"
                self.centroids_[cur_class] = X[center_mask].mean(axis=0)

        # Compute within-class std_dev with unshrunken centroids.
        variance = np.array(X - self.centroids_[y_ind], copy=False) ** 2
        self.within_class_std_dev_ = np.array(
            np.sqrt(variance.sum(axis=0) / (n_samples - n_classes)), copy=False
        )
        if any(self.within_class_std_dev_ == 0):
            # FIX: the two sentences of this warning were previously
            # concatenated without a separating space.
            warnings.warn(
                "self.within_class_std_dev_ has at least 1 zero standard deviation."
                " Inputs within the same classes for at least 1 feature are"
                " identical."
            )

        err_msg = "All features have zero variance. Division by zero."
        if is_X_sparse and np.all((X.max(axis=0) - X.min(axis=0)).toarray() == 0):
            raise ValueError(err_msg)
        elif not is_X_sparse and np.all(np.ptp(X, axis=0) == 0):
            raise ValueError(err_msg)

        dataset_centroid_ = X.mean(axis=0)

        # m parameter for determining deviation
        m = np.sqrt((1.0 / nk) - (1.0 / n_samples))

        # Calculate deviation using the standard deviation of centroids,
        # to deter outliers from affecting the results.
        s = self.within_class_std_dev_ + np.median(self.within_class_std_dev_)
        mm = m.reshape(len(m), 1)  # Reshape to allow broadcasting.
        ms = mm * s
        self.deviations_ = np.array(
            (self.centroids_ - dataset_centroid_) / ms, copy=False
        )
        # Soft thresholding: if the deviation crosses 0 during shrinking,
        # it becomes zero.
        if self.shrink_threshold:
            signs = np.sign(self.deviations_)
            self.deviations_ = np.abs(self.deviations_) - self.shrink_threshold
            np.clip(self.deviations_, 0, None, out=self.deviations_)
            self.deviations_ *= signs
            # Now adjust the centroids using the deviation
            msd = ms * self.deviations_
            self.centroids_ = np.array(dataset_centroid_ + msd, copy=False)
        return self

    def predict(self, X):
        """Perform classification on an array of test vectors `X`.

        The predicted class `C` for each sample in `X` is returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            The predicted classes.
        """
        check_is_fitted(self)
        # With uniform priors, prediction reduces to plain nearest-centroid
        # assignment, which avoids the discriminant computation.
        if np.isclose(self.class_prior_, 1 / len(self.classes_)).all():
            # `validate_data` is called here since we are not calling `super()`
            ensure_all_finite = (
                "allow-nan" if get_tags(self).input_tags.allow_nan else True
            )
            X = validate_data(
                self,
                X,
                ensure_all_finite=ensure_all_finite,
                accept_sparse="csr",
                reset=False,
            )
            return self.classes_[
                pairwise_distances_argmin(X, self.centroids_, metric=self.metric)
            ]
        else:
            return super().predict(X)

    def _decision_function(self, X):
        # return discriminant scores, see eq. (18.2) p. 652 of the ESL.
        check_is_fitted(self, "centroids_")

        X_normalized = validate_data(
            self, X, copy=True, reset=False, accept_sparse="csr", dtype=np.float64
        )

        discriminant_score = np.empty(
            (X_normalized.shape[0], self.classes_.size), dtype=np.float64
        )

        # Scale samples and centroids by the pooled within-class std-dev;
        # features with zero std-dev are left unscaled to avoid dividing
        # by zero.
        mask = self.within_class_std_dev_ != 0
        X_normalized[:, mask] /= self.within_class_std_dev_[mask]
        centroids_normalized = self.centroids_.copy()
        centroids_normalized[:, mask] /= self.within_class_std_dev_[mask]

        for class_idx in range(self.classes_.size):
            distances = pairwise_distances(
                X_normalized, centroids_normalized[[class_idx]], metric=self.metric
            ).ravel()
            distances **= 2
            discriminant_score[:, class_idx] = np.squeeze(
                -distances + 2.0 * np.log(self.class_prior_[class_idx])
            )

        return discriminant_score

    def _check_euclidean_metric(self):
        # Gate for the discriminant-based prediction API below, which is
        # only exposed for the euclidean metric.
        return self.metric == "euclidean"

    decision_function = available_if(_check_euclidean_metric)(
        DiscriminantAnalysisPredictionMixin.decision_function
    )

    predict_proba = available_if(_check_euclidean_metric)(
        DiscriminantAnalysisPredictionMixin.predict_proba
    )

    predict_log_proba = available_if(_check_euclidean_metric)(
        DiscriminantAnalysisPredictionMixin.predict_log_proba
    )

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = self.metric == "nan_euclidean"
        tags.input_tags.sparse = True
        return tags
|
||||
Binary file not shown.
@@ -0,0 +1,10 @@
|
||||
from cython cimport floating
|
||||
from sklearn.utils._typedefs cimport float64_t, intp_t
|
||||
|
||||
# Partially sort `node_indices` along dimension `split_dim` of `data`, so
# that the entry at `split_index` lands at its sorted position (see the
# implementation in _partition_nodes.pyx). Returns 0 on success; a C-level
# exception maps to -1.
cdef int partition_node_indices(
        const floating *data,
        intp_t *node_indices,
        intp_t split_dim,
        intp_t split_index,
        intp_t n_features,
        intp_t n_points) except -1
|
||||
@@ -0,0 +1,122 @@
|
||||
# BinaryTrees rely on partial sorts to partition their nodes during their
|
||||
# initialisation.
|
||||
#
|
||||
# The C++ std library exposes nth_element, an efficient partial sort for this
|
||||
# situation which has a linear time complexity as well as the best performances.
|
||||
#
|
||||
# To use std::algorithm::nth_element, a few fixtures are defined using Cython:
|
||||
# - partition_node_indices, a Cython function used in BinaryTrees, that calls
|
||||
# - partition_node_indices_inner, a C++ function that wraps nth_element and uses
|
||||
# - an IndexComparator to state how to compare KDTrees' indices
|
||||
#
|
||||
# IndexComparator has been defined so that partial sorts are stable with
|
||||
# respect to the nodes initial indices.
|
||||
#
|
||||
# See for reference:
|
||||
# - https://en.cppreference.com/w/cpp/algorithm/nth_element.
|
||||
# - https://github.com/scikit-learn/scikit-learn/pull/11103
|
||||
# - https://github.com/scikit-learn/scikit-learn/pull/19473
|
||||
from cython cimport floating
|
||||
|
||||
|
||||
cdef extern from *:
    """
    #include <algorithm>

    template<class D, class I>
    class IndexComparator {
    private:
        const D *data;
        I split_dim, n_features;
    public:
        IndexComparator(const D *data, const I &split_dim, const I &n_features):
            data(data), split_dim(split_dim), n_features(n_features) {}

        bool operator()(const I &a, const I &b) const {
            D a_value = data[a * n_features + split_dim];
            D b_value = data[b * n_features + split_dim];
            return a_value == b_value ? a < b : a_value < b_value;
        }
    };

    template<class D, class I>
    void partition_node_indices_inner(
        const D *data,
        I *node_indices,
        const I &split_dim,
        const I &split_index,
        const I &n_features,
        const I &n_points) {
        IndexComparator<D, I> index_comparator(data, split_dim, n_features);
        std::nth_element(
            node_indices,
            node_indices + split_index,
            node_indices + n_points,
            index_comparator);
    }
    """
    # Cython-visible declaration of the templated C++ helper defined
    # verbatim above; `except +` converts C++ exceptions to Python ones.
    void partition_node_indices_inner[D, I](
        const D *data,
        I *node_indices,
        I split_dim,
        I split_index,
        I n_features,
        I n_points) except +
|
||||
|
||||
|
||||
cdef int partition_node_indices(
        const floating *data,
        intp_t *node_indices,
        intp_t split_dim,
        intp_t split_index,
        intp_t n_features,
        intp_t n_points) except -1:
    """Partition points in the node into two equal-sized groups.

    Upon return, the values in node_indices will be rearranged such that
    (assuming numpy-style indexing):

        data[node_indices[0:split_index], split_dim]
          <= data[node_indices[split_index], split_dim]

    and

        data[node_indices[split_index], split_dim]
          <= data[node_indices[split_index:n_points], split_dim]

    The algorithm is essentially a partial in-place quicksort around a
    set pivot.

    Parameters
    ----------
    data : double pointer
        Pointer to a 2D array of the training data, of shape [N, n_features].
        N must be greater than any of the values in node_indices.
    node_indices : int pointer
        Pointer to a 1D array of length n_points. This lists the indices of
        each of the points within the current node. This will be modified
        in-place.
    split_dim : int
        the dimension on which to split. This will usually be computed via
        the routine ``find_node_split_dim``.
    split_index : int
        the index within node_indices around which to split the points.
    n_features: int
        the number of features (i.e columns) in the 2D array pointed by data.
    n_points : int
        the length of node_indices. This is also the number of points in
        the original dataset.
    Returns
    -------
    status : int
        integer exit status. On return, the contents of node_indices are
        modified as noted above.
    """
    # Delegates the partial sort to std::nth_element via the C++ wrapper
    # declared above.
    partition_node_indices_inner(
        data,
        node_indices,
        split_dim,
        split_index,
        n_features,
        n_points)
    return 0
|
||||
Binary file not shown.
@@ -0,0 +1,90 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
# See quad_tree.pyx for details.
|
||||
|
||||
cimport numpy as cnp
|
||||
from sklearn.utils._typedefs cimport float32_t, intp_t
|
||||
|
||||
# This is effectively an ifdef statement in Cython.
# It allows us to write printf debugging lines and remove them at
# compile time (branches on a compile-time constant are eliminated).
cdef enum:
    DEBUGFLAG = 0
|
||||
|
||||
# XXX: Careful to not change the order of the arguments. It is important to
# have is_leaf and max_width consecutive as it permits to avoid padding by
# the compiler and keep the size coherent for both C and numpy data structures.
cdef struct Cell:
    # Base storage structure for cells in a QuadTree object

    # Tree structure
    intp_t parent              # Parent cell of this cell
    intp_t[8] children         # Array pointing to children of this cell

    # Cell description
    intp_t cell_id             # Id of the cell in the cells array in the Tree
    intp_t point_index         # Index of the point at this cell (only defined
    #                          # in non empty leaf)
    bint is_leaf               # Does this cell have children?
    float32_t squared_max_width  # Squared value of the maximum width w
    intp_t depth               # Depth of the cell in the tree
    intp_t cumulative_size     # Number of points included in the subtree with
    #                          # this cell as a root.

    # Internal constants
    float32_t[3] center        # Store the center for quick split of cells
    float32_t[3] barycenter    # Keep track of the center of mass of the cell

    # Cell boundaries
    float32_t[3] min_bounds    # Inferior boundaries of this cell (inclusive)
    float32_t[3] max_bounds    # Superior boundaries of this cell (exclusive)
|
||||
|
||||
|
||||
cdef class _QuadTree:
    # The QuadTree object is a quad tree structure constructed by inserting
    # points recursively in the tree and splitting cells in 4 so that each
    # leaf cell contains at most one point.
    # This structure also handles 3D data, inserted in trees with 8 children
    # for each node.

    # Parameters of the tree
    cdef public int n_dimensions         # Number of dimensions in X
    cdef public int verbose              # Verbosity of the output
    cdef intp_t n_cells_per_cell         # Number of children per node. (2 ** n_dimension)

    # Tree inner structure
    cdef public intp_t max_depth         # Max depth of the tree
    cdef public intp_t cell_count        # Counter for node IDs
    cdef public intp_t capacity          # Capacity of tree, in terms of nodes
    cdef public intp_t n_points          # Total number of points
    cdef Cell* cells                     # Array of nodes

    # Point insertion methods
    cdef int insert_point(self, float32_t[3] point, intp_t point_index,
                          intp_t cell_id=*) except -1 nogil
    cdef intp_t _insert_point_in_new_child(self, float32_t[3] point, Cell* cell,
                                           intp_t point_index, intp_t size=*
                                           ) noexcept nogil
    cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil
    cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil

    # Create a summary of the Tree compared to a query point
    cdef long summarize(self, float32_t[3] point, float32_t* results,
                        float squared_theta=*, intp_t cell_id=*, long idx=*
                        ) noexcept nogil

    # Internal cell initialization methods
    cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil
    cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds
                         ) noexcept nogil

    # Private methods
    cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell
                                  ) except -1 nogil

    # Private array manipulation to manage the ``cells`` array
    cdef int _resize(self, intp_t capacity) except -1 nogil
    cdef int _resize_c(self, intp_t capacity=*) except -1 nogil
    cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=*) except -1 nogil
    cdef Cell[:] _get_cell_ndarray(self)
||||
@@ -0,0 +1,611 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
|
||||
from cpython cimport Py_INCREF, PyObject, PyTypeObject
|
||||
|
||||
from libc.math cimport fabsf
|
||||
from libc.stdlib cimport free
|
||||
from libc.string cimport memcpy
|
||||
from libc.stdio cimport printf
|
||||
from libc.stdint cimport SIZE_MAX
|
||||
|
||||
from sklearn.tree._utils cimport safe_realloc
|
||||
|
||||
import numpy as np
|
||||
cimport numpy as cnp
|
||||
cnp.import_array()
|
||||
|
||||
cdef extern from "numpy/arrayobject.h":
    # Low-level NumPy C-API constructors used by `_get_cell_ndarray` to wrap
    # the raw `cells` buffer as a structured array without copying it.
    object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr,
                                int nd, cnp.npy_intp* dims,
                                cnp.npy_intp* strides,
                                void* data, int flags, object obj)
    int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj)

# Build the corresponding numpy dtype for Cell.
# This works by casting `dummy` to an array of Cell of length 1, which numpy
# can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946
# for a more detailed explanation.
cdef Cell dummy
CELL_DTYPE = np.asarray(<Cell[:1]>(&dummy)).dtype

# Sanity check: the inferred structured dtype must map one-to-one onto the
# C struct layout, otherwise pickling/array views would be corrupted.
assert CELL_DTYPE.itemsize == sizeof(Cell)

# Tolerance under which two coordinates are considered equal, used to detect
# duplicate points that would otherwise make the tree grow without bound.
cdef const float EPSILON = 1e-6
|
||||
|
||||
|
||||
cdef class _QuadTree:
    """Array-based representation of a QuadTree.

    This class is currently working for indexing 2D data (regular QuadTree) and
    for indexing 3D data (OcTree). It is planned to split the 2 implementations
    using `Cython.Tempita` to save some memory for QuadTree.

    Note that this code is currently internally used only by the Barnes-Hut
    method in `sklearn.manifold.TSNE`. It is planned to be refactored and
    generalized in the future to be compatible with nearest neighbors API of
    `sklearn.neighbors` with 2D and 3D data.
    """
    def __cinit__(self, int n_dimensions, int verbose):
        """Constructor."""
        # Parameters of the tree
        self.n_dimensions = n_dimensions
        self.verbose = verbose
        # 4 children per node in 2D (quad tree), 8 in 3D (oct tree).
        self.n_cells_per_cell = <int> (2 ** self.n_dimensions)

        # Inner structures: the tree starts empty; `cells` is allocated
        # lazily by `_resize` during `build_tree`/`__setstate__`.
        self.max_depth = 0
        self.cell_count = 0
        self.capacity = 0
        self.n_points = 0
        self.cells = NULL
|
||||
|
||||
    def __dealloc__(self):
        """Destructor."""
        # Free all inner structures. `free(NULL)` is a no-op, so this is safe
        # even if no cell was ever allocated.
        free(self.cells)
|
||||
|
||||
    @property
    def cumulative_size(self):
        # Per-cell subtree sizes as a numpy array of length `cell_count`.
        # `.base` is the structured ndarray wrapping the raw `cells` buffer.
        cdef Cell[:] cell_mem_view = self._get_cell_ndarray()
        return cell_mem_view.base['cumulative_size'][:self.cell_count]
|
||||
|
||||
    @property
    def leafs(self):
        # Boolean flag per cell: True for leaf cells, False for inner cells.
        cdef Cell[:] cell_mem_view = self._get_cell_ndarray()
        return cell_mem_view.base['is_leaf'][:self.cell_count]
|
||||
|
||||
    def build_tree(self, X):
        """Build a tree from an array of points X."""
        cdef:
            int i
            float32_t[3] pt
            float32_t[3] min_bounds, max_bounds

        # validate X and prepare for query
        # X = check_array(X, dtype=float32_t, order='C')
        n_samples = X.shape[0]

        # Start from a modest capacity; `_resize` doubles it on demand during
        # insertion and we shrink back to the exact count at the end.
        capacity = 100
        self._resize(capacity)
        m = np.min(X, axis=0)
        M = np.max(X, axis=0)
        # Scale the maximum to get all points strictly in the tree bounding box
        # The 3 bounds are for positive, negative and small values
        M = np.maximum(M * (1. + 1e-3 * np.sign(M)), M + 1e-3)
        for i in range(self.n_dimensions):
            min_bounds[i] = m[i]
            max_bounds[i] = M[i]

            if self.verbose > 10:
                printf("[QuadTree] bounding box axis %i : [%f, %f]\n",
                       i, min_bounds[i], max_bounds[i])

        # Create the initial node with boundaries from the dataset
        self._init_root(min_bounds, max_bounds)

        # Insert every sample; `pt` is a fixed-size C buffer so only the
        # first `n_dimensions` entries are meaningful.
        for i in range(n_samples):
            for j in range(self.n_dimensions):
                pt[j] = X[i, j]
            self.insert_point(pt, i)

        # Shrink the cells array to reduce memory usage
        self._resize(capacity=self.cell_count)
|
||||
|
||||
    cdef int insert_point(self, float32_t[3] point, intp_t point_index,
                          intp_t cell_id=0) except -1 nogil:
        """Insert a point in the QuadTree.

        Recursively descends from `cell_id` (the root by default) and returns
        the id of the cell the point ends up in.
        """
        cdef int ax
        cdef intp_t selected_child
        cdef Cell* cell = &self.cells[cell_id]
        cdef intp_t n_point = cell.cumulative_size

        if self.verbose > 10:
            printf("[QuadTree] Inserting depth %li\n", cell.depth)

        # Assert that the point is in the right range
        if DEBUGFLAG:
            self._check_point_in_cell(point, cell)

        # If the cell is an empty leaf, insert the point in it
        if cell.cumulative_size == 0:
            cell.cumulative_size = 1
            self.n_points += 1
            for i in range(self.n_dimensions):
                cell.barycenter[i] = point[i]
            cell.point_index = point_index
            if self.verbose > 10:
                printf("[QuadTree] inserted point %li in cell %li\n",
                       point_index, cell_id)
            return cell_id

        # If the cell is not a leaf, update cell internals and
        # recurse in selected child
        if not cell.is_leaf:
            for ax in range(self.n_dimensions):
                # barycenter update using a weighted mean
                cell.barycenter[ax] = (
                    n_point * cell.barycenter[ax] + point[ax]) / (n_point + 1)

            # Increase the size of the subtree starting from this cell
            cell.cumulative_size += 1

            # Insert child in the correct subtree
            selected_child = self._select_child(point, cell)
            if self.verbose > 49:
                printf("[QuadTree] selected child %li\n", selected_child)
            if selected_child == -1:
                # The target quadrant has no child yet: create one holding
                # only this point.
                self.n_points += 1
                return self._insert_point_in_new_child(point, cell, point_index)
            return self.insert_point(point, point_index, selected_child)

        # Finally, if the cell is a leaf with a point already inserted,
        # split the cell in n_cells_per_cell if the point is not a duplicate.
        # If it is a duplicate, increase the size of the leaf and return.
        if self._is_duplicate(point, cell.barycenter):
            if self.verbose > 10:
                printf("[QuadTree] found a duplicate!\n")
            cell.cumulative_size += 1
            self.n_points += 1
            return cell_id

        # In a leaf, the barycenter correspond to the only point included
        # in it.
        # Push the existing point down into a new child, then retry inserting
        # the new point in this (now internal) cell.
        self._insert_point_in_new_child(cell.barycenter, cell, cell.point_index,
                                        cell.cumulative_size)
        return self.insert_point(point, point_index, cell_id)
|
||||
|
||||
    # XXX: This operation is not Thread safe
    cdef intp_t _insert_point_in_new_child(
        self, float32_t[3] point, Cell* cell, intp_t point_index, intp_t size=1
    ) noexcept nogil:
        """Create a child of cell which will contain point.

        Returns the id of the newly created child cell. `size` accounts for
        duplicated points collapsed into a single leaf.
        """

        # Local variable definition
        cdef:
            intp_t cell_id, cell_child_id, parent_id
            float32_t[3] save_point
            float32_t width
            Cell* child
            int i

        # If the maximal capacity of the Tree have been reached, double the capacity
        # We need to save the current cell id and the current point to retrieve them
        # in case the reallocation
        # (safe_realloc may move the cells buffer, invalidating `cell` and
        # possibly `point` if it aliases a barycenter inside that buffer).
        if self.cell_count + 1 > self.capacity:
            parent_id = cell.cell_id
            for i in range(self.n_dimensions):
                save_point[i] = point[i]
            self._resize(SIZE_MAX)
            cell = &self.cells[parent_id]
            point = save_point

        # Get an empty cell and initialize it
        cell_id = self.cell_count
        self.cell_count += 1
        child = &self.cells[cell_id]

        self._init_cell(child, cell.cell_id, cell.depth + 1)
        child.cell_id = cell_id

        # Set the cell as an inner cell of the Tree
        cell.is_leaf = False
        cell.point_index = -1

        # Set the correct boundary for the cell, store the point in the cell
        # and compute its index in the children array.
        cell_child_id = 0
        for i in range(self.n_dimensions):
            # Build the child index bitwise: bit i set when the point is at
            # or above the parent's center along axis i.
            cell_child_id *= 2
            if point[i] >= cell.center[i]:
                cell_child_id += 1
                child.min_bounds[i] = cell.center[i]
                child.max_bounds[i] = cell.max_bounds[i]
            else:
                child.min_bounds[i] = cell.min_bounds[i]
                child.max_bounds[i] = cell.center[i]
            child.center[i] = (child.min_bounds[i] + child.max_bounds[i]) / 2.
            width = child.max_bounds[i] - child.min_bounds[i]

            child.barycenter[i] = point[i]
            child.squared_max_width = max(child.squared_max_width, width*width)

        # Store the point info and the size to account for duplicated points
        child.point_index = point_index
        child.cumulative_size = size

        # Store the child cell in the correct place in children
        cell.children[cell_child_id] = child.cell_id

        if DEBUGFLAG:
            # Assert that the point is in the right range
            self._check_point_in_cell(point, child)
        if self.verbose > 10:
            printf("[QuadTree] inserted point %li in new child %li\n",
                   point_index, cell_id)

        return cell_id
|
||||
|
||||
cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil:
|
||||
"""Check if the two given points are equals."""
|
||||
cdef int i
|
||||
cdef bint res = True
|
||||
for i in range(self.n_dimensions):
|
||||
# Use EPSILON to avoid numerical error that would overgrow the tree
|
||||
res &= fabsf(point1[i] - point2[i]) <= EPSILON
|
||||
return res
|
||||
|
||||
    cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil:
        """Select the child of cell which contains the given query point.

        Returns the child's cell id, or the sentinel value stored in
        `cell.children` when that child has not been created yet.
        """
        cdef:
            int i
            intp_t selected_child = 0

        for i in range(self.n_dimensions):
            # Select the correct child cell to insert the point by comparing
            # it to the borders of the cells using precomputed center.
            # The index is built bitwise: bit i set when the point lies at or
            # above the center along axis i (mirrors _insert_point_in_new_child).
            selected_child *= 2
            if point[i] >= cell.center[i]:
                selected_child += 1
        return cell.children[selected_child]
|
||||
|
||||
    cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil:
        """Initialize a cell structure with some constants."""
        cell.parent = parent
        cell.is_leaf = True
        cell.depth = depth
        cell.squared_max_width = 0
        cell.cumulative_size = 0
        # Mark all children as "not created yet".
        # NOTE(review): SIZE_MAX is compared against -1 elsewhere
        # (insert_point, summarize) — presumably `children` holds a signed
        # type so the value wraps to -1; confirm against the Cell declaration.
        for i in range(self.n_cells_per_cell):
            cell.children[i] = SIZE_MAX
|
||||
|
||||
    cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds
                         ) noexcept nogil:
        """Initialize the root node with the given space boundaries"""
        cdef:
            int i
            float32_t width
            Cell* root = &self.cells[0]

        # Root has no parent (-1) and depth 0.
        self._init_cell(root, -1, 0)
        for i in range(self.n_dimensions):
            root.min_bounds[i] = min_bounds[i]
            root.max_bounds[i] = max_bounds[i]
            root.center[i] = (max_bounds[i] + min_bounds[i]) / 2.
            width = max_bounds[i] - min_bounds[i]

            # Track the largest squared axis width, used by `summarize` for
            # the Barnes-Hut opening criterion.
            root.squared_max_width = max(root.squared_max_width, width*width)
        root.cell_id = 0

        self.cell_count += 1
|
||||
|
||||
    cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell
                                  ) except -1 nogil:
        """Check that the given point is in the cell boundaries.

        Raises ValueError (reacquiring the GIL) when the point falls outside
        the half-open box [min_bounds, max_bounds).
        """

        if self.verbose >= 50:
            if self.n_dimensions == 3:
                printf("[QuadTree] Checking point (%f, %f, %f) in cell %li "
                       "([%f/%f, %f/%f, %f/%f], size %li)\n",
                       point[0], point[1], point[2], cell.cell_id,
                       cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1],
                       cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2],
                       cell.cumulative_size)
            else:
                printf("[QuadTree] Checking point (%f, %f) in cell %li "
                       "([%f/%f, %f/%f], size %li)\n",
                       point[0], point[1], cell.cell_id, cell.min_bounds[0],
                       cell.max_bounds[0], cell.min_bounds[1],
                       cell.max_bounds[1], cell.cumulative_size)

        for i in range(self.n_dimensions):
            # Lower bound inclusive, upper bound exclusive (matches the
            # bounding-box scaling done in `build_tree`).
            if (cell.min_bounds[i] > point[i] or
                    cell.max_bounds[i] <= point[i]):
                with gil:
                    msg = "[QuadTree] InsertionError: point out of cell "
                    msg += "boundary.\nAxis %li: cell [%f, %f]; point %f\n"

                    msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i]
                    raise ValueError(msg)
|
||||
|
||||
    def _check_coherence(self):
        """Check the coherence of the cells of the tree.

        Check that the info stored in each cell is compatible with the info
        stored in descendent and sibling cells. Raise a ValueError if this
        fails.
        """
        for cell in self.cells[:self.cell_count]:
            # Check that the barycenter of inserted point is within the cell
            # boundaries
            self._check_point_in_cell(cell.barycenter, &cell)

            if not cell.is_leaf:
                # Compute the number of point in children and compare with
                # its cummulative_size.
                n_points = 0
                for idx in range(self.n_cells_per_cell):
                    # -1 is the "no child" sentinel (see _init_cell).
                    child_id = cell.children[idx]
                    if child_id != -1:
                        child = self.cells[child_id]
                        n_points += child.cumulative_size
                        assert child.cell_id == child_id, (
                            "Cell id not correctly initialized.")
                if n_points != cell.cumulative_size:
                    raise ValueError(
                        "Cell {} is incoherent. Size={} but found {} points "
                        "in children. ({})"
                        .format(cell.cell_id, cell.cumulative_size,
                                n_points, cell.children))

        # Make sure that the number of point in the tree correspond to the
        # cumulative size in root cell.
        if self.n_points != self.cells[0].cumulative_size:
            raise ValueError(
                "QuadTree is incoherent. Size={} but found {} points "
                "in children."
                .format(self.n_points, self.cells[0].cumulative_size))
|
||||
|
||||
    cdef long summarize(self, float32_t[3] point, float32_t* results,
                        float squared_theta=.5, intp_t cell_id=0, long idx=0
                        ) noexcept nogil:
        """Summarize the tree compared to a query point.

        Input arguments
        ---------------
        point : array (n_dimensions)
            query point to construct the summary.
        cell_id : integer, optional (default: 0)
            current cell of the tree summarized. This should be set to 0 for
            external calls.
        idx : integer, optional (default: 0)
            current index in the result array. This should be set to 0 for
            external calls
        squared_theta: float, optional (default: .5)
            threshold to decide whether the node is sufficiently far
            from the query point to be a good summary. The formula is such that
            the node is a summary if
            node_width^2 / dist_node_point^2 < squared_theta.
            Note that the argument should be passed as theta^2 to avoid
            computing square roots of the distances.

        Output arguments
        ----------------
        results : array (n_samples * (n_dimensions+2))
            result will contain a summary of the tree information compared to
            the query point:
            - results[idx:idx+n_dimensions] contains the coordinate-wise
                difference between the query point and the summary cell idx.
                This is useful in t-SNE to compute the negative forces.
            - result[idx+n_dimensions] contains the squared euclidean
                distance to the summary cell idx.
            - result[idx+n_dimensions+1] contains the number of point of the
                tree contained in the summary cell idx.

        Return
        ------
        idx : integer
            number of elements in the results array.
        """
        cdef:
            int i, idx_d = idx + self.n_dimensions
            bint duplicate = True
            Cell* cell = &self.cells[cell_id]

        # Accumulate the squared distance at slot idx_d while writing the
        # per-axis differences.
        results[idx_d] = 0.
        for i in range(self.n_dimensions):
            results[idx + i] = point[i] - cell.barycenter[i]
            results[idx_d] += results[idx + i] * results[idx + i]
            duplicate &= fabsf(results[idx + i]) <= EPSILON

        # Do not compute self interactions
        if duplicate and cell.is_leaf:
            return idx

        # Check whether we can use this node as a summary
        # It's a summary node if the angular size as measured from the point
        # is relatively small (w.r.t. theta) or if it is a leaf node.
        # If it can be summarized, we use the cell center of mass
        # Otherwise, we go a higher level of resolution and into the leaves.
        if cell.is_leaf or (
                (cell.squared_max_width / results[idx_d]) < squared_theta):
            results[idx_d + 1] = <float32_t> cell.cumulative_size
            return idx + self.n_dimensions + 2

        else:
            # Recursively compute the summary in nodes
            for c in range(self.n_cells_per_cell):
                # -1 is the "no child" sentinel (see _init_cell).
                child_id = cell.children[c]
                if child_id != -1:
                    idx = self.summarize(point, results, squared_theta,
                                         child_id, idx)

        return idx
|
||||
|
||||
def get_cell(self, point):
|
||||
"""return the id of the cell containing the query point or raise
|
||||
ValueError if the point is not in the tree
|
||||
"""
|
||||
cdef float32_t[3] query_pt
|
||||
cdef int i
|
||||
|
||||
assert len(point) == self.n_dimensions, (
|
||||
"Query point should be a point in dimension {}."
|
||||
.format(self.n_dimensions))
|
||||
|
||||
for i in range(self.n_dimensions):
|
||||
query_pt[i] = point[i]
|
||||
|
||||
return self._get_cell(query_pt, 0)
|
||||
|
||||
    cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=0
                       ) except -1 nogil:
        """guts of get_cell.

        Return the id of the cell containing the query point or raise ValueError
        if the point is not in the tree"""
        cdef:
            intp_t selected_child
            Cell* cell = &self.cells[cell_id]

        if cell.is_leaf:
            # A leaf matches only when its stored point coincides with the
            # query (up to EPSILON).
            if self._is_duplicate(cell.barycenter, point):
                if self.verbose > 99:
                    printf("[QuadTree] Found point in cell: %li\n",
                           cell.cell_id)
                return cell_id
            with gil:
                raise ValueError("Query point not in the Tree.")

        # Cell 0 is always the root, so any real child id is > 0; a
        # non-positive value means the target quadrant is empty.
        selected_child = self._select_child(point, cell)
        if selected_child > 0:
            if self.verbose > 99:
                printf("[QuadTree] Selected_child: %li\n", selected_child)
            return self._get_cell(point, selected_child)
        with gil:
            raise ValueError("Query point not in the Tree.")
|
||||
|
||||
# Pickling primitives
|
||||
|
||||
    def __reduce__(self):
        """Reduce re-implementation, for pickling."""
        # Reconstruct via the constructor, then restore the cells through
        # __setstate__.
        return (_QuadTree, (self.n_dimensions, self.verbose), self.__getstate__())
|
||||
|
||||
def __getstate__(self):
|
||||
"""Getstate re-implementation, for pickling."""
|
||||
d = {}
|
||||
# capacity is inferred during the __setstate__ using nodes
|
||||
d["max_depth"] = self.max_depth
|
||||
d["cell_count"] = self.cell_count
|
||||
d["capacity"] = self.capacity
|
||||
d["n_points"] = self.n_points
|
||||
d["cells"] = self._get_cell_ndarray().base
|
||||
return d
|
||||
|
||||
    def __setstate__(self, d):
        """Setstate re-implementation, for unpickling."""
        self.max_depth = d["max_depth"]
        self.cell_count = d["cell_count"]
        self.capacity = d["capacity"]
        self.n_points = d["n_points"]

        if 'cells' not in d:
            raise ValueError('You have loaded Tree version which '
                             'cannot be imported')

        cell_ndarray = d['cells']

        # Only accept the exact layout produced by __getstate__.
        if (cell_ndarray.ndim != 1 or
                cell_ndarray.dtype != CELL_DTYPE or
                not cell_ndarray.flags.c_contiguous):
            raise ValueError('Did not recognise loaded array layout')

        # The stored array length is authoritative for the capacity.
        self.capacity = cell_ndarray.shape[0]
        if self._resize_c(self.capacity) != 0:
            raise MemoryError("resizing tree to %d" % self.capacity)

        # Copy the pickled cells into the freshly allocated buffer.
        cdef Cell[:] cell_mem_view = cell_ndarray
        memcpy(
            pto=self.cells,
            pfrom=&cell_mem_view[0],
            size=self.capacity * sizeof(Cell),
        )
|
||||
|
||||
# Array manipulation methods, to convert it to numpy or to resize
|
||||
# self.cells array
|
||||
|
||||
    cdef Cell[:] _get_cell_ndarray(self):
        """Wraps nodes as a NumPy struct array.

        The array keeps a reference to this Tree, which manages the underlying
        memory. Individual fields are publicly accessible as properties of the
        Tree.
        """
        cdef cnp.npy_intp shape[1]
        shape[0] = <cnp.npy_intp> self.cell_count
        cdef cnp.npy_intp strides[1]
        strides[0] = sizeof(Cell)
        cdef Cell[:] arr
        # PyArray_NewFromDescr steals a reference to the dtype, hence the
        # explicit INCREF beforehand.
        Py_INCREF(CELL_DTYPE)
        arr = PyArray_NewFromDescr(
            subtype=<PyTypeObject *> np.ndarray,
            descr=CELL_DTYPE,
            nd=1,
            dims=shape,
            strides=strides,
            data=<void*> self.cells,
            flags=cnp.NPY_ARRAY_DEFAULT,
            obj=None,
        )
        # Make the ndarray keep `self` alive so the raw buffer outlives the
        # view; PyArray_SetBaseObject steals the reference INCREF'd above.
        Py_INCREF(self)
        if PyArray_SetBaseObject(arr.base, <PyObject*> self) < 0:
            raise ValueError("Can't initialize array!")
        return arr
|
||||
|
||||
    cdef int _resize(self, intp_t capacity) except -1 nogil:
        """Resize all inner arrays to `capacity`; if `capacity` == SIZE_MAX,
        then double the size of the inner arrays (see _resize_c).

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.
        """
        if self._resize_c(capacity) != 0:
            # Acquire gil only if we need to raise
            with gil:
                raise MemoryError()
|
||||
|
||||
    cdef int _resize_c(self, intp_t capacity=SIZE_MAX) except -1 nogil:
        """Guts of _resize

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.
        """
        # Nothing to do when the buffer already has the requested size.
        if capacity == self.capacity and self.cells != NULL:
            return 0

        # SIZE_MAX is the "grow automatically" sentinel: start small, then
        # double on each subsequent call.
        if <size_t> capacity == SIZE_MAX:
            if self.capacity == 0:
                capacity = 9  # default initial value to min
            else:
                capacity = 2 * self.capacity

        safe_realloc(&self.cells, capacity)

        # if capacity smaller than cell_count, adjust the counter
        if capacity < self.cell_count:
            self.cell_count = capacity

        self.capacity = capacity
        return 0
|
||||
|
||||
    def _py_summarize(self, float32_t[:] query_pt, float32_t[:, :] X, float angle):
        # Used for testing summarize
        cdef:
            float32_t[:] summary
            int n_samples

        n_samples = X.shape[0]
        # 4 floats per sample = n_dimensions + 2 slots written by summarize.
        # NOTE(review): this assumes n_dimensions == 2 — confirm for 3D use.
        summary = np.empty(4 * n_samples, dtype=np.float32)

        # `summarize` takes theta^2, hence angle * angle.
        idx = self.summarize(&query_pt[0], &summary[0], angle * angle)
        return idx, summary
|
||||
@@ -0,0 +1,518 @@
|
||||
"""Nearest Neighbor Regression."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.base import RegressorMixin, _fit_context
|
||||
from sklearn.metrics import DistanceMetric
|
||||
from sklearn.neighbors._base import (
|
||||
KNeighborsMixin,
|
||||
NeighborsBase,
|
||||
RadiusNeighborsMixin,
|
||||
_get_weights,
|
||||
)
|
||||
from sklearn.utils._param_validation import StrOptions
|
||||
|
||||
|
||||
class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase):
|
||||
"""Regression based on k-nearest neighbors.
|
||||
|
||||
    The target is predicted by local interpolation of the targets
    associated with the nearest neighbors in the training set.
|
||||
|
||||
Read more in the :ref:`User Guide <regression>`.
|
||||
|
||||
.. versionadded:: 0.9
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_neighbors : int, default=5
|
||||
Number of neighbors to use by default for :meth:`kneighbors` queries.
|
||||
|
||||
weights : {'uniform', 'distance'}, callable or None, default='uniform'
|
||||
Weight function used in prediction. Possible values:
|
||||
|
||||
- 'uniform' : uniform weights. All points in each neighborhood
|
||||
are weighted equally.
|
||||
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
|
||||
greater influence than neighbors which are further away.
|
||||
- [callable] : a user-defined function which accepts an
|
||||
array of distances, and returns an array of the same shape
|
||||
containing the weights.
|
||||
|
||||
Uniform weights are used by default.
|
||||
|
||||
See the following example for a demonstration of the impact of
|
||||
different weighting schemes on predictions:
|
||||
:ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
p : float, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric : str, DistanceMetric object or callable, default='minkowski'
|
||||
Metric to use for distance computation. Default is "minkowski", which
|
||||
results in the standard Euclidean distance when p = 2. See the
|
||||
documentation of `scipy.spatial.distance
|
||||
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
|
||||
the metrics listed in
|
||||
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
|
||||
values.
|
||||
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`, in which
|
||||
case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
If metric is a callable function, it takes two arrays representing 1D
|
||||
vectors as inputs and must return one value indicating the distance
|
||||
between those vectors. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
If metric is a DistanceMetric object, it will be passed directly to
|
||||
the underlying computation routines.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
Doesn't affect :meth:`fit` method.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
effective_metric_ : str or callable
|
||||
The distance metric to use. It will be same as the `metric` parameter
|
||||
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
|
||||
'minkowski' and `p` parameter set to 2.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Additional keyword arguments for the metric function. For most metrics
|
||||
will be same with `metric_params` parameter, but may also contain the
|
||||
`p` parameter value if the `effective_metric_` attribute is set to
|
||||
'minkowski'.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_samples_fit_ : int
|
||||
Number of samples in the fitted data.
|
||||
|
||||
See Also
|
||||
--------
|
||||
NearestNeighbors : Unsupervised learner for implementing neighbor searches.
|
||||
RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius.
|
||||
KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote.
|
||||
RadiusNeighborsClassifier : Classifier implementing
|
||||
a vote among neighbors within a given radius.
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
.. warning::
|
||||
|
||||
Regarding the Nearest Neighbors algorithms, if it is found that two
|
||||
neighbors, neighbor `k+1` and `k`, have identical distances but
|
||||
different labels, the results will depend on the ordering of the
|
||||
training data.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [1], [2], [3]]
|
||||
>>> y = [0, 0, 1, 1]
|
||||
>>> from sklearn.neighbors import KNeighborsRegressor
|
||||
>>> neigh = KNeighborsRegressor(n_neighbors=2)
|
||||
>>> neigh.fit(X, y)
|
||||
KNeighborsRegressor(...)
|
||||
>>> print(neigh.predict([[1.5]]))
|
||||
[0.5]
|
||||
"""
|
||||
|
||||
    _parameter_constraints: dict = {
        **NeighborsBase._parameter_constraints,
        "weights": [StrOptions({"uniform", "distance"}), callable, None],
    }
    # Also accept pre-built DistanceMetric objects, not only metric names.
    _parameter_constraints["metric"].append(DistanceMetric)
    # `radius` is only meaningful for radius-based estimators.
    _parameter_constraints.pop("radius")
|
||||
|
||||
    def __init__(
        self,
        n_neighbors=5,
        *,
        weights="uniform",
        algorithm="auto",
        leaf_size=30,
        p=2,
        metric="minkowski",
        metric_params=None,
        n_jobs=None,
    ):
        # All neighbor-search parameters are handled by NeighborsBase; only
        # `weights` is specific to the regressor.
        super().__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
        )
        self.weights = weights
|
||||
|
||||
    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        # For cross-validation routines to split data correctly
        tags.input_tags.pairwise = self.metric == "precomputed"
        return tags
|
||||
|
||||
    @_fit_context(
        # KNeighborsRegressor.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y):
        """Fit the k-nearest neighbors regressor from the training dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples) if metric='precomputed'
            Training data.

        y : {array-like, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_outputs)
            Target values.

        Returns
        -------
        self : KNeighborsRegressor
            The fitted k-nearest neighbors regressor.
        """
        # All validation and index building is delegated to NeighborsBase.
        return self._fit(X, y)
|
||||
|
||||
def predict(self, X):
    """Predict the target for the provided data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_queries, n_features), \
            or (n_queries, n_indexed) if metric == 'precomputed', or None
        Test samples. If `None`, predictions for all indexed points are
        returned; in this case, points are not considered their own
        neighbors.

    Returns
    -------
    y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \
            dtype=double
        Target values.
    """
    # NOTE: the return dtype documented above was previously "int", which
    # was copied from the classifier; a regressor returns floating-point
    # averages of the training targets.
    if self.weights == "uniform":
        # In that case, we do not need the distances to perform
        # the weighting so we do not compute them.
        neigh_ind = self.kneighbors(X, return_distance=False)
        neigh_dist = None
    else:
        neigh_dist, neigh_ind = self.kneighbors(X)

    # None for uniform weighting, otherwise one weight per neighbor.
    weights = _get_weights(neigh_dist, self.weights)

    _y = self._y
    # Work on a 2D view so single-output and multi-output targets share
    # one code path; the original 1D shape is restored at the end.
    if _y.ndim == 1:
        _y = _y.reshape((-1, 1))

    if weights is None:
        # Unweighted mean over the k neighbors of each query.
        y_pred = np.mean(_y[neigh_ind], axis=1)
    else:
        y_pred = np.empty((neigh_dist.shape[0], _y.shape[1]), dtype=np.float64)
        denom = np.sum(weights, axis=1)

        # Weighted average, computed one output column at a time.
        for j in range(_y.shape[1]):
            num = np.sum(_y[neigh_ind, j] * weights, axis=1)
            y_pred[:, j] = num / denom

    if self._y.ndim == 1:
        y_pred = y_pred.ravel()

    return y_pred
|
||||
|
||||
|
||||
class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBase):
|
||||
"""Regression based on neighbors within a fixed radius.
|
||||
|
||||
The target is predicted by local interpolation of the targets
|
||||
associated with the nearest neighbors in the training set.
|
||||
|
||||
Read more in the :ref:`User Guide <regression>`.
|
||||
|
||||
.. versionadded:: 0.9
|
||||
|
||||
Parameters
|
||||
----------
|
||||
radius : float, default=1.0
|
||||
Range of parameter space to use by default for :meth:`radius_neighbors`
|
||||
queries.
|
||||
|
||||
weights : {'uniform', 'distance'}, callable or None, default='uniform'
|
||||
Weight function used in prediction. Possible values:
|
||||
|
||||
- 'uniform' : uniform weights. All points in each neighborhood
|
||||
are weighted equally.
|
||||
- 'distance' : weight points by the inverse of their distance.
|
||||
In this case, closer neighbors of a query point will have a
|
||||
greater influence than neighbors which are further away.
|
||||
- [callable] : a user-defined function which accepts an
|
||||
array of distances, and returns an array of the same shape
|
||||
containing the weights.
|
||||
|
||||
Uniform weights are used by default.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
p : float, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
Metric to use for distance computation. Default is "minkowski", which
|
||||
results in the standard Euclidean distance when p = 2. See the
|
||||
documentation of `scipy.spatial.distance
|
||||
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
|
||||
the metrics listed in
|
||||
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
|
||||
values.
|
||||
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`, in which
|
||||
case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
If metric is a callable function, it takes two arrays representing 1D
|
||||
vectors as inputs and must return one value indicating the distance
|
||||
between those vectors. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
effective_metric_ : str or callable
|
||||
The distance metric to use. It will be same as the `metric` parameter
|
||||
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
|
||||
'minkowski' and `p` parameter set to 2.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Additional keyword arguments for the metric function. For most metrics
|
||||
will be same with `metric_params` parameter, but may also contain the
|
||||
`p` parameter value if the `effective_metric_` attribute is set to
|
||||
'minkowski'.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_samples_fit_ : int
|
||||
Number of samples in the fitted data.
|
||||
|
||||
See Also
|
||||
--------
|
||||
NearestNeighbors : Unsupervised learner for implementing neighbor searches.
|
||||
KNeighborsRegressor : Regression based on k-nearest neighbors.
|
||||
KNeighborsClassifier : Classifier based on the k-nearest neighbors.
|
||||
RadiusNeighborsClassifier : Classifier based on neighbors within a given radius.
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [1], [2], [3]]
|
||||
>>> y = [0, 0, 1, 1]
|
||||
>>> from sklearn.neighbors import RadiusNeighborsRegressor
|
||||
>>> neigh = RadiusNeighborsRegressor(radius=1.0)
|
||||
>>> neigh.fit(X, y)
|
||||
RadiusNeighborsRegressor(...)
|
||||
>>> print(neigh.predict([[1.5]]))
|
||||
[0.5]
|
||||
"""
|
||||
|
||||
# Parameter-validation table: reuse the shared NeighborsBase constraints
# and add the regressor-specific 'weights' option.
_parameter_constraints: dict = {
    **NeighborsBase._parameter_constraints,
    "weights": [StrOptions({"uniform", "distance"}), callable, None],
}
# A radius-based estimator has no `n_neighbors` parameter.
_parameter_constraints.pop("n_neighbors")
|
||||
|
||||
def __init__(
    self,
    radius=1.0,
    *,
    weights="uniform",
    algorithm="auto",
    leaf_size=30,
    p=2,
    metric="minkowski",
    metric_params=None,
    n_jobs=None,
):
    # Shared neighbor-search settings are stored by NeighborsBase;
    # validation is deferred to fit time, per scikit-learn convention.
    super().__init__(
        radius=radius,
        algorithm=algorithm,
        leaf_size=leaf_size,
        p=p,
        metric=metric,
        metric_params=metric_params,
        n_jobs=n_jobs,
    )
    # Weighting scheme used by predict(); specific to this estimator.
    self.weights = weights
|
||||
|
||||
@_fit_context(
    # RadiusNeighborsRegressor.metric is not validated yet
    prefer_skip_nested_validation=False
)
def fit(self, X, y):
    """Fit the radius neighbors regressor from the training dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples) if metric='precomputed'
        Training data.

    y : {array-like, sparse matrix} of shape (n_samples,) or \
            (n_samples, n_outputs)
        Target values.

    Returns
    -------
    self : RadiusNeighborsRegressor
        The fitted radius neighbors regressor.
    """
    # Input validation and neighbor-index construction are shared across
    # the neighbors estimators and live in NeighborsBase._fit.
    return self._fit(X, y)
|
||||
|
||||
def predict(self, X):
    """Predict the target for the provided data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_queries, n_features), \
            or (n_queries, n_indexed) if metric == 'precomputed', or None
        Test samples. If `None`, predictions for all indexed points are
        returned; in this case, points are not considered their own
        neighbors.

    Returns
    -------
    y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \
            dtype=double
        Target values.
    """
    distances, indices = self.radius_neighbors(X)

    # None for uniform weighting, otherwise per-neighbor weight arrays.
    sample_weights = _get_weights(distances, self.weights)

    targets = self._y
    # Use a 2D view so single- and multi-output targets share one path.
    if targets.ndim == 1:
        targets = targets.reshape((-1, 1))

    # Placeholder prediction for queries whose neighborhood is empty.
    nan_row = np.full_like(targets[0], np.nan)

    rows = []
    if sample_weights is None:
        for neighborhood in indices:
            if len(neighborhood):
                rows.append(np.mean(targets[neighborhood, :], axis=0))
            else:
                rows.append(nan_row)
    else:
        for query_idx, neighborhood in enumerate(indices):
            if len(neighborhood):
                rows.append(
                    np.average(
                        targets[neighborhood, :],
                        axis=0,
                        weights=sample_weights[query_idx],
                    )
                )
            else:
                rows.append(nan_row)
    y_pred = np.array(rows)

    # Any NaN row means at least one query had no neighbor in range.
    if np.any(np.isnan(y_pred)):
        empty_warning_msg = (
            "One or more samples have no neighbors "
            "within specified radius; predicting NaN."
        )
        warnings.warn(empty_warning_msg)

    # Restore the 1D shape for single-output problems.
    if self._y.ndim == 1:
        y_pred = y_pred.ravel()

    return y_pred
|
||||
@@ -0,0 +1,179 @@
|
||||
"""Unsupervised nearest neighbors learner"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from sklearn.base import _fit_context
|
||||
from sklearn.neighbors._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin
|
||||
|
||||
|
||||
class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase):
|
||||
"""Unsupervised learner for implementing neighbor searches.
|
||||
|
||||
Read more in the :ref:`User Guide <unsupervised_neighbors>`.
|
||||
|
||||
.. versionadded:: 0.9
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_neighbors : int, default=5
|
||||
Number of neighbors to use by default for :meth:`kneighbors` queries.
|
||||
|
||||
radius : float, default=1.0
|
||||
Range of parameter space to use by default for :meth:`radius_neighbors`
|
||||
queries.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
Metric to use for distance computation. Default is "minkowski", which
|
||||
results in the standard Euclidean distance when p = 2. See the
|
||||
documentation of `scipy.spatial.distance
|
||||
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
|
||||
the metrics listed in
|
||||
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
|
||||
values.
|
||||
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`, in which
|
||||
case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
If metric is a callable function, it takes two arrays representing 1D
|
||||
vectors as inputs and must return one value indicating the distance
|
||||
between those vectors. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
p : float (positive), default=2
|
||||
Parameter for the Minkowski metric from
|
||||
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
effective_metric_ : str
|
||||
Metric used to compute distances to neighbors.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Parameters for the metric used to compute distances to neighbors.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_samples_fit_ : int
|
||||
Number of samples in the fitted data.
|
||||
|
||||
See Also
|
||||
--------
|
||||
KNeighborsClassifier : Classifier implementing the k-nearest neighbors
|
||||
vote.
|
||||
RadiusNeighborsClassifier : Classifier implementing a vote among neighbors
|
||||
within a given radius.
|
||||
KNeighborsRegressor : Regression based on k-nearest neighbors.
|
||||
RadiusNeighborsRegressor : Regression based on neighbors within a fixed
|
||||
radius.
|
||||
BallTree : Space partitioning data structure for organizing points in a
|
||||
multi-dimensional space, used for nearest neighbor search.
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.neighbors import NearestNeighbors
|
||||
>>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]
|
||||
>>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
|
||||
>>> neigh.fit(samples)
|
||||
NearestNeighbors(...)
|
||||
>>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)
|
||||
array([[2, 0]]...)
|
||||
>>> nbrs = neigh.radius_neighbors(
|
||||
... [[0, 0, 1.3]], 0.4, return_distance=False
|
||||
... )
|
||||
>>> np.asarray(nbrs[0][0])
|
||||
array(2)
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    *,
    n_neighbors=5,
    radius=1.0,
    algorithm="auto",
    leaf_size=30,
    metric="minkowski",
    p=2,
    metric_params=None,
    n_jobs=None,
):
    # All parameters are shared neighbor-search settings; storing them is
    # delegated to NeighborsBase (validation is deferred to fit time).
    super().__init__(
        n_neighbors=n_neighbors,
        radius=radius,
        algorithm=algorithm,
        leaf_size=leaf_size,
        metric=metric,
        p=p,
        metric_params=metric_params,
        n_jobs=n_jobs,
    )
|
||||
|
||||
@_fit_context(
    # NearestNeighbors.metric is not validated yet
    prefer_skip_nested_validation=False
)
def fit(self, X, y=None):
    """Fit the nearest neighbors estimator from the training dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples) if metric='precomputed'
        Training data.

    y : Ignored
        Not used, present for API consistency by convention.

    Returns
    -------
    self : NearestNeighbors
        The fitted nearest neighbors estimator.
    """
    # Unsupervised: only X is indexed; NeighborsBase._fit does the work.
    return self._fit(X)
|
||||
@@ -0,0 +1,53 @@
|
||||
# Generate the shared _binary_tree.pxi include from its Tempita template.
_binary_tree_pxi = custom_target(
  '_binary_tree_pxi',
  output: '_binary_tree.pxi',
  input: '_binary_tree.pxi.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
)

# .pyx is generated so this is needed to make Cython compilation work. The pxi
# file is included to avoid "missing dependency paths" with ninja -t missingdeps
neighbors_cython_tree = [
  fs.copyfile('__init__.py'),
  fs.copyfile('_partition_nodes.pxd'),
  _binary_tree_pxi,
]

# Templated tree extensions: both are instantiated from .pyx.tp files.
name_list = ['_ball_tree', '_kd_tree']

foreach name: name_list
  pyx = custom_target(
    name + '_pyx',
    output: name + '.pyx',
    input: name + '.pyx.tp',
    command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
    # TODO in principle this should go in py.extension_module below. This is
    # temporary work-around for dependency issue with .pyx.tp files. For more
    # details, see https://github.com/mesonbuild/meson/issues/13212
    depends: [neighbors_cython_tree, utils_cython_tree, metrics_cython_tree],
  )
  py.extension_module(
    name,
    cython_gen.process(pyx),
    dependencies: [np_dep],
    subdir: 'sklearn/neighbors',
    install: true
  )
endforeach

# Plain (non-templated) Cython extensions and their per-module settings.
neighbors_extension_metadata = {
  '_partition_nodes':
    {'sources': [cython_gen_cpp.process('_partition_nodes.pyx')],
     'dependencies': [np_dep]},
  '_quad_tree': {'sources': [cython_gen.process('_quad_tree.pyx')], 'dependencies': [np_dep]},
}

foreach ext_name, ext_dict : neighbors_extension_metadata
  py.extension_module(
    ext_name,
    [ext_dict.get('sources'), utils_cython_tree],
    dependencies: ext_dict.get('dependencies'),
    subdir: 'sklearn/neighbors',
    install: true
  )
endforeach
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,200 @@
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal
|
||||
|
||||
from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import _convert_container
|
||||
from sklearn.utils.validation import check_array
|
||||
|
||||
# Module-level fixtures shared by the ball-tree tests.
rng = np.random.RandomState(10)
V_mahalanobis = rng.rand(3, 3)
# Symmetrize (V @ V.T) so the matrix is positive semi-definite, as the
# mahalanobis metric requires.
V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T)

DIMENSION = 3

# Metric name -> extra keyword arguments for DistanceMetric.get_metric.
METRICS = {
    "euclidean": {},
    "manhattan": {},
    "minkowski": dict(p=3),
    "chebyshev": {},
}

# Metrics meant for small-integer (discrete) inputs.
DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"]

# Metrics defined on boolean vectors.
BOOLEAN_METRICS = [
    "jaccard",
    "dice",
    "rogerstanimoto",
    "russellrao",
    "sokalmichener",
    "sokalsneath",
]

# The 64-bit and 32-bit ball-tree implementations under test.
BALL_TREE_CLASSES = [
    BallTree64,
    BallTree32,
]
|
||||
|
||||
|
||||
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Reference k-NN: exact neighbors of each row of Y within X.

    Computes the full pairwise distance matrix and sorts it, serving as
    the ground truth the tree-based queries are compared against.
    """
    from sklearn.metrics import DistanceMetric

    X, Y = check_array(X), check_array(Y)
    pairwise_dist = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
    # Indices of the k closest training points for every query row.
    neighbor_idx = np.argsort(pairwise_dist, axis=1)[:, :k]
    query_rows = np.arange(Y.shape[0])[:, None]
    neighbor_dist = pairwise_dist[query_rows, neighbor_idx]
    return neighbor_dist, neighbor_idx
|
||||
|
||||
|
||||
def test_BallTree_is_BallTree64_subclass():
    # The public BallTree name must remain a subclass of the 64-bit
    # implementation for backward compatibility.
    assert issubclass(BallTree, BallTree64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS))
@pytest.mark.parametrize("array_type", ["list", "array"])
@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES)
def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation):
    # Ball-tree queries must agree with brute-force search for boolean and
    # discrete metrics, for both list and array input containers.
    rng = check_random_state(0)
    if metric in BOOLEAN_METRICS:
        # Binary data in {0, 1}.
        X = rng.random_sample((40, 10)).round(0)
        Y = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        # Small-integer data in {0, ..., 4}.
        X = (4 * rng.random_sample((40, 10))).round(0)
        Y = (4 * rng.random_sample((10, 10))).round(0)
    X = _convert_container(X, array_type)
    Y = _convert_container(Y, array_type)

    k = 5

    bt = BallTreeImplementation(X, leaf_size=1, metric=metric)
    dist1, ind1 = bt.query(Y, k)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
    # Only distances are compared; neighbor order may differ between the
    # two methods when distances are tied.
    assert_array_almost_equal(dist1, dist2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5])
)
def test_query_haversine(BallTreeImplementation, decimal_tol):
    # Haversine tree queries must match brute force; the 32-bit tree is
    # checked with a looser decimal tolerance.
    rng = check_random_state(0)
    X = 2 * np.pi * rng.random_sample((40, 2))
    bt = BallTreeImplementation(X, leaf_size=1, metric="haversine")
    dist1, ind1 = bt.query(X, k=5)
    dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine")

    assert_array_almost_equal(dist1, dist2, decimal=decimal_tol)
    assert_array_almost_equal(ind1, ind2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES)
def test_array_object_type(BallTreeImplementation):
    """Check that we do not accept object dtype array."""
    # Ragged rows force an object-dtype array, which must be rejected at
    # tree construction.
    X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
    with pytest.raises(ValueError, match="setting an array element with a sequence"):
        BallTreeImplementation(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES)
def test_bad_pyfunc_metric(BallTreeImplementation):
    # A custom Python metric must accept two vectors and return a float;
    # anything else should raise TypeError at tree construction.
    def wrong_returned_value(x, y):
        return "1"

    def one_arg_func(x):
        return 1.0  # pragma: no cover

    X = np.ones((5, 2))
    msg = "Custom distance function must accept two vectors and return a float."
    with pytest.raises(TypeError, match=msg):
        BallTreeImplementation(X, metric=wrong_returned_value)

    msg = "takes 1 positional argument but 2 were given"
    with pytest.raises(TypeError, match=msg):
        BallTreeImplementation(X, metric=one_arg_func)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS))
def test_ball_tree_numerical_consistency(global_random_seed, metric):
    # Results on float64 and float32 versions of a dataset must be
    # numerically close.
    X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(
        random_seed=global_random_seed, features=50
    )

    metric_params = METRICS.get(metric, {})
    bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params)
    bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params)

    # Test consistency with respect to the `query` method
    k = 5
    dist_64, ind_64 = bt_64.query(Y_64, k=k)
    dist_32, ind_32 = bt_32.query(Y_32, k=k)
    assert_allclose(dist_64, dist_32, rtol=1e-5)
    assert_equal(ind_64, ind_32)
    # Each implementation must preserve its own floating-point precision.
    assert dist_64.dtype == np.float64
    assert dist_32.dtype == np.float32

    # Test consistency with respect to the `query_radius` method
    r = 2.38
    ind_64 = bt_64.query_radius(Y_64, r=r)
    ind_32 = bt_32.query_radius(Y_32, r=r)
    # query_radius returns one (variable-length) index array per query.
    for _ind64, _ind32 in zip(ind_64, ind_32):
        assert_equal(_ind64, _ind32)

    # Test consistency with respect to the `query_radius` method
    # with return distances being true
    ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True)
    ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True)
    for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32):
        assert_equal(_ind64, _ind32)
        assert_allclose(_dist_64, _dist_32, rtol=1e-5)
        assert _dist_64.dtype == np.float64
        assert _dist_32.dtype == np.float32
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS))
def test_kernel_density_numerical_consistency(global_random_seed, metric):
    # Test consistency with respect to the `kernel_density` method
    X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed)

    metric_params = METRICS.get(metric, {})
    bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params)
    bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params)

    kernel = "gaussian"
    h = 0.1
    density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True)
    density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True)
    # float32 densities must be close to float64, and each implementation
    # must preserve its own precision.
    assert_allclose(density64, density32, rtol=1e-5)
    assert density64.dtype == np.float64
    assert density32.dtype == np.float32
|
||||
|
||||
|
||||
def test_two_point_correlation_numerical_consistency(global_random_seed):
    # Test consistency with respect to the `two_point_correlation` method
    X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed)

    bt_64 = BallTree64(X_64, leaf_size=10)
    bt_32 = BallTree32(X_32, leaf_size=10)

    # Correlation counts evaluated at 10 radii in [0, 1].
    r = np.linspace(0, 1, 10)

    counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True)
    counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True)
    # Pair counts from the two precisions must agree.
    assert_allclose(counts_64, counts_32)
|
||||
|
||||
|
||||
def get_dataset_for_binary_tree(random_seed, features=3):
    """Return float64/float32 pairs of a random train/query dataset.

    Draws a (100, features) training array and a (5, features) query array
    from a seeded RNG and returns both in double and single precision.
    """
    generator = np.random.RandomState(random_seed)
    train = generator.rand(100, features)
    query = generator.rand(5, features)

    # `copy=False` avoids a copy when the array is already float64.
    train_64 = train.astype(dtype=np.float64, copy=False)
    query_64 = query.astype(dtype=np.float64, copy=False)
    train_32 = train.astype(dtype=np.float32, copy=False)
    query_32 = query.astype(dtype=np.float32, copy=False)

    return train_64, train_32, query_64, query_32
|
||||
@@ -0,0 +1,101 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.metrics import euclidean_distances
|
||||
from sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer
|
||||
from sklearn.neighbors._base import _is_sorted_by_data
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
|
||||
|
||||
def test_transformer_result():
    # Test the number of neighbors returned
    n_neighbors = 5
    n_samples_fit = 20
    n_queries = 18
    n_features = 10

    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_queries, n_features)
    # Radius chosen so only a small fraction of pairs are within range.
    radius = np.percentile(euclidean_distances(X), 10)

    # with n_neighbors
    for mode in ["distance", "connectivity"]:
        # In 'distance' mode each point also stores itself (one extra
        # entry per row).
        add_one = mode == "distance"
        nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode)
        Xt = nnt.fit_transform(X)
        assert Xt.shape == (n_samples_fit, n_samples_fit)
        assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),)
        assert Xt.format == "csr"
        assert _is_sorted_by_data(Xt)

        X2t = nnt.transform(X2)
        assert X2t.shape == (n_queries, n_samples_fit)
        assert X2t.data.shape == (n_queries * (n_neighbors + add_one),)
        assert X2t.format == "csr"
        assert _is_sorted_by_data(X2t)

    # with radius
    for mode in ["distance", "connectivity"]:
        add_one = mode == "distance"
        nnt = RadiusNeighborsTransformer(radius=radius, mode=mode)
        Xt = nnt.fit_transform(X)
        assert Xt.shape == (n_samples_fit, n_samples_fit)
        # Radius-based graphs have a variable number of entries per row.
        assert not Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),)
        assert Xt.format == "csr"
        assert _is_sorted_by_data(Xt)

        X2t = nnt.transform(X2)
        assert X2t.shape == (n_queries, n_samples_fit)
        assert not X2t.data.shape == (n_queries * (n_neighbors + add_one),)
        assert X2t.format == "csr"
        assert _is_sorted_by_data(X2t)
|
||||
|
||||
|
||||
def _has_explicit_diagonal(X):
|
||||
"""Return True if the diagonal is explicitly stored"""
|
||||
X = X.tocoo()
|
||||
explicit = X.row[X.row == X.col]
|
||||
return len(explicit) == X.shape[0]
|
||||
|
||||
|
||||
def test_explicit_diagonal():
    # Test that the diagonal is explicitly stored in the sparse graph
    n_neighbors = 5
    n_samples_fit, n_samples_transform, n_features = 20, 18, 10
    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_samples_transform, n_features)

    nnt = KNeighborsTransformer(n_neighbors=n_neighbors)
    Xt = nnt.fit_transform(X)
    assert _has_explicit_diagonal(Xt)
    # Each training point is its own nearest neighbor at distance 0.
    assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)

    Xt = nnt.transform(X)
    assert _has_explicit_diagonal(Xt)
    assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)

    # Using transform on new data should not always have zero diagonal
    X2t = nnt.transform(X2)
    assert not _has_explicit_diagonal(X2t)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Klass", [KNeighborsTransformer, RadiusNeighborsTransformer])
def test_graph_feature_names_out(Klass):
    """Check `get_feature_names_out` for transformers defined in `_graph.py`."""

    n_samples_fit = 20
    n_features = 10
    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)

    est = Klass().fit(X)
    names_out = est.get_feature_names_out()

    # Output feature names follow the "<lowercase class name><index>"
    # convention, one per fitted sample.
    class_name_lower = Klass.__name__.lower()
    expected_names_out = np.array(
        [f"{class_name_lower}{i}" for i in range(est.n_samples_fit_)],
        dtype=object,
    )
    assert_array_equal(names_out, expected_names_out)
|
||||
@@ -0,0 +1,103 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_equal
|
||||
|
||||
from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64
|
||||
from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree
|
||||
from sklearn.utils.parallel import Parallel, delayed
|
||||
|
||||
DIMENSION = 3

# Metric name -> extra keyword arguments (only metrics a KD-tree supports).
METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)}

# The 64-bit and 32-bit KD-tree implementations under test.
KD_TREE_CLASSES = [
    KDTree64,
    KDTree32,
]
|
||||
|
||||
|
||||
def test_KDTree_is_KDTree64_subclass():
    # The public KDTree name must remain a subclass of the 64-bit
    # implementation for backward compatibility.
    assert issubclass(KDTree, KDTree64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES)
def test_array_object_type(BinarySearchTree):
    """Check that we do not accept object dtype array."""
    # Ragged rows force an object-dtype array, which must be rejected at
    # tree construction.
    X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
    with pytest.raises(ValueError, match="setting an array element with a sequence"):
        BinarySearchTree(X)
|
||||
|
||||
|
||||
# TODO: remove mark once loky bug is fixed:
# https://github.com/joblib/loky/issues/458
@pytest.mark.thread_unsafe
@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES)
def test_kdtree_picklable_with_joblib(BinarySearchTree):
    """Make sure that KDTree queries work when joblib memmaps.

    Non-regression test for #21685 and #21228."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 3))
    tree = BinarySearchTree(X, leaf_size=2)

    # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that
    # used to raise "ValueError: buffer source array is read-only" in a
    # previous version of the Cython code.
    Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", METRICS)
|
||||
def test_kd_tree_numerical_consistency(global_random_seed, metric):
|
||||
# Results on float64 and float32 versions of a dataset must be
|
||||
# numerically close.
|
||||
X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(
|
||||
random_seed=global_random_seed, features=50
|
||||
)
|
||||
|
||||
metric_params = METRICS.get(metric, {})
|
||||
kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params)
|
||||
kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params)
|
||||
|
||||
# Test consistency with respect to the `query` method
|
||||
k = 4
|
||||
dist_64, ind_64 = kd_64.query(Y_64, k=k)
|
||||
dist_32, ind_32 = kd_32.query(Y_32, k=k)
|
||||
assert_allclose(dist_64, dist_32, rtol=1e-5)
|
||||
assert_equal(ind_64, ind_32)
|
||||
assert dist_64.dtype == np.float64
|
||||
assert dist_32.dtype == np.float32
|
||||
|
||||
# Test consistency with respect to the `query_radius` method
|
||||
r = 2.38
|
||||
ind_64 = kd_64.query_radius(Y_64, r=r)
|
||||
ind_32 = kd_32.query_radius(Y_32, r=r)
|
||||
for _ind64, _ind32 in zip(ind_64, ind_32):
|
||||
assert_equal(_ind64, _ind32)
|
||||
|
||||
# Test consistency with respect to the `query_radius` method
|
||||
# with return distances being true
|
||||
ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True)
|
||||
ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True)
|
||||
for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32):
|
||||
assert_equal(_ind64, _ind32)
|
||||
assert_allclose(_dist_64, _dist_32, rtol=1e-5)
|
||||
assert _dist_64.dtype == np.float64
|
||||
assert _dist_32.dtype == np.float32
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", METRICS)
|
||||
def test_kernel_density_numerical_consistency(global_random_seed, metric):
|
||||
# Test consistency with respect to the `kernel_density` method
|
||||
X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed)
|
||||
|
||||
metric_params = METRICS.get(metric, {})
|
||||
kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params)
|
||||
kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params)
|
||||
|
||||
kernel = "gaussian"
|
||||
h = 0.1
|
||||
density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True)
|
||||
density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True)
|
||||
assert_allclose(density64, density32, rtol=1e-5)
|
||||
assert density64.dtype == np.float64
|
||||
assert density32.dtype == np.float32
|
||||
@@ -0,0 +1,252 @@
|
||||
import joblib
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.neighbors import KDTree, KernelDensity, NearestNeighbors
|
||||
from sklearn.neighbors._ball_tree import kernel_norm
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
|
||||
# XXX Duplicated in test_neighbors_tree, test_kde
|
||||
def compute_kernel_slow(Y, X, kernel, h):
|
||||
if h == "scott":
|
||||
h = X.shape[0] ** (-1 / (X.shape[1] + 4))
|
||||
elif h == "silverman":
|
||||
h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4))
|
||||
|
||||
d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
|
||||
norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0]
|
||||
|
||||
if kernel == "gaussian":
|
||||
return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1)
|
||||
elif kernel == "tophat":
|
||||
return norm * (d < h).sum(-1)
|
||||
elif kernel == "epanechnikov":
|
||||
return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1)
|
||||
elif kernel == "exponential":
|
||||
return norm * (np.exp(-d / h)).sum(-1)
|
||||
elif kernel == "linear":
|
||||
return norm * ((1 - d / h) * (d < h)).sum(-1)
|
||||
elif kernel == "cosine":
|
||||
return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1)
|
||||
else:
|
||||
raise ValueError("kernel not recognized")
|
||||
|
||||
|
||||
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
|
||||
kde = KernelDensity(kernel=kernel, bandwidth=bandwidth, atol=atol, rtol=rtol)
|
||||
log_dens = kde.fit(X).score_samples(Y)
|
||||
assert_allclose(np.exp(log_dens), dens_true, atol=atol, rtol=max(1e-7, rtol))
|
||||
assert_allclose(
|
||||
np.exp(kde.score(Y)), np.prod(dens_true), atol=atol, rtol=max(1e-7, rtol)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"]
|
||||
)
|
||||
@pytest.mark.parametrize("bandwidth", [0.01, 0.1, 1, "scott", "silverman"])
|
||||
def test_kernel_density(kernel, bandwidth):
|
||||
n_samples, n_features = (100, 3)
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(n_samples, n_features)
|
||||
Y = rng.randn(n_samples, n_features)
|
||||
|
||||
dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)
|
||||
|
||||
for rtol in [0, 1e-5]:
|
||||
for atol in [1e-6, 1e-2]:
|
||||
for breadth_first in (True, False):
|
||||
check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true)
|
||||
|
||||
|
||||
def test_kernel_density_sampling(n_samples=100, n_features=3):
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(n_samples, n_features)
|
||||
|
||||
bandwidth = 0.2
|
||||
|
||||
for kernel in ["gaussian", "tophat"]:
|
||||
# draw a tophat sample
|
||||
kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
|
||||
samp = kde.sample(100)
|
||||
assert X.shape == samp.shape
|
||||
|
||||
# check that samples are in the right range
|
||||
nbrs = NearestNeighbors(n_neighbors=1).fit(X)
|
||||
dist, ind = nbrs.kneighbors(X, return_distance=True)
|
||||
|
||||
if kernel == "tophat":
|
||||
assert np.all(dist < bandwidth)
|
||||
elif kernel == "gaussian":
|
||||
# 5 standard deviations is safe for 100 samples, but there's a
|
||||
# very small chance this test could fail.
|
||||
assert np.all(dist < 5 * bandwidth)
|
||||
|
||||
# check unsupported kernels
|
||||
for kernel in ["epanechnikov", "exponential", "linear", "cosine"]:
|
||||
kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
|
||||
with pytest.raises(NotImplementedError):
|
||||
kde.sample(100)
|
||||
|
||||
# non-regression test: used to return a scalar
|
||||
X = rng.randn(4, 1)
|
||||
kde = KernelDensity(kernel="gaussian").fit(X)
|
||||
assert kde.sample().shape == (1, 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree"])
|
||||
@pytest.mark.parametrize(
|
||||
"metric", ["euclidean", "minkowski", "manhattan", "chebyshev", "haversine"]
|
||||
)
|
||||
def test_kde_algorithm_metric_choice(algorithm, metric):
|
||||
# Smoke test for various metrics and algorithms
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(10, 2) # 2 features required for haversine dist.
|
||||
Y = rng.randn(10, 2)
|
||||
|
||||
kde = KernelDensity(algorithm=algorithm, metric=metric)
|
||||
|
||||
if algorithm == "kd_tree" and metric not in KDTree.valid_metrics:
|
||||
with pytest.raises(ValueError, match="invalid metric"):
|
||||
kde.fit(X)
|
||||
else:
|
||||
kde.fit(X)
|
||||
y_dens = kde.score_samples(Y)
|
||||
assert y_dens.shape == Y.shape[:1]
|
||||
|
||||
|
||||
def test_kde_score(n_samples=100, n_features=3):
|
||||
pass
|
||||
# FIXME
|
||||
# rng = np.random.RandomState(0)
|
||||
# X = rng.random_sample((n_samples, n_features))
|
||||
# Y = rng.random_sample((n_samples, n_features))
|
||||
|
||||
|
||||
def test_kde_sample_weights_error():
|
||||
kde = KernelDensity()
|
||||
with pytest.raises(ValueError):
|
||||
kde.fit(np.random.random((200, 10)), sample_weight=np.random.random((200, 10)))
|
||||
with pytest.raises(ValueError):
|
||||
kde.fit(np.random.random((200, 10)), sample_weight=-np.random.random(200))
|
||||
|
||||
|
||||
def test_kde_pipeline_gridsearch():
|
||||
# test that kde plays nice in pipelines and grid-searches
|
||||
X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]])
|
||||
pipe1 = make_pipeline(
|
||||
StandardScaler(with_mean=False, with_std=False),
|
||||
KernelDensity(kernel="gaussian"),
|
||||
)
|
||||
params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
|
||||
search = GridSearchCV(pipe1, param_grid=params)
|
||||
search.fit(X)
|
||||
assert search.best_params_["kerneldensity__bandwidth"] == 0.1
|
||||
|
||||
|
||||
def test_kde_sample_weights():
|
||||
n_samples = 400
|
||||
size_test = 20
|
||||
weights_neutral = np.full(n_samples, 3.0)
|
||||
for d in [1, 2, 10]:
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.rand(n_samples, d)
|
||||
weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
|
||||
X_repetitions = np.repeat(X, weights, axis=0)
|
||||
n_samples_test = size_test // d
|
||||
test_points = rng.rand(n_samples_test, d)
|
||||
for algorithm in ["auto", "ball_tree", "kd_tree"]:
|
||||
for metric in ["euclidean", "minkowski", "manhattan", "chebyshev"]:
|
||||
if algorithm != "kd_tree" or metric in KDTree.valid_metrics:
|
||||
kde = KernelDensity(algorithm=algorithm, metric=metric)
|
||||
|
||||
# Test that adding a constant sample weight has no effect
|
||||
kde.fit(X, sample_weight=weights_neutral)
|
||||
scores_const_weight = kde.score_samples(test_points)
|
||||
sample_const_weight = kde.sample(random_state=1234)
|
||||
kde.fit(X)
|
||||
scores_no_weight = kde.score_samples(test_points)
|
||||
sample_no_weight = kde.sample(random_state=1234)
|
||||
assert_allclose(scores_const_weight, scores_no_weight)
|
||||
assert_allclose(sample_const_weight, sample_no_weight)
|
||||
|
||||
# Test equivalence between sampling and (integer) weights
|
||||
kde.fit(X, sample_weight=weights)
|
||||
scores_weight = kde.score_samples(test_points)
|
||||
sample_weight = kde.sample(random_state=1234)
|
||||
kde.fit(X_repetitions)
|
||||
scores_ref_sampling = kde.score_samples(test_points)
|
||||
sample_ref_sampling = kde.sample(random_state=1234)
|
||||
assert_allclose(scores_weight, scores_ref_sampling)
|
||||
assert_allclose(sample_weight, sample_ref_sampling)
|
||||
|
||||
# Test that sample weights has a non-trivial effect
|
||||
diff = np.max(np.abs(scores_no_weight - scores_weight))
|
||||
assert diff > 0.001
|
||||
|
||||
# Test invariance with respect to arbitrary scaling
|
||||
scale_factor = rng.rand()
|
||||
kde.fit(X, sample_weight=(scale_factor * weights))
|
||||
scores_scaled_weight = kde.score_samples(test_points)
|
||||
assert_allclose(scores_scaled_weight, scores_weight)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sample_weight", [None, [0.1, 0.2, 0.3]])
|
||||
def test_pickling(tmpdir, sample_weight):
|
||||
# Make sure that predictions are the same before and after pickling. Used
|
||||
# to be a bug because sample_weights wasn't pickled and the resulting tree
|
||||
# would miss some info.
|
||||
|
||||
kde = KernelDensity()
|
||||
data = np.reshape([1.0, 2.0, 3.0], (-1, 1))
|
||||
kde.fit(data, sample_weight=sample_weight)
|
||||
|
||||
X = np.reshape([1.1, 2.1], (-1, 1))
|
||||
scores = kde.score_samples(X)
|
||||
|
||||
file_path = str(tmpdir.join("dump.pkl"))
|
||||
joblib.dump(kde, file_path)
|
||||
kde = joblib.load(file_path)
|
||||
scores_pickled = kde.score_samples(X)
|
||||
|
||||
assert_allclose(scores, scores_pickled)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["score_samples", "sample"])
|
||||
def test_check_is_fitted(method):
|
||||
# Check that predict raises an exception in an unfitted estimator.
|
||||
# Unfitted estimators should raise a NotFittedError.
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(10, 2)
|
||||
kde = KernelDensity()
|
||||
|
||||
with pytest.raises(NotFittedError):
|
||||
getattr(kde, method)(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bandwidth", ["scott", "silverman", 0.1])
|
||||
def test_bandwidth(bandwidth):
|
||||
n_samples, n_features = (100, 3)
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(n_samples, n_features)
|
||||
kde = KernelDensity(bandwidth=bandwidth).fit(X)
|
||||
samp = kde.sample(100)
|
||||
kde_sc = kde.score_samples(X)
|
||||
assert X.shape == samp.shape
|
||||
assert kde_sc.shape == (n_samples,)
|
||||
|
||||
# Test that the attribute self.bandwidth_ has the expected value
|
||||
if bandwidth == "scott":
|
||||
h = X.shape[0] ** (-1 / (X.shape[1] + 4))
|
||||
elif bandwidth == "silverman":
|
||||
h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4))
|
||||
else:
|
||||
h = bandwidth
|
||||
assert kde.bandwidth_ == pytest.approx(h)
|
||||
@@ -0,0 +1,394 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import re
|
||||
from math import sqrt
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import metrics, neighbors
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.metrics import roc_auc_score
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import assert_allclose, assert_array_equal
|
||||
from sklearn.utils.estimator_checks import (
|
||||
check_outlier_corruption,
|
||||
parametrize_with_checks,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
# load the iris dataset
|
||||
# and randomly permute it
|
||||
rng = check_random_state(0)
|
||||
iris = load_iris()
|
||||
perm = rng.permutation(iris.target.size)
|
||||
iris.data = iris.data[perm]
|
||||
iris.target = iris.target[perm]
|
||||
|
||||
|
||||
def test_lof(global_dtype):
|
||||
# Toy sample (the last two samples are outliers):
|
||||
X = np.asarray(
|
||||
[[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]],
|
||||
dtype=global_dtype,
|
||||
)
|
||||
|
||||
# Test LocalOutlierFactor:
|
||||
clf = neighbors.LocalOutlierFactor(n_neighbors=5)
|
||||
score = clf.fit(X).negative_outlier_factor_
|
||||
assert_array_equal(clf._fit_X, X)
|
||||
|
||||
# Assert largest outlier score is smaller than smallest inlier score:
|
||||
assert np.min(score[:-2]) > np.max(score[-2:])
|
||||
|
||||
# Assert predict() works:
|
||||
clf = neighbors.LocalOutlierFactor(contamination=0.25, n_neighbors=5).fit(X)
|
||||
expected_predictions = 6 * [1] + 2 * [-1]
|
||||
assert_array_equal(clf._predict(), expected_predictions)
|
||||
assert_array_equal(clf.fit_predict(X), expected_predictions)
|
||||
|
||||
|
||||
def test_lof_performance(global_dtype):
|
||||
# Generate train/test data
|
||||
rng = check_random_state(2)
|
||||
X = 0.3 * rng.randn(120, 2).astype(global_dtype, copy=False)
|
||||
X_train = X[:100]
|
||||
|
||||
# Generate some abnormal novel observations
|
||||
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)).astype(
|
||||
global_dtype, copy=False
|
||||
)
|
||||
X_test = np.r_[X[100:], X_outliers]
|
||||
y_test = np.array([0] * 20 + [1] * 20)
|
||||
|
||||
# fit the model for novelty detection
|
||||
clf = neighbors.LocalOutlierFactor(novelty=True).fit(X_train)
|
||||
|
||||
# predict scores (the lower, the more normal)
|
||||
y_pred = -clf.decision_function(X_test)
|
||||
|
||||
# check that roc_auc is good
|
||||
assert roc_auc_score(y_test, y_pred) > 0.99
|
||||
|
||||
|
||||
def test_lof_values(global_dtype):
|
||||
# toy samples:
|
||||
X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype)
|
||||
clf1 = neighbors.LocalOutlierFactor(
|
||||
n_neighbors=2, contamination=0.1, novelty=True
|
||||
).fit(X_train)
|
||||
clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train)
|
||||
s_0 = 2.0 * sqrt(2.0) / (1.0 + sqrt(2.0))
|
||||
s_1 = (1.0 + sqrt(2)) * (1.0 / (4.0 * sqrt(2.0)) + 1.0 / (2.0 + 2.0 * sqrt(2)))
|
||||
# check predict()
|
||||
assert_allclose(-clf1.negative_outlier_factor_, [s_0, s_1, s_1])
|
||||
assert_allclose(-clf2.negative_outlier_factor_, [s_0, s_1, s_1])
|
||||
# check predict(one sample not in train)
|
||||
assert_allclose(-clf1.score_samples([[2.0, 2.0]]), [s_0])
|
||||
assert_allclose(-clf2.score_samples([[2.0, 2.0]]), [s_0])
|
||||
# check predict(one sample already in train)
|
||||
assert_allclose(-clf1.score_samples([[1.0, 1.0]]), [s_1])
|
||||
assert_allclose(-clf2.score_samples([[1.0, 1.0]]), [s_1])
|
||||
|
||||
|
||||
def test_lof_precomputed(global_dtype, random_state=42):
|
||||
"""Tests LOF with a distance matrix."""
|
||||
# Note: smaller samples may result in spurious test success
|
||||
rng = np.random.RandomState(random_state)
|
||||
X = rng.random_sample((10, 4)).astype(global_dtype, copy=False)
|
||||
Y = rng.random_sample((3, 4)).astype(global_dtype, copy=False)
|
||||
DXX = metrics.pairwise_distances(X, metric="euclidean")
|
||||
DYX = metrics.pairwise_distances(Y, X, metric="euclidean")
|
||||
# As a feature matrix (n_samples by n_features)
|
||||
lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
|
||||
lof_X.fit(X)
|
||||
pred_X_X = lof_X._predict()
|
||||
pred_X_Y = lof_X.predict(Y)
|
||||
|
||||
# As a dense distance matrix (n_samples by n_samples)
|
||||
lof_D = neighbors.LocalOutlierFactor(
|
||||
n_neighbors=3, algorithm="brute", metric="precomputed", novelty=True
|
||||
)
|
||||
lof_D.fit(DXX)
|
||||
pred_D_X = lof_D._predict()
|
||||
pred_D_Y = lof_D.predict(DYX)
|
||||
|
||||
assert_allclose(pred_X_X, pred_D_X)
|
||||
assert_allclose(pred_X_Y, pred_D_Y)
|
||||
|
||||
|
||||
def test_n_neighbors_attribute():
|
||||
X = iris.data
|
||||
clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X)
|
||||
assert clf.n_neighbors_ == X.shape[0] - 1
|
||||
|
||||
clf = neighbors.LocalOutlierFactor(n_neighbors=500)
|
||||
msg = "n_neighbors will be set to (n_samples - 1)"
|
||||
with pytest.warns(UserWarning, match=re.escape(msg)):
|
||||
clf.fit(X)
|
||||
assert clf.n_neighbors_ == X.shape[0] - 1
|
||||
|
||||
|
||||
def test_score_samples(global_dtype):
|
||||
X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype)
|
||||
X_test = np.asarray([[2.0, 2.0]], dtype=global_dtype)
|
||||
clf1 = neighbors.LocalOutlierFactor(
|
||||
n_neighbors=2, contamination=0.1, novelty=True
|
||||
).fit(X_train)
|
||||
clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train)
|
||||
|
||||
clf1_scores = clf1.score_samples(X_test)
|
||||
clf1_decisions = clf1.decision_function(X_test)
|
||||
|
||||
clf2_scores = clf2.score_samples(X_test)
|
||||
clf2_decisions = clf2.decision_function(X_test)
|
||||
|
||||
assert_allclose(
|
||||
clf1_scores,
|
||||
clf1_decisions + clf1.offset_,
|
||||
)
|
||||
assert_allclose(
|
||||
clf2_scores,
|
||||
clf2_decisions + clf2.offset_,
|
||||
)
|
||||
assert_allclose(clf1_scores, clf2_scores)
|
||||
|
||||
|
||||
def test_novelty_errors():
|
||||
X = iris.data
|
||||
|
||||
# check errors for novelty=False
|
||||
clf = neighbors.LocalOutlierFactor()
|
||||
clf.fit(X)
|
||||
# predict, decision_function and score_samples raise ValueError
|
||||
for method in ["predict", "decision_function", "score_samples"]:
|
||||
outer_msg = f"'LocalOutlierFactor' has no attribute '{method}'"
|
||||
inner_msg = "{} is not available when novelty=False".format(method)
|
||||
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
|
||||
getattr(clf, method)
|
||||
|
||||
assert isinstance(exec_info.value.__cause__, AttributeError)
|
||||
assert inner_msg in str(exec_info.value.__cause__)
|
||||
|
||||
# check errors for novelty=True
|
||||
clf = neighbors.LocalOutlierFactor(novelty=True)
|
||||
|
||||
outer_msg = "'LocalOutlierFactor' has no attribute 'fit_predict'"
|
||||
inner_msg = "fit_predict is not available when novelty=True"
|
||||
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
|
||||
getattr(clf, "fit_predict")
|
||||
|
||||
assert isinstance(exec_info.value.__cause__, AttributeError)
|
||||
assert inner_msg in str(exec_info.value.__cause__)
|
||||
|
||||
|
||||
def test_novelty_training_scores(global_dtype):
|
||||
# check that the scores of the training samples are still accessible
|
||||
# when novelty=True through the negative_outlier_factor_ attribute
|
||||
X = iris.data.astype(global_dtype)
|
||||
|
||||
# fit with novelty=False
|
||||
clf_1 = neighbors.LocalOutlierFactor()
|
||||
clf_1.fit(X)
|
||||
scores_1 = clf_1.negative_outlier_factor_
|
||||
|
||||
# fit with novelty=True
|
||||
clf_2 = neighbors.LocalOutlierFactor(novelty=True)
|
||||
clf_2.fit(X)
|
||||
scores_2 = clf_2.negative_outlier_factor_
|
||||
|
||||
assert_allclose(scores_1, scores_2)
|
||||
|
||||
|
||||
def test_hasattr_prediction():
|
||||
# check availability of prediction methods depending on novelty value.
|
||||
X = [[1, 1], [1, 2], [2, 1]]
|
||||
|
||||
# when novelty=True
|
||||
clf = neighbors.LocalOutlierFactor(novelty=True)
|
||||
clf.fit(X)
|
||||
assert hasattr(clf, "predict")
|
||||
assert hasattr(clf, "decision_function")
|
||||
assert hasattr(clf, "score_samples")
|
||||
assert not hasattr(clf, "fit_predict")
|
||||
|
||||
# when novelty=False
|
||||
clf = neighbors.LocalOutlierFactor(novelty=False)
|
||||
clf.fit(X)
|
||||
assert hasattr(clf, "fit_predict")
|
||||
assert not hasattr(clf, "predict")
|
||||
assert not hasattr(clf, "decision_function")
|
||||
assert not hasattr(clf, "score_samples")
|
||||
|
||||
|
||||
@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)])
|
||||
def test_novelty_true_common_tests(estimator, check):
|
||||
# the common tests are run for the default LOF (novelty=False).
|
||||
# here we run these common tests for LOF when novelty=True
|
||||
check(estimator)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("expected_outliers", [30, 53])
|
||||
def test_predicted_outlier_number(expected_outliers):
|
||||
# the number of predicted outliers should be equal to the number of
|
||||
# expected outliers unless there are ties in the abnormality scores.
|
||||
X = iris.data
|
||||
n_samples = X.shape[0]
|
||||
contamination = float(expected_outliers) / n_samples
|
||||
|
||||
clf = neighbors.LocalOutlierFactor(contamination=contamination)
|
||||
y_pred = clf.fit_predict(X)
|
||||
|
||||
num_outliers = np.sum(y_pred != 1)
|
||||
if num_outliers != expected_outliers:
|
||||
y_dec = clf.negative_outlier_factor_
|
||||
check_outlier_corruption(num_outliers, expected_outliers, y_dec)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_sparse(csr_container):
|
||||
# LocalOutlierFactor must support CSR inputs
|
||||
# TODO: compare results on dense and sparse data as proposed in:
|
||||
# https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
|
||||
X = csr_container(iris.data)
|
||||
|
||||
lof = neighbors.LocalOutlierFactor(novelty=True)
|
||||
lof.fit(X)
|
||||
lof.predict(X)
|
||||
lof.score_samples(X)
|
||||
lof.decision_function(X)
|
||||
|
||||
lof = neighbors.LocalOutlierFactor(novelty=False)
|
||||
lof.fit_predict(X)
|
||||
|
||||
|
||||
def test_lof_error_n_neighbors_too_large():
|
||||
"""Check that we raise a proper error message when n_neighbors == n_samples.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/17207
|
||||
"""
|
||||
X = np.ones((7, 7))
|
||||
|
||||
msg = (
|
||||
"Expected n_neighbors < n_samples_fit, but n_neighbors = 1, "
|
||||
"n_samples_fit = 1, n_samples = 1"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
lof = neighbors.LocalOutlierFactor(n_neighbors=1).fit(X[:1])
|
||||
|
||||
lof = neighbors.LocalOutlierFactor(n_neighbors=2).fit(X[:2])
|
||||
assert lof.n_samples_fit_ == 2
|
||||
|
||||
msg = (
|
||||
"Expected n_neighbors < n_samples_fit, but n_neighbors = 2, "
|
||||
"n_samples_fit = 2, n_samples = 2"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
lof.kneighbors(None, n_neighbors=2)
|
||||
|
||||
distances, indices = lof.kneighbors(None, n_neighbors=1)
|
||||
assert distances.shape == (2, 1)
|
||||
assert indices.shape == (2, 1)
|
||||
|
||||
msg = (
|
||||
"Expected n_neighbors <= n_samples_fit, but n_neighbors = 3, "
|
||||
"n_samples_fit = 2, n_samples = 7"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
lof.kneighbors(X, n_neighbors=3)
|
||||
|
||||
(
|
||||
distances,
|
||||
indices,
|
||||
) = lof.kneighbors(X, n_neighbors=2)
|
||||
assert distances.shape == (7, 2)
|
||||
assert indices.shape == (7, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"])
|
||||
@pytest.mark.parametrize("novelty", [True, False])
|
||||
@pytest.mark.parametrize("contamination", [0.5, "auto"])
|
||||
def test_lof_input_dtype_preservation(global_dtype, algorithm, contamination, novelty):
|
||||
"""Check that the fitted attributes are stored using the data type of X."""
|
||||
X = iris.data.astype(global_dtype, copy=False)
|
||||
|
||||
iso = neighbors.LocalOutlierFactor(
|
||||
n_neighbors=5, algorithm=algorithm, contamination=contamination, novelty=novelty
|
||||
)
|
||||
iso.fit(X)
|
||||
|
||||
assert iso.negative_outlier_factor_.dtype == global_dtype
|
||||
|
||||
for method in ("score_samples", "decision_function"):
|
||||
if hasattr(iso, method):
|
||||
y_pred = getattr(iso, method)(X)
|
||||
assert y_pred.dtype == global_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"])
|
||||
@pytest.mark.parametrize("novelty", [True, False])
|
||||
@pytest.mark.parametrize("contamination", [0.5, "auto"])
|
||||
def test_lof_dtype_equivalence(algorithm, novelty, contamination):
|
||||
"""Check the equivalence of the results with 32 and 64 bits input."""
|
||||
|
||||
inliers = iris.data[:50] # setosa iris are really distinct from others
|
||||
outliers = iris.data[-5:] # virginica will be considered as outliers
|
||||
# lower the precision of the input data to check that we have an equivalence when
|
||||
# making the computation in 32 and 64 bits.
|
||||
X = np.concatenate([inliers, outliers], axis=0).astype(np.float32)
|
||||
|
||||
lof_32 = neighbors.LocalOutlierFactor(
|
||||
algorithm=algorithm, novelty=novelty, contamination=contamination
|
||||
)
|
||||
X_32 = X.astype(np.float32, copy=True)
|
||||
lof_32.fit(X_32)
|
||||
|
||||
lof_64 = neighbors.LocalOutlierFactor(
|
||||
algorithm=algorithm, novelty=novelty, contamination=contamination
|
||||
)
|
||||
X_64 = X.astype(np.float64, copy=True)
|
||||
lof_64.fit(X_64)
|
||||
|
||||
assert_allclose(lof_32.negative_outlier_factor_, lof_64.negative_outlier_factor_)
|
||||
|
||||
for method in ("score_samples", "decision_function", "predict", "fit_predict"):
|
||||
if hasattr(lof_32, method):
|
||||
y_pred_32 = getattr(lof_32, method)(X_32)
|
||||
y_pred_64 = getattr(lof_64, method)(X_64)
|
||||
assert_allclose(y_pred_32, y_pred_64, atol=0.0002)
|
||||
|
||||
|
||||
def test_lof_duplicate_samples():
|
||||
"""
|
||||
Check that LocalOutlierFactor raises a warning when duplicate values
|
||||
in the training data cause inaccurate results.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/27839
|
||||
"""
|
||||
|
||||
rng = np.random.default_rng(0)
|
||||
|
||||
x = rng.permutation(
|
||||
np.hstack(
|
||||
[
|
||||
[0.1] * 1000, # constant values
|
||||
np.linspace(0.1, 0.3, num=3000),
|
||||
rng.random(500) * 100, # the clear outliers
|
||||
]
|
||||
)
|
||||
)
|
||||
X = x.reshape(-1, 1)
|
||||
|
||||
error_msg = (
|
||||
"Duplicate values are leading to incorrect results. "
|
||||
"Increase the number of neighbors for more accurate results."
|
||||
)
|
||||
|
||||
lof = neighbors.LocalOutlierFactor(n_neighbors=5, contamination=0.1)
|
||||
|
||||
# Catch the warning
|
||||
with pytest.warns(UserWarning, match=re.escape(error_msg)):
|
||||
lof.fit_predict(X)
|
||||
@@ -0,0 +1,563 @@
|
||||
"""
|
||||
Testing for Neighborhood Component Analysis module (sklearn.neighbors.nca)
|
||||
"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_almost_equal, assert_array_equal
|
||||
from scipy.optimize import check_grad
|
||||
|
||||
from sklearn import clone
|
||||
from sklearn.datasets import load_iris, make_blobs, make_classification
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.metrics import pairwise_distances
|
||||
from sklearn.neighbors import NeighborhoodComponentsAnalysis
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils.validation import validate_data
|
||||
|
||||
rng = check_random_state(0)
|
||||
# Load and shuffle the iris dataset.
|
||||
iris = load_iris()
|
||||
perm = rng.permutation(iris.target.size)
|
||||
iris_data = iris.data[perm]
|
||||
iris_target = iris.target[perm]
|
||||
# Avoid having test data introducing dependencies between tests.
|
||||
iris_data.flags.writeable = False
|
||||
iris_target.flags.writeable = False
|
||||
EPS = np.finfo(float).eps
|
||||
|
||||
|
||||
def test_simple_example():
|
||||
"""Test on a simple example.
|
||||
|
||||
Puts four points in the input space where the opposite labels points are
|
||||
next to each other. After transform the samples from the same class
|
||||
should be next to each other.
|
||||
|
||||
"""
|
||||
X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
|
||||
y = np.array([1, 0, 1, 0])
|
||||
nca = NeighborhoodComponentsAnalysis(
|
||||
n_components=2, init="identity", random_state=42
|
||||
)
|
||||
nca.fit(X, y)
|
||||
X_t = nca.transform(X)
|
||||
assert_array_equal(pairwise_distances(X_t).argsort()[:, 1], np.array([2, 3, 0, 1]))
|
||||
|
||||
|
||||
def test_toy_example_collapse_points():
    """Test on a toy example of three points that should collapse

    We build a simple example: two points from the same class and a point from
    a different class in the middle of them. On this simple example, the new
    (transformed) points should all collapse into one single point. Indeed, the
    objective is 2/(1 + exp(d/2)), with d the euclidean distance between the
    two samples from the same class. This is maximized for d=0 (because d>=0),
    with an objective equal to 1 (loss=-1.).

    """
    rng = np.random.RandomState(42)
    input_dim = 5
    two_points = rng.randn(2, input_dim)
    # The third sample sits exactly between the two same-class samples.
    X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]])
    y = [0, 0, 1]

    class LossStorer:
        """Record the last loss value seen by the optimizer."""

        def __init__(self, X, y):
            self.loss = np.inf  # initialize the loss to very high
            # Initialize a fake NCA and variables needed to compute the loss:
            self.fake_nca = NeighborhoodComponentsAnalysis()
            self.fake_nca.n_iter_ = np.inf
            self.X, y = validate_data(self.fake_nca, X, y, ensure_min_samples=2)
            y = LabelEncoder().fit_transform(y)
            self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]

        def callback(self, transformation, n_iter):
            """Stores the last value of the loss function"""
            self.loss, _ = self.fake_nca._loss_grad_lbfgs(
                transformation, self.X, self.same_class_mask, -1.0
            )

    loss_storer = LossStorer(X, y)
    nca = NeighborhoodComponentsAnalysis(random_state=42, callback=loss_storer.callback)
    X_t = nca.fit_transform(X, y)
    # Fix: removed a leftover debug `print(X_t)` that polluted test output.
    # test that points are collapsed into one point
    assert_array_almost_equal(X_t - X_t[0], 0.0)
    # The loss converges to the theoretical optimum of -1.
    assert abs(loss_storer.loss + 1) < 1e-10
|
||||
|
||||
|
||||
def test_finite_differences(global_random_seed):
    """Test gradient of loss function

    Assert that the gradient is almost equal to its finite differences
    approximation.
    """
    # Initialize the transformation `M`, as well as `X` and `y` and `NCA`
    rng = np.random.RandomState(global_random_seed)
    X, y = make_classification(random_state=global_random_seed)
    # Random linear map with output dimension between 1 and n_features.
    M = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1])
    nca = NeighborhoodComponentsAnalysis()
    nca.n_iter_ = 0
    # Boolean (n_samples, n_samples) mask of same-class pairs.
    mask = y[:, np.newaxis] == y[np.newaxis, :]

    def fun(M):
        # Loss value only.
        return nca._loss_grad_lbfgs(M, X, mask)[0]

    def grad(M):
        # Analytic gradient only.
        return nca._loss_grad_lbfgs(M, X, mask)[1]

    # compare the gradient to a finite difference approximation
    diff = check_grad(fun, grad, M.ravel())
    assert diff == pytest.approx(0.0, abs=1e-4)
|
||||
|
||||
|
||||
def test_params_validation():
    # Test that invalid parameters raise value error
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]
    NCA = NeighborhoodComponentsAnalysis
    rng = np.random.RandomState(42)

    # `init` with more rows (output dim) than columns (input dim) is invalid.
    init = rng.rand(5, 3)
    msg = (
        f"The output dimensionality ({init.shape[0]}) "
        "of the given linear transformation `init` cannot be "
        f"greater than its input dimensionality ({init.shape[1]})."
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        NCA(init=init).fit(X, y)
    # `n_components` larger than the number of features is invalid.
    n_components = 10
    msg = (
        "The preferred dimensionality of the projected space "
        f"`n_components` ({n_components}) cannot be greater "
        f"than the given data dimensionality ({X.shape[1]})!"
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        NCA(n_components=n_components).fit(X, y)
|
||||
|
||||
|
||||
def test_transformation_dimensions():
    """Validate the shape checks applied to a precomputed `init` matrix."""
    data = np.arange(12).reshape(4, 3)
    labels = [1, 1, 2, 2]

    # The transformation's input dimension must match the data's feature
    # count: a 2x2 matrix cannot be applied to 3-feature inputs.
    with pytest.raises(ValueError):
        NeighborhoodComponentsAnalysis(init=np.array([[1, 2], [3, 4]])).fit(
            data, labels
        )

    # The output dimension may not exceed the input dimension
    # (here 3 rows > 2 columns).
    with pytest.raises(ValueError):
        NeighborhoodComponentsAnalysis(
            init=np.array([[1, 2], [3, 4], [5, 6]])
        ).fit(data, labels)

    # A square 3x3 transformation matches the data and is accepted.
    NeighborhoodComponentsAnalysis(init=np.arange(9).reshape(3, 3)).fit(data, labels)
|
||||
|
||||
|
||||
def test_n_components():
    """Check consistency constraints between `n_components` and `init`."""
    rng = np.random.RandomState(42)
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]

    init = rng.rand(X.shape[1] - 1, 3)

    # n_components = X.shape[1] != transformation.shape[0]
    n_components = X.shape[1]
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    msg = (
        "The preferred dimensionality of the projected space "
        f"`n_components` ({n_components}) does not match the output "
        "dimensionality of the given linear transformation "
        f"`init` ({init.shape[0]})!"
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)

    # n_components > X.shape[1]
    n_components = X.shape[1] + 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    msg = (
        "The preferred dimensionality of the projected space "
        f"`n_components` ({n_components}) cannot be greater than "
        f"the given data dimensionality ({X.shape[1]})!"
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)

    # n_components < X.shape[1]
    nca = NeighborhoodComponentsAnalysis(n_components=2, init="identity")
    nca.fit(X, y)
|
||||
|
||||
|
||||
def test_init_transformation():
    """Check every supported `init` option and the precomputed-init checks."""
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)

    # Start learning from scratch
    nca = NeighborhoodComponentsAnalysis(init="identity")
    nca.fit(X, y)

    # Initialize with random
    nca_random = NeighborhoodComponentsAnalysis(init="random")
    nca_random.fit(X, y)

    # Initialize with auto
    nca_auto = NeighborhoodComponentsAnalysis(init="auto")
    nca_auto.fit(X, y)

    # Initialize with PCA
    nca_pca = NeighborhoodComponentsAnalysis(init="pca")
    nca_pca.fit(X, y)

    # Initialize with LDA
    nca_lda = NeighborhoodComponentsAnalysis(init="lda")
    nca_lda.fit(X, y)

    # A valid precomputed square transformation is accepted.
    init = rng.rand(X.shape[1], X.shape[1])
    nca = NeighborhoodComponentsAnalysis(init=init)
    nca.fit(X, y)

    # init.shape[1] must match X.shape[1]
    init = rng.rand(X.shape[1], X.shape[1] + 1)
    nca = NeighborhoodComponentsAnalysis(init=init)
    msg = (
        f"The input dimensionality ({init.shape[1]}) of the given "
        "linear transformation `init` must match the "
        f"dimensionality of the given inputs `X` ({X.shape[1]})."
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)

    # init.shape[0] must be <= init.shape[1]
    init = rng.rand(X.shape[1] + 1, X.shape[1])
    nca = NeighborhoodComponentsAnalysis(init=init)
    msg = (
        f"The output dimensionality ({init.shape[0]}) of the given "
        "linear transformation `init` cannot be "
        f"greater than its input dimensionality ({init.shape[1]})."
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)

    # init.shape[0] must match n_components
    init = rng.rand(X.shape[1], X.shape[1])
    n_components = X.shape[1] - 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    msg = (
        "The preferred dimensionality of the "
        f"projected space `n_components` ({n_components}) "
        "does not match the output dimensionality of the given "
        f"linear transformation `init` ({init.shape[0]})!"
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_samples", [3, 5, 7, 11])
@pytest.mark.parametrize("n_features", [3, 5, 7, 11])
@pytest.mark.parametrize("n_classes", [5, 7, 11])
@pytest.mark.parametrize("n_components", [3, 5, 7, 11])
def test_auto_init(n_samples, n_features, n_classes, n_components):
    """Check that init='auto' resolves to the expected concrete init."""
    # Test that auto choose the init as expected with every configuration
    # of order of n_samples, n_features, n_classes and n_components.
    rng = np.random.RandomState(42)
    nca_base = NeighborhoodComponentsAnalysis(
        init="auto", n_components=n_components, max_iter=1, random_state=rng
    )
    if n_classes >= n_samples:
        pass
        # n_classes > n_samples is impossible, and n_classes == n_samples
        # throws an error from lda but is an absurd case
    else:
        X = rng.randn(n_samples, n_features)
        # Cycle labels through the classes so every class is represented.
        y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
        if n_components > n_features:
            # this would return a ValueError, which is already tested in
            # test_params_validation
            pass
        else:
            nca = clone(nca_base)
            nca.fit(X, y)
            # 'auto' should pick lda, then pca, then identity, in this
            # order of preference depending on the dimensions.
            if n_components <= min(n_classes - 1, n_features):
                nca_other = clone(nca_base).set_params(init="lda")
            elif n_components < min(n_features, n_samples):
                nca_other = clone(nca_base).set_params(init="pca")
            else:
                nca_other = clone(nca_base).set_params(init="identity")
            nca_other.fit(X, y)
            assert_array_almost_equal(nca.components_, nca_other.components_)
|
||||
|
||||
|
||||
def test_warm_start_validation():
    """Check that warm-starting on data with a different number of features
    raises an informative error."""
    X, y = make_classification(
        n_samples=30,
        n_features=5,
        n_classes=4,
        n_redundant=0,
        n_informative=5,
        random_state=0,
    )

    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    # Second dataset has one fewer feature than the fitted transformation.
    X_less_features, y = make_classification(
        n_samples=30,
        n_features=4,
        n_classes=4,
        n_redundant=0,
        n_informative=4,
        random_state=0,
    )
    msg = (
        f"The new inputs dimensionality ({X_less_features.shape[1]}) "
        "does not match the input dimensionality of the previously learned "
        f"transformation ({nca.components_.shape[1]})."
    )
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X_less_features, y)
|
||||
|
||||
|
||||
def test_warm_start_effectiveness():
    """Check that warm_start actually reuses the previous solution."""
    # A 1-iteration second fit on same data should give almost same result
    # with warm starting, and quite different result without warm starting.

    nca_warm = NeighborhoodComponentsAnalysis(warm_start=True, random_state=0)
    nca_warm.fit(iris_data, iris_target)
    transformation_warm = nca_warm.components_
    nca_warm.max_iter = 1
    nca_warm.fit(iris_data, iris_target)
    transformation_warm_plus_one = nca_warm.components_

    nca_cold = NeighborhoodComponentsAnalysis(warm_start=False, random_state=0)
    nca_cold.fit(iris_data, iris_target)
    transformation_cold = nca_cold.components_
    nca_cold.max_iter = 1
    nca_cold.fit(iris_data, iris_target)
    transformation_cold_plus_one = nca_cold.components_

    # L1-style distance between the converged solution and the one-step refit.
    diff_warm = np.sum(np.abs(transformation_warm_plus_one - transformation_warm))
    diff_cold = np.sum(np.abs(transformation_cold_plus_one - transformation_cold))
    assert diff_warm < 3.0, (
        "Transformer changed significantly after one "
        "iteration even though it was warm-started."
    )

    assert diff_cold > diff_warm, (
        "Cold-started transformer changed less "
        "significantly than warm-started "
        "transformer after one iteration."
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "init_name", ["pca", "lda", "identity", "random", "precomputed"]
)
def test_verbose(init_name, capsys):
    """Check the structure of the output printed when verbose=1."""
    # assert there is proper output when verbose = 1, for every initialization
    # except auto because auto will call one of the others
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
    # Matches the timing suffix printed after the init step, e.g. "... done in 0.01s"
    regexp_init = r"... done in \ *\d+\.\d{2}s"
    msgs = {
        "pca": "Finding principal components" + regexp_init,
        "lda": "Finding most discriminative components" + regexp_init,
    }
    if init_name == "precomputed":
        init = rng.randn(X.shape[1], X.shape[1])
    else:
        init = init_name
    nca = NeighborhoodComponentsAnalysis(verbose=1, init=init)
    nca.fit(X, y)
    out, _ = capsys.readouterr()

    # check output
    lines = re.split("\n+", out)
    # if pca or lda init, an additional line is printed, so we test
    # it and remove it to test the rest equally among initializations
    if init_name in ["pca", "lda"]:
        assert re.match(msgs[init_name], lines[0])
        lines = lines[1:]
    assert lines[0] == "[NeighborhoodComponentsAnalysis]"
    header = "{:>10} {:>20} {:>10}".format("Iteration", "Objective Value", "Time(s)")
    assert lines[1] == "[NeighborhoodComponentsAnalysis] {}".format(header)
    assert lines[2] == "[NeighborhoodComponentsAnalysis] {}".format("-" * len(header))
    for line in lines[3:-2]:
        # The following regex will match for instance:
        # '[NeighborhoodComponentsAnalysis]  0    6.988936e+01   0.01'
        assert re.match(
            r"\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e"
            r"[+|-]\d+\ *\d+\.\d{2}",
            line,
        )
    # Second-to-last line is the total training time summary.
    assert re.match(
        r"\[NeighborhoodComponentsAnalysis\] Training took\ *\d+\.\d{2}s\.",
        lines[-2],
    )
    assert lines[-1] == ""
|
||||
|
||||
|
||||
def test_no_verbose(capsys):
    """With the default verbose=0, fitting must print nothing."""
    model = NeighborhoodComponentsAnalysis()
    model.fit(iris_data, iris_target)
    captured_out, _ = capsys.readouterr()
    # Nothing may have been written to stdout.
    assert captured_out == ""
|
||||
|
||||
|
||||
def test_singleton_class():
    """Check that fitting works when some (or all) classes are singletons."""
    X = iris_data.copy()
    y = iris_target.copy()

    # one singleton class
    singleton_class = 1
    (ind_singleton,) = np.where(y == singleton_class)
    # Relabel all but one member of class 1 as class 2.
    y[ind_singleton] = 2
    y[ind_singleton[0]] = singleton_class

    nca = NeighborhoodComponentsAnalysis(max_iter=30)
    nca.fit(X, y)

    # One non-singleton class
    (ind_1,) = np.where(y == 1)
    (ind_2,) = np.where(y == 2)
    y[ind_1] = 0
    y[ind_1[0]] = 1
    y[ind_2] = 0
    y[ind_2[0]] = 2

    nca = NeighborhoodComponentsAnalysis(max_iter=30)
    nca.fit(X, y)

    # Only singleton classes
    (ind_0,) = np.where(y == 0)
    (ind_1,) = np.where(y == 1)
    (ind_2,) = np.where(y == 2)
    # Keep exactly one sample per class.
    X = X[[ind_0[0], ind_1[0], ind_2[0]]]
    y = y[[ind_0[0], ind_1[0], ind_2[0]]]

    nca = NeighborhoodComponentsAnalysis(init="identity", max_iter=30)
    nca.fit(X, y)
    # With only singleton classes the identity init must be left unchanged.
    assert_array_equal(X, nca.transform(X))
|
||||
|
||||
|
||||
def test_one_class():
    """Fitting on a single class leaves the identity init unchanged."""
    mask = iris_target == 0
    X_single, y_single = iris_data[mask], iris_target[mask]

    model = NeighborhoodComponentsAnalysis(
        max_iter=30, n_components=X_single.shape[1], init="identity"
    )
    model.fit(X_single, y_single)
    # With one class there is nothing to optimize: transform stays identity.
    assert_array_equal(X_single, model.transform(X_single))
|
||||
|
||||
|
||||
def test_callback(capsys):
    """Check that a user-supplied callback is invoked during fitting."""
    max_iter = 10

    def my_cb(transformation, n_iter):
        # The callback receives the flattened square transformation.
        assert transformation.shape == (iris_data.shape[1] ** 2,)
        rem_iter = max_iter - n_iter
        print("{} iterations remaining...".format(rem_iter))

    # assert that my_cb is called
    nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb, verbose=1)
    nca.fit(iris_data, iris_target)
    out, _ = capsys.readouterr()

    # check output
    assert "{} iterations remaining...".format(max_iter - 1) in out
|
||||
|
||||
|
||||
def test_expected_transformation_shape():
    """Test that the transformation has the expected shape."""
    X = iris_data
    y = iris_target

    class TransformationStorer:
        def __init__(self, X, y):
            # Initialize a fake NCA and variables needed to call the loss
            # function:
            self.fake_nca = NeighborhoodComponentsAnalysis()
            self.fake_nca.n_iter_ = np.inf
            self.X, y = validate_data(self.fake_nca, X, y, ensure_min_samples=2)
            y = LabelEncoder().fit_transform(y)
            self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]

        def callback(self, transformation, n_iter):
            """Stores the last value of the transformation taken as input by
            the optimizer"""
            self.transformation = transformation

    transformation_storer = TransformationStorer(X, y)
    cb = transformation_storer.callback
    nca = NeighborhoodComponentsAnalysis(max_iter=5, callback=cb)
    nca.fit(X, y)
    # The optimizer works on a flattened n_features x n_features matrix.
    assert transformation_storer.transformation.size == X.shape[1] ** 2
|
||||
|
||||
|
||||
def test_convergence_warning():
    """A too-small max_iter must trigger a ConvergenceWarning."""
    model = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1)
    expected = "[{}] NCA did not converge".format(model.__class__.__name__)
    with pytest.warns(ConvergenceWarning, match=re.escape(expected)):
        model.fit(iris_data, iris_target)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "param, value",
    [
        ("n_components", np.int32(3)),
        ("max_iter", np.int32(100)),
        ("tol", np.float32(0.0001)),
    ],
)
def test_parameters_valid_types(param, value):
    """NumPy scalar types must be accepted for integer/float parameters."""
    model = NeighborhoodComponentsAnalysis(**{param: value})
    # Fitting must not raise when parameters are np.int32 / np.float32.
    model.fit(iris_data, iris_target)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_components", [None, 2])
def test_nca_feature_names_out(n_components):
    """Check `get_feature_names_out` for `NeighborhoodComponentsAnalysis`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28293
    """
    est = NeighborhoodComponentsAnalysis(n_components=n_components).fit(
        iris_data, iris_target
    )
    prefix = est.__class__.__name__.lower()

    # Without an explicit n_components, the output keeps the input width.
    n_out = iris_data.shape[1] if n_components is None else n_components
    expected = np.array([f"{prefix}{i}" for i in range(n_out)], dtype=object)

    assert_array_equal(est.get_feature_names_out(), expected)
|
||||
@@ -0,0 +1,237 @@
|
||||
"""
|
||||
Testing for the nearest centroid module.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn.neighbors import NearestCentroid
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
true_result = [-1, 1, 1]
true_result_prior1 = [-1, 1, 1]

# Expected non-thresholded outputs for the toy sample above.
true_discriminant_scores = [-32, 64, 80]
true_proba = [[1, 1.26642e-14], [1.60381e-28, 1], [1.80485e-35, 1]]


# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
rng = np.random.RandomState(1)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_classification_toy(csr_container):
    # Check classification on a toy dataset, including sparse versions.
    X_csr = csr_container(X)
    T_csr = csr_container(T)

    # Check classification on a toy dataset, including sparse versions.
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores)
    assert_array_almost_equal(clf.predict_proba(T), true_proba)

    # Test uniform priors
    clf = NearestCentroid(priors="uniform")
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores)
    assert_array_almost_equal(clf.predict_proba(T), true_proba)

    # Empirical priors: the toy classes are balanced, so results are the same.
    clf = NearestCentroid(priors="empirical")
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores)
    assert_array_almost_equal(clf.predict_proba(T), true_proba)

    # Test custom priors
    clf = NearestCentroid(priors=[0.25, 0.75])
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result_prior1)

    # Same test, but with a sparse matrix to fit and test.
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit with sparse, test with non-sparse
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T), true_result)

    # Fit with non-sparse, test with sparse
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit and predict with non-CSR sparse matrices
    clf = NearestCentroid()
    clf.fit(X_csr.tocoo(), y)
    assert_array_equal(clf.predict(T_csr.tolil()), true_result)
|
||||
|
||||
|
||||
def test_iris():
    """NearestCentroid reaches >0.9 accuracy on iris for both metrics."""
    for metric in ("euclidean", "manhattan"):
        model = NearestCentroid(metric=metric).fit(iris.data, iris.target)
        accuracy = np.mean(model.predict(iris.data) == iris.target)
        assert accuracy > 0.9, "Failed with score = " + str(accuracy)
|
||||
|
||||
|
||||
def test_iris_shrinkage():
    """Shrunken centroids still classify iris well for several thresholds."""
    for metric in ("euclidean", "manhattan"):
        for shrink_threshold in [None, 0.1, 0.5]:
            model = NearestCentroid(
                metric=metric, shrink_threshold=shrink_threshold
            ).fit(iris.data, iris.target)
            accuracy = np.mean(model.predict(iris.data) == iris.target)
            assert accuracy > 0.8, "Failed with score = " + str(accuracy)
|
||||
|
||||
|
||||
def test_pickle():
    """Pickling round-trip preserves the fitted classifier and its score."""
    import pickle

    # classification
    obj = NearestCentroid()
    obj.fit(iris.data, iris.target)
    score = obj.score(iris.data, iris.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    # Fix: use isinstance instead of `type(...) == ...` (E721 anti-pattern).
    assert isinstance(obj2, obj.__class__)
    score2 = obj2.score(iris.data, iris.target)
    assert_array_equal(
        score,
        score2,
        "Failed to generate same score after pickling (classification).",
    )
|
||||
|
||||
|
||||
def test_shrinkage_correct():
    # Ensure that the shrinking is correct.
    # The expected result is calculated by R (pamr),
    # which is implemented by the author of the original paper.
    # (One need to modify the code to output the new centroid in pamr.predict)

    X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
    y = np.array([1, 1, 2, 2, 2])
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    # Reference centroids computed with R's pamr package (see note above).
    expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])
    np.testing.assert_array_almost_equal(clf.centroids_, expected_result)
|
||||
|
||||
|
||||
def test_shrinkage_threshold_decoded_y():
    """Check that shrinkage gives identical centroids whatever the label
    encoding (labels {-1, 1} vs {0, 1})."""
    clf = NearestCentroid(shrink_threshold=0.01)
    y_ind = np.asarray(y)
    y_ind[y_ind == -1] = 0
    clf.fit(X, y_ind)
    centroid_encoded = clf.centroids_
    # Refit with the original labels; centroids must not depend on encoding.
    clf.fit(X, y)
    assert_array_equal(centroid_encoded, clf.centroids_)
|
||||
|
||||
|
||||
def test_predict_translated_data():
    # Test that NearestCentroid gives same results on translated data

    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    # noise has shape (50,), so `X + noise` broadcasts it across rows,
    # i.e. a constant per-feature translation of the data.
    noise = rng.rand(50)
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    y_init = clf.predict(X)
    clf = NearestCentroid(shrink_threshold=0.1)
    X_noise = X + noise
    clf.fit(X_noise, y)
    y_translate = clf.predict(X_noise)
    # Predictions must be invariant under the translation.
    assert_array_equal(y_init, y_translate)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_manhattan_metric(csr_container):
    """Manhattan centroids agree between dense and sparse inputs."""
    X_sparse = csr_container(X)

    model = NearestCentroid(metric="manhattan")
    model.fit(X, y)
    dense_centroid = model.centroids_
    # Refitting on the sparse representation must give identical centroids.
    model.fit(X_sparse, y)
    assert_array_equal(model.centroids_, dense_centroid)
    # For this toy data the per-class medians are (-1, -1) and (1, 1).
    assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
|
||||
|
||||
|
||||
def test_features_zero_var():
    """Shrinkage must raise when every feature has zero variance."""
    # Two constant columns: no feature carries any within-class variance.
    data = np.column_stack(
        [np.full(10, -0.13725701), np.full(10, -0.9853293)]
    )
    labels = np.zeros((10))
    labels[0] = 1

    model = NearestCentroid(shrink_threshold=0.1)
    with pytest.raises(ValueError):
        model.fit(data, labels)
|
||||
|
||||
|
||||
def test_negative_priors_error():
    """Check that we raise an error when the user-defined priors are negative."""
    model = NearestCentroid(priors=[-2, 4])
    # A negative entry in `priors` must be rejected at fit time.
    with pytest.raises(ValueError, match="priors must be non-negative"):
        model.fit(X, y)
|
||||
|
||||
|
||||
def test_warn_non_normalized_priors():
    """Check that we raise a warning and normalize the user-defined priors when they
    don't sum to 1.
    """
    priors = [2, 4]
    clf = NearestCentroid(priors=priors)
    with pytest.warns(
        UserWarning,
        match="The priors do not sum to 1. Normalizing such that it sums to one.",
    ):
        clf.fit(X, y)

    # After fitting, the stored priors are normalized to sum to 1.
    assert_allclose(clf.class_prior_, np.asarray(priors) / np.asarray(priors).sum())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "response_method", ["decision_function", "predict_proba", "predict_log_proba"]
)
def test_method_not_available_with_manhattan(response_method):
    """Check that we raise an AttributeError with Manhattan metric when trying
    to call a non-thresholded response method.
    """
    model = NearestCentroid(metric="manhattan")
    model.fit(X, y)
    with pytest.raises(AttributeError):
        method = getattr(model, response_method)
        method(T)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_constructor", [np.array] + CSR_CONTAINERS)
def test_error_zero_variances(array_constructor):
    """Check that we raise an error when the variance for all features is zero."""
    # First column all ones, second column all twos: constant features only.
    data = np.ones((len(y), 2))
    data[:, 1] *= 2
    data = array_constructor(data)

    model = NearestCentroid()
    with pytest.raises(ValueError, match="All features have zero variance"):
        model.fit(data, y)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
This is testing the equivalence between some estimators with internal nearest
|
||||
neighbors computations, and the corresponding pipeline versions with
|
||||
KNeighborsTransformer or RadiusNeighborsTransformer to precompute the
|
||||
neighbors.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.cluster import DBSCAN, SpectralClustering
|
||||
from sklearn.cluster.tests.common import generate_clustered_data
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.manifold import TSNE, Isomap, SpectralEmbedding
|
||||
from sklearn.neighbors import (
|
||||
KNeighborsRegressor,
|
||||
KNeighborsTransformer,
|
||||
LocalOutlierFactor,
|
||||
RadiusNeighborsRegressor,
|
||||
RadiusNeighborsTransformer,
|
||||
)
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
|
||||
def test_spectral_clustering():
    # Test chaining KNeighborsTransformer and SpectralClustering
    n_neighbors = 5
    X, _ = make_blobs(random_state=0)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"),
        SpectralClustering(
            n_neighbors=n_neighbors, affinity="precomputed", random_state=42
        ),
    )
    est_compact = SpectralClustering(
        n_neighbors=n_neighbors, affinity="nearest_neighbors", random_state=42
    )
    labels_compact = est_compact.fit_predict(X)
    labels_chain = est_chain.fit_predict(X)
    # Both routes must yield identical cluster labels.
    assert_array_almost_equal(labels_chain, labels_compact)
|
||||
|
||||
|
||||
def test_spectral_embedding():
    # Test chaining KNeighborsTransformer and SpectralEmbedding
    n_neighbors = 5

    n_samples = 1000
    centers = np.array(
        [
            [0.0, 5.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 4.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 5.0, 1.0],
        ]
    )
    S, true_labels = make_blobs(
        n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
    )

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"),
        SpectralEmbedding(
            n_neighbors=n_neighbors, affinity="precomputed", random_state=42
        ),
    )
    est_compact = SpectralEmbedding(
        n_neighbors=n_neighbors, affinity="nearest_neighbors", random_state=42
    )
    St_compact = est_compact.fit_transform(S)
    St_chain = est_chain.fit_transform(S)
    # Both routes must produce the same embedding.
    assert_array_almost_equal(St_chain, St_compact)
|
||||
|
||||
|
||||
def test_dbscan():
    """Chaining RadiusNeighborsTransformer into DBSCAN matches plain DBSCAN."""
    radius = 0.3
    n_clusters = 3
    data = generate_clustered_data(n_clusters=n_clusters)

    # Precomputed-distance pipeline vs. the direct estimator.
    pipeline = make_pipeline(
        RadiusNeighborsTransformer(radius=radius, mode="distance"),
        DBSCAN(metric="precomputed", eps=radius),
    )
    direct = DBSCAN(eps=radius)

    # Both routes must assign identical cluster labels.
    assert_array_almost_equal(pipeline.fit_predict(data), direct.fit_predict(data))
|
||||
|
||||
|
||||
def test_isomap():
    # Test chaining KNeighborsTransformer and Isomap with
    # neighbors_algorithm='precomputed'
    algorithm = "auto"
    n_neighbors = 10

    X, _ = make_blobs(random_state=0)
    X2, _ = make_blobs(random_state=1)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(
            n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
        ),
        Isomap(n_neighbors=n_neighbors, metric="precomputed"),
    )
    est_compact = Isomap(n_neighbors=n_neighbors, neighbors_algorithm=algorithm)

    Xt_chain = est_chain.fit_transform(X)
    Xt_compact = est_compact.fit_transform(X)
    assert_array_almost_equal(Xt_chain, Xt_compact)

    # Also check out-of-sample transform on new data.
    Xt_chain = est_chain.transform(X2)
    Xt_compact = est_compact.transform(X2)
    assert_array_almost_equal(Xt_chain, Xt_compact)
|
||||
|
||||
|
||||
def test_tsne():
    # Test chaining KNeighborsTransformer and TSNE
    max_iter = 250
    perplexity = 5
    # TSNE with barnes_hut uses ~3 * perplexity neighbors internally.
    n_neighbors = int(3.0 * perplexity + 1)

    rng = np.random.RandomState(0)
    X = rng.randn(20, 2)

    for metric in ["minkowski", "sqeuclidean"]:
        # compare the chained version and the compact version
        est_chain = make_pipeline(
            KNeighborsTransformer(
                n_neighbors=n_neighbors, mode="distance", metric=metric
            ),
            TSNE(
                init="random",
                metric="precomputed",
                perplexity=perplexity,
                method="barnes_hut",
                random_state=42,
                max_iter=max_iter,
            ),
        )
        est_compact = TSNE(
            init="random",
            metric=metric,
            perplexity=perplexity,
            max_iter=max_iter,
            method="barnes_hut",
            random_state=42,
        )

        Xt_chain = est_chain.fit_transform(X)
        Xt_compact = est_compact.fit_transform(X)
        # Both routes must produce the same embedding.
        assert_array_almost_equal(Xt_chain, Xt_compact)
|
||||
|
||||
|
||||
def test_lof_novelty_false():
    """Chaining KNeighborsTransformer with LocalOutlierFactor(novelty=False)
    must match a stand-alone LocalOutlierFactor on fit_predict."""
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X = rng.randn(40, 2)

    # compare the chained version and the compact version
    chained = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"),
        LocalOutlierFactor(
            metric="precomputed",
            n_neighbors=n_neighbors,
            novelty=False,
            contamination="auto",
        ),
    )
    compact = LocalOutlierFactor(
        n_neighbors=n_neighbors, novelty=False, contamination="auto"
    )

    # Inlier/outlier labels must agree between the two versions.
    assert_array_almost_equal(chained.fit_predict(X), compact.fit_predict(X))
def test_lof_novelty_true():
    """Chaining KNeighborsTransformer with LocalOutlierFactor(novelty=True)
    must match a stand-alone LocalOutlierFactor on fit/predict."""
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X1 = rng.randn(40, 2)
    X2 = rng.randn(40, 2)

    # compare the chained version and the compact version
    chained = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"),
        LocalOutlierFactor(
            metric="precomputed",
            n_neighbors=n_neighbors,
            novelty=True,
            contamination="auto",
        ),
    )
    compact = LocalOutlierFactor(
        n_neighbors=n_neighbors, novelty=True, contamination="auto"
    )

    # novelty=True: fit on one sample, predict labels on unseen data.
    assert_array_almost_equal(
        chained.fit(X1).predict(X2), compact.fit(X1).predict(X2)
    )
def test_kneighbors_regressor():
    """Chaining a neighbors transformer with a neighbors regressor must match
    the compact (non-precomputed) regressor, for all four transformer/regressor
    pairings (k-neighbors and radius-neighbors on each side)."""
    # Test chaining KNeighborsTransformer and classifiers/regressors
    rng = np.random.RandomState(0)
    X = 2 * rng.rand(40, 5) - 1
    X2 = 2 * rng.rand(40, 5) - 1
    y = rng.rand(40, 1)

    n_neighbors = 12
    radius = 1.5
    # We precompute more neighbors than necessary, to have equivalence between
    # k-neighbors estimator after radius-neighbors transformer, and vice-versa.
    factor = 2

    k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance")
    # Oversized k-NN graph, to be consumed by the radius-based regressor.
    k_trans_factor = KNeighborsTransformer(
        n_neighbors=int(n_neighbors * factor), mode="distance"
    )

    r_trans = RadiusNeighborsTransformer(radius=radius, mode="distance")
    # Oversized radius graph, to be consumed by the k-NN regressor.
    r_trans_factor = RadiusNeighborsTransformer(
        radius=int(radius * factor), mode="distance"
    )

    k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    r_reg = RadiusNeighborsRegressor(radius=radius)

    # Matching transformers with same-mode regressors, plus the oversized
    # cross-mode pairings described above.
    test_list = [
        (k_trans, k_reg),
        (k_trans_factor, r_reg),
        (r_trans, r_reg),
        (r_trans_factor, k_reg),
    ]

    for trans, reg in test_list:
        # compare the chained version and the compact version
        reg_compact = clone(reg)
        reg_precomp = clone(reg)
        reg_precomp.set_params(metric="precomputed")

        reg_chain = make_pipeline(clone(trans), reg_precomp)

        y_pred_chain = reg_chain.fit(X, y).predict(X2)
        y_pred_compact = reg_compact.fit(X, y).predict(X2)
        assert_array_almost_equal(y_pred_chain, y_pred_compact)
@@ -0,0 +1,296 @@
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import itertools
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_almost_equal
|
||||
|
||||
from sklearn.metrics import DistanceMetric
|
||||
from sklearn.neighbors._ball_tree import (
|
||||
BallTree,
|
||||
kernel_norm,
|
||||
)
|
||||
from sklearn.neighbors._ball_tree import (
|
||||
NeighborsHeap64 as NeighborsHeapBT,
|
||||
)
|
||||
from sklearn.neighbors._ball_tree import (
|
||||
nodeheap_sort as nodeheap_sort_bt,
|
||||
)
|
||||
from sklearn.neighbors._ball_tree import (
|
||||
simultaneous_sort as simultaneous_sort_bt,
|
||||
)
|
||||
from sklearn.neighbors._kd_tree import (
|
||||
KDTree,
|
||||
)
|
||||
from sklearn.neighbors._kd_tree import (
|
||||
NeighborsHeap64 as NeighborsHeapKDT,
|
||||
)
|
||||
from sklearn.neighbors._kd_tree import (
|
||||
nodeheap_sort as nodeheap_sort_kdt,
|
||||
)
|
||||
from sklearn.neighbors._kd_tree import (
|
||||
simultaneous_sort as simultaneous_sort_kdt,
|
||||
)
|
||||
from sklearn.utils import check_random_state
|
||||
|
||||
# Shared fixtures for the tree tests below.
rng = np.random.RandomState(42)
V_mahalanobis = rng.rand(3, 3)
# Symmetrize so V is a valid covariance-like matrix for the mahalanobis metric.
V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T)

# Dimensionality of the random datasets used in the metric tests.
DIMENSION = 3

# Metric name -> extra keyword arguments needed to instantiate that metric.
METRICS = {
    "euclidean": {},
    "manhattan": {},
    "minkowski": dict(p=3),
    "chebyshev": {},
    "seuclidean": dict(V=rng.random_sample(DIMENSION)),
    "mahalanobis": dict(V=V_mahalanobis),
}

# KDTree supports only the axis-aligned metrics; BallTree supports all of them.
KD_TREE_METRICS = ["euclidean", "manhattan", "chebyshev", "minkowski"]
BALL_TREE_METRICS = list(METRICS)
def dist_func(x1, x2, p):
    """Minkowski distance of order ``p`` between two vectors.

    Used below as a user-defined callable metric for BallTree. The absolute
    value makes this a valid metric for any ``p``: without it, an odd ``p``
    combined with negative coordinate differences produces a negative sum,
    and raising that to the fractional power ``1/p`` yields NaN.
    For even ``p`` (as used in this file, ``p=2``) the result is unchanged.
    """
    return np.sum(np.abs(x1 - x2) ** p) ** (1.0 / p)
def compute_kernel_slow(Y, X, kernel, h):
    """Brute-force reference for kernel density estimates of Y against X.

    Computes the full Euclidean distance matrix and sums the (normalized)
    kernel weights over the training points X for each query point in Y.
    """
    d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
    norm = kernel_norm(h, X.shape[1], kernel)

    # Unnormalized per-pair kernel weights, one entry per supported kernel.
    weights = {
        "gaussian": lambda: np.exp(-0.5 * (d * d) / (h * h)),
        "tophat": lambda: (d < h),
        "epanechnikov": lambda: (1.0 - (d * d) / (h * h)) * (d < h),
        "exponential": lambda: np.exp(-d / h),
        "linear": lambda: (1 - d / h) * (d < h),
        "cosine": lambda: np.cos(0.5 * np.pi * d / h) * (d < h),
    }
    if kernel not in weights:
        raise ValueError("kernel not recognized")
    return norm * weights[kernel]().sum(-1)
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Reference k-nearest-neighbors via a full pairwise distance matrix.

    Returns ``(dist, ind)``: for each row of Y, the distances to and indices
    of its k nearest points in X, in increasing order of distance.
    """
    D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
    nearest = np.argsort(D, axis=1)[:, :k]
    rows = np.arange(Y.shape[0])[:, None]
    return D[rows, nearest], nearest
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
@pytest.mark.parametrize(
    "kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"]
)
@pytest.mark.parametrize("h", [0.01, 0.1, 1])
@pytest.mark.parametrize("rtol", [0, 1e-5])
@pytest.mark.parametrize("atol", [1e-6, 1e-2])
@pytest.mark.parametrize("breadth_first", [True, False])
def test_kernel_density(
    Cls, kernel, h, rtol, atol, breadth_first, n_samples=100, n_features=3
):
    """Tree-based kernel_density must match the brute-force reference within
    the requested tolerances, for every kernel/bandwidth/traversal combination."""
    rng = check_random_state(1)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    dens_true = compute_kernel_slow(Y, X, kernel, h)

    tree = Cls(X, leaf_size=10)
    dens = tree.kernel_density(
        Y, h, atol=atol, rtol=rtol, kernel=kernel, breadth_first=breadth_first
    )
    # rtol=0 would demand exact equality; floor it at 1e-7 to allow round-off.
    assert_allclose(dens, dens_true, atol=atol, rtol=max(rtol, 1e-7))
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_neighbor_tree_query_radius(Cls, n_samples=100, n_features=10):
    """query_radius must return exactly the set of points within distance r."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1e-15  # guard against round-off right at the radius boundary
    tree = Cls(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        found = np.sort(tree.query_radius([query_pt], r + eps)[0])
        expected = np.sort(np.where(rad <= r + eps)[0])
        assert_array_almost_equal(expected, found)
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_neighbor_tree_query_radius_distance(Cls, n_samples=100, n_features=10):
    """Distances returned by query_radius must match direct recomputation."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1e-15  # guard against round-off right at the radius boundary
    tree = Cls(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind, dist = tree.query_radius([query_pt], r + eps, return_distance=True)

        # Single query point: take the first (only) result row.
        recomputed = np.sqrt(((query_pt - X[ind[0]]) ** 2).sum(1))
        assert_array_almost_equal(recomputed, dist[0])
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
@pytest.mark.parametrize("dualtree", (True, False))
def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3):
    """two_point_correlation must count all (Y, X) pairs within each radius."""
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    tree = Cls(X, leaf_size=10)

    # Brute-force reference from the full distance matrix.
    D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    expected = [np.sum(D <= ri) for ri in r]

    observed = tree.two_point_correlation(Y, r=r, dualtree=dualtree)
    assert_array_almost_equal(observed, expected)
@pytest.mark.parametrize("NeighborsHeap", [NeighborsHeapBT, NeighborsHeapKDT])
def test_neighbors_heap(NeighborsHeap, n_pts=5, n_nbrs=10):
    """Pushing 2*n_nbrs candidates per row must retain the n_nbrs smallest,
    returned in sorted order by get_arrays(sort=True)."""
    heap = NeighborsHeap(n_pts, n_nbrs)
    rng = check_random_state(0)

    for row in range(n_pts):
        dists = rng.random_sample(2 * n_nbrs).astype(np.float64, copy=False)
        idxs = np.arange(2 * n_nbrs, dtype=np.intp)
        for dist, idx in zip(dists, idxs):
            heap.push(row, dist, idx)

        # Reference: sort candidates by distance with numpy.
        order = np.argsort(dists)
        dists = dists[order]
        idxs = idxs[order]

        d_heap, i_heap = heap.get_arrays(sort=True)

        assert_array_almost_equal(dists[:n_nbrs], d_heap[row])
        assert_array_almost_equal(idxs[:n_nbrs], i_heap[row])
@pytest.mark.parametrize("nodeheap_sort", [nodeheap_sort_bt, nodeheap_sort_kdt])
def test_node_heap(nodeheap_sort, n_nodes=50):
    """nodeheap_sort must agree with numpy's argsort on values and order."""
    rng = check_random_state(0)
    vals = rng.random_sample(n_nodes).astype(np.float64, copy=False)

    expected_order = np.argsort(vals)
    sorted_vals, order = nodeheap_sort(vals)

    assert_array_almost_equal(expected_order, order)
    assert_array_almost_equal(vals[expected_order], sorted_vals)
@pytest.mark.parametrize(
    "simultaneous_sort", [simultaneous_sort_bt, simultaneous_sort_kdt]
)
def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201):
    """In-place row-wise simultaneous sort must match the numpy reference."""
    rng = check_random_state(0)
    dist = rng.random_sample((n_rows, n_pts)).astype(np.float64, copy=False)
    ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(np.intp, copy=False)

    # Keep untouched copies for the numpy reference computation.
    dist_ref = dist.copy()
    ind_ref = ind.copy()

    # Sort each row in place with the function under test.
    simultaneous_sort(dist, ind)

    # Reference: reorder both arrays row-wise via fancy indexing.
    order = np.argsort(dist_ref, axis=1)
    rows = np.arange(n_rows)[:, None]
    dist_ref = dist_ref[rows, order]
    ind_ref = ind_ref[rows, order]

    assert_array_almost_equal(dist, dist_ref)
    assert_array_almost_equal(ind, ind_ref)
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_gaussian_kde(Cls, n_samples=1000):
    """Tree-based Gaussian KDE must agree with scipy.stats.gaussian_kde."""
    from scipy.stats import gaussian_kde

    rng = check_random_state(0)
    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for bandwidth in [0.01, 0.1, 1]:
        tree = Cls(x_in[:, None])
        # scipy scales its bandwidth by the data std; divide it out.
        reference = gaussian_kde(x_in, bw_method=bandwidth / np.std(x_in))

        dens_tree = tree.kernel_density(x_out[:, None], bandwidth) / n_samples
        dens_ref = reference.evaluate(x_out)

        assert_array_almost_equal(dens_tree, dens_ref, decimal=3)
@pytest.mark.parametrize(
    "Cls, metric",
    itertools.chain(
        [(KDTree, metric) for metric in KD_TREE_METRICS],
        [(BallTree, metric) for metric in BALL_TREE_METRICS],
    ),
)
@pytest.mark.parametrize("k", (1, 3, 5))
@pytest.mark.parametrize("dualtree", (True, False))
@pytest.mark.parametrize("breadth_first", (True, False))
def test_nn_tree_query(Cls, metric, k, dualtree, breadth_first):
    """Tree k-NN queries must match brute-force distances for every metric
    supported by the tree class and every traversal strategy."""
    rng = check_random_state(0)
    X = rng.random_sample((40, DIMENSION))
    Y = rng.random_sample((10, DIMENSION))

    # Extra constructor arguments required by this metric (p, V, ...).
    kwargs = METRICS[metric]

    kdt = Cls(X, leaf_size=1, metric=metric, **kwargs)
    dist1, ind1 = kdt.query(Y, k, dualtree=dualtree, breadth_first=breadth_first)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # don't check indices here: if there are any duplicate distances,
    # the indices may not match. Distances should not have this problem.
    assert_array_almost_equal(dist1, dist2)
@pytest.mark.parametrize(
    "Cls, metric",
    [(KDTree, "euclidean"), (BallTree, "euclidean"), (BallTree, dist_func)],
)
@pytest.mark.parametrize("protocol", (0, 1, 2))
def test_pickle(Cls, metric, protocol):
    """A pickled-then-unpickled tree must give identical query results.

    Covers a string metric on both tree classes and a user-supplied callable
    metric (BallTree only), across several pickle protocols.
    """
    rng = check_random_state(0)
    X = rng.random_sample((10, 3))

    # callable() replaces hasattr(metric, "__call__"): same check, idiomatic.
    if callable(metric):
        kwargs = {"p": 2}
    else:
        kwargs = {}

    tree1 = Cls(X, leaf_size=1, metric=metric, **kwargs)

    # query() returns (distances, indices); the original unpacked them under
    # swapped names, which was harmless (compared consistently) but misleading.
    dist1, ind1 = tree1.query(X)

    s = pickle.dumps(tree1, protocol=protocol)
    tree2 = pickle.loads(s)

    dist2, ind2 = tree2.query(X)

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)

    # The round-trip must restore the concrete tree class, not a base class.
    assert isinstance(tree2, Cls)
@@ -0,0 +1,150 @@
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.neighbors._quad_tree import _QuadTree
|
||||
from sklearn.utils import check_random_state
|
||||
|
||||
|
||||
def test_quadtree_boundary_computation():
    """Build quad trees on inputs with awkward boundaries and check coherence."""
    cases = [
        # a random case
        np.array([[-1, 1], [-4, -1]], dtype=np.float32),
        # only zeros inserted
        np.array([[0, 0], [0, 0]], dtype=np.float32),
        # only negative coordinates inserted
        np.array([[-1, -2], [-4, 0]], dtype=np.float32),
        # only tiny magnitudes inserted
        np.array([[-1e-6, 1e-6], [-4e-6, -1e-6]], dtype=np.float32),
    ]

    for X in cases:
        tree = _QuadTree(n_dimensions=2, verbose=0)
        tree.build_tree(X)
        tree._check_coherence()
def test_quadtree_similar_point():
    """Insertion must terminate when near-duplicate points are added.

    # Introduce a point into a quad tree where a similar point already exists.
    # Test will hang if it doesn't complete.
    """
    Xs = []

    # check the case where points are actually different
    Xs.append(np.array([[1, 2], [3, 4]], dtype=np.float32))
    # check the case where points are the same on X axis
    Xs.append(np.array([[1.0, 2.0], [1.0, 3.0]], dtype=np.float32))
    # check the case where points are arbitrarily close on X axis
    Xs.append(np.array([[1.00001, 2.0], [1.00002, 3.0]], dtype=np.float32))
    # check the case where points are the same on Y axis
    Xs.append(np.array([[1.0, 2.0], [3.0, 2.0]], dtype=np.float32))
    # check the case where points are arbitrarily close on Y axis
    Xs.append(np.array([[1.0, 2.00001], [3.0, 2.00002]], dtype=np.float32))
    # check the case where points are arbitrarily close on both axes
    Xs.append(np.array([[1.00001, 2.00001], [1.00002, 2.00002]], dtype=np.float32))

    # check the case where points are arbitrarily close on both axes
    # close to machine epsilon - x axis
    Xs.append(np.array([[1, 0.0003817754041], [2, 0.0003817753750]], dtype=np.float32))

    # check the case where points are arbitrarily close on both axes
    # close to machine epsilon - y axis
    Xs.append(
        np.array([[0.0003817754041, 1.0], [0.0003817753750, 2.0]], dtype=np.float32)
    )

    # Building must complete (no infinite subdivision) and stay coherent.
    for X in Xs:
        tree = _QuadTree(n_dimensions=2, verbose=0)
        tree.build_tree(X)
        tree._check_coherence()
@pytest.mark.parametrize("n_dimensions", (2, 3))
@pytest.mark.parametrize("protocol", (0, 1, 2))
def test_quad_tree_pickle(n_dimensions, protocol):
    """Unpickling a quad tree must preserve each point's cell assignment."""
    rng = check_random_state(0)

    X = rng.random_sample((10, n_dimensions))

    tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
    tree.build_tree(X)

    restored = pickle.loads(pickle.dumps(tree, protocol=protocol))

    # Every point must land in the same cell before and after the round-trip.
    for x in X:
        assert tree.get_cell(x) == restored.get_cell(x)
@pytest.mark.parametrize("n_dimensions", (2, 3))
def test_qt_insert_duplicate(n_dimensions):
    """Near-duplicate points must be merged into a single leaf of size 2."""
    rng = check_random_state(0)

    X = rng.random_sample((10, n_dimensions))
    # create some duplicates
    Xd = np.r_[X, X[:5]]
    epsilon = 1e-6
    # EPSILON=1e-6 is defined in sklearn/neighbors/_quad_tree.pyx but not
    # accessible from Python
    # add slight noise: duplicate detection should tolerate tiny numerical differences
    Xd += epsilon * (rng.rand(*Xd.shape) - 0.5)
    tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
    tree.build_tree(Xd)

    cumulative_size = tree.cumulative_size
    leafs = tree.leafs

    # Assert that the first 5 are indeed duplicated and that the next
    # ones are single point leaf
    for i, x in enumerate(X):
        cell_id = tree.get_cell(x)
        assert leafs[cell_id]
        # leaf size is 2 for the duplicated first five points, else 1
        assert cumulative_size[cell_id] == 1 + (i < 5)
def test_summarize():
    # Simple check for quad tree's summarize
    #
    # The summary is a flat buffer of fixed-width records, one per summarized
    # node: `offset = n_dimensions + 2` values each, where entry
    # [n_dimensions] is the squared distance to the node and
    # [n_dimensions + 1] is the node size. The first n_dimensions entries are
    # presumably the offset to the node barycenter — TODO confirm against
    # _quad_tree.pyx.

    angle = 0.9
    # Three points clustered at (~10, ~10) and one far query point at (-10, -10),
    # so with a large angle the cluster is summarized as a single node.
    X = np.array(
        [[-10.0, -10.0], [9.0, 10.0], [10.0, 9.0], [10.0, 10.0]], dtype=np.float32
    )
    query_pt = X[0, :]
    n_dimensions = X.shape[1]
    offset = n_dimensions + 2

    qt = _QuadTree(n_dimensions, verbose=0)
    qt.build_tree(X)

    idx, summary = qt._py_summarize(query_pt, X, angle)

    node_dist = summary[n_dimensions]
    node_size = summary[n_dimensions + 1]

    # Summary should contain only 1 node with size 3 and distance to
    # X[1:] barycenter
    barycenter = X[1:].mean(axis=0)
    ds2c = ((X[0] - barycenter) ** 2).sum()

    assert idx == offset
    assert node_size == 3, "summary size = {}".format(node_size)
    assert np.isclose(node_dist, ds2c)

    # Summary should contain all 3 node with size 1 and distance to
    # each point in X[1:] for ``angle=0``
    idx, summary = qt._py_summarize(query_pt, X, 0.0)
    barycenter = X[1:].mean(axis=0)
    ds2c = ((X[0] - barycenter) ** 2).sum()

    # With angle=0 no node is coarse enough: three records, one per point.
    assert idx == 3 * (offset)
    for i in range(3):
        node_dist = summary[i * offset + n_dimensions]
        node_size = summary[i * offset + n_dimensions + 1]

        ds2c = ((X[0] - X[i + 1]) ** 2).sum()

        assert node_size == 1, "summary size = {}".format(node_size)
        assert np.isclose(node_dist, ds2c)
|
||||
Reference in New Issue
Block a user