commit e5a4711004 (parent 3031b7153b)
2026-04-10 15:06:59 +02:00
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,54 @@
"""Module to give helpful messages to the user that did not
compile scikit-learn properly.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import os
INPLACE_MSG = """
It appears that you are importing a local scikit-learn source tree. For
this, you need to have an inplace install. Maybe you are in the source
directory and you need to try from another location."""
STANDARD_MSG = """
If you have used an installer, please check that it is suited for your
Python version, your operating system and your platform."""
def raise_build_error(e):
# Raise a comprehensible error and list the contents of the
# directory to help debugging on the mailing list.
local_dir = os.path.split(__file__)[0]
msg = STANDARD_MSG
if local_dir == "sklearn/__check_build":
# Picking up the local install: this will work only if the
# install is an 'inplace build'
msg = INPLACE_MSG
dir_content = list()
for i, filename in enumerate(os.listdir(local_dir)):
if (i + 1) % 3:
dir_content.append(filename.ljust(26))
else:
dir_content.append(filename + "\n")
raise ImportError(
"""%s
___________________________________________________________________________
Contents of %s:
%s
___________________________________________________________________________
It seems that scikit-learn has not been built correctly.
If you have installed scikit-learn from source, please do not forget
to build the package before using it. For detailed instructions, see:
https://scikit-learn.org/dev/developers/advanced_installation.html#building-from-source
%s"""
% (e, local_dir, "".join(dir_content).strip(), msg)
)
try:
from sklearn.__check_build._check_build import check_build # noqa: F401
except ImportError as e:
raise_build_error(e)
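A minimal sketch of the failure path above (hypothetical trigger; in practice the check runs implicitly when `import sklearn` picks up a source tree whose extensions were never compiled):

# Simulate the compiled `_check_build` extension being missing and let the
# helper re-raise an ImportError carrying the directory listing and the
# build instructions defined above.
try:
    raise ImportError("cannot import name 'check_build'")
except ImportError as exc:
    raise_build_error(exc)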

View File

@@ -0,0 +1,2 @@
def check_build():
return

View File

@@ -0,0 +1,6 @@
py.extension_module(
'_check_build',
cython_gen.process('_check_build.pyx'),
install: true,
subdir: 'sklearn/__check_build',
)

View File

@@ -0,0 +1,150 @@
"""Configure global settings and get information about the working environment."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# Machine learning module for Python
# ==================================
#
# sklearn is a Python module integrating classical machine
# learning algorithms in the tightly-knit world of scientific Python
# packages (numpy, scipy, matplotlib).
#
# It aims to provide simple and efficient solutions to learning problems
# that are accessible to everybody and reusable in various contexts:
# machine-learning as a versatile tool for science and engineering.
#
# See https://scikit-learn.org for complete documentation.
import importlib as _importlib
import logging
import os
import random
from sklearn._config import config_context, get_config, set_config
logger = logging.getLogger(__name__)
# PEP0440 compatible formatted version, see:
# https://www.python.org/dev/peps/pep-0440/
#
# Generic release markers:
# X.Y.0 # For first release after an increment in Y
# X.Y.Z # For bugfix releases
#
# Admissible pre-release markers:
# X.Y.ZaN # Alpha release
# X.Y.ZbN # Beta release
# X.Y.ZrcN # Release Candidate
# X.Y.Z # Final release
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = "1.8.0"
# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
# simultaneously. This can happen for instance when calling BLAS inside a
# prange. Setting the following environment variable allows multiple OpenMP
libraries to be loaded. It should not degrade performance since we manually
take care of potential over-subscription performance issues, in sections of
# the code where nested OpenMP loops can happen, by dynamically reconfiguring
# the inner OpenMP runtime to temporarily disable it while under the scope of
# the outer OpenMP parallel section.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")
# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")
# `_distributor_init` allows distributors to run custom init code.
# For instance, for the Windows wheel, this is used to pre-load the
# vcomp shared library runtime for OpenMP embedded in the sklearn/.libs
# sub-folder.
# It is necessary to do this prior to importing show_versions, as the
# latter is linked to the OpenMP runtime in order to introspect it, and
# importing it first would fail if the OpenMP dll cannot be found.
from sklearn import __check_build, _distributor_init # noqa: E402 F401
from sklearn.base import clone # noqa: E402
from sklearn.utils._show_versions import show_versions # noqa: E402
_submodules = [
"calibration",
"cluster",
"covariance",
"cross_decomposition",
"datasets",
"decomposition",
"dummy",
"ensemble",
"exceptions",
"experimental",
"externals",
"feature_extraction",
"feature_selection",
"frozen",
"gaussian_process",
"inspection",
"isotonic",
"kernel_approximation",
"kernel_ridge",
"linear_model",
"manifold",
"metrics",
"mixture",
"model_selection",
"multiclass",
"multioutput",
"naive_bayes",
"neighbors",
"neural_network",
"pipeline",
"preprocessing",
"random_projection",
"semi_supervised",
"svm",
"tree",
"discriminant_analysis",
"impute",
"compose",
]
__all__ = _submodules + [
# Non-modules:
"clone",
"get_config",
"set_config",
"config_context",
"show_versions",
]
def __dir__():
return __all__
def __getattr__(name):
if name in _submodules:
return _importlib.import_module(f"sklearn.{name}")
else:
try:
return globals()[name]
except KeyError:
raise AttributeError(f"Module 'sklearn' has no attribute '{name}'")
def setup_module(module):
"""Fixture for the tests to assure globally controllable seeding of RNGs"""
import numpy as np
# Check if a random seed exists in the environment, if not create one.
_random_seed = os.environ.get("SKLEARN_SEED", None)
if _random_seed is None:
_random_seed = np.random.uniform() * np.iinfo(np.int32).max
_random_seed = int(_random_seed)
print("I: Seeding RNGs with %r" % _random_seed)
np.random.seed(_random_seed)
random.seed(_random_seed)
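The `__dir__`/`__getattr__` pair above implements lazy submodule loading (PEP 562): names listed in `_submodules` are imported only on first attribute access, which keeps `import sklearn` itself cheap, while `setup_module` lets the test suite pin NumPy's and Python's RNGs through the `SKLEARN_SEED` environment variable. A small illustration (a sketch, not part of the committed file):

>>> import sklearn
>>> "cluster" in dir(sklearn)   # advertised through __dir__
True
>>> cluster = sklearn.cluster   # first access calls __getattr__, which imports sklearn.cluster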

View File

@@ -0,0 +1,62 @@
#!/usr/bin/env python3
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import argparse
import os
from Cython import Tempita as tempita
# XXX: If this import ever fails (does it really?), vendor either
# cython.tempita or numpy/npy_tempita.
def process_tempita(fromfile, outfile=None):
"""Process tempita templated file and write out the result.
The template file is expected to end in `.c.tp` or `.pyx.tp`:
E.g. processing `template.c.tp` generates `template.c`.
"""
with open(fromfile, "r", encoding="utf-8") as f:
template_content = f.read()
template = tempita.Template(template_content)
content = template.substitute()
with open(outfile, "w", encoding="utf-8") as f:
f.write(content)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("infile", type=str, help="Path to the input file")
parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory")
parser.add_argument(
"-i",
"--ignore",
type=str,
help=(
"An ignored input - may be useful to add a "
"dependency between custom targets"
),
)
args = parser.parse_args()
if not args.infile.endswith(".tp"):
raise ValueError(f"Unexpected extension: {args.infile}")
if not args.outdir:
raise ValueError("Missing `--outdir` argument to tempita.py")
outdir_abs = os.path.join(os.getcwd(), args.outdir)
outfile = os.path.join(
outdir_abs, os.path.splitext(os.path.split(args.infile)[1])[0]
)
process_tempita(args.infile, outfile)
if __name__ == "__main__":
main()
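Meson drives this script through a custom target (see the `_loss` meson.build later in this commit: `command: [tempita, '@INPUT@', '-o', '@OUTDIR@']`). A hand-run sketch with a hypothetical template name:

# example.pyx.tp could contain Tempita markup such as:
#   {{py: dtypes = ["float", "double"]}}
#   {{for T in dtypes}}
#   cdef {{T}} add_{{T}}({{T}} a, {{T}} b) noexcept nogil:
#       return a + b
#   {{endfor}}
process_tempita("example.pyx.tp", outfile="build/example.pyx")
# build/example.pyx now contains one add_float and one add_double definition.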

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python3
"""Extract version number from __init__.py"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import os
sklearn_init = os.path.join(os.path.dirname(__file__), "../__init__.py")
data = open(sklearn_init).readlines()
version_line = next(line for line in data if line.startswith("__version__"))
version = version_line.strip().split(" = ")[1].replace('"', "").replace("'", "")
print(version)
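The parsing step reduces to a simple string split; for the `__version__ = "1.8.0"` line of `sklearn/__init__.py` shown earlier in this commit:

>>> line = '__version__ = "1.8.0"'
>>> line.strip().split(" = ")[1].replace('"', "").replace("'", "")
'1.8.0'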

View File

@@ -0,0 +1,407 @@
"""Global configuration state and functions for management"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import os
import threading
from contextlib import contextmanager as contextmanager
_global_config = {
"assume_finite": bool(os.environ.get("SKLEARN_ASSUME_FINITE", False)),
"working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)),
"print_changed_only": True,
"display": "diagram",
"pairwise_dist_chunk_size": int(
os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256)
),
"enable_cython_pairwise_dist": True,
"array_api_dispatch": False,
"transform_output": "default",
"enable_metadata_routing": False,
"skip_parameter_validation": False,
}
_threadlocal = threading.local()
def _get_threadlocal_config():
"""Get a threadlocal **mutable** configuration. If the configuration
does not exist, copy the default global configuration."""
if not hasattr(_threadlocal, "global_config"):
_threadlocal.global_config = _global_config.copy()
return _threadlocal.global_config
def get_config():
"""Retrieve the current scikit-learn configuration.
This reflects the effective global configurations as established by default upon
library import, or modified via :func:`set_config` or :func:`config_context`.
Returns
-------
config : dict
Keys are parameter names that can be passed to :func:`set_config`.
See Also
--------
config_context : Context manager for global scikit-learn configuration.
set_config : Set global scikit-learn configuration.
Examples
--------
>>> import sklearn
>>> config = sklearn.get_config()
>>> config.keys()
dict_keys([...])
"""
# Return a copy of the threadlocal configuration so that users will
# not be able to modify the configuration with the returned dict.
return _get_threadlocal_config().copy()
def set_config(
assume_finite=None,
working_memory=None,
print_changed_only=None,
display=None,
pairwise_dist_chunk_size=None,
enable_cython_pairwise_dist=None,
array_api_dispatch=None,
transform_output=None,
enable_metadata_routing=None,
skip_parameter_validation=None,
):
"""Set global scikit-learn configuration.
These settings control the behaviour of scikit-learn functions during a library
usage session. Global configuration defaults (as described in the parameter list
below) take effect when scikit-learn is imported.
This function can be used to modify the global scikit-learn configuration at
runtime. Passing `None` as an argument (the default) leaves the corresponding
setting unchanged. This allows users to selectively update the global configuration
values without affecting the others.
.. versionadded:: 0.19
Parameters
----------
assume_finite : bool, default=None
If True, validation for finiteness will be skipped,
saving time, but leading to potential crashes. If
False, validation for finiteness will be performed,
avoiding error. Global default: False.
.. versionadded:: 0.19
working_memory : int, default=None
If set, scikit-learn will attempt to limit the size of temporary arrays
to this number of MiB (per job when parallelised), often saving both
computation time and memory on expensive operations that can be
performed in chunks. Global default: 1024.
.. versionadded:: 0.20
print_changed_only : bool, default=None
If True, only the parameters that were set to non-default
values will be printed when printing an estimator. For example,
``print(SVC())`` while True will only print 'SVC()' while the default
behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with
all the non-changed parameters. Global default: True.
.. versionadded:: 0.21
.. versionchanged:: 0.23
Global default configuration changed from False to True.
display : {'text', 'diagram'}, default=None
If 'diagram', estimators will be displayed as a diagram in a Jupyter
lab or notebook context. If 'text', estimators will be displayed as
text. Global default: 'diagram'.
.. versionadded:: 0.23
pairwise_dist_chunk_size : int, default=None
The number of row vectors per chunk for the accelerated pairwise-
distances reduction backend. Global default: 256 (suitable for most
modern laptops' caches and architectures).
Intended for easier benchmarking and testing of scikit-learn internals.
End users are not expected to benefit from customizing this configuration
setting.
.. versionadded:: 1.1
enable_cython_pairwise_dist : bool, default=None
Use the accelerated pairwise-distances reduction backend when
possible. Global default: True.
Intended for easier benchmarking and testing of scikit-learn internals.
End users are not expected to benefit from customizing this configuration
setting.
.. versionadded:: 1.1
array_api_dispatch : bool, default=None
Use Array API dispatching when inputs follow the Array API standard.
Global default: False.
See the :ref:`User Guide <array_api>` for more details.
.. versionadded:: 1.2
transform_output : str, default=None
Configure output of `transform` and `fit_transform`.
See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
for an example on how to use the API.
- `"default"`: Default output format of a transformer
- `"pandas"`: DataFrame output
- `"polars"`: Polars output
- `None`: Transform configuration is unchanged
Global default: "default".
.. versionadded:: 1.2
.. versionadded:: 1.4
`"polars"` option was added.
enable_metadata_routing : bool, default=None
Enable metadata routing. By default this feature is disabled.
Refer to :ref:`metadata routing user guide <metadata_routing>` for more
details.
- `True`: Metadata routing is enabled
- `False`: Metadata routing is disabled, use the old syntax.
- `None`: Configuration is unchanged
Global default: False.
.. versionadded:: 1.3
skip_parameter_validation : bool, default=None
If `True`, disable the validation of the hyper-parameters' types and values in
the fit method of estimators and for arguments passed to public helper
functions. It can save time in some situations but can lead to low level
crashes and exceptions with confusing error messages.
Global default: False.
Note that for data parameters, such as `X` and `y`, only type validation is
skipped but validation with `check_array` will continue to run.
.. versionadded:: 1.3
See Also
--------
config_context : Context manager for global scikit-learn configuration.
get_config : Retrieve current values of the global configuration.
Examples
--------
>>> from sklearn import set_config
>>> set_config(display='diagram') # doctest: +SKIP
"""
local_config = _get_threadlocal_config()
if assume_finite is not None:
local_config["assume_finite"] = assume_finite
if working_memory is not None:
local_config["working_memory"] = working_memory
if print_changed_only is not None:
local_config["print_changed_only"] = print_changed_only
if display is not None:
local_config["display"] = display
if pairwise_dist_chunk_size is not None:
local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size
if enable_cython_pairwise_dist is not None:
local_config["enable_cython_pairwise_dist"] = enable_cython_pairwise_dist
if array_api_dispatch is not None:
from sklearn.utils._array_api import _check_array_api_dispatch
_check_array_api_dispatch(array_api_dispatch)
local_config["array_api_dispatch"] = array_api_dispatch
if transform_output is not None:
local_config["transform_output"] = transform_output
if enable_metadata_routing is not None:
local_config["enable_metadata_routing"] = enable_metadata_routing
if skip_parameter_validation is not None:
local_config["skip_parameter_validation"] = skip_parameter_validation
@contextmanager
def config_context(
*,
assume_finite=None,
working_memory=None,
print_changed_only=None,
display=None,
pairwise_dist_chunk_size=None,
enable_cython_pairwise_dist=None,
array_api_dispatch=None,
transform_output=None,
enable_metadata_routing=None,
skip_parameter_validation=None,
):
"""Context manager to temporarily change the global scikit-learn configuration.
This context manager can be used to apply scikit-learn configuration changes within
the scope of the with statement. Once the context exits, the global configuration is
restored again.
The default global configurations (which take effect when scikit-learn is imported)
are defined below in the parameter list.
Parameters
----------
assume_finite : bool, default=None
If True, validation for finiteness will be skipped,
saving time, but leading to potential crashes. If
False, validation for finiteness will be performed,
avoiding error. If None, the existing configuration won't change.
Global default: False.
working_memory : int, default=None
If set, scikit-learn will attempt to limit the size of temporary arrays
to this number of MiB (per job when parallelised), often saving both
computation time and memory on expensive operations that can be
performed in chunks. If None, the existing configuration won't change.
Global default: 1024.
print_changed_only : bool, default=None
If True, only the parameters that were set to non-default
values will be printed when printing an estimator. For example,
``print(SVC())`` while True will only print 'SVC()', but would print
'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters
when False. If None, the existing configuration won't change.
Global default: True.
.. versionchanged:: 0.23
Global default configuration changed from False to True.
display : {'text', 'diagram'}, default=None
If 'diagram', estimators will be displayed as a diagram in a Jupyter
lab or notebook context. If 'text', estimators will be displayed as
text. If None, the existing configuration won't change.
Global default: 'diagram'.
.. versionadded:: 0.23
pairwise_dist_chunk_size : int, default=None
The number of row vectors per chunk for the accelerated pairwise-
distances reduction backend. Global default: 256 (suitable for most
modern laptops' caches and architectures).
Intended for easier benchmarking and testing of scikit-learn internals.
End users are not expected to benefit from customizing this configuration
setting.
.. versionadded:: 1.1
enable_cython_pairwise_dist : bool, default=None
Use the accelerated pairwise-distances reduction backend when
possible. Global default: True.
Intended for easier benchmarking and testing of scikit-learn internals.
End users are not expected to benefit from customizing this configuration
setting.
.. versionadded:: 1.1
array_api_dispatch : bool, default=None
Use Array API dispatching when inputs follow the Array API standard.
Global default: False.
See the :ref:`User Guide <array_api>` for more details.
.. versionadded:: 1.2
transform_output : str, default=None
Configure output of `transform` and `fit_transform`.
See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
for an example on how to use the API.
- `"default"`: Default output format of a transformer
- `"pandas"`: DataFrame output
- `"polars"`: Polars output
- `None`: Transform configuration is unchanged
Global default: "default".
.. versionadded:: 1.2
.. versionadded:: 1.4
`"polars"` option was added.
enable_metadata_routing : bool, default=None
Enable metadata routing. By default this feature is disabled.
Refer to :ref:`metadata routing user guide <metadata_routing>` for more
details.
- `True`: Metadata routing is enabled
- `False`: Metadata routing is disabled, use the old syntax.
- `None`: Configuration is unchanged
Global default: False.
.. versionadded:: 1.3
skip_parameter_validation : bool, default=None
If `True`, disable the validation of the hyper-parameters' types and values in
the fit method of estimators and for arguments passed to public helper
functions. It can save time in some situations but can lead to low level
crashes and exceptions with confusing error messages.
Global default: False.
Note that for data parameters, such as `X` and `y`, only type validation is
skipped but validation with `check_array` will continue to run.
.. versionadded:: 1.3
Yields
------
None.
See Also
--------
set_config : Set global scikit-learn configuration.
get_config : Retrieve current values of the global configuration.
Notes
-----
All settings, not just those presently modified, will be returned to
their previous values when the context manager is exited.
Examples
--------
>>> import sklearn
>>> from sklearn.utils.validation import assert_all_finite
>>> with sklearn.config_context(assume_finite=True):
... assert_all_finite([float('nan')])
>>> with sklearn.config_context(assume_finite=True):
... with sklearn.config_context(assume_finite=False):
... assert_all_finite([float('nan')])
Traceback (most recent call last):
...
ValueError: Input contains NaN...
"""
old_config = get_config()
set_config(
assume_finite=assume_finite,
working_memory=working_memory,
print_changed_only=print_changed_only,
display=display,
pairwise_dist_chunk_size=pairwise_dist_chunk_size,
enable_cython_pairwise_dist=enable_cython_pairwise_dist,
array_api_dispatch=array_api_dispatch,
transform_output=transform_output,
enable_metadata_routing=enable_metadata_routing,
skip_parameter_validation=skip_parameter_validation,
)
try:
yield
finally:
set_config(**old_config)
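Because the configuration lives in `threading.local()` (see `_get_threadlocal_config` above), `set_config` and `config_context` only affect the calling thread; a freshly started thread copies the import-time defaults. A small sketch of that behaviour:

import threading

import sklearn

sklearn.set_config(assume_finite=True)  # changes this thread's copy only
seen = {}

def worker():
    # The new thread gets a fresh copy of the module-level defaults,
    # not the caller's modified configuration.
    seen["assume_finite"] = sklearn.get_config()["assume_finite"]

t = threading.Thread(target=worker)
t.start()
t.join()
print(sklearn.get_config()["assume_finite"], seen["assume_finite"])  # True False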

View File

@@ -0,0 +1,13 @@
"""Distributor init file
Distributors: you can add custom code here to support particular distributions
of scikit-learn.
For example, this is a good place to put any checks for hardware requirements.
The scikit-learn standard source distribution will not put code in this file,
so you can safely replace this file with your own version.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

View File

@@ -0,0 +1,116 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# Uses the pool adjacent violators algorithm (PAVA), with the
# enhancement of searching for the longest decreasing subsequence to
# pool at each step.
import numpy as np
from cython cimport floating
def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w):
cdef:
Py_ssize_t n = y.shape[0], i, k
floating prev_y, sum_wy, sum_w
Py_ssize_t[::1] target = np.arange(n, dtype=np.intp)
# target describes a list of blocks. At any time, if [i..j] (inclusive) is
# an active block, then target[i] := j and target[j] := i.
# For "active" indices (block starts):
# w[i] := sum{w_orig[j], j=[i..target[i]]}
# y[i] := sum{y_orig[j]*w_orig[j], j=[i..target[i]]} / w[i]
with nogil:
i = 0
while i < n:
k = target[i] + 1
if k == n:
break
if y[i] < y[k]:
i = k
continue
sum_wy = w[i] * y[i]
sum_w = w[i]
while True:
# We are within a decreasing subsequence.
prev_y = y[k]
sum_wy += w[k] * y[k]
sum_w += w[k]
k = target[k] + 1
if k == n or prev_y < y[k]:
# Non-singleton decreasing subsequence is finished,
# update first entry.
y[i] = sum_wy / sum_w
w[i] = sum_w
target[i] = k - 1
target[k - 1] = i
if i > 0:
# Backtrack if we can. This makes the algorithm
# single-pass and ensures O(n) complexity.
i = target[i - 1]
# Otherwise, restart from the same point.
break
# Reconstruct the solution.
i = 0
while i < n:
k = target[i] + 1
y[i + 1 : k] = y[i]
i = k
def _make_unique(const floating[::1] X,
const floating[::1] y,
const floating[::1] sample_weights):
"""Average targets for duplicate X, drop duplicates.
Aggregates duplicate X values into a single X value where
the target y is a (sample_weighted) average of the individual
targets.
Assumes that X is ordered, so that all duplicates follow each other.
"""
unique_values = len(np.unique(X))
if floating is float:
dtype = np.float32
else:
dtype = np.float64
cdef floating[::1] y_out = np.empty(unique_values, dtype=dtype)
cdef floating[::1] x_out = np.empty_like(y_out)
cdef floating[::1] weights_out = np.empty_like(y_out)
cdef floating current_x = X[0]
cdef floating current_y = 0
cdef floating current_weight = 0
cdef int i = 0
cdef int j
cdef floating x
cdef int n_samples = len(X)
cdef floating eps = np.finfo(dtype).resolution
for j in range(n_samples):
x = X[j]
if x - current_x >= eps:
# next unique value
x_out[i] = current_x
weights_out[i] = current_weight
y_out[i] = current_y / current_weight
i += 1
current_x = x
current_weight = sample_weights[j]
current_y = y[j] * sample_weights[j]
else:
current_weight += sample_weights[j]
current_y += y[j] * sample_weights[j]
x_out[i] = current_x
weights_out[i] = current_weight
y_out[i] = current_y / current_weight
return(
np.asarray(x_out[:i+1]),
np.asarray(y_out[:i+1]),
np.asarray(weights_out[:i+1]),
)
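For intuition, a naive pure-Python pool-adjacent-violators sketch (quadratic in the worst case, unlike the single-pass O(n) block/`target` bookkeeping above) that produces the same weighted isotonic fit; illustrative only:

import numpy as np

def pava_reference(y, w):
    # Weighted isotonic regression by repeatedly pooling adjacent violators.
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)
    blocks = [[i] for i in range(len(y))]   # original indices covered by each block
    vals = list(y)                          # weighted mean of each block
    wts = list(w)                           # total weight of each block
    i = 0
    while i < len(vals) - 1:
        if vals[i] > vals[i + 1]:           # violator: merge the two blocks
            tot = wts[i] + wts[i + 1]
            vals[i] = (wts[i] * vals[i] + wts[i + 1] * vals[i + 1]) / tot
            wts[i] = tot
            blocks[i] += blocks[i + 1]
            del vals[i + 1], wts[i + 1], blocks[i + 1]
            i = max(i - 1, 0)               # backtrack: earlier blocks may now violate
        else:
            i += 1
    out = np.empty(len(y))
    for val, idx in zip(vals, blocks):
        out[idx] = val
    return out

print(pava_reference([3.0, 1.0, 2.0, 4.0], [1.0, 1.0, 1.0, 1.0]))  # [2. 2. 2. 4.]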

View File

@@ -0,0 +1,33 @@
"""
The :mod:`sklearn._loss` module includes loss function classes suitable for
fitting classification and regression tasks.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from sklearn._loss.loss import (
AbsoluteError,
HalfBinomialLoss,
HalfGammaLoss,
HalfMultinomialLoss,
HalfPoissonLoss,
HalfSquaredError,
HalfTweedieLoss,
HalfTweedieLossIdentity,
HuberLoss,
PinballLoss,
)
__all__ = [
"AbsoluteError",
"HalfBinomialLoss",
"HalfGammaLoss",
"HalfMultinomialLoss",
"HalfPoissonLoss",
"HalfSquaredError",
"HalfTweedieLoss",
"HalfTweedieLossIdentity",
"HuberLoss",
"PinballLoss",
]
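The "Half" prefix in these names reflects dropped constant factors (half squared error, half deviances), which keeps the gradients in their simplest form. A quick NumPy check of the half squared error convention (illustrative only; it does not call the private Cython implementations):

import numpy as np

y_true = np.array([1.0, 2.0, 3.0])
raw_prediction = np.array([0.5, 2.5, 3.0])
loss = 0.5 * (y_true - raw_prediction) ** 2   # per-sample half squared error
gradient = raw_prediction - y_true            # d(loss)/d(raw_prediction), no factor of 2
# loss -> [0.125, 0.125, 0.0], gradient -> [-0.5, 0.5, 0.0]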

View File

@@ -0,0 +1,101 @@
# Fused types for input like y_true, raw_prediction, sample_weights.
ctypedef fused floating_in:
double
float
# Fused types for output like gradient and hessian
# We use different fused types for input (floating_in) and output (floating_out), such
# that input and output can have different dtypes in the same function call. A single
# fused type can only take on one single value (type) for all arguments in one function
# call.
ctypedef fused floating_out:
double
float
# Struct to return 2 doubles
ctypedef struct double_pair:
double val1
double val2
# C base class for loss functions
cdef class CyLossFunction:
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyHalfSquaredError(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyAbsoluteError(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyPinballLoss(CyLossFunction):
cdef readonly double quantile # readonly makes it accessible from Python
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyHuberLoss(CyLossFunction):
cdef public double delta # public makes it accessible from Python
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyHalfPoissonLoss(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyHalfGammaLoss(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyHalfTweedieLoss(CyLossFunction):
cdef readonly double power # readonly makes it accessible from Python
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyHalfTweedieLossIdentity(CyLossFunction):
cdef readonly double power # readonly makes it accessible from Python
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyHalfBinomialLoss(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyExponentialLoss(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
cdef class CyHalfMultinomialLoss():
cdef void cy_gradient(
self,
const floating_in y_true,
const floating_in[::1] raw_prediction,
const floating_in sample_weight,
floating_out[::1] gradient_out,
) noexcept nogil

File diff suppressed because it is too large.

View File

@@ -0,0 +1,282 @@
"""
Module contains classes for invertible (and differentiable) link functions.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from abc import ABC, abstractmethod
from dataclasses import dataclass
import numpy as np
from scipy.special import expit, logit
from scipy.stats import gmean
from sklearn.utils.extmath import softmax
@dataclass
class Interval:
low: float
high: float
low_inclusive: bool
high_inclusive: bool
def __post_init__(self):
"""Check that low <= high"""
if self.low > self.high:
raise ValueError(
f"One must have low <= high; got low={self.low}, high={self.high}."
)
def includes(self, x):
"""Test whether all values of x are in interval range.
Parameters
----------
x : ndarray
Array whose elements are tested to be in interval range.
Returns
-------
result : bool
"""
if self.low_inclusive:
low = np.greater_equal(x, self.low)
else:
low = np.greater(x, self.low)
if not np.all(low):
return False
if self.high_inclusive:
high = np.less_equal(x, self.high)
else:
high = np.less(x, self.high)
# Note: np.all returns numpy.bool_
return bool(np.all(high))
def _inclusive_low_high(interval, dtype=np.float64):
"""Generate values low and high to be within the interval range.
This is used in tests only.
Returns
-------
low, high : tuple
The returned values low and high lie within the interval.
"""
eps = 10 * np.finfo(dtype).eps
if interval.low == -np.inf:
low = -1e10
elif interval.low < 0:
low = interval.low * (1 - eps) + eps
else:
low = interval.low * (1 + eps) + eps
if interval.high == np.inf:
high = 1e10
elif interval.high < 0:
high = interval.high * (1 + eps) - eps
else:
high = interval.high * (1 - eps) - eps
return low, high
class BaseLink(ABC):
"""Abstract base class for differentiable, invertible link functions.
Convention:
- link function g: raw_prediction = g(y_pred)
- inverse link h: y_pred = h(raw_prediction)
For (generalized) linear models, `raw_prediction = X @ coef` is the so
called linear predictor, and `y_pred = h(raw_prediction)` is the predicted
conditional (on X) expected value of the target `y_true`.
The methods are not implemented as staticmethods in case a link function needs
parameters.
"""
is_multiclass = False # used for testing only
# Usually, raw_prediction may be any real number and y_pred is an open
# interval.
# interval_raw_prediction = Interval(-np.inf, np.inf, False, False)
interval_y_pred = Interval(-np.inf, np.inf, False, False)
@abstractmethod
def link(self, y_pred, out=None):
"""Compute the link function g(y_pred).
The link function maps (predicted) target values to raw predictions,
i.e. `g(y_pred) = raw_prediction`.
Parameters
----------
y_pred : array
Predicted target values.
out : array
A location into which the result is stored. If provided, it must
have a shape that the inputs broadcast to. If not provided or None,
a freshly-allocated array is returned.
Returns
-------
out : array
Output array, element-wise link function.
"""
@abstractmethod
def inverse(self, raw_prediction, out=None):
"""Compute the inverse link function h(raw_prediction).
The inverse link function maps raw predictions to predicted target
values, i.e. `h(raw_prediction) = y_pred`.
Parameters
----------
raw_prediction : array
Raw prediction values (in link space).
out : array
A location into which the result is stored. If provided, it must
have a shape that the inputs broadcast to. If not provided or None,
a freshly-allocated array is returned.
Returns
-------
out : array
Output array, element-wise inverse link function.
"""
class IdentityLink(BaseLink):
"""The identity link function g(x)=x."""
def link(self, y_pred, out=None):
if out is not None:
np.copyto(out, y_pred)
return out
else:
return y_pred
inverse = link
class LogLink(BaseLink):
"""The log link function g(x)=log(x)."""
interval_y_pred = Interval(0, np.inf, False, False)
def link(self, y_pred, out=None):
return np.log(y_pred, out=out)
def inverse(self, raw_prediction, out=None):
return np.exp(raw_prediction, out=out)
class LogitLink(BaseLink):
"""The logit link function g(x)=logit(x)."""
interval_y_pred = Interval(0, 1, False, False)
def link(self, y_pred, out=None):
return logit(y_pred, out=out)
def inverse(self, raw_prediction, out=None):
return expit(raw_prediction, out=out)
class HalfLogitLink(BaseLink):
"""Half the logit link function g(x)=1/2 * logit(x).
Used for the exponential loss.
"""
interval_y_pred = Interval(0, 1, False, False)
def link(self, y_pred, out=None):
out = logit(y_pred, out=out)
out *= 0.5
return out
def inverse(self, raw_prediction, out=None):
return expit(2 * raw_prediction, out)
class MultinomialLogit(BaseLink):
"""The symmetric multinomial logit function.
Convention:
- y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
Notes:
- The inverse link h is the softmax function.
- The sum is over the second axis, i.e. axis=1 (n_classes).
We have to choose additional constraints in order to make
y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1)
for n_classes classes identifiable and invertible.
We choose the symmetric side constraint where the geometric mean response
is set as reference category, see [2]:
The symmetric multinomial logit link function for a single data point is
then defined as
raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred))
= log(y_pred[k]) - mean(log(y_pred)).
Note that this is equivalent to the definition in [1] and implies mean
centered raw predictions:
sum(raw_prediction[k], k=0..n_classes-1) = 0.
For linear models with raw_prediction = X @ coef, this corresponds to
sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every
feature is zero.
Reference
---------
.. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive
logistic regression: a statistical view of boosting" Ann. Statist.
28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223.
https://projecteuclid.org/euclid.aos/1016218223
.. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for
multinomial logit models with symmetric side constraints."
Computational Statistics 28 (2013): 1017-1034.
http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf
"""
is_multiclass = True
interval_y_pred = Interval(0, 1, False, False)
def symmetrize_raw_prediction(self, raw_prediction):
return raw_prediction - np.mean(raw_prediction, axis=1)[:, np.newaxis]
def link(self, y_pred, out=None):
# geometric mean as reference category
gm = gmean(y_pred, axis=1)
return np.log(y_pred / gm[:, np.newaxis], out=out)
def inverse(self, raw_prediction, out=None):
if out is None:
return softmax(raw_prediction, copy=True)
else:
np.copyto(out, raw_prediction)
softmax(out, copy=False)
return out
_LINKS = {
"identity": IdentityLink,
"log": LogLink,
"logit": LogitLink,
"half_logit": HalfLogitLink,
"multinomial_logit": MultinomialLogit,
}
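A quick NumPy check of the symmetric convention described in the `MultinomialLogit` docstring (using only the formulas quoted there, and `scipy.special.softmax` instead of the internal helper): the link subtracts the mean log-probability, so raw predictions sum to zero over classes, and a softmax recovers the probabilities.

import numpy as np
from scipy.special import softmax

y_pred = np.array([[0.2, 0.3, 0.5]])   # one sample, three classes, rows sum to 1
raw = np.log(y_pred) - np.mean(np.log(y_pred), axis=1, keepdims=True)
print(raw.sum(axis=1))                 # ~[0.], the symmetric side constraint
print(softmax(raw, axis=1))            # ~[[0.2, 0.3, 0.5]], the inverse link recovers y_pred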

File diff suppressed because it is too large.

View File

@@ -0,0 +1,23 @@
# .pyx is generated, so this is needed to make Cython compilation work
_loss_cython_tree = [
fs.copyfile('_loss.pxd')
]
_loss_pyx = custom_target(
'_loss_pyx',
output: '_loss.pyx',
input: '_loss.pyx.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
# TODO: in principle this should go in py.extension_module below. This is a
# temporary workaround for a dependency issue with .pyx.tp files. For more
# details, see https://github.com/mesonbuild/meson/issues/13212
depends: _loss_cython_tree,
)
py.extension_module(
'_loss',
cython_gen.process(_loss_pyx),
dependencies: [openmp_dep],
install: true,
subdir: 'sklearn/_loss',
)

View File

@@ -0,0 +1,111 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn._loss.link import (
_LINKS,
HalfLogitLink,
Interval,
MultinomialLogit,
_inclusive_low_high,
)
LINK_FUNCTIONS = list(_LINKS.values())
def test_interval_raises():
"""Test that interval with low > high raises ValueError."""
with pytest.raises(
ValueError, match="One must have low <= high; got low=1, high=0."
):
Interval(1, 0, False, False)
@pytest.mark.parametrize(
"interval",
[
Interval(0, 1, False, False),
Interval(0, 1, False, True),
Interval(0, 1, True, False),
Interval(0, 1, True, True),
Interval(-np.inf, np.inf, False, False),
Interval(-np.inf, np.inf, False, True),
Interval(-np.inf, np.inf, True, False),
Interval(-np.inf, np.inf, True, True),
Interval(-10, -1, False, False),
Interval(-10, -1, False, True),
Interval(-10, -1, True, False),
Interval(-10, -1, True, True),
],
)
def test_is_in_range(interval):
# make sure low and high are always within the interval, used for linspace
low, high = _inclusive_low_high(interval)
x = np.linspace(low, high, num=10)
assert interval.includes(x)
# x contains lower bound
assert interval.includes(np.r_[x, interval.low]) == interval.low_inclusive
# x contains upper bound
assert interval.includes(np.r_[x, interval.high]) == interval.high_inclusive
# x contains upper and lower bound
assert interval.includes(np.r_[x, interval.low, interval.high]) == (
interval.low_inclusive and interval.high_inclusive
)
@pytest.mark.parametrize("link", LINK_FUNCTIONS)
def test_link_inverse_identity(link, global_random_seed):
# Test that link of inverse gives identity.
rng = np.random.RandomState(global_random_seed)
link = link()
n_samples, n_classes = 100, None
# The values for `raw_prediction` are limited from -20 to 20 because in the
# class `LogitLink` the term `expit(x)` comes very close to 1 for large
# positive x and therefore loses precision.
if link.is_multiclass:
n_classes = 10
raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples, n_classes))
if isinstance(link, MultinomialLogit):
raw_prediction = link.symmetrize_raw_prediction(raw_prediction)
elif isinstance(link, HalfLogitLink):
raw_prediction = rng.uniform(low=-10, high=10, size=(n_samples))
else:
raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples))
assert_allclose(link.link(link.inverse(raw_prediction)), raw_prediction)
y_pred = link.inverse(raw_prediction)
assert_allclose(link.inverse(link.link(y_pred)), y_pred)
@pytest.mark.parametrize("link", LINK_FUNCTIONS)
def test_link_out_argument(link):
# Test that out argument gets assigned the result.
rng = np.random.RandomState(42)
link = link()
n_samples, n_classes = 100, None
if link.is_multiclass:
n_classes = 10
raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes))
if isinstance(link, MultinomialLogit):
raw_prediction = link.symmetrize_raw_prediction(raw_prediction)
else:
# So far, the valid interval of raw_prediction is (-inf, inf) and
# we do not need to distinguish.
raw_prediction = rng.uniform(low=-10, high=10, size=(n_samples))
y_pred = link.inverse(raw_prediction, out=None)
out = np.empty_like(raw_prediction)
y_pred_2 = link.inverse(raw_prediction, out=out)
assert_allclose(y_pred, out)
assert_array_equal(out, y_pred_2)
assert np.shares_memory(out, y_pred_2)
out = np.empty_like(y_pred)
raw_prediction_2 = link.link(y_pred, out=out)
assert_allclose(raw_prediction, out)
assert_array_equal(out, raw_prediction_2)
assert np.shares_memory(out, raw_prediction_2)

File diff suppressed because it is too large.

View File

@@ -0,0 +1,74 @@
"""All minimum dependencies for scikit-learn."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import argparse
from collections import defaultdict
# scipy and cython should be in sync with pyproject.toml
NUMPY_MIN_VERSION = "1.24.1"
SCIPY_MIN_VERSION = "1.10.0"
JOBLIB_MIN_VERSION = "1.3.0"
THREADPOOLCTL_MIN_VERSION = "3.2.0"
PYTEST_MIN_VERSION = "7.1.2"
CYTHON_MIN_VERSION = "3.1.2"
# 'build' and 'install' are included to have structured metadata for CI.
# It will NOT be included in setup's extras_require
# The values are (version_spec, comma separated tags)
dependent_packages = {
"numpy": (NUMPY_MIN_VERSION, "build, install"),
"scipy": (SCIPY_MIN_VERSION, "build, install"),
"joblib": (JOBLIB_MIN_VERSION, "install"),
"threadpoolctl": (THREADPOOLCTL_MIN_VERSION, "install"),
"cython": (CYTHON_MIN_VERSION, "build"),
"meson-python": ("0.17.1", "build"),
"matplotlib": ("3.6.1", "benchmark, docs, examples, tests"),
"scikit-image": ("0.22.0", "docs, examples"),
"pandas": ("1.5.0", "benchmark, docs, examples, tests"),
"seaborn": ("0.13.0", "docs, examples"),
"memory_profiler": ("0.57.0", "benchmark, docs"),
"pytest": (PYTEST_MIN_VERSION, "tests"),
"pytest-cov": ("2.9.0", "tests"),
"ruff": ("0.11.7", "tests"),
"mypy": ("1.15", "tests"),
"pyamg": ("5.0.0", "tests"),
"polars": ("0.20.30", "docs, tests"),
"pyarrow": ("12.0.0", "tests"),
"sphinx": ("7.3.7", "docs"),
"sphinx-copybutton": ("0.5.2", "docs"),
"sphinx-gallery": ("0.17.1", "docs"),
"numpydoc": ("1.2.0", "docs, tests"),
"Pillow": ("10.1.0", "docs"),
"pooch": ("1.8.0", "docs, examples, tests"),
"sphinx-prompt": ("1.4.0", "docs"),
"sphinxext-opengraph": ("0.9.1", "docs"),
"plotly": ("5.18.0", "docs, examples"),
"sphinxcontrib-sass": ("0.3.4", "docs"),
"sphinx-remove-toctrees": ("1.0.0.post1", "docs"),
"sphinx-design": ("0.6.0", "docs"),
"pydata-sphinx-theme": ("0.15.3", "docs"),
"towncrier": ("24.8.0", "docs"),
# XXX: Pin conda-lock to the latest released version (needs manual update
# from time to time)
"conda-lock": ("3.0.1", "maintenance"),
}
# create inverse mapping for setuptools
tag_to_packages: dict = defaultdict(list)
for package, (min_version, extras) in dependent_packages.items():
for extra in extras.split(", "):
tag_to_packages[extra].append("{}>={}".format(package, min_version))
# Used by CI to get the min dependencies
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Get min dependencies for a package")
parser.add_argument("package", choices=dependent_packages)
args = parser.parse_args()
min_version = dependent_packages[args.package][0]
print(min_version)
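As used by CI, running this file as a script prints the pinned minimum for one package (e.g. `cython` prints `3.1.2`); the inverse mapping groups the same table by tag:

>>> dependent_packages["pandas"]
('1.5.0', 'benchmark, docs, examples, tests')
>>> tag_to_packages["build"]
['numpy>=1.24.1', 'scipy>=1.10.0', 'cython>=3.1.2', 'meson-python>=0.17.1']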

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@@ -0,0 +1,64 @@
"""Popular unsupervised clustering algorithms."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from sklearn.cluster._affinity_propagation import (
AffinityPropagation,
affinity_propagation,
)
from sklearn.cluster._agglomerative import (
AgglomerativeClustering,
FeatureAgglomeration,
linkage_tree,
ward_tree,
)
from sklearn.cluster._bicluster import SpectralBiclustering, SpectralCoclustering
from sklearn.cluster._birch import Birch
from sklearn.cluster._bisect_k_means import BisectingKMeans
from sklearn.cluster._dbscan import DBSCAN, dbscan
from sklearn.cluster._hdbscan.hdbscan import HDBSCAN
from sklearn.cluster._kmeans import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus
from sklearn.cluster._mean_shift import (
MeanShift,
estimate_bandwidth,
get_bin_seeds,
mean_shift,
)
from sklearn.cluster._optics import (
OPTICS,
cluster_optics_dbscan,
cluster_optics_xi,
compute_optics_graph,
)
from sklearn.cluster._spectral import SpectralClustering, spectral_clustering
__all__ = [
"DBSCAN",
"HDBSCAN",
"OPTICS",
"AffinityPropagation",
"AgglomerativeClustering",
"Birch",
"BisectingKMeans",
"FeatureAgglomeration",
"KMeans",
"MeanShift",
"MiniBatchKMeans",
"SpectralBiclustering",
"SpectralClustering",
"SpectralCoclustering",
"affinity_propagation",
"cluster_optics_dbscan",
"cluster_optics_xi",
"compute_optics_graph",
"dbscan",
"estimate_bandwidth",
"get_bin_seeds",
"k_means",
"kmeans_plusplus",
"linkage_tree",
"mean_shift",
"spectral_clustering",
"ward_tree",
]

View File

@@ -0,0 +1,607 @@
"""Affinity Propagation clustering algorithm."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from numbers import Integral, Real
import numpy as np
from sklearn._config import config_context
from sklearn.base import BaseEstimator, ClusterMixin, _fit_context
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import euclidean_distances, pairwise_distances_argmin
from sklearn.utils import check_random_state
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
from sklearn.utils.validation import check_is_fitted, validate_data
def _equal_similarities_and_preferences(S, preference):
def all_equal_preferences():
return np.all(preference == preference.flat[0])
def all_equal_similarities():
# Create mask to ignore diagonal of S
mask = np.ones(S.shape, dtype=bool)
np.fill_diagonal(mask, 0)
return np.all(S[mask].flat == S[mask].flat[0])
return all_equal_preferences() and all_equal_similarities()
def _affinity_propagation(
S,
*,
preference,
convergence_iter,
max_iter,
damping,
verbose,
return_n_iter,
random_state,
):
"""Main affinity propagation algorithm."""
n_samples = S.shape[0]
if n_samples == 1 or _equal_similarities_and_preferences(S, preference):
# It makes no sense to run the algorithm in this case, so return 1 or
# n_samples clusters, depending on preferences
warnings.warn(
"All samples have mutually equal similarities. "
"Returning arbitrary cluster center(s)."
)
if preference.flat[0] > S.flat[n_samples - 1]:
return (
(np.arange(n_samples), np.arange(n_samples), 0)
if return_n_iter
else (np.arange(n_samples), np.arange(n_samples))
)
else:
return (
(np.array([0]), np.array([0] * n_samples), 0)
if return_n_iter
else (np.array([0]), np.array([0] * n_samples))
)
# Place preference on the diagonal of S
S.flat[:: (n_samples + 1)] = preference
A = np.zeros((n_samples, n_samples))
R = np.zeros((n_samples, n_samples)) # Initialize messages
# Intermediate results
tmp = np.zeros((n_samples, n_samples))
# Remove degeneracies
S += (
np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100
) * random_state.standard_normal(size=(n_samples, n_samples))
# Execute parallel affinity propagation updates
e = np.zeros((n_samples, convergence_iter))
ind = np.arange(n_samples)
for it in range(max_iter):
# tmp = A + S; compute responsibilities
np.add(A, S, tmp)
I = np.argmax(tmp, axis=1)
Y = tmp[ind, I] # np.max(A + S, axis=1)
tmp[ind, I] = -np.inf
Y2 = np.max(tmp, axis=1)
# tmp = Rnew
np.subtract(S, Y[:, None], tmp)
tmp[ind, I] = S[ind, I] - Y2
# Damping
tmp *= 1 - damping
R *= damping
R += tmp
# tmp = Rp; compute availabilities
np.maximum(R, 0, out=tmp)
tmp.flat[:: n_samples + 1] = R.flat[:: n_samples + 1]
# tmp = -Anew
tmp -= np.sum(tmp, axis=0)
dA = np.diag(tmp).copy()
tmp.clip(0, np.inf, tmp)
tmp.flat[:: n_samples + 1] = dA
# Damping
tmp *= 1 - damping
A *= damping
A -= tmp
# Check for convergence
E = (np.diag(A) + np.diag(R)) > 0
e[:, it % convergence_iter] = E
K = np.sum(E, axis=0)
if it >= convergence_iter:
se = np.sum(e, axis=1)
unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples
if (not unconverged and (K > 0)) or (it == max_iter):
never_converged = False
if verbose:
print("Converged after %d iterations." % it)
break
else:
never_converged = True
if verbose:
print("Did not converge")
I = np.flatnonzero(E)
K = I.size # Identify exemplars
if K > 0:
if never_converged:
warnings.warn(
(
"Affinity propagation did not converge, this model "
"may return degenerate cluster centers and labels."
),
ConvergenceWarning,
)
c = np.argmax(S[:, I], axis=1)
c[I] = np.arange(K) # Identify clusters
# Refine the final set of exemplars and clusters and return results
for k in range(K):
ii = np.asarray(c == k).nonzero()[0]
j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
I[k] = ii[j]
c = np.argmax(S[:, I], axis=1)
c[I] = np.arange(K)
labels = I[c]
# Reduce labels to a sorted, gapless list
cluster_centers_indices = np.unique(labels)
labels = np.searchsorted(cluster_centers_indices, labels)
else:
warnings.warn(
(
"Affinity propagation did not converge and this model "
"will not have any cluster centers."
),
ConvergenceWarning,
)
labels = np.array([-1] * n_samples)
cluster_centers_indices = []
if return_n_iter:
return cluster_centers_indices, labels, it + 1
else:
return cluster_centers_indices, labels
###############################################################################
# Public API
@validate_params(
{
"S": ["array-like"],
"return_n_iter": ["boolean"],
},
prefer_skip_nested_validation=False,
)
def affinity_propagation(
S,
*,
preference=None,
convergence_iter=15,
max_iter=200,
damping=0.5,
copy=True,
verbose=False,
return_n_iter=False,
random_state=None,
):
"""Perform Affinity Propagation Clustering of data.
Read more in the :ref:`User Guide <affinity_propagation>`.
Parameters
----------
S : array-like of shape (n_samples, n_samples)
Matrix of similarities between points.
preference : array-like of shape (n_samples,) or float, default=None
Preferences for each point - points with larger values of
preferences are more likely to be chosen as exemplars. The number of
exemplars, i.e. of clusters, is influenced by the input preferences
value. If the preferences are not passed as arguments, they will be
set to the median of the input similarities (resulting in a moderate
number of clusters). For a smaller amount of clusters, this can be set
to the minimum value of the similarities.
convergence_iter : int, default=15
Number of iterations with no change in the number
of estimated clusters that stops the convergence.
max_iter : int, default=200
Maximum number of iterations.
damping : float, default=0.5
Damping factor between 0.5 and 1.
copy : bool, default=True
If copy is False, the affinity matrix is modified inplace by the
algorithm, for memory efficiency.
verbose : bool, default=False
The verbosity level.
return_n_iter : bool, default=False
Whether or not to return the number of iterations.
random_state : int, RandomState instance or None, default=None
Pseudo-random number generator to control the starting state.
Use an int for reproducible results across function calls.
See the :term:`Glossary <random_state>`.
.. versionadded:: 0.23
this parameter was previously hardcoded as 0.
Returns
-------
cluster_centers_indices : ndarray of shape (n_clusters,)
Index of clusters centers.
labels : ndarray of shape (n_samples,)
Cluster labels for each point.
n_iter : int
Number of iterations run. Returned only if `return_n_iter` is
set to True.
Notes
-----
For an example usage,
see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`.
You may also check out
:ref:`sphx_glr_auto_examples_applications_plot_stock_market.py`.
When the algorithm does not converge, it will still return an array of
``cluster_center_indices`` and labels if there are any exemplars/clusters,
however they may be degenerate and should be used with caution.
When all training samples have equal similarities and equal preferences,
the assignment of cluster centers and labels depends on the preference.
If the preference is smaller than the similarities, a single cluster center
and label ``0`` for every sample will be returned. Otherwise, every
training sample becomes its own cluster center and is assigned a unique
label.
References
----------
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007
Examples
--------
>>> import numpy as np
>>> from sklearn.cluster import affinity_propagation
>>> from sklearn.metrics.pairwise import euclidean_distances
>>> X = np.array([[1, 2], [1, 4], [1, 0],
... [4, 2], [4, 4], [4, 0]])
>>> S = -euclidean_distances(X, squared=True)
>>> cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
>>> cluster_centers_indices
array([0, 3])
>>> labels
array([0, 0, 0, 1, 1, 1])
"""
estimator = AffinityPropagation(
damping=damping,
max_iter=max_iter,
convergence_iter=convergence_iter,
copy=copy,
preference=preference,
affinity="precomputed",
verbose=verbose,
random_state=random_state,
).fit(S)
if return_n_iter:
return estimator.cluster_centers_indices_, estimator.labels_, estimator.n_iter_
return estimator.cluster_centers_indices_, estimator.labels_
class AffinityPropagation(ClusterMixin, BaseEstimator):
"""Perform Affinity Propagation Clustering of data.
Read more in the :ref:`User Guide <affinity_propagation>`.
Parameters
----------
damping : float, default=0.5
Damping factor in the range `[0.5, 1.0)` is the extent to
which the current value is maintained relative to
incoming values (weighted 1 - damping). This is in order
to avoid numerical oscillations when updating these
values (messages).
max_iter : int, default=200
Maximum number of iterations.
convergence_iter : int, default=15
Number of iterations with no change in the number
of estimated clusters that stops the convergence.
copy : bool, default=True
Make a copy of input data.
preference : array-like of shape (n_samples,) or float, default=None
Preferences for each point - points with larger values of
preferences are more likely to be chosen as exemplars. The number
of exemplars, i.e. of clusters, is influenced by the input
preferences value. If the preferences are not passed as arguments,
they will be set to the median of the input similarities.
affinity : {'euclidean', 'precomputed'}, default='euclidean'
Which affinity to use. At the moment 'precomputed' and
``euclidean`` are supported. 'euclidean' uses the
negative squared euclidean distance between points.
verbose : bool, default=False
Whether to be verbose.
random_state : int, RandomState instance or None, default=None
Pseudo-random number generator to control the starting state.
Use an int for reproducible results across function calls.
See the :term:`Glossary <random_state>`.
.. versionadded:: 0.23
this parameter was previously hardcoded as 0.
Attributes
----------
cluster_centers_indices_ : ndarray of shape (n_clusters,)
Indices of cluster centers.
cluster_centers_ : ndarray of shape (n_clusters, n_features)
Cluster centers (if affinity != ``precomputed``).
labels_ : ndarray of shape (n_samples,)
Labels of each point.
affinity_matrix_ : ndarray of shape (n_samples, n_samples)
Stores the affinity matrix used in ``fit``.
n_iter_ : int
Number of iterations taken to converge.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
AgglomerativeClustering : Recursively merges the pair of
clusters that minimally increases a given linkage distance.
FeatureAgglomeration : Similar to AgglomerativeClustering,
but recursively merges features instead of samples.
KMeans : K-Means clustering.
MiniBatchKMeans : Mini-Batch K-Means clustering.
MeanShift : Mean shift clustering using a flat kernel.
SpectralClustering : Apply clustering to a projection
of the normalized Laplacian.
Notes
-----
The algorithmic complexity of affinity propagation is quadratic
in the number of points.
When the algorithm does not converge, it will still return an array of
``cluster_center_indices`` and labels if there are any exemplars/clusters,
however they may be degenerate and should be used with caution.
When ``fit`` does not converge, ``cluster_centers_`` is still populated
however it may be degenerate. In such a case, proceed with caution.
If ``fit`` does not converge and fails to produce any ``cluster_centers_``
then ``predict`` will label every sample as ``-1``.
When all training samples have equal similarities and equal preferences,
the assignment of cluster centers and labels depends on the preference.
If the preference is smaller than the similarities, ``fit`` will result in
a single cluster center and label ``0`` for every sample. Otherwise, every
training sample becomes its own cluster center and is assigned a unique
label.
References
----------
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007
Examples
--------
>>> from sklearn.cluster import AffinityPropagation
>>> import numpy as np
>>> X = np.array([[1, 2], [1, 4], [1, 0],
... [4, 2], [4, 4], [4, 0]])
>>> clustering = AffinityPropagation(random_state=5).fit(X)
>>> clustering
AffinityPropagation(random_state=5)
>>> clustering.labels_
array([0, 0, 0, 1, 1, 1])
>>> clustering.predict([[0, 0], [4, 4]])
array([0, 1])
>>> clustering.cluster_centers_
array([[1, 2],
[4, 2]])
For an example usage,
see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`.
For a comparison of Affinity Propagation with other clustering algorithms, see
:ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
"""
_parameter_constraints: dict = {
"damping": [Interval(Real, 0.5, 1.0, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"convergence_iter": [Interval(Integral, 1, None, closed="left")],
"copy": ["boolean"],
"preference": [
"array-like",
Interval(Real, None, None, closed="neither"),
None,
],
"affinity": [StrOptions({"euclidean", "precomputed"})],
"verbose": ["verbose"],
"random_state": ["random_state"],
}
def __init__(
self,
*,
damping=0.5,
max_iter=200,
convergence_iter=15,
copy=True,
preference=None,
affinity="euclidean",
verbose=False,
random_state=None,
):
self.damping = damping
self.max_iter = max_iter
self.convergence_iter = convergence_iter
self.copy = copy
self.verbose = verbose
self.preference = preference
self.affinity = affinity
self.random_state = random_state
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.pairwise = self.affinity == "precomputed"
tags.input_tags.sparse = self.affinity != "precomputed"
return tags
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Fit the clustering from features, or affinity matrix.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
array-like of shape (n_samples, n_samples)
Training instances to cluster, or similarities / affinities between
instances if ``affinity='precomputed'``. If a sparse feature matrix
is provided, it will be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
Returns the instance itself.
"""
if self.affinity == "precomputed":
X = validate_data(self, X, copy=self.copy, force_writeable=True)
self.affinity_matrix_ = X
else: # self.affinity == "euclidean"
X = validate_data(self, X, accept_sparse="csr")
self.affinity_matrix_ = -euclidean_distances(X, squared=True)
if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]:
raise ValueError(
"The matrix of similarities must be a square array. "
f"Got {self.affinity_matrix_.shape} instead."
)
if self.preference is None:
preference = np.median(self.affinity_matrix_)
else:
preference = self.preference
preference = np.asarray(preference)
random_state = check_random_state(self.random_state)
(
self.cluster_centers_indices_,
self.labels_,
self.n_iter_,
) = _affinity_propagation(
self.affinity_matrix_,
max_iter=self.max_iter,
convergence_iter=self.convergence_iter,
preference=preference,
damping=self.damping,
verbose=self.verbose,
return_n_iter=True,
random_state=random_state,
)
if self.affinity != "precomputed":
self.cluster_centers_ = X[self.cluster_centers_indices_].copy()
return self
def predict(self, X):
"""Predict the closest cluster each sample in X belongs to.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
New data to predict. If a sparse matrix is provided, it will be
converted into a sparse ``csr_matrix``.
Returns
-------
labels : ndarray of shape (n_samples,)
Cluster labels.
"""
check_is_fitted(self)
X = validate_data(self, X, reset=False, accept_sparse="csr")
if not hasattr(self, "cluster_centers_"):
raise ValueError(
"Predict method is not supported when affinity='precomputed'."
)
if self.cluster_centers_.shape[0] > 0:
with config_context(assume_finite=True):
return pairwise_distances_argmin(X, self.cluster_centers_)
else:
warnings.warn(
(
"This model does not have any cluster centers "
"because affinity propagation did not converge. "
"Labeling every sample as '-1'."
),
ConvergenceWarning,
)
return np.array([-1] * X.shape[0])
def fit_predict(self, X, y=None):
"""Fit clustering from features/affinity matrix; return cluster labels.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
array-like of shape (n_samples, n_samples)
Training instances to cluster, or similarities / affinities between
instances if ``affinity='precomputed'``. If a sparse feature matrix
is provided, it will be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
labels : ndarray of shape (n_samples,)
Cluster labels.
"""
return super().fit_predict(X, y)
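# ---------------------------------------------------------------------------
# Illustrative usage sketch: fitting AffinityPropagation on a precomputed
# affinity matrix. This is a minimal, hypothetical example (the toy data and
# random_state are arbitrary choices); it only runs when this file is
# executed directly. With affinity='precomputed', ``predict`` is unavailable,
# so cluster assignments are read from ``labels_``.
if __name__ == "__main__":
    import numpy as np

    from sklearn.cluster import AffinityPropagation
    from sklearn.metrics.pairwise import euclidean_distances

    X_demo = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
    # Affinity propagation expects similarities; the negative squared
    # Euclidean distance mirrors what affinity='euclidean' computes.
    S = -euclidean_distances(X_demo, squared=True)
    ap = AffinityPropagation(affinity="precomputed", random_state=5).fit(S)
    print(ap.labels_)                    # cluster label of each sample
    print(ap.cluster_centers_indices_)   # indices of the chosen exemplars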

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,622 @@
"""Spectral biclustering algorithms."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from abc import ABCMeta, abstractmethod
from numbers import Integral
import numpy as np
from scipy.linalg import norm
from scipy.sparse import dia_matrix, issparse
from scipy.sparse.linalg import eigsh, svds
from sklearn.base import BaseEstimator, BiclusterMixin, _fit_context
from sklearn.cluster._kmeans import KMeans, MiniBatchKMeans
from sklearn.utils import check_random_state, check_scalar
from sklearn.utils._param_validation import Interval, StrOptions
from sklearn.utils.extmath import _randomized_svd, make_nonnegative, safe_sparse_dot
from sklearn.utils.validation import assert_all_finite, validate_data
__all__ = ["SpectralBiclustering", "SpectralCoclustering"]
def _scale_normalize(X):
"""Normalize ``X`` by scaling rows and columns independently.
Returns the normalized matrix and the row and column scaling
factors.
"""
X = make_nonnegative(X)
row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze()
row_diag = np.where(np.isnan(row_diag), 0, row_diag)
col_diag = np.where(np.isnan(col_diag), 0, col_diag)
if issparse(X):
n_rows, n_cols = X.shape
r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
an = r @ X @ c
else:
an = row_diag[:, np.newaxis] * X * col_diag
return an, row_diag, col_diag
def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
"""Normalize rows and columns of ``X`` simultaneously so that all
rows sum to one constant and all columns sum to a different
constant.
"""
# According to paper, this can also be done more efficiently with
# deviation reduction and balancing algorithms.
X = make_nonnegative(X)
X_scaled = X
for _ in range(max_iter):
X_new, _, _ = _scale_normalize(X_scaled)
if issparse(X):
dist = norm(X_scaled.data - X.data)
else:
dist = norm(X_scaled - X_new)
X_scaled = X_new
if dist is not None and dist < tol:
break
return X_scaled
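# ---------------------------------------------------------------------------
# Illustrative sketch of the property targeted by the normalization above: a
# plain-numpy restatement (an assumption for demonstration, not the helper
# itself) of the alternating scaling, after which every row sums to one
# constant and every column to another. Runs only when executed directly.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(0)
    A = rng.random((4, 6)) + 0.1  # strictly positive toy matrix
    for _ in range(500):
        r = 1.0 / np.sqrt(A.sum(axis=1, keepdims=True))  # row scaling factors
        c = 1.0 / np.sqrt(A.sum(axis=0, keepdims=True))  # column scaling factors
        A = r * A * c
    print(np.round(A.sum(axis=1), 6))  # row sums are (approximately) all equal
    print(np.round(A.sum(axis=0), 6))  # column sums are (approximately) all equal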
def _log_normalize(X):
"""Normalize ``X`` according to Kluger's log-interactions scheme."""
X = make_nonnegative(X, min_value=1)
if issparse(X):
raise ValueError(
"Cannot compute log of a sparse matrix,"
" because log(x) diverges to -infinity as x"
" goes to 0."
)
L = np.log(X)
row_avg = L.mean(axis=1)[:, np.newaxis]
col_avg = L.mean(axis=0)
avg = L.mean()
return L - row_avg - col_avg + avg
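# ---------------------------------------------------------------------------
# Illustrative numpy check (a standalone sketch, not used by the module) of
# the double-centering performed by the log-interactions scheme above: after
# subtracting row and column averages and adding back the overall average,
# every row mean and column mean of the result is (approximately) zero.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(0)
    X_toy = rng.random((3, 5)) + 1.0  # positive toy data so the log is defined
    L = np.log(X_toy)
    K = L - L.mean(axis=1, keepdims=True) - L.mean(axis=0) + L.mean()
    print(np.round(K.mean(axis=1), 12))  # row means ~ 0
    print(np.round(K.mean(axis=0), 12))  # column means ~ 0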
class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):
"""Base class for spectral biclustering."""
_parameter_constraints: dict = {
"svd_method": [StrOptions({"randomized", "arpack"})],
"n_svd_vecs": [Interval(Integral, 0, None, closed="left"), None],
"mini_batch": ["boolean"],
"init": [StrOptions({"k-means++", "random"}), np.ndarray],
"n_init": [Interval(Integral, 1, None, closed="left")],
"random_state": ["random_state"],
}
@abstractmethod
def __init__(
self,
n_clusters=3,
svd_method="randomized",
n_svd_vecs=None,
mini_batch=False,
init="k-means++",
n_init=10,
random_state=None,
):
self.n_clusters = n_clusters
self.svd_method = svd_method
self.n_svd_vecs = n_svd_vecs
self.mini_batch = mini_batch
self.init = init
self.n_init = n_init
self.random_state = random_state
@abstractmethod
def _check_parameters(self, n_samples):
"""Validate parameters depending on the input data."""
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Create a biclustering for X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
SpectralBiclustering instance.
"""
X = validate_data(self, X, accept_sparse="csr", dtype=np.float64)
self._check_parameters(X.shape[0])
self._fit(X)
return self
def _svd(self, array, n_components, n_discard):
"""Returns first `n_components` left and right singular
vectors u and v, discarding the first `n_discard`.
"""
if self.svd_method == "randomized":
kwargs = {}
if self.n_svd_vecs is not None:
kwargs["n_oversamples"] = self.n_svd_vecs
u, _, vt = _randomized_svd(
array, n_components, random_state=self.random_state, **kwargs
)
elif self.svd_method == "arpack":
u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
if np.any(np.isnan(vt)):
# some eigenvalues of A * A.T are negative, causing
# sqrt() to be np.nan. This causes some vectors in vt
# to be np.nan.
A = safe_sparse_dot(array.T, array)
random_state = check_random_state(self.random_state)
# initialize with [-1,1] as in ARPACK
v0 = random_state.uniform(-1, 1, A.shape[0])
_, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
vt = v.T
if np.any(np.isnan(u)):
A = safe_sparse_dot(array, array.T)
random_state = check_random_state(self.random_state)
# initialize with [-1,1] as in ARPACK
v0 = random_state.uniform(-1, 1, A.shape[0])
_, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
assert_all_finite(u)
assert_all_finite(vt)
u = u[:, n_discard:]
vt = vt[n_discard:]
return u, vt.T
def _k_means(self, data, n_clusters):
if self.mini_batch:
model = MiniBatchKMeans(
n_clusters,
init=self.init,
n_init=self.n_init,
random_state=self.random_state,
)
else:
model = KMeans(
n_clusters,
init=self.init,
n_init=self.n_init,
random_state=self.random_state,
)
model.fit(data)
centroid = model.cluster_centers_
labels = model.labels_
return centroid, labels
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.sparse = True
return tags
class SpectralCoclustering(BaseSpectral):
"""Spectral Co-Clustering algorithm (Dhillon, 2001) [1]_.
Clusters rows and columns of an array `X` to solve the relaxed
normalized cut of the bipartite graph created from `X` as follows:
the edge between row vertex `i` and column vertex `j` has weight
`X[i, j]`.
The resulting bicluster structure is block-diagonal, since each
row and each column belongs to exactly one bicluster.
Supports sparse matrices, as long as they are nonnegative.
Read more in the :ref:`User Guide <spectral_coclustering>`.
Parameters
----------
n_clusters : int, default=3
The number of biclusters to find.
svd_method : {'randomized', 'arpack'}, default='randomized'
Selects the algorithm for finding singular vectors. May be
'randomized' or 'arpack'. If 'randomized', use
:func:`sklearn.utils.extmath.randomized_svd`, which may be faster
for large matrices. If 'arpack', use
:func:`scipy.sparse.linalg.svds`, which is more accurate, but
possibly slower in some cases.
n_svd_vecs : int, default=None
Number of vectors to use in calculating the SVD. Corresponds
        to `ncv` when `svd_method='arpack'` and `n_oversamples` when
        `svd_method='randomized'`.
mini_batch : bool, default=False
Whether to use mini-batch k-means, which is faster but may get
different results.
init : {'k-means++', 'random'}, or ndarray of shape \
(n_clusters, n_features), default='k-means++'
Method for initialization of k-means algorithm; defaults to
'k-means++'.
n_init : int, default=10
Number of random initializations that are tried with the
k-means algorithm.
If mini-batch k-means is used, the best initialization is
chosen and the algorithm runs once. Otherwise, the algorithm
is run for each initialization and the best solution chosen.
random_state : int, RandomState instance, default=None
Used for randomizing the singular value decomposition and the k-means
initialization. Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
Attributes
----------
rows_ : array-like of shape (n_row_clusters, n_rows)
Results of the clustering. `rows[i, r]` is True if
cluster `i` contains row `r`. Available only after calling ``fit``.
columns_ : array-like of shape (n_column_clusters, n_columns)
Results of the clustering, like `rows`.
row_labels_ : array-like of shape (n_rows,)
The bicluster label of each row.
column_labels_ : array-like of shape (n_cols,)
The bicluster label of each column.
biclusters_ : tuple of two ndarrays
The tuple contains the `rows_` and `columns_` arrays.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
SpectralBiclustering : Partitions rows and columns under the assumption
that the data has an underlying checkerboard structure.
References
----------
.. [1] :doi:`Dhillon, Inderjit S, 2001. Co-clustering documents and words using
bipartite spectral graph partitioning.
<10.1145/502512.502550>`
Examples
--------
>>> from sklearn.cluster import SpectralCoclustering
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)
>>> clustering.row_labels_ #doctest: +SKIP
array([0, 1, 1, 0, 0, 0], dtype=int32)
>>> clustering.column_labels_ #doctest: +SKIP
array([0, 0], dtype=int32)
>>> clustering
SpectralCoclustering(n_clusters=2, random_state=0)
For a more detailed example, see the following:
:ref:`sphx_glr_auto_examples_bicluster_plot_spectral_coclustering.py`.
"""
_parameter_constraints: dict = {
**BaseSpectral._parameter_constraints,
"n_clusters": [Interval(Integral, 1, None, closed="left")],
}
def __init__(
self,
n_clusters=3,
*,
svd_method="randomized",
n_svd_vecs=None,
mini_batch=False,
init="k-means++",
n_init=10,
random_state=None,
):
super().__init__(
n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
)
def _check_parameters(self, n_samples):
if self.n_clusters > n_samples:
raise ValueError(
f"n_clusters should be <= n_samples={n_samples}. Got"
f" {self.n_clusters} instead."
)
def _fit(self, X):
normalized_data, row_diag, col_diag = _scale_normalize(X)
n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
u, v = self._svd(normalized_data, n_sv, n_discard=1)
z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v))
_, labels = self._k_means(z, self.n_clusters)
n_rows = X.shape[0]
self.row_labels_ = labels[:n_rows]
self.column_labels_ = labels[n_rows:]
self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)])
self.columns_ = np.vstack(
[self.column_labels_ == c for c in range(self.n_clusters)]
)
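# ---------------------------------------------------------------------------
# Illustrative usage sketch: recovering planted block-diagonal biclusters
# with SpectralCoclustering. The shape, noise level and random_state are
# arbitrary assumptions for demonstration; runs only when executed directly.
if __name__ == "__main__":
    from sklearn.cluster import SpectralCoclustering
    from sklearn.datasets import make_biclusters
    from sklearn.metrics import consensus_score

    data, rows, columns = make_biclusters(
        shape=(30, 20), n_clusters=3, noise=0.5, random_state=0
    )
    model = SpectralCoclustering(n_clusters=3, random_state=0).fit(data)
    # A consensus score of 1.0 means the planted biclusters were recovered
    # exactly; noisy data may yield a slightly lower score.
    print(consensus_score(model.biclusters_, (rows, columns)))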
class SpectralBiclustering(BaseSpectral):
"""Spectral biclustering (Kluger, 2003) [1]_.
Partitions rows and columns under the assumption that the data has
an underlying checkerboard structure. For instance, if there are
two row partitions and three column partitions, each row will
belong to three biclusters, and each column will belong to two
biclusters. The outer product of the corresponding row and column
label vectors gives this checkerboard structure.
Read more in the :ref:`User Guide <spectral_biclustering>`.
Parameters
----------
n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3
The number of row and column clusters in the checkerboard
structure.
method : {'bistochastic', 'scale', 'log'}, default='bistochastic'
Method of normalizing and converting singular vectors into
biclusters. May be one of 'scale', 'bistochastic', or 'log'.
The authors recommend using 'log'. If the data is sparse,
however, log normalization will not work, which is why the
default is 'bistochastic'.
.. warning::
if `method='log'`, the data must not be sparse.
n_components : int, default=6
Number of singular vectors to check.
n_best : int, default=3
Number of best singular vectors to which to project the data
for clustering.
svd_method : {'randomized', 'arpack'}, default='randomized'
Selects the algorithm for finding singular vectors. May be
'randomized' or 'arpack'. If 'randomized', uses
:func:`~sklearn.utils.extmath.randomized_svd`, which may be faster
for large matrices. If 'arpack', uses
`scipy.sparse.linalg.svds`, which is more accurate, but
possibly slower in some cases.
n_svd_vecs : int, default=None
Number of vectors to use in calculating the SVD. Corresponds
        to `ncv` when `svd_method='arpack'` and `n_oversamples` when
        `svd_method='randomized'`.
mini_batch : bool, default=False
Whether to use mini-batch k-means, which is faster but may get
different results.
init : {'k-means++', 'random'} or ndarray of shape (n_clusters, n_features), \
default='k-means++'
Method for initialization of k-means algorithm; defaults to
'k-means++'.
n_init : int, default=10
Number of random initializations that are tried with the
k-means algorithm.
If mini-batch k-means is used, the best initialization is
chosen and the algorithm runs once. Otherwise, the algorithm
is run for each initialization and the best solution chosen.
random_state : int, RandomState instance, default=None
Used for randomizing the singular value decomposition and the k-means
initialization. Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
Attributes
----------
rows_ : array-like of shape (n_row_clusters, n_rows)
Results of the clustering. `rows[i, r]` is True if
cluster `i` contains row `r`. Available only after calling ``fit``.
columns_ : array-like of shape (n_column_clusters, n_columns)
Results of the clustering, like `rows`.
row_labels_ : array-like of shape (n_rows,)
Row partition labels.
column_labels_ : array-like of shape (n_cols,)
Column partition labels.
biclusters_ : tuple of two ndarrays
The tuple contains the `rows_` and `columns_` arrays.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
SpectralCoclustering : Clusters rows and columns of an array `X` to solve the
relaxed normalized cut of the bipartite graph created from `X`.
References
----------
.. [1] :doi:`Kluger, Yuval, et. al., 2003. Spectral biclustering of microarray
data: coclustering genes and conditions.
<10.1101/gr.648603>`
Examples
--------
>>> from sklearn.cluster import SpectralBiclustering
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)
>>> clustering.row_labels_
array([1, 1, 1, 0, 0, 0], dtype=int32)
>>> clustering.column_labels_
array([1, 0], dtype=int32)
>>> clustering
SpectralBiclustering(n_clusters=2, random_state=0)
For a more detailed example, see
:ref:`sphx_glr_auto_examples_bicluster_plot_spectral_biclustering.py`
"""
_parameter_constraints: dict = {
**BaseSpectral._parameter_constraints,
"n_clusters": [Interval(Integral, 1, None, closed="left"), tuple],
"method": [StrOptions({"bistochastic", "scale", "log"})],
"n_components": [Interval(Integral, 1, None, closed="left")],
"n_best": [Interval(Integral, 1, None, closed="left")],
}
def __init__(
self,
n_clusters=3,
*,
method="bistochastic",
n_components=6,
n_best=3,
svd_method="randomized",
n_svd_vecs=None,
mini_batch=False,
init="k-means++",
n_init=10,
random_state=None,
):
super().__init__(
n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
)
self.method = method
self.n_components = n_components
self.n_best = n_best
def _check_parameters(self, n_samples):
if isinstance(self.n_clusters, Integral):
if self.n_clusters > n_samples:
raise ValueError(
f"n_clusters should be <= n_samples={n_samples}. Got"
f" {self.n_clusters} instead."
)
else: # tuple
try:
n_row_clusters, n_column_clusters = self.n_clusters
check_scalar(
n_row_clusters,
"n_row_clusters",
target_type=Integral,
min_val=1,
max_val=n_samples,
)
check_scalar(
n_column_clusters,
"n_column_clusters",
target_type=Integral,
min_val=1,
max_val=n_samples,
)
except (ValueError, TypeError) as e:
raise ValueError(
"Incorrect parameter n_clusters has value:"
f" {self.n_clusters}. It should either be a single integer"
" or an iterable with two integers:"
" (n_row_clusters, n_column_clusters)"
" And the values are should be in the"
" range: (1, n_samples)"
) from e
if self.n_best > self.n_components:
raise ValueError(
f"n_best={self.n_best} must be <= n_components={self.n_components}."
)
def _fit(self, X):
n_sv = self.n_components
if self.method == "bistochastic":
normalized_data = _bistochastic_normalize(X)
n_sv += 1
elif self.method == "scale":
normalized_data, _, _ = _scale_normalize(X)
n_sv += 1
elif self.method == "log":
normalized_data = _log_normalize(X)
n_discard = 0 if self.method == "log" else 1
u, v = self._svd(normalized_data, n_sv, n_discard)
ut = u.T
vt = v.T
try:
n_row_clusters, n_col_clusters = self.n_clusters
except TypeError:
n_row_clusters = n_col_clusters = self.n_clusters
best_ut = self._fit_best_piecewise(ut, self.n_best, n_row_clusters)
best_vt = self._fit_best_piecewise(vt, self.n_best, n_col_clusters)
self.row_labels_ = self._project_and_cluster(X, best_vt.T, n_row_clusters)
self.column_labels_ = self._project_and_cluster(X.T, best_ut.T, n_col_clusters)
self.rows_ = np.vstack(
[
self.row_labels_ == label
for label in range(n_row_clusters)
for _ in range(n_col_clusters)
]
)
self.columns_ = np.vstack(
[
self.column_labels_ == label
for _ in range(n_row_clusters)
for label in range(n_col_clusters)
]
)
def _fit_best_piecewise(self, vectors, n_best, n_clusters):
"""Find the ``n_best`` vectors that are best approximated by piecewise
constant vectors.
The piecewise vectors are found by k-means; the best is chosen
according to Euclidean distance.
"""
def make_piecewise(v):
centroid, labels = self._k_means(v.reshape(-1, 1), n_clusters)
return centroid[labels].ravel()
piecewise_vectors = np.apply_along_axis(make_piecewise, axis=1, arr=vectors)
dists = np.apply_along_axis(norm, axis=1, arr=(vectors - piecewise_vectors))
result = vectors[np.argsort(dists)[:n_best]]
return result
def _project_and_cluster(self, data, vectors, n_clusters):
"""Project ``data`` to ``vectors`` and cluster the result."""
projected = safe_sparse_dot(data, vectors)
_, labels = self._k_means(projected, n_clusters)
return labels
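# ---------------------------------------------------------------------------
# Illustrative usage sketch: fitting SpectralBiclustering on data with a
# planted checkerboard structure. Shapes, noise and random_state are
# arbitrary assumptions for demonstration; runs only when executed directly.
if __name__ == "__main__":
    from sklearn.cluster import SpectralBiclustering
    from sklearn.datasets import make_checkerboard

    data, rows, columns = make_checkerboard(
        shape=(30, 30), n_clusters=(2, 3), noise=1.0, random_state=0
    )
    model = SpectralBiclustering(n_clusters=(2, 3), method="log", random_state=0)
    model.fit(data)
    print(model.row_labels_[:10])     # row partition labels (2 distinct values)
    print(model.column_labels_[:10])  # column partition labels (3 distinct values)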

View File

@@ -0,0 +1,730 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from math import sqrt
from numbers import Integral, Real
import numpy as np
from scipy import sparse
from sklearn._config import config_context
from sklearn.base import (
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
ClusterMixin,
TransformerMixin,
_fit_context,
)
from sklearn.cluster import AgglomerativeClustering
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils._param_validation import Interval
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_is_fitted, validate_data
def _iterate_sparse_X(X):
"""This little hack returns a densified row when iterating over a sparse
    matrix, instead of constructing a sparse matrix for every row, which
    is expensive.
"""
n_samples = X.shape[0]
X_indices = X.indices
X_data = X.data
X_indptr = X.indptr
for i in range(n_samples):
row = np.zeros(X.shape[1])
startptr, endptr = X_indptr[i], X_indptr[i + 1]
nonzero_indices = X_indices[startptr:endptr]
row[nonzero_indices] = X_data[startptr:endptr]
yield row
def _split_node(node, threshold, branching_factor):
"""The node has to be split if there is no place for a new subcluster
in the node.
1. Two empty nodes and two empty subclusters are initialized.
2. The pair of distant subclusters are found.
3. The properties of the empty subclusters and nodes are updated
according to the nearest distance between the subclusters to the
pair of distant subclusters.
4. The two nodes are set as children to the two subclusters.
"""
new_subcluster1 = _CFSubcluster()
new_subcluster2 = _CFSubcluster()
new_node1 = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=node.is_leaf,
n_features=node.n_features,
dtype=node.init_centroids_.dtype,
)
new_node2 = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=node.is_leaf,
n_features=node.n_features,
dtype=node.init_centroids_.dtype,
)
new_subcluster1.child_ = new_node1
new_subcluster2.child_ = new_node2
if node.is_leaf:
if node.prev_leaf_ is not None:
node.prev_leaf_.next_leaf_ = new_node1
new_node1.prev_leaf_ = node.prev_leaf_
new_node1.next_leaf_ = new_node2
new_node2.prev_leaf_ = new_node1
new_node2.next_leaf_ = node.next_leaf_
if node.next_leaf_ is not None:
node.next_leaf_.prev_leaf_ = new_node2
dist = euclidean_distances(
node.centroids_, Y_norm_squared=node.squared_norm_, squared=True
)
n_clusters = dist.shape[0]
farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))
node1_dist, node2_dist = dist[(farthest_idx,)]
node1_closer = node1_dist < node2_dist
# make sure node1 is closest to itself even if all distances are equal.
# This can only happen when all node.centroids_ are duplicates leading to all
# distances between centroids being zero.
node1_closer[farthest_idx[0]] = True
for idx, subcluster in enumerate(node.subclusters_):
if node1_closer[idx]:
new_node1.append_subcluster(subcluster)
new_subcluster1.update(subcluster)
else:
new_node2.append_subcluster(subcluster)
new_subcluster2.update(subcluster)
return new_subcluster1, new_subcluster2
class _CFNode:
"""Each node in a CFTree is called a CFNode.
The CFNode can have a maximum of branching_factor
number of CFSubclusters.
Parameters
----------
threshold : float
Threshold needed for a new subcluster to enter a CFSubcluster.
branching_factor : int
Maximum number of CF subclusters in each node.
is_leaf : bool
We need to know if the CFNode is a leaf or not, in order to
retrieve the final subclusters.
n_features : int
The number of features.
Attributes
----------
subclusters_ : list
List of subclusters for a particular CFNode.
prev_leaf_ : _CFNode
Useful only if is_leaf is True.
next_leaf_ : _CFNode
        Next leaf node. Useful only if is_leaf is True, in order to retrieve
        the final subclusters.
init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
        Manipulate ``init_centroids_`` throughout rather than ``centroids_``
        since the centroids are just a view of ``init_centroids_``.
    init_sq_norm_ : ndarray of shape (branching_factor + 1,)
        Manipulate ``init_sq_norm_`` throughout. Similar to ``init_centroids_``.
centroids_ : ndarray of shape (branching_factor + 1, n_features)
View of ``init_centroids_``.
squared_norm_ : ndarray of shape (branching_factor + 1,)
View of ``init_sq_norm_``.
"""
def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
self.threshold = threshold
self.branching_factor = branching_factor
self.is_leaf = is_leaf
self.n_features = n_features
# The list of subclusters, centroids and squared norms
# to manipulate throughout.
self.subclusters_ = []
self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype)
self.squared_norm_ = []
self.prev_leaf_ = None
self.next_leaf_ = None
def append_subcluster(self, subcluster):
n_samples = len(self.subclusters_)
self.subclusters_.append(subcluster)
self.init_centroids_[n_samples] = subcluster.centroid_
self.init_sq_norm_[n_samples] = subcluster.sq_norm_
        # Keep centroids and squared norms as views. In this way, any change
        # to init_centroids_ and init_sq_norm_ is automatically reflected in
        # centroids_ and squared_norm_.
self.centroids_ = self.init_centroids_[: n_samples + 1, :]
self.squared_norm_ = self.init_sq_norm_[: n_samples + 1]
def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
"""Remove a subcluster from a node and update it with the
split subclusters.
"""
ind = self.subclusters_.index(subcluster)
self.subclusters_[ind] = new_subcluster1
self.init_centroids_[ind] = new_subcluster1.centroid_
self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
self.append_subcluster(new_subcluster2)
def insert_cf_subcluster(self, subcluster):
"""Insert a new subcluster into the node."""
if not self.subclusters_:
self.append_subcluster(subcluster)
return False
threshold = self.threshold
branching_factor = self.branching_factor
# We need to find the closest subcluster among all the
# subclusters so that we can insert our new subcluster.
dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
dist_matrix *= -2.0
dist_matrix += self.squared_norm_
closest_index = np.argmin(dist_matrix)
closest_subcluster = self.subclusters_[closest_index]
# If the subcluster has a child, we need a recursive strategy.
if closest_subcluster.child_ is not None:
split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster)
if not split_child:
# If it is determined that the child need not be split, we
# can just update the closest_subcluster
closest_subcluster.update(subcluster)
self.init_centroids_[closest_index] = self.subclusters_[
closest_index
].centroid_
self.init_sq_norm_[closest_index] = self.subclusters_[
closest_index
].sq_norm_
return False
            # The child node has to be split: redistribute the subclusters in
            # the child node and add a new subcluster in this node to
            # accommodate the new child.
else:
new_subcluster1, new_subcluster2 = _split_node(
closest_subcluster.child_,
threshold,
branching_factor,
)
self.update_split_subclusters(
closest_subcluster, new_subcluster1, new_subcluster2
)
if len(self.subclusters_) > self.branching_factor:
return True
return False
# good to go!
else:
merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
if merged:
self.init_centroids_[closest_index] = closest_subcluster.centroid_
self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
return False
# not close to any other subclusters, and we still
# have space, so add.
elif len(self.subclusters_) < self.branching_factor:
self.append_subcluster(subcluster)
return False
# We do not have enough space nor is it closer to an
# other subcluster. We need to split.
else:
self.append_subcluster(subcluster)
return True
class _CFSubcluster:
"""Each subcluster in a CFNode is called a CFSubcluster.
    A CFSubcluster can have a CFNode as its child.
Parameters
----------
linear_sum : ndarray of shape (n_features,), default=None
Sample. This is kept optional to allow initialization of empty
subclusters.
Attributes
----------
n_samples_ : int
Number of samples that belong to each subcluster.
linear_sum_ : ndarray
Linear sum of all the samples in a subcluster. Prevents holding
all sample data in memory.
squared_sum_ : float
Sum of the squared l2 norms of all samples belonging to a subcluster.
centroid_ : ndarray of shape (branching_factor + 1, n_features)
Centroid of the subcluster. Prevent recomputing of centroids when
``CFNode.centroids_`` is called.
child_ : _CFNode
Child Node of the subcluster. Once a given _CFNode is set as the child
of the _CFNode, it is set to ``self.child_``.
sq_norm_ : ndarray of shape (branching_factor + 1,)
Squared norm of the subcluster. Used to prevent recomputing when
pairwise minimum distances are computed.
"""
def __init__(self, *, linear_sum=None):
if linear_sum is None:
self.n_samples_ = 0
self.squared_sum_ = 0.0
self.centroid_ = self.linear_sum_ = 0
else:
self.n_samples_ = 1
self.centroid_ = self.linear_sum_ = linear_sum
self.squared_sum_ = self.sq_norm_ = np.dot(
self.linear_sum_, self.linear_sum_
)
self.child_ = None
def update(self, subcluster):
self.n_samples_ += subcluster.n_samples_
self.linear_sum_ += subcluster.linear_sum_
self.squared_sum_ += subcluster.squared_sum_
self.centroid_ = self.linear_sum_ / self.n_samples_
self.sq_norm_ = np.dot(self.centroid_, self.centroid_)
def merge_subcluster(self, nominee_cluster, threshold):
"""Check if a cluster is worthy enough to be merged. If
yes then merge.
"""
new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
new_n = self.n_samples_ + nominee_cluster.n_samples_
new_centroid = (1 / new_n) * new_ls
new_sq_norm = np.dot(new_centroid, new_centroid)
# The squared radius of the cluster is defined:
# r^2 = sum_i ||x_i - c||^2 / n
# with x_i the n points assigned to the cluster and c its centroid:
# c = sum_i x_i / n
# This can be expanded to:
# r^2 = sum_i ||x_i||^2 / n - 2 < sum_i x_i / n, c> + n ||c||^2 / n
# and therefore simplifies to:
# r^2 = sum_i ||x_i||^2 / n - ||c||^2
sq_radius = new_ss / new_n - new_sq_norm
if sq_radius <= threshold**2:
(
self.n_samples_,
self.linear_sum_,
self.squared_sum_,
self.centroid_,
self.sq_norm_,
) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm)
return True
return False
@property
def radius(self):
"""Return radius of the subcluster"""
# Because of numerical issues, this could become negative
sq_radius = self.squared_sum_ / self.n_samples_ - self.sq_norm_
return sqrt(max(0, sq_radius))
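# ---------------------------------------------------------------------------
# Illustrative numpy check (a standalone sketch, independent of the classes
# above) of the radius identity used by ``merge_subcluster`` and ``radius``:
#   r^2 = sum_i ||x_i - c||^2 / n = sum_i ||x_i||^2 / n - ||c||^2,
# where c is the centroid (mean) of the n points.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(0)
    pts = rng.normal(size=(7, 3))
    c = pts.mean(axis=0)
    direct = ((pts - c) ** 2).sum() / len(pts)            # definition of r^2
    shortcut = (pts**2).sum() / len(pts) - np.dot(c, c)   # expanded form
    print(np.isclose(direct, shortcut))                   # True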
class Birch(
ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
):
"""Implements the BIRCH clustering algorithm.
It is a memory-efficient, online-learning algorithm provided as an
alternative to :class:`MiniBatchKMeans`. It constructs a tree
data structure with the cluster centroids being read off the leaf.
These can be either the final cluster centroids or can be provided as input
to another clustering algorithm such as :class:`AgglomerativeClustering`.
Read more in the :ref:`User Guide <birch>`.
.. versionadded:: 0.16
Parameters
----------
threshold : float, default=0.5
        The radius of the subcluster obtained by merging a new sample and the
        closest subcluster should be less than the threshold. Otherwise a new
        subcluster is started. Setting this value to be very low promotes
        splitting and vice-versa.
branching_factor : int, default=50
        Maximum number of CF subclusters in each node. If a new sample enters
        such that the number of subclusters exceeds the branching_factor, then
        that node is split into two nodes with the subclusters redistributed
        in each. The parent subcluster of that node is removed and two new
        subclusters are added as parents of the 2 split nodes.
n_clusters : int, instance of sklearn.cluster model or None, default=3
Number of clusters after the final clustering step, which treats the
subclusters from the leaves as new samples.
- `None` : the final clustering step is not performed and the
subclusters are returned as they are.
- :mod:`sklearn.cluster` Estimator : If a model is provided, the model
is fit treating the subclusters as new samples and the initial data
is mapped to the label of the closest subcluster.
- `int` : the model fit is :class:`AgglomerativeClustering` with
`n_clusters` set to be equal to the int.
compute_labels : bool, default=True
Whether or not to compute labels for each fit.
Attributes
----------
root_ : _CFNode
Root of the CFTree.
dummy_leaf_ : _CFNode
Start pointer to all the leaves.
subcluster_centers_ : ndarray
Centroids of all subclusters read directly from the leaves.
subcluster_labels_ : ndarray
Labels assigned to the centroids of the subclusters after
they are clustered globally.
labels_ : ndarray of shape (n_samples,)
Array of labels assigned to the input data.
        If partial_fit is used instead of fit, they are assigned to the
last batch of data.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
MiniBatchKMeans : Alternative implementation that does incremental updates
of the centers' positions using mini-batches.
Notes
-----
The tree data structure consists of nodes with each node consisting of
a number of subclusters. The maximum number of subclusters in a node
is determined by the branching factor. Each subcluster maintains a
linear sum, squared sum and the number of samples in that subcluster.
In addition, each subcluster can also have a node as its child, if the
subcluster is not a member of a leaf node.
For a new point entering the root, it is merged with the subcluster closest
to it and the linear sum, squared sum and the number of samples of that
subcluster are updated. This is done recursively till the properties of
the leaf node are updated.
See :ref:`sphx_glr_auto_examples_cluster_plot_birch_vs_minibatchkmeans.py` for a
comparison with :class:`~sklearn.cluster.MiniBatchKMeans`.
References
----------
    * Tian Zhang, Raghu Ramakrishnan, Miron Livny
BIRCH: An efficient data clustering method for large databases.
https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf
* Roberto Perdisci
JBirch - Java implementation of BIRCH clustering algorithm
https://code.google.com/archive/p/jbirch
Examples
--------
>>> from sklearn.cluster import Birch
>>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
>>> brc = Birch(n_clusters=None)
>>> brc.fit(X)
Birch(n_clusters=None)
>>> brc.predict(X)
array([0, 0, 0, 1, 1, 1])
For a comparison of the BIRCH clustering algorithm with other clustering algorithms,
see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
"""
_parameter_constraints: dict = {
"threshold": [Interval(Real, 0.0, None, closed="neither")],
"branching_factor": [Interval(Integral, 1, None, closed="neither")],
"n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")],
"compute_labels": ["boolean"],
}
def __init__(
self,
*,
threshold=0.5,
branching_factor=50,
n_clusters=3,
compute_labels=True,
):
self.threshold = threshold
self.branching_factor = branching_factor
self.n_clusters = n_clusters
self.compute_labels = compute_labels
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""
Build a CF Tree for the input data.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
Fitted estimator.
"""
return self._fit(X, partial=False)
def _fit(self, X, partial):
has_root = getattr(self, "root_", None)
first_call = not (partial and has_root)
X = validate_data(
self,
X,
accept_sparse="csr",
reset=first_call,
dtype=[np.float64, np.float32],
)
threshold = self.threshold
branching_factor = self.branching_factor
n_samples, n_features = X.shape
# If partial_fit is called for the first time or fit is called, we
# start a new tree.
if first_call:
# The first root is the leaf. Manipulate this object throughout.
self.root_ = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=True,
n_features=n_features,
dtype=X.dtype,
)
# To enable getting back subclusters.
self.dummy_leaf_ = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=True,
n_features=n_features,
dtype=X.dtype,
)
self.dummy_leaf_.next_leaf_ = self.root_
self.root_.prev_leaf_ = self.dummy_leaf_
        # Cannot vectorize. Reason enough to consider using Cython here.
if not sparse.issparse(X):
iter_func = iter
else:
iter_func = _iterate_sparse_X
for sample in iter_func(X):
subcluster = _CFSubcluster(linear_sum=sample)
split = self.root_.insert_cf_subcluster(subcluster)
if split:
new_subcluster1, new_subcluster2 = _split_node(
self.root_, threshold, branching_factor
)
del self.root_
self.root_ = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=False,
n_features=n_features,
dtype=X.dtype,
)
self.root_.append_subcluster(new_subcluster1)
self.root_.append_subcluster(new_subcluster2)
centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
self.subcluster_centers_ = centroids
self._n_features_out = self.subcluster_centers_.shape[0]
self._global_clustering(X)
return self
def _get_leaves(self):
"""
Retrieve the leaves of the CF Node.
Returns
-------
leaves : list of shape (n_leaves,)
List of the leaf nodes.
"""
leaf_ptr = self.dummy_leaf_.next_leaf_
leaves = []
while leaf_ptr is not None:
leaves.append(leaf_ptr)
leaf_ptr = leaf_ptr.next_leaf_
return leaves
@_fit_context(prefer_skip_nested_validation=True)
def partial_fit(self, X=None, y=None):
"""
Online learning. Prevents rebuilding of CFTree from scratch.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), \
default=None
Input data. If X is not provided, only the global clustering
step is done.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
Fitted estimator.
"""
if X is None:
# Perform just the final global clustering step.
self._global_clustering()
return self
else:
return self._fit(X, partial=True)
def predict(self, X):
"""
Predict data using the ``centroids_`` of subclusters.
Avoid computation of the row norms of X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
Returns
-------
        labels : ndarray of shape (n_samples,)
Labelled data.
"""
check_is_fitted(self)
X = validate_data(self, X, accept_sparse="csr", reset=False)
return self._predict(X)
def _predict(self, X):
"""Predict data using the ``centroids_`` of subclusters."""
kwargs = {"Y_norm_squared": self._subcluster_norms}
with config_context(assume_finite=True):
argmin = pairwise_distances_argmin(
X, self.subcluster_centers_, metric_kwargs=kwargs
)
return self.subcluster_labels_[argmin]
def transform(self, X):
"""
Transform X into subcluster centroids dimension.
Each dimension represents the distance from the sample point to each
cluster centroid.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
Returns
-------
X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
Transformed data.
"""
check_is_fitted(self)
X = validate_data(self, X, accept_sparse="csr", reset=False)
with config_context(assume_finite=True):
return euclidean_distances(X, self.subcluster_centers_)
def _global_clustering(self, X=None):
"""
Global clustering for the subclusters obtained after fitting
"""
clusterer = self.n_clusters
centroids = self.subcluster_centers_
compute_labels = (X is not None) and self.compute_labels
# Preprocessing for the global clustering.
not_enough_centroids = False
if isinstance(clusterer, Integral):
clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
# There is no need to perform the global clustering step.
if len(centroids) < self.n_clusters:
not_enough_centroids = True
# To use in predict to avoid recalculation.
self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)
if clusterer is None or not_enough_centroids:
self.subcluster_labels_ = np.arange(len(centroids))
if not_enough_centroids:
warnings.warn(
"Number of subclusters found (%d) by BIRCH is less "
"than (%d). Decrease the threshold."
% (len(centroids), self.n_clusters),
ConvergenceWarning,
)
else:
# The global clustering step that clusters the subclusters of
# the leaves. It assumes the centroids of the subclusters as
# samples and finds the final centroids.
self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)
if compute_labels:
self.labels_ = self._predict(X)
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.transformer_tags.preserves_dtype = ["float64", "float32"]
tags.input_tags.sparse = True
return tags
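# ---------------------------------------------------------------------------
# Illustrative usage sketch: incremental fitting with ``partial_fit`` and a
# final global-clustering-only call with X=None, as described in the
# docstrings above. The toy batches are arbitrary assumptions for
# demonstration; runs only when executed directly.
if __name__ == "__main__":
    import numpy as np

    from sklearn.cluster import Birch

    rng = np.random.default_rng(0)
    batch1 = rng.normal(loc=0.0, scale=0.3, size=(20, 2))
    batch2 = rng.normal(loc=5.0, scale=0.3, size=(20, 2))
    brc = Birch(n_clusters=2)
    brc.partial_fit(batch1)  # build the CF tree from the first batch
    brc.partial_fit(batch2)  # extend the tree with the second batch
    brc.partial_fit(None)    # X=None: run only the global clustering step
    print(brc.predict([[0.0, 0.0], [5.0, 5.0]]))  # well-separated points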

View File

@@ -0,0 +1,543 @@
"""Bisecting K-means clustering."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
import numpy as np
import scipy.sparse as sp
from sklearn.base import _fit_context
from sklearn.cluster._k_means_common import _inertia_dense, _inertia_sparse
from sklearn.cluster._kmeans import (
_BaseKMeans,
_kmeans_single_elkan,
_kmeans_single_lloyd,
_labels_inertia_threadpool_limit,
)
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils._param_validation import Integral, Interval, StrOptions
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import (
_check_sample_weight,
check_is_fitted,
check_random_state,
validate_data,
)
class _BisectingTree:
"""Tree structure representing the hierarchical clusters of BisectingKMeans."""
def __init__(self, center, indices, score):
"""Create a new cluster node in the tree.
The node holds the center of this cluster and the indices of the data points
that belong to it.
"""
self.center = center
self.indices = indices
self.score = score
self.left = None
self.right = None
def split(self, labels, centers, scores):
"""Split the cluster node into two subclusters."""
self.left = _BisectingTree(
indices=self.indices[labels == 0], center=centers[0], score=scores[0]
)
self.right = _BisectingTree(
indices=self.indices[labels == 1], center=centers[1], score=scores[1]
)
# reset the indices attribute to save memory
self.indices = None
def get_cluster_to_bisect(self):
"""Return the cluster node to bisect next.
It's based on the score of the cluster, which can be either the number of
data points assigned to that cluster or the inertia of that cluster
(see `bisecting_strategy` for details).
"""
max_score = None
for cluster_leaf in self.iter_leaves():
if max_score is None or cluster_leaf.score > max_score:
max_score = cluster_leaf.score
best_cluster_leaf = cluster_leaf
return best_cluster_leaf
def iter_leaves(self):
"""Iterate over all the cluster leaves in the tree."""
if self.left is None:
yield self
else:
yield from self.left.iter_leaves()
yield from self.right.iter_leaves()
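# ---------------------------------------------------------------------------
# Illustrative sketch of the private ``_BisectingTree`` bookkeeping defined
# above: split a root node and pick the next leaf to bisect by score. The
# centers, labels and scores are arbitrary toy values for demonstration;
# runs only when executed directly.
if __name__ == "__main__":
    import numpy as np

    root = _BisectingTree(center=np.zeros(2), indices=np.arange(6), score=0.0)
    labels = np.array([0, 0, 1, 1, 1, 1])
    centers = np.array([[0.0, 0.0], [5.0, 5.0]])
    scores = np.array([1.5, 4.0])        # e.g. per-cluster inertia
    root.split(labels, centers, scores)
    leaf = root.get_cluster_to_bisect()  # the leaf with the largest score
    print(leaf.score, leaf.indices)      # 4.0 [2 3 4 5]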
class BisectingKMeans(_BaseKMeans):
"""Bisecting K-Means clustering.
Read more in the :ref:`User Guide <bisect_k_means>`.
.. versionadded:: 1.1
Parameters
----------
n_clusters : int, default=8
The number of clusters to form as well as the number of
centroids to generate.
init : {'k-means++', 'random'} or callable, default='random'
Method for initialization:
        'k-means++' : selects initial cluster centers for k-means
clustering in a smart way to speed up convergence. See section
Notes in k_init for more details.
'random': choose `n_clusters` observations (rows) at random from data
for the initial centroids.
If a callable is passed, it should take arguments X, n_clusters and a
random state and return an initialization.
n_init : int, default=1
        Number of times the inner k-means algorithm will be run with different
        centroid seeds in each bisection.
        This results in producing, for each bisection, the best output of
        n_init consecutive runs in terms of inertia.
random_state : int, RandomState instance or None, default=None
Determines random number generation for centroid initialization
in inner K-Means. Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
max_iter : int, default=300
Maximum number of iterations of the inner k-means algorithm at each
bisection.
verbose : int, default=0
Verbosity mode.
tol : float, default=1e-4
Relative tolerance with regards to Frobenius norm of the difference
in the cluster centers of two consecutive iterations to declare
convergence. Used in inner k-means algorithm at each bisection to pick
best possible clusters.
copy_x : bool, default=True
When pre-computing distances it is more numerically accurate to center
the data first. If copy_x is True (default), then the original data is
not modified. If False, the original data is modified, and put back
before the function returns, but small numerical differences may be
introduced by subtracting and then adding the data mean. Note that if
the original data is not C-contiguous, a copy will be made even if
copy_x is False. If the original data is sparse, but not in CSR format,
a copy will be made even if copy_x is False.
algorithm : {"lloyd", "elkan"}, default="lloyd"
Inner K-means algorithm used in bisection.
The classical EM-style algorithm is `"lloyd"`.
The `"elkan"` variation can be more efficient on some datasets with
well-defined clusters, by using the triangle inequality. However it's
more memory intensive due to the allocation of an extra array of shape
`(n_samples, n_clusters)`.
bisecting_strategy : {"biggest_inertia", "largest_cluster"},\
default="biggest_inertia"
Defines how bisection should be performed:
- "biggest_inertia" means that BisectingKMeans will always check
all calculated cluster for cluster with biggest SSE
(Sum of squared errors) and bisect it. This approach concentrates on
precision, but may be costly in terms of execution time (especially for
larger amount of data points).
- "largest_cluster" - BisectingKMeans will always split cluster with
largest amount of points assigned to it from all clusters
previously calculated. That should work faster than picking by SSE
('biggest_inertia') and may produce similar results in most cases.
Attributes
----------
cluster_centers_ : ndarray of shape (n_clusters, n_features)
Coordinates of cluster centers. If the algorithm stops before fully
converging (see ``tol`` and ``max_iter``), these will not be
consistent with ``labels_``.
labels_ : ndarray of shape (n_samples,)
Labels of each point.
inertia_ : float
Sum of squared distances of samples to their closest cluster center,
weighted by the sample weights if provided.
n_features_in_ : int
Number of features seen during :term:`fit`.
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
See Also
--------
KMeans : Original implementation of K-Means algorithm.
Notes
-----
    It might be inefficient when n_clusters is less than 3, due to unnecessary
calculations for that case.
Examples
--------
>>> from sklearn.cluster import BisectingKMeans
>>> import numpy as np
>>> X = np.array([[1, 1], [10, 1], [3, 1],
... [10, 0], [2, 1], [10, 2],
... [10, 8], [10, 9], [10, 10]])
>>> bisect_means = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
>>> bisect_means.labels_
array([0, 2, 0, 2, 0, 2, 1, 1, 1], dtype=int32)
>>> bisect_means.predict([[0, 0], [12, 3]])
array([0, 2], dtype=int32)
>>> bisect_means.cluster_centers_
array([[ 2., 1.],
[10., 9.],
[10., 1.]])
For a comparison between BisectingKMeans and K-Means refer to example
:ref:`sphx_glr_auto_examples_cluster_plot_bisect_kmeans.py`.
"""
_parameter_constraints: dict = {
**_BaseKMeans._parameter_constraints,
"init": [StrOptions({"k-means++", "random"}), callable],
"n_init": [Interval(Integral, 1, None, closed="left")],
"copy_x": ["boolean"],
"algorithm": [StrOptions({"lloyd", "elkan"})],
"bisecting_strategy": [StrOptions({"biggest_inertia", "largest_cluster"})],
}
def __init__(
self,
n_clusters=8,
*,
init="random",
n_init=1,
random_state=None,
max_iter=300,
verbose=0,
tol=1e-4,
copy_x=True,
algorithm="lloyd",
bisecting_strategy="biggest_inertia",
):
super().__init__(
n_clusters=n_clusters,
init=init,
max_iter=max_iter,
verbose=verbose,
random_state=random_state,
tol=tol,
n_init=n_init,
)
self.copy_x = copy_x
self.algorithm = algorithm
self.bisecting_strategy = bisecting_strategy
def _warn_mkl_vcomp(self, n_active_threads):
"""Warn when vcomp and mkl are both present"""
warnings.warn(
"BisectingKMeans is known to have a memory leak on Windows "
"with MKL, when there are less chunks than available "
"threads. You can avoid it by setting the environment"
f" variable OMP_NUM_THREADS={n_active_threads}."
)
def _inertia_per_cluster(self, X, centers, labels, sample_weight):
"""Calculate the sum of squared errors (inertia) per cluster.
Parameters
----------
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
The input samples.
centers : ndarray of shape (n_clusters=2, n_features)
The cluster centers.
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
sample_weight : ndarray of shape (n_samples,)
The weights for each observation in X.
Returns
-------
inertia_per_cluster : ndarray of shape (n_clusters=2,)
Sum of squared errors (inertia) for each cluster.
"""
n_clusters = centers.shape[0] # = 2 since centers comes from a bisection
_inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense
inertia_per_cluster = np.empty(n_clusters)
for label in range(n_clusters):
inertia_per_cluster[label] = _inertia(
X, sample_weight, centers, labels, self._n_threads, single_label=label
)
return inertia_per_cluster
def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect):
"""Split a cluster into 2 subsclusters.
Parameters
----------
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
Training instances to cluster.
x_squared_norms : ndarray of shape (n_samples,)
Squared euclidean norm of each data point.
sample_weight : ndarray of shape (n_samples,)
The weights for each observation in X.
cluster_to_bisect : _BisectingTree node object
The cluster node to split.
"""
X = X[cluster_to_bisect.indices]
x_squared_norms = x_squared_norms[cluster_to_bisect.indices]
sample_weight = sample_weight[cluster_to_bisect.indices]
best_inertia = None
# Split samples in X into 2 clusters.
# Repeating `n_init` times to obtain best clusters
for _ in range(self.n_init):
centers_init = self._init_centroids(
X,
x_squared_norms=x_squared_norms,
init=self.init,
random_state=self._random_state,
n_centroids=2,
sample_weight=sample_weight,
)
labels, inertia, centers, _ = self._kmeans_single(
X,
sample_weight,
centers_init,
max_iter=self.max_iter,
verbose=self.verbose,
tol=self.tol,
n_threads=self._n_threads,
)
            # allow small tolerance on the inertia to account for
            # non-deterministic rounding errors due to parallel computation
if best_inertia is None or inertia < best_inertia * (1 - 1e-6):
best_labels = labels
best_centers = centers
best_inertia = inertia
if self.verbose:
print(f"New centroids from bisection: {best_centers}")
if self.bisecting_strategy == "biggest_inertia":
scores = self._inertia_per_cluster(
X, best_centers, best_labels, sample_weight
)
else: # bisecting_strategy == "largest_cluster"
# Using minlength to make sure that we have the counts for both labels even
# if all samples are labelled 0.
scores = np.bincount(best_labels, minlength=2)
cluster_to_bisect.split(best_labels, best_centers, scores)
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None, sample_weight=None):
"""Compute bisecting k-means clustering.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training instances to cluster.
.. note:: The data will be converted to C ordering,
which will cause a memory copy
if the given data is not C-contiguous.
y : Ignored
Not used, present here for API consistency by convention.
sample_weight : array-like of shape (n_samples,), default=None
The weights for each observation in X. If None, all observations
are assigned equal weight. `sample_weight` is not used during
initialization if `init` is a callable.
Returns
-------
self
Fitted estimator.
"""
X = validate_data(
self,
X,
accept_sparse="csr",
dtype=[np.float64, np.float32],
order="C",
copy=self.copy_x,
accept_large_sparse=False,
)
self._check_params_vs_input(X)
self._random_state = check_random_state(self.random_state)
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
self._n_threads = _openmp_effective_n_threads()
if self.algorithm == "lloyd" or self.n_clusters == 1:
self._kmeans_single = _kmeans_single_lloyd
self._check_mkl_vcomp(X, X.shape[0])
else:
self._kmeans_single = _kmeans_single_elkan
        # Subtract the mean of X for more accurate distance computations
if not sp.issparse(X):
self._X_mean = X.mean(axis=0)
X -= self._X_mean
# Initialize the hierarchical clusters tree
self._bisecting_tree = _BisectingTree(
indices=np.arange(X.shape[0]),
center=X.mean(axis=0),
score=0,
)
x_squared_norms = row_norms(X, squared=True)
for _ in range(self.n_clusters - 1):
            # Choose cluster to bisect
cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect()
# Split this cluster into 2 subclusters
self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect)
# Aggregate final labels and centers from the bisecting tree
self.labels_ = np.full(X.shape[0], -1, dtype=np.int32)
self.cluster_centers_ = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)
for i, cluster_node in enumerate(self._bisecting_tree.iter_leaves()):
self.labels_[cluster_node.indices] = i
self.cluster_centers_[i] = cluster_node.center
cluster_node.label = i # label final clusters for future prediction
cluster_node.indices = None # release memory
# Restore original data
if not sp.issparse(X):
X += self._X_mean
self.cluster_centers_ += self._X_mean
_inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense
self.inertia_ = _inertia(
X, sample_weight, self.cluster_centers_, self.labels_, self._n_threads
)
self._n_features_out = self.cluster_centers_.shape[0]
return self
def predict(self, X):
"""Predict which cluster each sample in X belongs to.
Prediction is made by going down the hierarchical tree,
searching for the closest leaf cluster.
In the vector quantization literature, `cluster_centers_` is called
the code book and each value returned by `predict` is the index of
the closest code in the code book.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
New data to predict.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
check_is_fitted(self)
X = self._check_test_data(X)
x_squared_norms = row_norms(X, squared=True)
# sample weights are unused but necessary in cython helpers
sample_weight = np.ones_like(x_squared_norms)
labels = self._predict_recursive(X, sample_weight, self._bisecting_tree)
return labels
def _predict_recursive(self, X, sample_weight, cluster_node):
"""Predict recursively by going down the hierarchical tree.
Parameters
----------
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
The data points, currently assigned to `cluster_node`, to predict between
the subclusters of this node.
sample_weight : ndarray of shape (n_samples,)
The weights for each observation in X.
cluster_node : _BisectingTree node object
The cluster node of the hierarchical tree.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
if cluster_node.left is None:
# This cluster has no subcluster. Labels are just the label of the cluster.
return np.full(X.shape[0], cluster_node.label, dtype=np.int32)
# Determine if data points belong to the left or right subcluster
centers = np.vstack((cluster_node.left.center, cluster_node.right.center))
if hasattr(self, "_X_mean"):
centers += self._X_mean
cluster_labels = _labels_inertia_threadpool_limit(
X,
sample_weight,
centers,
self._n_threads,
return_inertia=False,
)
mask = cluster_labels == 0
# Compute the labels for each subset of the data points.
labels = np.full(X.shape[0], -1, dtype=np.int32)
labels[mask] = self._predict_recursive(
X[mask], sample_weight[mask], cluster_node.left
)
labels[~mask] = self._predict_recursive(
X[~mask], sample_weight[~mask], cluster_node.right
)
return labels
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.sparse = True
tags.transformer_tags.preserves_dtype = ["float64", "float32"]
return tags
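# A minimal usage sketch of the estimator defined above (illustrative only;
# the data and parameter values below are assumptions, not part of the module):
#
# >>> import numpy as np
# >>> from sklearn.cluster import BisectingKMeans
# >>> X = np.array([[1, 1], [1, 2], [10, 10], [10, 11], [30, 0], [31, 1]])
# >>> bkm = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
# >>> bkm.labels_            # one label per sample
# >>> bkm.predict([[0, 0]])  # descends the bisecting tree to the closest leaf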

View File

@@ -0,0 +1,512 @@
"""
DBSCAN: Density-Based Spatial Clustering of Applications with Noise
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from numbers import Integral, Real
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, ClusterMixin, _fit_context
from sklearn.cluster._dbscan_inner import dbscan_inner
from sklearn.metrics.pairwise import _VALID_METRICS
from sklearn.neighbors import NearestNeighbors
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
from sklearn.utils.validation import _check_sample_weight, validate_data
@validate_params(
{
"X": ["array-like", "sparse matrix"],
"sample_weight": ["array-like", None],
},
prefer_skip_nested_validation=False,
)
def dbscan(
X,
eps=0.5,
*,
min_samples=5,
metric="minkowski",
metric_params=None,
algorithm="auto",
leaf_size=30,
p=2,
sample_weight=None,
n_jobs=None,
):
"""Perform DBSCAN clustering from vector array or distance matrix.
This function is a wrapper around :class:`~cluster.DBSCAN`, suitable for
quick, standalone clustering tasks. For estimator-based workflows, where
estimator attributes or pipeline integration is required, prefer
:class:`~cluster.DBSCAN`.
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a
density-based clustering algorithm that groups together points that are
closely packed while marking points in low-density regions as outliers.
Read more in the :ref:`User Guide <dbscan>`.
Parameters
----------
X : {array-like, scipy sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples)
A feature array, or array of distances between samples if
``metric='precomputed'``. When using precomputed distances, X must
be a square symmetric matrix.
eps : float, default=0.5
The maximum distance between two samples for one to be considered
as in the neighborhood of the other. This is not a maximum bound
on the distances of points within a cluster. This is the most
important DBSCAN parameter to choose appropriately for your data set
and distance function. Smaller values result in more clusters,
while larger values result in fewer, larger clusters.
min_samples : int, default=5
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point. This includes the point itself.
Higher values yield fewer, denser clusters, while lower values yield
more, sparser clusters.
metric : str or callable, default='minkowski'
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit.
X may be a :term:`sparse graph <sparse graph>`,
in which case only "nonzero" elements may be considered neighbors.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.19
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
See :class:`~sklearn.neighbors.NearestNeighbors` documentation for
details.
leaf_size : int, default=30
Leaf size passed to BallTree or cKDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem. Generally, smaller leaf sizes
lead to faster queries but slower construction.
p : float, default=2
Power parameter for the Minkowski metric. When p = 1, this is equivalent
to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
to be positive.
sample_weight : array-like of shape (n_samples,), default=None
Weight of each sample, such that a sample with a weight of at least
``min_samples`` is by itself a core sample; a sample with negative
weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search. ``None`` means
1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
using all processors. See :term:`Glossary <n_jobs>` for more details.
If precomputed distances are used, parallel execution is not available
and thus n_jobs will have no effect.
Returns
-------
core_samples : ndarray of shape (n_core_samples,)
Indices of core samples.
labels : ndarray of shape (n_samples,)
Cluster labels for each point. Noisy samples are given the label -1.
Non-negative integers indicate cluster membership.
See Also
--------
DBSCAN : An estimator interface for this clustering algorithm.
OPTICS : A similar estimator interface clustering at multiple values of
eps. Our implementation is optimized for memory usage.
Notes
-----
For an example, see :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`.
This implementation bulk-computes all neighborhood queries, which increases
the memory complexity to O(n.d), where d is the average number of neighbors,
while the original DBSCAN had memory complexity O(n). It may incur higher
memory complexity when querying these nearest neighborhoods, depending
on the ``algorithm``.
One way to avoid the query complexity is to pre-compute sparse
neighborhoods in chunks using
:func:`NearestNeighbors.radius_neighbors_graph
<sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
``mode='distance'``, then using ``metric='precomputed'`` here.
Another way to reduce memory and computation time is to remove
(near-)duplicate points and use ``sample_weight`` instead.
:class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower
memory usage.
References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
<https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
:doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
<10.1145/3068335>`
ACM Transactions on Database Systems (TODS), 42(3), 19.
Examples
--------
>>> from sklearn.cluster import dbscan
>>> X = [[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]
>>> core_samples, labels = dbscan(X, eps=3, min_samples=2)
>>> core_samples
array([0, 1, 2, 3, 4])
>>> labels
array([ 0, 0, 0, 1, 1, -1])
"""
est = DBSCAN(
eps=eps,
min_samples=min_samples,
metric=metric,
metric_params=metric_params,
algorithm=algorithm,
leaf_size=leaf_size,
p=p,
n_jobs=n_jobs,
)
est.fit(X, sample_weight=sample_weight)
return est.core_sample_indices_, est.labels_
class DBSCAN(ClusterMixin, BaseEstimator):
"""Perform DBSCAN clustering from vector array or distance matrix.
DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
Finds core samples of high density and expands clusters from them.
This algorithm is particularly good for data which contains clusters of
similar density and can find clusters of arbitrary shape.
Unlike K-means, DBSCAN does not require specifying the number of clusters
in advance and can identify outliers as noise points.
This implementation has a worst case memory complexity of :math:`O({n}^2)`,
which can occur when the `eps` param is large and `min_samples` is low,
while the original DBSCAN only uses linear memory.
For further details, see the Notes below.
Read more in the :ref:`User Guide <dbscan>`.
Parameters
----------
eps : float, default=0.5
The maximum distance between two samples for one to be considered
as in the neighborhood of the other. This is not a maximum bound
on the distances of points within a cluster. This is the most
important DBSCAN parameter to choose appropriately for your data set
and distance function. Smaller values generally lead to more clusters.
min_samples : int, default=5
The number of samples (or total weight) in a neighborhood for a point to
be considered as a core point. This includes the point itself. If
`min_samples` is set to a higher value, DBSCAN will find denser clusters,
whereas if it is set to a lower value, the found clusters will be more
sparse.
metric : str, or callable, default='euclidean'
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square. X may be a :term:`sparse graph`, in which
case only "nonzero" elements may be considered neighbors for DBSCAN.
.. versionadded:: 0.17
metric *precomputed* to accept precomputed sparse matrix.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.19
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
See :class:`~sklearn.neighbors.NearestNeighbors` documentation for
details.
leaf_size : int, default=30
Leaf size passed to BallTree or cKDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.
p : float, default=None
The power of the Minkowski metric to be used to calculate distance
between points. If None, then ``p=2`` (equivalent to the Euclidean
distance). When p=1, this is equivalent to Manhattan distance.
n_jobs : int, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
core_sample_indices_ : ndarray of shape (n_core_samples,)
Indices of core samples.
components_ : ndarray of shape (n_core_samples, n_features)
Copy of each core sample found by training.
labels_ : ndarray of shape (n_samples,)
Cluster labels for each point in the dataset given to fit().
Noisy samples are given the label -1. Non-negative integers
indicate cluster membership.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
OPTICS : A similar clustering at multiple values of eps. Our implementation
is optimized for memory usage.
Notes
-----
This implementation bulk-computes all neighborhood queries, which increases
the memory complexity to O(n.d), where d is the average number of neighbors,
while the original DBSCAN had memory complexity O(n). It may incur higher
memory complexity when querying these nearest neighborhoods, depending
on the ``algorithm``.
One way to avoid the query complexity is to pre-compute sparse
neighborhoods in chunks using
:func:`NearestNeighbors.radius_neighbors_graph
<sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
``mode='distance'``, then using ``metric='precomputed'`` here.
Another way to reduce memory and computation time is to remove
(near-)duplicate points and use ``sample_weight`` instead.
:class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower memory
usage.
References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
<https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
:doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
<10.1145/3068335>`
ACM Transactions on Database Systems (TODS), 42(3), 19.
Examples
--------
>>> from sklearn.cluster import DBSCAN
>>> import numpy as np
>>> X = np.array([[1, 2], [2, 2], [2, 3],
... [8, 7], [8, 8], [25, 80]])
>>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)
>>> clustering.labels_
array([ 0, 0, 0, 1, 1, -1])
>>> clustering
DBSCAN(eps=3, min_samples=2)
For an example, see
:ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`.
For a comparison of DBSCAN with other clustering algorithms, see
:ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
"""
_parameter_constraints: dict = {
"eps": [Interval(Real, 0.0, None, closed="neither")],
"min_samples": [Interval(Integral, 1, None, closed="left")],
"metric": [
StrOptions(set(_VALID_METRICS) | {"precomputed"}),
callable,
],
"metric_params": [dict, None],
"algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})],
"leaf_size": [Interval(Integral, 1, None, closed="left")],
"p": [Interval(Real, 0.0, None, closed="left"), None],
"n_jobs": [Integral, None],
}
def __init__(
self,
eps=0.5,
*,
min_samples=5,
metric="euclidean",
metric_params=None,
algorithm="auto",
leaf_size=30,
p=None,
n_jobs=None,
):
self.eps = eps
self.min_samples = min_samples
self.metric = metric
self.metric_params = metric_params
self.algorithm = algorithm
self.leaf_size = leaf_size
self.p = p
self.n_jobs = n_jobs
@_fit_context(
# DBSCAN.metric is not validated yet
prefer_skip_nested_validation=False
)
def fit(self, X, y=None, sample_weight=None):
"""Perform DBSCAN clustering from features, or distance matrix.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
(n_samples, n_samples)
Training instances to cluster, or distances between instances if
``metric='precomputed'``. If a sparse matrix is provided, it will
be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
sample_weight : array-like of shape (n_samples,), default=None
Weight of each sample, such that a sample with a weight of at least
``min_samples`` is by itself a core sample; a sample with a
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
Returns
-------
self : object
Returns a fitted instance of self.
"""
X = validate_data(self, X, accept_sparse="csr")
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
# Calculate neighborhood for all samples. This leaves the original
# point in, which needs to be considered later (i.e. point i is in the
# neighborhood of point i). While true, this is useless information.
if self.metric == "precomputed" and sparse.issparse(X):
# set the diagonal to explicit values, as a point is its own
# neighbor
X = X.copy() # copy to avoid in-place modification
with warnings.catch_warnings():
warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
X.setdiag(X.diagonal())
neighbors_model = NearestNeighbors(
radius=self.eps,
algorithm=self.algorithm,
leaf_size=self.leaf_size,
metric=self.metric,
metric_params=self.metric_params,
p=self.p,
n_jobs=self.n_jobs,
)
neighbors_model.fit(X)
# This has worst case O(n^2) memory complexity
neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)
if sample_weight is None:
n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
else:
n_neighbors = np.array(
[np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]
)
# Initially, all samples are noise.
labels = np.full(X.shape[0], -1, dtype=np.intp)
# A list of all core samples found.
core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
dbscan_inner(core_samples, neighborhoods, labels)
self.core_sample_indices_ = np.where(core_samples)[0]
self.labels_ = labels
if len(self.core_sample_indices_):
# fix for scipy sparse indexing issue
self.components_ = X[self.core_sample_indices_].copy()
else:
# no core samples
self.components_ = np.empty((0, X.shape[1]))
return self
def fit_predict(self, X, y=None, sample_weight=None):
"""Compute clusters from a data or distance matrix and predict labels.
This method fits the model and returns the cluster labels in a single step.
It is equivalent to calling fit(X).labels_.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
(n_samples, n_samples)
Training instances to cluster, or distances between instances if
``metric='precomputed'``. If a sparse matrix is provided, it will
be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
sample_weight : array-like of shape (n_samples,), default=None
Weight of each sample, such that a sample with a weight of at least
``min_samples`` is by itself a core sample; a sample with a
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
Returns
-------
labels : ndarray of shape (n_samples,)
Cluster labels. Noisy samples are given the label -1.
Non-negative integers indicate cluster membership.
"""
self.fit(X, sample_weight=sample_weight)
return self.labels_
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.pairwise = self.metric == "precomputed"
tags.input_tags.sparse = True
return tags
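# A hedged sketch of the memory-saving workflow mentioned in the Notes above:
# pre-compute a sparse radius-neighbors graph in ``mode='distance'`` and feed
# it to DBSCAN with ``metric='precomputed'``. The data and radius below are
# illustrative assumptions only.
#
# >>> import numpy as np
# >>> from sklearn.neighbors import NearestNeighbors
# >>> from sklearn.cluster import DBSCAN
# >>> X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]])
# >>> D = NearestNeighbors(radius=3).fit(X).radius_neighbors_graph(X, mode="distance")
# >>> labels = DBSCAN(eps=3, min_samples=2, metric="precomputed").fit_predict(D)
# >>> # labels are expected to match the dense example in the class docstring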

View File

@@ -0,0 +1,41 @@
# Fast inner loop for DBSCAN.
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from libcpp.vector cimport vector
from sklearn.utils._typedefs cimport uint8_t, intp_t
def dbscan_inner(const uint8_t[::1] is_core,
object[:] neighborhoods,
intp_t[::1] labels):
cdef intp_t i, label_num = 0, v
cdef intp_t[:] neighb
cdef vector[intp_t] stack
for i in range(labels.shape[0]):
if labels[i] != -1 or not is_core[i]:
continue
# Depth-first search starting from i, ending at the non-core points.
# This is very similar to the classic algorithm for computing connected
# components, the difference being that we label non-core points as
# part of a cluster (component), but don't expand their neighborhoods.
while True:
if labels[i] == -1:
labels[i] = label_num
if is_core[i]:
neighb = neighborhoods[i]
for i in range(neighb.shape[0]):
v = neighb[i]
if labels[v] == -1:
stack.push_back(v)
if stack.size() == 0:
break
i = stack.back()
stack.pop_back()
label_num += 1
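# For readability, a hedged pure-Python sketch of the loop above (names are
# illustrative; the Cython version avoids Python overhead and uses a C++
# vector as the stack):
#
# def dbscan_inner_py(is_core, neighborhoods, labels):
#     label_num = 0
#     for i in range(len(labels)):
#         if labels[i] != -1 or not is_core[i]:
#             continue
#         stack = []
#         while True:
#             if labels[i] == -1:
#                 labels[i] = label_num
#                 if is_core[i]:
#                     for v in neighborhoods[i]:
#                         if labels[v] == -1:
#                             stack.append(v)
#             if not stack:
#                 break
#             i = stack.pop()
#         label_num += 1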

View File

@@ -0,0 +1,76 @@
"""
Feature agglomeration. Base classes and functions for performing feature
agglomeration.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
from scipy.sparse import issparse
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_is_fitted, validate_data
###############################################################################
# Mixin class for feature agglomeration.
class AgglomerationTransform(TransformerMixin):
"""
A class for feature agglomeration via the transform interface.
"""
def transform(self, X):
"""
Transform a new matrix using the built clustering.
Parameters
----------
X : array-like of shape (n_samples, n_features) or \
(n_samples, n_samples)
An M by N array of M observations in N dimensions or a length
M array of M one-dimensional observations.
Returns
-------
Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,)
The pooled values for each feature cluster.
"""
check_is_fitted(self)
X = validate_data(self, X, reset=False)
if self.pooling_func == np.mean and not issparse(X):
size = np.bincount(self.labels_)
n_samples = X.shape[0]
# a fast way to compute the mean of grouped features
nX = np.array(
[np.bincount(self.labels_, X[i, :]) / size for i in range(n_samples)]
)
else:
nX = [
self.pooling_func(X[:, self.labels_ == l], axis=1)
for l in np.unique(self.labels_)
]
nX = np.array(nX).T
return nX
def inverse_transform(self, X):
"""
Inverse the transformation and return a vector of size `n_features`.
Parameters
----------
X : array-like of shape (n_samples, n_clusters) or (n_clusters,)
The values to be assigned to each cluster of samples.
Returns
-------
X_original : ndarray of shape (n_samples, n_features) or (n_features,)
The values of `X` broadcast back to the original feature space, with
each feature taking the value assigned to its cluster.
"""
check_is_fitted(self)
unil, inverse = np.unique(self.labels_, return_inverse=True)
return X[..., inverse]
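# A minimal usage sketch of the public estimator built on this mixin
# (FeatureAgglomeration; the data below is an illustrative assumption):
#
# >>> import numpy as np
# >>> from sklearn.cluster import FeatureAgglomeration
# >>> X = np.array([[0., 1., 2., 3.], [4., 5., 6., 7.]])
# >>> agglo = FeatureAgglomeration(n_clusters=2).fit(X)
# >>> Xt = agglo.transform(X)            # pooled values per feature cluster
# >>> agglo.inverse_transform(Xt).shape  # back to the original feature space
# (2, 4)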

View File

@@ -0,0 +1,2 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

View File

@@ -0,0 +1,274 @@
# Minimum spanning tree single linkage implementation for hdbscan
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
cimport numpy as cnp
from libc.float cimport DBL_MAX
import numpy as np
from sklearn.metrics._dist_metrics cimport DistanceMetric64
from sklearn.cluster._hierarchical_fast cimport UnionFind
from sklearn.cluster._hdbscan._tree cimport HIERARCHY_t
from sklearn.cluster._hdbscan._tree import HIERARCHY_dtype
from sklearn.utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t
cnp.import_array()
cdef extern from "numpy/arrayobject.h":
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
# Numpy structured dtype representing a single ordered edge in Prim's algorithm
MST_edge_dtype = np.dtype([
("current_node", np.int64),
("next_node", np.int64),
("distance", np.float64),
])
# Packed shouldn't make a difference since they're all 8-byte quantities,
# but it's included just to be safe.
ctypedef packed struct MST_edge_t:
int64_t current_node
int64_t next_node
float64_t distance
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
cnp.ndarray[float64_t, ndim=2] mutual_reachability
):
"""Compute the Minimum Spanning Tree (MST) representation of the mutual-
reachability graph using Prim's algorithm.
Parameters
----------
mutual_reachability : ndarray of shape (n_samples, n_samples)
Array of mutual-reachabilities between samples.
Returns
-------
mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
The MST representation of the mutual-reachability graph. The MST is
represented as a collection of edges.
"""
cdef:
# Note: we utilize ndarray's over memory-views to make use of numpy
# binary indexing and sub-selection below.
cnp.ndarray[int64_t, ndim=1, mode='c'] current_labels
cnp.ndarray[float64_t, ndim=1, mode='c'] min_reachability, left, right
cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
cnp.ndarray[uint8_t, mode='c'] label_filter
int64_t n_samples = PyArray_SHAPE(<cnp.PyArrayObject*> mutual_reachability)[0]
int64_t current_node, new_node_index, new_node, i
mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
current_labels = np.arange(n_samples, dtype=np.int64)
current_node = 0
min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
for i in range(0, n_samples - 1):
label_filter = current_labels != current_node
current_labels = current_labels[label_filter]
left = min_reachability[label_filter]
right = mutual_reachability[current_node][current_labels]
min_reachability = np.minimum(left, right)
new_node_index = np.argmin(min_reachability)
new_node = current_labels[new_node_index]
mst[i].current_node = current_node
mst[i].next_node = new_node
mst[i].distance = min_reachability[new_node_index]
current_node = new_node
return mst
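# For intuition, a hedged sketch of an equivalent computation with SciPy
# (the routine above uses a dedicated Prim implementation and a structured
# edge dtype; the matrix below is an illustrative assumption):
#
# >>> import numpy as np
# >>> from scipy.sparse.csgraph import minimum_spanning_tree
# >>> mr = np.array([[0., 2., 4.], [2., 0., 3.], [4., 3., 0.]])
# >>> mst = minimum_spanning_tree(mr).tocoo()
# >>> list(zip(mst.row, mst.col, mst.data))  # MST edges as (node, node, weight)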
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
const float64_t[:, ::1] raw_data,
const float64_t[::1] core_distances,
DistanceMetric64 dist_metric,
float64_t alpha=1.0
):
"""Compute the Minimum Spanning Tree (MST) representation of the mutual-
reachability graph generated from the provided `raw_data` and
`core_distances` using Prim's algorithm.
Parameters
----------
raw_data : ndarray of shape (n_samples, n_features)
Input array of data samples.
core_distances : ndarray of shape (n_samples,)
An array containing the core-distance calculated for each corresponding
sample.
dist_metric : DistanceMetric
The distance metric to use when calculating pairwise distances for
determining mutual-reachability.
Returns
-------
mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
The MST representation of the mutual-reachability graph. The MST is
represented as a collection of edges.
"""
cdef:
uint8_t[::1] in_tree
float64_t[::1] min_reachability
int64_t[::1] current_sources
cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
int64_t current_node, source_node, new_node, next_node_source
int64_t i, j, n_samples, num_features
float64_t current_node_core_dist, new_reachability, mutual_reachability_distance
float64_t next_node_min_reach, pair_distance, next_node_core_dist
n_samples = raw_data.shape[0]
num_features = raw_data.shape[1]
mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
in_tree = np.zeros(n_samples, dtype=np.uint8)
min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
current_sources = np.ones(n_samples, dtype=np.int64)
current_node = 0
# The following loop dynamically updates minimum reachability node-by-node,
# avoiding unnecessary computation where possible.
for i in range(0, n_samples - 1):
in_tree[current_node] = 1
current_node_core_dist = core_distances[current_node]
new_reachability = DBL_MAX
source_node = 0
new_node = 0
for j in range(n_samples):
if in_tree[j]:
continue
next_node_min_reach = min_reachability[j]
next_node_source = current_sources[j]
pair_distance = dist_metric.dist(
&raw_data[current_node, 0],
&raw_data[j, 0],
num_features
)
pair_distance /= alpha
next_node_core_dist = core_distances[j]
mutual_reachability_distance = max(
current_node_core_dist,
next_node_core_dist,
pair_distance
)
# If MRD(i, j) is smaller than node j's min_reachability, we update
# node j's min_reachability for future reference.
if mutual_reachability_distance < next_node_min_reach:
min_reachability[j] = mutual_reachability_distance
current_sources[j] = current_node
# If MRD(i, j) is also smaller than node i's current
# min_reachability, we update and set their edge as the current
# MST edge candidate.
if mutual_reachability_distance < new_reachability:
new_reachability = mutual_reachability_distance
source_node = current_node
new_node = j
# If the node j is closer to another node already in the tree, we
# make their edge the current MST candidate edge.
elif next_node_min_reach < new_reachability:
new_reachability = next_node_min_reach
source_node = next_node_source
new_node = j
mst[i].current_node = source_node
mst[i].next_node = new_node
mst[i].distance = new_reachability
current_node = new_node
return mst
cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst):
"""Construct a single-linkage tree from an MST.
Parameters
----------
mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
The MST representation of the mutual-reachability graph. The MST is
represented as a collection of edges.
Returns
-------
single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
The single-linkage tree (dendrogram) built from the MST. Each row
of the array represents the following:
- left node/cluster
- right node/cluster
- distance
- new cluster size
"""
cdef:
cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage
# Note mst.shape[0] is one fewer than the number of samples
int64_t n_samples = mst.shape[0] + 1
intp_t current_node_cluster, next_node_cluster
int64_t current_node, next_node, i
float64_t distance
UnionFind U = UnionFind(n_samples)
single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype)
for i in range(n_samples - 1):
current_node = mst[i].current_node
next_node = mst[i].next_node
distance = mst[i].distance
current_node_cluster = U.fast_find(current_node)
next_node_cluster = U.fast_find(next_node)
single_linkage[i].left_node = current_node_cluster
single_linkage[i].right_node = next_node_cluster
single_linkage[i].value = distance
single_linkage[i].cluster_size = U.size[current_node_cluster] + U.size[next_node_cluster]
U.union(current_node_cluster, next_node_cluster)
return single_linkage
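# A hedged sketch of how the structured output above maps onto the plain
# (n_samples - 1, 4) linkage matrix used by scipy.cluster.hierarchy (field
# order follows HIERARCHY_dtype; the helper name is an assumption):
#
# >>> import numpy as np
# >>> def to_scipy_linkage(single_linkage):
# ...     return np.column_stack([
# ...         single_linkage["left_node"],
# ...         single_linkage["right_node"],
# ...         single_linkage["value"],
# ...         single_linkage["cluster_size"],
# ...     ]).astype(np.float64)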

View File

@@ -0,0 +1,210 @@
# mutual reachability distance computations
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
cimport numpy as cnp
import numpy as np
from scipy.sparse import issparse
from cython cimport floating, integral
from libc.math cimport isfinite, INFINITY
from sklearn.utils._typedefs cimport intp_t
cnp.import_array()
def mutual_reachability_graph(
distance_matrix, min_samples=5, max_distance=0.0
):
"""Compute the weighted adjacency matrix of the mutual reachability graph.
The mutual reachability distance used to build the graph is defined as::
max(d_core(x_p), d_core(x_q), d(x_p, x_q))
and the core distance `d_core` is defined as the distance between a point
`x_p` and its k-th nearest neighbor.
Note that all computations are done in-place.
Parameters
----------
distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples)
Array of distances between samples. If sparse, the array must be in
`CSR` format.
min_samples : int, default=5
The parameter `k` used to calculate the distance between a point
`x_p` and its k-th nearest neighbor.
max_distance : float, default=0.0
The distance with which `np.inf` is replaced. When the true mutual-
reachability distance is measured to be infinite, it is instead
truncated to `max_distance`. Only used when `distance_matrix` is a sparse
matrix.
Returns
-------
mutual_reachability_graph : {ndarray, sparse matrix} of shape \
(n_samples, n_samples)
Weighted adjacency matrix of the mutual reachability graph.
References
----------
.. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
Density-based clustering based on hierarchical density estimates.
In Pacific-Asia Conference on Knowledge Discovery and Data Mining
(pp. 160-172). Springer Berlin Heidelberg.
"""
further_neighbor_idx = min_samples - 1
if issparse(distance_matrix):
if distance_matrix.format != "csr":
raise ValueError(
"Only sparse CSR matrices are supported for `distance_matrix`."
)
_sparse_mutual_reachability_graph(
distance_matrix.data,
distance_matrix.indices,
distance_matrix.indptr,
distance_matrix.shape[0],
further_neighbor_idx=further_neighbor_idx,
max_distance=max_distance,
)
else:
_dense_mutual_reachability_graph(
distance_matrix, further_neighbor_idx=further_neighbor_idx
)
return distance_matrix
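# A hedged NumPy sketch of the dense computation (illustrative only; the
# helpers below operate in-place in Cython for efficiency):
#
# >>> import numpy as np
# >>> D = np.array([[0., 1., 4.], [1., 0., 2.], [4., 2., 0.]])
# >>> min_samples = 2
# >>> core = np.partition(D, min_samples - 1, axis=1)[:, min_samples - 1]
# >>> np.maximum(np.maximum.outer(core, core), D)  # mutual reachability matrix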
def _dense_mutual_reachability_graph(
floating[:, :] distance_matrix,
intp_t further_neighbor_idx,
):
"""Dense implementation of mutual reachability graph.
The computation is done in-place, i.e. the distance matrix is modified
directly.
Parameters
----------
distance_matrix : ndarray of shape (n_samples, n_samples)
Array of distances between samples.
further_neighbor_idx : int
The index of the furthest neighbor to use to define the core distances.
"""
cdef:
intp_t i, j, n_samples = distance_matrix.shape[0]
floating mutual_reachability_distance
floating[::1] core_distances
# We assume that the distance matrix is symmetric. We choose to sort every
# row so that the implementation matches the sparse case, which requires a
# CSR matrix.
core_distances = np.ascontiguousarray(
np.partition(
distance_matrix, further_neighbor_idx, axis=1
)[:, further_neighbor_idx]
)
with nogil:
# TODO: Update w/ prange with thread count based on
# _openmp_effective_n_threads
for i in range(n_samples):
for j in range(n_samples):
mutual_reachability_distance = max(
core_distances[i],
core_distances[j],
distance_matrix[i, j],
)
distance_matrix[i, j] = mutual_reachability_distance
def _sparse_mutual_reachability_graph(
cnp.ndarray[floating, ndim=1, mode="c"] data,
cnp.ndarray[integral, ndim=1, mode="c"] indices,
cnp.ndarray[integral, ndim=1, mode="c"] indptr,
intp_t n_samples,
intp_t further_neighbor_idx,
floating max_distance,
):
"""Sparse implementation of mutual reachability graph.
The computation is done in-place, i.e. the distance matrix is modified
directly. This implementation only accepts `CSR` format sparse matrices.
Parameters
----------
distance_matrix : sparse matrix of shape (n_samples, n_samples)
Sparse matrix of distances between samples. The sparse format should
be `CSR`.
further_neighbor_idx : int
The index of the furthest neighbor to use to define the core distances.
max_distance : float
The distance with which `np.inf` is replaced. When the true mutual-
reachability distance is measured to be infinite, it is instead
truncated to `max_distance`. Only used when `distance_matrix` is a sparse
matrix.
"""
cdef:
integral i, col_ind, row_ind
floating mutual_reachability_distance
floating[:] core_distances
floating[:] row_data
if floating is float:
dtype = np.float32
else:
dtype = np.float64
core_distances = np.empty(n_samples, dtype=dtype)
for i in range(n_samples):
row_data = data[indptr[i]:indptr[i + 1]]
if further_neighbor_idx < row_data.size:
core_distances[i] = np.partition(
row_data, further_neighbor_idx
)[further_neighbor_idx]
else:
core_distances[i] = INFINITY
with nogil:
for row_ind in range(n_samples):
for i in range(indptr[row_ind], indptr[row_ind + 1]):
col_ind = indices[i]
mutual_reachability_distance = max(
core_distances[row_ind], core_distances[col_ind], data[i]
)
if isfinite(mutual_reachability_distance):
data[i] = mutual_reachability_distance
elif max_distance > 0:
data[i] = max_distance

View File

@@ -0,0 +1,49 @@
# Copyright (c) 2015, Leland McInnes
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from sklearn.utils._typedefs cimport intp_t, float64_t, uint8_t
cimport numpy as cnp
# This corresponds to the scipy.cluster.hierarchy format
ctypedef packed struct HIERARCHY_t:
intp_t left_node
intp_t right_node
float64_t value
intp_t cluster_size
# Effectively an edgelist encoding a parent/child pair, along with a value and
# the corresponding cluster_size in each row providing a tree structure.
ctypedef packed struct CONDENSED_t:
intp_t parent
intp_t child
float64_t value
intp_t cluster_size
cdef extern from "numpy/arrayobject.h":
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)

View File

@@ -0,0 +1,799 @@
# Tree handling (condensing, finding stable clusters) for hdbscan
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
cimport numpy as cnp
from libc.math cimport isinf
import cython
import numpy as np
cnp.import_array()
cdef extern from "numpy/arrayobject.h":
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
cdef cnp.float64_t INFTY = np.inf
cdef cnp.intp_t NOISE = -1
HIERARCHY_dtype = np.dtype([
("left_node", np.intp),
("right_node", np.intp),
("value", np.float64),
("cluster_size", np.intp),
])
CONDENSED_dtype = np.dtype([
("parent", np.intp),
("child", np.intp),
("value", np.float64),
("cluster_size", np.intp),
])
cpdef tuple tree_to_labels(
const HIERARCHY_t[::1] single_linkage_tree,
cnp.intp_t min_cluster_size=10,
cluster_selection_method="eom",
bint allow_single_cluster=False,
cnp.float64_t cluster_selection_epsilon=0.0,
max_cluster_size=None,
):
cdef:
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities
condensed_tree = _condense_tree(single_linkage_tree, min_cluster_size)
labels, probabilities = _get_clusters(
condensed_tree,
_compute_stability(condensed_tree),
cluster_selection_method,
allow_single_cluster,
cluster_selection_epsilon,
max_cluster_size,
)
return (labels, probabilities)
cdef list bfs_from_hierarchy(
const HIERARCHY_t[::1] hierarchy,
cnp.intp_t bfs_root
):
"""
Perform a breadth first search on a tree in scipy hclust format.
"""
cdef list process_queue, next_queue, result
cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1
cdef cnp.intp_t node
process_queue = [bfs_root]
result = []
while process_queue:
result.extend(process_queue)
# By construction, node i is formed by the union of nodes
# hierarchy[i - n_samples, 0] and hierarchy[i - n_samples, 1]
process_queue = [
x - n_samples
for x in process_queue
if x >= n_samples
]
if process_queue:
next_queue = []
for node in process_queue:
next_queue.extend(
[
hierarchy[node].left_node,
hierarchy[node].right_node,
]
)
process_queue = next_queue
return result
cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
const HIERARCHY_t[::1] hierarchy,
cnp.intp_t min_cluster_size=10
):
"""Condense a tree according to a minimum cluster size. This is akin
to the runt pruning procedure of Stuetzle. The result is a much simpler
tree that is easier to visualize. We include extra information on the
lambda value at which individual points depart clusters for later
analysis and computation.
Parameters
----------
hierarchy : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype
A single linkage hierarchy in scipy.cluster.hierarchy format.
min_cluster_size : int, optional (default 10)
The minimum size of clusters to consider. Clusters smaller than this
are pruned from the tree.
Returns
-------
condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
Effectively an edgelist encoding a parent/child pair, along with a
value and the corresponding cluster_size in each row providing a tree
structure.
"""
cdef:
cnp.intp_t root = 2 * hierarchy.shape[0]
cnp.intp_t n_samples = hierarchy.shape[0] + 1
cnp.intp_t next_label = n_samples + 1
list result_list, node_list = bfs_from_hierarchy(hierarchy, root)
cnp.intp_t[::1] relabel
cnp.uint8_t[::1] ignore
cnp.intp_t node, sub_node, left, right
cnp.float64_t lambda_value, distance
cnp.intp_t left_count, right_count
HIERARCHY_t children
relabel = np.empty(root + 1, dtype=np.intp)
relabel[root] = n_samples
result_list = []
ignore = np.zeros(len(node_list), dtype=bool)
for node in node_list:
if ignore[node] or node < n_samples:
continue
children = hierarchy[node - n_samples]
left = children.left_node
right = children.right_node
distance = children.value
if distance > 0.0:
lambda_value = 1.0 / distance
else:
lambda_value = INFTY
if left >= n_samples:
left_count = hierarchy[left - n_samples].cluster_size
else:
left_count = 1
if right >= n_samples:
right_count = hierarchy[right - n_samples].cluster_size
else:
right_count = 1
if left_count >= min_cluster_size and right_count >= min_cluster_size:
relabel[left] = next_label
next_label += 1
result_list.append(
(relabel[node], relabel[left], lambda_value, left_count)
)
relabel[right] = next_label
next_label += 1
result_list.append(
(relabel[node], relabel[right], lambda_value, right_count)
)
elif left_count < min_cluster_size and right_count < min_cluster_size:
for sub_node in bfs_from_hierarchy(hierarchy, left):
if sub_node < n_samples:
result_list.append(
(relabel[node], sub_node, lambda_value, 1)
)
ignore[sub_node] = True
for sub_node in bfs_from_hierarchy(hierarchy, right):
if sub_node < n_samples:
result_list.append(
(relabel[node], sub_node, lambda_value, 1)
)
ignore[sub_node] = True
elif left_count < min_cluster_size:
relabel[right] = relabel[node]
for sub_node in bfs_from_hierarchy(hierarchy, left):
if sub_node < n_samples:
result_list.append(
(relabel[node], sub_node, lambda_value, 1)
)
ignore[sub_node] = True
else:
relabel[left] = relabel[node]
for sub_node in bfs_from_hierarchy(hierarchy, right):
if sub_node < n_samples:
result_list.append(
(relabel[node], sub_node, lambda_value, 1)
)
ignore[sub_node] = True
return np.array(result_list, dtype=CONDENSED_dtype)
cdef dict _compute_stability(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
):
cdef:
cnp.float64_t[::1] result, births
cnp.intp_t[:] parents = condensed_tree['parent']
cnp.intp_t parent, cluster_size, result_index, idx
cnp.float64_t lambda_val
CONDENSED_t condensed_node
cnp.intp_t largest_child = condensed_tree['child'].max()
cnp.intp_t smallest_cluster = np.min(parents)
cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1
dict stability_dict = {}
largest_child = max(largest_child, smallest_cluster)
births = np.full(largest_child + 1, np.nan, dtype=np.float64)
for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
condensed_node = condensed_tree[idx]
births[condensed_node.child] = condensed_node.value
births[smallest_cluster] = 0.0
result = np.zeros(num_clusters, dtype=np.float64)
for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
condensed_node = condensed_tree[idx]
parent = condensed_node.parent
lambda_val = condensed_node.value
cluster_size = condensed_node.cluster_size
result_index = parent - smallest_cluster
result[result_index] += (lambda_val - births[parent]) * cluster_size
for idx in range(num_clusters):
stability_dict[idx + smallest_cluster] = result[idx]
return stability_dict
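# A hedged pure-Python sketch of the stability computed above: for each
# cluster c, stability(c) is the sum over rows with parent == c of
# (value - birth_lambda[c]) * cluster_size, where birth_lambda[c] is the value
# at which c itself appears as a child (names below are illustrative):
#
# >>> def stability_py(tree):
# ...     births = {child: val for child, val in zip(tree["child"], tree["value"])}
# ...     out = {}
# ...     for parent, val, size in zip(tree["parent"], tree["value"], tree["cluster_size"]):
# ...         out[parent] = out.get(parent, 0.0) + (val - births.get(parent, 0.0)) * size
# ...     return out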
cdef list bfs_from_cluster_tree(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
cnp.intp_t bfs_root
):
cdef:
list result = []
cnp.ndarray[cnp.intp_t, ndim=1] process_queue = (
np.array([bfs_root], dtype=np.intp)
)
cnp.ndarray[cnp.intp_t, ndim=1] children = condensed_tree['child']
cnp.intp_t[:] parents = condensed_tree['parent']
while len(process_queue) > 0:
result.extend(process_queue.tolist())
process_queue = children[np.isin(parents, process_queue)]
return result
cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree):
cdef:
cnp.intp_t parent, current_parent, idx
cnp.float64_t lambda_val, max_lambda
cnp.float64_t[::1] deaths
cnp.intp_t largest_parent = condensed_tree['parent'].max()
deaths = np.zeros(largest_parent + 1, dtype=np.float64)
current_parent = condensed_tree[0].parent
max_lambda = condensed_tree[0].value
for idx in range(1, PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
parent = condensed_tree[idx].parent
lambda_val = condensed_tree[idx].value
if parent == current_parent:
max_lambda = max(max_lambda, lambda_val)
else:
deaths[current_parent] = max_lambda
current_parent = parent
max_lambda = lambda_val
deaths[current_parent] = max_lambda # value for last parent
return deaths
@cython.final
cdef class TreeUnionFind:
cdef cnp.intp_t[:, ::1] data
cdef cnp.uint8_t[::1] is_component
def __init__(self, size):
cdef cnp.intp_t idx
self.data = np.zeros((size, 2), dtype=np.intp)
for idx in range(size):
self.data[idx, 0] = idx
self.is_component = np.ones(size, dtype=np.uint8)
cdef void union(self, cnp.intp_t x, cnp.intp_t y):
cdef cnp.intp_t x_root = self.find(x)
cdef cnp.intp_t y_root = self.find(y)
if self.data[x_root, 1] < self.data[y_root, 1]:
self.data[x_root, 0] = y_root
elif self.data[x_root, 1] > self.data[y_root, 1]:
self.data[y_root, 0] = x_root
else:
self.data[y_root, 0] = x_root
self.data[x_root, 1] += 1
return
cdef cnp.intp_t find(self, cnp.intp_t x):
if self.data[x, 0] != x:
self.data[x, 0] = self.find(self.data[x, 0])
self.is_component[x] = False
return self.data[x, 0]
cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
const HIERARCHY_t[::1] linkage,
cnp.float64_t cut,
cnp.intp_t min_cluster_size
):
"""Given a single linkage tree and a cut value, return the
vector of cluster labels at that cut value. This is useful
for Robust Single Linkage, and extracting DBSCAN results
from a single HDBSCAN run.
Parameters
----------
linkage : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype
The single linkage tree in scipy.cluster.hierarchy format.
cut : double
The cut value at which to find clusters.
min_cluster_size : int
The minimum cluster size; clusters below this size at
the cut will be considered noise.
Returns
-------
labels : ndarray of shape (n_samples,)
The cluster labels for each point in the data set;
a label of -1 denotes a noise assignment.
"""
cdef:
cnp.intp_t n, cluster, root, n_samples, cluster_label
cnp.intp_t[::1] unique_labels, cluster_size
cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
TreeUnionFind union_find
dict cluster_label_map
HIERARCHY_t node
root = 2 * linkage.shape[0]
n_samples = root // 2 + 1
result = np.empty(n_samples, dtype=np.intp)
union_find = TreeUnionFind(root + 1)
cluster = n_samples
for node in linkage:
if node.value < cut:
union_find.union(node.left_node, cluster)
union_find.union(node.right_node, cluster)
cluster += 1
cluster_size = np.zeros(cluster, dtype=np.intp)
for n in range(n_samples):
cluster = union_find.find(n)
cluster_size[cluster] += 1
result[n] = cluster
cluster_label_map = {-1: NOISE}
cluster_label = 0
unique_labels = np.unique(result)
for cluster in unique_labels:
if cluster_size[cluster] < min_cluster_size:
cluster_label_map[cluster] = NOISE
else:
cluster_label_map[cluster] = cluster_label
cluster_label += 1
for n in range(n_samples):
result[n] = cluster_label_map[result[n]]
return result
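# A hedged sketch of the public entry point that builds on this routine
# (assumed here to be sklearn.cluster.HDBSCAN.dbscan_clustering; the data and
# parameters below are illustrative assumptions):
#
# >>> import numpy as np
# >>> from sklearn.cluster import HDBSCAN
# >>> X = np.array([[1., 2.], [2., 2.], [2., 3.], [8., 7.], [8., 8.], [25., 80.]])
# >>> hdb = HDBSCAN(min_cluster_size=2).fit(X)
# >>> hdb.dbscan_clustering(cut_distance=3.0, min_cluster_size=2)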
cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
set clusters,
dict cluster_label_map,
cnp.intp_t allow_single_cluster,
cnp.float64_t cluster_selection_epsilon
):
"""Given a condensed tree, clusters and a labeling map for the clusters,
return an array containing the labels of each point based on cluster
membership. Note that this is where points may be marked as noisy
outliers. Whether some points are marked as noise in large, single-
cluster datasets is controlled by the `allow_single_cluster` and
`cluster_selection_epsilon` parameters.
Parameters
----------
condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
Effectively an edgelist encoding a parent/child pair, along with a
value and the corresponding cluster_size in each row providing a tree
structure.
clusters : set
The set of nodes corresponding to identified clusters. These node
values should be the same as those present in `condensed_tree`.
cluster_label_map : dict
A mapping from the node values present in `clusters` to the labels
which will be returned.
Returns
-------
labels : ndarray of shape (n_samples,)
The cluster labels for each point in the data set;
a label of -1 denotes a noise assignment.
"""
cdef:
cnp.intp_t root_cluster
cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array
cnp.ndarray[cnp.float64_t, ndim=1] lambda_array
TreeUnionFind union_find
cnp.intp_t n, parent, child, cluster
cnp.float64_t threshold
child_array = condensed_tree['child']
parent_array = condensed_tree['parent']
lambda_array = condensed_tree['value']
root_cluster = np.min(parent_array)
result = np.empty(root_cluster, dtype=np.intp)
union_find = TreeUnionFind(np.max(parent_array) + 1)
for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
child = child_array[n]
parent = parent_array[n]
if child not in clusters:
union_find.union(parent, child)
for n in range(root_cluster):
cluster = union_find.find(n)
label = NOISE
if cluster != root_cluster:
label = cluster_label_map[cluster]
elif len(clusters) == 1 and allow_single_cluster:
# There can only be one edge with this particular child hence this
# expression extracts a unique, scalar lambda value.
parent_lambda = lambda_array[child_array == n]
if cluster_selection_epsilon != 0.0:
threshold = 1 / cluster_selection_epsilon
else:
# The threshold should be calculated per-sample based on the
# largest lambda of any sibling node.
threshold = lambda_array[parent_array == cluster].max()
if parent_lambda >= threshold:
label = cluster_label_map[cluster]
result[n] = label
return result
cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
dict cluster_map,
cnp.intp_t[::1] labels
):
cdef:
cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result
cnp.float64_t[:] lambda_array
cnp.float64_t[::1] deaths
cnp.intp_t[:] child_array, parent_array
cnp.intp_t root_cluster, n, point, cluster_num, cluster
cnp.float64_t max_lambda, lambda_val
child_array = condensed_tree['child']
parent_array = condensed_tree['parent']
lambda_array = condensed_tree['value']
result = np.zeros(labels.shape[0])
deaths = max_lambdas(condensed_tree)
root_cluster = np.min(parent_array)
for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
point = child_array[n]
if point >= root_cluster:
continue
cluster_num = labels[point]
if cluster_num == -1:
continue
cluster = cluster_map[cluster_num]
max_lambda = deaths[cluster]
if max_lambda == 0.0 or isinf(lambda_array[n]):
result[point] = 1.0
else:
lambda_val = min(lambda_array[n], max_lambda)
result[point] = lambda_val / max_lambda
return result
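# --- Illustrative sketch, not part of this file: membership strength scaling ---
# get_probabilities above scales each point's lambda by the maximum ("death") lambda of
# its cluster, so probabilities land in [0, 1]; infinite lambdas (and clusters whose
# death lambda is 0) map to 1.0. A toy NumPy version of that rule:
import numpy as np

lambdas = np.array([0.5, 2.0, 4.0, np.inf])  # hypothetical per-point lambda values
max_lambda = 4.0                             # hypothetical death lambda of the cluster
probs = np.where(np.isinf(lambdas), 1.0, np.minimum(lambdas, max_lambda) / max_lambda)
# probs -> array([0.125, 0.5, 1.0, 1.0])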
cpdef list recurse_leaf_dfs(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
cnp.intp_t current_node
):
cdef cnp.intp_t[:] children
cdef cnp.intp_t child
children = cluster_tree[cluster_tree['parent'] == current_node]['child']
if children.shape[0] == 0:
return [current_node,]
else:
return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], [])
cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree):
cdef cnp.intp_t root
if PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] == 0:
return []
root = cluster_tree['parent'].min()
return recurse_leaf_dfs(cluster_tree, root)
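# --- Illustrative sketch, not part of this file: leaf extraction by DFS ---
# recurse_leaf_dfs / get_cluster_tree_leaves walk the cluster tree and collect the
# nodes that have no child clusters. The same idea on a plain dict-of-children toy tree:
def toy_leaf_dfs(children, node):
    # children: dict mapping a node to the list of its child cluster nodes
    kids = children.get(node, [])
    if not kids:
        return [node]
    return sum((toy_leaf_dfs(children, kid) for kid in kids), [])

toy_children = {0: [1, 2], 2: [3, 4]}  # hypothetical cluster tree rooted at 0
# toy_leaf_dfs(toy_children, 0) -> [1, 3, 4]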
cdef cnp.intp_t traverse_upwards(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
cnp.float64_t cluster_selection_epsilon,
cnp.intp_t leaf,
cnp.intp_t allow_single_cluster
):
cdef cnp.intp_t root, parent
cdef cnp.float64_t parent_eps
root = cluster_tree['parent'].min()
parent = cluster_tree[cluster_tree['child'] == leaf]['parent']
if parent == root:
if allow_single_cluster:
return parent
else:
return leaf # return node closest to root
parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value']
if parent_eps > cluster_selection_epsilon:
return parent
else:
return traverse_upwards(
cluster_tree,
cluster_selection_epsilon,
parent,
allow_single_cluster
)
cdef set epsilon_search(
set leaves,
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
cnp.float64_t cluster_selection_epsilon,
cnp.intp_t allow_single_cluster
):
cdef:
list selected_clusters = list()
list processed = list()
cnp.intp_t leaf, epsilon_child, sub_node
cnp.float64_t eps
cnp.uint8_t[:] leaf_nodes
cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child']
cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value']
for leaf in leaves:
leaf_nodes = children == leaf
eps = 1 / distances[leaf_nodes][0]
if eps < cluster_selection_epsilon:
if leaf not in processed:
epsilon_child = traverse_upwards(
cluster_tree,
cluster_selection_epsilon,
leaf,
allow_single_cluster
)
selected_clusters.append(epsilon_child)
for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
if sub_node != epsilon_child:
processed.append(sub_node)
else:
selected_clusters.append(leaf)
return set(selected_clusters)
@cython.wraparound(True)
cdef tuple _get_clusters(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
dict stability,
cluster_selection_method='eom',
cnp.uint8_t allow_single_cluster=False,
cnp.float64_t cluster_selection_epsilon=0.0,
max_cluster_size=None
):
"""Given a tree and stability dict, produce the cluster labels
(and probabilities) for a flat clustering based on the chosen
cluster selection method.
Parameters
----------
condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
Effectively an edgelist encoding a parent/child pair, along with a
value and the corresponding cluster_size in each row providing a tree
structure.
stability : dict
A dictionary mapping cluster_ids to stability values
cluster_selection_method : string, optional (default 'eom')
The method of selecting clusters. The default is the
Excess of Mass algorithm specified by 'eom'. The alternate
option is 'leaf'.
allow_single_cluster : boolean, optional (default False)
Whether to allow a single cluster to be selected by the
Excess of Mass algorithm.
cluster_selection_epsilon: double, optional (default 0.0)
A distance threshold for cluster splits.
max_cluster_size: int, default=None
The maximum size for clusters located by the EOM clusterer. Can
be overridden by the cluster_selection_epsilon parameter in
rare cases.
Returns
-------
labels : ndarray of shape (n_samples,)
An integer array of cluster labels, with -1 denoting noise.
probabilities : ndarray (n_samples,)
The cluster membership strength of each sample.
stabilities : ndarray (n_clusters,)
The cluster coherence strengths of each cluster.
"""
cdef:
list node_list
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree
cnp.uint8_t[::1] child_selection
cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
dict is_cluster, cluster_sizes
cnp.float64_t subtree_stability
cnp.intp_t node, sub_node, cluster, n_samples
cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs
# Assume clusters are ordered by numeric id equivalent to
# a topological sort of the tree; This is valid given the
# current implementation above, so don't change that ... or
# if you do, change this accordingly!
if allow_single_cluster:
node_list = sorted(stability.keys(), reverse=True)
else:
node_list = sorted(stability.keys(), reverse=True)[:-1]
# (exclude root)
cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1]
is_cluster = {cluster: True for cluster in node_list}
n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1
if max_cluster_size is None:
max_cluster_size = n_samples + 1 # Set to a value that will never be triggered
cluster_sizes = {
child: cluster_size for child, cluster_size
in zip(cluster_tree['child'], cluster_tree['cluster_size'])
}
if allow_single_cluster:
# Compute cluster size for the root node
cluster_sizes[node_list[-1]] = np.sum(
cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size'])
if cluster_selection_method == 'eom':
for node in node_list:
child_selection = (cluster_tree['parent'] == node)
subtree_stability = np.sum([
stability[child] for
child in cluster_tree['child'][child_selection]])
if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size:
is_cluster[node] = False
stability[node] = subtree_stability
else:
for sub_node in bfs_from_cluster_tree(cluster_tree, node):
if sub_node != node:
is_cluster[sub_node] = False
if cluster_selection_epsilon != 0.0 and PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] > 0:
eom_clusters = [c for c in is_cluster if is_cluster[c]]
selected_clusters = []
# first check if eom_clusters only has root node, which skips epsilon check.
if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()):
if allow_single_cluster:
selected_clusters = eom_clusters
else:
selected_clusters = epsilon_search(
set(eom_clusters),
cluster_tree,
cluster_selection_epsilon,
allow_single_cluster
)
for c in is_cluster:
if c in selected_clusters:
is_cluster[c] = True
else:
is_cluster[c] = False
elif cluster_selection_method == 'leaf':
leaves = set(get_cluster_tree_leaves(cluster_tree))
if len(leaves) == 0:
for c in is_cluster:
is_cluster[c] = False
is_cluster[condensed_tree['parent'].min()] = True
if cluster_selection_epsilon != 0.0:
selected_clusters = epsilon_search(
leaves,
cluster_tree,
cluster_selection_epsilon,
allow_single_cluster
)
else:
selected_clusters = leaves
for c in is_cluster:
if c in selected_clusters:
is_cluster[c] = True
else:
is_cluster[c] = False
clusters = {c for c in is_cluster if is_cluster[c]}
cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
reverse_cluster_map = {n: c for c, n in cluster_map.items()}
labels = _do_labelling(
condensed_tree,
clusters,
cluster_map,
allow_single_cluster,
cluster_selection_epsilon
)
probs = get_probabilities(condensed_tree, reverse_cluster_map, labels)
return (labels, probs)
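# --- Illustrative usage, not part of this file ---
# The selection method, epsilon threshold and single-cluster flag handled by
# _get_clusters above are exposed through the public HDBSCAN estimator. A standalone
# usage sketch on synthetic data (parameter values are arbitrary examples):
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

X_toy, _ = make_blobs(n_samples=200, centers=3, random_state=0)
toy_model = HDBSCAN(
    min_cluster_size=10,
    cluster_selection_method="eom",   # or "leaf"
    cluster_selection_epsilon=0.0,
    allow_single_cluster=False,
)
toy_labels = toy_model.fit_predict(X_toy)  # -1 marks noise points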

View File

@@ -0,0 +1,15 @@
cluster_hdbscan_extension_metadata = {
'_linkage': {'sources': [cython_gen.process('_linkage.pyx'), metrics_cython_tree]},
'_reachability': {'sources': [cython_gen.process('_reachability.pyx')]},
'_tree': {'sources': [cython_gen.process('_tree.pyx')]}
}
foreach ext_name, ext_dict : cluster_hdbscan_extension_metadata
py.extension_module(
ext_name,
ext_dict.get('sources'),
dependencies: [np_dep],
subdir: 'sklearn/cluster/_hdbscan',
install: true
)
endforeach
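# --- Illustrative check, not part of this file ---
# After an in-place/editable build of scikit-learn, the three extensions declared
# above should be importable; a minimal Python smoke test (assumes the build worked):
#
#     from sklearn.cluster._hdbscan import _linkage, _reachability, _tree  # noqa: F401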

View File

@@ -0,0 +1,63 @@
import numpy as np
import pytest
from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
from sklearn.utils._testing import (
_convert_container,
assert_allclose,
)
def test_mutual_reachability_graph_error_sparse_format():
"""Check that we raise an error if the sparse format is not CSR."""
rng = np.random.RandomState(0)
X = rng.randn(10, 10)
X = X.T @ X
np.fill_diagonal(X, 0.0)
X = _convert_container(X, "sparse_csc")
err_msg = "Only sparse CSR matrices are supported"
with pytest.raises(ValueError, match=err_msg):
mutual_reachability_graph(X)
@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
def test_mutual_reachability_graph_inplace(array_type):
"""Check that the operation is happening inplace."""
rng = np.random.RandomState(0)
X = rng.randn(10, 10)
X = X.T @ X
np.fill_diagonal(X, 0.0)
X = _convert_container(X, array_type)
mr_graph = mutual_reachability_graph(X)
assert id(mr_graph) == id(X)
def test_mutual_reachability_graph_equivalence_dense_sparse():
"""Check that we get the same results for dense and sparse implementation."""
rng = np.random.RandomState(0)
X = rng.randn(5, 5)
X_dense = X.T @ X
X_sparse = _convert_container(X_dense, "sparse_csr")
mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3)
mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3)
assert_allclose(mr_graph_dense, mr_graph_sparse.toarray())
@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_mutual_reachability_graph_preserves_dtype(array_type, dtype):
"""Check that the computation preserve dtype thanks to fused types."""
rng = np.random.RandomState(0)
X = rng.randn(10, 10)
X = (X.T @ X).astype(dtype)
np.fill_diagonal(X, 0.0)
X = _convert_container(X, array_type)
assert X.dtype == dtype
mr_graph = mutual_reachability_graph(X)
assert mr_graph.dtype == dtype
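# --- Illustrative sketch, not part of this file: what mutual reachability computes ---
# For points i and j with core distances core_i, core_j (distance to their
# min_samples-th neighbour) and pairwise distance d(i, j), the mutual reachability
# distance is max(core_i, core_j, d(i, j)). A toy NumPy version of that definition
# (diagonal handling simplified to zero):
def toy_mutual_reachability(distance, core):
    mr = np.maximum(np.maximum.outer(core, core), distance)
    np.fill_diagonal(mr, 0.0)
    return mr

# toy_mutual_reachability(np.array([[0.0, 1.0], [1.0, 0.0]]), np.array([0.5, 2.0]))
# -> [[0., 2.], [2., 0.]]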

View File

@@ -0,0 +1,9 @@
from sklearn.utils._typedefs cimport intp_t
cdef class UnionFind:
cdef intp_t next_label
cdef intp_t[:] parent
cdef intp_t[:] size
cdef void union(self, intp_t m, intp_t n) noexcept
cdef intp_t fast_find(self, intp_t n) noexcept

View File

@@ -0,0 +1,507 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
cimport cython
from sklearn.metrics._dist_metrics cimport DistanceMetric64
from sklearn.utils._fast_dict cimport IntFloatDict
from sklearn.utils._typedefs cimport float64_t, intp_t, uint8_t
# C++
from cython.operator cimport dereference as deref, preincrement as inc
from libcpp.map cimport map as cpp_map
from libc.math cimport fmax, INFINITY
###############################################################################
# Utilities for computing the ward momentum
def compute_ward_dist(
const float64_t[::1] m_1,
const float64_t[:, ::1] m_2,
const intp_t[::1] coord_row,
const intp_t[::1] coord_col,
float64_t[::1] res
):
cdef intp_t size_max = coord_row.shape[0]
cdef intp_t n_features = m_2.shape[1]
cdef intp_t i, j, row, col
cdef float64_t pa, n
for i in range(size_max):
row = coord_row[i]
col = coord_col[i]
n = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col])
pa = 0.
for j in range(n_features):
pa += (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2
res[i] = pa * n
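# --- Illustrative sketch, not part of this file: the Ward quantity above in NumPy ---
# For two clusters with sizes m_1[row], m_1[col] and coordinate sums m_2[row], m_2[col],
# the value accumulated above is n * ||m_2[row]/m_1[row] - m_2[col]/m_1[col]||^2 with
# n = m_1[row] * m_1[col] / (m_1[row] + m_1[col]), i.e. the squared distance between the
# two cluster means weighted by their size-dependent factor.
def toy_ward_dist(size_a, sum_a, size_b, sum_b):
    n = size_a * size_b / (size_a + size_b)
    diff = sum_a / size_a - sum_b / size_b
    return n * np.dot(diff, diff)

# toy_ward_dist(2.0, np.array([2.0, 2.0]), 1.0, np.array([4.0, 4.0])) -> 12.0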
###############################################################################
# Utilities for cutting and exploring a hierarchical tree
def _hc_get_descendent(intp_t node, children, intp_t n_leaves):
"""
Function returning all the descendent leaves of a set of nodes in the tree.
Parameters
----------
node : integer
The node for which we want the descendents.
children : list of pairs, length n_nodes
The children of each non-leaf node. Values less than `n_samples` refer
to leaves of the tree. A greater value `i` indicates a node with
children `children[i - n_samples]`.
n_leaves : integer
Number of leaves.
Returns
-------
descendent : list of int
"""
ind = [node]
if node < n_leaves:
return ind
descendent = []
# It is actually faster to do the accounting of the number of
# elements in the list ourselves: len is a lengthy operation on a
# chained list
cdef intp_t i, n_indices = 1
while n_indices:
i = ind.pop()
if i < n_leaves:
descendent.append(i)
n_indices -= 1
else:
ind.extend(children[i - n_leaves])
n_indices += 1
return descendent
def hc_get_heads(intp_t[:] parents, copy=True):
"""Returns the heads of the forest, as defined by parents.
Parameters
----------
parents : array of integers
The parent structure defining the forest (ensemble of trees)
copy : boolean
If copy is False, the input 'parents' array is modified inplace
Returns
-------
heads : array of integers of same shape as parents
The indices in the 'parents' of the tree heads
"""
cdef intp_t parent, node0, node, size
if copy:
parents = np.copy(parents)
size = parents.size
# Start from the top of the tree and go down
for node0 in range(size - 1, -1, -1):
node = node0
parent = parents[node]
while parent != node:
parents[node0] = parent
node = parent
parent = parents[node]
return parents
def _get_parents(
nodes,
heads,
const intp_t[:] parents,
uint8_t[::1] not_visited
):
"""Returns the heads of the given nodes, as defined by parents.
Modifies 'heads' and 'not_visited' in-place.
Parameters
----------
nodes : list of integers
The nodes to start from
heads : list of integers
A list to hold the results (modified inplace)
parents : array of integers
The parent structure defining the tree
not_visited
The tree nodes to consider (modified inplace)
"""
cdef intp_t parent, node
for node in nodes:
parent = parents[node]
while parent != node:
node = parent
parent = parents[node]
if not_visited[node]:
not_visited[node] = 0
heads.append(node)
###############################################################################
# merge strategies implemented on IntFloatDicts
# These are used in the hierarchical clustering code, to implement
# merging between two clusters, defined as a dict containing node number
# as keys and edge weights as values.
def max_merge(
IntFloatDict a,
IntFloatDict b,
const intp_t[:] mask,
intp_t n_a,
intp_t n_b
):
"""Merge two IntFloatDicts with the max strategy: when the same key is
present in the two dicts, the max of the two values is used.
Parameters
==========
a, b : IntFloatDict object
The IntFloatDicts to merge
mask : ndarray array of dtype integer and of dimension 1
a mask for keys to ignore: if not mask[key] the corresponding key
is skipped in the output dictionary
n_a, n_b : int
n_a and n_b are weights for a and b for the merge strategy.
They are not used in the case of a max merge.
Returns
=======
out : IntFloatDict object
The IntFloatDict resulting from the merge
"""
cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
cdef intp_t key
cdef float64_t value
# First copy a into out
while a_it != a_end:
key = deref(a_it).first
if mask[key]:
out_obj.my_map[key] = deref(a_it).second
inc(a_it)
# Then merge b into out
cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
while b_it != b_end:
key = deref(b_it).first
value = deref(b_it).second
if mask[key]:
out_it = out_obj.my_map.find(key)
if out_it == out_end:
# Key not found
out_obj.my_map[key] = value
else:
deref(out_it).second = fmax(deref(out_it).second, value)
inc(b_it)
return out_obj
def average_merge(
IntFloatDict a,
IntFloatDict b,
const intp_t[:] mask,
intp_t n_a,
intp_t n_b
):
"""Merge two IntFloatDicts with the average strategy: when the
same key is present in the two dicts, the weighted average of the two
values is used.
Parameters
==========
a, b : IntFloatDict object
The IntFloatDicts to merge
mask : ndarray array of dtype integer and of dimension 1
a mask for keys to ignore: if not mask[key] the corresponding key
is skipped in the output dictionary
n_a, n_b : int
n_a and n_b are weights for a and b for the merge strategy.
They are used for a weighted mean.
Returns
=======
out : IntFloatDict object
The IntFloatDict resulting from the merge
"""
cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
cdef intp_t key
cdef float64_t value
cdef float64_t n_out = <float64_t> (n_a + n_b)
# First copy a into out
while a_it != a_end:
key = deref(a_it).first
if mask[key]:
out_obj.my_map[key] = deref(a_it).second
inc(a_it)
# Then merge b into out
cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
while b_it != b_end:
key = deref(b_it).first
value = deref(b_it).second
if mask[key]:
out_it = out_obj.my_map.find(key)
if out_it == out_end:
# Key not found
out_obj.my_map[key] = value
else:
deref(out_it).second = (n_a * deref(out_it).second
+ n_b * value) / n_out
inc(b_it)
return out_obj
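# --- Illustrative sketch, not part of this file: the two merge rules on plain dicts ---
# max_merge / average_merge above combine two weighted adjacency maps; on ordinary
# Python dicts the same rules look like this (the `mask` filtering is omitted):
def toy_max_merge(a, b):
    out = dict(a)
    for key, value in b.items():
        out[key] = max(out[key], value) if key in out else value
    return out

def toy_average_merge(a, b, n_a, n_b):
    out = dict(a)
    for key, value in b.items():
        out[key] = (n_a * out[key] + n_b * value) / (n_a + n_b) if key in out else value
    return out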
###############################################################################
# An edge object for fast comparisons
cdef class WeightedEdge:
cdef public intp_t a
cdef public intp_t b
cdef public float64_t weight
def __init__(self, float64_t weight, intp_t a, intp_t b):
self.weight = weight
self.a = a
self.b = b
def __richcmp__(self, WeightedEdge other, int op):
"""Cython-specific comparison method.
op is the comparison code::
< 0
== 2
> 4
<= 1
!= 3
>= 5
"""
if op == 0:
return self.weight < other.weight
elif op == 1:
return self.weight <= other.weight
elif op == 2:
return self.weight == other.weight
elif op == 3:
return self.weight != other.weight
elif op == 4:
return self.weight > other.weight
elif op == 5:
return self.weight >= other.weight
def __repr__(self):
return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__,
self.weight,
self.a, self.b)
################################################################################
# Efficient labelling/conversion of MSTs to single linkage hierarchies
cdef class UnionFind(object):
def __init__(self, N):
self.parent = np.full(2 * N - 1, -1., dtype=np.intp, order='C')
self.next_label = N
self.size = np.hstack((np.ones(N, dtype=np.intp),
np.zeros(N - 1, dtype=np.intp)))
cdef void union(self, intp_t m, intp_t n) noexcept:
self.parent[m] = self.next_label
self.parent[n] = self.next_label
self.size[self.next_label] = self.size[m] + self.size[n]
self.next_label += 1
return
@cython.wraparound(True)
cdef intp_t fast_find(self, intp_t n) noexcept:
cdef intp_t p
p = n
# find the highest node in the linkage graph so far
while self.parent[n] != -1:
n = self.parent[n]
# provide a shortcut up to the highest node
while self.parent[p] != n:
p, self.parent[p] = self.parent[p], n
return n
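# --- Illustrative sketch, not part of this file: the same union-find in plain Python ---
# Each merge creates a brand-new label (next_label), so the sequence of merges can be
# read back as a dendrogram, and fast_find shortens paths so later lookups stay cheap.
# A toy version of the same structure:
class ToyUnionFind:
    def __init__(self, n):
        self.parent = [-1] * (2 * n - 1)
        self.size = [1] * n + [0] * (n - 1)
        self.next_label = n

    def find(self, x):
        root = x
        while self.parent[root] != -1:   # walk up to the current root
            root = self.parent[root]
        while self.parent[x] != -1:      # path compression on the way back
            self.parent[x], x = root, self.parent[x]
        return root

    def union(self, a, b):
        self.parent[a] = self.parent[b] = self.next_label
        self.size[self.next_label] = self.size[a] + self.size[b]
        self.next_label += 1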
def _single_linkage_label(const float64_t[:, :] L):
"""
Convert a linkage array or MST to a tree by labelling clusters at merges.
This is done by using a Union find structure to keep track of merges
efficiently. This is the private version of the function that assumes that
``L`` has been properly validated. See ``single_linkage_label`` for the
user facing version of this function.
Parameters
----------
L: array of shape (n_samples - 1, 3)
The linkage array or MST where each row specifies two samples
to be merged and a distance or weight at which the merge occurs. This
array is assumed to be sorted by the distance/weight.
Returns
-------
A tree in the format used by scipy.cluster.hierarchy.
"""
cdef float64_t[:, ::1] result_arr
cdef intp_t left, left_cluster, right, right_cluster, index
cdef float64_t delta
result_arr = np.zeros((L.shape[0], 4), dtype=np.float64)
U = UnionFind(L.shape[0] + 1)
for index in range(L.shape[0]):
left = <intp_t> L[index, 0]
right = <intp_t> L[index, 1]
delta = L[index, 2]
left_cluster = U.fast_find(left)
right_cluster = U.fast_find(right)
result_arr[index][0] = left_cluster
result_arr[index][1] = right_cluster
result_arr[index][2] = delta
result_arr[index][3] = U.size[left_cluster] + U.size[right_cluster]
U.union(left_cluster, right_cluster)
return np.asarray(result_arr)
@cython.wraparound(True)
def single_linkage_label(L):
"""
Convert a linkage array or MST to a tree by labelling clusters at merges.
This is done by using a Union find structure to keep track of merges
efficiently.
Parameters
----------
L: array of shape (n_samples - 1, 3)
The linkage array or MST where each row specifies two samples
to be merged and a distance or weight at which the merge occurs. This
array is assumed to be sorted by the distance/weight.
Returns
-------
A tree in the format used by scipy.cluster.hierarchy.
"""
# Validate L
if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1:
raise ValueError("Input MST array is not a validly formatted MST array")
is_sorted = lambda x: np.all(x[:-1] <= x[1:])
if not is_sorted(L[:, 2]):
raise ValueError("Input MST array must be sorted by weight")
return _single_linkage_label(L)
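# --- Illustrative comparison, not part of this file ---
# The rows produced here follow the same (left, right, distance, size) layout as SciPy
# linkage matrices, so results can be fed to scipy.cluster.hierarchy tooling. A rough
# reference on toy data (tie-breaking details may differ between implementations):
from scipy.cluster.hierarchy import linkage as scipy_linkage

X_toy = np.array([[0.0], [0.1], [5.0], [5.2]])
Z_ref = scipy_linkage(X_toy, method="single")  # reference single-linkage, same 4-column format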
# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378
def mst_linkage_core(
const float64_t [:, ::1] raw_data,
DistanceMetric64 dist_metric):
"""
Compute the necessary elements of a minimum spanning
tree for computation of single linkage clustering. This
represents the MST-LINKAGE-CORE algorithm (Figure 6) from
:arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering
algorithms" <1109.2378>`.
In contrast to the scipy implementation, this never computes
a full distance matrix, generating distances only as they
are needed and releasing them when no longer needed.
Parameters
----------
raw_data: array of shape (n_samples, n_features)
The array of feature data to be clustered. Must be C-aligned
dist_metric: DistanceMetric64
A DistanceMetric64 object conforming to the API from
``sklearn.metrics._dist_metrics.pxd`` that will be
used to compute distances.
Returns
-------
mst_core_data: array of shape (n_samples, 3)
An array providing information from which one
can either compute an MST, or the linkage hierarchy
very efficiently. See :arxiv:`Daniel Mullner, "Modern hierarchical,
agglomerative clustering algorithms" <1109.2378>` algorithm
MST-LINKAGE-CORE for more details.
"""
cdef:
intp_t n_samples = raw_data.shape[0]
uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool)
float64_t[:, ::1] result = np.zeros((n_samples - 1, 3))
intp_t current_node = 0
intp_t new_node
intp_t i
intp_t j
intp_t num_features = raw_data.shape[1]
float64_t right_value
float64_t left_value
float64_t new_distance
float64_t[:] current_distances = np.full(n_samples, INFINITY)
for i in range(n_samples - 1):
in_tree[current_node] = 1
new_distance = INFINITY
new_node = 0
for j in range(n_samples):
if in_tree[j]:
continue
right_value = current_distances[j]
left_value = dist_metric.dist(&raw_data[current_node, 0],
&raw_data[j, 0],
num_features)
if left_value < right_value:
current_distances[j] = left_value
if current_distances[j] < new_distance:
new_distance = current_distances[j]
new_node = j
result[i, 0] = current_node
result[i, 1] = new_node
result[i, 2] = new_distance
current_node = new_node
return np.array(result)
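# --- Illustrative sketch, not part of this file: the same Prim-style MST core in NumPy ---
# mst_linkage_core grows the tree one point at a time, keeping only a vector with the
# best known distance from each remaining point to the tree. A dense, vectorised toy
# version of that loop:
def toy_mst_core(X):
    n = X.shape[0]
    in_tree = np.zeros(n, dtype=bool)
    best = np.full(n, np.inf)          # best known distance of each point to the tree
    edges = np.zeros((n - 1, 3))
    current = 0
    for i in range(n - 1):
        in_tree[current] = True
        dist = np.linalg.norm(X - X[current], axis=1)
        best = np.where(~in_tree & (dist < best), dist, best)
        nxt = int(np.argmin(np.where(in_tree, np.inf, best)))
        edges[i] = (current, nxt, best[nxt])
        current = nxt
    return edges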

View File

@@ -0,0 +1,48 @@
from cython cimport floating
cdef floating _euclidean_dense_dense(
const floating*,
const floating*,
int,
bint
) noexcept nogil
cdef floating _euclidean_sparse_dense(
const floating[::1],
const int[::1],
const floating[::1],
floating,
bint
) noexcept nogil
cpdef void _relocate_empty_clusters_dense(
const floating[:, ::1],
const floating[::1],
const floating[:, ::1],
floating[:, ::1],
floating[::1],
const int[::1]
)
cpdef void _relocate_empty_clusters_sparse(
const floating[::1],
const int[::1],
const int[::1],
const floating[::1],
const floating[:, ::1],
floating[:, ::1],
floating[::1],
const int[::1]
)
cdef void _average_centers(
floating[:, ::1],
const floating[::1]
)
cdef void _center_shift(
const floating[:, ::1],
const floating[:, ::1],
floating[::1]
)

View File

@@ -0,0 +1,328 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
from cython cimport floating
from cython.parallel cimport prange
from libc.math cimport sqrt
from sklearn.utils.extmath import row_norms
# Number of samples per data chunk defined as a global constant.
CHUNK_SIZE = 256
cdef floating _euclidean_dense_dense(
const floating* a, # IN
const floating* b, # IN
int n_features,
bint squared
) noexcept nogil:
"""Euclidean distance between a dense and b dense"""
cdef:
int i
int n = n_features // 4
int rem = n_features % 4
floating result = 0
# We manually unroll the loop for better cache optimization.
for i in range(n):
result += (
(a[0] - b[0]) * (a[0] - b[0]) +
(a[1] - b[1]) * (a[1] - b[1]) +
(a[2] - b[2]) * (a[2] - b[2]) +
(a[3] - b[3]) * (a[3] - b[3])
)
a += 4
b += 4
for i in range(rem):
result += (a[i] - b[i]) * (a[i] - b[i])
return result if squared else sqrt(result)
def _euclidean_dense_dense_wrapper(
const floating[::1] a,
const floating[::1] b,
bint squared
):
"""Wrapper of _euclidean_dense_dense for testing purpose"""
return _euclidean_dense_dense(&a[0], &b[0], a.shape[0], squared)
cdef floating _euclidean_sparse_dense(
const floating[::1] a_data, # IN
const int[::1] a_indices, # IN
const floating[::1] b, # IN
floating b_squared_norm,
bint squared
) noexcept nogil:
"""Euclidean distance between a sparse and b dense"""
cdef:
int nnz = a_indices.shape[0]
int i
floating tmp, bi
floating result = 0.0
for i in range(nnz):
bi = b[a_indices[i]]
tmp = a_data[i] - bi
result += tmp * tmp - bi * bi
result += b_squared_norm
if result < 0:
result = 0.0
return result if squared else sqrt(result)
def _euclidean_sparse_dense_wrapper(
const floating[::1] a_data,
const int[::1] a_indices,
const floating[::1] b,
floating b_squared_norm,
bint squared
):
"""Wrapper of _euclidean_sparse_dense for testing purpose"""
return _euclidean_sparse_dense(
a_data, a_indices, b, b_squared_norm, squared)
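# --- Illustrative sketch, not part of this file: the sparse/dense distance identity ---
# The routine above relies on
#     ||a - b||^2 = sum_{i in nnz(a)} [(a_i - b_i)^2 - b_i^2] + ||b||^2,
# so only the stored entries of the sparse row are ever touched. A NumPy sanity check
# on toy values:
toy_b = np.array([1.0, 1.0, 2.0, 0.5])
toy_data, toy_indices = np.array([3.0, 1.0]), np.array([1, 3])   # sparse row [0, 3, 0, 1]
toy_a = np.zeros(4)
toy_a[toy_indices] = toy_data
lhs = np.sum((toy_a - toy_b) ** 2)
rhs = np.sum((toy_data - toy_b[toy_indices]) ** 2 - toy_b[toy_indices] ** 2) + np.dot(toy_b, toy_b)
assert np.isclose(lhs, rhs)   # both equal 9.25 here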
cpdef floating _inertia_dense(
const floating[:, ::1] X, # IN
const floating[::1] sample_weight, # IN
const floating[:, ::1] centers, # IN
const int[::1] labels, # IN
int n_threads,
int single_label=-1,
):
"""Compute inertia for dense input data
Sum of squared distance between each sample and its assigned center.
If single_label is >= 0, the inertia is computed only for that label.
"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int i, j
floating sq_dist = 0.0
floating inertia = 0.0
for i in prange(n_samples, nogil=True, num_threads=n_threads,
schedule='static'):
j = labels[i]
if single_label < 0 or single_label == j:
sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
n_features, True)
inertia += sq_dist * sample_weight[i]
return inertia
cpdef floating _inertia_sparse(
X, # IN
const floating[::1] sample_weight, # IN
const floating[:, ::1] centers, # IN
const int[::1] labels, # IN
int n_threads,
int single_label=-1,
):
"""Compute inertia for sparse input data
Sum of squared distance between each sample and its assigned center.
If single_label is >= 0, the inertia is computed only for that label.
"""
cdef:
floating[::1] X_data = X.data
int[::1] X_indices = X.indices
int[::1] X_indptr = X.indptr
int n_samples = X.shape[0]
int i, j
floating sq_dist = 0.0
floating inertia = 0.0
floating[::1] centers_squared_norms = row_norms(centers, squared=True)
for i in prange(n_samples, nogil=True, num_threads=n_threads,
schedule='static'):
j = labels[i]
if single_label < 0 or single_label == j:
sq_dist = _euclidean_sparse_dense(
X_data[X_indptr[i]: X_indptr[i + 1]],
X_indices[X_indptr[i]: X_indptr[i + 1]],
centers[j], centers_squared_norms[j], True)
inertia += sq_dist * sample_weight[i]
return inertia
cpdef void _relocate_empty_clusters_dense(
const floating[:, ::1] X, # IN
const floating[::1] sample_weight, # IN
const floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
const int[::1] labels # IN
):
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
int n_empty = empty_clusters.shape[0]
if n_empty == 0:
return
cdef:
int n_features = X.shape[1]
floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1)
int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
int new_cluster_id, old_cluster_id, far_idx, idx, k
floating weight
if np.max(distances) == 0:
# Happens when there are more clusters than non-duplicate samples. Relocating
# is pointless in this case.
return
for idx in range(n_empty):
new_cluster_id = empty_clusters[idx]
far_idx = far_from_centers[idx]
weight = sample_weight[far_idx]
old_cluster_id = labels[far_idx]
for k in range(n_features):
centers_new[old_cluster_id, k] -= X[far_idx, k] * weight
centers_new[new_cluster_id, k] = X[far_idx, k] * weight
weight_in_clusters[new_cluster_id] = weight
weight_in_clusters[old_cluster_id] -= weight
cpdef void _relocate_empty_clusters_sparse(
const floating[::1] X_data, # IN
const int[::1] X_indices, # IN
const int[::1] X_indptr, # IN
const floating[::1] sample_weight, # IN
const floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
const int[::1] labels # IN
):
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
int n_empty = empty_clusters.shape[0]
if n_empty == 0:
return
cdef:
int n_samples = X_indptr.shape[0] - 1
int i, j, k
floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype)
floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
for i in range(n_samples):
j = labels[i]
distances[i] = _euclidean_sparse_dense(
X_data[X_indptr[i]: X_indptr[i + 1]],
X_indices[X_indptr[i]: X_indptr[i + 1]],
centers_old[j], centers_squared_norms[j], True)
if np.max(distances) == 0:
# Happens when there are more clusters than non-duplicate samples. Relocating
# is pointless in this case.
return
cdef:
int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
int new_cluster_id, old_cluster_id, far_idx, idx
floating weight
for idx in range(n_empty):
new_cluster_id = empty_clusters[idx]
far_idx = far_from_centers[idx]
weight = sample_weight[far_idx]
old_cluster_id = labels[far_idx]
for k in range(X_indptr[far_idx], X_indptr[far_idx + 1]):
centers_new[old_cluster_id, X_indices[k]] -= X_data[k] * weight
centers_new[new_cluster_id, X_indices[k]] = X_data[k] * weight
weight_in_clusters[new_cluster_id] = weight
weight_in_clusters[old_cluster_id] -= weight
cdef void _average_centers(
floating[:, ::1] centers, # INOUT
const floating[::1] weight_in_clusters # IN
):
"""Average new centers wrt weights."""
cdef:
int n_clusters = centers.shape[0]
int n_features = centers.shape[1]
int j, k
floating alpha
int argmax_weight = np.argmax(weight_in_clusters)
for j in range(n_clusters):
if weight_in_clusters[j] > 0:
alpha = 1.0 / weight_in_clusters[j]
for k in range(n_features):
centers[j, k] *= alpha
else:
# For convenience, we avoid setting empty clusters at the origin but place
# them at the location of the biggest cluster.
for k in range(n_features):
centers[j, k] = centers[argmax_weight, k]
cdef void _center_shift(
const floating[:, ::1] centers_old, # IN
const floating[:, ::1] centers_new, # IN
floating[::1] center_shift # OUT
):
"""Compute shift between old and new centers."""
cdef:
int n_clusters = centers_old.shape[0]
int n_features = centers_old.shape[1]
int j
for j in range(n_clusters):
center_shift[j] = _euclidean_dense_dense(
&centers_new[j, 0], &centers_old[j, 0], n_features, False)
def _is_same_clustering(
const int[::1] labels1,
const int[::1] labels2,
n_clusters
):
"""Check if two arrays of labels are the same up to a permutation of the labels"""
cdef int[::1] mapping = np.full(fill_value=-1, shape=(n_clusters,), dtype=np.int32)
cdef int i
for i in range(labels1.shape[0]):
if mapping[labels1[i]] == -1:
mapping[labels1[i]] = labels2[i]
elif mapping[labels1[i]] != labels2[i]:
return False
return True
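# --- Illustrative sketch, not part of this file: label-permutation equivalence ---
# _is_same_clustering above treats two labelings as equal when one is a relabeling of
# the other; the same idea in plain Python:
def toy_same_clustering(labels1, labels2):
    mapping = {}
    for l1, l2 in zip(labels1, labels2):
        if mapping.setdefault(l1, l2) != l2:
            return False
    return True

# toy_same_clustering([0, 0, 1, 2], [2, 2, 0, 1]) -> True
# toy_same_clustering([0, 0, 1, 2], [2, 1, 0, 1]) -> False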

View File

@@ -0,0 +1,686 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from cython cimport floating
from cython.parallel import prange, parallel
from libc.stdlib cimport calloc, free
from libc.string cimport memset
from sklearn.utils._openmp_helpers cimport omp_lock_t
from sklearn.utils._openmp_helpers cimport omp_init_lock
from sklearn.utils._openmp_helpers cimport omp_destroy_lock
from sklearn.utils._openmp_helpers cimport omp_set_lock
from sklearn.utils._openmp_helpers cimport omp_unset_lock
from sklearn.utils.extmath import row_norms
from sklearn.cluster._k_means_common import CHUNK_SIZE
from sklearn.cluster._k_means_common cimport _relocate_empty_clusters_dense
from sklearn.cluster._k_means_common cimport _relocate_empty_clusters_sparse
from sklearn.cluster._k_means_common cimport _euclidean_dense_dense
from sklearn.cluster._k_means_common cimport _euclidean_sparse_dense
from sklearn.cluster._k_means_common cimport _average_centers
from sklearn.cluster._k_means_common cimport _center_shift
def init_bounds_dense(
const floating[:, ::1] X, # IN
const floating[:, ::1] centers, # IN
const floating[:, ::1] center_half_distances, # IN
int[::1] labels, # OUT
floating[::1] upper_bounds, # OUT
floating[:, ::1] lower_bounds, # OUT
int n_threads):
"""Initialize upper and lower bounds for each sample for dense input data.
Given X, centers and the pairwise distances divided by 2.0 between the
centers this calculates the upper bounds and lower bounds for each sample.
The upper bound for each sample is set to the distance between the sample
and the closest center.
The lower bound for each sample is a one-dimensional array of n_clusters.
For each sample i assume that the previously assigned cluster is c1 and the
previous closest distance is dist, for a new cluster c2, the
lower_bound[i][c2] is set to the distance between the sample and this new
cluster, if and only if dist > center_half_distances[c1][c2]. This prevents
computation of unnecessary distances for each sample to the clusters that
it is unlikely to be assigned to.
Parameters
----------
X : ndarray of shape (n_samples, n_features), dtype=floating
The input data.
centers : ndarray of shape (n_clusters, n_features), dtype=floating
The cluster centers.
center_half_distances : ndarray of shape (n_clusters, n_clusters), \
dtype=floating
The half of the distance between any 2 clusters centers.
labels : ndarray of shape(n_samples), dtype=int
The label for each sample. This array is modified in place.
upper_bounds : ndarray of shape(n_samples,), dtype=floating
The upper bound on the distance between each sample and its closest
cluster center. This array is modified in place.
lower_bounds : ndarray, of shape(n_samples, n_clusters), dtype=floating
The lower bound on the distance between each sample and each cluster
center. This array is modified in place.
n_threads : int
The number of threads to be used by openmp.
"""
cdef:
int n_samples = X.shape[0]
int n_clusters = centers.shape[0]
int n_features = X.shape[1]
floating min_dist, dist
int best_cluster, i, j
for i in prange(
n_samples, num_threads=n_threads, schedule='static', nogil=True
):
best_cluster = 0
min_dist = _euclidean_dense_dense(&X[i, 0], &centers[0, 0],
n_features, False)
lower_bounds[i, 0] = min_dist
for j in range(1, n_clusters):
if min_dist > center_half_distances[best_cluster, j]:
dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
n_features, False)
lower_bounds[i, j] = dist
if dist < min_dist:
min_dist = dist
best_cluster = j
labels[i] = best_cluster
upper_bounds[i] = min_dist
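# --- Illustrative sketch, not part of this file: the triangle-inequality pruning ---
# The guard `min_dist > center_half_distances[best_cluster, j]` above is Elkan's bound:
# if d(x, c_best) <= d(c_best, c_j) / 2 then, by the triangle inequality,
# d(x, c_j) >= d(c_best, c_j) - d(x, c_best) >= d(x, c_best), so c_j cannot be closer
# and its distance never needs to be computed. Toy numbers:
import numpy as np

x_toy = np.array([0.0, 0.0])
c_best_toy = np.array([1.0, 0.0])
c_j_toy = np.array([10.0, 0.0])
half_toy = np.linalg.norm(c_best_toy - c_j_toy) / 2       # 4.5
skip_c_j = np.linalg.norm(x_toy - c_best_toy) <= half_toy  # True -> d(x, c_j) is skipped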
def init_bounds_sparse(
X, # IN
const floating[:, ::1] centers, # IN
const floating[:, ::1] center_half_distances, # IN
int[::1] labels, # OUT
floating[::1] upper_bounds, # OUT
floating[:, ::1] lower_bounds, # OUT
int n_threads):
"""Initialize upper and lower bounds for each sample for sparse input data.
Given X, centers and the pairwise distances divided by 2.0 between the
centers this calculates the upper bounds and lower bounds for each sample.
The upper bound for each sample is set to the distance between the sample
and the closest center.
The lower bound for each sample is a one-dimensional array of n_clusters.
For each sample i assume that the previously assigned cluster is c1 and the
previous closest distance is dist, for a new cluster c2, the
lower_bound[i][c2] is set to the distance between the sample and this new
cluster, if and only if dist > center_half_distances[c1][c2]. This prevents
computation of unnecessary distances for each sample to the clusters that
it is unlikely to be assigned to.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features), dtype=floating
The input data. Must be in CSR format.
centers : ndarray of shape (n_clusters, n_features), dtype=floating
The cluster centers.
center_half_distances : ndarray of shape (n_clusters, n_clusters), \
dtype=floating
The half of the distance between any 2 clusters centers.
labels : ndarray of shape(n_samples), dtype=int
The label for each sample. This array is modified in place.
upper_bounds : ndarray of shape(n_samples,), dtype=floating
The upper bound on the distance between each sample and its closest
cluster center. This array is modified in place.
lower_bounds : ndarray of shape(n_samples, n_clusters), dtype=floating
The lower bound on the distance between each sample and each cluster
center. This array is modified in place.
n_threads : int
The number of threads to be used by openmp.
"""
cdef:
int n_samples = X.shape[0]
int n_clusters = centers.shape[0]
floating[::1] X_data = X.data
int[::1] X_indices = X.indices
int[::1] X_indptr = X.indptr
floating min_dist, dist
int best_cluster, i, j
floating[::1] centers_squared_norms = row_norms(centers, squared=True)
for i in prange(
n_samples, num_threads=n_threads, schedule='static', nogil=True
):
best_cluster = 0
min_dist = _euclidean_sparse_dense(
X_data[X_indptr[i]: X_indptr[i + 1]],
X_indices[X_indptr[i]: X_indptr[i + 1]],
centers[0], centers_squared_norms[0], False)
lower_bounds[i, 0] = min_dist
for j in range(1, n_clusters):
if min_dist > center_half_distances[best_cluster, j]:
dist = _euclidean_sparse_dense(
X_data[X_indptr[i]: X_indptr[i + 1]],
X_indices[X_indptr[i]: X_indptr[i + 1]],
centers[j], centers_squared_norms[j], False)
lower_bounds[i, j] = dist
if dist < min_dist:
min_dist = dist
best_cluster = j
labels[i] = best_cluster
upper_bounds[i] = min_dist
def elkan_iter_chunked_dense(
const floating[:, ::1] X, # IN
const floating[::1] sample_weight, # IN
const floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
const floating[:, ::1] center_half_distances, # IN
const floating[::1] distance_next_center, # IN
floating[::1] upper_bounds, # INOUT
floating[:, ::1] lower_bounds, # INOUT
int[::1] labels, # INOUT
floating[::1] center_shift, # OUT
int n_threads,
bint update_centers=True):
"""Single iteration of K-means Elkan algorithm with dense input.
Update labels and centers (inplace), for one iteration, distributed
over data chunks.
Parameters
----------
X : ndarray of shape (n_samples, n_features), dtype=floating
The observations to cluster.
sample_weight : ndarray of shape (n_samples,), dtype=floating
The weights for each observation in X.
centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
Centers before previous iteration, placeholder for the centers after
previous iteration.
centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
Centers after previous iteration, placeholder for the new centers
computed during this iteration.
weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
Placeholder for the sums of the weights of every observation assigned
to each center.
center_half_distances : ndarray of shape (n_clusters, n_clusters), \
dtype=floating
Half pairwise distances between centers.
distance_next_center : ndarray of shape (n_clusters,), dtype=floating
Distance between each center and its closest center.
upper_bounds : ndarray of shape (n_samples,), dtype=floating
Upper bound for the distance between each sample and its center,
updated inplace.
lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
Lower bound for the distance between each sample and each center,
updated inplace.
labels : ndarray of shape (n_samples,), dtype=int
labels assignment.
center_shift : ndarray of shape (n_clusters,), dtype=floating
Distance between old and new centers.
n_threads : int
The number of threads to be used by openmp.
update_centers : bool
- If True, the labels and the new centers will be computed, i.e. runs
the E-step and the M-step of the algorithm.
- If False, only the labels will be computed, i.e. runs the E-step of
the algorithm. This is useful especially when calling predict on a
fitted model.
"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int n_clusters = centers_new.shape[0]
if n_samples == 0:
# An empty array was passed, do nothing and return early (before
# attempting to compute n_chunks). This can typically happen when
# calling the prediction function of a bisecting k-means model with a
# large fraction of outliers.
return
cdef:
# hard-coded number of samples per chunk. Splitting in chunks is
# necessary to get parallelism. Chunk size chosen to be same as lloyd's
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx
int start, end
int i, j, k
floating *centers_new_chunk
floating *weight_in_clusters_chunk
omp_lock_t lock
# count remainder chunk in total number of chunks
n_chunks += n_samples != n_chunks * n_samples_chunk
# number of threads should not be bigger than number of chunks
n_threads = min(n_threads, n_chunks)
if update_centers:
memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
omp_init_lock(&lock)
with nogil, parallel(num_threads=n_threads):
# thread local buffers
centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
for chunk_idx in prange(n_chunks, schedule='static'):
start = chunk_idx * n_samples_chunk
if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
end = start + n_samples_rem
else:
end = start + n_samples_chunk
_update_chunk_dense(
X[start: end],
sample_weight[start: end],
centers_old,
center_half_distances,
distance_next_center,
labels[start: end],
upper_bounds[start: end],
lower_bounds[start: end],
centers_new_chunk,
weight_in_clusters_chunk,
update_centers)
# reduction from local buffers.
if update_centers:
# The lock is necessary to avoid race conditions when aggregating
# info from different thread-local buffers.
omp_set_lock(&lock)
for j in range(n_clusters):
weight_in_clusters[j] += weight_in_clusters_chunk[j]
for k in range(n_features):
centers_new[j, k] += centers_new_chunk[j * n_features + k]
omp_unset_lock(&lock)
free(centers_new_chunk)
free(weight_in_clusters_chunk)
if update_centers:
omp_destroy_lock(&lock)
_relocate_empty_clusters_dense(X, sample_weight, centers_old,
centers_new, weight_in_clusters, labels)
_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
# update lower and upper bounds
for i in range(n_samples):
upper_bounds[i] += center_shift[labels[i]]
for j in range(n_clusters):
lower_bounds[i, j] -= center_shift[j]
if lower_bounds[i, j] < 0:
lower_bounds[i, j] = 0
cdef void _update_chunk_dense(
const floating[:, ::1] X, # IN
const floating[::1] sample_weight, # IN
const floating[:, ::1] centers_old, # IN
const floating[:, ::1] center_half_distances, # IN
const floating[::1] distance_next_center, # IN
int[::1] labels, # INOUT
floating[::1] upper_bounds, # INOUT
floating[:, ::1] lower_bounds, # INOUT
floating *centers_new, # OUT
floating *weight_in_clusters, # OUT
bint update_centers) noexcept nogil:
"""K-means combined EM step for one dense data chunk.
Compute the partial contribution of a single data chunk to the labels and
centers.
"""
cdef:
int n_samples = labels.shape[0]
int n_clusters = centers_old.shape[0]
int n_features = centers_old.shape[1]
floating upper_bound, distance
int i, j, k, label
for i in range(n_samples):
upper_bound = upper_bounds[i]
bounds_tight = 0
label = labels[i]
# Next center is not far away from the currently assigned center.
# Sample might need to be assigned to another center.
if not distance_next_center[label] >= upper_bound:
for j in range(n_clusters):
# If this holds, then center_index is a good candidate for the
# sample to be relabelled, and we need to confirm this by
# recomputing the upper and lower bounds.
if (
j != label
and (upper_bound > lower_bounds[i, j])
and (upper_bound > center_half_distances[label, j])
):
# Recompute upper bound by calculating the actual distance
# between the sample and its current assigned center.
if not bounds_tight:
upper_bound = _euclidean_dense_dense(
&X[i, 0], &centers_old[label, 0], n_features, False)
lower_bounds[i, label] = upper_bound
bounds_tight = 1
# If the condition still holds, then compute the actual
# distance between the sample and center. If this is less
# than the previous distance, reassign label.
if (
upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])
):
distance = _euclidean_dense_dense(
&X[i, 0], &centers_old[j, 0], n_features, False)
lower_bounds[i, j] = distance
if distance < upper_bound:
label = j
upper_bound = distance
labels[i] = label
upper_bounds[i] = upper_bound
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(n_features):
centers_new[label * n_features + k] += X[i, k] * sample_weight[i]
def elkan_iter_chunked_sparse(
X, # IN
const floating[::1] sample_weight, # IN
const floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
const floating[:, ::1] center_half_distances, # IN
const floating[::1] distance_next_center, # IN
floating[::1] upper_bounds, # INOUT
floating[:, ::1] lower_bounds, # INOUT
int[::1] labels, # INOUT
floating[::1] center_shift, # OUT
int n_threads,
bint update_centers=True):
"""Single iteration of K-means Elkan algorithm with sparse input.
Update labels and centers (inplace), for one iteration, distributed
over data chunks.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
The observations to cluster. Must be in CSR format.
sample_weight : ndarray of shape (n_samples,), dtype=floating
The weights for each observation in X.
centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
Centers before previous iteration, placeholder for the centers after
previous iteration.
centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
Centers after previous iteration, placeholder for the new centers
computed during this iteration.
weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
Placeholder for the sums of the weights of every observation assigned
to each center.
center_half_distances : ndarray of shape (n_clusters, n_clusters), \
dtype=floating
Half pairwise distances between centers.
distance_next_center : ndarray of shape (n_clusters,), dtype=floating
Distance between each center and its closest center.
upper_bounds : ndarray of shape (n_samples,), dtype=floating
Upper bound for the distance between each sample and its center,
updated inplace.
lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
Lower bound for the distance between each sample and each center,
updated inplace.
labels : ndarray of shape (n_samples,), dtype=int
labels assignment.
center_shift : ndarray of shape (n_clusters,), dtype=floating
Distance between old and new centers.
n_threads : int
The number of threads to be used by openmp.
update_centers : bool
- If True, the labels and the new centers will be computed, i.e. runs
the E-step and the M-step of the algorithm.
- If False, only the labels will be computed, i.e. runs the E-step of
the algorithm. This is useful especially when calling predict on a
fitted model.
"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int n_clusters = centers_new.shape[0]
if n_samples == 0:
# An empty array was passed, do nothing and return early (before
# attempting to compute n_chunks). This can typically happen when
# calling the prediction function of a bisecting k-means model with a
# large fraction of outliers.
return
cdef:
floating[::1] X_data = X.data
int[::1] X_indices = X.indices
int[::1] X_indptr = X.indptr
# hard-coded number of samples per chunk. Splitting in chunks is
# necessary to get parallelism. Chunk size chosen to be same as lloyd's
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx
int start, end
int i, j, k
floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
floating *centers_new_chunk
floating *weight_in_clusters_chunk
omp_lock_t lock
# count remainder chunk in total number of chunks
n_chunks += n_samples != n_chunks * n_samples_chunk
# number of threads should not be bigger than number of chunks
n_threads = min(n_threads, n_chunks)
if update_centers:
memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
omp_init_lock(&lock)
with nogil, parallel(num_threads=n_threads):
# thread local buffers
centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
for chunk_idx in prange(n_chunks, schedule='static'):
start = chunk_idx * n_samples_chunk
if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
end = start + n_samples_rem
else:
end = start + n_samples_chunk
_update_chunk_sparse(
X_data[X_indptr[start]: X_indptr[end]],
X_indices[X_indptr[start]: X_indptr[end]],
X_indptr[start: end+1],
sample_weight[start: end],
centers_old,
centers_squared_norms,
center_half_distances,
distance_next_center,
labels[start: end],
upper_bounds[start: end],
lower_bounds[start: end],
centers_new_chunk,
weight_in_clusters_chunk,
update_centers)
# reduction from local buffers.
if update_centers:
# The lock is necessary to avoid race conditions when aggregating
# info from different thread-local buffers.
omp_set_lock(&lock)
for j in range(n_clusters):
weight_in_clusters[j] += weight_in_clusters_chunk[j]
for k in range(n_features):
centers_new[j, k] += centers_new_chunk[j * n_features + k]
omp_unset_lock(&lock)
free(centers_new_chunk)
free(weight_in_clusters_chunk)
if update_centers:
omp_destroy_lock(&lock)
_relocate_empty_clusters_sparse(
X_data, X_indices, X_indptr, sample_weight,
centers_old, centers_new, weight_in_clusters, labels)
_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
# update lower and upper bounds
for i in range(n_samples):
upper_bounds[i] += center_shift[labels[i]]
for j in range(n_clusters):
lower_bounds[i, j] -= center_shift[j]
if lower_bounds[i, j] < 0:
lower_bounds[i, j] = 0
cdef void _update_chunk_sparse(
const floating[::1] X_data, # IN
const int[::1] X_indices, # IN
const int[::1] X_indptr, # IN
const floating[::1] sample_weight, # IN
const floating[:, ::1] centers_old, # IN
const floating[::1] centers_squared_norms, # IN
const floating[:, ::1] center_half_distances, # IN
const floating[::1] distance_next_center, # IN
int[::1] labels, # INOUT
floating[::1] upper_bounds, # INOUT
floating[:, ::1] lower_bounds, # INOUT
floating *centers_new, # OUT
floating *weight_in_clusters, # OUT
bint update_centers) noexcept nogil:
"""K-means combined EM step for one sparse data chunk.
Compute the partial contribution of a single data chunk to the labels and
centers.
"""
cdef:
int n_samples = labels.shape[0]
int n_clusters = centers_old.shape[0]
int n_features = centers_old.shape[1]
floating upper_bound, distance
int i, j, k, label
int s = X_indptr[0]
for i in range(n_samples):
upper_bound = upper_bounds[i]
bounds_tight = 0
label = labels[i]
# Next center is not far away from the currently assigned center.
# Sample might need to be assigned to another center.
if not distance_next_center[label] >= upper_bound:
for j in range(n_clusters):
# If this holds, then center_index is a good candidate for the
# sample to be relabelled, and we need to confirm this by
# recomputing the upper and lower bounds.
if (
j != label
and (upper_bound > lower_bounds[i, j])
and (upper_bound > center_half_distances[label, j])
):
# Recompute upper bound by calculating the actual distance
# between the sample and its current assigned center.
if not bounds_tight:
upper_bound = _euclidean_sparse_dense(
X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
centers_old[label], centers_squared_norms[label], False)
lower_bounds[i, label] = upper_bound
bounds_tight = 1
# If the condition still holds, then compute the actual
# distance between the sample and center. If this is less
# than the previous distance, reassign label.
if (
upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])
):
distance = _euclidean_sparse_dense(
X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
centers_old[j], centers_squared_norms[j], False)
lower_bounds[i, j] = distance
if distance < upper_bound:
label = j
upper_bound = distance
labels[i] = label
upper_bounds[i] = upper_bound
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i]
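# --- Illustrative usage, not part of this file ---
# These chunked Elkan iterations back the public KMeans estimator when
# algorithm="elkan" is selected; a standalone usage sketch on synthetic data:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X_toy, _ = make_blobs(n_samples=300, centers=4, random_state=0)
km_toy = KMeans(n_clusters=4, algorithm="elkan", random_state=0).fit(X_toy)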
