2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

@@ -0,0 +1,190 @@
"""Score functions, performance metrics, pairwise metrics and distance computations."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from sklearn.metrics import cluster
from sklearn.metrics._classification import (
accuracy_score,
balanced_accuracy_score,
brier_score_loss,
class_likelihood_ratios,
classification_report,
cohen_kappa_score,
confusion_matrix,
d2_brier_score,
d2_log_loss_score,
f1_score,
fbeta_score,
hamming_loss,
hinge_loss,
jaccard_score,
log_loss,
matthews_corrcoef,
multilabel_confusion_matrix,
precision_recall_fscore_support,
precision_score,
recall_score,
zero_one_loss,
)
from sklearn.metrics._dist_metrics import DistanceMetric
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
from sklearn.metrics._plot.det_curve import DetCurveDisplay
from sklearn.metrics._plot.precision_recall_curve import PrecisionRecallDisplay
from sklearn.metrics._plot.regression import PredictionErrorDisplay
from sklearn.metrics._plot.roc_curve import RocCurveDisplay
from sklearn.metrics._ranking import (
auc,
average_precision_score,
confusion_matrix_at_thresholds,
coverage_error,
dcg_score,
det_curve,
label_ranking_average_precision_score,
label_ranking_loss,
ndcg_score,
precision_recall_curve,
roc_auc_score,
roc_curve,
top_k_accuracy_score,
)
from sklearn.metrics._regression import (
d2_absolute_error_score,
d2_pinball_score,
d2_tweedie_score,
explained_variance_score,
max_error,
mean_absolute_error,
mean_absolute_percentage_error,
mean_gamma_deviance,
mean_pinball_loss,
mean_poisson_deviance,
mean_squared_error,
mean_squared_log_error,
mean_tweedie_deviance,
median_absolute_error,
r2_score,
root_mean_squared_error,
root_mean_squared_log_error,
)
from sklearn.metrics._scorer import (
check_scoring,
get_scorer,
get_scorer_names,
make_scorer,
)
from sklearn.metrics.cluster import (
adjusted_mutual_info_score,
adjusted_rand_score,
calinski_harabasz_score,
completeness_score,
consensus_score,
davies_bouldin_score,
fowlkes_mallows_score,
homogeneity_completeness_v_measure,
homogeneity_score,
mutual_info_score,
normalized_mutual_info_score,
pair_confusion_matrix,
rand_score,
silhouette_samples,
silhouette_score,
v_measure_score,
)
from sklearn.metrics.pairwise import (
euclidean_distances,
nan_euclidean_distances,
pairwise_distances,
pairwise_distances_argmin,
pairwise_distances_argmin_min,
pairwise_distances_chunked,
pairwise_kernels,
)
__all__ = [
"ConfusionMatrixDisplay",
"DetCurveDisplay",
"DistanceMetric",
"PrecisionRecallDisplay",
"PredictionErrorDisplay",
"RocCurveDisplay",
"accuracy_score",
"adjusted_mutual_info_score",
"adjusted_rand_score",
"auc",
"average_precision_score",
"balanced_accuracy_score",
"brier_score_loss",
"calinski_harabasz_score",
"check_scoring",
"class_likelihood_ratios",
"classification_report",
"cluster",
"cohen_kappa_score",
"completeness_score",
"confusion_matrix",
"confusion_matrix_at_thresholds",
"consensus_score",
"coverage_error",
"d2_absolute_error_score",
"d2_brier_score",
"d2_log_loss_score",
"d2_pinball_score",
"d2_tweedie_score",
"davies_bouldin_score",
"dcg_score",
"det_curve",
"euclidean_distances",
"explained_variance_score",
"f1_score",
"fbeta_score",
"fowlkes_mallows_score",
"get_scorer",
"get_scorer_names",
"hamming_loss",
"hinge_loss",
"homogeneity_completeness_v_measure",
"homogeneity_score",
"jaccard_score",
"label_ranking_average_precision_score",
"label_ranking_loss",
"log_loss",
"make_scorer",
"matthews_corrcoef",
"max_error",
"mean_absolute_error",
"mean_absolute_percentage_error",
"mean_gamma_deviance",
"mean_pinball_loss",
"mean_poisson_deviance",
"mean_squared_error",
"mean_squared_log_error",
"mean_tweedie_deviance",
"median_absolute_error",
"multilabel_confusion_matrix",
"mutual_info_score",
"nan_euclidean_distances",
"ndcg_score",
"normalized_mutual_info_score",
"pair_confusion_matrix",
"pairwise_distances",
"pairwise_distances_argmin",
"pairwise_distances_argmin_min",
"pairwise_distances_chunked",
"pairwise_kernels",
"precision_recall_curve",
"precision_recall_fscore_support",
"precision_score",
"r2_score",
"rand_score",
"recall_score",
"roc_auc_score",
"roc_curve",
"root_mean_squared_error",
"root_mean_squared_log_error",
"silhouette_samples",
"silhouette_score",
"top_k_accuracy_score",
"v_measure_score",
"zero_one_loss",
]
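# Illustrative usage sketch (not part of this file): a couple of the metrics
# re-exported above, on made-up toy data.
#
#     from sklearn.metrics import accuracy_score, confusion_matrix
#     y_true = [0, 1, 1, 0]
#     y_pred = [0, 1, 0, 0]
#     accuracy_score(y_true, y_pred)    # 0.75
#     confusion_matrix(y_true, y_pred)  # [[2, 0],
#                                       #  [1, 1]]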

@@ -0,0 +1,193 @@
"""
Common code for all metrics.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from itertools import combinations
import numpy as np
from sklearn.utils import check_array, check_consistent_length
from sklearn.utils.multiclass import type_of_target
def _average_binary_score(binary_metric, y_true, y_score, average, sample_weight=None):
"""Average a binary metric for multilabel classification.
Parameters
----------
binary_metric : callable
The binary metric function to use, called once per class.
y_true : array, shape = [n_samples] or [n_samples, n_classes]
True binary labels in binary label indicators.
y_score : array, shape = [n_samples] or [n_samples, n_classes]
Target scores, can either be probability estimates of the positive
class, confidence values, or binary decisions.
average : {None, 'micro', 'macro', 'samples', 'weighted'}, default='macro'
If ``None``, the scores for each class are returned. Otherwise,
this determines the type of averaging performed on the data:
``'micro'``:
Calculate metrics globally by considering each element of the label
indicator matrix as a label.
``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account.
``'weighted'``:
Calculate metrics for each label, and find their average, weighted
by support (the number of true instances for each label).
``'samples'``:
Calculate metrics for each instance, and find their average.
Will be ignored when ``y_true`` is binary.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
-------
score : float or array of shape [n_classes]
If ``average`` is not ``None``, the averaged score; otherwise, the
score for each class.
"""
average_options = (None, "micro", "macro", "weighted", "samples")
if average not in average_options:
raise ValueError("average has to be one of {0}".format(average_options))
y_type = type_of_target(y_true)
if y_type not in ("binary", "multilabel-indicator"):
raise ValueError("{0} format is not supported".format(y_type))
if y_type == "binary":
return binary_metric(y_true, y_score, sample_weight=sample_weight)
check_consistent_length(y_true, y_score, sample_weight)
y_true = check_array(y_true)
y_score = check_array(y_score)
not_average_axis = 1
score_weight = sample_weight
average_weight = None
if average == "micro":
if score_weight is not None:
score_weight = np.repeat(score_weight, y_true.shape[1])
y_true = y_true.ravel()
y_score = y_score.ravel()
elif average == "weighted":
if score_weight is not None:
average_weight = np.sum(
np.multiply(y_true, np.reshape(score_weight, (-1, 1))), axis=0
)
else:
average_weight = np.sum(y_true, axis=0)
if np.isclose(average_weight.sum(), 0.0):
return 0
elif average == "samples":
# swap average_weight <-> score_weight
average_weight = score_weight
score_weight = None
not_average_axis = 0
if y_true.ndim == 1:
y_true = y_true.reshape((-1, 1))
if y_score.ndim == 1:
y_score = y_score.reshape((-1, 1))
n_classes = y_score.shape[not_average_axis]
score = np.zeros((n_classes,))
for c in range(n_classes):
y_true_c = y_true.take([c], axis=not_average_axis).ravel()
y_score_c = y_score.take([c], axis=not_average_axis).ravel()
score[c] = binary_metric(y_true_c, y_score_c, sample_weight=score_weight)
# Average the results
if average is not None:
if average_weight is not None:
# Scores with 0 weights are forced to be 0, preventing the average
# score from being affected by 0-weighted NaN elements.
average_weight = np.asarray(average_weight)
score[average_weight == 0] = 0
return float(np.average(score, weights=average_weight))
else:
return score
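# Illustrative sketch (not part of this file): macro averaging above is just
# the unweighted mean of the per-column binary scores. With made-up data:
#
#     import numpy as np
#     from sklearn.metrics import roc_auc_score
#     y_true = np.array([[1, 0], [0, 1], [1, 1], [0, 0]])
#     y_score = np.array([[0.9, 0.2], [0.1, 0.8], [0.8, 0.7], [0.3, 0.1]])
#     per_class = [roc_auc_score(y_true[:, c], y_score[:, c]) for c in range(2)]
#     assert np.mean(per_class) == _average_binary_score(
#         roc_auc_score, y_true, y_score, average="macro"
#     )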
def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average="macro"):
"""Average one-versus-one scores for multiclass classification.
Uses the binary metric for one-vs-one multiclass classification,
where the score is computed according to the Hand & Till (2001) algorithm.
Parameters
----------
binary_metric : callable
The binary metric function to use; it accepts the following inputs:
y_true_target : array, shape = [n_samples_target]
Some sub-array of y_true for a pair of classes designated
positive and negative in the one-vs-one scheme.
y_score_target : array, shape = [n_samples_target]
Scores corresponding to the probability estimates
of a sample belonging to the designated positive class label.
y_true : array-like of shape (n_samples,)
True multiclass labels.
y_score : array-like of shape (n_samples, n_classes)
Target scores corresponding to probability estimates of a sample
belonging to a particular class.
average : {'macro', 'weighted'}, default='macro'
Determines the type of averaging performed on the pairwise binary
metric scores:
``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account. Classes
are assumed to be uniformly distributed.
``'weighted'``:
Calculate metrics for each label, taking into account the
prevalence of the classes.
Returns
-------
score : float
Average of the pairwise binary metric scores.
"""
check_consistent_length(y_true, y_score)
y_true_unique = np.unique(y_true)
n_classes = y_true_unique.shape[0]
n_pairs = n_classes * (n_classes - 1) // 2
pair_scores = np.empty(n_pairs)
is_weighted = average == "weighted"
prevalence = np.empty(n_pairs) if is_weighted else None
# Compute scores treating a as positive class and b as negative class,
# then b as positive class and a as negative class
for ix, (a, b) in enumerate(combinations(y_true_unique, 2)):
a_mask = y_true == a
b_mask = y_true == b
ab_mask = np.logical_or(a_mask, b_mask)
if is_weighted:
prevalence[ix] = np.average(ab_mask)
a_true = a_mask[ab_mask]
b_true = b_mask[ab_mask]
a_true_score = binary_metric(a_true, y_score[ab_mask, a])
b_true_score = binary_metric(b_true, y_score[ab_mask, b])
pair_scores[ix] = (a_true_score + b_true_score) / 2
return np.average(pair_scores, weights=prevalence)
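# Illustrative sketch (not part of this file): the Hand & Till (2001)
# pairwise averaging above, spelled out with NumPy for the unweighted
# ('macro') case using roc_auc_score as the binary metric. It assumes the
# labels are the integers 0..n_classes-1 so that they index the columns of
# y_score:
#
#     import numpy as np
#     from itertools import combinations
#     from sklearn.metrics import roc_auc_score
#     pair_scores = []
#     for a, b in combinations(np.unique(y_true), 2):
#         mask = (y_true == a) | (y_true == b)
#         auc_a = roc_auc_score(y_true[mask] == a, y_score[mask, a])
#         auc_b = roc_auc_score(y_true[mask] == b, y_score[mask, b])
#         pair_scores.append((auc_a + auc_b) / 2)
#     macro_ovo = np.mean(pair_scores)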

@@ -0,0 +1,268 @@
from libc.math cimport sqrt, exp
from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t
cdef class DistanceMetric:
pass
######################################################################
# Inline distance functions
#
# We use these for the default (euclidean) case so that they can be
# inlined. This leads to faster computation for the most common case
cdef inline float64_t euclidean_dist64(
const float64_t* x1,
const float64_t* x2,
intp_t size,
) except -1 nogil:
cdef float64_t tmp, d=0
cdef intp_t j
for j in range(size):
tmp = <float64_t> (x1[j] - x2[j])
d += tmp * tmp
return sqrt(d)
cdef inline float64_t euclidean_rdist64(
const float64_t* x1,
const float64_t* x2,
intp_t size,
) except -1 nogil:
cdef float64_t tmp, d=0
cdef intp_t j
for j in range(size):
tmp = <float64_t>(x1[j] - x2[j])
d += tmp * tmp
return d
cdef inline float64_t euclidean_dist_to_rdist64(const float64_t dist) except -1 nogil:
return dist * dist
cdef inline float64_t euclidean_rdist_to_dist64(const float64_t dist) except -1 nogil:
return sqrt(dist)
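# Note (illustrative, not part of this file): "rdist" is a cheaper,
# rank-preserving surrogate of the true distance; for the Euclidean case it
# is simply the squared distance, as the two conversion helpers above
# encode. In NumPy terms:
#
#     import numpy as np
#     x1, x2 = np.random.rand(10), np.random.rand(10)
#     rdist = ((x1 - x2) ** 2).sum()          # euclidean_rdist64
#     dist = np.sqrt(rdist)                   # euclidean_rdist_to_dist64
#     assert np.isclose(dist * dist, rdist)   # euclidean_dist_to_rdist64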
######################################################################
# DistanceMetric64 base class
cdef class DistanceMetric64(DistanceMetric):
# The following attributes are required for a few of the subclasses.
# We must define them here so that Cython's limited polymorphism will work.
# Because we don't expect to instantiate a lot of these objects, the
# extra memory overhead of this setup should not be an issue.
cdef float64_t p
cdef const float64_t[::1] vec
cdef const float64_t[:, ::1] mat
cdef intp_t size
cdef object func
cdef object kwargs
cdef float64_t dist(
self,
const float64_t* x1,
const float64_t* x2,
intp_t size,
) except -1 nogil
cdef float64_t rdist(
self,
const float64_t* x1,
const float64_t* x2,
intp_t size,
) except -1 nogil
cdef float64_t dist_csr(
self,
const float64_t* x1_data,
const int32_t* x1_indices,
const float64_t* x2_data,
const int32_t* x2_indices,
const int32_t x1_start,
const int32_t x1_end,
const int32_t x2_start,
const int32_t x2_end,
const intp_t size,
) except -1 nogil
cdef float64_t rdist_csr(
self,
const float64_t* x1_data,
const int32_t* x1_indices,
const float64_t* x2_data,
const int32_t* x2_indices,
const int32_t x1_start,
const int32_t x1_end,
const int32_t x2_start,
const int32_t x2_end,
const intp_t size,
) except -1 nogil
cdef int pdist(
self,
const float64_t[:, ::1] X,
float64_t[:, ::1] D,
) except -1
cdef int cdist(
self,
const float64_t[:, ::1] X,
const float64_t[:, ::1] Y,
float64_t[:, ::1] D,
) except -1
cdef int pdist_csr(
self,
const float64_t* x1_data,
const int32_t[::1] x1_indices,
const int32_t[::1] x1_indptr,
const intp_t size,
float64_t[:, ::1] D,
) except -1 nogil
cdef int cdist_csr(
self,
const float64_t* x1_data,
const int32_t[::1] x1_indices,
const int32_t[::1] x1_indptr,
const float64_t* x2_data,
const int32_t[::1] x2_indices,
const int32_t[::1] x2_indptr,
const intp_t size,
float64_t[:, ::1] D,
) except -1 nogil
cdef float64_t _rdist_to_dist(self, float64_t rdist) except -1 nogil
cdef float64_t _dist_to_rdist(self, float64_t dist) except -1 nogil
######################################################################
# Inline distance functions
#
# We use these for the default (euclidean) case so that they can be
# inlined. This leads to faster computation for the most common case
cdef inline float64_t euclidean_dist32(
const float32_t* x1,
const float32_t* x2,
intp_t size,
) except -1 nogil:
cdef float64_t tmp, d=0
cdef intp_t j
for j in range(size):
tmp = <float64_t> (x1[j] - x2[j])
d += tmp * tmp
return sqrt(d)
cdef inline float64_t euclidean_rdist32(
const float32_t* x1,
const float32_t* x2,
intp_t size,
) except -1 nogil:
cdef float64_t tmp, d=0
cdef intp_t j
for j in range(size):
tmp = <float64_t>(x1[j] - x2[j])
d += tmp * tmp
return d
cdef inline float64_t euclidean_dist_to_rdist32(const float32_t dist) except -1 nogil:
return dist * dist
cdef inline float64_t euclidean_rdist_to_dist32(const float32_t dist) except -1 nogil:
return sqrt(dist)
######################################################################
# DistanceMetric32 base class
cdef class DistanceMetric32(DistanceMetric):
# The following attributes are required for a few of the subclasses.
# We must define them here so that Cython's limited polymorphism will work.
# Because we don't expect to instantiate a lot of these objects, the
# extra memory overhead of this setup should not be an issue.
cdef float64_t p
cdef const float64_t[::1] vec
cdef const float64_t[:, ::1] mat
cdef intp_t size
cdef object func
cdef object kwargs
cdef float32_t dist(
self,
const float32_t* x1,
const float32_t* x2,
intp_t size,
) except -1 nogil
cdef float32_t rdist(
self,
const float32_t* x1,
const float32_t* x2,
intp_t size,
) except -1 nogil
cdef float32_t dist_csr(
self,
const float32_t* x1_data,
const int32_t* x1_indices,
const float32_t* x2_data,
const int32_t* x2_indices,
const int32_t x1_start,
const int32_t x1_end,
const int32_t x2_start,
const int32_t x2_end,
const intp_t size,
) except -1 nogil
cdef float32_t rdist_csr(
self,
const float32_t* x1_data,
const int32_t* x1_indices,
const float32_t* x2_data,
const int32_t* x2_indices,
const int32_t x1_start,
const int32_t x1_end,
const int32_t x2_start,
const int32_t x2_end,
const intp_t size,
) except -1 nogil
cdef int pdist(
self,
const float32_t[:, ::1] X,
float32_t[:, ::1] D,
) except -1
cdef int cdist(
self,
const float32_t[:, ::1] X,
const float32_t[:, ::1] Y,
float32_t[:, ::1] D,
) except -1
cdef int pdist_csr(
self,
const float32_t* x1_data,
const int32_t[::1] x1_indices,
const int32_t[::1] x1_indptr,
const intp_t size,
float32_t[:, ::1] D,
) except -1 nogil
cdef int cdist_csr(
self,
const float32_t* x1_data,
const int32_t[::1] x1_indices,
const int32_t[::1] x1_indptr,
const float32_t* x2_data,
const int32_t[::1] x2_indices,
const int32_t[::1] x2_indptr,
const intp_t size,
float32_t[:, ::1] D,
) except -1 nogil
cdef float32_t _rdist_to_dist(self, float32_t rdist) except -1 nogil
cdef float32_t _dist_to_rdist(self, float32_t dist) except -1 nogil

@@ -0,0 +1,152 @@
{{py:
implementation_specific_values = [
# Values are the following ones:
#
# name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
('64', 'float64_t', 'np.float64'),
('32', 'float32_t', 'np.float32')
]
}}
from libc.math cimport sqrt, exp
from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t
cdef class DistanceMetric:
pass
{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
######################################################################
# Inline distance functions
#
# We use these for the default (euclidean) case so that they can be
# inlined. This leads to faster computation for the most common case
cdef inline float64_t euclidean_dist{{name_suffix}}(
const {{INPUT_DTYPE_t}}* x1,
const {{INPUT_DTYPE_t}}* x2,
intp_t size,
) except -1 nogil:
cdef float64_t tmp, d=0
cdef intp_t j
for j in range(size):
tmp = <float64_t> (x1[j] - x2[j])
d += tmp * tmp
return sqrt(d)
cdef inline float64_t euclidean_rdist{{name_suffix}}(
const {{INPUT_DTYPE_t}}* x1,
const {{INPUT_DTYPE_t}}* x2,
intp_t size,
) except -1 nogil:
cdef float64_t tmp, d=0
cdef intp_t j
for j in range(size):
tmp = <float64_t>(x1[j] - x2[j])
d += tmp * tmp
return d
cdef inline float64_t euclidean_dist_to_rdist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) except -1 nogil:
return dist * dist
cdef inline float64_t euclidean_rdist_to_dist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) except -1 nogil:
return sqrt(dist)
######################################################################
# DistanceMetric{{name_suffix}} base class
cdef class DistanceMetric{{name_suffix}}(DistanceMetric):
# The following attributes are required for a few of the subclasses.
# We must define them here so that Cython's limited polymorphism will work.
# Because we don't expect to instantiate a lot of these objects, the
# extra memory overhead of this setup should not be an issue.
cdef float64_t p
cdef const float64_t[::1] vec
cdef const float64_t[:, ::1] mat
cdef intp_t size
cdef object func
cdef object kwargs
cdef {{INPUT_DTYPE_t}} dist(
self,
const {{INPUT_DTYPE_t}}* x1,
const {{INPUT_DTYPE_t}}* x2,
intp_t size,
) except -1 nogil
cdef {{INPUT_DTYPE_t}} rdist(
self,
const {{INPUT_DTYPE_t}}* x1,
const {{INPUT_DTYPE_t}}* x2,
intp_t size,
) except -1 nogil
cdef {{INPUT_DTYPE_t}} dist_csr(
self,
const {{INPUT_DTYPE_t}}* x1_data,
const int32_t* x1_indices,
const {{INPUT_DTYPE_t}}* x2_data,
const int32_t* x2_indices,
const int32_t x1_start,
const int32_t x1_end,
const int32_t x2_start,
const int32_t x2_end,
const intp_t size,
) except -1 nogil
cdef {{INPUT_DTYPE_t}} rdist_csr(
self,
const {{INPUT_DTYPE_t}}* x1_data,
const int32_t* x1_indices,
const {{INPUT_DTYPE_t}}* x2_data,
const int32_t* x2_indices,
const int32_t x1_start,
const int32_t x1_end,
const int32_t x2_start,
const int32_t x2_end,
const intp_t size,
) except -1 nogil
cdef int pdist(
self,
const {{INPUT_DTYPE_t}}[:, ::1] X,
{{INPUT_DTYPE_t}}[:, ::1] D,
) except -1
cdef int cdist(
self,
const {{INPUT_DTYPE_t}}[:, ::1] X,
const {{INPUT_DTYPE_t}}[:, ::1] Y,
{{INPUT_DTYPE_t}}[:, ::1] D,
) except -1
cdef int pdist_csr(
self,
const {{INPUT_DTYPE_t}}* x1_data,
const int32_t[::1] x1_indices,
const int32_t[::1] x1_indptr,
const intp_t size,
{{INPUT_DTYPE_t}}[:, ::1] D,
) except -1 nogil
cdef int cdist_csr(
self,
const {{INPUT_DTYPE_t}}* x1_data,
const int32_t[::1] x1_indices,
const int32_t[::1] x1_indptr,
const {{INPUT_DTYPE_t}}* x2_data,
const int32_t[::1] x2_indices,
const int32_t[::1] x2_indptr,
const intp_t size,
{{INPUT_DTYPE_t}}[:, ::1] D,
) except -1 nogil
cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil
cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil
{{endfor}}

@@ -0,0 +1,112 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
#
# Pairwise Distances Reductions
# =============================
#
# Overview
# --------
#
# This module provides routines to compute pairwise distances between a set
# of row vectors of X and another set of row vectors of Y and apply a
# reduction on top. The canonical example is the brute-force computation
# of the top k nearest neighbors by leveraging the arg-k-min reduction.
#
# The reduction takes a matrix of pairwise distances between rows of X and Y
# as input and outputs an aggregate data-structure for each row of X. The
# aggregate values are typically smaller than the number of rows in Y, hence
# the term reduction.
#
# For computational reasons, the reduction are performed on the fly on chunks
# of rows of X and Y so as to keep intermediate data-structures in CPU cache
# and avoid unnecessary round trips of large distance arrays with the RAM
# that would otherwise severely degrade the speed by making the overall
# processing memory-bound.
#
# Finally, the routines follow a generic parallelization template to process
# chunks of data with OpenMP loops (via Cython prange), either on rows of X
# or rows of Y depending on their respective sizes.
#
#
# Dispatching to specialized implementations
# ------------------------------------------
#
# Dispatchers are meant to be used in the Python code. Under the hood, a
# dispatcher must only define the logic to choose, at runtime, the correct
# dtype-specialized :class:`BaseDistancesReductionDispatcher` implementation based
# on the dtypes of X and Y.
#
#
# High-level diagram
# ------------------
#
# Legend:
#
# A ---⊳ B: A inherits from B
# A ---x B: A dispatches to B
#
#
# (base dispatcher)
# BaseDistancesReductionDispatcher
# ∆
# |
# |
# +------------------+---------------+---------------+------------------+
# | | | |
# | (dispatcher) (dispatcher) |
# | ArgKmin RadiusNeighbors |
# | | | |
# | | | |
# | | (float{32,64} implem.) | |
# | | BaseDistancesReduction{32,64} | |
# | | ∆ | |
# (dispatcher) | | | (dispatcher)
# ArgKminClassMode | | | RadiusNeighborsClassMode
# | | +----------+----------+ | |
# | | | | | |
# | | | | | |
# | x | | x |
# | +-------⊳ ArgKmin{32,64} RadiusNeighbors{32,64} ⊲---+ |
# x | | ∆ ∆ | | x
# ArgKminClassMode{32,64} | | | | RadiusNeighborsClassMode{32,64}
# ===================================== Specializations ============================================
# | | | |
# | | | |
# x | | x
# EuclideanArgKmin{32,64} EuclideanRadiusNeighbors{32,64}
#
#
# For instance :class:`ArgKmin` dispatches to:
# - :class:`ArgKmin64` if X and Y are two `float64` array-likes
# - :class:`ArgKmin32` if X and Y are two `float32` array-likes
#
# In addition, if the metric parameter is set to "euclidean" or "sqeuclidean",
# then some direct subclass of `BaseDistancesReduction{32,64}` further dispatches
# to one of their subclass for euclidean-specialized implementation. For instance,
# :class:`ArgKmin64` dispatches to :class:`EuclideanArgKmin64`.
#
# Those Euclidean-specialized implementations rely on optimized implementations of
# a decomposition of the squared Euclidean distance matrix into a sum of three terms
# (see :class:`MiddleTermComputer{32,64}`).
#
from sklearn.metrics._pairwise_distances_reduction._dispatcher import (
ArgKmin,
ArgKminClassMode,
BaseDistancesReductionDispatcher,
RadiusNeighbors,
RadiusNeighborsClassMode,
sqeuclidean_row_norms,
)
__all__ = [
"ArgKmin",
"ArgKminClassMode",
"BaseDistancesReductionDispatcher",
"RadiusNeighbors",
"RadiusNeighborsClassMode",
"sqeuclidean_row_norms",
]
# ruff: noqa: E501
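# Illustrative usage sketch (not part of this file; this is a private API
# that may change without notice). `ArgKmin.compute` dispatches on the dtype
# of X and Y as described above; the arrays are made up for the example:
#
#     import numpy as np
#     from sklearn.metrics._pairwise_distances_reduction import ArgKmin
#     X = np.random.rand(100, 10)    # float64 -> dispatches to ArgKmin64
#     Y = np.random.rand(1000, 10)
#     distances, indices = ArgKmin.compute(
#         X, Y, k=5, metric="euclidean", return_distance=True
#     )
#     # distances.shape == indices.shape == (100, 5)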

@@ -0,0 +1,31 @@
from sklearn.utils._typedefs cimport intp_t, float64_t
{{for name_suffix in ['64', '32']}}
from sklearn.metrics._pairwise_distances_reduction._base cimport BaseDistancesReduction{{name_suffix}}
from sklearn.metrics._pairwise_distances_reduction._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
"""float{{name_suffix}} implementation of the ArgKmin."""
cdef:
intp_t k
intp_t[:, ::1] argkmin_indices
float64_t[:, ::1] argkmin_distances
# Used as array of pointers to private datastructures used in threads.
float64_t ** heaps_r_distances_chunks
intp_t ** heaps_indices_chunks
cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
"""EuclideanDistance-specialisation of ArgKmin{{name_suffix}}."""
cdef:
MiddleTermComputer{{name_suffix}} middle_term_computer
const float64_t[::1] X_norm_squared
const float64_t[::1] Y_norm_squared
bint use_squared_distances
{{endfor}}
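# Illustrative sketch (not part of this file): the bounded heaps declared
# above, emulated for one query row with Python's heapq. A size-k max-heap
# (via negated distances) retains the k smallest distances seen so far,
# which is what `heap_push` maintains in C. `argkmin_1d` is a hypothetical
# helper name:
#
#     import heapq
#     def argkmin_1d(dists, k):
#         heap = []  # entries are (-distance, index)
#         for idx, d in enumerate(dists):
#             if len(heap) < k:
#                 heapq.heappush(heap, (-d, idx))
#             elif -heap[0][0] > d:
#                 heapq.heapreplace(heap, (-d, idx))
#         return sorted((-neg_d, idx) for neg_d, idx in heap)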

@@ -0,0 +1,512 @@
from libc.stdlib cimport free, malloc
from libc.float cimport DBL_MAX
from cython cimport final
from cython.parallel cimport parallel, prange
from sklearn.utils._heap cimport heap_push
from sklearn.utils._sorting cimport simultaneous_sort
from sklearn.utils._typedefs cimport intp_t, float64_t
import numpy as np
import warnings
from numbers import Integral
from scipy.sparse import issparse
from sklearn.utils import check_array, check_scalar
from sklearn.utils.fixes import _in_unstable_openblas_configuration
from sklearn.utils.parallel import _get_threadpool_controller
{{for name_suffix in ['64', '32']}}
from sklearn.metrics._pairwise_distances_reduction._base cimport (
BaseDistancesReduction{{name_suffix}},
_sqeuclidean_row_norms{{name_suffix}},
)
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
from sklearn.metrics._pairwise_distances_reduction._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
"""float{{name_suffix}} implementation of the ArgKmin."""
@classmethod
def compute(
cls,
X,
Y,
intp_t k,
metric="euclidean",
chunk_size=None,
dict metric_kwargs=None,
str strategy=None,
bint return_distance=False,
):
"""Compute the argkmin reduction.
This classmethod is responsible for introspecting the arguments
values to dispatch to the most appropriate implementation of
:class:`ArgKmin{{name_suffix}}`.
This allows decoupling the API entirely from the implementation details
whilst maintaining RAII: all temporarily allocated datastructures necessary
for the concrete implementation are therefore freed when this classmethod
returns.
No instance should be created directly outside of this class method.
"""
# Limit the number of threads in second level of nested parallelism for BLAS
# to avoid threads over-subscription (in DOT or GEMM for instance).
with _get_threadpool_controller().limit(limits=1, user_api='blas'):
if metric in ("euclidean", "sqeuclidean"):
# Specialized implementation of ArgKmin for the Euclidean distance
# for the dense-dense and sparse-sparse cases.
# This implementation computes the distances by chunk using
# a decomposition of the squared Euclidean distance.
# This specialisation has an improved arithmetic intensity for both
# the dense and sparse settings, allowing in most cases speed-ups of
# several orders of magnitude compared to the generic ArgKmin
# implementation.
# Note that squared norms of X and Y are precomputed in the
# constructor of this class by issuing BLAS calls that may use
# multithreading (depending on the BLAS implementation), hence calling
# the constructor needs to be protected under the threadpool_limits
# context, along with the main calls to _parallel_on_Y and
# _parallel_on_X.
# For more information see MiddleTermComputer.
use_squared_distances = metric == "sqeuclidean"
pda = EuclideanArgKmin{{name_suffix}}(
X=X, Y=Y, k=k,
use_squared_distances=use_squared_distances,
chunk_size=chunk_size,
strategy=strategy,
metric_kwargs=metric_kwargs,
)
else:
# Fall back on a generic implementation that handles most scipy
# metrics by computing the distances between 2 vectors at a time.
pda = ArgKmin{{name_suffix}}(
datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
k=k,
chunk_size=chunk_size,
strategy=strategy,
)
if pda.execute_in_parallel_on_Y:
pda._parallel_on_Y()
else:
pda._parallel_on_X()
return pda._finalize_results(return_distance)
def __init__(
self,
DatasetsPair{{name_suffix}} datasets_pair,
chunk_size=None,
strategy=None,
intp_t k=1,
):
super().__init__(
datasets_pair=datasets_pair,
chunk_size=chunk_size,
strategy=strategy,
)
self.k = check_scalar(k, "k", Integral, min_val=1)
# Allocating pointers to datastructures but not the datastructures themselves.
# There are as many pointers as effective threads.
#
# For the sake of explicitness:
# - when parallelizing on X, the pointers of those heaps are referencing
# (with proper offsets) addresses of the two main heaps (see below)
# - when parallelizing on Y, the pointers of those heaps are referencing
# small heaps which are thread-wise-allocated and whose content will be
# merged with the main heaps'.
self.heaps_r_distances_chunks = <float64_t **> malloc(
sizeof(float64_t *) * self.chunks_n_threads
)
self.heaps_indices_chunks = <intp_t **> malloc(
sizeof(intp_t *) * self.chunks_n_threads
)
# Main heaps which will be returned as results by `ArgKmin{{name_suffix}}.compute`.
self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=np.intp)
self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=np.float64)
def __dealloc__(self):
if self.heaps_indices_chunks is not NULL:
free(self.heaps_indices_chunks)
if self.heaps_r_distances_chunks is not NULL:
free(self.heaps_r_distances_chunks)
cdef void _compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
cdef:
intp_t i, j
intp_t n_samples_X = X_end - X_start
intp_t n_samples_Y = Y_end - Y_start
float64_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
intp_t *heaps_indices = self.heaps_indices_chunks[thread_num]
# Pushing the distances and their associated indices on a heap
# which by construction will keep track of the argkmin.
for i in range(n_samples_X):
for j in range(n_samples_Y):
heap_push(
values=heaps_r_distances + i * self.k,
indices=heaps_indices + i * self.k,
size=self.k,
val=self.datasets_pair.surrogate_dist(X_start + i, Y_start + j),
val_idx=Y_start + j,
)
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
# As this strategy is embarrassingly parallel, we can set each
# thread's heaps pointer to the proper position on the main heaps.
self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0]
self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0]
cdef void _parallel_on_X_prange_iter_finalize(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
cdef:
intp_t idx
# Sorting the main heaps portion associated to `X[X_start:X_end]`
# in ascending order w.r.t the distances.
for idx in range(X_end - X_start):
simultaneous_sort(
self.heaps_r_distances_chunks[thread_num] + idx * self.k,
self.heaps_indices_chunks[thread_num] + idx * self.k,
self.k
)
cdef void _parallel_on_Y_init(
self,
) noexcept nogil:
cdef:
# Maximum number of scalar elements (the last chunks can be smaller)
intp_t heaps_size = self.X_n_samples_chunk * self.k
intp_t thread_num
# The allocation is done in parallel for data locality purposes: this way
# the heaps used in each thread are allocated in pages which are closer
# to the CPU core used by the thread.
# See comments about First Touch Placement Policy:
# https://www.openmp.org/wp-content/uploads/openmp-webinar-vanderPas-20210318.pdf #noqa
for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True,
num_threads=self.chunks_n_threads):
# As chunks of X are shared across threads, their heaps would be
# subject to race conditions. To solve this, each thread has its own
# heaps, which are then synchronised back into the main ones.
self.heaps_r_distances_chunks[thread_num] = <float64_t *> malloc(
heaps_size * sizeof(float64_t)
)
self.heaps_indices_chunks[thread_num] = <intp_t *> malloc(
heaps_size * sizeof(intp_t)
)
cdef void _parallel_on_Y_parallel_init(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
# Initialising heaps (memset can't be used here)
for idx in range(self.X_n_samples_chunk * self.k):
self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX
self.heaps_indices_chunks[thread_num][idx] = -1
@final
cdef void _parallel_on_Y_synchronize(
self,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
cdef:
intp_t idx, jdx, thread_num
with nogil, parallel(num_threads=self.effective_n_threads):
# Synchronising the thread heaps with the main heaps.
# This is done in parallel sample-wise (no need for locks).
#
# This might break each thread's data locality as each heap which
# was allocated in a thread is now being used in several threads.
#
# Still, this parallel pattern has shown to be efficient in practice.
for idx in prange(X_end - X_start, schedule="static"):
for thread_num in range(self.chunks_n_threads):
for jdx in range(self.k):
heap_push(
values=&self.argkmin_distances[X_start + idx, 0],
indices=&self.argkmin_indices[X_start + idx, 0],
size=self.k,
val=self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx],
val_idx=self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
)
cdef void _parallel_on_Y_finalize(
self,
) noexcept nogil:
cdef:
intp_t idx, thread_num
with nogil, parallel(num_threads=self.chunks_n_threads):
# Deallocating temporary datastructures
for thread_num in prange(self.chunks_n_threads, schedule='static'):
free(self.heaps_r_distances_chunks[thread_num])
free(self.heaps_indices_chunks[thread_num])
# Sorting the main heaps in ascending order w.r.t. the distances.
# This is done in parallel sample-wise (no need for locks).
for idx in prange(self.n_samples_X, schedule='static'):
simultaneous_sort(
&self.argkmin_distances[idx, 0],
&self.argkmin_indices[idx, 0],
self.k,
)
return
cdef void compute_exact_distances(self) noexcept nogil:
cdef:
intp_t i, j
float64_t[:, ::1] distances = self.argkmin_distances
for i in prange(self.n_samples_X, schedule='static', nogil=True,
num_threads=self.effective_n_threads):
for j in range(self.k):
distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist(
# Guard against potential -0., causing nan production.
max(distances[i, j], 0.)
)
def _finalize_results(self, bint return_distance=False):
if return_distance:
# We need to recompute distances because we relied on
# surrogate distances for the reduction.
self.compute_exact_distances()
# Values are returned identically to the way `KNeighborsMixin.kneighbors`
# returns values. This is counter-intuitive but avoids complex
# adaptations wherever `ArgKmin.compute` is called.
return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)
return np.asarray(self.argkmin_indices)
cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
"""EuclideanDistance-specialisation of ArgKmin{{name_suffix}}."""
@classmethod
def is_usable_for(cls, X, Y, metric) -> bool:
return (ArgKmin{{name_suffix}}.is_usable_for(X, Y, metric) and
not _in_unstable_openblas_configuration())
def __init__(
self,
X,
Y,
intp_t k,
bint use_squared_distances=False,
chunk_size=None,
strategy=None,
metric_kwargs=None,
):
if (
isinstance(metric_kwargs, dict) and
(metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"})
):
warnings.warn(
f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
f"usable for this case (EuclideanArgKmin64) and will be ignored.",
UserWarning,
stacklevel=3,
)
super().__init__(
# The datasets pair here is used for exact distances computations
datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric="euclidean"),
chunk_size=chunk_size,
strategy=strategy,
k=k,
)
cdef:
intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
X,
Y,
self.effective_n_threads,
self.chunks_n_threads,
dist_middle_terms_chunks_size,
n_features=X.shape[1],
chunk_size=self.chunk_size,
)
if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
self.Y_norm_squared = check_array(
metric_kwargs.pop("Y_norm_squared"),
ensure_2d=False,
input_name="Y_norm_squared",
dtype=np.float64,
)
else:
self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(
Y,
self.effective_n_threads,
)
if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
self.X_norm_squared = check_array(
metric_kwargs.pop("X_norm_squared"),
ensure_2d=False,
input_name="X_norm_squared",
dtype=np.float64,
)
else:
# Do not recompute norms if datasets are identical.
self.X_norm_squared = (
self.Y_norm_squared if X is Y else
_sqeuclidean_row_norms{{name_suffix}}(
X,
self.effective_n_threads,
)
)
self.use_squared_distances = use_squared_distances
@final
cdef void compute_exact_distances(self) noexcept nogil:
if not self.use_squared_distances:
ArgKmin{{name_suffix}}.compute_exact_distances(self)
@final
cdef void _parallel_on_X_parallel_init(
self,
intp_t thread_num,
) noexcept nogil:
ArgKmin{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num)
self.middle_term_computer._parallel_on_X_parallel_init(thread_num)
@final
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
ArgKmin{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
@final
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
ArgKmin{{name_suffix}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
X_start, X_end,
Y_start, Y_end,
thread_num,
)
self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
X_start, X_end, Y_start, Y_end, thread_num,
)
@final
cdef void _parallel_on_Y_init(
self,
) noexcept nogil:
ArgKmin{{name_suffix}}._parallel_on_Y_init(self)
self.middle_term_computer._parallel_on_Y_init()
@final
cdef void _parallel_on_Y_parallel_init(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
ArgKmin{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
@final
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
ArgKmin{{name_suffix}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
X_start, X_end,
Y_start, Y_end,
thread_num,
)
self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
X_start, X_end, Y_start, Y_end, thread_num
)
@final
cdef void _compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
cdef:
intp_t i, j
float64_t sqeuclidean_dist_i_j
intp_t n_X = X_end - X_start
intp_t n_Y = Y_end - Y_start
float64_t * dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms(
X_start, X_end, Y_start, Y_end, thread_num
)
float64_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
intp_t * heaps_indices = self.heaps_indices_chunks[thread_num]
# Pushing the distances and their associated indices onto heaps
# which keep track of the argkmin.
for i in range(n_X):
for j in range(n_Y):
sqeuclidean_dist_i_j = (
self.X_norm_squared[i + X_start] +
dist_middle_terms[i * n_Y + j] +
self.Y_norm_squared[j + Y_start]
)
# Catastrophic cancellation might cause -0. to be present,
# e.g. when computing d(x_i, y_i) when X is Y.
sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j)
heap_push(
values=heaps_r_distances + i * self.k,
indices=heaps_indices + i * self.k,
size=self.k,
val=sqeuclidean_dist_i_j,
val_idx=j + Y_start,
)
{{endfor}}
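# Illustrative sketch (not part of this file): the decomposition used by
# `_compute_and_reduce_distances_on_chunks` above, written with NumPy. The
# squared Euclidean distance matrix splits into two precomputed norm terms
# plus a GEMM-computed middle term, and the clamp at 0. mirrors the guard
# against catastrophic cancellation:
#
#     import numpy as np
#     X, Y = np.random.rand(5, 3), np.random.rand(7, 3)
#     X_norm_squared = (X ** 2).sum(axis=1)
#     Y_norm_squared = (Y ** 2).sum(axis=1)
#     middle_terms = -2.0 * (X @ Y.T)    # cf. MiddleTermComputer
#     sq_dists = X_norm_squared[:, None] + middle_terms + Y_norm_squared[None, :]
#     sq_dists = np.maximum(sq_dists, 0.0)
#     assert np.allclose(
#         np.sqrt(sq_dists),
#         np.linalg.norm(X[:, None, :] - Y[None, :, :], axis=-1),
#     )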

@@ -0,0 +1,182 @@
from cython cimport floating, integral
from cython.parallel cimport parallel, prange
from libcpp.map cimport map as cpp_map, pair as cpp_pair
from libc.stdlib cimport free
from sklearn.utils._typedefs cimport intp_t, float64_t
from sklearn.utils.parallel import _get_threadpool_controller
import numpy as np
from scipy.sparse import issparse
from sklearn.metrics._pairwise_distances_reduction._classmode cimport WeightingStrategy
{{for name_suffix in ["32", "64"]}}
from sklearn.metrics._pairwise_distances_reduction._argkmin cimport ArgKmin{{name_suffix}}
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}):
"""
{{name_suffix}}bit implementation of ArgKminClassMode.
"""
cdef:
const intp_t[:] Y_labels
const intp_t[:] unique_Y_labels
float64_t[:, :] class_scores
cpp_map[intp_t, intp_t] labels_to_index
WeightingStrategy weight_type
@classmethod
def compute(
cls,
X,
Y,
intp_t k,
weights,
Y_labels,
unique_Y_labels,
str metric="euclidean",
chunk_size=None,
dict metric_kwargs=None,
str strategy=None,
):
"""Compute the argkmin reduction with Y_labels.
This classmethod is responsible for introspecting the arguments
values to dispatch to the most appropriate implementation of
:class:`ArgKminClassMode{{name_suffix}}`.
This allows decoupling the API entirely from the implementation details
whilst maintaining RAII: all temporarily allocated datastructures necessary
for the concrete implementation are therefore freed when this classmethod
returns.
No instance should be created directly outside of this class method.
"""
# Use a generic implementation that handles most scipy
# metrics by computing the distances between 2 vectors at a time.
pda = ArgKminClassMode{{name_suffix}}(
datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
k=k,
chunk_size=chunk_size,
strategy=strategy,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
)
# Limit the number of threads in second level of nested parallelism for BLAS
# to avoid threads over-subscription (in GEMM for instance).
with _get_threadpool_controller().limit(limits=1, user_api="blas"):
if pda.execute_in_parallel_on_Y:
pda._parallel_on_Y()
else:
pda._parallel_on_X()
return pda._finalize_results()
def __init__(
self,
DatasetsPair{{name_suffix}} datasets_pair,
const intp_t[:] Y_labels,
const intp_t[:] unique_Y_labels,
chunk_size=None,
strategy=None,
intp_t k=1,
weights=None,
):
super().__init__(
datasets_pair=datasets_pair,
chunk_size=chunk_size,
strategy=strategy,
k=k,
)
if weights == "uniform":
self.weight_type = WeightingStrategy.uniform
elif weights == "distance":
self.weight_type = WeightingStrategy.distance
else:
self.weight_type = WeightingStrategy.callable
self.Y_labels = Y_labels
self.unique_Y_labels = unique_Y_labels
cdef intp_t idx, neighbor_class_idx
# Map from set of unique labels to their indices in `class_scores`
# Buffer used in building a histogram for one-pass weighted mode
self.class_scores = np.zeros(
(self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64,
)
def _finalize_results(self):
probabilities = np.asarray(self.class_scores)
probabilities /= probabilities.sum(axis=1, keepdims=True)
return probabilities
cdef inline void weighted_histogram_mode(
self,
intp_t sample_index,
intp_t* indices,
float64_t* distances,
) noexcept nogil:
cdef:
intp_t neighbor_idx, neighbor_class_idx, label_index, multi_output_index
float64_t score_incr = 1
# TODO: Implement other WeightingStrategy values
bint use_distance_weighting = (
self.weight_type == WeightingStrategy.distance
)
# Iterate through the sample k-nearest neighbours
for neighbor_rank in range(self.k):
# Absolute index of the neighbor_rank-th nearest neighbor,
# in range [0, n_samples_Y).
# TODO: inspect whether it is worth permuting this condition
# and the for-loop above for improved branching.
if use_distance_weighting:
score_incr = 1 / distances[neighbor_rank]
neighbor_idx = indices[neighbor_rank]
neighbor_class_idx = self.Y_labels[neighbor_idx]
self.class_scores[sample_index][neighbor_class_idx] += score_incr
return
cdef void _parallel_on_X_prange_iter_finalize(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
cdef:
intp_t idx, sample_index
for idx in range(X_end - X_start):
# One-pass top-one weighted mode
# Compute the absolute index in [0, n_samples_X)
sample_index = X_start + idx
self.weighted_histogram_mode(
sample_index,
&self.heaps_indices_chunks[thread_num][idx * self.k],
&self.heaps_r_distances_chunks[thread_num][idx * self.k],
)
return
cdef void _parallel_on_Y_finalize(
self,
) noexcept nogil:
cdef:
intp_t sample_index, thread_num
with nogil, parallel(num_threads=self.chunks_n_threads):
# Deallocating temporary datastructures
for thread_num in prange(self.chunks_n_threads, schedule='static'):
free(self.heaps_r_distances_chunks[thread_num])
free(self.heaps_indices_chunks[thread_num])
for sample_index in prange(self.n_samples_X, schedule='static'):
self.weighted_histogram_mode(
sample_index,
&self.argkmin_indices[sample_index][0],
&self.argkmin_distances[sample_index][0],
)
return
{{endfor}}
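# Illustrative sketch (not part of this file): the per-query weighted vote
# implemented by `weighted_histogram_mode`, written with NumPy for a single
# query point; all array values are made up for the example:
#
#     import numpy as np
#     Y_labels = np.array([0, 1, 1, 2, 0])    # class of each training sample
#     indices = np.array([1, 2, 4])           # its k=3 nearest neighbors
#     distances = np.array([0.5, 1.0, 2.0])   # the corresponding distances
#     class_scores = np.zeros(3)              # one slot per class
#     for rank in range(len(indices)):
#         incr = 1.0 / distances[rank]        # weights="distance" (1.0 if "uniform")
#         class_scores[Y_labels[indices[rank]]] += incr
#     probabilities = class_scores / class_scores.sum()  # cf. _finalize_results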

@@ -0,0 +1,135 @@
from cython cimport final
from sklearn.utils._typedefs cimport intp_t, float64_t
{{for name_suffix in ['64', '32']}}
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
cpdef float64_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
X,
intp_t num_threads,
)
cdef class BaseDistancesReduction{{name_suffix}}:
"""
Base float{{name_suffix}} implementation template of the pairwise-distances
reduction backends.
Implementations inherit from this template and may override the several
defined hooks as needed in order to easily extend functionality with
minimal redundant code.
"""
cdef:
readonly DatasetsPair{{name_suffix}} datasets_pair
# The number of threads that can be used is stored in effective_n_threads.
#
# The number of threads to use in the parallelization strategy
# (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads:
# for small datasets, fewer threads might be needed to loop over pairs of chunks.
#
# Hence, the number of threads that _will_ be used for looping over chunks
# is stored in chunks_n_threads, so that no more threads than needed are used.
#
# Thus, an invariant is:
#
# chunks_n_threads <= effective_n_threads
#
intp_t effective_n_threads
intp_t chunks_n_threads
intp_t n_samples_chunk, chunk_size
intp_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
intp_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk
bint execute_in_parallel_on_Y
@final
cdef void _parallel_on_X(self) noexcept nogil
@final
cdef void _parallel_on_Y(self) noexcept nogil
# Placeholder methods which have to be implemented
cdef void _compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil
# Placeholder methods which can be implemented
cdef void compute_exact_distances(self) noexcept nogil
cdef void _parallel_on_X_parallel_init(
self,
intp_t thread_num,
) noexcept nogil
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil
cdef void _parallel_on_X_prange_iter_finalize(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil
cdef void _parallel_on_X_parallel_finalize(
self,
intp_t thread_num
) noexcept nogil
cdef void _parallel_on_Y_init(
self,
) noexcept nogil
cdef void _parallel_on_Y_parallel_init(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil
cdef void _parallel_on_Y_synchronize(
self,
intp_t X_start,
intp_t X_end,
) noexcept nogil
cdef void _parallel_on_Y_finalize(
self,
) noexcept nogil
{{endfor}}
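# Illustrative sketch (not part of this file): the chunking arithmetic
# behind the attributes declared above, with made-up sizes:
#
#     n_samples_X, chunk_size = 1029, 256
#     X_n_samples_chunk = min(n_samples_X, chunk_size)         # 256
#     X_n_full_chunks = n_samples_X // X_n_samples_chunk       # 4
#     remainder = n_samples_X % X_n_samples_chunk              # 5
#     X_n_chunks = X_n_full_chunks + (remainder != 0)          # 5
#     X_n_samples_last_chunk = remainder or X_n_samples_chunk  # 5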

@@ -0,0 +1,505 @@
from cython cimport final
from cython.operator cimport dereference as deref
from cython.parallel cimport parallel, prange
from libcpp.vector cimport vector
from numbers import Integral
import numpy as np
from scipy.sparse import issparse
from sklearn.utils._cython_blas cimport _dot
from sklearn.utils._openmp_helpers cimport omp_get_thread_num
from sklearn.utils._typedefs cimport intp_t, float32_t, float64_t, int32_t
from sklearn import get_config
from sklearn.utils import check_scalar
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
#####################
cdef float64_t[::1] _sqeuclidean_row_norms64_dense(
const float64_t[:, ::1] X,
intp_t num_threads,
):
"""Compute the squared euclidean norm of the rows of X in parallel.
This is faster than using np.einsum("ij, ij->i") even when using a single thread.
"""
cdef:
# Casting for X to remove the const qualifier is needed because APIs
# exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
# const qualifier.
# See: https://github.com/scipy/scipy/issues/14262
float64_t * X_ptr = <float64_t *> &X[0, 0]
intp_t idx = 0
intp_t n = X.shape[0]
intp_t d = X.shape[1]
float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64)
for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1)
return squared_row_norms
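# Illustrative sketch (not part of this file): the single-threaded NumPy
# equivalent of the BLAS-based computation above:
#
#     import numpy as np
#     X = np.random.rand(4, 3)
#     assert np.allclose(np.einsum("ij,ij->i", X, X), (X ** 2).sum(axis=1))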
cdef float64_t[::1] _sqeuclidean_row_norms32_dense(
const float32_t[:, ::1] X,
intp_t num_threads,
):
"""Compute the squared euclidean norm of the rows of X in parallel.
This is faster than using np.einsum("ij, ij->i") even when using a single thread.
"""
cdef:
# Casting for X to remove the const qualifier is needed because APIs
# exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
# const qualifier.
# See: https://github.com/scipy/scipy/issues/14262
float32_t * X_ptr = <float32_t *> &X[0, 0]
intp_t i = 0, j = 0
intp_t thread_num
intp_t n = X.shape[0]
intp_t d = X.shape[1]
float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64)
# To upcast the i-th row of X from float32 to float64
vector[vector[float64_t]] X_i_upcast = vector[vector[float64_t]](
num_threads, vector[float64_t](d)
)
with nogil, parallel(num_threads=num_threads):
thread_num = omp_get_thread_num()
for i in prange(n, schedule='static'):
# Upcasting the i-th row of X from float32 to float64
for j in range(d):
X_i_upcast[thread_num][j] = <float64_t> deref(X_ptr + i * d + j)
squared_row_norms[i] = _dot(
d, X_i_upcast[thread_num].data(), 1,
X_i_upcast[thread_num].data(), 1,
)
return squared_row_norms
cdef float64_t[::1] _sqeuclidean_row_norms64_sparse(
const float64_t[:] X_data,
const int32_t[:] X_indptr,
intp_t num_threads,
):
cdef:
intp_t n = X_indptr.shape[0] - 1
int32_t X_i_ptr, idx = 0
float64_t[::1] squared_row_norms = np.zeros(n, dtype=np.float64)
for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
for X_i_ptr in range(X_indptr[idx], X_indptr[idx+1]):
squared_row_norms[idx] += X_data[X_i_ptr] * X_data[X_i_ptr]
return squared_row_norms
{{for name_suffix in ["64", "32"]}}
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
cpdef float64_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
X,
intp_t num_threads,
):
if issparse(X):
# TODO: remove this instruction which is a cast in the float32 case
# by moving squared row norms computations in MiddleTermComputer.
X_data = np.asarray(X.data, dtype=np.float64)
X_indptr = np.asarray(X.indptr, dtype=np.int32)
return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads)
else:
return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads)
cdef class BaseDistancesReduction{{name_suffix}}:
"""
Base float{{name_suffix}} implementation template of the pairwise-distances
reduction backends.
Implementations inherit from this template and may override the several
defined hooks as needed in order to easily extend functionality with
minimal redundant code.
"""
def __init__(
self,
DatasetsPair{{name_suffix}} datasets_pair,
chunk_size=None,
strategy=None,
):
cdef:
intp_t X_n_full_chunks, Y_n_full_chunks
if chunk_size is None:
chunk_size = get_config().get("pairwise_dist_chunk_size", 256)
self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20)
self.effective_n_threads = _openmp_effective_n_threads()
self.datasets_pair = datasets_pair
self.n_samples_X = datasets_pair.n_samples_X()
self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size)
X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk
X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk
self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0)
if X_n_samples_remainder != 0:
self.X_n_samples_last_chunk = X_n_samples_remainder
else:
self.X_n_samples_last_chunk = self.X_n_samples_chunk
self.n_samples_Y = datasets_pair.n_samples_Y()
self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size)
Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk
Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk
self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0)
if Y_n_samples_remainder != 0:
self.Y_n_samples_last_chunk = Y_n_samples_remainder
else:
self.Y_n_samples_last_chunk = self.Y_n_samples_chunk
if strategy is None:
strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto')
if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'):
raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', "
f"or 'auto', but currently strategy='{self.strategy}'.")
if strategy == 'auto':
# This is a simple heuristic whose constant for the
# comparison has been chosen based on experiments.
# parallel_on_X has less synchronization overhead than
# parallel_on_Y and should therefore be used whenever
# n_samples_X is large enough to not starve any of the
# available hardware threads.
if self.n_samples_Y < self.n_samples_X:
# No point to even consider parallelizing on Y in this case. This
# is particularly important on machines with a large number of
# hardware threads.
strategy = 'parallel_on_X'
elif 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X:
# If Y is larger than X, but X is still large enough to allow for
# parallelism, we might still want to favor parallelizing on X.
strategy = 'parallel_on_X'
else:
strategy = 'parallel_on_Y'
self.execute_in_parallel_on_Y = strategy == "parallel_on_Y"
# Not using less, not using more.
self.chunks_n_threads = min(
self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks,
self.effective_n_threads,
)
@final
cdef void _parallel_on_X(self) noexcept nogil:
"""Perform computation and reduction in parallel on chunks of X.
This strategy dispatches tasks statically on threads. Each task
processes exactly one chunk of X, computing and reducing the
distance matrices between vectors of this chunk and vectors of all
chunks of Y, one chunk of Y at a time.
This strategy is embarrassingly parallel with no intermediate data
structures synchronization at all.
Private datastructures are modified internally by threads.
Private template methods can be implemented on subclasses to
interact with those datastructures at various stages.
"""
cdef:
intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx
intp_t thread_num
with nogil, parallel(num_threads=self.chunks_n_threads):
thread_num = omp_get_thread_num()
# Allocating thread datastructures
self._parallel_on_X_parallel_init(thread_num)
for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
X_start = X_chunk_idx * self.X_n_samples_chunk
if X_chunk_idx == self.X_n_chunks - 1:
X_end = X_start + self.X_n_samples_last_chunk
else:
X_end = X_start + self.X_n_samples_chunk
# Reinitializing thread datastructures for the new X chunk
self._parallel_on_X_init_chunk(thread_num, X_start, X_end)
for Y_chunk_idx in range(self.Y_n_chunks):
Y_start = Y_chunk_idx * self.Y_n_samples_chunk
if Y_chunk_idx == self.Y_n_chunks - 1:
Y_end = Y_start + self.Y_n_samples_last_chunk
else:
Y_end = Y_start + self.Y_n_samples_chunk
self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
X_start, X_end,
Y_start, Y_end,
thread_num,
)
self._compute_and_reduce_distances_on_chunks(
X_start, X_end,
Y_start, Y_end,
thread_num,
)
# Adjusting thread datastructures on the full pass on Y
self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end)
# end: for X_chunk_idx
# Deallocating thread datastructures
self._parallel_on_X_parallel_finalize(thread_num)
# end: with nogil, parallel
return
@final
cdef void _parallel_on_Y(self) noexcept nogil:
"""Perform computation and reduction in parallel on chunks of Y.
This strategy is a sequence of embarrassingly parallel subtasks:
chunks of X are iterated over sequentially, and for each chunk of X,
tasks are dispatched statically on threads. Each task processes one
and only one chunk of Y, computing and reducing the distance matrices
between the vectors of the chunk of X and the vectors of Y.
It comes with lock-free and parallelized intermediate data structures
that synchronize at each iteration of the sequential outer loop on X
chunks.
Private datastructures are modified internally by threads.
Private template methods can be implemented in subclasses to
interact with those datastructures at various stages.
"""
cdef:
intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx
intp_t thread_num
# Allocating datastructures shared by all threads
self._parallel_on_Y_init()
for X_chunk_idx in range(self.X_n_chunks):
X_start = X_chunk_idx * self.X_n_samples_chunk
if X_chunk_idx == self.X_n_chunks - 1:
X_end = X_start + self.X_n_samples_last_chunk
else:
X_end = X_start + self.X_n_samples_chunk
with nogil, parallel(num_threads=self.chunks_n_threads):
thread_num = omp_get_thread_num()
# Initializing datastructures used in this thread
self._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
Y_start = Y_chunk_idx * self.Y_n_samples_chunk
if Y_chunk_idx == self.Y_n_chunks - 1:
Y_end = Y_start + self.Y_n_samples_last_chunk
else:
Y_end = Y_start + self.Y_n_samples_chunk
self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
X_start, X_end,
Y_start, Y_end,
thread_num,
)
self._compute_and_reduce_distances_on_chunks(
X_start, X_end,
Y_start, Y_end,
thread_num,
)
# end: prange
# end: with nogil, parallel
# Synchronizing the thread datastructures with the main ones
self._parallel_on_Y_synchronize(X_start, X_end)
# end: for X_chunk_idx
# Deallocating temporary datastructures and adjusting main datastructures
self._parallel_on_Y_finalize()
return
# Placeholder methods which have to be implemented
cdef void _compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
"""Compute the pairwise distances on two chunks of X and Y and reduce them.
This is THE core computational method of BaseDistancesReduction{{name_suffix}}.
This must be implemented in subclasses independently of the
parallelization strategies.
"""
return
def _finalize_results(self, bint return_distance):
"""Callback adapting datastructures before returning results.
This must be implemented in subclasses.
"""
return None
# Placeholder methods which can be implemented
cdef void compute_exact_distances(self) noexcept nogil:
"""Convert rank-preserving distances to exact distances or recompute them."""
return
cdef void _parallel_on_X_parallel_init(
self,
intp_t thread_num,
) noexcept nogil:
"""Allocate datastructures used in a thread given its number."""
return
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
"""Initialize datastructures used in a thread given its number.
In this method, EuclideanDistance specialisations of subclasses of
BaseDistancesReduction _must_ call:
self.middle_term_computer._parallel_on_X_init_chunk(
thread_num, X_start, X_end,
)
to ensure the proper upcast of X[X_start:X_end] to float64 prior
to the reduction with float64 accumulator buffers when X.dtype is
float32.
"""
return
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
"""Initialize datastructures just before the _compute_and_reduce_distances_on_chunks.
In this method, EuclideanDistance specialisations of subclass of
BaseDistancesReduction _must_ call:
self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
X_start, X_end, Y_start, Y_end, thread_num,
)
to ensure the proper upcast of Y[Y_start:Y_end] to float64 prior
to the reduction with float64 accumulator buffers when Y.dtype is
float32.
"""
return
cdef void _parallel_on_X_prange_iter_finalize(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
"""Interact with datastructures after a reduction on chunks."""
return
cdef void _parallel_on_X_parallel_finalize(
self,
intp_t thread_num
) noexcept nogil:
"""Interact with datastructures after executing all the reductions."""
return
cdef void _parallel_on_Y_init(
self,
) noexcept nogil:
"""Allocate datastructures used in all threads."""
return
cdef void _parallel_on_Y_parallel_init(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
"""Initialize datastructures used in a thread given its number.
In this method, EuclideanDistance specialisations of subclasses of
BaseDistancesReduction _must_ call:
self.middle_term_computer._parallel_on_Y_parallel_init(
thread_num, X_start, X_end,
)
to ensure the proper upcast of X[X_start:X_end] to float64 prior
to the reduction with float64 accumulator buffers when X.dtype is
float32.
"""
return
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
"""Initialize datastructures just before the _compute_and_reduce_distances_on_chunks.
In this method, EuclideanDistance specialisations of subclass of
BaseDistancesReduction _must_ call:
self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
X_start, X_end, Y_start, Y_end, thread_num,
)
to ensure the proper upcast of Y[Y_start:Y_end] to float64 prior
to the reduction with float64 accumulator buffers when Y.dtype is
float32.
"""
return
cdef void _parallel_on_Y_synchronize(
self,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
"""Update thread datastructures before leaving a parallel region."""
return
cdef void _parallel_on_Y_finalize(
self,
) noexcept nogil:
"""Update datastructures after executing all the reductions."""
return
{{endfor}}

View File

@@ -0,0 +1,5 @@
cpdef enum WeightingStrategy:
uniform = 0
# TODO: Implement the following options in weighted_histogram_mode
distance = 1
callable = 2

View File

@@ -0,0 +1,67 @@
{{py:
implementation_specific_values = [
# Values are the following ones:
#
# name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
('64', 'DistanceMetric64', 'float64_t'),
('32', 'DistanceMetric32', 'float32_t')
]
}}
from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t
from sklearn.metrics._dist_metrics cimport DistanceMetric64, DistanceMetric32, DistanceMetric
{{for name_suffix, DistanceMetric, INPUT_DTYPE_t in implementation_specific_values}}
cdef class DatasetsPair{{name_suffix}}:
cdef:
{{DistanceMetric}} distance_metric
intp_t n_features
cdef intp_t n_samples_X(self) noexcept nogil
cdef intp_t n_samples_Y(self) noexcept nogil
cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil
cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil
cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
cdef:
const {{INPUT_DTYPE_t}}[:, ::1] X
const {{INPUT_DTYPE_t}}[:, ::1] Y
cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
cdef:
const {{INPUT_DTYPE_t}}[:] X_data
const int32_t[::1] X_indices
const int32_t[::1] X_indptr
const {{INPUT_DTYPE_t}}[:] Y_data
const int32_t[::1] Y_indices
const int32_t[::1] Y_indptr
cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
cdef:
const {{INPUT_DTYPE_t}}[:] X_data
const int32_t[::1] X_indices
const int32_t[::1] X_indptr
const {{INPUT_DTYPE_t}}[:] Y_data
const int32_t[::1] Y_indices
intp_t n_Y
cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
cdef:
# As distance metrics are commutative, we can simply rely
# on the implementation of SparseDenseDatasetsPair and
# swap arguments.
DatasetsPair{{name_suffix}} datasets_pair
{{endfor}}

View File

@@ -0,0 +1,406 @@
import copy
{{py:
implementation_specific_values = [
# Values are the following ones:
#
# name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
('64', 'DistanceMetric64', 'float64_t', 'np.float64'),
('32', 'DistanceMetric32', 'float32_t', 'np.float32')
]
}}
import numpy as np
from cython cimport final
from sklearn.utils._typedefs cimport float64_t, float32_t, intp_t
from scipy.sparse import issparse, csr_matrix
{{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
cdef class DatasetsPair{{name_suffix}}:
"""Abstract class which wraps a pair of datasets (X, Y).
This class allows computing distances between a single pair of rows
of X and Y at a time given the pair of their indices (i, j). This class is
specialized for each metric thanks to the :func:`get_for` factory classmethod.
The handling of parallelization over chunks to compute the distances
and aggregation for several rows at a time is done in dedicated
subclasses of :class:`BaseDistancesReductionDispatcher` that in turn rely on
subclasses of :class:`DatasetsPair` for each pair of rows in the data. The
goal is to make it possible to decouple the generic parallelization and
aggregation logic from metric-specific computation as much as possible.
X and Y can be stored as C-contiguous np.ndarrays or CSR matrices
in subclasses.
This class avoids the overhead of dispatching distance computations
to :class:`sklearn.metrics.DistanceMetric` based on the physical
representation of the vectors (sparse vs. dense). It makes use of
cython.final to remove the overhead of dispatching method calls.
Parameters
----------
distance_metric: {{DistanceMetric}}
The distance metric responsible for computing distances
between two vectors of (X, Y).
"""
@classmethod
def get_for(
cls,
X,
Y,
metric="euclidean",
dict metric_kwargs=None,
) -> DatasetsPair{{name_suffix}}:
"""Return the DatasetsPair implementation for the given arguments.
Parameters
----------
X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
Input data.
If provided as an ndarray, it must be C-contiguous.
If provided as a sparse matrix, it must be in CSR format.
Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
Input data.
If provided as an ndarray, it must be C-contiguous.
If provided as a sparse matrix, it must be in CSR format.
metric : str or DistanceMetric object, default='euclidean'
The distance metric to compute between rows of X and Y.
The default metric is a fast implementation of the Euclidean
metric. For a list of available metrics, see the documentation
of :class:`~sklearn.metrics.DistanceMetric`.
metric_kwargs : dict, default=None
Keyword arguments to pass to specified metric function.
Returns
-------
datasets_pair: DatasetsPair{{name_suffix}}
The suitable DatasetsPair{{name_suffix}} implementation.
"""
# X_norm_squared and Y_norm_squared might be propagated
# down to DatasetsPairs via metric_kwargs when the Euclidean
# specialisations can't be used.
# To prevent X_norm_squared and Y_norm_squared from being passed
# down to DistanceMetrics (whose constructors would raise
# a RuntimeError), we pop them here.
if metric_kwargs is not None:
# Copy metric_kwargs so as not to pop "X_norm_squared"
# and "Y_norm_squared" from the dict where they are still used.
metric_kwargs = copy.copy(metric_kwargs)
metric_kwargs.pop("X_norm_squared", None)
metric_kwargs.pop("Y_norm_squared", None)
cdef:
{{DistanceMetric}} distance_metric = DistanceMetric.get_metric(
metric,
{{INPUT_DTYPE}},
**(metric_kwargs or {})
)
# Metric-specific checks that do not replace nor duplicate `check_array`.
distance_metric._validate_data(X)
distance_metric._validate_data(Y)
X_is_sparse = issparse(X)
Y_is_sparse = issparse(Y)
if not X_is_sparse and not Y_is_sparse:
return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
if X_is_sparse and Y_is_sparse:
return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
if X_is_sparse and not Y_is_sparse:
return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
@classmethod
def unpack_csr_matrix(cls, X: csr_matrix):
"""Ensure that the CSR matrix is indexed with np.int32."""
X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}})
X_indices = np.asarray(X.indices, dtype=np.int32)
X_indptr = np.asarray(X.indptr, dtype=np.int32)
return X_data, X_indices, X_indptr
def __init__(self, {{DistanceMetric}} distance_metric, intp_t n_features):
self.distance_metric = distance_metric
self.n_features = n_features
cdef intp_t n_samples_X(self) noexcept nogil:
"""Number of samples in X."""
# This is an abstract method.
# This _must_ always be overwritten in subclasses.
# TODO: add "with gil: raise" here when supporting Cython 3.0
return -999
cdef intp_t n_samples_Y(self) noexcept nogil:
"""Number of samples in Y."""
# This is an abstract method.
# This _must_ always be overwritten in subclasses.
# TODO: add "with gil: raise" here when supporting Cython 3.0
return -999
cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
return self.dist(i, j)
cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:
# This is an abstract method.
# This _must_ always be overwritten in subclasses.
# TODO: add "with gil: raise" here when supporting Cython 3.0
return -1
@final
cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
"""Compute distances between row vectors of two arrays.
Parameters
----------
X: ndarray of shape (n_samples_X, n_features)
Rows represent vectors. Must be C-contiguous.
Y: ndarray of shape (n_samples_Y, n_features)
Rows represent vectors. Must be C-contiguous.
distance_metric: DistanceMetric
The distance metric responsible for computing distances
between two row vectors of (X, Y).
"""
def __init__(
self,
const {{INPUT_DTYPE_t}}[:, ::1] X,
const {{INPUT_DTYPE_t}}[:, ::1] Y,
{{DistanceMetric}} distance_metric,
):
super().__init__(distance_metric, n_features=X.shape[1])
# Arrays have already been checked
self.X = X
self.Y = Y
@final
cdef intp_t n_samples_X(self) noexcept nogil:
return self.X.shape[0]
@final
cdef intp_t n_samples_Y(self) noexcept nogil:
return self.Y.shape[0]
@final
cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.n_features)
@final
cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:
return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.n_features)
@final
cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
"""Compute distances between vectors of two CSR matrices.
Parameters
----------
X: sparse matrix of shape (n_samples_X, n_features)
Rows represent vectors. Must be in CSR format.
Y: sparse matrix of shape (n_samples_Y, n_features)
Rows represent vectors. Must be in CSR format.
distance_metric: DistanceMetric
The distance metric responsible for computing distances
between two vectors of (X, Y).
"""
def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
super().__init__(distance_metric, n_features=X.shape[1])
self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
@final
cdef intp_t n_samples_X(self) noexcept nogil:
return self.X_indptr.shape[0] - 1
@final
cdef intp_t n_samples_Y(self) noexcept nogil:
return self.Y_indptr.shape[0] - 1
@final
cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
return self.distance_metric.rdist_csr(
x1_data=&self.X_data[0],
x1_indices=&self.X_indices[0],
x2_data=&self.Y_data[0],
x2_indices=&self.Y_indices[0],
x1_start=self.X_indptr[i],
x1_end=self.X_indptr[i + 1],
x2_start=self.Y_indptr[j],
x2_end=self.Y_indptr[j + 1],
size=self.n_features,
)
@final
cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:
return self.distance_metric.dist_csr(
x1_data=&self.X_data[0],
x1_indices=&self.X_indices[0],
x2_data=&self.Y_data[0],
x2_indices=&self.Y_indices[0],
x1_start=self.X_indptr[i],
x1_end=self.X_indptr[i + 1],
x2_start=self.Y_indptr[j],
x2_end=self.Y_indptr[j + 1],
size=self.n_features,
)
@final
cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
"""Compute distances between vectors of a CSR matrix and a dense array.
Parameters
----------
X: sparse matrix of shape (n_samples_X, n_features)
Rows represent vectors. Must be in CSR format.
Y: ndarray of shape (n_samples_Y, n_features)
Rows represent vectors. Must be C-contiguous.
distance_metric: DistanceMetric
The distance metric responsible for computing distances
between two vectors of (X, Y).
"""
def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
super().__init__(distance_metric, n_features=X.shape[1])
self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
# We support the sparse-dense case by using the sparse-sparse interfaces
# of `DistanceMetric` (namely `DistanceMetric.{dist_csr,rdist_csr}`) to
# avoid introducing a new complex set of interfaces. In this case, we
# need to convert `Y` (the dense array) into a CSR matrix.
#
# Here we motivate the use of a simpler alternative CSR representation for `Y`.
#
# If we were to use the usual CSR representation for `Y`, storing all
# the column indices in `indices` would have required allocating an
# array of n_samples × n_features elements with repeated contiguous
# integers from 0 to n_features - 1. This would have been very wasteful
# from a memory point of view. This alternative representation uses
# only the information actually needed and merely requires shifting
# the address of `data` before calling the CSR × CSR routines.
#
# In this representation:
#
# - the `data` array is the original dense array, `Y`, whose first
# element's address is shifted before calling the CSR × CSR routine
#
# - the `indices` array is a single row of `n_features` elements:
#
# [0, 1, ..., n_features-1]
#
# - the `indptr` array is not materialised since the offset between
# consecutive index pointers is constant (equal to `n_features`).
# Moreover, as `data` is shifted, constant `start` and `end` index
# pointers equal to 0 and n_features respectively are used.
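#
# Worked example (a sketch): for a dense Y with n_features=3, e.g.
#
#   Y = [[7, 0, 5],
#        [0, 2, 0]]
#
# `data` is Y.ravel() == [7, 0, 5, 0, 2, 0] and `indices` is
# [0, 1, 2]; row j is addressed by shifting `data` by j * 3 while
# using the constant pointers x2_start=0 and x2_end=3.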
# The Y array has already been checked here.
self.n_Y = Y.shape[0]
self.Y_data = np.ravel(Y)
self.Y_indices = np.arange(self.n_features, dtype=np.int32)
@final
cdef intp_t n_samples_X(self) noexcept nogil:
return self.X_indptr.shape[0] - 1
@final
cdef intp_t n_samples_Y(self) noexcept nogil:
return self.n_Y
@final
cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
return self.distance_metric.rdist_csr(
x1_data=&self.X_data[0],
x1_indices=&self.X_indices[0],
# Increment the data pointer such that x2_start=0 is aligned with the
# j-th row
x2_data=&self.Y_data[0] + j * self.n_features,
x2_indices=&self.Y_indices[0],
x1_start=self.X_indptr[i],
x1_end=self.X_indptr[i + 1],
x2_start=0,
x2_end=self.n_features,
size=self.n_features,
)
@final
cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:
return self.distance_metric.dist_csr(
x1_data=&self.X_data[0],
x1_indices=&self.X_indices[0],
# Increment the data pointer such that x2_start=0 is aligned with the
# j-th row
x2_data=&self.Y_data[0] + j * self.n_features,
x2_indices=&self.Y_indices[0],
x1_start=self.X_indptr[i],
x1_end=self.X_indptr[i + 1],
x2_start=0,
x2_end=self.n_features,
size=self.n_features,
)
@final
cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
"""Compute distances between vectors of a dense array and a CSR matrix.
Parameters
----------
X: ndarray of shape (n_samples_X, n_features)
Rows represent vectors. Must be C-contiguous.
Y: sparse matrix of shape (n_samples_Y, n_features)
Rows represent vectors. Must be in CSR format.
distance_metric: DistanceMetric
The distance metric responsible for computing distances
between two vectors of (X, Y).
"""
def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
super().__init__(distance_metric, n_features=X.shape[1])
# Swapping arguments on the constructor
self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric)
@final
cdef intp_t n_samples_X(self) noexcept nogil:
# Swapping interface
return self.datasets_pair.n_samples_Y()
@final
cdef intp_t n_samples_Y(self) noexcept nogil:
# Swapping interface
return self.datasets_pair.n_samples_X()
@final
cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
# Swapping arguments on the same interface
return self.datasets_pair.surrogate_dist(j, i)
@final
cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:
# Swapping arguments on the same interface
return self.datasets_pair.dist(j, i)
{{endfor}}

View File

@@ -0,0 +1,763 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from abc import abstractmethod
from typing import List
import numpy as np
from scipy.sparse import issparse
from sklearn import get_config
from sklearn.metrics._dist_metrics import BOOL_METRICS, METRIC_MAPPING64, DistanceMetric
from sklearn.metrics._pairwise_distances_reduction._argkmin import ArgKmin32, ArgKmin64
from sklearn.metrics._pairwise_distances_reduction._argkmin_classmode import (
ArgKminClassMode32,
ArgKminClassMode64,
)
from sklearn.metrics._pairwise_distances_reduction._base import (
_sqeuclidean_row_norms32,
_sqeuclidean_row_norms64,
)
from sklearn.metrics._pairwise_distances_reduction._radius_neighbors import (
RadiusNeighbors32,
RadiusNeighbors64,
)
from sklearn.metrics._pairwise_distances_reduction._radius_neighbors_classmode import (
RadiusNeighborsClassMode32,
RadiusNeighborsClassMode64,
)
def sqeuclidean_row_norms(X, num_threads):
"""Compute the squared euclidean norm of the rows of X in parallel.
Parameters
----------
X : ndarray or CSR matrix of shape (n_samples, n_features)
Input data. Must be C-contiguous.
num_threads : int
The number of OpenMP threads to use.
Returns
-------
sqeuclidean_row_norms : ndarray of shape (n_samples,)
Array containing the squared Euclidean norm of each row of X.
"""
if X.dtype == np.float64:
return np.asarray(_sqeuclidean_row_norms64(X, num_threads))
if X.dtype == np.float32:
return np.asarray(_sqeuclidean_row_norms32(X, num_threads))
raise ValueError(
"Only float64 or float32 datasets are supported at this time, "
f"got: X.dtype={X.dtype}."
)
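# A minimal usage sketch of the helper above (assuming a C-contiguous
# float64 array):
#
#   import numpy as np
#   X = np.array([[3.0, 4.0], [1.0, 2.0]])
#   sqeuclidean_row_norms(X, num_threads=1)  # -> array([25., 5.])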
class BaseDistancesReductionDispatcher:
"""Abstract base dispatcher for pairwise distance computation & reduction.
Each dispatcher extending the base :class:`BaseDistancesReductionDispatcher`
dispatcher must implement the :meth:`compute` classmethod.
"""
@classmethod
def valid_metrics(cls) -> List[str]:
excluded = {
# PyFunc cannot be supported because it necessitates interacting with
# the CPython interpreter to call user defined functions.
"pyfunc",
"mahalanobis", # is numerically unstable
# In order to support discrete distance metrics, we need a stable
# simultaneous sort which preserves the order of the indices, because
# ties on a given distance value are generally frequent in this case.
# TODO: implement a stable simultaneous_sort.
"hamming",
*BOOL_METRICS,
}
return sorted(({"sqeuclidean"} | set(METRIC_MAPPING64.keys())) - excluded)
@classmethod
def is_usable_for(cls, X, Y, metric) -> bool:
"""Return True if the dispatcher can be used for the
given parameters.
Parameters
----------
X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
Input data.
Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
Input data.
metric : str, default='euclidean'
The distance metric to use.
For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric`.
Returns
-------
True if the dispatcher can be used, else False.
"""
# FIXME: the current Cython implementation is too slow for a large number of
# features. We temporarily disable it to fallback on SciPy's implementation.
# See: https://github.com/scikit-learn/scikit-learn/issues/28191
if (
issparse(X)
and issparse(Y)
and isinstance(metric, str)
and "euclidean" in metric
):
return False
def is_numpy_c_ordered(X):
return hasattr(X, "flags") and getattr(X.flags, "c_contiguous", False)
def is_valid_sparse_matrix(X):
return (
issparse(X)
and X.format == "csr"
and
# TODO: support CSR matrices without non-zero elements
X.nnz > 0
and
# TODO: support CSR matrices with int64 indices and indptr
# See: https://github.com/scikit-learn/scikit-learn/issues/23653
X.indices.dtype == X.indptr.dtype == np.int32
)
is_usable = (
get_config().get("enable_cython_pairwise_dist", True)
and (is_numpy_c_ordered(X) or is_valid_sparse_matrix(X))
and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y))
and X.dtype == Y.dtype
and X.dtype in (np.float32, np.float64)
and (metric in cls.valid_metrics() or isinstance(metric, DistanceMetric))
)
return is_usable
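# For instance, the Cython implementations can be disabled globally,
# making this method return False (a sketch using the scikit-learn
# configuration API):
#
#   import sklearn
#   with sklearn.config_context(enable_cython_pairwise_dist=False):
#       ...  # dispatchers fall back to the non-Cython implementations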
@classmethod
@abstractmethod
def compute(
cls,
X,
Y,
**kwargs,
):
"""Compute the reduction.
Parameters
----------
X : ndarray or CSR matrix of shape (n_samples_X, n_features)
Input data.
Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
Input data.
**kwargs : additional parameters for the reduction
Notes
-----
This method is an abstract class method: it has to be implemented
for all subclasses.
"""
class ArgKmin(BaseDistancesReductionDispatcher):
"""Compute the argkmin of row vectors of X on the ones of Y.
For each row vector of X, computes the indices of the k row
vectors of Y with the smallest distances.
ArgKmin is typically used to perform
bruteforce k-nearest neighbors queries.
This class is not meant to be instantiated; one should only use
its :meth:`compute` classmethod which handles allocation and
deallocation consistently.
"""
@classmethod
def compute(
cls,
X,
Y,
k,
metric="euclidean",
chunk_size=None,
metric_kwargs=None,
strategy=None,
return_distance=False,
):
"""Compute the argkmin reduction.
Parameters
----------
X : ndarray or CSR matrix of shape (n_samples_X, n_features)
Input data.
Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
Input data.
k : int
The k for the argkmin reduction.
metric : str, default='euclidean'
The distance metric to use for argkmin.
For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric`.
chunk_size : int, default=None,
The number of vectors per chunk. If None (default), looks up the
scikit-learn configuration for `pairwise_dist_chunk_size`,
and uses 256 if it is not set.
metric_kwargs : dict, default=None
Keyword arguments to pass to specified metric function.
strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
The chunking strategy defining which dataset parallelization is made on.
For both strategies the computation happens with two nested loops,
on chunks of X and chunks of Y respectively.
Strategies differ on which loop (outer or inner) is made to run
in parallel with the Cython `prange` construct:
- 'parallel_on_X' dispatches chunks of X uniformly on threads.
Each thread then iterates on all the chunks of Y. This strategy is
embarrassingly parallel and comes with no datastructures
synchronisation.
- 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
Each thread processes all the chunks of X in turn. This strategy is
a sequence of embarrassingly parallel subtasks (the inner loop on Y
chunks) with intermediate datastructures synchronisation at each
iteration of the sequential outer loop on X chunks.
- 'auto' relies on a simple heuristic to choose between
'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
'parallel_on_X' is usually the most efficient strategy.
When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
brings more opportunity for parallelism and is therefore more efficient
despite the synchronization step at each iteration of the outer loop
on chunks of `X`.
- None (default) looks up the scikit-learn configuration for
`pairwise_dist_parallel_strategy`, and uses 'auto' if it is not set.
return_distance : boolean, default=False
Return distances between each X vector and its
argkmin if set to True.
Returns
-------
If return_distance=False:
- argkmin_indices : ndarray of shape (n_samples_X, k)
Indices of the argkmin for each vector in X.
If return_distance=True:
- argkmin_distances : ndarray of shape (n_samples_X, k)
Distances to the argkmin for each vector in X.
- argkmin_indices : ndarray of shape (n_samples_X, k)
Indices of the argkmin for each vector in X.
Notes
-----
This classmethod inspects the argument values to dispatch to the
dtype-specialized implementation of :class:`ArgKmin`.
This allows decoupling the API entirely from the implementation details
whilst maintaining RAII: all temporarily allocated datastructures necessary
for the concrete implementation are therefore freed when this classmethod
returns.
"""
if X.dtype == Y.dtype == np.float64:
return ArgKmin64.compute(
X=X,
Y=Y,
k=k,
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
return_distance=return_distance,
)
if X.dtype == Y.dtype == np.float32:
return ArgKmin32.compute(
X=X,
Y=Y,
k=k,
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
return_distance=return_distance,
)
raise ValueError(
"Only float64 or float32 datasets pairs are supported at this time, "
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
)
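# A minimal usage sketch of the classmethod above (assuming small
# C-contiguous float64 arrays):
#
#   import numpy as np
#   X = np.array([[0.0, 0.0], [10.0, 10.0]])
#   Y = np.array([[1.0, 1.0], [9.0, 9.0], [5.0, 5.0]])
#   ArgKmin.compute(X, Y, k=1)  # -> array([[0], [1]])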
class RadiusNeighbors(BaseDistancesReductionDispatcher):
"""Compute radius-based neighbors for two sets of vectors.
For each row-vector X[i] of the queries X, find all the indices j of
row-vectors in Y such that:
dist(X[i], Y[j]) <= radius
The distance function `dist` depends on the values of the `metric`
and `metric_kwargs` parameters.
This class is not meant to be instantiated; one should only use
its :meth:`compute` classmethod which handles allocation and
deallocation consistently.
"""
@classmethod
def compute(
cls,
X,
Y,
radius,
metric="euclidean",
chunk_size=None,
metric_kwargs=None,
strategy=None,
return_distance=False,
sort_results=False,
):
"""Return the results of the reduction for the given arguments.
Parameters
----------
X : ndarray or CSR matrix of shape (n_samples_X, n_features)
Input data.
Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
Input data.
radius : float
The radius defining the neighborhood.
metric : str, default='euclidean'
The distance metric to use.
For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric`.
chunk_size : int, default=None,
The number of vectors per chunk. If None (default), looks up the
scikit-learn configuration for `pairwise_dist_chunk_size`,
and uses 256 if it is not set.
metric_kwargs : dict, default=None
Keyword arguments to pass to specified metric function.
strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
The chunking strategy defining which dataset parallelization is made on.
For both strategies the computation happens with two nested loops,
on chunks of X and chunks of Y respectively.
Strategies differ on which loop (outer or inner) is made to run
in parallel with the Cython `prange` construct:
- 'parallel_on_X' dispatches chunks of X uniformly on threads.
Each thread then iterates on all the chunks of Y. This strategy is
embarrassingly parallel and comes with no datastructures
synchronisation.
- 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
Each thread processes all the chunks of X in turn. This strategy is
a sequence of embarrassingly parallel subtasks (the inner loop on Y
chunks) with intermediate datastructures synchronisation at each
iteration of the sequential outer loop on X chunks.
- 'auto' relies on a simple heuristic to choose between
'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
'parallel_on_X' is usually the most efficient strategy.
When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
brings more opportunity for parallelism and is therefore more efficient
despite the synchronization step at each iteration of the outer loop
on chunks of `X`.
- None (default) looks up the scikit-learn configuration for
`pairwise_dist_parallel_strategy`, and uses 'auto' if it is not set.
return_distance : boolean, default=False
Return distances between each X vector and its neighbors if set to True.
sort_results : boolean, default=False
Sort results with respect to distances between each X vector and its
neighbors if set to True.
Returns
-------
If return_distance=False:
- neighbors_indices : ndarray of n_samples_X ndarray
Indices of the neighbors for each vector in X.
If return_distance=True:
- neighbors_indices : ndarray of n_samples_X ndarray
Indices of the neighbors for each vector in X.
- neighbors_distances : ndarray of n_samples_X ndarray
Distances to the neighbors for each vector in X.
Notes
-----
This classmethod inspects the argument values to dispatch to the
dtype-specialized implementation of :class:`RadiusNeighbors`.
This allows decoupling the API entirely from the implementation details
whilst maintaining RAII: all temporarily allocated datastructures necessary
for the concrete implementation are therefore freed when this classmethod
returns.
"""
if X.dtype == Y.dtype == np.float64:
return RadiusNeighbors64.compute(
X=X,
Y=Y,
radius=radius,
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
sort_results=sort_results,
return_distance=return_distance,
)
if X.dtype == Y.dtype == np.float32:
return RadiusNeighbors32.compute(
X=X,
Y=Y,
radius=radius,
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
sort_results=sort_results,
return_distance=return_distance,
)
raise ValueError(
"Only float64 or float32 datasets pairs are supported at this time, "
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
)
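# A minimal usage sketch of the classmethod above (assuming small
# C-contiguous float64 arrays):
#
#   import numpy as np
#   X = np.array([[0.0, 0.0]])
#   Y = np.array([[1.0, 0.0], [3.0, 0.0]])
#   RadiusNeighbors.compute(X, Y, radius=2.0)
#   # -> one index ndarray per query vector: [array([0])]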
class ArgKminClassMode(BaseDistancesReductionDispatcher):
"""Compute the argkmin of row vectors of X on the ones of Y with labels.
For each row vector of X, computes the indices of the k row vectors of
Y with the smallest distances, then computes the weighted mode of their labels.
ArgKminClassMode is typically used to perform bruteforce k-nearest neighbors
queries when the weighted mode of the labels for the k-nearest
neighbors is required, such as in `predict` methods.
This class is not meant to be instantiated; one should only use
its :meth:`compute` classmethod which handles allocation and
deallocation consistently.
"""
@classmethod
def valid_metrics(cls) -> List[str]:
excluded = {
# Euclidean is technically usable for ArgKminClassMode
# but its current implementation would not be competitive.
# TODO: implement Euclidean specialization using GEMM.
"euclidean",
"sqeuclidean",
}
return list(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded)
@classmethod
def compute(
cls,
X,
Y,
k,
weights,
Y_labels,
unique_Y_labels,
metric="euclidean",
chunk_size=None,
metric_kwargs=None,
strategy=None,
):
"""Compute the argkmin reduction.
Parameters
----------
X : ndarray of shape (n_samples_X, n_features)
The input array to be labelled.
Y : ndarray of shape (n_samples_Y, n_features)
The input array whose class membership is provided through the
`Y_labels` parameter.
k : int
The number of nearest neighbors to consider.
weights : ndarray
The weights applied over the `Y_labels` of `Y` when computing the
weighted mode of the labels.
Y_labels : ndarray
An array containing the index of the class membership of the
associated samples in `Y`. This is used in labeling `X`.
unique_Y_labels : ndarray
An array containing all unique indices contained in the
corresponding `Y_labels` array.
metric : str, default='euclidean'
The distance metric to use. For a list of available metrics, see
the documentation of :class:`~sklearn.metrics.DistanceMetric`.
Currently does not support `'precomputed'`.
chunk_size : int, default=None,
The number of vectors per chunk. If None (default), looks up the
scikit-learn configuration for `pairwise_dist_chunk_size`,
and uses 256 if it is not set.
metric_kwargs : dict, default=None
Keyword arguments to pass to specified metric function.
strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
The chunking strategy defining which dataset parallelization is made on.
For both strategies the computation happens with two nested loops,
on chunks of X and chunks of Y respectively.
Strategies differ on which loop (outer or inner) is made to run
in parallel with the Cython `prange` construct:
- 'parallel_on_X' dispatches chunks of X uniformly on threads.
Each thread then iterates on all the chunks of Y. This strategy is
embarrassingly parallel and comes with no datastructures
synchronisation.
- 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
Each thread processes all the chunks of X in turn. This strategy is
a sequence of embarrassingly parallel subtasks (the inner loop on Y
chunks) with intermediate datastructures synchronisation at each
iteration of the sequential outer loop on X chunks.
- 'auto' relies on a simple heuristic to choose between
'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
'parallel_on_X' is usually the most efficient strategy.
When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
brings more opportunity for parallelism and is therefore more efficient
despite the synchronization step at each iteration of the outer loop
on chunks of `X`.
- None (default) looks up the scikit-learn configuration for
`pairwise_dist_parallel_strategy`, and uses 'auto' if it is not set.
Returns
-------
probabilities : ndarray of shape (n_samples_X, n_classes)
An array containing the class probabilities for each sample.
Notes
-----
This classmethod inspects the argument values to dispatch to the
dtype-specialized implementation of :class:`ArgKminClassMode`.
This allows decoupling the API entirely from the implementation details
whilst maintaining RAII: all temporarily allocated datastructures necessary
for the concrete implementation are therefore freed when this classmethod
returns.
"""
if weights not in {"uniform", "distance"}:
raise ValueError(
"Only the 'uniform' or 'distance' weights options are supported"
f" at this time. Got: {weights=}."
)
if X.dtype == Y.dtype == np.float64:
return ArgKminClassMode64.compute(
X=X,
Y=Y,
k=k,
weights=weights,
Y_labels=np.array(Y_labels, dtype=np.intp),
unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
)
if X.dtype == Y.dtype == np.float32:
return ArgKminClassMode32.compute(
X=X,
Y=Y,
k=k,
weights=weights,
Y_labels=np.array(Y_labels, dtype=np.intp),
unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
)
raise ValueError(
"Only float64 or float32 datasets pairs are supported at this time, "
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
)
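# A minimal usage sketch of the classmethod above (assuming float64
# arrays and a metric supported by this dispatcher, e.g. 'manhattan'):
#
#   import numpy as np
#   X = np.array([[0.0], [10.0]])
#   Y = np.array([[1.0], [9.0], [11.0]])
#   ArgKminClassMode.compute(
#       X, Y, k=1, weights="uniform",
#       Y_labels=np.array([0, 1, 1]), unique_Y_labels=np.array([0, 1]),
#       metric="manhattan",
#   )  # -> class probabilities of shape (2, 2): [[1., 0.], [0., 1.]]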
class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher):
"""Compute radius-based class modes of row vectors of X using the
those of Y.
For each row-vector X[i] of the queries X, find all the indices j of
row-vectors in Y such that:
dist(X[i], Y[j]) <= radius
RadiusNeighborsClassMode is typically used to perform bruteforce
radius neighbors queries when the weighted mode of the labels for
the nearest neighbors within the specified radius is required,
such as in `predict` methods.
This class is not meant to be instantiated; one should only use
its :meth:`compute` classmethod which handles allocation and
deallocation consistently.
"""
@classmethod
def valid_metrics(cls) -> List[str]:
excluded = {
# Euclidean is technically usable for RadiusNeighborsClassMode
# but it would not be competitive.
# TODO: implement Euclidean specialization using GEMM.
"euclidean",
"sqeuclidean",
}
return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded)
@classmethod
def compute(
cls,
X,
Y,
radius,
weights,
Y_labels,
unique_Y_labels,
outlier_label,
metric="euclidean",
chunk_size=None,
metric_kwargs=None,
strategy=None,
):
"""Return the results of the reduction for the given arguments.
Parameters
----------
X : ndarray of shape (n_samples_X, n_features)
The input array to be labelled.
Y : ndarray of shape (n_samples_Y, n_features)
The input array whose class membership is provided through
the `Y_labels` parameter.
radius : float
The radius defining the neighborhood.
weights : ndarray
The weights applied to the `Y_labels` when computing the
weighted mode of the labels.
Y_labels : ndarray
An array containing the index of the class membership of the
associated samples in `Y`. This is used in labeling `X`.
unique_Y_labels : ndarray
An array containing all unique class labels.
outlier_label : int, default=None
Label for outlier samples (samples with no neighbors within the given
radius). In the default case, when the value is None, a ValueError is
raised if any outlier is detected. The outlier label should be selected
from among the unique 'Y' labels. If it is specified with a different
value, a warning is raised and all class probabilities of outliers are
set to 0.
metric : str, default='euclidean'
The distance metric to use. For a list of available metrics, see
the documentation of :class:`~sklearn.metrics.DistanceMetric`.
Currently does not support `'precomputed'`.
chunk_size : int, default=None,
The number of vectors per chunk. If None (default), looks up the
scikit-learn configuration for `pairwise_dist_chunk_size`,
and uses 256 if it is not set.
metric_kwargs : dict, default=None
Keyword arguments to pass to specified metric function.
strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
The chunking strategy defining which dataset parallelization is made on.
For both strategies the computation happens with two nested loops,
on chunks of X and chunks of Y respectively.
Strategies differ on which loop (outer or inner) is made to run
in parallel with the Cython `prange` construct:
- 'parallel_on_X' dispatches chunks of X uniformly on threads.
Each thread then iterates on all the chunks of Y. This strategy is
embarrassingly parallel and comes with no datastructures
synchronisation.
- 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
Each thread processes all the chunks of X in turn. This strategy is
a sequence of embarrassingly parallel subtasks (the inner loop on Y
chunks) with intermediate datastructures synchronisation at each
iteration of the sequential outer loop on X chunks.
- 'auto' relies on a simple heuristic to choose between
'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
'parallel_on_X' is usually the most efficient strategy.
When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
brings more opportunity for parallelism and is therefore more efficient
despite the synchronization step at each iteration of the outer loop
on chunks of `X`.
- None (default) looks up the scikit-learn configuration for
`pairwise_dist_parallel_strategy`, and uses 'auto' if it is not set.
Returns
-------
probabilities : ndarray of shape (n_samples_X, n_classes)
An array containing the class probabilities for each sample.
"""
if weights not in {"uniform", "distance"}:
raise ValueError(
"Only the 'uniform' or 'distance' weights options are supported"
f" at this time. Got: {weights=}."
)
if X.dtype == Y.dtype == np.float64:
return RadiusNeighborsClassMode64.compute(
X=X,
Y=Y,
radius=radius,
weights=weights,
Y_labels=np.array(Y_labels, dtype=np.intp),
unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
outlier_label=outlier_label,
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
)
if X.dtype == Y.dtype == np.float32:
return RadiusNeighborsClassMode32.compute(
X=X,
Y=Y,
radius=radius,
weights=weights,
Y_labels=np.array(Y_labels, dtype=np.intp),
unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
outlier_label=outlier_label,
metric=metric,
chunk_size=chunk_size,
metric_kwargs=metric_kwargs,
strategy=strategy,
)
raise ValueError(
"Only float64 or float32 datasets pairs are supported at this time, "
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
)

View File

@@ -0,0 +1,228 @@
{{py:
implementation_specific_values = [
# Values are the following ones:
#
# name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE
#
# We also use the float64 dtype and C-type names as defined in
# `sklearn.utils._typedefs` to maintain consistency.
#
('64', False, 'float64_t', 'np.float64'),
('32', True, 'float32_t', 'np.float32')
]
}}
from libcpp.vector cimport vector
from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t
cdef void _middle_term_sparse_sparse_64(
const float64_t[:] X_data,
const int32_t[:] X_indices,
const int32_t[:] X_indptr,
intp_t X_start,
intp_t X_end,
const float64_t[:] Y_data,
const int32_t[:] Y_indices,
const int32_t[:] Y_indptr,
intp_t Y_start,
intp_t Y_end,
float64_t * D,
) noexcept nogil
{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
cdef class MiddleTermComputer{{name_suffix}}:
cdef:
intp_t effective_n_threads
intp_t chunks_n_threads
intp_t dist_middle_terms_chunks_size
intp_t n_features
intp_t chunk_size
# Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
vector[vector[float64_t]] dist_middle_terms_chunks
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil
cdef void _parallel_on_X_parallel_init(self, intp_t thread_num) noexcept nogil
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil
cdef void _parallel_on_Y_init(self) noexcept nogil
cdef void _parallel_on_Y_parallel_init(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num
) noexcept nogil
cdef float64_t * _compute_dist_middle_terms(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil
cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
cdef:
const {{INPUT_DTYPE_t}}[:, ::1] X
const {{INPUT_DTYPE_t}}[:, ::1] Y
{{if upcast_to_float64}}
# Buffers for upcasting chunks of X and Y from 32bit to 64bit
vector[vector[float64_t]] X_c_upcast
vector[vector[float64_t]] Y_c_upcast
{{endif}}
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil
cdef void _parallel_on_Y_parallel_init(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num
) noexcept nogil
cdef float64_t * _compute_dist_middle_terms(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil
cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
cdef:
const float64_t[:] X_data
const int32_t[:] X_indices
const int32_t[:] X_indptr
const float64_t[:] Y_data
const int32_t[:] Y_indices
const int32_t[:] Y_indptr
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num
) noexcept nogil
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num
) noexcept nogil
cdef float64_t * _compute_dist_middle_terms(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil
cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
cdef:
const float64_t[:] X_data
const int32_t[:] X_indices
const int32_t[:] X_indptr
const {{INPUT_DTYPE_t}}[:, ::1] Y
# We handle the dense-sparse case via the sparse-dense case, simply
# treating dist_middle_terms as F-ordered and swapping the arguments.
# This attribute encodes which case we are in so that the logic can
# be adapted accordingly.
bint c_ordered_middle_term
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num
) noexcept nogil
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num
) noexcept nogil
cdef float64_t * _compute_dist_middle_terms(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil
{{endfor}}

View File

@@ -0,0 +1,633 @@
{{py:
implementation_specific_values = [
# Values are the following ones:
#
# name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE
#
# We also use the float64 dtype and C-type names as defined in
# `sklearn.utils._typedefs` to maintain consistency.
#
('64', False, 'float64_t', 'np.float64'),
('32', True, 'float32_t', 'np.float32')
]
}}
from libcpp.vector cimport vector
from libcpp.algorithm cimport fill
from sklearn.utils._cython_blas cimport (
BLAS_Order,
BLAS_Trans,
NoTrans,
RowMajor,
Trans,
_gemm,
)
from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t
import numpy as np
from scipy.sparse import issparse, csr_matrix
cdef void _middle_term_sparse_sparse_64(
const float64_t[:] X_data,
const int32_t[:] X_indices,
const int32_t[:] X_indptr,
intp_t X_start,
intp_t X_end,
const float64_t[:] Y_data,
const int32_t[:] Y_indices,
const int32_t[:] Y_indptr,
intp_t Y_start,
intp_t Y_end,
float64_t * D,
) noexcept nogil:
# This routine assumes that D points to the first element of a
# zeroed buffer of length at least equal to n_X × n_Y, conceptually
# representing a 2-d C-ordered array.
cdef:
intp_t i, j, k
intp_t n_X = X_end - X_start
intp_t n_Y = Y_end - Y_start
intp_t x_col, x_ptr, y_col, y_ptr
for i in range(n_X):
for x_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
x_col = X_indices[x_ptr]
for j in range(n_Y):
k = i * n_Y + j
for y_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
y_col = Y_indices[y_ptr]
if x_col == y_col:
D[k] += -2 * X_data[x_ptr] * Y_data[y_ptr]
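# The routine above is equivalent to the following dense NumPy sketch
# (a reference for readability, not used by the actual code):
#
#   D_chunk = -2.0 * (X[X_start:X_end] @ Y[Y_start:Y_end].T)
#
# where X and Y stand for the dense counterparts of the CSR inputs:
# the CSR loops only accumulate products of nonzero entries sharing
# a column index.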
{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
cdef void _middle_term_sparse_dense_{{name_suffix}}(
const float64_t[:] X_data,
const int32_t[:] X_indices,
const int32_t[:] X_indptr,
intp_t X_start,
intp_t X_end,
const {{INPUT_DTYPE_t}}[:, ::1] Y,
intp_t Y_start,
intp_t Y_end,
bint c_ordered_middle_term,
float64_t * dist_middle_terms,
) noexcept nogil:
# This routine assumes that dist_middle_terms is a pointer to the first element
# of a buffer filled with zeros, of length at least equal to n_X × n_Y,
# conceptually representing a 2-d C-ordered or F-ordered array.
cdef:
intp_t i, j, k
intp_t n_X = X_end - X_start
intp_t n_Y = Y_end - Y_start
intp_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
for i in range(n_X):
for j in range(n_Y):
k = i * n_Y + j if c_ordered_middle_term else j * n_X + i
for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
X_i_col_idx = X_indices[X_i_ptr]
dist_middle_terms[k] += -2 * X_data[X_i_ptr] * Y[Y_start + j, X_i_col_idx]
cdef class MiddleTermComputer{{name_suffix}}:
"""Helper class to compute a Euclidean distance matrix in chunks.
This is an abstract base class that is further specialized depending
on the type of data (dense or sparse).
`EuclideanDistance` subclasses rely on the squared Euclidean
distances between chunks of vectors X_c and Y_c using the
following decomposition for the (i, j) pair:
||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
This helper class is in charge of wrapping the common logic to compute
the middle term, i.e. `- 2 X_c_i.Y_c_j^T`.
"""
@classmethod
def get_for(
cls,
X,
Y,
effective_n_threads,
chunks_n_threads,
dist_middle_terms_chunks_size,
n_features,
chunk_size,
) -> MiddleTermComputer{{name_suffix}}:
"""Return the MiddleTermComputer implementation for the given arguments.
Parameters
----------
X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features)
Input data.
If provided as an ndarray, it must be C-contiguous.
Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features)
Input data.
If provided as an ndarray, it must be C-contiguous.
Returns
-------
middle_term_computer: MiddleTermComputer{{name_suffix}}
The suitable MiddleTermComputer{{name_suffix}} implementation.
"""
X_is_sparse = issparse(X)
Y_is_sparse = issparse(Y)
if not X_is_sparse and not Y_is_sparse:
return DenseDenseMiddleTermComputer{{name_suffix}}(
X,
Y,
effective_n_threads,
chunks_n_threads,
dist_middle_terms_chunks_size,
n_features,
chunk_size,
)
if X_is_sparse and Y_is_sparse:
return SparseSparseMiddleTermComputer{{name_suffix}}(
X,
Y,
effective_n_threads,
chunks_n_threads,
dist_middle_terms_chunks_size,
n_features,
chunk_size,
)
if X_is_sparse and not Y_is_sparse:
return SparseDenseMiddleTermComputer{{name_suffix}}(
X,
Y,
effective_n_threads,
chunks_n_threads,
dist_middle_terms_chunks_size,
n_features,
chunk_size,
c_ordered_middle_term=True
)
if not X_is_sparse and Y_is_sparse:
# NOTE: The Dense-Sparse case is implemented via the Sparse-Dense case.
#
# To do so:
# - X (dense) and Y (sparse) are swapped
# - the distance middle term is seen as F-ordered for consistency
# (c_ordered_middle_term = False)
return SparseDenseMiddleTermComputer{{name_suffix}}(
# Mind that X and Y are swapped here.
Y,
X,
effective_n_threads,
chunks_n_threads,
dist_middle_terms_chunks_size,
n_features,
chunk_size,
c_ordered_middle_term=False,
)
raise NotImplementedError(
"X and Y must be CSR sparse matrices or numpy arrays."
)
@classmethod
def unpack_csr_matrix(cls, X: csr_matrix):
"""Ensure that the CSR matrix is indexed with np.int32."""
X_data = np.asarray(X.data, dtype=np.float64)
X_indices = np.asarray(X.indices, dtype=np.int32)
X_indptr = np.asarray(X.indptr, dtype=np.int32)
return X_data, X_indices, X_indptr
def __init__(
self,
intp_t effective_n_threads,
intp_t chunks_n_threads,
intp_t dist_middle_terms_chunks_size,
intp_t n_features,
intp_t chunk_size,
):
self.effective_n_threads = effective_n_threads
self.chunks_n_threads = chunks_n_threads
self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
self.n_features = n_features
self.chunk_size = chunk_size
self.dist_middle_terms_chunks = vector[vector[float64_t]](self.effective_n_threads)
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
return
cdef void _parallel_on_X_parallel_init(self, intp_t thread_num) noexcept nogil:
self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
return
cdef void _parallel_on_Y_init(self) noexcept nogil:
for thread_num in range(self.chunks_n_threads):
self.dist_middle_terms_chunks[thread_num].resize(
self.dist_middle_terms_chunks_size
)
cdef void _parallel_on_Y_parallel_init(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
return
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num
) noexcept nogil:
return
cdef float64_t * _compute_dist_middle_terms(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
return NULL
cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
"""Computes the middle term of the Euclidean distance between two chunked dense matrices
X_c and Y_c.
dist_middle_terms = - 2 X_c_i.Y_c_j^T
This class uses the BLAS GEMM routine to compute the dot product for each
chunk of the distance matrix with improved arithmetic intensity and vector instructions (SIMD).
"""
def __init__(
self,
const {{INPUT_DTYPE_t}}[:, ::1] X,
const {{INPUT_DTYPE_t}}[:, ::1] Y,
intp_t effective_n_threads,
intp_t chunks_n_threads,
intp_t dist_middle_terms_chunks_size,
intp_t n_features,
intp_t chunk_size,
):
super().__init__(
effective_n_threads,
chunks_n_threads,
dist_middle_terms_chunks_size,
n_features,
chunk_size,
)
self.X = X
self.Y = Y
{{if upcast_to_float64}}
# We populate the buffer for upcasting chunks of X and Y from float32 to float64.
self.X_c_upcast = vector[vector[float64_t]](self.effective_n_threads)
self.Y_c_upcast = vector[vector[float64_t]](self.effective_n_threads)
upcast_buffer_n_elements = self.chunk_size * n_features
for thread_num in range(self.effective_n_threads):
self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
{{endif}}
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
{{if upcast_to_float64}}
cdef:
intp_t i, j
intp_t n_chunk_samples = Y_end - Y_start
# Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
for i in range(n_chunk_samples):
for j in range(self.n_features):
self.Y_c_upcast[thread_num][i * self.n_features + j] = <float64_t> self.Y[Y_start + i, j]
{{else}}
return
{{endif}}
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
{{if upcast_to_float64}}
cdef:
intp_t i, j
intp_t n_chunk_samples = X_end - X_start
# Upcasting X_c=X[X_start:X_end, :] from float32 to float64
for i in range(n_chunk_samples):
for j in range(self.n_features):
self.X_c_upcast[thread_num][i * self.n_features + j] = <float64_t> self.X[X_start + i, j]
{{else}}
return
{{endif}}
cdef void _parallel_on_Y_parallel_init(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
{{if upcast_to_float64}}
cdef:
intp_t i, j
intp_t n_chunk_samples = X_end - X_start
# Upcasting X_c=X[X_start:X_end, :] from float32 to float64
for i in range(n_chunk_samples):
for j in range(self.n_features):
self.X_c_upcast[thread_num][i * self.n_features + j] = <float64_t> self.X[X_start + i, j]
{{else}}
return
{{endif}}
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num
) noexcept nogil:
{{if upcast_to_float64}}
cdef:
intp_t i, j
intp_t n_chunk_samples = Y_end - Y_start
# Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
for i in range(n_chunk_samples):
for j in range(self.n_features):
self.Y_c_upcast[thread_num][i * self.n_features + j] = <float64_t> self.Y[Y_start + i, j]
{{else}}
return
{{endif}}
cdef float64_t * _compute_dist_middle_terms(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
cdef:
float64_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
# Careful: LDA, LDB and LDC are given for F-ordered arrays
# in the BLAS documentation, for instance:
# https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
#
# Here, we use their counterpart values to work with C-ordered arrays.
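# Concretely, for C-ordered operands the leading dimensions are the row
# strides: lda = ldb = n_features, and ldc = n, the number of columns of
# the C-ordered (m, n) result chunk.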
BLAS_Order order = RowMajor
BLAS_Trans ta = NoTrans
BLAS_Trans tb = Trans
intp_t m = X_end - X_start
intp_t n = Y_end - Y_start
intp_t K = self.n_features
float64_t alpha = - 2.
{{if upcast_to_float64}}
float64_t * A = self.X_c_upcast[thread_num].data()
float64_t * B = self.Y_c_upcast[thread_num].data()
{{else}}
# Casting for A and B to remove the const is needed because APIs exposed via
# scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
# See: https://github.com/scipy/scipy/issues/14262
float64_t * A = <float64_t *> &self.X[X_start, 0]
float64_t * B = <float64_t *> &self.Y[Y_start, 0]
{{endif}}
intp_t lda = self.n_features
intp_t ldb = self.n_features
float64_t beta = 0.
intp_t ldc = Y_end - Y_start
# dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T`
_gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
return dist_middle_terms
cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
"""Middle term of the Euclidean distance between two chunked CSR matrices.
The result is returned as a contiguous array.
dist_middle_terms = - 2 X_c_i.Y_c_j^T
The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64.
This routine iterates over the data, indices and indptr arrays of the sparse matrices without
densifying them.
"""
def __init__(
self,
X,
Y,
intp_t effective_n_threads,
intp_t chunks_n_threads,
intp_t dist_middle_terms_chunks_size,
intp_t n_features,
intp_t chunk_size,
):
super().__init__(
effective_n_threads,
chunks_n_threads,
dist_middle_terms_chunks_size,
n_features,
chunk_size,
)
self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
# Flush the thread dist_middle_terms_chunks to 0.0
fill(
self.dist_middle_terms_chunks[thread_num].begin(),
self.dist_middle_terms_chunks[thread_num].end(),
0.0,
)
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
# Flush the thread dist_middle_terms_chunks to 0.0
fill(
self.dist_middle_terms_chunks[thread_num].begin(),
self.dist_middle_terms_chunks[thread_num].end(),
0.0,
)
cdef float64_t * _compute_dist_middle_terms(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
cdef:
float64_t *dist_middle_terms = (
self.dist_middle_terms_chunks[thread_num].data()
)
_middle_term_sparse_sparse_64(
self.X_data,
self.X_indices,
self.X_indptr,
X_start,
X_end,
self.Y_data,
self.Y_indices,
self.Y_indptr,
Y_start,
Y_end,
dist_middle_terms,
)
return dist_middle_terms
cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
"""Middle term of the Euclidean distance between chunks of a CSR matrix and an np.ndarray.
The logic of the computation is wrapped in the routine _middle_term_sparse_dense_{{name_suffix}}.
This routine iterates over the data, indices and indptr arrays of the sparse matrix
without densifying it.
"""
def __init__(
self,
X,
Y,
intp_t effective_n_threads,
intp_t chunks_n_threads,
intp_t dist_middle_terms_chunks_size,
intp_t n_features,
intp_t chunk_size,
bint c_ordered_middle_term,
):
super().__init__(
effective_n_threads,
chunks_n_threads,
dist_middle_terms_chunks_size,
n_features,
chunk_size,
)
self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
self.Y = Y
self.c_ordered_middle_term = c_ordered_middle_term
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
# Fill the thread's dist_middle_terms_chunks with 0.0 before
# computing its elements in _compute_dist_middle_terms.
fill(
self.dist_middle_terms_chunks[thread_num].begin(),
self.dist_middle_terms_chunks[thread_num].end(),
0.0,
)
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
# Fill the thread's dist_middle_terms_chunks with 0.0 before
# computing its elements in _compute_dist_middle_terms.
fill(
self.dist_middle_terms_chunks[thread_num].begin(),
self.dist_middle_terms_chunks[thread_num].end(),
0.0,
)
cdef float64_t * _compute_dist_middle_terms(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
cdef:
float64_t *dist_middle_terms = (
self.dist_middle_terms_chunks[thread_num].data()
)
# For the dense-sparse case, we use the sparse-dense case
# with dist_middle_terms seen as F-ordered.
# Hence we swap the X and Y chunk boundaries here.
if not self.c_ordered_middle_term:
X_start, Y_start = Y_start, X_start
X_end, Y_end = Y_end, X_end
_middle_term_sparse_dense_{{name_suffix}}(
self.X_data,
self.X_indices,
self.X_indptr,
X_start,
X_end,
self.Y,
Y_start,
Y_end,
self.c_ordered_middle_term,
dist_middle_terms,
)
return dist_middle_terms
{{endfor}}

View File

@@ -0,0 +1,90 @@
cimport numpy as cnp
from libcpp.memory cimport shared_ptr
from libcpp.vector cimport vector
from cython cimport final
from sklearn.utils._typedefs cimport intp_t, float64_t
cnp.import_array()
######################
## std::vector to np.ndarray coercion
# As type covariance is not supported for C++ containers via Cython,
# we need to redefine fused types.
ctypedef fused vector_double_intp_t:
vector[intp_t]
vector[float64_t]
ctypedef fused vector_vector_double_intp_t:
vector[vector[intp_t]]
vector[vector[float64_t]]
cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
shared_ptr[vector_vector_double_intp_t] vecs
)
#####################
{{for name_suffix in ['64', '32']}}
from sklearn.metrics._pairwise_distances_reduction._base cimport BaseDistancesReduction{{name_suffix}}
from sklearn.metrics._pairwise_distances_reduction._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
"""float{{name_suffix}} implementation of the RadiusNeighbors."""
cdef:
float64_t radius
# DistanceMetric{{name_suffix}} computes rank-preserving surrogate distances via rdist,
# which are proxies requiring fewer computations.
# We store the surrogate equivalent of the radius so that it can be compared
# against the vectors' rank-preserving surrogate distances.
float64_t r_radius
# Neighbors indices and distances are returned as np.ndarrays of np.ndarrays.
#
# For this implementation, we want resizable buffers which we will wrap
# into numpy arrays at the end. std::vector comes as a handy container
# for interacting efficiently with resizable buffers.
#
# Though it is possible to access their buffer address with
# std::vector::data, they can't be stolen: a buffer's lifetime
# is tied to its std::vector, and it is deallocated when the
# std::vector is.
#
# To solve this, we dynamically allocate std::vectors and then
# encapsulate them in a StdVectorSentinel responsible for
# freeing them when the associated np.ndarray is freed.
#
# Shared pointers (defined via shared_ptr) are used for safer memory management.
# Unique pointers (defined via unique_ptr) can't be used as datastructures
# are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk.
shared_ptr[vector[vector[intp_t]]] neigh_indices
shared_ptr[vector[vector[float64_t]]] neigh_distances
# Used as arrays of pointers to the per-thread private datastructures.
vector[shared_ptr[vector[vector[intp_t]]]] neigh_indices_chunks
vector[shared_ptr[vector[vector[float64_t]]]] neigh_distances_chunks
bint sort_results
@final
cdef void _merge_vectors(
self,
intp_t idx,
intp_t num_threads,
) noexcept nogil
cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}):
"""EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}."""
cdef:
MiddleTermComputer{{name_suffix}} middle_term_computer
const float64_t[::1] X_norm_squared
const float64_t[::1] Y_norm_squared
bint use_squared_distances
{{endfor}}

View File

@@ -0,0 +1,514 @@
cimport numpy as cnp
import numpy as np
import warnings
from libcpp.memory cimport shared_ptr, make_shared
from libcpp.vector cimport vector
from libcpp.algorithm cimport move
from cython cimport final
from cython.operator cimport dereference as deref
from cython.parallel cimport parallel, prange
from sklearn.utils._sorting cimport simultaneous_sort
from sklearn.utils._typedefs cimport intp_t, float64_t
from sklearn.utils._vector_sentinel cimport vector_to_nd_array
from numbers import Real
from scipy.sparse import issparse
from sklearn.utils import check_array, check_scalar
from sklearn.utils.fixes import _in_unstable_openblas_configuration
from sklearn.utils.parallel import _get_threadpool_controller
cnp.import_array()
######################
cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
shared_ptr[vector_vector_double_intp_t] vecs
):
"""Coerce a std::vector of std::vector to an ndarray of ndarray."""
cdef:
intp_t n = deref(vecs).size()
cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray)
for i in range(n):
nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i]))
return nd_arrays_of_nd_arrays
#####################
{{for name_suffix in ['64', '32']}}
from sklearn.metrics._pairwise_distances_reduction._base cimport (
BaseDistancesReduction{{name_suffix}},
_sqeuclidean_row_norms{{name_suffix}}
)
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
from sklearn.metrics._pairwise_distances_reduction._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
"""float{{name_suffix}} implementation of the RadiusNeighbors."""
@classmethod
def compute(
cls,
X,
Y,
float64_t radius,
str metric="euclidean",
chunk_size=None,
dict metric_kwargs=None,
str strategy=None,
bint return_distance=False,
bint sort_results=False,
):
"""Compute the radius-neighbors reduction.
This classmethod is responsible for introspecting the arguments
values to dispatch to the most appropriate implementation of
:class:`RadiusNeighbors{{name_suffix}}`.
This allows decoupling the API entirely from the implementation details
whilst maintaining RAII: all temporarily allocated datastructures necessary
for the concrete implementation are therefore freed when this classmethod
returns.
No instance should directly be created outside of this class method.
"""
if metric in ("euclidean", "sqeuclidean"):
# Specialized implementation of RadiusNeighbors for the Euclidean
# distance for the dense-dense and sparse-sparse cases.
# This implementation computes the distances by chunk using
# a decomposition of the Squared Euclidean distance.
# This specialisation has an improved arithmetic intensity for both
# the dense and sparse settings, allowing in most cases speed-ups of
# several orders of magnitude compared to the generic RadiusNeighbors
# implementation.
# For more information see MiddleTermComputer.
use_squared_distances = metric == "sqeuclidean"
pda = EuclideanRadiusNeighbors{{name_suffix}}(
X=X, Y=Y, radius=radius,
use_squared_distances=use_squared_distances,
chunk_size=chunk_size,
strategy=strategy,
sort_results=sort_results,
metric_kwargs=metric_kwargs,
)
else:
# Fall back on a generic implementation that handles most scipy
# metrics by computing the distances between 2 vectors at a time.
pda = RadiusNeighbors{{name_suffix}}(
datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
radius=radius,
chunk_size=chunk_size,
strategy=strategy,
sort_results=sort_results,
)
# Limit the number of threads in second level of nested parallelism for BLAS
# to avoid threads over-subscription (in GEMM for instance).
with _get_threadpool_controller().limit(limits=1, user_api="blas"):
if pda.execute_in_parallel_on_Y:
pda._parallel_on_Y()
else:
pda._parallel_on_X()
return pda._finalize_results(return_distance)
def __init__(
self,
DatasetsPair{{name_suffix}} datasets_pair,
float64_t radius,
chunk_size=None,
strategy=None,
sort_results=False,
):
super().__init__(
datasets_pair=datasets_pair,
chunk_size=chunk_size,
strategy=strategy,
)
self.radius = check_scalar(radius, "radius", Real, min_val=0)
self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius)
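# For instance, with the Euclidean metric rdist is the squared distance,
# so r_radius amounts to radius ** 2.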
self.sort_results = sort_results
# Allocating pointers to datastructures but not the datastructures themselves.
# There are as many pointers as effective threads.
#
# For the sake of explicitness:
# - when parallelizing on X, the pointers of those heaps are referencing
# self.neigh_distances and self.neigh_indices
# - when parallelizing on Y, the pointers of those heaps are referencing
# std::vectors of std::vectors which are thread-wise-allocated and whose
# content will be merged into self.neigh_distances and self.neigh_indices.
self.neigh_distances_chunks = vector[shared_ptr[vector[vector[float64_t]]]](
self.chunks_n_threads
)
self.neigh_indices_chunks = vector[shared_ptr[vector[vector[intp_t]]]](
self.chunks_n_threads
)
# Temporary datastructures which will be coerced to numpy arrays before
# RadiusNeighbors.compute returns, and then freed.
self.neigh_distances = make_shared[vector[vector[float64_t]]](self.n_samples_X)
self.neigh_indices = make_shared[vector[vector[intp_t]]](self.n_samples_X)
cdef void _compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
cdef:
intp_t i, j
float64_t r_dist_i_j
for i in range(X_start, X_end):
for j in range(Y_start, Y_end):
r_dist_i_j = self.datasets_pair.surrogate_dist(i, j)
if r_dist_i_j <= self.r_radius:
deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j)
deref(self.neigh_indices_chunks[thread_num])[i].push_back(j)
def _finalize_results(self, bint return_distance=False):
if return_distance:
# We need to recompute distances because we relied on
# surrogate distances for the reduction.
self.compute_exact_distances()
return (
coerce_vectors_to_nd_arrays(self.neigh_distances),
coerce_vectors_to_nd_arrays(self.neigh_indices),
)
return coerce_vectors_to_nd_arrays(self.neigh_indices)
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
# As this strategy is embarrassingly parallel, we can set the
# thread vectors' pointers to the main vectors'.
self.neigh_distances_chunks[thread_num] = self.neigh_distances
self.neigh_indices_chunks[thread_num] = self.neigh_indices
@final
cdef void _parallel_on_X_prange_iter_finalize(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
cdef:
intp_t idx
# Sorting neighbors for each query vector of X
if self.sort_results:
for idx in range(X_start, X_end):
simultaneous_sort(
deref(self.neigh_distances)[idx].data(),
deref(self.neigh_indices)[idx].data(),
deref(self.neigh_indices)[idx].size()
)
cdef void _parallel_on_Y_init(
self,
) noexcept nogil:
cdef:
intp_t thread_num
# As chunks of X are shared across threads, datastructures must be duplicated to
# avoid race conditions: each thread has its own vectors of n_samples_X vectors,
# which are then merged back into the main n_samples_X vectors.
for thread_num in range(self.chunks_n_threads):
self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[float64_t]]](self.n_samples_X)
self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[intp_t]]](self.n_samples_X)
@final
cdef void _merge_vectors(
self,
intp_t idx,
intp_t num_threads,
) noexcept nogil:
cdef:
intp_t thread_num
intp_t idx_n_elements = 0
intp_t last_element_idx = deref(self.neigh_indices)[idx].size()
# Resizing buffers only once for the given number of elements.
for thread_num in range(num_threads):
idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size()
deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements)
deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements)
# Moving the elements by range using the range first element
# as the reference for the insertion.
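# Worked example: if the main vector already holds [z] and two threads
# hold [a] and [b, c] for sample idx, the buffers are resized to 4 and
# the merge yields [z, a, b, c], leaving the thread vectors moved-from.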
for thread_num in range(num_threads):
move(
deref(self.neigh_distances_chunks[thread_num])[idx].begin(),
deref(self.neigh_distances_chunks[thread_num])[idx].end(),
deref(self.neigh_distances)[idx].begin() + last_element_idx
)
move(
deref(self.neigh_indices_chunks[thread_num])[idx].begin(),
deref(self.neigh_indices_chunks[thread_num])[idx].end(),
deref(self.neigh_indices)[idx].begin() + last_element_idx
)
last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size()
cdef void _parallel_on_Y_finalize(
self,
) noexcept nogil:
cdef:
intp_t idx
with nogil, parallel(num_threads=self.effective_n_threads):
# Merge vectors used in threads into the main ones.
# This is done in parallel sample-wise (no need for locks).
for idx in prange(self.n_samples_X, schedule='static'):
self._merge_vectors(idx, self.chunks_n_threads)
# The contents of the vectors have been std::moved.
# Hence they can't be used anymore and can be deleted.
# Their deletion is carried out automatically as the
# implementation relies on shared pointers.
# Sort in parallel in ascending order w.r.t the distances if requested.
if self.sort_results:
for idx in prange(self.n_samples_X, schedule='static'):
simultaneous_sort(
deref(self.neigh_distances)[idx].data(),
deref(self.neigh_indices)[idx].data(),
deref(self.neigh_indices)[idx].size()
)
return
cdef void compute_exact_distances(self) noexcept nogil:
"""Convert rank-preserving distances to pairwise distances in parallel."""
cdef:
intp_t i
vector[intp_t].size_type j
for i in prange(self.n_samples_X, nogil=True, schedule='static',
num_threads=self.effective_n_threads):
for j in range(deref(self.neigh_indices)[i].size()):
deref(self.neigh_distances)[i][j] = (
self.datasets_pair.distance_metric._rdist_to_dist(
# Guard against potential -0., which would cause NaNs to be produced.
max(deref(self.neigh_distances)[i][j], 0.)
)
)
cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}):
"""EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}."""
@classmethod
def is_usable_for(cls, X, Y, metric) -> bool:
return (RadiusNeighbors{{name_suffix}}.is_usable_for(X, Y, metric)
and not _in_unstable_openblas_configuration())
def __init__(
self,
X,
Y,
float64_t radius,
bint use_squared_distances=False,
chunk_size=None,
strategy=None,
sort_results=False,
metric_kwargs=None,
):
if (
isinstance(metric_kwargs, dict) and
(metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"})
):
warnings.warn(
f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
f"usable for this case (EuclideanRadiusNeighbors64) and will be ignored.",
UserWarning,
stacklevel=3,
)
super().__init__(
# The datasets pair here is used for exact distances computations
datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric="euclidean"),
radius=radius,
chunk_size=chunk_size,
strategy=strategy,
sort_results=sort_results,
)
cdef:
intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
X,
Y,
self.effective_n_threads,
self.chunks_n_threads,
dist_middle_terms_chunks_size,
n_features=X.shape[1],
chunk_size=self.chunk_size,
)
if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
self.Y_norm_squared = check_array(
metric_kwargs.pop("Y_norm_squared"),
ensure_2d=False,
input_name="Y_norm_squared",
dtype=np.float64,
)
else:
self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(
Y,
self.effective_n_threads,
)
if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
self.X_norm_squared = check_array(
metric_kwargs.pop("X_norm_squared"),
ensure_2d=False,
input_name="X_norm_squared",
dtype=np.float64,
)
else:
# Do not recompute norms if datasets are identical.
self.X_norm_squared = (
self.Y_norm_squared if X is Y else
_sqeuclidean_row_norms{{name_suffix}}(
X,
self.effective_n_threads,
)
)
self.use_squared_distances = use_squared_distances
if use_squared_distances:
# In this specialisation and this setup, the value passed as the radius is
# already the adapted (squared) radius, so we overwrite r_radius with it.
self.r_radius = radius
@final
cdef void _parallel_on_X_parallel_init(
self,
intp_t thread_num,
) noexcept nogil:
RadiusNeighbors{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num)
self.middle_term_computer._parallel_on_X_parallel_init(thread_num)
@final
cdef void _parallel_on_X_init_chunk(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
RadiusNeighbors{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
@final
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
RadiusNeighbors{{name_suffix}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
self,
X_start, X_end,
Y_start, Y_end,
thread_num,
)
self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
X_start, X_end, Y_start, Y_end, thread_num,
)
@final
cdef void _parallel_on_Y_init(
self,
) noexcept nogil:
RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self)
self.middle_term_computer._parallel_on_Y_init()
@final
cdef void _parallel_on_Y_parallel_init(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
RadiusNeighbors{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
@final
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
RadiusNeighbors{{name_suffix}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
self,
X_start, X_end,
Y_start, Y_end,
thread_num,
)
self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
X_start, X_end, Y_start, Y_end, thread_num
)
@final
cdef void compute_exact_distances(self) noexcept nogil:
if not self.use_squared_distances:
RadiusNeighbors{{name_suffix}}.compute_exact_distances(self)
@final
cdef void _compute_and_reduce_distances_on_chunks(
self,
intp_t X_start,
intp_t X_end,
intp_t Y_start,
intp_t Y_end,
intp_t thread_num,
) noexcept nogil:
cdef:
intp_t i, j
float64_t sqeuclidean_dist_i_j
intp_t n_X = X_end - X_start
intp_t n_Y = Y_end - Y_start
float64_t *dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms(
X_start, X_end, Y_start, Y_end, thread_num
)
# Pushing the distances and their associated indices into vectors.
for i in range(n_X):
for j in range(n_Y):
sqeuclidean_dist_i_j = (
self.X_norm_squared[i + X_start]
+ dist_middle_terms[i * n_Y + j]
+ self.Y_norm_squared[j + Y_start]
)
# Catastrophic cancellation might cause -0. to be present,
# e.g. when computing d(x_i, y_i) when X is Y.
sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j)
if sqeuclidean_dist_i_j <= self.r_radius:
deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(sqeuclidean_dist_i_j)
deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start)
{{endfor}}

View File

@@ -0,0 +1,217 @@
import warnings
from cython cimport floating, final, integral
from cython.operator cimport dereference as deref
from cython.parallel cimport parallel, prange
from sklearn.metrics._pairwise_distances_reduction._classmode cimport WeightingStrategy
from sklearn.utils._typedefs cimport intp_t, float64_t, uint8_t
import numpy as np
from scipy.sparse import issparse
from sklearn.utils.parallel import _get_threadpool_controller
{{for name_suffix in ["32", "64"]}}
from sklearn.metrics._pairwise_distances_reduction._radius_neighbors cimport RadiusNeighbors{{name_suffix}}
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
cdef class RadiusNeighborsClassMode{{name_suffix}}(RadiusNeighbors{{name_suffix}}):
"""
{{name_suffix}}bit implementation of RadiusNeighborsClassMode.
"""
cdef:
const intp_t[::1] Y_labels
const intp_t[::1] unique_Y_labels
intp_t outlier_label_index
bint outlier_label_exists
bint outliers_exist
uint8_t[::1] outliers
object outlier_label
float64_t[:, ::1] class_scores
WeightingStrategy weight_type
@classmethod
def compute(
cls,
X,
Y,
float64_t radius,
weights,
Y_labels,
unique_Y_labels,
outlier_label=None,
str metric="euclidean",
chunk_size=None,
dict metric_kwargs=None,
str strategy=None,
):
# Use a generic implementation that handles most scipy
# metrics by computing the distances between 2 vectors at a time.
pda = RadiusNeighborsClassMode{{name_suffix}}(
datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
radius=radius,
chunk_size=chunk_size,
strategy=strategy,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=outlier_label,
)
# Limit the number of threads in second level of nested parallelism for BLAS
# to avoid threads over-subscription (in GEMM for instance).
with _get_threadpool_controller().limit(limits=1, user_api="blas"):
if pda.execute_in_parallel_on_Y:
pda._parallel_on_Y()
else:
pda._parallel_on_X()
return pda._finalize_results()
def __init__(
self,
DatasetsPair{{name_suffix}} datasets_pair,
const intp_t[::1] Y_labels,
const intp_t[::1] unique_Y_labels,
float64_t radius,
chunk_size=None,
strategy=None,
weights=None,
outlier_label=None,
):
super().__init__(
datasets_pair=datasets_pair,
chunk_size=chunk_size,
strategy=strategy,
radius=radius,
)
if weights == "uniform":
self.weight_type = WeightingStrategy.uniform
elif weights == "distance":
self.weight_type = WeightingStrategy.distance
else:
self.weight_type = WeightingStrategy.callable
self.Y_labels = Y_labels
self.unique_Y_labels = unique_Y_labels
self.outlier_label_index = -1
self.outliers_exist = False
self.outlier_label = outlier_label
self.outliers = np.zeros(self.n_samples_X, dtype=np.bool_)
cdef intp_t idx
if self.outlier_label is not None:
for idx in range(self.unique_Y_labels.shape[0]):
if self.unique_Y_labels[idx] == outlier_label:
self.outlier_label_index = idx
# Map from set of unique labels to their indices in `class_scores`
# Buffer used in building a histogram for one-pass weighted mode
self.class_scores = np.zeros(
(self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64,
)
cdef inline void weighted_histogram_mode(
self,
intp_t sample_index,
intp_t sample_n_neighbors,
intp_t* indices,
float64_t* distances,
) noexcept nogil:
cdef:
intp_t neighbor_idx, neighbor_class_idx, label_index
float64_t score_incr = 1
bint use_distance_weighting = (
self.weight_type == WeightingStrategy.distance
)
if sample_n_neighbors == 0:
self.outliers_exist = True
self.outliers[sample_index] = True
if self.outlier_label_index >= 0:
self.class_scores[sample_index][self.outlier_label_index] = score_incr
return
# Iterate over the neighbors. Their number can differ from
# sample to sample since it depends on the radius.
for neighbor_rank in range(sample_n_neighbors):
if use_distance_weighting:
score_incr = 1 / distances[neighbor_rank]
neighbor_idx = indices[neighbor_rank]
neighbor_class_idx = self.Y_labels[neighbor_idx]
self.class_scores[sample_index][neighbor_class_idx] += score_incr
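# e.g. with uniform weights and neighbor labels mapping to classes
# [0, 1, 1], this loop leaves the sample's row of class_scores at
# [1.0, 2.0].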
return
@final
cdef void _parallel_on_X_prange_iter_finalize(
self,
intp_t thread_num,
intp_t X_start,
intp_t X_end,
) noexcept nogil:
cdef:
intp_t idx
for idx in range(X_start, X_end):
self.weighted_histogram_mode(
sample_index=idx,
sample_n_neighbors=deref(self.neigh_indices)[idx].size(),
indices=deref(self.neigh_indices)[idx].data(),
distances=deref(self.neigh_distances)[idx].data(),
)
return
@final
cdef void _parallel_on_Y_finalize(
self,
) noexcept nogil:
cdef:
intp_t idx
with nogil, parallel(num_threads=self.effective_n_threads):
# Merge vectors used in threads into the main ones.
# This is done in parallel sample-wise (no need for locks).
for idx in prange(self.n_samples_X, schedule='static'):
self._merge_vectors(idx, self.chunks_n_threads)
for idx in prange(self.n_samples_X, schedule='static'):
self.weighted_histogram_mode(
sample_index=idx,
sample_n_neighbors=deref(self.neigh_indices)[idx].size(),
indices=deref(self.neigh_indices)[idx].data(),
distances=deref(self.neigh_distances)[idx].data(),
)
return
def _finalize_results(self):
if self.outliers_exist and self.outlier_label is None:
raise ValueError(
"No neighbors found for test samples %r, "
"you can try using larger radius, "
"giving a label for outliers, "
"or considering removing them from your dataset."
% np.where(self.outliers)[0]
)
if self.outliers_exist and self.outlier_label_index < 0:
warnings.warn(
"Outlier label %s is not in training "
"classes. All class probabilities of "
"outliers will be assigned with 0."
% self.outlier_label
)
probabilities = np.asarray(self.class_scores)
normalizer = probabilities.sum(axis=1, keepdims=True)
normalizer[normalizer == 0.0] = 1.0
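# Samples with no neighbors (and no known outlier label) keep an all-zero
# score row; the guard above avoids dividing such rows by zero.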
probabilities /= normalizer
return probabilities
{{endfor}}

View File

@@ -0,0 +1,193 @@
# Note: the dependencies between the different Cython files in
# _pairwise_distances_reduction are probably among the most involved in
# scikit-learn. If you change this file, make sure to build from scratch:
# rm -rf build; make dev-meson
# then run a command like this:
# ninja -C build/cp312 -t missingdeps
# and make sure that the output is something like:
# No missing dependencies on generated files found.
# _pairwise_distances_reduction is cimported from other subpackages so this is
# needed for the cimport to work
_pairwise_distances_reduction_cython_tree = [
fs.copyfile('__init__.py'),
# We are in a sub-module of metrics, so we always need to have
# sklearn/metrics/__init__.py copied to the build directory to avoid the
# error:
# relative cimport beyond main package is not allowed
metrics_cython_tree
]
_classmode_pxd = fs.copyfile('_classmode.pxd')
_datasets_pair_pxd = custom_target(
'_datasets_pair_pxd',
output: '_datasets_pair.pxd',
input: '_datasets_pair.pxd.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_datasets_pair_pyx = custom_target(
'_datasets_pair_pyx',
output: '_datasets_pair.pyx',
input: '_datasets_pair.pyx.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
# TODO in principle this should go in py.extension_module below. This is a
# temporary work-around for a dependency issue with .pyx.tp files. For more
# details, see https://github.com/mesonbuild/meson/issues/13212
depends: [_datasets_pair_pxd, _pairwise_distances_reduction_cython_tree, utils_cython_tree],
)
_datasets_pair = py.extension_module(
'_datasets_pair',
cython_gen_cpp.process(_datasets_pair_pyx),
dependencies: [np_dep],
subdir: 'sklearn/metrics/_pairwise_distances_reduction',
install: true
)
_base_pxd = custom_target(
'_base_pxd',
output: '_base.pxd',
input: '_base.pxd.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_base_pyx = custom_target(
'_base_pyx',
output: '_base.pyx',
input: '_base.pyx.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
# TODO in principle this should go in py.extension_module below. This is a
# temporary work-around for a dependency issue with .pyx.tp files. For more
# details, see https://github.com/mesonbuild/meson/issues/13212
depends: [_base_pxd, _pairwise_distances_reduction_cython_tree,
_datasets_pair_pxd, utils_cython_tree],
)
_base = py.extension_module(
'_base',
cython_gen_cpp.process(_base_pyx),
dependencies: [np_dep, openmp_dep],
subdir: 'sklearn/metrics/_pairwise_distances_reduction',
install: true
)
_middle_term_computer_pxd = custom_target(
'_middle_term_computer_pxd',
output: '_middle_term_computer.pxd',
input: '_middle_term_computer.pxd.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_middle_term_computer_pyx = custom_target(
'_middle_term_computer_pyx',
output: '_middle_term_computer.pyx',
input: '_middle_term_computer.pyx.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
# TODO in principle this should go in py.extension_module below. This is a
# temporary work-around for a dependency issue with .pyx.tp files. For more
# details, see https://github.com/mesonbuild/meson/issues/13212
depends: [_middle_term_computer_pxd,
_pairwise_distances_reduction_cython_tree,
utils_cython_tree],
)
_middle_term_computer = py.extension_module(
'_middle_term_computer',
cython_gen_cpp.process(_middle_term_computer_pyx),
dependencies: [np_dep],
subdir: 'sklearn/metrics/_pairwise_distances_reduction',
install: true
)
_argkmin_pxd = custom_target(
'_argkmin_pxd',
output: '_argkmin.pxd',
input: '_argkmin.pxd.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_argkmin_pyx = custom_target(
'_argkmin_pyx',
output: '_argkmin.pyx',
input: '_argkmin.pyx.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
# TODO in principle this should go in py.extension_module below. This is a
# temporary work-around for a dependency issue with .pyx.tp files. For more
# details, see https://github.com/mesonbuild/meson/issues/13212
depends: [_argkmin_pxd,
_pairwise_distances_reduction_cython_tree,
_datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd],
)
_argkmin = py.extension_module(
'_argkmin',
cython_gen_cpp.process(_argkmin_pyx),
dependencies: [np_dep, openmp_dep],
subdir: 'sklearn/metrics/_pairwise_distances_reduction',
install: true
)
_radius_neighbors_pxd = custom_target(
'_radius_neighbors_pxd',
output: '_radius_neighbors.pxd',
input: '_radius_neighbors.pxd.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_radius_neighbors_pyx = custom_target(
'_radius_neighbors_pyx',
output: '_radius_neighbors.pyx',
input: '_radius_neighbors.pyx.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
# TODO in principle this should go in py.extension_module below. This is a
# temporary work-around for a dependency issue with .pyx.tp files. For more
# details, see https://github.com/mesonbuild/meson/issues/13212
depends: [_radius_neighbors_pxd,
_datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd,
_pairwise_distances_reduction_cython_tree, utils_cython_tree],
)
_radius_neighbors = py.extension_module(
'_radius_neighbors',
cython_gen_cpp.process(_radius_neighbors_pyx),
dependencies: [np_dep, openmp_dep],
subdir: 'sklearn/metrics/_pairwise_distances_reduction',
install: true
)
_argkmin_classmode_pyx = custom_target(
'_argkmin_classmode_pyx',
output: '_argkmin_classmode.pyx',
input: '_argkmin_classmode.pyx.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
# TODO in principle this should go in py.extension_module below. This is a
# temporary work-around for a dependency issue with .pyx.tp files. For more
# details, see https://github.com/mesonbuild/meson/issues/13212
depends: [_classmode_pxd,
_argkmin_pxd, _pairwise_distances_reduction_cython_tree,
_datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd, utils_cython_tree],
)
_argkmin_classmode = py.extension_module(
'_argkmin_classmode',
cython_gen_cpp.process(_argkmin_classmode_pyx),
dependencies: [np_dep, openmp_dep],
# XXX: for some reason -fno-sized-deallocation is needed otherwise there is
# an error with undefined symbol _ZdlPv at import time in manylinux wheels.
# See https://github.com/scikit-learn/scikit-learn/issues/28596 for more details.
cpp_args: ['-fno-sized-deallocation'],
subdir: 'sklearn/metrics/_pairwise_distances_reduction',
install: true
)
_radius_neighbors_classmode_pyx = custom_target(
'_radius_neighbors_classmode_pyx',
output: '_radius_neighbors_classmode.pyx',
input: '_radius_neighbors_classmode.pyx.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
# TODO in principle this should go in py.extension_module below. This is a
# temporary work-around for a dependency issue with .pyx.tp files. For more
# details, see https://github.com/mesonbuild/meson/issues/13212
depends: [_classmode_pxd,
_middle_term_computer_pxd, _radius_neighbors_pxd,
_pairwise_distances_reduction_cython_tree,
_datasets_pair_pxd, _base_pxd, utils_cython_tree],
)
_radius_neighbors_classmode = py.extension_module(
'_radius_neighbors_classmode',
cython_gen_cpp.process(_radius_neighbors_classmode_pyx),
dependencies: [np_dep, openmp_dep],
subdir: 'sklearn/metrics/_pairwise_distances_reduction',
install: true
)

View File

@@ -0,0 +1,107 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from cython cimport floating
from cython.parallel cimport prange
from libc.math cimport fabs
from sklearn.utils._typedefs cimport intp_t
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
def _chi2_kernel_fast(floating[:, :] X,
floating[:, :] Y,
floating[:, :] result):
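# Fills result[i, j] with the negative additive chi-squared statistic
#   -sum_k (X[i, k] - Y[j, k])**2 / (X[i, k] + Y[j, k]),
# which callers may exponentiate to obtain the exponential chi-squared kernel.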
cdef intp_t i, j, k
cdef intp_t n_samples_X = X.shape[0]
cdef intp_t n_samples_Y = Y.shape[0]
cdef intp_t n_features = X.shape[1]
cdef double res, numerator, denominator
with nogil:
for i in range(n_samples_X):
for j in range(n_samples_Y):
res = 0
for k in range(n_features):
numerator = (X[i, k] - Y[j, k])
denominator = (X[i, k] + Y[j, k])
if denominator != 0:
res += numerator * numerator / denominator
result[i, j] = -res
def _sparse_manhattan(
const floating[::1] X_data,
const int[:] X_indices,
const int[:] X_indptr,
const floating[::1] Y_data,
const int[:] Y_indices,
const int[:] Y_indptr,
double[:, ::1] D,
):
"""Pairwise L1 distances for CSR matrices.
Usage:
>>> D = np.zeros((X.shape[0], Y.shape[0]))
>>> _sparse_manhattan(X.data, X.indices, X.indptr,
... Y.data, Y.indices, Y.indptr,
... D)
"""
cdef intp_t px, py, i, j, ix, iy
cdef double d = 0.0
cdef int m = D.shape[0]
cdef int n = D.shape[1]
cdef int X_indptr_end = 0
cdef int Y_indptr_end = 0
cdef int num_threads = _openmp_effective_n_threads()
# We scan the matrices row by row.
# Given row px in X and row py in Y, we find the positions (i and j
# respectively), in .indices where the indices for the two rows start.
# If the indices (ix and iy) are the same, the corresponding data values
# are processed and the cursors i and j are advanced.
# If not, the lowest index is considered. Its associated data value is
# processed and its cursor is advanced.
# We proceed like this until one of the cursors hits the end for its row.
# Then we process all remaining data values in the other row.
# Below, the avoidance of in-place operators is intentional:
# when prange is used, an in-place operator has a special meaning, i.e. it
# signals a "reduction".
for px in prange(m, nogil=True, num_threads=num_threads):
X_indptr_end = X_indptr[px + 1]
for py in range(n):
Y_indptr_end = Y_indptr[py + 1]
i = X_indptr[px]
j = Y_indptr[py]
d = 0.0
while i < X_indptr_end and j < Y_indptr_end:
ix = X_indices[i]
iy = Y_indices[j]
if ix == iy:
d = d + fabs(X_data[i] - Y_data[j])
i = i + 1
j = j + 1
elif ix < iy:
d = d + fabs(X_data[i])
i = i + 1
else:
d = d + fabs(Y_data[j])
j = j + 1
if i == X_indptr_end:
while j < Y_indptr_end:
d = d + fabs(Y_data[j])
j = j + 1
else:
while i < X_indptr_end:
d = d + fabs(X_data[i])
i = i + 1
D[px, py] = d

View File

@@ -0,0 +1,2 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

View File

@@ -0,0 +1,499 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from itertools import product
import numpy as np
from sklearn.base import is_classifier
from sklearn.metrics import confusion_matrix
from sklearn.utils._optional_dependencies import check_matplotlib_support
from sklearn.utils._plotting import _validate_style_kwargs
from sklearn.utils.multiclass import unique_labels
class ConfusionMatrixDisplay:
"""Confusion Matrix visualization.
It is recommended to use
:func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator` or
:func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` to
create a :class:`ConfusionMatrixDisplay`. All parameters are stored as
attributes.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the
:ref:`Model Evaluation Guide <confusion_matrix>`.
Parameters
----------
confusion_matrix : ndarray of shape (n_classes, n_classes)
Confusion matrix.
display_labels : ndarray of shape (n_classes,), default=None
Display labels for plot. If None, display labels are set from 0 to
`n_classes - 1`.
Attributes
----------
im_ : matplotlib AxesImage
Image representing the confusion matrix.
text_ : ndarray of shape (n_classes, n_classes), dtype=matplotlib Text, \
or None
Array of matplotlib Text objects. `None` if `include_values` is false.
ax_ : matplotlib Axes
Axes with confusion matrix.
figure_ : matplotlib Figure
Figure containing the confusion matrix.
See Also
--------
confusion_matrix : Compute Confusion Matrix to evaluate the accuracy of a
classification.
ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix
given an estimator, the data, and the label.
ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix
given the true and predicted labels.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y,
... random_state=0)
>>> clf = SVC(random_state=0)
>>> clf.fit(X_train, y_train)
SVC(random_state=0)
>>> predictions = clf.predict(X_test)
>>> cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
>>> disp = ConfusionMatrixDisplay(confusion_matrix=cm,
... display_labels=clf.classes_)
>>> disp.plot()
<...>
>>> plt.show()
"""
def __init__(self, confusion_matrix, *, display_labels=None):
self.confusion_matrix = confusion_matrix
self.display_labels = display_labels
def plot(
self,
*,
include_values=True,
cmap="viridis",
xticks_rotation="horizontal",
values_format=None,
ax=None,
colorbar=True,
im_kw=None,
text_kw=None,
):
"""Plot visualization.
Parameters
----------
include_values : bool, default=True
Includes values in confusion matrix.
cmap : str or matplotlib Colormap, default='viridis'
Colormap recognized by matplotlib.
xticks_rotation : {'vertical', 'horizontal'} or float, \
default='horizontal'
Rotation of xtick labels.
values_format : str, default=None
Format specification for values in confusion matrix. If `None`,
the format specification is 'd' or '.2g' whichever is shorter.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
colorbar : bool, default=True
Whether or not to add a colorbar to the plot.
im_kw : dict, default=None
Dict with keywords passed to `matplotlib.pyplot.imshow` call.
text_kw : dict, default=None
Dict with keywords passed to `matplotlib.pyplot.text` call.
.. versionadded:: 1.2
Returns
-------
display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`
Returns a :class:`~sklearn.metrics.ConfusionMatrixDisplay` instance
that contains all the information to plot the confusion matrix.
"""
check_matplotlib_support("ConfusionMatrixDisplay.plot")
import matplotlib.pyplot as plt
if ax is None:
fig, ax = plt.subplots()
else:
fig = ax.figure
cm = self.confusion_matrix
n_classes = cm.shape[0]
default_im_kw = dict(interpolation="nearest", cmap=cmap)
im_kw = im_kw or {}
im_kw = _validate_style_kwargs(default_im_kw, im_kw)
text_kw = text_kw or {}
self.im_ = ax.imshow(cm, **im_kw)
self.text_ = None
cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(1.0)
if include_values:
self.text_ = np.empty_like(cm, dtype=object)
# print text with appropriate color depending on background
thresh = (cm.max() + cm.min()) / 2.0
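# e.g. with the default viridis colormap, cells below the threshold are
# dark and get the light cmap_max text color, and vice versa.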
for i, j in product(range(n_classes), range(n_classes)):
color = cmap_max if cm[i, j] < thresh else cmap_min
if values_format is None:
text_cm = format(cm[i, j], ".2g")
if cm.dtype.kind != "f":
text_d = format(cm[i, j], "d")
if len(text_d) < len(text_cm):
text_cm = text_d
else:
text_cm = format(cm[i, j], values_format)
default_text_kwargs = dict(ha="center", va="center", color=color)
text_kwargs = _validate_style_kwargs(default_text_kwargs, text_kw)
self.text_[i, j] = ax.text(j, i, text_cm, **text_kwargs)
if self.display_labels is None:
display_labels = np.arange(n_classes)
else:
display_labels = self.display_labels
if colorbar:
fig.colorbar(self.im_, ax=ax)
ax.set(
xticks=np.arange(n_classes),
yticks=np.arange(n_classes),
xticklabels=display_labels,
yticklabels=display_labels,
ylabel="True label",
xlabel="Predicted label",
)
ax.set_ylim((n_classes - 0.5, -0.5))
plt.setp(ax.get_xticklabels(), rotation=xticks_rotation)
self.figure_ = fig
self.ax_ = ax
return self
@classmethod
def from_estimator(
cls,
estimator,
X,
y,
*,
labels=None,
sample_weight=None,
normalize=None,
display_labels=None,
include_values=True,
xticks_rotation="horizontal",
values_format=None,
cmap="viridis",
ax=None,
colorbar=True,
im_kw=None,
text_kw=None,
):
"""Plot Confusion Matrix given an estimator and some data.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the
:ref:`Model Evaluation Guide <confusion_matrix>`.
.. versionadded:: 1.0
Parameters
----------
estimator : estimator instance
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
in which the last estimator is a classifier.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
y : array-like of shape (n_samples,)
Target values.
labels : array-like of shape (n_classes,), default=None
List of labels to index the confusion matrix. This may be used to
reorder or select a subset of labels. If `None` is given, those
that appear at least once in `y_true` or `y_pred` are used in
sorted order.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
normalize : {'true', 'pred', 'all'}, default=None
How to normalize the counts displayed in the matrix:
- if `'true'`, the confusion matrix is normalized over the true
conditions (i.e. rows);
- if `'pred'`, the confusion matrix is normalized over the
predicted conditions (i.e. columns);
- if `'all'`, the confusion matrix is normalized by the total
number of samples;
- if `None` (default), the confusion matrix will not be normalized.
display_labels : array-like of shape (n_classes,), default=None
Target names used for plotting. By default, `labels` will be used
if it is defined, otherwise the unique labels of `y_true` and
`y_pred` will be used.
include_values : bool, default=True
Includes values in confusion matrix.
xticks_rotation : {'vertical', 'horizontal'} or float, \
default='horizontal'
Rotation of xtick labels.
values_format : str, default=None
Format specification for values in confusion matrix. If `None`, the
format specification is 'd' or '.2g' whichever is shorter.
cmap : str or matplotlib Colormap, default='viridis'
Colormap recognized by matplotlib.
ax : matplotlib Axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
colorbar : bool, default=True
Whether or not to add a colorbar to the plot.
im_kw : dict, default=None
Dict with keywords passed to `matplotlib.pyplot.imshow` call.
text_kw : dict, default=None
Dict with keywords passed to `matplotlib.pyplot.text` call.
.. versionadded:: 1.2
Returns
-------
display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`
See Also
--------
ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix
given the true and predicted labels.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import ConfusionMatrixDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, random_state=0)
>>> clf = SVC(random_state=0)
>>> clf.fit(X_train, y_train)
SVC(random_state=0)
>>> ConfusionMatrixDisplay.from_estimator(
... clf, X_test, y_test)
<...>
>>> plt.show()
For a detailed example of using a confusion matrix to evaluate a
Support Vector Classifier, please see
:ref:`sphx_glr_auto_examples_model_selection_plot_confusion_matrix.py`
"""
method_name = f"{cls.__name__}.from_estimator"
check_matplotlib_support(method_name)
if not is_classifier(estimator):
raise ValueError(f"{method_name} only supports classifiers")
y_pred = estimator.predict(X)
return cls.from_predictions(
y,
y_pred,
sample_weight=sample_weight,
labels=labels,
normalize=normalize,
display_labels=display_labels,
include_values=include_values,
cmap=cmap,
ax=ax,
xticks_rotation=xticks_rotation,
values_format=values_format,
colorbar=colorbar,
im_kw=im_kw,
text_kw=text_kw,
)
@classmethod
def from_predictions(
cls,
y_true,
y_pred,
*,
labels=None,
sample_weight=None,
normalize=None,
display_labels=None,
include_values=True,
xticks_rotation="horizontal",
values_format=None,
cmap="viridis",
ax=None,
colorbar=True,
im_kw=None,
text_kw=None,
):
"""Plot Confusion Matrix given true and predicted labels.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the
:ref:`Model Evaluation Guide <confusion_matrix>`.
.. versionadded:: 1.0
Parameters
----------
y_true : array-like of shape (n_samples,)
True labels.
y_pred : array-like of shape (n_samples,)
The predicted labels given by the method `predict` of a
classifier.
labels : array-like of shape (n_classes,), default=None
List of labels to index the confusion matrix. This may be used to
reorder or select a subset of labels. If `None` is given, those
that appear at least once in `y_true` or `y_pred` are used in
sorted order.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
normalize : {'true', 'pred', 'all'}, default=None
How to normalize the counts displayed in the matrix:
- if `'true'`, the confusion matrix is normalized over the true
conditions (i.e. rows);
- if `'pred'`, the confusion matrix is normalized over the
predicted conditions (i.e. columns);
- if `'all'`, the confusion matrix is normalized by the total
number of samples;
- if `None` (default), the confusion matrix will not be normalized.
display_labels : array-like of shape (n_classes,), default=None
Target names used for plotting. By default, `labels` will be used
if it is defined, otherwise the unique labels of `y_true` and
`y_pred` will be used.
include_values : bool, default=True
Includes values in confusion matrix.
xticks_rotation : {'vertical', 'horizontal'} or float, \
default='horizontal'
Rotation of xtick labels.
values_format : str, default=None
Format specification for values in confusion matrix. If `None`, the
format specification is 'd' or '.2g' whichever is shorter.
cmap : str or matplotlib Colormap, default='viridis'
Colormap recognized by matplotlib.
ax : matplotlib Axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
colorbar : bool, default=True
Whether or not to add a colorbar to the plot.
im_kw : dict, default=None
Dict with keywords passed to `matplotlib.pyplot.imshow` call.
text_kw : dict, default=None
Dict with keywords passed to `matplotlib.pyplot.text` call.
.. versionadded:: 1.2
Returns
-------
display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`
See Also
--------
ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix
given an estimator, the data, and the label.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import ConfusionMatrixDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, random_state=0)
>>> clf = SVC(random_state=0)
>>> clf.fit(X_train, y_train)
SVC(random_state=0)
>>> y_pred = clf.predict(X_test)
>>> ConfusionMatrixDisplay.from_predictions(
... y_test, y_pred)
<...>
>>> plt.show()
"""
check_matplotlib_support(f"{cls.__name__}.from_predictions")
if display_labels is None:
if labels is None:
display_labels = unique_labels(y_true, y_pred)
else:
display_labels = labels
cm = confusion_matrix(
y_true,
y_pred,
sample_weight=sample_weight,
labels=labels,
normalize=normalize,
)
disp = cls(confusion_matrix=cm, display_labels=display_labels)
return disp.plot(
include_values=include_values,
cmap=cmap,
ax=ax,
xticks_rotation=xticks_rotation,
values_format=values_format,
colorbar=colorbar,
im_kw=im_kw,
text_kw=text_kw,
)

View File

@@ -0,0 +1,388 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
import scipy as sp
from sklearn.metrics._ranking import det_curve
from sklearn.utils._plotting import (
_BinaryClassifierCurveDisplayMixin,
_deprecate_y_pred_parameter,
)
class DetCurveDisplay(_BinaryClassifierCurveDisplayMixin):
"""Detection Error Tradeoff (DET) curve visualization.
It is recommended to use :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`
or :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` to create a
visualizer. All parameters are stored as attributes.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the
:ref:`Model Evaluation Guide <det_curve>`.
.. versionadded:: 0.24
Parameters
----------
fpr : ndarray
False positive rate.
fnr : ndarray
False negative rate.
estimator_name : str, default=None
Name of estimator. If None, the estimator name is not shown.
pos_label : int, float, bool or str, default=None
The label of the positive class. If not `None`, this value is displayed in
the x- and y-axes labels.
Attributes
----------
line_ : matplotlib Artist
DET Curve.
ax_ : matplotlib Axes
Axes with DET Curve.
figure_ : matplotlib Figure
Figure containing the curve.
See Also
--------
det_curve : Compute error rates for different probability thresholds.
DetCurveDisplay.from_estimator : Plot DET curve given an estimator and
some data.
DetCurveDisplay.from_predictions : Plot DET curve given the true and
predicted labels.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import det_curve, DetCurveDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(n_samples=1000, random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.4, random_state=0)
>>> clf = SVC(random_state=0).fit(X_train, y_train)
>>> y_score = clf.decision_function(X_test)
>>> fpr, fnr, _ = det_curve(y_test, y_score)
>>> display = DetCurveDisplay(
... fpr=fpr, fnr=fnr, estimator_name="SVC"
... )
>>> display.plot()
<...>
>>> plt.show()
"""
def __init__(self, *, fpr, fnr, estimator_name=None, pos_label=None):
self.fpr = fpr
self.fnr = fnr
self.estimator_name = estimator_name
self.pos_label = pos_label
@classmethod
def from_estimator(
cls,
estimator,
X,
y,
*,
sample_weight=None,
drop_intermediate=True,
response_method="auto",
pos_label=None,
name=None,
ax=None,
**kwargs,
):
"""Plot DET curve given an estimator and data.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the
:ref:`Model Evaluation Guide <det_curve>`.
.. versionadded:: 1.0
Parameters
----------
estimator : estimator instance
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
in which the last estimator is a classifier.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
drop_intermediate : bool, default=True
Whether to drop thresholds where true positives (tp) do not change
from the previous or subsequent threshold. All points with the same
tp value have the same `fnr` and thus the same y coordinate.
.. versionadded:: 1.7
response_method : {'predict_proba', 'decision_function', 'auto'} \
default='auto'
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the predicted target response. If set
to 'auto', :term:`predict_proba` is tried first and if it does not
exist :term:`decision_function` is tried next.
pos_label : int, float, bool or str, default=None
The label of the positive class. By default, `estimator.classes_[1]`
is considered as the positive class.
name : str, default=None
Name of DET curve for labeling. If `None`, use the name of the
estimator.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
**kwargs : dict
Additional keywords arguments passed to matplotlib `plot` function.
Returns
-------
display : :class:`~sklearn.metrics.DetCurveDisplay`
Object that stores computed values.
See Also
--------
det_curve : Compute error rates for different probability thresholds.
DetCurveDisplay.from_predictions : Plot DET curve given the true and
predicted labels.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import DetCurveDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(n_samples=1000, random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.4, random_state=0)
>>> clf = SVC(random_state=0).fit(X_train, y_train)
>>> DetCurveDisplay.from_estimator(
... clf, X_test, y_test)
<...>
>>> plt.show()
"""
y_score, pos_label, name = cls._validate_and_get_response_values(
estimator,
X,
y,
response_method=response_method,
pos_label=pos_label,
name=name,
)
return cls.from_predictions(
y_true=y,
y_score=y_score,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
name=name,
ax=ax,
pos_label=pos_label,
**kwargs,
)
@classmethod
def from_predictions(
cls,
y_true,
y_score=None,
*,
sample_weight=None,
drop_intermediate=True,
pos_label=None,
name=None,
ax=None,
y_pred="deprecated",
**kwargs,
):
"""Plot the DET curve given the true and predicted labels.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the
:ref:`Model Evaluation Guide <det_curve>`.
.. versionadded:: 1.0
Parameters
----------
y_true : array-like of shape (n_samples,)
True labels.
y_score : array-like of shape (n_samples,)
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by `decision_function` on some classifiers).
.. versionadded:: 1.8
`y_pred` has been renamed to `y_score`.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
drop_intermediate : bool, default=True
Whether to drop thresholds where true positives (tp) do not change
from the previous or subsequent threshold. All points with the same
tp value have the same `fnr` and thus the same y coordinate.
.. versionadded:: 1.7
pos_label : int, float, bool or str, default=None
The label of the positive class. When `pos_label=None`, if `y_true`
is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an
error will be raised.
name : str, default=None
Name of DET curve for labeling. If `None`, name will be set to
`"Classifier"`.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
y_pred : array-like of shape (n_samples,)
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by `decision_function` on some classifiers).
.. deprecated:: 1.8
`y_pred` is deprecated and will be removed in 1.10. Use
`y_score` instead.
**kwargs : dict
Additional keywords arguments passed to matplotlib `plot` function.
Returns
-------
display : :class:`~sklearn.metrics.DetCurveDisplay`
Object that stores computed values.
See Also
--------
det_curve : Compute error rates for different probability thresholds.
DetCurveDisplay.from_estimator : Plot DET curve given an estimator and
some data.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import DetCurveDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(n_samples=1000, random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.4, random_state=0)
>>> clf = SVC(random_state=0).fit(X_train, y_train)
>>> y_score = clf.decision_function(X_test)
>>> DetCurveDisplay.from_predictions(
... y_test, y_score)
<...>
>>> plt.show()
"""
y_score = _deprecate_y_pred_parameter(y_score, y_pred, "1.8")
pos_label_validated, name = cls._validate_from_predictions_params(
y_true, y_score, sample_weight=sample_weight, pos_label=pos_label, name=name
)
fpr, fnr, _ = det_curve(
y_true,
y_score,
pos_label=pos_label,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
)
viz = cls(
fpr=fpr,
fnr=fnr,
estimator_name=name,
pos_label=pos_label_validated,
)
return viz.plot(ax=ax, name=name, **kwargs)
def plot(self, ax=None, *, name=None, **kwargs):
"""Plot visualization.
Parameters
----------
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
name : str, default=None
Name of DET curve for labeling. If `None`, use `estimator_name` if
it is not `None`, otherwise no labeling is shown.
**kwargs : dict
Additional keywords arguments passed to matplotlib `plot` function.
Returns
-------
display : :class:`~sklearn.metrics.DetCurveDisplay`
Object that stores computed values.
"""
self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name)
line_kwargs = {} if name is None else {"label": name}
line_kwargs.update(**kwargs)
# We have the following bounds:
# sp.stats.norm.ppf(0.0) = -np.inf
# sp.stats.norm.ppf(1.0) = np.inf
# We therefore clip to eps and 1 - eps to not provide infinity to matplotlib.
eps = np.finfo(self.fpr.dtype).eps
self.fpr = self.fpr.clip(eps, 1 - eps)
self.fnr = self.fnr.clip(eps, 1 - eps)
(self.line_,) = self.ax_.plot(
sp.stats.norm.ppf(self.fpr),
sp.stats.norm.ppf(self.fnr),
**line_kwargs,
)
info_pos_label = (
f" (Positive label: {self.pos_label})" if self.pos_label is not None else ""
)
xlabel = "False Positive Rate" + info_pos_label
ylabel = "False Negative Rate" + info_pos_label
self.ax_.set(xlabel=xlabel, ylabel=ylabel)
if "label" in line_kwargs:
self.ax_.legend(loc="lower right")
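# Ticks sit at conventional DET probability levels; their locations are the
# probit (normal-deviate) transforms, matching the plotted coordinates.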
ticks = [0.001, 0.01, 0.05, 0.20, 0.5, 0.80, 0.95, 0.99, 0.999]
tick_locations = sp.stats.norm.ppf(ticks)
tick_labels = [
"{:.0%}".format(s) if (100 * s).is_integer() else "{:.1%}".format(s)
for s in ticks
]
self.ax_.set_xticks(tick_locations)
self.ax_.set_xticklabels(tick_labels)
self.ax_.set_xlim(-3, 3)
self.ax_.set_yticks(tick_locations)
self.ax_.set_yticklabels(tick_labels)
self.ax_.set_ylim(-3, 3)
return self
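# A minimal sketch overlaying DET curves for two classifiers, assuming both
# expose `predict_proba` or `decision_function`; passing the same `ax` draws
# every curve on one shared normal-deviate grid.
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import DetCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=1000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
fig, ax = plt.subplots()
for clf in (SVC(random_state=0), RandomForestClassifier(random_state=0)):
    DetCurveDisplay.from_estimator(clf.fit(X_train, y_train), X_test, y_test, ax=ax)
plt.show()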

View File

@@ -0,0 +1,582 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from collections import Counter
from sklearn.metrics._ranking import average_precision_score, precision_recall_curve
from sklearn.utils._plotting import (
_BinaryClassifierCurveDisplayMixin,
_deprecate_estimator_name,
_deprecate_y_pred_parameter,
_despine,
_validate_style_kwargs,
)
class PrecisionRecallDisplay(_BinaryClassifierCurveDisplayMixin):
"""Precision Recall visualization.
It is recommended to use
:func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator` or
:func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` to create
a :class:`~sklearn.metrics.PrecisionRecallDisplay`. All parameters are
stored as attributes.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the :ref:`Model
Evaluation Guide <precision_recall_f_measure_metrics>`.
Parameters
----------
precision : ndarray
Precision values.
recall : ndarray
Recall values.
average_precision : float, default=None
Average precision. If None, the average precision is not shown.
name : str, default=None
Name of estimator. If None, then the estimator name is not shown.
.. versionchanged:: 1.8
`estimator_name` was deprecated in favor of `name`.
pos_label : int, float, bool or str, default=None
The class considered as the positive class when the precision and recall
metrics are computed. If not `None`, this value is displayed in the
x- and y-axes labels.
.. versionadded:: 0.24
prevalence_pos_label : float, default=None
The prevalence of the positive label. It is used for plotting the
chance level line. If None, the chance level line will not be plotted
even if `plot_chance_level` is set to True when plotting.
.. versionadded:: 1.3
estimator_name : str, default=None
Name of estimator. If None, the estimator name is not shown.
.. deprecated:: 1.8
`estimator_name` is deprecated and will be removed in 1.10. Use `name`
instead.
Attributes
----------
line_ : matplotlib Artist
Precision recall curve.
chance_level_ : matplotlib Artist or None
The chance level line. It is `None` if the chance level is not plotted.
.. versionadded:: 1.3
ax_ : matplotlib Axes
Axes with precision recall curve.
figure_ : matplotlib Figure
Figure containing the curve.
See Also
--------
precision_recall_curve : Compute precision-recall pairs for different
probability thresholds.
PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given
a binary classifier.
PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve
using predictions from a binary classifier.
Notes
-----
The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) in
scikit-learn is computed without any interpolation. To be consistent with
this metric, the precision-recall curve is plotted without any
interpolation as well (step-wise style).
You can change this style by passing the keyword argument
`drawstyle="default"` in :meth:`plot`, :meth:`from_estimator`, or
:meth:`from_predictions`. However, the curve will not be strictly
consistent with the reported average precision.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import (precision_recall_curve,
... PrecisionRecallDisplay)
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y,
... random_state=0)
>>> clf = SVC(random_state=0)
>>> clf.fit(X_train, y_train)
SVC(random_state=0)
>>> predictions = clf.predict(X_test)
>>> precision, recall, _ = precision_recall_curve(y_test, predictions)
>>> disp = PrecisionRecallDisplay(precision=precision, recall=recall)
>>> disp.plot()
<...>
>>> plt.show()
"""
def __init__(
self,
precision,
recall,
*,
average_precision=None,
name=None,
pos_label=None,
prevalence_pos_label=None,
estimator_name="deprecated",
):
self.name = _deprecate_estimator_name(estimator_name, name, "1.8")
self.precision = precision
self.recall = recall
self.average_precision = average_precision
self.pos_label = pos_label
self.prevalence_pos_label = prevalence_pos_label
def plot(
self,
ax=None,
*,
name=None,
plot_chance_level=False,
chance_level_kw=None,
despine=False,
**kwargs,
):
"""Plot visualization.
Extra keyword arguments will be passed to matplotlib's `plot`.
Parameters
----------
ax : Matplotlib Axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
name : str, default=None
Name of precision recall curve for labeling. If `None`, use the `name`
provided at initialization if it is not `None`; otherwise no labeling is
shown.
plot_chance_level : bool, default=False
Whether to plot the chance level. The chance level is the prevalence
of the positive label computed from the data passed during
:meth:`from_estimator` or :meth:`from_predictions` call.
.. versionadded:: 1.3
chance_level_kw : dict, default=None
Keyword arguments to be passed to matplotlib's `plot` for rendering
the chance level line.
.. versionadded:: 1.3
despine : bool, default=False
Whether to remove the top and right spines from the plot.
.. versionadded:: 1.6
**kwargs : dict
Keyword arguments to be passed to matplotlib's `plot`.
Returns
-------
display : :class:`~sklearn.metrics.PrecisionRecallDisplay`
Object that stores computed values.
Notes
-----
The average precision (cf. :func:`~sklearn.metrics.average_precision_score`)
in scikit-learn is computed without any interpolation. To be consistent
with this metric, the precision-recall curve is plotted without any
interpolation as well (step-wise style).
You can change this style by passing the keyword argument
`drawstyle="default"`. However, the curve will not be strictly
consistent with the reported average precision.
"""
self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name)
default_line_kwargs = {"drawstyle": "steps-post"}
if self.average_precision is not None and name is not None:
default_line_kwargs["label"] = (
f"{name} (AP = {self.average_precision:0.2f})"
)
elif self.average_precision is not None:
default_line_kwargs["label"] = f"AP = {self.average_precision:0.2f}"
elif name is not None:
default_line_kwargs["label"] = name
line_kwargs = _validate_style_kwargs(default_line_kwargs, kwargs)
(self.line_,) = self.ax_.plot(self.recall, self.precision, **line_kwargs)
info_pos_label = (
f" (Positive label: {self.pos_label})" if self.pos_label is not None else ""
)
xlabel = "Recall" + info_pos_label
ylabel = "Precision" + info_pos_label
self.ax_.set(
xlabel=xlabel,
xlim=(-0.01, 1.01),
ylabel=ylabel,
ylim=(-0.01, 1.01),
aspect="equal",
)
if plot_chance_level:
if self.prevalence_pos_label is None:
raise ValueError(
"You must provide prevalence_pos_label when constructing the "
"PrecisionRecallDisplay object in order to plot the chance "
"level line. Alternatively, you may use "
"PrecisionRecallDisplay.from_estimator or "
"PrecisionRecallDisplay.from_predictions "
"to automatically set prevalence_pos_label"
)
default_chance_level_line_kw = {
"label": f"Chance level (AP = {self.prevalence_pos_label:0.2f})",
"color": "k",
"linestyle": "--",
}
if chance_level_kw is None:
chance_level_kw = {}
chance_level_line_kw = _validate_style_kwargs(
default_chance_level_line_kw, chance_level_kw
)
(self.chance_level_,) = self.ax_.plot(
(0, 1),
(self.prevalence_pos_label, self.prevalence_pos_label),
**chance_level_line_kw,
)
else:
self.chance_level_ = None
if despine:
_despine(self.ax_)
if "label" in line_kwargs or plot_chance_level:
self.ax_.legend(loc="lower left")
return self
@classmethod
def from_estimator(
cls,
estimator,
X,
y,
*,
sample_weight=None,
drop_intermediate=False,
response_method="auto",
pos_label=None,
name=None,
ax=None,
plot_chance_level=False,
chance_level_kw=None,
despine=False,
**kwargs,
):
"""Plot precision-recall curve given an estimator and some data.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the :ref:`Model
Evaluation Guide <precision_recall_f_measure_metrics>`.
Parameters
----------
estimator : estimator instance
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
in which the last estimator is a classifier.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
drop_intermediate : bool, default=False
Whether to drop some suboptimal thresholds which would not appear
on a plotted precision-recall curve. This is useful in order to
create lighter precision-recall curves.
.. versionadded:: 1.3
response_method : {'predict_proba', 'decision_function', 'auto'}, \
default='auto'
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the target response. If set to 'auto',
:term:`predict_proba` is tried first and if it does not exist
:term:`decision_function` is tried next.
pos_label : int, float, bool or str, default=None
The class considered as the positive class when computing the
precision and recall metrics. By default, `estimator.classes_[1]`
is considered as the positive class.
name : str, default=None
Name for labeling curve. If `None`, no name is used.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is created.
plot_chance_level : bool, default=False
Whether to plot the chance level. The chance level is the prevalence
of the positive label computed from the data passed during
:meth:`from_estimator` or :meth:`from_predictions` call.
.. versionadded:: 1.3
chance_level_kw : dict, default=None
Keyword arguments to be passed to matplotlib's `plot` for rendering
the chance level line.
.. versionadded:: 1.3
despine : bool, default=False
Whether to remove the top and right spines from the plot.
.. versionadded:: 1.6
**kwargs : dict
Keyword arguments to be passed to matplotlib's `plot`.
Returns
-------
display : :class:`~sklearn.metrics.PrecisionRecallDisplay`
See Also
--------
PrecisionRecallDisplay.from_predictions : Plot precision-recall curve
using estimated probabilities or output of decision function.
Notes
-----
The average precision (cf. :func:`~sklearn.metrics.average_precision_score`)
in scikit-learn is computed without any interpolation. To be consistent
with this metric, the precision-recall curve is plotted without any
interpolation as well (step-wise style).
You can change this style by passing the keyword argument
`drawstyle="default"`. However, the curve will not be strictly
consistent with the reported average precision.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import PrecisionRecallDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.linear_model import LogisticRegression
>>> X, y = make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, random_state=0)
>>> clf = LogisticRegression()
>>> clf.fit(X_train, y_train)
LogisticRegression()
>>> PrecisionRecallDisplay.from_estimator(
... clf, X_test, y_test)
<...>
>>> plt.show()
"""
y_score, pos_label, name = cls._validate_and_get_response_values(
estimator,
X,
y,
response_method=response_method,
pos_label=pos_label,
name=name,
)
return cls.from_predictions(
y,
y_score,
sample_weight=sample_weight,
name=name,
pos_label=pos_label,
drop_intermediate=drop_intermediate,
ax=ax,
plot_chance_level=plot_chance_level,
chance_level_kw=chance_level_kw,
despine=despine,
**kwargs,
)
@classmethod
def from_predictions(
cls,
y_true,
y_score=None,
*,
sample_weight=None,
drop_intermediate=False,
pos_label=None,
name=None,
ax=None,
plot_chance_level=False,
chance_level_kw=None,
despine=False,
y_pred="deprecated",
**kwargs,
):
"""Plot precision-recall curve given binary class predictions.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the :ref:`Model
Evaluation Guide <precision_recall_f_measure_metrics>`.
Parameters
----------
y_true : array-like of shape (n_samples,)
True binary labels.
y_score : array-like of shape (n_samples,)
Estimated probabilities or output of decision function.
.. versionadded:: 1.8
`y_pred` has been renamed to `y_score`.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
drop_intermediate : bool, default=False
Whether to drop some suboptimal thresholds which would not appear
on a plotted precision-recall curve. This is useful in order to
create lighter precision-recall curves.
.. versionadded:: 1.3
pos_label : int, float, bool or str, default=None
The class considered as the positive class when computing the
precision and recall metrics. When `pos_label=None`, if `y_true` is
in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an error
will be raised.
name : str, default=None
Name for labeling curve. If `None`, name will be set to
`"Classifier"`.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is created.
plot_chance_level : bool, default=False
Whether to plot the chance level. The chance level is the prevalence
of the positive label computed from the data passed during
:meth:`from_estimator` or :meth:`from_predictions` call.
.. versionadded:: 1.3
chance_level_kw : dict, default=None
Keyword arguments to be passed to matplotlib's `plot` for rendering
the chance level line.
.. versionadded:: 1.3
despine : bool, default=False
Whether to remove the top and right spines from the plot.
.. versionadded:: 1.6
y_pred : array-like of shape (n_samples,)
Estimated probabilities or output of decision function.
.. deprecated:: 1.8
`y_pred` is deprecated and will be removed in 1.10. Use
`y_score` instead.
**kwargs : dict
Keyword arguments to be passed to matplotlib's `plot`.
Returns
-------
display : :class:`~sklearn.metrics.PrecisionRecallDisplay`
See Also
--------
PrecisionRecallDisplay.from_estimator : Plot precision-recall curve
using an estimator.
Notes
-----
The average precision (cf. :func:`~sklearn.metrics.average_precision_score`)
in scikit-learn is computed without any interpolation. To be consistent
with this metric, the precision-recall curve is plotted without any
interpolation as well (step-wise style).
You can change this style by passing the keyword argument
`drawstyle="default"`. However, the curve will not be strictly
consistent with the reported average precision.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import PrecisionRecallDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.linear_model import LogisticRegression
>>> X, y = make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, random_state=0)
>>> clf = LogisticRegression()
>>> clf.fit(X_train, y_train)
LogisticRegression()
>>> y_score = clf.predict_proba(X_test)[:, 1]
>>> PrecisionRecallDisplay.from_predictions(
... y_test, y_score)
<...>
>>> plt.show()
"""
y_score = _deprecate_y_pred_parameter(y_score, y_pred, "1.8")
pos_label, name = cls._validate_from_predictions_params(
y_true, y_score, sample_weight=sample_weight, pos_label=pos_label, name=name
)
precision, recall, _ = precision_recall_curve(
y_true,
y_score,
pos_label=pos_label,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
)
average_precision = average_precision_score(
y_true, y_score, pos_label=pos_label, sample_weight=sample_weight
)
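# Prevalence of the positive class, used to position the chance-level line.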
class_count = Counter(y_true)
prevalence_pos_label = class_count[pos_label] / sum(class_count.values())
viz = cls(
precision=precision,
recall=recall,
average_precision=average_precision,
name=name,
pos_label=pos_label,
prevalence_pos_label=prevalence_pos_label,
)
return viz.plot(
ax=ax,
name=name,
plot_chance_level=plot_chance_level,
chance_level_kw=chance_level_kw,
despine=despine,
**kwargs,
)
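# A minimal sketch, assuming an imbalanced binary problem: with
# `plot_chance_level=True`, `from_predictions` draws the chance line at the
# prevalence of the positive class computed from `y_true`.
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.model_selection import train_test_split

X, y = make_classification(weights=[0.8, 0.2], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_score = LogisticRegression().fit(X_train, y_train).predict_proba(X_test)[:, 1]
PrecisionRecallDisplay.from_predictions(
    y_test, y_score, plot_chance_level=True, despine=True
)
plt.show()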

View File

@@ -0,0 +1,413 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numbers
import numpy as np
from sklearn.utils import _safe_indexing, check_random_state
from sklearn.utils._optional_dependencies import check_matplotlib_support
from sklearn.utils._plotting import _validate_style_kwargs
class PredictionErrorDisplay:
"""Visualization of the prediction error of a regression model.
This tool can display "residuals vs predicted" or "actual vs predicted"
using scatter plots to qualitatively assess the behavior of a regressor,
preferably on held-out data points.
See the details in the docstrings of
:func:`~sklearn.metrics.PredictionErrorDisplay.from_estimator` or
:func:`~sklearn.metrics.PredictionErrorDisplay.from_predictions` to
create a visualizer. All parameters are stored as attributes.
For general information regarding `scikit-learn` visualization tools, read
more in the :ref:`Visualization Guide <visualizations>`.
For details regarding interpreting these plots, refer to the
:ref:`Model Evaluation Guide <visualization_regression_evaluation>`.
.. versionadded:: 1.2
Parameters
----------
y_true : ndarray of shape (n_samples,)
True values.
y_pred : ndarray of shape (n_samples,)
Prediction values.
Attributes
----------
line_ : matplotlib Artist
Optimal line representing `y_true == y_pred`. Therefore, it is a
diagonal line for `kind="actual_vs_predicted"` and a horizontal line for
`kind="residual_vs_predicted"`.
errors_lines_ : matplotlib Artist or None
Residual lines. If `with_errors=False`, then it is set to `None`.
scatter_ : matplotlib Artist
Scatter data points.
ax_ : matplotlib Axes
Axes with the different matplotlib axis.
figure_ : matplotlib Figure
Figure containing the scatter and lines.
See Also
--------
PredictionErrorDisplay.from_estimator : Prediction error visualization
given an estimator and some data.
PredictionErrorDisplay.from_predictions : Prediction error visualization
given the true and predicted targets.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import load_diabetes
>>> from sklearn.linear_model import Ridge
>>> from sklearn.metrics import PredictionErrorDisplay
>>> X, y = load_diabetes(return_X_y=True)
>>> ridge = Ridge().fit(X, y)
>>> y_pred = ridge.predict(X)
>>> display = PredictionErrorDisplay(y_true=y, y_pred=y_pred)
>>> display.plot()
<...>
>>> plt.show()
"""
def __init__(self, *, y_true, y_pred):
self.y_true = y_true
self.y_pred = y_pred
def plot(
self,
ax=None,
*,
kind="residual_vs_predicted",
scatter_kwargs=None,
line_kwargs=None,
):
"""Plot visualization.
Extra keyword arguments will be passed to matplotlib's ``plot``.
Parameters
----------
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
kind : {"actual_vs_predicted", "residual_vs_predicted"}, \
default="residual_vs_predicted"
The type of plot to draw:
- "actual_vs_predicted" draws the observed values (y-axis) vs.
the predicted values (x-axis).
- "residual_vs_predicted" draws the residuals, i.e. difference
between observed and predicted values, (y-axis) vs. the predicted
values (x-axis).
scatter_kwargs : dict, default=None
Dictionary with keywords passed to the `matplotlib.pyplot.scatter`
call.
line_kwargs : dict, default=None
Dictionary with keyword passed to the `matplotlib.pyplot.plot`
call to draw the optimal line.
Returns
-------
display : :class:`~sklearn.metrics.PredictionErrorDisplay`
Object that stores computed values.
"""
check_matplotlib_support(f"{self.__class__.__name__}.plot")
expected_kind = ("actual_vs_predicted", "residual_vs_predicted")
if kind not in expected_kind:
raise ValueError(
f"`kind` must be one of {', '.join(expected_kind)}. "
f"Got {kind!r} instead."
)
import matplotlib.pyplot as plt
if scatter_kwargs is None:
scatter_kwargs = {}
if line_kwargs is None:
line_kwargs = {}
default_scatter_kwargs = {"color": "tab:blue", "alpha": 0.8}
default_line_kwargs = {"color": "black", "alpha": 0.7, "linestyle": "--"}
# _validate_style_kwargs already merges the defaults with the user-supplied
# keywords (resolving matplotlib alias collisions), so no further merging is
# needed.
scatter_kwargs = _validate_style_kwargs(default_scatter_kwargs, scatter_kwargs)
line_kwargs = _validate_style_kwargs(default_line_kwargs, line_kwargs)
if ax is None:
_, ax = plt.subplots()
if kind == "actual_vs_predicted":
max_value = max(np.max(self.y_true), np.max(self.y_pred))
min_value = min(np.min(self.y_true), np.min(self.y_pred))
self.line_ = ax.plot(
[min_value, max_value], [min_value, max_value], **line_kwargs
)[0]
x_data, y_data = self.y_pred, self.y_true
xlabel, ylabel = "Predicted values", "Actual values"
self.scatter_ = ax.scatter(x_data, y_data, **scatter_kwargs)
# force to have a squared axis
ax.set_aspect("equal", adjustable="datalim")
ax.set_xticks(np.linspace(min_value, max_value, num=5))
ax.set_yticks(np.linspace(min_value, max_value, num=5))
else: # kind == "residual_vs_predicted"
self.line_ = ax.plot(
[np.min(self.y_pred), np.max(self.y_pred)],
[0, 0],
**line_kwargs,
)[0]
self.scatter_ = ax.scatter(
self.y_pred, self.y_true - self.y_pred, **scatter_kwargs
)
xlabel, ylabel = "Predicted values", "Residuals (actual - predicted)"
ax.set(xlabel=xlabel, ylabel=ylabel)
self.ax_ = ax
self.figure_ = ax.figure
return self
@classmethod
def from_estimator(
cls,
estimator,
X,
y,
*,
kind="residual_vs_predicted",
subsample=1_000,
random_state=None,
ax=None,
scatter_kwargs=None,
line_kwargs=None,
):
"""Plot the prediction error given a regressor and some data.
For general information regarding `scikit-learn` visualization tools,
read more in the :ref:`Visualization Guide <visualizations>`.
For details regarding interpreting these plots, refer to the
:ref:`Model Evaluation Guide <visualization_regression_evaluation>`.
.. versionadded:: 1.2
Parameters
----------
estimator : estimator instance
Fitted regressor or a fitted :class:`~sklearn.pipeline.Pipeline`
in which the last estimator is a regressor.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
y : array-like of shape (n_samples,)
Target values.
kind : {"actual_vs_predicted", "residual_vs_predicted"}, \
default="residual_vs_predicted"
The type of plot to draw:
- "actual_vs_predicted" draws the observed values (y-axis) vs.
the predicted values (x-axis).
- "residual_vs_predicted" draws the residuals, i.e. difference
between observed and predicted values, (y-axis) vs. the predicted
values (x-axis).
subsample : float, int or None, default=1_000
Subsampling of the points shown on the scatter plot. If `float`,
it should be between 0 and 1 and represents the proportion of the
original dataset. If `int`, it represents the number of samples
displayed on the scatter plot. If `None`, no subsampling will be
applied. By default, at most 1000 samples are displayed.
random_state : int or RandomState, default=None
Controls the randomness when `subsample` is not `None`.
See :term:`Glossary <random_state>` for details.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
scatter_kwargs : dict, default=None
Dictionary with keywords passed to the `matplotlib.pyplot.scatter`
call.
line_kwargs : dict, default=None
Dictionary with keyword passed to the `matplotlib.pyplot.plot`
call to draw the optimal line.
Returns
-------
display : :class:`~sklearn.metrics.PredictionErrorDisplay`
Object that stores the computed values.
See Also
--------
PredictionErrorDisplay : Prediction error visualization for regression.
PredictionErrorDisplay.from_predictions : Prediction error visualization
given the true and predicted targets.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import load_diabetes
>>> from sklearn.linear_model import Ridge
>>> from sklearn.metrics import PredictionErrorDisplay
>>> X, y = load_diabetes(return_X_y=True)
>>> ridge = Ridge().fit(X, y)
>>> disp = PredictionErrorDisplay.from_estimator(ridge, X, y)
>>> plt.show()
"""
check_matplotlib_support(f"{cls.__name__}.from_estimator")
y_pred = estimator.predict(X)
return cls.from_predictions(
y_true=y,
y_pred=y_pred,
kind=kind,
subsample=subsample,
random_state=random_state,
ax=ax,
scatter_kwargs=scatter_kwargs,
line_kwargs=line_kwargs,
)
@classmethod
def from_predictions(
cls,
y_true,
y_pred,
*,
kind="residual_vs_predicted",
subsample=1_000,
random_state=None,
ax=None,
scatter_kwargs=None,
line_kwargs=None,
):
"""Plot the prediction error given the true and predicted targets.
For general information regarding `scikit-learn` visualization tools,
read more in the :ref:`Visualization Guide <visualizations>`.
For details regarding interpreting these plots, refer to the
:ref:`Model Evaluation Guide <visualization_regression_evaluation>`.
.. versionadded:: 1.2
Parameters
----------
y_true : array-like of shape (n_samples,)
True target values.
y_pred : array-like of shape (n_samples,)
Predicted target values.
kind : {"actual_vs_predicted", "residual_vs_predicted"}, \
default="residual_vs_predicted"
The type of plot to draw:
- "actual_vs_predicted" draws the observed values (y-axis) vs.
the predicted values (x-axis).
- "residual_vs_predicted" draws the residuals, i.e. difference
between observed and predicted values, (y-axis) vs. the predicted
values (x-axis).
subsample : float, int or None, default=1_000
Subsampling of the points shown on the scatter plot. If `float`,
it should be between 0 and 1 and represents the proportion of the
original dataset. If `int`, it represents the number of samples
displayed on the scatter plot. If `None`, no subsampling will be
applied. By default, at most 1000 samples are displayed.
random_state : int or RandomState, default=None
Controls the randomness when `subsample` is not `None`.
See :term:`Glossary <random_state>` for details.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
scatter_kwargs : dict, default=None
Dictionary with keywords passed to the `matplotlib.pyplot.scatter`
call.
line_kwargs : dict, default=None
Dictionary with keyword passed to the `matplotlib.pyplot.plot`
call to draw the optimal line.
Returns
-------
display : :class:`~sklearn.metrics.PredictionErrorDisplay`
Object that stores the computed values.
See Also
--------
PredictionErrorDisplay : Prediction error visualization for regression.
PredictionErrorDisplay.from_estimator : Prediction error visualization
given an estimator and some data.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import load_diabetes
>>> from sklearn.linear_model import Ridge
>>> from sklearn.metrics import PredictionErrorDisplay
>>> X, y = load_diabetes(return_X_y=True)
>>> ridge = Ridge().fit(X, y)
>>> y_pred = ridge.predict(X)
>>> disp = PredictionErrorDisplay.from_predictions(y_true=y, y_pred=y_pred)
>>> plt.show()
"""
check_matplotlib_support(f"{cls.__name__}.from_predictions")
random_state = check_random_state(random_state)
n_samples = len(y_true)
if isinstance(subsample, numbers.Integral):
if subsample <= 0:
raise ValueError(
f"When an integer, subsample={subsample} should be positive."
)
elif isinstance(subsample, numbers.Real):
if subsample <= 0 or subsample >= 1:
raise ValueError(
f"When a floating-point, subsample={subsample} should"
" be in the (0, 1) range."
)
subsample = int(n_samples * subsample)
if subsample is not None and subsample < n_samples:
indices = random_state.choice(np.arange(n_samples), size=subsample)
y_true = _safe_indexing(y_true, indices, axis=0)
y_pred = _safe_indexing(y_pred, indices, axis=0)
viz = cls(
y_true=y_true,
y_pred=y_pred,
)
return viz.plot(
ax=ax,
kind=kind,
scatter_kwargs=scatter_kwargs,
line_kwargs=line_kwargs,
)
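# A minimal sketch showing both supported `kind` values side by side;
# `subsample` caps the scatter at 200 points and `random_state` makes the
# subsampling reproducible.
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import PredictionErrorDisplay

X, y = load_diabetes(return_X_y=True)
y_pred = Ridge().fit(X, y).predict(X)
fig, axes = plt.subplots(ncols=2, figsize=(10, 5))
for ax, kind in zip(axes, ("actual_vs_predicted", "residual_vs_predicted")):
    PredictionErrorDisplay.from_predictions(
        y_true=y, y_pred=y_pred, kind=kind, subsample=200, random_state=0, ax=ax
    )
plt.show()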

View File

@@ -0,0 +1,787 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
from sklearn.metrics._ranking import auc, roc_curve
from sklearn.utils import _safe_indexing
from sklearn.utils._plotting import (
_BinaryClassifierCurveDisplayMixin,
_check_param_lengths,
_convert_to_list_leaving_none,
_deprecate_estimator_name,
_deprecate_y_pred_parameter,
_despine,
_validate_style_kwargs,
)
from sklearn.utils._response import _get_response_values_binary
class RocCurveDisplay(_BinaryClassifierCurveDisplayMixin):
"""ROC Curve visualization.
It is recommended to use
:func:`~sklearn.metrics.RocCurveDisplay.from_estimator` or
:func:`~sklearn.metrics.RocCurveDisplay.from_predictions` or
:func:`~sklearn.metrics.RocCurveDisplay.from_cv_results` to create
a :class:`~sklearn.metrics.RocCurveDisplay`. All parameters are
stored as attributes.
For general information regarding `scikit-learn` visualization tools, see
the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the :ref:`Model
Evaluation Guide <roc_metrics>`.
Parameters
----------
fpr : ndarray or list of ndarrays
False positive rates. Each ndarray should contain values for a single curve.
If plotting multiple curves, list should be of same length as `tpr`.
.. versionchanged:: 1.7
Now accepts a list for plotting multiple curves.
tpr : ndarray or list of ndarrays
True positive rates. Each ndarray should contain values for a single curve.
If plotting multiple curves, list should be of same length as `fpr`.
.. versionchanged:: 1.7
Now accepts a list for plotting multiple curves.
roc_auc : float or list of floats, default=None
Area under ROC curve, used for labeling each curve in the legend.
If plotting multiple curves, should be a list of the same length as `fpr`
and `tpr`. If `None`, ROC AUC scores are not shown in the legend.
.. versionchanged:: 1.7
Now accepts a list for plotting multiple curves.
name : str or list of str, default=None
Name for labeling legend entries. The number of legend entries is determined
by the `curve_kwargs` passed to `plot`, and is not affected by `name`.
To label each curve, provide a list of strings. To avoid labeling
individual curves that have the same appearance, a list cannot be used in
conjunction with `curve_kwargs` being a dictionary or None. If a
string is provided, it will be used to either label the single legend entry
or if there are multiple legend entries, label each individual curve with
the same name. If `None`, no name is shown in the legend.
.. versionchanged:: 1.7
`estimator_name` was deprecated in favor of `name`.
pos_label : int, float, bool or str, default=None
The class considered as the positive class when the ROC AUC metric is computed.
If not `None`, this value is displayed in the x- and y-axes labels.
.. versionadded:: 0.24
estimator_name : str, default=None
Name of estimator. If None, the estimator name is not shown.
.. deprecated:: 1.7
`estimator_name` is deprecated and will be removed in 1.9. Use `name`
instead.
Attributes
----------
line_ : matplotlib Artist or list of matplotlib Artists
ROC Curves.
.. versionchanged:: 1.7
This attribute can now be a list of Artists, for when multiple curves
are plotted.
chance_level_ : matplotlib Artist or None
The chance level line. It is `None` if the chance level is not plotted.
.. versionadded:: 1.3
ax_ : matplotlib Axes
Axes with ROC Curve.
figure_ : matplotlib Figure
Figure containing the curve.
See Also
--------
roc_curve : Compute Receiver operating characteristic (ROC) curve.
RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic
(ROC) curve given an estimator and some data.
RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic
(ROC) curve given the true and predicted values.
RocCurveDisplay.from_cv_results : Plot multi-fold ROC curves given
cross-validation results.
roc_auc_score : Compute the area under the ROC curve.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> import numpy as np
>>> from sklearn import metrics
>>> y_true = np.array([0, 0, 1, 1])
>>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
>>> fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)
>>> roc_auc = metrics.auc(fpr, tpr)
>>> display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
... name='example estimator')
>>> display.plot()
<...>
>>> plt.show()
"""
def __init__(
self,
*,
fpr,
tpr,
roc_auc=None,
name=None,
pos_label=None,
estimator_name="deprecated",
):
self.fpr = fpr
self.tpr = tpr
self.roc_auc = roc_auc
self.name = _deprecate_estimator_name(estimator_name, name, "1.7")
self.pos_label = pos_label
def _validate_plot_params(self, *, ax, name):
self.ax_, self.figure_, name = super()._validate_plot_params(ax=ax, name=name)
fpr = _convert_to_list_leaving_none(self.fpr)
tpr = _convert_to_list_leaving_none(self.tpr)
roc_auc = _convert_to_list_leaving_none(self.roc_auc)
name = _convert_to_list_leaving_none(name)
optional = {"self.roc_auc": roc_auc}
if isinstance(name, list) and len(name) != 1:
optional.update({"'name' (or self.name)": name})
_check_param_lengths(
required={"self.fpr": fpr, "self.tpr": tpr},
optional=optional,
class_name="RocCurveDisplay",
)
return fpr, tpr, roc_auc, name
def plot(
self,
ax=None,
*,
name=None,
curve_kwargs=None,
plot_chance_level=False,
chance_level_kw=None,
despine=False,
**kwargs,
):
"""Plot visualization.
Parameters
----------
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
name : str or list of str, default=None
Name for labeling legend entries. The number of legend entries
is determined by `curve_kwargs`, and is not affected by `name`.
To label each curve, provide a list of strings. To avoid labeling
individual curves that have the same appearance, a list cannot be used in
conjunction with `curve_kwargs` being a dictionary or None. If a
string is provided, it will be used to either label the single legend entry
or if there are multiple legend entries, label each individual curve with
the same name. If `None`, set to `name` provided at `RocCurveDisplay`
initialization. If still `None`, no name is shown in the legend.
.. versionadded:: 1.7
curve_kwargs : dict or list of dict, default=None
Keywords arguments to be passed to matplotlib's `plot` function
to draw individual ROC curves. For single curve plotting, should be
a dictionary. For multi-curve plotting, if a list is provided the
parameters are applied to the ROC curves of each CV fold
sequentially and a legend entry is added for each curve.
If a single dictionary is provided, the same parameters are applied
to all ROC curves and a single legend entry for all curves is added,
labeled with the mean ROC AUC score.
.. versionadded:: 1.7
plot_chance_level : bool, default=False
Whether to plot the chance level.
.. versionadded:: 1.3
chance_level_kw : dict, default=None
Keyword arguments to be passed to matplotlib's `plot` for rendering
the chance level line.
.. versionadded:: 1.3
despine : bool, default=False
Whether to remove the top and right spines from the plot.
.. versionadded:: 1.6
**kwargs : dict
Keyword arguments to be passed to matplotlib's `plot`.
.. deprecated:: 1.7
kwargs is deprecated and will be removed in 1.9. Pass matplotlib
arguments to `curve_kwargs` as a dictionary instead.
Returns
-------
display : :class:`~sklearn.metrics.RocCurveDisplay`
Object that stores computed values.
"""
fpr, tpr, roc_auc, name = self._validate_plot_params(ax=ax, name=name)
n_curves = len(fpr)
if not isinstance(curve_kwargs, list) and n_curves > 1:
if roc_auc:
legend_metric = {"mean": np.mean(roc_auc), "std": np.std(roc_auc)}
else:
legend_metric = {"mean": None, "std": None}
else:
roc_auc = roc_auc if roc_auc is not None else [None] * n_curves
legend_metric = {"metric": roc_auc}
curve_kwargs = self._validate_curve_kwargs(
n_curves,
name,
legend_metric,
"AUC",
curve_kwargs=curve_kwargs,
default_multi_curve_kwargs={
"alpha": 0.5,
"linestyle": "--",
"color": "blue",
},
**kwargs,
)
default_chance_level_line_kw = {
"label": "Chance level (AUC = 0.5)",
"color": "k",
"linestyle": "--",
}
if chance_level_kw is None:
chance_level_kw = {}
chance_level_kw = _validate_style_kwargs(
default_chance_level_line_kw, chance_level_kw
)
self.line_ = []
for fpr, tpr, line_kw in zip(fpr, tpr, curve_kwargs):
self.line_.extend(self.ax_.plot(fpr, tpr, **line_kw))
# Return single artist if only one curve is plotted
if len(self.line_) == 1:
self.line_ = self.line_[0]
info_pos_label = (
f" (Positive label: {self.pos_label})" if self.pos_label is not None else ""
)
xlabel = "False Positive Rate" + info_pos_label
ylabel = "True Positive Rate" + info_pos_label
self.ax_.set(
xlabel=xlabel,
xlim=(-0.01, 1.01),
ylabel=ylabel,
ylim=(-0.01, 1.01),
aspect="equal",
)
if plot_chance_level:
(self.chance_level_,) = self.ax_.plot((0, 1), (0, 1), **chance_level_kw)
else:
self.chance_level_ = None
if despine:
_despine(self.ax_)
if curve_kwargs[0].get("label") is not None or (
plot_chance_level and chance_level_kw.get("label") is not None
):
self.ax_.legend(loc="lower right")
return self
@classmethod
def from_estimator(
cls,
estimator,
X,
y,
*,
sample_weight=None,
drop_intermediate=True,
response_method="auto",
pos_label=None,
name=None,
ax=None,
curve_kwargs=None,
plot_chance_level=False,
chance_level_kw=None,
despine=False,
**kwargs,
):
"""Create a ROC Curve display from an estimator.
For general information regarding `scikit-learn` visualization tools,
see the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the :ref:`Model
Evaluation Guide <roc_metrics>`.
Parameters
----------
estimator : estimator instance
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
in which the last estimator is a classifier.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
drop_intermediate : bool, default=True
Whether to drop thresholds where the resulting point is collinear
with its neighbors in ROC space. This has no effect on the ROC AUC
or visual shape of the curve, but reduces the number of plotted
points.
response_method : {'predict_proba', 'decision_function', 'auto'} \
default='auto'
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the target response. If set to 'auto',
:term:`predict_proba` is tried first and if it does not exist
:term:`decision_function` is tried next.
pos_label : int, float, bool or str, default=None
The class considered as the positive class when computing the ROC AUC.
By default, `estimator.classes_[1]` is considered
as the positive class.
name : str, default=None
Name of ROC Curve for labeling. If `None`, use the name of the
estimator.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is created.
curve_kwargs : dict, default=None
Keywords arguments to be passed to matplotlib's `plot` function.
.. versionadded:: 1.7
plot_chance_level : bool, default=False
Whether to plot the chance level.
.. versionadded:: 1.3
chance_level_kw : dict, default=None
Keyword arguments to be passed to matplotlib's `plot` for rendering
the chance level line.
.. versionadded:: 1.3
despine : bool, default=False
Whether to remove the top and right spines from the plot.
.. versionadded:: 1.6
**kwargs : dict
Keyword arguments to be passed to matplotlib's `plot`.
.. deprecated:: 1.7
kwargs is deprecated and will be removed in 1.9. Pass matplotlib
arguments to `curve_kwargs` as a dictionary instead.
Returns
-------
display : :class:`~sklearn.metrics.RocCurveDisplay`
The ROC Curve display.
See Also
--------
roc_curve : Compute Receiver operating characteristic (ROC) curve.
RocCurveDisplay.from_predictions : ROC Curve visualization given the
probabilities or scores of a classifier.
RocCurveDisplay.from_cv_results : Plot multi-fold ROC curves given
cross-validation results.
roc_auc_score : Compute the area under the ROC curve.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import RocCurveDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, random_state=0)
>>> clf = SVC(random_state=0).fit(X_train, y_train)
>>> RocCurveDisplay.from_estimator(
... clf, X_test, y_test)
<...>
>>> plt.show()
"""
y_score, pos_label, name = cls._validate_and_get_response_values(
estimator,
X,
y,
response_method=response_method,
pos_label=pos_label,
name=name,
)
return cls.from_predictions(
y_true=y,
y_score=y_score,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
pos_label=pos_label,
name=name,
ax=ax,
curve_kwargs=curve_kwargs,
plot_chance_level=plot_chance_level,
chance_level_kw=chance_level_kw,
despine=despine,
**kwargs,
)
@classmethod
def from_predictions(
cls,
y_true,
y_score=None,
*,
sample_weight=None,
drop_intermediate=True,
pos_label=None,
name=None,
ax=None,
curve_kwargs=None,
plot_chance_level=False,
chance_level_kw=None,
despine=False,
y_pred="deprecated",
**kwargs,
):
"""Plot ROC curve given the true and predicted values.
For general information regarding `scikit-learn` visualization tools,
see the :ref:`Visualization Guide <visualizations>`.
For guidance on interpreting these plots, refer to the :ref:`Model
Evaluation Guide <roc_metrics>`.
.. versionadded:: 1.0
Parameters
----------
y_true : array-like of shape (n_samples,)
True labels.
y_score : array-like of shape (n_samples,)
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by `decision_function` on some classifiers).
.. versionadded:: 1.7
`y_pred` has been renamed to `y_score`.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
drop_intermediate : bool, default=True
Whether to drop thresholds where the resulting point is collinear
with its neighbors in ROC space. This has no effect on the ROC AUC
or visual shape of the curve, but reduces the number of plotted
points.
pos_label : int, float, bool or str, default=None
The label of the positive class when computing the ROC AUC.
When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label`
is set to 1, otherwise an error will be raised.
name : str, default=None
Name of ROC curve for legend labeling. If `None`, name will be set to
`"Classifier"`.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
curve_kwargs : dict, default=None
Keywords arguments to be passed to matplotlib's `plot` function.
.. versionadded:: 1.7
plot_chance_level : bool, default=False
Whether to plot the chance level.
.. versionadded:: 1.3
chance_level_kw : dict, default=None
Keyword arguments to be passed to matplotlib's `plot` for rendering
the chance level line.
.. versionadded:: 1.3
despine : bool, default=False
Whether to remove the top and right spines from the plot.
.. versionadded:: 1.6
y_pred : array-like of shape (n_samples,)
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by `decision_function` on some classifiers).
.. deprecated:: 1.7
`y_pred` is deprecated and will be removed in 1.9. Use
`y_score` instead.
**kwargs : dict
Additional keywords arguments passed to matplotlib `plot` function.
.. deprecated:: 1.7
kwargs is deprecated and will be removed in 1.9. Pass matplotlib
arguments to `curve_kwargs` as a dictionary instead.
Returns
-------
display : :class:`~sklearn.metrics.RocCurveDisplay`
Object that stores computed values.
See Also
--------
roc_curve : Compute Receiver operating characteristic (ROC) curve.
RocCurveDisplay.from_estimator : ROC Curve visualization given an
estimator and some data.
RocCurveDisplay.from_cv_results : Plot multi-fold ROC curves given
cross-validation results.
roc_auc_score : Compute the area under the ROC curve.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import RocCurveDisplay
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.svm import SVC
>>> X, y = make_classification(random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, random_state=0)
>>> clf = SVC(random_state=0).fit(X_train, y_train)
>>> y_score = clf.decision_function(X_test)
>>> RocCurveDisplay.from_predictions(y_test, y_score)
<...>
>>> plt.show()
"""
y_score = _deprecate_y_pred_parameter(y_score, y_pred, "1.7")
pos_label_validated, name = cls._validate_from_predictions_params(
y_true, y_score, sample_weight=sample_weight, pos_label=pos_label, name=name
)
fpr, tpr, _ = roc_curve(
y_true,
y_score,
pos_label=pos_label,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
)
roc_auc = auc(fpr, tpr)
viz = cls(
fpr=fpr,
tpr=tpr,
roc_auc=roc_auc,
name=name,
pos_label=pos_label_validated,
)
return viz.plot(
ax=ax,
curve_kwargs=curve_kwargs,
plot_chance_level=plot_chance_level,
chance_level_kw=chance_level_kw,
despine=despine,
**kwargs,
)
@classmethod
def from_cv_results(
cls,
cv_results,
X,
y,
*,
sample_weight=None,
drop_intermediate=True,
response_method="auto",
pos_label=None,
ax=None,
name=None,
curve_kwargs=None,
plot_chance_level=False,
chance_level_kwargs=None,
despine=False,
):
"""Create a multi-fold ROC curve display given cross-validation results.
.. versionadded:: 1.7
Parameters
----------
cv_results : dict
Dictionary as returned by :func:`~sklearn.model_selection.cross_validate`
using `return_estimator=True` and `return_indices=True` (i.e., dictionary
should contain the keys "estimator" and "indices").
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
y : array-like of shape (n_samples,)
Target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
drop_intermediate : bool, default=True
Whether to drop some suboptimal thresholds which would not appear
on a plotted ROC curve. This is useful in order to create lighter
ROC curves.
response_method : {'predict_proba', 'decision_function', 'auto'} \
default='auto'
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the target response. If set to 'auto',
:term:`predict_proba` is tried first and if it does not exist
:term:`decision_function` is tried next.
pos_label : int, float, bool or str, default=None
The class considered as the positive class when computing the ROC AUC
metrics. By default, `estimator.classes_[1]` (using `estimator` from
`cv_results`) is considered as the positive class.
ax : matplotlib axes, default=None
Axes object to plot on. If `None`, a new figure and axes is
created.
name : str or list of str, default=None
Name for labeling legend entries. The number of legend entries
is determined by `curve_kwargs`, and is not affected by `name`.
To label each curve, provide a list of strings. To avoid labeling
individual curves that have the same appearance, a list cannot be used in
conjunction with `curve_kwargs` being a dictionary or None. If a
string is provided, it will be used to either label the single legend entry
or if there are multiple legend entries, label each individual curve with
the same name. If `None`, no name is shown in the legend.
curve_kwargs : dict or list of dict, default=None
Keywords arguments to be passed to matplotlib's `plot` function
to draw individual ROC curves. If a list is provided the
parameters are applied to the ROC curves of each CV fold
sequentially and a legend entry is added for each curve.
If a single dictionary is provided, the same parameters are applied
to all ROC curves and a single legend entry for all curves is added,
labeled with the mean ROC AUC score.
plot_chance_level : bool, default=False
Whether to plot the chance level.
chance_level_kwargs : dict, default=None
Keyword arguments to be passed to matplotlib's `plot` for rendering
the chance level line.
despine : bool, default=False
Whether to remove the top and right spines from the plot.
Returns
-------
display : :class:`~sklearn.metrics.RocCurveDisplay`
The multi-fold ROC curve display.
See Also
--------
roc_curve : Compute Receiver operating characteristic (ROC) curve.
RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic
(ROC) curve given an estimator and some data.
RocCurveDisplay.from_predictions : ROC Curve visualization given the
probabilities or scores of a classifier.
roc_auc_score : Compute the area under the ROC curve.
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from sklearn.datasets import make_classification
>>> from sklearn.metrics import RocCurveDisplay
>>> from sklearn.model_selection import cross_validate
>>> from sklearn.svm import SVC
>>> X, y = make_classification(random_state=0)
>>> clf = SVC(random_state=0)
>>> cv_results = cross_validate(
... clf, X, y, cv=3, return_estimator=True, return_indices=True)
>>> RocCurveDisplay.from_cv_results(cv_results, X, y)
<...>
>>> plt.show()
"""
cls._validate_from_cv_results_params(
cv_results,
X,
y,
sample_weight=sample_weight,
)
fpr_folds, tpr_folds, auc_folds = [], [], []
for estimator, test_indices in zip(
cv_results["estimator"], cv_results["indices"]["test"]
):
y_true = _safe_indexing(y, test_indices)
y_pred, pos_label_ = _get_response_values_binary(
estimator,
_safe_indexing(X, test_indices),
response_method=response_method,
pos_label=pos_label,
)
sample_weight_fold = (
None
if sample_weight is None
else _safe_indexing(sample_weight, test_indices)
)
fpr, tpr, _ = roc_curve(
y_true,
y_pred,
pos_label=pos_label_,
sample_weight=sample_weight_fold,
drop_intermediate=drop_intermediate,
)
roc_auc = auc(fpr, tpr)
fpr_folds.append(fpr)
tpr_folds.append(tpr)
auc_folds.append(roc_auc)
viz = cls(
fpr=fpr_folds,
tpr=tpr_folds,
roc_auc=auc_folds,
name=name,
pos_label=pos_label_,
)
return viz.plot(
ax=ax,
curve_kwargs=curve_kwargs,
plot_chance_level=plot_chance_level,
chance_level_kw=chance_level_kwargs,
despine=despine,
)
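# --- Usage sketch (illustrative, not part of the class above; assumes
# scikit-learn >= 1.7, where `from_cv_results` exists). A list of dicts in
# `curve_kwargs` styles each CV fold and labels each curve individually,
# while a single dict would style all folds alike under one aggregate
# legend entry.
if __name__ == "__main__":
    import matplotlib.pyplot as plt
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import RocCurveDisplay
    from sklearn.model_selection import cross_validate

    X, y = make_classification(n_samples=200, random_state=0)
    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
    )
    RocCurveDisplay.from_cv_results(
        cv_results,
        X,
        y,
        name=["fold 1", "fold 2", "fold 3"],
        curve_kwargs=[{"color": c} for c in ("tab:blue", "tab:orange", "tab:green")],
    )
    plt.show()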

View File

@@ -0,0 +1,321 @@
import numpy as np
import pytest
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.calibration import CalibrationDisplay
from sklearn.compose import make_column_transformer
from sklearn.datasets import load_iris
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
ConfusionMatrixDisplay,
DetCurveDisplay,
PrecisionRecallDisplay,
PredictionErrorDisplay,
RocCurveDisplay,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
@pytest.fixture(scope="module")
def data():
return load_iris(return_X_y=True)
@pytest.fixture(scope="module")
def data_binary(data):
X, y = data
return X[y < 2], y[y < 2]
@pytest.mark.parametrize(
"Display",
[CalibrationDisplay, DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay],
)
def test_display_curve_error_classifier(pyplot, data, data_binary, Display):
"""Check that a proper error is raised when only binary classification is
supported."""
X, y = data
X_binary, y_binary = data_binary
clf = DecisionTreeClassifier().fit(X, y)
# Case 1: multiclass classifier with multiclass target
msg = "Expected 'estimator' to be a binary classifier. Got 3 classes instead."
with pytest.raises(ValueError, match=msg):
Display.from_estimator(clf, X, y)
# Case 2: multiclass classifier with binary target
with pytest.raises(ValueError, match=msg):
Display.from_estimator(clf, X_binary, y_binary)
# Case 3: binary classifier with multiclass target
clf = DecisionTreeClassifier().fit(X_binary, y_binary)
msg = "The target y is not binary. Got multiclass type of target."
with pytest.raises(ValueError, match=msg):
Display.from_estimator(clf, X, y)
@pytest.mark.parametrize(
"Display",
[CalibrationDisplay, DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay],
)
def test_display_curve_error_regression(pyplot, data_binary, Display):
"""Check that we raise an error with regressor."""
# Case 1: regressor
X, y = data_binary
regressor = DecisionTreeRegressor().fit(X, y)
msg = "Expected 'estimator' to be a binary classifier. Got DecisionTreeRegressor"
with pytest.raises(ValueError, match=msg):
Display.from_estimator(regressor, X, y)
# Case 2: regression target
classifier = DecisionTreeClassifier().fit(X, y)
# Force `y_true` to be seen as a regression problem
y = y + 0.5
msg = "The target y is not binary. Got continuous type of target."
with pytest.raises(ValueError, match=msg):
Display.from_estimator(classifier, X, y)
with pytest.raises(ValueError, match=msg):
Display.from_predictions(y, regressor.fit(X, y).predict(X))
@pytest.mark.parametrize(
"response_method, msg",
[
(
"predict_proba",
"MyClassifier has none of the following attributes: predict_proba.",
),
(
"decision_function",
"MyClassifier has none of the following attributes: decision_function.",
),
(
"auto",
(
"MyClassifier has none of the following attributes: predict_proba,"
" decision_function."
),
),
(
"bad_method",
"MyClassifier has none of the following attributes: bad_method.",
),
],
)
@pytest.mark.parametrize(
"Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
def test_display_curve_error_no_response(
pyplot,
data_binary,
response_method,
msg,
Display,
):
"""Check that a proper error is raised when the response method requested
is not defined for the given trained classifier."""
X, y = data_binary
class MyClassifier(ClassifierMixin, BaseEstimator):
def fit(self, X, y):
self.classes_ = [0, 1]
return self
clf = MyClassifier().fit(X, y)
with pytest.raises(AttributeError, match=msg):
Display.from_estimator(clf, X, y, response_method=response_method)
@pytest.mark.parametrize(
"Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_display_curve_estimator_name_multiple_calls(
pyplot,
data_binary,
Display,
constructor_name,
):
"""Check that passing `name` when calling `plot` will overwrite the original name
in the legend."""
X, y = data_binary
clf_name = "my hand-crafted name"
clf = LogisticRegression().fit(X, y)
y_pred = clf.predict_proba(X)[:, 1]
# safe guard for the binary if/else construction
assert constructor_name in ("from_estimator", "from_predictions")
if constructor_name == "from_estimator":
disp = Display.from_estimator(clf, X, y, name=clf_name)
else:
disp = Display.from_predictions(y, y_pred, name=clf_name)
# TODO: Clean-up once `estimator_name` deprecated in all displays
if Display in (PrecisionRecallDisplay, RocCurveDisplay):
assert disp.name == clf_name
else:
assert disp.estimator_name == clf_name
pyplot.close("all")
disp.plot()
assert clf_name in disp.line_.get_label()
pyplot.close("all")
clf_name = "another_name"
disp.plot(name=clf_name)
assert clf_name in disp.line_.get_label()
@pytest.mark.parametrize(
"clf",
[
LogisticRegression(),
make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(
make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression()
),
],
)
@pytest.mark.parametrize(
"Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
def test_display_curve_not_fitted_errors_old_name(pyplot, data_binary, clf, Display):
"""Check that a proper error is raised when the classifier is not
fitted."""
X, y = data_binary
# clone since we parametrize the test and the classifier will be fitted
# when testing the second and subsequent plotting function
model = clone(clf)
with pytest.raises(NotFittedError):
Display.from_estimator(model, X, y)
model.fit(X, y)
disp = Display.from_estimator(model, X, y)
assert model.__class__.__name__ in disp.line_.get_label()
# TODO: Clean-up once `estimator_name` deprecated in all displays
if Display in (PrecisionRecallDisplay, RocCurveDisplay):
assert disp.name == model.__class__.__name__
else:
assert disp.estimator_name == model.__class__.__name__
@pytest.mark.parametrize(
"clf",
[
LogisticRegression(),
make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(
make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression()
),
],
)
@pytest.mark.parametrize("Display", [RocCurveDisplay])
def test_display_curve_not_fitted_errors(pyplot, data_binary, clf, Display):
"""Check that a proper error is raised when the classifier is not fitted."""
X, y = data_binary
# clone since we parametrize the test and the classifier will be fitted
# when testing the second and subsequent plotting function
model = clone(clf)
with pytest.raises(NotFittedError):
Display.from_estimator(model, X, y)
model.fit(X, y)
disp = Display.from_estimator(model, X, y)
assert model.__class__.__name__ in disp.line_.get_label()
assert disp.name == model.__class__.__name__
@pytest.mark.parametrize(
"Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
def test_display_curve_n_samples_consistency(pyplot, data_binary, Display):
"""Check the error raised when `y_pred` or `sample_weight` have inconsistent
length."""
X, y = data_binary
classifier = DecisionTreeClassifier().fit(X, y)
msg = "Found input variables with inconsistent numbers of samples"
with pytest.raises(ValueError, match=msg):
Display.from_estimator(classifier, X[:-2], y)
with pytest.raises(ValueError, match=msg):
Display.from_estimator(classifier, X, y[:-2])
with pytest.raises(ValueError, match=msg):
Display.from_estimator(classifier, X, y, sample_weight=np.ones(X.shape[0] - 2))
@pytest.mark.parametrize(
"Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
def test_display_curve_error_pos_label(pyplot, data_binary, Display):
"""Check consistence of error message when `pos_label` should be specified."""
X, y = data_binary
y = y + 10
classifier = DecisionTreeClassifier().fit(X, y)
y_pred = classifier.predict_proba(X)[:, -1]
msg = r"y_true takes value in {10, 11} and pos_label is not specified"
with pytest.raises(ValueError, match=msg):
Display.from_predictions(y, y_pred)
@pytest.mark.parametrize(
"Display",
[
CalibrationDisplay,
DetCurveDisplay,
PrecisionRecallDisplay,
RocCurveDisplay,
PredictionErrorDisplay,
ConfusionMatrixDisplay,
],
)
@pytest.mark.parametrize(
"constructor",
["from_predictions", "from_estimator"],
)
def test_classifier_display_curve_named_constructor_return_type(
pyplot, data_binary, Display, constructor
):
"""Check that named constructors return the correct type when subclassed.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/pull/27675
"""
X, y = data_binary
# This can be anything - we just need to check the named constructor return
# type so the only requirement here is instantiating the class without error
y_pred = y
classifier = LogisticRegression().fit(X, y)
class SubclassOfDisplay(Display):
pass
if constructor == "from_predictions":
curve = SubclassOfDisplay.from_predictions(y, y_pred)
else: # constructor == "from_estimator"
curve = SubclassOfDisplay.from_estimator(classifier, X, y)
assert isinstance(curve, SubclassOfDisplay)
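# A minimal sketch (plain Python, independent of scikit-learn internals) of
# the pattern the test above guards: named constructors build the display via
# `cls(...)`, so calling them on a subclass returns that subclass.
class _SketchDisplay:
    @classmethod
    def from_predictions(cls, y_true, y_pred):
        return cls()  # `cls` is the subclass when invoked on a subclass

class _SketchSubclass(_SketchDisplay):
    pass

assert isinstance(_SketchSubclass.from_predictions([0, 1], [0, 1]), _SketchSubclass)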
# TODO(1.10): Remove once deprecated in all Displays
@pytest.mark.parametrize(
"Display, display_kwargs",
[
# TODO(1.10): Remove
(
PrecisionRecallDisplay,
{"precision": np.array([1, 0.5, 0]), "recall": np.array([0, 0.5, 1])},
),
# TODO(1.9): Remove
(RocCurveDisplay, {"fpr": np.array([0, 0.5, 1]), "tpr": np.array([0, 0.5, 1])}),
],
)
def test_display_estimator_name_deprecation(pyplot, Display, display_kwargs):
"""Check deprecation of `estimator_name`."""
with pytest.warns(FutureWarning, match="`estimator_name` is deprecated in"):
Display(**display_kwargs, estimator_name="test")

View File

@@ -0,0 +1,374 @@
import numpy as np
import pytest
from numpy.testing import (
assert_allclose,
assert_array_equal,
)
from sklearn.compose import make_column_transformer
from sklearn.datasets import make_classification
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
def test_confusion_matrix_display_validation(pyplot):
"""Check that we raise the proper error when validating parameters."""
X, y = make_classification(
n_samples=100, n_informative=5, n_classes=5, random_state=0
)
with pytest.raises(NotFittedError):
ConfusionMatrixDisplay.from_estimator(SVC(), X, y)
regressor = SVR().fit(X, y)
y_pred_regressor = regressor.predict(X)
y_pred_classifier = SVC().fit(X, y).predict(X)
err_msg = "ConfusionMatrixDisplay.from_estimator only supports classifiers"
with pytest.raises(ValueError, match=err_msg):
ConfusionMatrixDisplay.from_estimator(regressor, X, y)
err_msg = "Mix type of y not allowed, got types"
with pytest.raises(ValueError, match=err_msg):
# Force `y_true` to be seen as a regression problem
ConfusionMatrixDisplay.from_predictions(y + 0.5, y_pred_classifier)
with pytest.raises(ValueError, match=err_msg):
ConfusionMatrixDisplay.from_predictions(y, y_pred_regressor)
err_msg = "Found input variables with inconsistent numbers of samples"
with pytest.raises(ValueError, match=err_msg):
ConfusionMatrixDisplay.from_predictions(y, y_pred_classifier[::2])
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("with_labels", [True, False])
@pytest.mark.parametrize("with_display_labels", [True, False])
def test_confusion_matrix_display_custom_labels(
pyplot, constructor_name, with_labels, with_display_labels
):
"""Check the resulting plot when labels are given."""
n_classes = 5
X, y = make_classification(
n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
)
classifier = SVC().fit(X, y)
y_pred = classifier.predict(X)
# safe guard for the binary if/else construction
assert constructor_name in ("from_estimator", "from_predictions")
ax = pyplot.gca()
labels = [2, 1, 0, 3, 4] if with_labels else None
display_labels = ["b", "d", "a", "e", "f"] if with_display_labels else None
cm = confusion_matrix(y, y_pred, labels=labels)
common_kwargs = {
"ax": ax,
"display_labels": display_labels,
"labels": labels,
}
if constructor_name == "from_estimator":
disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs)
else:
disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs)
assert_allclose(disp.confusion_matrix, cm)
if with_display_labels:
expected_display_labels = display_labels
elif with_labels:
expected_display_labels = labels
else:
expected_display_labels = list(range(n_classes))
expected_display_labels_str = [str(name) for name in expected_display_labels]
x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()]
y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()]
assert_array_equal(disp.display_labels, expected_display_labels)
assert_array_equal(x_ticks, expected_display_labels_str)
assert_array_equal(y_ticks, expected_display_labels_str)
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("normalize", ["true", "pred", "all", None])
@pytest.mark.parametrize("include_values", [True, False])
def test_confusion_matrix_display_plotting(
pyplot,
constructor_name,
normalize,
include_values,
):
"""Check the overall plotting rendering."""
n_classes = 5
X, y = make_classification(
n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
)
classifier = SVC().fit(X, y)
y_pred = classifier.predict(X)
# safe guard for the binary if/else construction
assert constructor_name in ("from_estimator", "from_predictions")
ax = pyplot.gca()
cmap = "plasma"
cm = confusion_matrix(y, y_pred)
common_kwargs = {
"normalize": normalize,
"cmap": cmap,
"ax": ax,
"include_values": include_values,
}
if constructor_name == "from_estimator":
disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs)
else:
disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs)
assert disp.ax_ == ax
if normalize == "true":
cm = cm / cm.sum(axis=1, keepdims=True)
elif normalize == "pred":
cm = cm / cm.sum(axis=0, keepdims=True)
elif normalize == "all":
cm = cm / cm.sum()
assert_allclose(disp.confusion_matrix, cm)
import matplotlib as mpl
assert isinstance(disp.im_, mpl.image.AxesImage)
assert disp.im_.get_cmap().name == cmap
assert isinstance(disp.ax_, pyplot.Axes)
assert isinstance(disp.figure_, pyplot.Figure)
assert disp.ax_.get_ylabel() == "True label"
assert disp.ax_.get_xlabel() == "Predicted label"
x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()]
y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()]
expected_display_labels = list(range(n_classes))
expected_display_labels_str = [str(name) for name in expected_display_labels]
assert_array_equal(disp.display_labels, expected_display_labels)
assert_array_equal(x_ticks, expected_display_labels_str)
assert_array_equal(y_ticks, expected_display_labels_str)
image_data = disp.im_.get_array().data
assert_allclose(image_data, cm)
if include_values:
assert disp.text_.shape == (n_classes, n_classes)
fmt = ".2g"
expected_text = np.array([format(v, fmt) for v in cm.ravel(order="C")])
text_text = np.array([t.get_text() for t in disp.text_.ravel(order="C")])
assert_array_equal(expected_text, text_text)
else:
assert disp.text_ is None
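# A small sketch of the `normalize` arithmetic mirrored in the test above:
# "true" normalizes rows, "pred" normalizes columns, "all" the grand total.
_cm_sketch = np.array([[5.0, 1.0], [2.0, 8.0]])
_by_true = _cm_sketch / _cm_sketch.sum(axis=1, keepdims=True)  # rows sum to 1
_by_pred = _cm_sketch / _cm_sketch.sum(axis=0, keepdims=True)  # columns sum to 1
_by_all = _cm_sketch / _cm_sketch.sum()  # all entries sum to 1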
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_confusion_matrix_display(pyplot, constructor_name):
"""Check the behaviour of the default constructor without using the class
methods."""
n_classes = 5
X, y = make_classification(
n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
)
classifier = SVC().fit(X, y)
y_pred = classifier.predict(X)
# safe guard for the binary if/else construction
assert constructor_name in ("from_estimator", "from_predictions")
cm = confusion_matrix(y, y_pred)
common_kwargs = {
"normalize": None,
"include_values": True,
"cmap": "viridis",
"xticks_rotation": 45.0,
}
if constructor_name == "from_estimator":
disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs)
else:
disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs)
assert_allclose(disp.confusion_matrix, cm)
assert disp.text_.shape == (n_classes, n_classes)
rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()]
assert_allclose(rotations, 45.0)
image_data = disp.im_.get_array().data
assert_allclose(image_data, cm)
disp.plot(cmap="plasma")
assert disp.im_.get_cmap().name == "plasma"
disp.plot(include_values=False)
assert disp.text_ is None
disp.plot(xticks_rotation=90.0)
rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()]
assert_allclose(rotations, 90.0)
disp.plot(values_format="e")
expected_text = np.array([format(v, "e") for v in cm.ravel(order="C")])
text_text = np.array([t.get_text() for t in disp.text_.ravel(order="C")])
assert_array_equal(expected_text, text_text)
def test_confusion_matrix_contrast(pyplot):
"""Check that the text color is appropriate depending on background."""
cm = np.eye(2) / 2
disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])
disp.plot(cmap=pyplot.cm.gray)
# diagonal text is black
assert_allclose(disp.text_[0, 0].get_color(), [0.0, 0.0, 0.0, 1.0])
assert_allclose(disp.text_[1, 1].get_color(), [0.0, 0.0, 0.0, 1.0])
# off-diagonal text is white
assert_allclose(disp.text_[0, 1].get_color(), [1.0, 1.0, 1.0, 1.0])
assert_allclose(disp.text_[1, 0].get_color(), [1.0, 1.0, 1.0, 1.0])
disp.plot(cmap=pyplot.cm.gray_r)
# diagonal text is white
assert_allclose(disp.text_[0, 1].get_color(), [0.0, 0.0, 0.0, 1.0])
assert_allclose(disp.text_[1, 0].get_color(), [0.0, 0.0, 0.0, 1.0])
# off-diagonal text is black
assert_allclose(disp.text_[0, 0].get_color(), [1.0, 1.0, 1.0, 1.0])
assert_allclose(disp.text_[1, 1].get_color(), [1.0, 1.0, 1.0, 1.0])
# Regression test for #15920
cm = np.array([[19, 34], [32, 58]])
disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])
disp.plot(cmap=pyplot.cm.Blues)
min_color = pyplot.cm.Blues(0)
max_color = pyplot.cm.Blues(255)
assert_allclose(disp.text_[0, 0].get_color(), max_color)
assert_allclose(disp.text_[0, 1].get_color(), max_color)
assert_allclose(disp.text_[1, 0].get_color(), max_color)
assert_allclose(disp.text_[1, 1].get_color(), min_color)
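# Hedged sketch of the contrast rule the assertions above encode (the exact
# scikit-learn implementation may differ): the text color flips at the
# midpoint of the matrix value range, using the two colormap extremes.
def _sketch_text_color(cm, i, j, cmap):
    thresh = (cm.max() + cm.min()) / 2.0
    # bright background (value at or above the midpoint) -> cmap minimum
    # color; dark background -> cmap maximum color
    return cmap(0) if cm[i, j] >= thresh else cmap(1.0)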
@pytest.mark.parametrize(
"clf",
[
LogisticRegression(),
make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(
make_column_transformer((StandardScaler(), [0, 1])),
LogisticRegression(),
),
],
ids=["clf", "pipeline-clf", "pipeline-column_transformer-clf"],
)
def test_confusion_matrix_pipeline(pyplot, clf):
"""Check the behaviour of the plotting with more complex pipeline."""
n_classes = 5
X, y = make_classification(
n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
)
with pytest.raises(NotFittedError):
ConfusionMatrixDisplay.from_estimator(clf, X, y)
clf.fit(X, y)
y_pred = clf.predict(X)
disp = ConfusionMatrixDisplay.from_estimator(clf, X, y)
cm = confusion_matrix(y, y_pred)
assert_allclose(disp.confusion_matrix, cm)
assert disp.text_.shape == (n_classes, n_classes)
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_confusion_matrix_with_unknown_labels(pyplot, constructor_name):
"""Check that when labels=None, the unique values in `y_pred` and `y_true`
will be used.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/pull/18405
"""
n_classes = 5
X, y = make_classification(
n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
)
classifier = SVC().fit(X, y)
y_pred = classifier.predict(X)
# create labels in `y_true` that were not seen during fitting and are not
# present in 'classifier.classes_'
y = y + 1
# safe guard for the binary if/else construction
assert constructor_name in ("from_estimator", "from_predictions")
common_kwargs = {"labels": None}
if constructor_name == "from_estimator":
disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs)
else:
disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs)
display_labels = [tick.get_text() for tick in disp.ax_.get_xticklabels()]
expected_labels = [str(i) for i in range(n_classes + 1)]
assert_array_equal(expected_labels, display_labels)
def test_colormap_max(pyplot):
"""Check that the max color is used for the color of the text."""
gray = pyplot.get_cmap("gray", 1024)
confusion_matrix = np.array([[1.0, 0.0], [0.0, 1.0]])
disp = ConfusionMatrixDisplay(confusion_matrix)
disp.plot(cmap=gray)
color = disp.text_[1, 0].get_color()
assert_allclose(color, [1.0, 1.0, 1.0, 1.0])
def test_im_kw_adjust_vmin_vmax(pyplot):
"""Check that im_kw passes kwargs to imshow"""
confusion_matrix = np.array([[0.48, 0.04], [0.08, 0.4]])
disp = ConfusionMatrixDisplay(confusion_matrix)
disp.plot(im_kw=dict(vmin=0.0, vmax=0.8))
clim = disp.im_.get_clim()
assert clim[0] == pytest.approx(0.0)
assert clim[1] == pytest.approx(0.8)
def test_confusion_matrix_text_kw(pyplot):
"""Check that text_kw is passed to the text call."""
font_size = 15.0
X, y = make_classification(random_state=0)
classifier = SVC().fit(X, y)
# from_estimator passes the font size
disp = ConfusionMatrixDisplay.from_estimator(
classifier, X, y, text_kw={"fontsize": font_size}
)
for text in disp.text_.reshape(-1):
assert text.get_fontsize() == font_size
# calling plot() updates the text to the new font size
new_font_size = 20.0
disp.plot(text_kw={"fontsize": new_font_size})
for text in disp.text_.reshape(-1):
assert text.get_fontsize() == new_font_size
# from_predictions passes the font size
y_pred = classifier.predict(X)
disp = ConfusionMatrixDisplay.from_predictions(
y, y_pred, text_kw={"fontsize": font_size}
)
for text in disp.text_.reshape(-1):
assert text.get_fontsize() == font_size

View File

@@ -0,0 +1,131 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import DetCurveDisplay, det_curve
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize("with_sample_weight", [True, False])
@pytest.mark.parametrize("drop_intermediate", [True, False])
@pytest.mark.parametrize("with_strings", [True, False])
def test_det_curve_display(
pyplot,
constructor_name,
response_method,
with_sample_weight,
drop_intermediate,
with_strings,
):
X, y = load_iris(return_X_y=True)
# Binarize the data with only the two first classes
X, y = X[y < 2], y[y < 2]
pos_label = None
if with_strings:
y = np.array(["c", "b"])[y]
pos_label = "c"
if with_sample_weight:
rng = np.random.RandomState(42)
sample_weight = rng.randint(1, 4, size=(X.shape[0]))
else:
sample_weight = None
lr = LogisticRegression()
lr.fit(X, y)
y_score = getattr(lr, response_method)(X)
if y_score.ndim == 2:
y_score = y_score[:, 1]
# safe guard for the binary if/else construction
assert constructor_name in ("from_estimator", "from_predictions")
common_kwargs = {
"name": lr.__class__.__name__,
"alpha": 0.8,
"sample_weight": sample_weight,
"drop_intermediate": drop_intermediate,
"pos_label": pos_label,
}
if constructor_name == "from_estimator":
disp = DetCurveDisplay.from_estimator(lr, X, y, **common_kwargs)
else:
disp = DetCurveDisplay.from_predictions(y, y_score, **common_kwargs)
fpr, fnr, _ = det_curve(
y,
y_score,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
pos_label=pos_label,
)
assert_allclose(disp.fpr, fpr, atol=1e-7)
assert_allclose(disp.fnr, fnr, atol=1e-7)
assert disp.estimator_name == "LogisticRegression"
# cannot fail thanks to the pyplot fixture
import matplotlib as mpl
assert isinstance(disp.line_, mpl.lines.Line2D)
assert disp.line_.get_alpha() == 0.8
assert isinstance(disp.ax_, mpl.axes.Axes)
assert isinstance(disp.figure_, mpl.figure.Figure)
assert disp.line_.get_label() == "LogisticRegression"
expected_pos_label = 1 if pos_label is None else pos_label
expected_ylabel = f"False Negative Rate (Positive label: {expected_pos_label})"
expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})"
assert disp.ax_.get_ylabel() == expected_ylabel
assert disp.ax_.get_xlabel() == expected_xlabel
@pytest.mark.parametrize(
"constructor_name, expected_clf_name",
[
("from_estimator", "LogisticRegression"),
("from_predictions", "Classifier"),
],
)
def test_det_curve_display_default_name(
pyplot,
constructor_name,
expected_clf_name,
):
# Check the default name display in the figure when `name` is not provided
X, y = load_iris(return_X_y=True)
# Binarize the data with only the two first classes
X, y = X[y < 2], y[y < 2]
lr = LogisticRegression().fit(X, y)
y_score = lr.predict_proba(X)[:, 1]
if constructor_name == "from_estimator":
disp = DetCurveDisplay.from_estimator(lr, X, y)
else:
disp = DetCurveDisplay.from_predictions(y, y_score)
assert disp.estimator_name == expected_clf_name
assert disp.line_.get_label() == expected_clf_name
# TODO(1.10): remove
def test_y_score_and_y_pred_specified_error(pyplot):
"""1. Check that an error is raised when both y_score and y_pred are specified.
2. Check that a warning is raised when y_pred is specified.
"""
y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
y_pred = np.array([0.2, 0.3, 0.5, 0.1])
with pytest.raises(
ValueError, match="`y_pred` and `y_score` cannot be both specified"
):
DetCurveDisplay.from_predictions(y_true, y_score=y_score, y_pred=y_pred)
with pytest.warns(FutureWarning, match="y_pred was deprecated in 1.8"):
DetCurveDisplay.from_predictions(y_true, y_pred=y_score)
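# Hedged sketch of a `y_pred` -> `y_score` deprecation shim in the spirit of
# the `_deprecate_y_pred_parameter` helper the displays rely on; the real
# scikit-learn implementation may differ.
import warnings

def _sketch_deprecate_y_pred(y_score, y_pred, version):
    if y_score is not None and y_pred is not None:
        raise ValueError("`y_pred` and `y_score` cannot be both specified.")
    if y_pred is not None:
        warnings.warn(
            f"y_pred was deprecated in {version} and will be removed in a "
            "future release. Use y_score instead.",
            FutureWarning,
        )
        return y_pred
    return y_score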

View File

@@ -0,0 +1,400 @@
from collections import Counter
import numpy as np
import pytest
from scipy.integrate import trapezoid
from sklearn.compose import make_column_transformer
from sklearn.datasets import load_breast_cancer, make_classification
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
PrecisionRecallDisplay,
average_precision_score,
precision_recall_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize("drop_intermediate", [True, False])
def test_precision_recall_display_plotting(
pyplot, constructor_name, response_method, drop_intermediate
):
"""Check the overall plotting rendering."""
X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
pos_label = 1
classifier = LogisticRegression().fit(X, y)
y_score = getattr(classifier, response_method)(X)
y_score = y_score if y_score.ndim == 1 else y_score[:, pos_label]
# safe guard for the binary if/else construction
assert constructor_name in ("from_estimator", "from_predictions")
if constructor_name == "from_estimator":
display = PrecisionRecallDisplay.from_estimator(
classifier,
X,
y,
response_method=response_method,
drop_intermediate=drop_intermediate,
)
else:
display = PrecisionRecallDisplay.from_predictions(
y, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate
)
precision, recall, _ = precision_recall_curve(
y, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate
)
average_precision = average_precision_score(y, y_score, pos_label=pos_label)
np.testing.assert_allclose(display.precision, precision)
np.testing.assert_allclose(display.recall, recall)
assert display.average_precision == pytest.approx(average_precision)
import matplotlib as mpl
assert isinstance(display.line_, mpl.lines.Line2D)
assert isinstance(display.ax_, mpl.axes.Axes)
assert isinstance(display.figure_, mpl.figure.Figure)
assert display.ax_.get_xlabel() == "Recall (Positive label: 1)"
assert display.ax_.get_ylabel() == "Precision (Positive label: 1)"
assert display.ax_.get_adjustable() == "box"
assert display.ax_.get_aspect() in ("equal", 1.0)
assert display.ax_.get_xlim() == display.ax_.get_ylim() == (-0.01, 1.01)
# plotting passing some new parameters
display.plot(alpha=0.8, name="MySpecialEstimator")
expected_label = f"MySpecialEstimator (AP = {average_precision:0.2f})"
assert display.line_.get_label() == expected_label
assert display.line_.get_alpha() == pytest.approx(0.8)
# Check that the chance level line is not plotted by default
assert display.chance_level_ is None
@pytest.mark.parametrize("chance_level_kw", [None, {"color": "r"}, {"c": "r"}])
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_precision_recall_chance_level_line(
pyplot,
chance_level_kw,
constructor_name,
):
"""Check the chance level line plotting behavior."""
X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
pos_prevalence = Counter(y)[1] / len(y)
lr = LogisticRegression()
y_score = lr.fit(X, y).predict_proba(X)[:, 1]
if constructor_name == "from_estimator":
display = PrecisionRecallDisplay.from_estimator(
lr,
X,
y,
plot_chance_level=True,
chance_level_kw=chance_level_kw,
)
else:
display = PrecisionRecallDisplay.from_predictions(
y,
y_score,
plot_chance_level=True,
chance_level_kw=chance_level_kw,
)
import matplotlib as mpl
assert isinstance(display.chance_level_, mpl.lines.Line2D)
assert tuple(display.chance_level_.get_xdata()) == (0, 1)
assert tuple(display.chance_level_.get_ydata()) == (pos_prevalence, pos_prevalence)
# Checking for chance level line styles
if chance_level_kw is None:
assert display.chance_level_.get_color() == "k"
else:
assert display.chance_level_.get_color() == "r"
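# The chance level asserted above is the positive-class prevalence: a
# no-skill classifier's precision equals the fraction of positives at any
# recall, hence the horizontal line. A quick illustration:
_y_sketch = np.array([0, 1, 1, 0, 1])
_prevalence_sketch = Counter(_y_sketch)[1] / len(_y_sketch)  # line at y = 0.6
assert _prevalence_sketch == 0.6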
@pytest.mark.parametrize(
"constructor_name, default_label",
[
("from_estimator", "LogisticRegression (AP = {:.2f})"),
("from_predictions", "Classifier (AP = {:.2f})"),
],
)
def test_precision_recall_display_name(pyplot, constructor_name, default_label):
"""Check the behaviour of the name parameters"""
X, y = make_classification(n_classes=2, n_samples=100, random_state=0)
pos_label = 1
classifier = LogisticRegression().fit(X, y)
y_score = classifier.predict_proba(X)[:, pos_label]
# safe guard for the binary if/else construction
assert constructor_name in ("from_estimator", "from_predictions")
if constructor_name == "from_estimator":
display = PrecisionRecallDisplay.from_estimator(classifier, X, y)
else:
display = PrecisionRecallDisplay.from_predictions(
y, y_score, pos_label=pos_label
)
average_precision = average_precision_score(y, y_score, pos_label=pos_label)
# check that the default name is used
assert display.line_.get_label() == default_label.format(average_precision)
# check that the name can be set
display.plot(name="MySpecialEstimator")
assert (
display.line_.get_label()
== f"MySpecialEstimator (AP = {average_precision:.2f})"
)
@pytest.mark.parametrize(
"clf",
[
make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(
make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression()
),
],
)
def test_precision_recall_display_pipeline(pyplot, clf):
X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
with pytest.raises(NotFittedError):
PrecisionRecallDisplay.from_estimator(clf, X, y)
clf.fit(X, y)
display = PrecisionRecallDisplay.from_estimator(clf, X, y)
assert display.name == clf.__class__.__name__
def test_precision_recall_display_string_labels(pyplot):
# regression test #15738
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target_names[cancer.target]
lr = make_pipeline(StandardScaler(), LogisticRegression())
lr.fit(X, y)
for klass in cancer.target_names:
assert klass in lr.classes_
display = PrecisionRecallDisplay.from_estimator(lr, X, y)
y_score = lr.predict_proba(X)[:, 1]
avg_prec = average_precision_score(y, y_score, pos_label=lr.classes_[1])
assert display.average_precision == pytest.approx(avg_prec)
assert display.name == lr.__class__.__name__
err_msg = r"y_true takes value in {'benign', 'malignant'}"
with pytest.raises(ValueError, match=err_msg):
PrecisionRecallDisplay.from_predictions(y, y_score)
display = PrecisionRecallDisplay.from_predictions(
y, y_score, pos_label=lr.classes_[1]
)
assert display.average_precision == pytest.approx(avg_prec)
@pytest.mark.parametrize(
"average_precision, name, expected_label",
[
(0.9, None, "AP = 0.90"),
(None, "my_est", "my_est"),
(0.8, "my_est2", "my_est2 (AP = 0.80)"),
],
)
def test_default_labels(pyplot, average_precision, name, expected_label):
"""Check the default labels used in the display."""
precision = np.array([1, 0.5, 0])
recall = np.array([0, 0.5, 1])
display = PrecisionRecallDisplay(
precision,
recall,
average_precision=average_precision,
name=name,
)
display.plot()
assert display.line_.get_label() == expected_label
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_method):
# check that we can provide the positive label and display the proper
# statistics
X, y = load_breast_cancer(return_X_y=True)
# create a highly imbalanced version of the breast cancer dataset
idx_positive = np.flatnonzero(y == 1)
idx_negative = np.flatnonzero(y == 0)
idx_selected = np.hstack([idx_negative, idx_positive[:25]])
X, y = X[idx_selected], y[idx_selected]
X, y = shuffle(X, y, random_state=42)
# only use 2 features to make the problem even harder
X = X[:, :2]
y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object)
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
stratify=y,
random_state=0,
)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
# sanity check to be sure the positive class is classes_[0] and that the
# class imbalance makes the default pos_label (classes_[1]) misleading
assert classifier.classes_.tolist() == ["cancer", "not cancer"]
y_score = getattr(classifier, response_method)(X_test)
# we select the corresponding probability columns or reverse the decision
# function otherwise
y_score_cancer = -1 * y_score if y_score.ndim == 1 else y_score[:, 0]
y_score_not_cancer = y_score if y_score.ndim == 1 else y_score[:, 1]
if constructor_name == "from_estimator":
display = PrecisionRecallDisplay.from_estimator(
classifier,
X_test,
y_test,
pos_label="cancer",
response_method=response_method,
)
else:
display = PrecisionRecallDisplay.from_predictions(
y_test,
y_score_cancer,
pos_label="cancer",
)
# we should obtain the statistics of the "cancer" class
avg_prec_limit = 0.65
assert display.average_precision < avg_prec_limit
assert -trapezoid(display.precision, display.recall) < avg_prec_limit
# otherwise we should obtain the statistics of the "not cancer" class
if constructor_name == "from_estimator":
display = PrecisionRecallDisplay.from_estimator(
classifier,
X_test,
y_test,
response_method=response_method,
pos_label="not cancer",
)
else:
display = PrecisionRecallDisplay.from_predictions(
y_test,
y_score_not_cancer,
pos_label="not cancer",
)
avg_prec_limit = 0.95
assert display.average_precision > avg_prec_limit
assert -trapezoid(display.precision, display.recall) > avg_prec_limit
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_precision_recall_prevalence_pos_label_reusable(pyplot, constructor_name):
# Check that even if one passes plot_chance_level=False the first time
# one can still call disp.plot with plot_chance_level=True and get the
# chance level line
X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
lr = LogisticRegression()
y_score = lr.fit(X, y).predict_proba(X)[:, 1]
if constructor_name == "from_estimator":
display = PrecisionRecallDisplay.from_estimator(
lr, X, y, plot_chance_level=False
)
else:
display = PrecisionRecallDisplay.from_predictions(
y, y_score, plot_chance_level=False
)
assert display.chance_level_ is None
import matplotlib as mpl
# When calling from_estimator or from_predictions,
# prevalence_pos_label should have been set, so that directly
# calling plot_chance_level=True should plot the chance level line
display.plot(plot_chance_level=True)
assert isinstance(display.chance_level_, mpl.lines.Line2D)
def test_precision_recall_raise_no_prevalence(pyplot):
# Check that an error is raised when plotting the chance level and
# no prevalence_pos_label is provided
precision = np.array([1, 0.5, 0])
recall = np.array([0, 0.5, 1])
display = PrecisionRecallDisplay(precision, recall)
msg = (
"You must provide prevalence_pos_label when constructing the "
"PrecisionRecallDisplay object in order to plot the chance "
"level line. Alternatively, you may use "
"PrecisionRecallDisplay.from_estimator or "
"PrecisionRecallDisplay.from_predictions "
"to automatically set prevalence_pos_label"
)
with pytest.raises(ValueError, match=msg):
display.plot(plot_chance_level=True)
@pytest.mark.parametrize("despine", [True, False])
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_plot_precision_recall_despine(pyplot, despine, constructor_name):
# Check that the despine keyword is working correctly
X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
clf = LogisticRegression().fit(X, y)
y_score = clf.decision_function(X)
# safe guard for the binary if/else construction
assert constructor_name in ("from_estimator", "from_predictions")
if constructor_name == "from_estimator":
display = PrecisionRecallDisplay.from_estimator(clf, X, y, despine=despine)
else:
display = PrecisionRecallDisplay.from_predictions(y, y_score, despine=despine)
for s in ["top", "right"]:
assert display.ax_.spines[s].get_visible() is not despine
if despine:
for s in ["bottom", "left"]:
assert display.ax_.spines[s].get_bounds() == (0, 1)
# TODO(1.10): remove
def test_y_score_and_y_pred_specified_error(pyplot):
"""1. Check that an error is raised when both y_score and y_pred are specified.
2. Check that a warning is raised when y_pred is specified.
"""
y_true = np.array([0, 1, 1, 0])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
y_pred = np.array([0.2, 0.3, 0.5, 0.1])
with pytest.raises(
ValueError, match="`y_pred` and `y_score` cannot be both specified"
):
PrecisionRecallDisplay.from_predictions(y_true, y_score=y_score, y_pred=y_pred)
with pytest.warns(FutureWarning, match="y_pred was deprecated in 1.8"):
PrecisionRecallDisplay.from_predictions(y_true, y_pred=y_score)

View File

@@ -0,0 +1,169 @@
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import load_diabetes
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import Ridge
from sklearn.metrics import PredictionErrorDisplay
X, y = load_diabetes(return_X_y=True)
@pytest.fixture
def regressor_fitted():
return Ridge().fit(X, y)
@pytest.mark.parametrize(
"regressor, params, err_type, err_msg",
[
(
Ridge().fit(X, y),
{"subsample": -1},
ValueError,
"When an integer, subsample=-1 should be",
),
(
Ridge().fit(X, y),
{"subsample": 20.0},
ValueError,
"When a floating-point, subsample=20.0 should be",
),
(
Ridge().fit(X, y),
{"subsample": -20.0},
ValueError,
"When a floating-point, subsample=-20.0 should be",
),
(
Ridge().fit(X, y),
{"kind": "xxx"},
ValueError,
"`kind` must be one of",
),
],
)
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
def test_prediction_error_display_raise_error(
pyplot, class_method, regressor, params, err_type, err_msg
):
"""Check that we raise the proper error when making the parameters
# validation."""
with pytest.raises(err_type, match=err_msg):
if class_method == "from_estimator":
PredictionErrorDisplay.from_estimator(regressor, X, y, **params)
else:
y_pred = regressor.predict(X)
PredictionErrorDisplay.from_predictions(y_true=y, y_pred=y_pred, **params)
def test_from_estimator_not_fitted(pyplot):
"""Check that we raise a `NotFittedError` when the passed regressor is not
fit."""
regressor = Ridge()
with pytest.raises(NotFittedError, match="is not fitted yet."):
PredictionErrorDisplay.from_estimator(regressor, X, y)
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("kind", ["actual_vs_predicted", "residual_vs_predicted"])
def test_prediction_error_display(pyplot, regressor_fitted, class_method, kind):
"""Check the default behaviour of the display."""
if class_method == "from_estimator":
display = PredictionErrorDisplay.from_estimator(
regressor_fitted, X, y, kind=kind
)
else:
y_pred = regressor_fitted.predict(X)
display = PredictionErrorDisplay.from_predictions(
y_true=y, y_pred=y_pred, kind=kind
)
if kind == "actual_vs_predicted":
assert_allclose(display.line_.get_xdata(), display.line_.get_ydata())
assert display.ax_.get_xlabel() == "Predicted values"
assert display.ax_.get_ylabel() == "Actual values"
assert display.line_ is not None
else:
assert display.ax_.get_xlabel() == "Predicted values"
assert display.ax_.get_ylabel() == "Residuals (actual - predicted)"
assert display.line_ is not None
assert display.ax_.get_legend() is None
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize(
"subsample, expected_size",
[(5, 5), (0.1, int(X.shape[0] * 0.1)), (None, X.shape[0])],
)
def test_plot_prediction_error_subsample(
pyplot, regressor_fitted, class_method, subsample, expected_size
):
"""Check the behaviour of `subsample`."""
if class_method == "from_estimator":
display = PredictionErrorDisplay.from_estimator(
regressor_fitted, X, y, subsample=subsample
)
else:
y_pred = regressor_fitted.predict(X)
display = PredictionErrorDisplay.from_predictions(
y_true=y, y_pred=y_pred, subsample=subsample
)
assert len(display.scatter_.get_offsets()) == expected_size
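# A hedged reading of the `subsample` semantics exercised above (not the
# actual scikit-learn validation code): an int keeps that many points, a
# float in (0, 1] keeps that fraction, and None keeps every point.
def _sketch_resolve_subsample(subsample, n_samples):
    if subsample is None:
        return n_samples
    if isinstance(subsample, float):
        return int(n_samples * subsample)
    return subsample

assert _sketch_resolve_subsample(0.1, X.shape[0]) == int(X.shape[0] * 0.1)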
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
def test_plot_prediction_error_ax(pyplot, regressor_fitted, class_method):
"""Check that we can pass an axis to the display."""
_, ax = pyplot.subplots()
if class_method == "from_estimator":
display = PredictionErrorDisplay.from_estimator(regressor_fitted, X, y, ax=ax)
else:
y_pred = regressor_fitted.predict(X)
display = PredictionErrorDisplay.from_predictions(
y_true=y, y_pred=y_pred, ax=ax
)
assert display.ax_ is ax
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize(
"scatter_kwargs",
[None, {"color": "blue", "alpha": 0.9}, {"c": "blue", "alpha": 0.9}],
)
@pytest.mark.parametrize(
"line_kwargs", [None, {"color": "red", "linestyle": "-"}, {"c": "red", "ls": "-"}]
)
def test_prediction_error_custom_artist(
pyplot, regressor_fitted, class_method, scatter_kwargs, line_kwargs
):
"""Check that we can tune the style of the line and the scatter."""
extra_params = {
"kind": "actual_vs_predicted",
"scatter_kwargs": scatter_kwargs,
"line_kwargs": line_kwargs,
}
if class_method == "from_estimator":
display = PredictionErrorDisplay.from_estimator(
regressor_fitted, X, y, **extra_params
)
else:
y_pred = regressor_fitted.predict(X)
display = PredictionErrorDisplay.from_predictions(
y_true=y, y_pred=y_pred, **extra_params
)
if line_kwargs is not None:
assert display.line_.get_linestyle() == "-"
assert display.line_.get_color() == "red"
else:
assert display.line_.get_linestyle() == "--"
assert display.line_.get_color() == "black"
assert display.line_.get_alpha() == 0.7
if scatter_kwargs is not None:
assert_allclose(display.scatter_.get_facecolor(), [[0.0, 0.0, 1.0, 0.9]])
assert_allclose(display.scatter_.get_edgecolor(), [[0.0, 0.0, 1.0, 0.9]])
else:
assert display.scatter_.get_alpha() == 0.8

View File

@@ -0,0 +1,989 @@
from collections.abc import Mapping
import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy.integrate import trapezoid
from sklearn import clone
from sklearn.compose import make_column_transformer
from sklearn.datasets import load_breast_cancer, make_classification
from sklearn.exceptions import NotFittedError, UndefinedMetricWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay, auc, roc_curve
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import _safe_indexing, shuffle
from sklearn.utils._response import _get_response_values_binary
@pytest.fixture(scope="module")
def data_binary():
X, y = make_classification(
n_samples=200,
n_features=20,
n_informative=5,
n_redundant=2,
flip_y=0.1,
class_sep=0.8,
random_state=42,
)
return X, y
def _check_figure_axes_and_labels(display, pos_label):
"""Check mpl axes and figure defaults are correct."""
import matplotlib as mpl
assert isinstance(display.ax_, mpl.axes.Axes)
assert isinstance(display.figure_, mpl.figure.Figure)
assert display.ax_.get_adjustable() == "box"
assert display.ax_.get_aspect() in ("equal", 1.0)
assert display.ax_.get_xlim() == display.ax_.get_ylim() == (-0.01, 1.01)
expected_pos_label = 1 if pos_label is None else pos_label
expected_ylabel = f"True Positive Rate (Positive label: {expected_pos_label})"
expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})"
assert display.ax_.get_ylabel() == expected_ylabel
assert display.ax_.get_xlabel() == expected_xlabel
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize("with_sample_weight", [True, False])
@pytest.mark.parametrize("drop_intermediate", [True, False])
@pytest.mark.parametrize("with_strings", [True, False])
@pytest.mark.parametrize(
"constructor_name, default_name",
[
("from_estimator", "LogisticRegression"),
("from_predictions", "Classifier"),
],
)
def test_roc_curve_display_plotting(
pyplot,
response_method,
data_binary,
with_sample_weight,
drop_intermediate,
with_strings,
constructor_name,
default_name,
):
"""Check the overall plotting behaviour for single curve."""
X, y = data_binary
pos_label = None
if with_strings:
y = np.array(["c", "b"])[y]
pos_label = "c"
if with_sample_weight:
rng = np.random.RandomState(42)
sample_weight = rng.randint(1, 4, size=(X.shape[0]))
else:
sample_weight = None
lr = LogisticRegression()
lr.fit(X, y)
y_score = getattr(lr, response_method)(X)
y_score = y_score if y_score.ndim == 1 else y_score[:, 1]
if constructor_name == "from_estimator":
display = RocCurveDisplay.from_estimator(
lr,
X,
y,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
pos_label=pos_label,
curve_kwargs={"alpha": 0.8},
)
else:
display = RocCurveDisplay.from_predictions(
y,
y_score,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
pos_label=pos_label,
curve_kwargs={"alpha": 0.8},
)
fpr, tpr, _ = roc_curve(
y,
y_score,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
pos_label=pos_label,
)
assert_allclose(display.roc_auc, auc(fpr, tpr))
assert_allclose(display.fpr, fpr)
assert_allclose(display.tpr, tpr)
assert display.name == default_name
import matplotlib as mpl
_check_figure_axes_and_labels(display, pos_label)
assert isinstance(display.line_, mpl.lines.Line2D)
assert display.line_.get_alpha() == 0.8
expected_label = f"{default_name} (AUC = {display.roc_auc:.2f})"
assert display.line_.get_label() == expected_label
@pytest.mark.parametrize(
"params, err_msg",
[
(
{
"fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
"tpr": [np.array([0, 0.5, 1])],
"roc_auc": None,
"name": None,
},
"self.fpr and self.tpr from `RocCurveDisplay` initialization,",
),
(
{
"fpr": [np.array([0, 0.5, 1])],
"tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
"roc_auc": [0.8, 0.9],
"name": None,
},
"self.fpr, self.tpr and self.roc_auc from `RocCurveDisplay`",
),
(
{
"fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
"tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
"roc_auc": [0.8],
"name": None,
},
"Got: self.fpr: 2, self.tpr: 2, self.roc_auc: 1",
),
(
{
"fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
"tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
"roc_auc": [0.8, 0.9],
"name": ["curve1", "curve2", "curve3"],
},
r"self.fpr, self.tpr, self.roc_auc and 'name' \(or self.name\)",
),
(
{
"fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
"tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
"roc_auc": [0.8, 0.9],
# List of length 1 is always allowed
"name": ["curve1"],
},
None,
),
],
)
def test_roc_curve_plot_parameter_length_validation(pyplot, params, err_msg):
"""Check `plot` parameter length validation performed correctly."""
display = RocCurveDisplay(**params)
if err_msg:
with pytest.raises(ValueError, match=err_msg):
display.plot()
else:
# No error should be raised
display.plot()
def test_validate_plot_params(pyplot):
"""Check `_validate_plot_params` returns the correct variables."""
fpr = np.array([0, 0.5, 1])
tpr = [np.array([0, 0.5, 1])]
roc_auc = None
name = "test_curve"
# Initialize display with test inputs
display = RocCurveDisplay(
fpr=fpr,
tpr=tpr,
roc_auc=roc_auc,
name=name,
pos_label=None,
)
fpr_out, tpr_out, roc_auc_out, name_out = display._validate_plot_params(
ax=None, name=None
)
assert isinstance(fpr_out, list)
assert isinstance(tpr_out, list)
assert len(fpr_out) == 1
assert len(tpr_out) == 1
assert roc_auc_out is None
assert name_out == ["test_curve"]
def test_roc_curve_from_cv_results_param_validation(pyplot, data_binary):
"""Check parameter validation is correct."""
X, y = data_binary
# `cv_results` missing key
cv_results_no_est = cross_validate(
LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=False
)
cv_results_no_indices = cross_validate(
LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=False
)
for cv_results in (cv_results_no_est, cv_results_no_indices):
with pytest.raises(
ValueError,
match="`cv_results` does not contain one of the following required",
):
RocCurveDisplay.from_cv_results(cv_results, X, y)
cv_results = cross_validate(
LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
)
# `X` wrong length
with pytest.raises(ValueError, match="`X` does not contain the correct"):
RocCurveDisplay.from_cv_results(cv_results, X[:10, :], y)
# `y` not binary
y_multi = y.copy()
y_multi[0] = 2
with pytest.raises(ValueError, match="The target `y` is not binary."):
RocCurveDisplay.from_cv_results(cv_results, X, y_multi)
# input inconsistent length
with pytest.raises(ValueError, match="Found input variables with inconsistent"):
RocCurveDisplay.from_cv_results(cv_results, X, y[:10])
with pytest.raises(ValueError, match="Found input variables with inconsistent"):
RocCurveDisplay.from_cv_results(cv_results, X, y, sample_weight=[1, 2])
# `pos_label` inconsistency
y_multi[y_multi == 1] = 2
with pytest.warns(UndefinedMetricWarning, match="No positive samples in y_true"):
RocCurveDisplay.from_cv_results(cv_results, X, y_multi)
# `name` is list while `curve_kwargs` is None or dict
for curve_kwargs in (None, {"alpha": 0.2}):
with pytest.raises(ValueError, match="To avoid labeling individual curves"):
RocCurveDisplay.from_cv_results(
cv_results,
X,
y,
name=["one", "two", "three"],
curve_kwargs=curve_kwargs,
)
# `curve_kwargs` incorrect length
with pytest.raises(ValueError, match="`curve_kwargs` must be None, a dictionary"):
RocCurveDisplay.from_cv_results(cv_results, X, y, curve_kwargs=[{"alpha": 1}])
# `curve_kwargs` both alias provided
with pytest.raises(TypeError, match="Got both c and"):
RocCurveDisplay.from_cv_results(
cv_results, X, y, curve_kwargs={"c": "blue", "color": "red"}
)
@pytest.mark.parametrize(
"curve_kwargs",
[None, {"alpha": 0.2}, [{"alpha": 0.2}, {"alpha": 0.3}, {"alpha": 0.4}]],
)
def test_roc_curve_display_from_cv_results_curve_kwargs(
pyplot, data_binary, curve_kwargs
):
"""Check `curve_kwargs` correctly passed."""
X, y = data_binary
n_cv = 3
cv_results = cross_validate(
LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True
)
display = RocCurveDisplay.from_cv_results(
cv_results,
X,
y,
curve_kwargs=curve_kwargs,
)
if curve_kwargs is None:
# Default `alpha` used
assert all(line.get_alpha() == 0.5 for line in display.line_)
elif isinstance(curve_kwargs, Mapping):
# `alpha` from dict used for all curves
assert all(line.get_alpha() == 0.2 for line in display.line_)
else:
# Different `alpha` used for each curve
assert all(
line.get_alpha() == curve_kwargs[i]["alpha"]
for i, line in enumerate(display.line_)
)
# Other default kwargs should be the same
for line in display.line_:
assert line.get_linestyle() == "--"
assert line.get_color() == "blue"
# TODO(1.9): Remove in 1.9
@pytest.mark.parametrize(
"constructor_name", ["from_estimator", "from_predictions", "plot"]
)
def test_roc_curve_display_kwargs_deprecation(pyplot, data_binary, constructor_name):
"""Check **kwargs deprecated correctly in favour of `curve_kwargs`."""
X, y = data_binary
lr = LogisticRegression()
lr.fit(X, y)
fpr = np.array([0, 0.5, 1])
tpr = np.array([0, 0.5, 1])
# Error when both `curve_kwargs` and `**kwargs` provided
with pytest.raises(ValueError, match="Cannot provide both `curve_kwargs`"):
if constructor_name == "from_estimator":
RocCurveDisplay.from_estimator(
lr, X, y, curve_kwargs={"alpha": 1}, label="test"
)
elif constructor_name == "from_predictions":
RocCurveDisplay.from_predictions(
y, y, curve_kwargs={"alpha": 1}, label="test"
)
else:
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(
curve_kwargs={"alpha": 1}, label="test"
)
# Warning when `**kwargs` provided
with pytest.warns(FutureWarning, match=r"`\*\*kwargs` is deprecated and will be"):
if constructor_name == "from_estimator":
RocCurveDisplay.from_estimator(lr, X, y, label="test")
elif constructor_name == "from_predictions":
RocCurveDisplay.from_predictions(y, y, label="test")
else:
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(label="test")
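# Migration sketch for the deprecation exercised above: styling keywords move
# out of `**kwargs` and into an explicit `curve_kwargs` dict, e.g.
#     RocCurveDisplay.from_estimator(lr, X, y, label="test")           # warns
#     RocCurveDisplay.from_estimator(lr, X, y, curve_kwargs={"label": "test"})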
@pytest.mark.parametrize(
"curve_kwargs",
[
None,
{"color": "blue"},
[{"color": "blue"}, {"color": "green"}, {"color": "red"}],
],
)
@pytest.mark.parametrize("drop_intermediate", [True, False])
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize("with_sample_weight", [True, False])
@pytest.mark.parametrize("with_strings", [True, False])
def test_roc_curve_display_plotting_from_cv_results(
pyplot,
data_binary,
with_strings,
with_sample_weight,
response_method,
drop_intermediate,
curve_kwargs,
):
"""Check overall plotting of `from_cv_results`."""
X, y = data_binary
pos_label = None
if with_strings:
y = np.array(["c", "b"])[y]
pos_label = "c"
if with_sample_weight:
rng = np.random.RandomState(42)
sample_weight = rng.randint(1, 4, size=(X.shape[0]))
else:
sample_weight = None
cv_results = cross_validate(
LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
)
display = RocCurveDisplay.from_cv_results(
cv_results,
X,
y,
sample_weight=sample_weight,
drop_intermediate=drop_intermediate,
response_method=response_method,
pos_label=pos_label,
curve_kwargs=curve_kwargs,
)
for idx, (estimator, test_indices) in enumerate(
zip(cv_results["estimator"], cv_results["indices"]["test"])
):
y_true = _safe_indexing(y, test_indices)
y_pred = _get_response_values_binary(
estimator,
_safe_indexing(X, test_indices),
response_method=response_method,
pos_label=pos_label,
)[0]
sample_weight_fold = (
None
if sample_weight is None
else _safe_indexing(sample_weight, test_indices)
)
fpr, tpr, _ = roc_curve(
y_true,
y_pred,
sample_weight=sample_weight_fold,
drop_intermediate=drop_intermediate,
pos_label=pos_label,
)
assert_allclose(display.roc_auc[idx], auc(fpr, tpr))
assert_allclose(display.fpr[idx], fpr)
assert_allclose(display.tpr[idx], tpr)
assert display.name is None
import matplotlib as mpl
_check_figure_axes_and_labels(display, pos_label)
if with_sample_weight:
aggregate_expected_labels = ["AUC = 0.64 +/- 0.04", "_child1", "_child2"]
else:
aggregate_expected_labels = ["AUC = 0.61 +/- 0.05", "_child1", "_child2"]
for idx, line in enumerate(display.line_):
assert isinstance(line, mpl.lines.Line2D)
# Default alpha for `from_cv_results`
        assert line.get_alpha() == 0.5
if isinstance(curve_kwargs, list):
# Each individual curve labelled
assert line.get_label() == f"AUC = {display.roc_auc[idx]:.2f}"
else:
# Single aggregate label
assert line.get_label() == aggregate_expected_labels[idx]
@pytest.mark.parametrize("roc_auc", [[1.0, 1.0, 1.0], None])
@pytest.mark.parametrize(
"curve_kwargs",
[None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]],
)
@pytest.mark.parametrize("name", [None, "single", ["one", "two", "three"]])
def test_roc_curve_plot_legend_label(pyplot, data_binary, name, curve_kwargs, roc_auc):
"""Check legend label correct with all `curve_kwargs`, `name` combinations."""
fpr = [np.array([0, 0.5, 1]), np.array([0, 0.5, 1]), np.array([0, 0.5, 1])]
tpr = [np.array([0, 0.5, 1]), np.array([0, 0.5, 1]), np.array([0, 0.5, 1])]
if not isinstance(curve_kwargs, list) and isinstance(name, list):
with pytest.raises(ValueError, match="To avoid labeling individual curves"):
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot(
name=name, curve_kwargs=curve_kwargs
)
else:
display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot(
name=name, curve_kwargs=curve_kwargs
)
legend = display.ax_.get_legend()
if legend is None:
# No legend is created, exit test early
assert name is None
assert roc_auc is None
return
else:
legend_labels = [text.get_text() for text in legend.get_texts()]
if isinstance(curve_kwargs, list):
# Multiple labels in legend
assert len(legend_labels) == 3
for idx, label in enumerate(legend_labels):
if name is None:
expected_label = "AUC = 1.00" if roc_auc else None
assert label == expected_label
elif isinstance(name, str):
expected_label = "single (AUC = 1.00)" if roc_auc else "single"
assert label == expected_label
else:
# `name` is a list of different strings
expected_label = (
f"{name[idx]} (AUC = 1.00)" if roc_auc else f"{name[idx]}"
)
assert label == expected_label
else:
# Single label in legend
assert len(legend_labels) == 1
if name is None:
expected_label = "AUC = 1.00 +/- 0.00" if roc_auc else None
assert legend_labels[0] == expected_label
else:
# name is single string
expected_label = "single (AUC = 1.00 +/- 0.00)" if roc_auc else "single"
assert legend_labels[0] == expected_label
@pytest.mark.parametrize(
"curve_kwargs",
[None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]],
)
@pytest.mark.parametrize("name", [None, "single", ["one", "two", "three"]])
def test_roc_curve_from_cv_results_legend_label(
pyplot, data_binary, name, curve_kwargs
):
"""Check legend label correct with all `curve_kwargs`, `name` combinations."""
X, y = data_binary
n_cv = 3
cv_results = cross_validate(
LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True
)
if not isinstance(curve_kwargs, list) and isinstance(name, list):
with pytest.raises(ValueError, match="To avoid labeling individual curves"):
RocCurveDisplay.from_cv_results(
cv_results, X, y, name=name, curve_kwargs=curve_kwargs
)
else:
display = RocCurveDisplay.from_cv_results(
cv_results, X, y, name=name, curve_kwargs=curve_kwargs
)
legend = display.ax_.get_legend()
legend_labels = [text.get_text() for text in legend.get_texts()]
if isinstance(curve_kwargs, list):
# Multiple labels in legend
assert len(legend_labels) == 3
            expected_auc = ["0.62", "0.66", "0.55"]
            for idx, label in enumerate(legend_labels):
                if name is None:
                    assert label == f"AUC = {expected_auc[idx]}"
                elif isinstance(name, str):
                    assert label == f"single (AUC = {expected_auc[idx]})"
                else:
                    # `name` is a list of different strings
                    assert label == f"{name[idx]} (AUC = {expected_auc[idx]})"
else:
# Single label in legend
assert len(legend_labels) == 1
if name is None:
assert legend_labels[0] == "AUC = 0.61 +/- 0.05"
else:
# name is single string
assert legend_labels[0] == "single (AUC = 0.61 +/- 0.05)"
@pytest.mark.parametrize(
"curve_kwargs",
[None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]],
)
def test_roc_curve_from_cv_results_curve_kwargs(pyplot, data_binary, curve_kwargs):
"""Check line kwargs passed correctly in `from_cv_results`."""
X, y = data_binary
cv_results = cross_validate(
LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
)
display = RocCurveDisplay.from_cv_results(
cv_results, X, y, curve_kwargs=curve_kwargs
)
for idx, line in enumerate(display.line_):
color = line.get_color()
if curve_kwargs is None:
# Default color
assert color == "blue"
elif isinstance(curve_kwargs, Mapping):
# All curves "red"
assert color == "red"
else:
assert color == curve_kwargs[idx]["c"]
def test_roc_curve_from_cv_results_pos_label_inferred(pyplot, data_binary):
"""Check `pos_label` inferred correctly by `from_cv_results(pos_label=None)`."""
X, y = data_binary
cv_results = cross_validate(
LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
)
disp = RocCurveDisplay.from_cv_results(cv_results, X, y, pos_label=None)
# Should be `estimator.classes_[1]`
assert disp.pos_label == 1
def _check_chance_level(plot_chance_level, chance_level_kw, display):
"""Check chance level line and line styles correct."""
import matplotlib as mpl
if plot_chance_level:
assert isinstance(display.chance_level_, mpl.lines.Line2D)
assert tuple(display.chance_level_.get_xdata()) == (0, 1)
assert tuple(display.chance_level_.get_ydata()) == (0, 1)
else:
assert display.chance_level_ is None
# Checking for chance level line styles
if plot_chance_level and chance_level_kw is None:
assert display.chance_level_.get_color() == "k"
assert display.chance_level_.get_linestyle() == "--"
assert display.chance_level_.get_label() == "Chance level (AUC = 0.5)"
elif plot_chance_level:
if "c" in chance_level_kw:
assert display.chance_level_.get_color() == chance_level_kw["c"]
else:
assert display.chance_level_.get_color() == chance_level_kw["color"]
if "lw" in chance_level_kw:
assert display.chance_level_.get_linewidth() == chance_level_kw["lw"]
else:
assert display.chance_level_.get_linewidth() == chance_level_kw["linewidth"]
if "ls" in chance_level_kw:
assert display.chance_level_.get_linestyle() == chance_level_kw["ls"]
else:
assert display.chance_level_.get_linestyle() == chance_level_kw["linestyle"]
@pytest.mark.parametrize("plot_chance_level", [True, False])
@pytest.mark.parametrize("label", [None, "Test Label"])
@pytest.mark.parametrize(
"chance_level_kw",
[
None,
{"linewidth": 1, "color": "red", "linestyle": "-", "label": "DummyEstimator"},
{"lw": 1, "c": "red", "ls": "-", "label": "DummyEstimator"},
{"lw": 1, "color": "blue", "ls": "-", "label": None},
],
)
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_roc_curve_chance_level_line(
pyplot,
data_binary,
plot_chance_level,
chance_level_kw,
label,
constructor_name,
):
"""Check chance level plotting behavior of `from_predictions`, `from_estimator`."""
X, y = data_binary
lr = LogisticRegression()
lr.fit(X, y)
    y_score = lr.predict_proba(X)
y_score = y_score if y_score.ndim == 1 else y_score[:, 1]
if constructor_name == "from_estimator":
display = RocCurveDisplay.from_estimator(
lr,
X,
y,
curve_kwargs={"alpha": 0.8, "label": label},
plot_chance_level=plot_chance_level,
chance_level_kw=chance_level_kw,
)
else:
display = RocCurveDisplay.from_predictions(
y,
y_score,
curve_kwargs={"alpha": 0.8, "label": label},
plot_chance_level=plot_chance_level,
chance_level_kw=chance_level_kw,
)
import matplotlib as mpl
assert isinstance(display.line_, mpl.lines.Line2D)
assert display.line_.get_alpha() == 0.8
assert isinstance(display.ax_, mpl.axes.Axes)
assert isinstance(display.figure_, mpl.figure.Figure)
_check_chance_level(plot_chance_level, chance_level_kw, display)
# Checking for legend behaviour
if plot_chance_level and chance_level_kw is not None:
if label is not None or chance_level_kw.get("label") is not None:
legend = display.ax_.get_legend()
assert legend is not None # Legend should be present if any label is set
legend_labels = [text.get_text() for text in legend.get_texts()]
if label is not None:
assert label in legend_labels
if chance_level_kw.get("label") is not None:
assert chance_level_kw["label"] in legend_labels
else:
assert display.ax_.get_legend() is None
@pytest.mark.parametrize("plot_chance_level", [True, False])
@pytest.mark.parametrize(
"chance_level_kw",
[
None,
{"linewidth": 1, "color": "red", "linestyle": "-", "label": "DummyEstimator"},
{"lw": 1, "c": "red", "ls": "-", "label": "DummyEstimator"},
{"lw": 1, "color": "blue", "ls": "-", "label": None},
],
)
@pytest.mark.parametrize("curve_kwargs", [None, {"alpha": 0.8}])
def test_roc_curve_chance_level_line_from_cv_results(
pyplot,
data_binary,
plot_chance_level,
chance_level_kw,
curve_kwargs,
):
"""Check chance level plotting behavior with `from_cv_results`."""
X, y = data_binary
n_cv = 3
cv_results = cross_validate(
LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True
)
display = RocCurveDisplay.from_cv_results(
cv_results,
X,
y,
plot_chance_level=plot_chance_level,
chance_level_kwargs=chance_level_kw,
curve_kwargs=curve_kwargs,
)
import matplotlib as mpl
assert all(isinstance(line, mpl.lines.Line2D) for line in display.line_)
# Ensure both curve line kwargs passed correctly as well
if curve_kwargs:
assert all(line.get_alpha() == 0.8 for line in display.line_)
assert isinstance(display.ax_, mpl.axes.Axes)
assert isinstance(display.figure_, mpl.figure.Figure)
_check_chance_level(plot_chance_level, chance_level_kw, display)
legend = display.ax_.get_legend()
# There is always a legend, to indicate each 'Fold' curve
assert legend is not None
legend_labels = [text.get_text() for text in legend.get_texts()]
if plot_chance_level and chance_level_kw is not None:
if chance_level_kw.get("label") is not None:
assert chance_level_kw["label"] in legend_labels
else:
assert len(legend_labels) == 1
@pytest.mark.parametrize(
"clf",
[
LogisticRegression(),
make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(
make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression()
),
],
)
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_roc_curve_display_complex_pipeline(pyplot, data_binary, clf, constructor_name):
"""Check the behaviour with complex pipeline."""
X, y = data_binary
clf = clone(clf)
if constructor_name == "from_estimator":
with pytest.raises(NotFittedError):
RocCurveDisplay.from_estimator(clf, X, y)
clf.fit(X, y)
if constructor_name == "from_estimator":
display = RocCurveDisplay.from_estimator(clf, X, y)
name = clf.__class__.__name__
else:
display = RocCurveDisplay.from_predictions(y, y)
name = "Classifier"
assert name in display.line_.get_label()
assert display.name == name
@pytest.mark.parametrize(
"roc_auc, name, curve_kwargs, expected_labels",
[
([0.9, 0.8], None, None, ["AUC = 0.85 +/- 0.05", "_child1"]),
([0.9, 0.8], "Est name", None, ["Est name (AUC = 0.85 +/- 0.05)", "_child1"]),
(
[0.8, 0.7],
["fold1", "fold2"],
[{"c": "blue"}, {"c": "red"}],
["fold1 (AUC = 0.80)", "fold2 (AUC = 0.70)"],
),
(None, ["fold1", "fold2"], [{"c": "blue"}, {"c": "red"}], ["fold1", "fold2"]),
],
)
def test_roc_curve_display_default_labels(
pyplot, roc_auc, name, curve_kwargs, expected_labels
):
"""Check the default labels used in the display."""
fpr = [np.array([0, 0.5, 1]), np.array([0, 0.3, 1])]
tpr = [np.array([0, 0.5, 1]), np.array([0, 0.3, 1])]
disp = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, name=name).plot(
curve_kwargs=curve_kwargs
)
for idx, expected_label in enumerate(expected_labels):
assert disp.line_[idx].get_label() == expected_label
def _check_auc(display, constructor_name):
roc_auc_limit = 0.95679
roc_auc_limit_multi = [0.97007, 0.985915, 0.980952]
if constructor_name == "from_cv_results":
for idx, roc_auc in enumerate(display.roc_auc):
assert roc_auc == pytest.approx(roc_auc_limit_multi[idx])
else:
assert display.roc_auc == pytest.approx(roc_auc_limit)
assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize(
"constructor_name", ["from_estimator", "from_predictions", "from_cv_results"]
)
def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name):
# check that we can provide the positive label and display the proper
# statistics
X, y = load_breast_cancer(return_X_y=True)
# create a highly imbalanced version of the breast cancer dataset
idx_positive = np.flatnonzero(y == 1)
idx_negative = np.flatnonzero(y == 0)
idx_selected = np.hstack([idx_negative, idx_positive[:25]])
X, y = X[idx_selected], y[idx_selected]
X, y = shuffle(X, y, random_state=42)
# only use 2 features to make the problem even harder
X = X[:, :2]
y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object)
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
stratify=y,
random_state=0,
)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
cv_results = cross_validate(
LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
)
# Sanity check to be sure the positive class is `classes_[0]`
# Class imbalance ensures a large difference in prediction values between classes,
# allowing us to catch errors when we switch `pos_label`
assert classifier.classes_.tolist() == ["cancer", "not cancer"]
y_score = getattr(classifier, response_method)(X_test)
    # Select the probability column for the class of interest, or reverse the
    # sign of the decision function otherwise
y_score_cancer = -1 * y_score if y_score.ndim == 1 else y_score[:, 0]
y_score_not_cancer = y_score if y_score.ndim == 1 else y_score[:, 1]
pos_label = "cancer"
y_score = y_score_cancer
if constructor_name == "from_estimator":
display = RocCurveDisplay.from_estimator(
classifier,
X_test,
y_test,
pos_label=pos_label,
response_method=response_method,
)
elif constructor_name == "from_predictions":
display = RocCurveDisplay.from_predictions(
y_test,
y_score,
pos_label=pos_label,
)
else:
display = RocCurveDisplay.from_cv_results(
cv_results,
X,
y,
response_method=response_method,
pos_label=pos_label,
)
_check_auc(display, constructor_name)
pos_label = "not cancer"
y_score = y_score_not_cancer
if constructor_name == "from_estimator":
display = RocCurveDisplay.from_estimator(
classifier,
X_test,
y_test,
response_method=response_method,
pos_label=pos_label,
)
elif constructor_name == "from_predictions":
display = RocCurveDisplay.from_predictions(
y_test,
y_score,
pos_label=pos_label,
)
else:
display = RocCurveDisplay.from_cv_results(
cv_results,
X,
y,
response_method=response_method,
pos_label=pos_label,
)
_check_auc(display, constructor_name)
# TODO(1.9): remove
def test_y_score_and_y_pred_specified_error(pyplot):
"""1. Check that an error is raised when both y_score and y_pred are specified.
2. Check that a warning is raised when y_pred is specified.
"""
y_true = np.array([0, 1, 1, 0])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
y_pred = np.array([0.2, 0.3, 0.5, 0.1])
with pytest.raises(
ValueError, match="`y_pred` and `y_score` cannot be both specified"
):
RocCurveDisplay.from_predictions(y_true, y_score=y_score, y_pred=y_pred)
with pytest.warns(FutureWarning, match="y_pred was deprecated in 1.7"):
display_y_pred = RocCurveDisplay.from_predictions(y_true, y_pred=y_score)
    desired_fpr, desired_tpr, _ = roc_curve(y_true, y_score)
    assert_allclose(display_y_pred.fpr, desired_fpr)
    assert_allclose(display_y_pred.tpr, desired_tpr)
    display_y_score = RocCurveDisplay.from_predictions(y_true, y_score)
    assert_allclose(display_y_score.fpr, desired_fpr)
    assert_allclose(display_y_score.tpr, desired_tpr)
@pytest.mark.parametrize("despine", [True, False])
@pytest.mark.parametrize(
"constructor_name", ["from_estimator", "from_predictions", "from_cv_results"]
)
def test_plot_roc_curve_despine(pyplot, data_binary, despine, constructor_name):
# Check that the despine keyword is working correctly
X, y = data_binary
    lr = LogisticRegression().fit(X, y)
cv_results = cross_validate(
LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
)
y_pred = lr.decision_function(X)
    # safeguard for the if/else construction
assert constructor_name in ("from_estimator", "from_predictions", "from_cv_results")
if constructor_name == "from_estimator":
display = RocCurveDisplay.from_estimator(lr, X, y, despine=despine)
elif constructor_name == "from_predictions":
display = RocCurveDisplay.from_predictions(y, y_pred, despine=despine)
else:
display = RocCurveDisplay.from_cv_results(cv_results, X, y, despine=despine)
for s in ["top", "right"]:
assert display.ax_.spines[s].get_visible() is not despine
if despine:
for s in ["bottom", "left"]:
assert display.ax_.spines[s].get_bounds() == (0, 1)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,56 @@
"""Evaluation metrics for cluster analysis results.
- Supervised evaluation uses ground truth class values for each sample.
- Unsupervised evaluation does not use a ground truth and measures the "quality" of
  the model itself.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
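# A quick orientation (illustrative toy labels, not a substitute for the User
# Guide): supervised metrics compare two labelings and are typically
# permutation-invariant, e.g.
#   >>> from sklearn.metrics.cluster import adjusted_rand_score
#   >>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0])
#   1.0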
from sklearn.metrics.cluster._bicluster import consensus_score
from sklearn.metrics.cluster._supervised import (
adjusted_mutual_info_score,
adjusted_rand_score,
completeness_score,
contingency_matrix,
entropy,
expected_mutual_information,
fowlkes_mallows_score,
homogeneity_completeness_v_measure,
homogeneity_score,
mutual_info_score,
normalized_mutual_info_score,
pair_confusion_matrix,
rand_score,
v_measure_score,
)
from sklearn.metrics.cluster._unsupervised import (
calinski_harabasz_score,
davies_bouldin_score,
silhouette_samples,
silhouette_score,
)
__all__ = [
"adjusted_mutual_info_score",
"adjusted_rand_score",
"calinski_harabasz_score",
"completeness_score",
"consensus_score",
"contingency_matrix",
"davies_bouldin_score",
# TODO(1.10): Remove
"entropy",
"expected_mutual_information",
"fowlkes_mallows_score",
"homogeneity_completeness_v_measure",
"homogeneity_score",
"mutual_info_score",
"normalized_mutual_info_score",
"pair_confusion_matrix",
"rand_score",
"silhouette_samples",
"silhouette_score",
"v_measure_score",
]

View File

@@ -0,0 +1,114 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.utils._param_validation import StrOptions, validate_params
from sklearn.utils.validation import check_array, check_consistent_length
__all__ = ["consensus_score"]
def _check_rows_and_columns(a, b):
"""Unpacks the row and column arrays and checks their shape."""
check_consistent_length(*a)
check_consistent_length(*b)
checks = lambda x: check_array(x, ensure_2d=False)
a_rows, a_cols = map(checks, a)
b_rows, b_cols = map(checks, b)
return a_rows, a_cols, b_rows, b_cols
def _jaccard(a_rows, a_cols, b_rows, b_cols):
"""Jaccard coefficient on the elements of the two biclusters."""
intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum()
a_size = a_rows.sum() * a_cols.sum()
b_size = b_rows.sum() * b_cols.sum()
return intersection / (a_size + b_size - intersection)
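# Worked example (values mirror test_jaccard in the bicluster tests below):
# with a_rows = a_cols = [T, T, F, F] and b_rows = b_cols = [T, T, T, T], the
# intersection covers 2 * 2 = 4 cells, |a| = 2 * 2 = 4, |b| = 4 * 4 = 16,
# hence 4 / (4 + 16 - 4) = 0.25.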
def _pairwise_similarity(a, b, similarity):
"""Computes pairwise similarity matrix.
result[i, j] is the Jaccard coefficient of a's bicluster i and b's
bicluster j.
"""
a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b)
n_a = a_rows.shape[0]
n_b = b_rows.shape[0]
result = np.array(
[
[similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j]) for j in range(n_b)]
for i in range(n_a)
]
)
return result
@validate_params(
{
"a": [tuple],
"b": [tuple],
"similarity": [callable, StrOptions({"jaccard"})],
},
prefer_skip_nested_validation=True,
)
def consensus_score(a, b, *, similarity="jaccard"):
"""The similarity of two sets of biclusters.
Similarity between individual biclusters is computed. Then the best
matching between sets is found by solving a linear sum assignment problem,
using a modified Jonker-Volgenant algorithm.
The final score is the sum of similarities divided by the size of
the larger set.
Read more in the :ref:`User Guide <biclustering>`.
Parameters
----------
a : tuple (rows, columns)
Tuple of row and column indicators for a set of biclusters.
b : tuple (rows, columns)
Another set of biclusters like ``a``.
similarity : 'jaccard' or callable, default='jaccard'
May be the string "jaccard" to use the Jaccard coefficient, or
any function that takes four arguments, each of which is a 1d
indicator vector: (a_rows, a_columns, b_rows, b_columns).
Returns
-------
consensus_score : float
Consensus score, a non-negative value, sum of similarities
divided by size of larger set.
See Also
--------
scipy.optimize.linear_sum_assignment : Solve the linear sum assignment problem.
References
----------
    * Hochreiter, Bodenhofer, et al., 2010. `FABIA: factor analysis
for bicluster acquisition
<https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2881408/>`__.
Examples
--------
>>> from sklearn.metrics import consensus_score
>>> a = ([[True, False], [False, True]], [[False, True], [True, False]])
>>> b = ([[False, True], [True, False]], [[True, False], [False, True]])
>>> consensus_score(a, b, similarity='jaccard')
1.0
"""
if similarity == "jaccard":
similarity = _jaccard
matrix = _pairwise_similarity(a, b, similarity)
row_indices, col_indices = linear_sum_assignment(1.0 - matrix)
n_a = len(a[0])
n_b = len(b[0])
return float(matrix[row_indices, col_indices].sum() / max(n_a, n_b))
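# Worked example: if b contains 2 of the 3 biclusters in a (two matched pairs
# with similarity 1.0, the third bicluster unmatched), the score is
# (1.0 + 1.0) / max(3, 2) = 2 / 3; see test_consensus_score_issue2445 in the
# bicluster tests below.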

View File

@@ -0,0 +1,69 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from libc.math cimport exp, lgamma
from sklearn.utils._typedefs cimport float64_t, int64_t
import numpy as np
from scipy.special import gammaln
def expected_mutual_information(contingency, int64_t n_samples):
"""Calculate the expected mutual information for two labelings."""
cdef:
float64_t emi = 0
int64_t n_rows, n_cols
float64_t term2, term3, gln
int64_t[::1] a_view, b_view
float64_t[::1] term1
float64_t[::1] gln_a, gln_b, gln_Na, gln_Nb, gln_Nnij, log_Nnij
float64_t[::1] log_a, log_b
Py_ssize_t i, j, nij
int64_t start, end
n_rows, n_cols = contingency.shape
a = np.ravel(contingency.sum(axis=1).astype(np.int64, copy=False))
b = np.ravel(contingency.sum(axis=0).astype(np.int64, copy=False))
a_view = a
b_view = b
# any labelling with zero entropy implies EMI = 0
if a.size == 1 or b.size == 1:
return 0.0
    # There are three major terms in the EMI equation, which are multiplied
    # together and then summed over varying nij values.
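    # For reference, the quantity accumulated below is
    #   EMI = sum_i sum_j sum_nij (nij / N) * log(N * nij / (a_i * b_j))
    #                              * P(nij | a_i, b_j, N)
    # where N = n_samples, a_i and b_j are the marginals, and P is the
    # hypergeometric probability of the cell count nij, evaluated in log
    # space (term3 = exp(gln)) to avoid overflow.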
# While nijs[0] will never be used, having it simplifies the indexing.
nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float')
    nijs[0] = 1  # Stops divide by zero warnings. As it's not used, this is no issue.
# term1 is nij / N
term1 = nijs / n_samples
# term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b)
log_a = np.log(a)
log_b = np.log(b)
# term2 uses log(N * nij) = log(N) + log(nij)
log_Nnij = np.log(n_samples) + np.log(nijs)
    # term3 is large and involves many factorials. Calculate these in log
    # space to avoid overflow.
gln_a = gammaln(a + 1)
gln_b = gammaln(b + 1)
gln_Na = gammaln(n_samples - a + 1)
gln_Nb = gammaln(n_samples - b + 1)
gln_Nnij = gammaln(nijs + 1) + gammaln(n_samples + 1)
# emi itself is a summation over the various values.
for i in range(n_rows):
for j in range(n_cols):
start = max(1, a_view[i] - n_samples + b_view[j])
end = min(a_view[i], b_view[j]) + 1
for nij in range(start, end):
term2 = log_Nnij[nij] - log_a[i] - log_b[j]
# Numerators are positive, denominators are negative.
gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j]
- gln_Nnij[nij] - lgamma(a_view[i] - nij + 1)
- lgamma(b_view[j] - nij + 1)
- lgamma(n_samples - a_view[i] - b_view[j] + nij + 1))
term3 = exp(gln)
emi += (term1[nij] * term2 * term3)
return emi

View File

@@ -0,0 +1,487 @@
"""Unsupervised evaluation metrics."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import functools
from numbers import Integral
import numpy as np
from scipy.sparse import issparse
from sklearn.externals.array_api_compat import is_numpy_array
from sklearn.metrics.pairwise import (
_VALID_METRICS,
pairwise_distances,
pairwise_distances_chunked,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import _safe_indexing, check_random_state, check_X_y
from sklearn.utils._array_api import (
_average,
_convert_to_numpy,
_is_numpy_namespace,
_max_precision_float_dtype,
get_namespace_and_device,
xpx,
)
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
def check_number_of_labels(n_labels, n_samples):
"""Check that number of labels are valid.
Parameters
----------
n_labels : int
Number of labels.
n_samples : int
Number of samples.
"""
if not 1 < n_labels < n_samples:
raise ValueError(
"Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
% n_labels
)
@validate_params(
{
"X": ["array-like", "sparse matrix"],
"labels": ["array-like"],
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
"sample_size": [Interval(Integral, 1, None, closed="left"), None],
"random_state": ["random_state"],
},
prefer_skip_nested_validation=True,
)
def silhouette_score(
X, labels, *, metric="euclidean", sample_size=None, random_state=None, **kwds
):
"""Compute the mean Silhouette Coefficient of all samples.
The Silhouette Coefficient is calculated using the mean intra-cluster
distance (``a``) and the mean nearest-cluster distance (``b``) for each
sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
b)``. To clarify, ``b`` is the distance between a sample and the nearest
cluster that the sample is not a part of.
    Note that the Silhouette Coefficient is only defined if the number of labels
    is ``2 <= n_labels <= n_samples - 1``.
This function returns the mean Silhouette Coefficient over all samples.
To obtain the values for each sample, use :func:`silhouette_samples`.
The best value is 1 and the worst value is -1. Values near 0 indicate
overlapping clusters. Negative values generally indicate that a sample has
been assigned to the wrong cluster, as a different cluster is more similar.
Read more in the :ref:`User Guide <silhouette_coefficient>`.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
"precomputed" or (n_samples_a, n_features) otherwise
An array of pairwise distances between samples, or a feature array.
labels : array-like of shape (n_samples,)
Predicted labels for each sample.
metric : str or callable, default='euclidean'
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by :func:`~sklearn.metrics.pairwise_distances`. If ``X`` is
the distance array itself, use ``metric="precomputed"``.
sample_size : int, default=None
The size of the sample to use when computing the Silhouette Coefficient
on a random subset of the data.
If ``sample_size is None``, no sampling is used.
random_state : int, RandomState instance or None, default=None
Determines random number generation for selecting a subset of samples.
Used when ``sample_size is not None``.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
**kwds : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a scipy.spatial.distance metric, the parameters are still
metric dependent. See the scipy docs for usage examples.
Returns
-------
silhouette : float
Mean Silhouette Coefficient for all samples.
References
----------
.. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
Interpretation and Validation of Cluster Analysis". Computational
and Applied Mathematics 20: 53-65.
<https://www.sciencedirect.com/science/article/pii/0377042787901257>`_
.. [2] `Wikipedia entry on the Silhouette Coefficient
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
Examples
--------
>>> from sklearn.datasets import make_blobs
>>> from sklearn.cluster import KMeans
>>> from sklearn.metrics import silhouette_score
>>> X, y = make_blobs(random_state=42)
>>> kmeans = KMeans(n_clusters=2, random_state=42)
>>> silhouette_score(X, kmeans.fit_predict(X))
0.49...
"""
if sample_size is not None:
X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"])
random_state = check_random_state(random_state)
indices = random_state.permutation(X.shape[0])[:sample_size]
if metric == "precomputed":
X, labels = X[indices].T[indices].T, labels[indices]
else:
X, labels = X[indices], labels[indices]
return float(np.mean(silhouette_samples(X, labels, metric=metric, **kwds)))
def _silhouette_reduce(D_chunk, start, labels, label_freqs):
"""Accumulate silhouette statistics for vertical chunk of X.
Parameters
----------
D_chunk : {array-like, sparse matrix} of shape (n_chunk_samples, n_samples)
Precomputed distances for a chunk. If a sparse matrix is provided,
only CSR format is accepted.
start : int
First index in the chunk.
labels : array-like of shape (n_samples,)
Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.
label_freqs : array-like
Distribution of cluster labels in ``labels``.
"""
n_chunk_samples = D_chunk.shape[0]
# accumulate distances from each sample to each cluster
cluster_distances = np.zeros(
(n_chunk_samples, len(label_freqs)), dtype=D_chunk.dtype
)
if issparse(D_chunk):
if D_chunk.format != "csr":
raise TypeError(
"Expected CSR matrix. Please pass sparse matrix in CSR format."
)
for i in range(n_chunk_samples):
indptr = D_chunk.indptr
indices = D_chunk.indices[indptr[i] : indptr[i + 1]]
sample_weights = D_chunk.data[indptr[i] : indptr[i + 1]]
sample_labels = np.take(labels, indices)
cluster_distances[i] += np.bincount(
sample_labels, weights=sample_weights, minlength=len(label_freqs)
)
else:
for i in range(n_chunk_samples):
sample_weights = D_chunk[i]
sample_labels = labels
cluster_distances[i] += np.bincount(
sample_labels, weights=sample_weights, minlength=len(label_freqs)
)
# intra_index selects intra-cluster distances within cluster_distances
end = start + n_chunk_samples
intra_index = (np.arange(n_chunk_samples), labels[start:end])
# intra_cluster_distances are averaged over cluster size outside this function
intra_cluster_distances = cluster_distances[intra_index]
# of the remaining distances we normalise and extract the minimum
cluster_distances[intra_index] = np.inf
cluster_distances /= label_freqs
inter_cluster_distances = cluster_distances.min(axis=1)
return intra_cluster_distances, inter_cluster_distances
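# For intuition, a toy trace of the reduction above (illustrative values
# only): with labels = [0, 0, 1], label_freqs = [2, 1] and the one-row chunk
# D_chunk = [[0.0, 1.0, 4.0]] starting at index 0, bincount accumulates
# cluster_distances = [[1.0, 4.0]]; the intra-cluster sum is 1.0, and after
# masking it with inf and dividing by label_freqs, the nearest other cluster
# is at mean distance 4.0.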
@validate_params(
{
"X": ["array-like", "sparse matrix"],
"labels": ["array-like"],
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
},
prefer_skip_nested_validation=True,
)
def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
"""Compute the Silhouette Coefficient for each sample.
The Silhouette Coefficient is a measure of how well samples are clustered
with samples that are similar to themselves. Clustering models with a high
Silhouette Coefficient are said to be dense, where samples in the same
cluster are similar to each other, and well separated, where samples in
different clusters are not very similar to each other.
The Silhouette Coefficient is calculated using the mean intra-cluster
distance (``a``) and the mean nearest-cluster distance (``b``) for each
sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
b)``.
    Note that the Silhouette Coefficient is only defined if the number of labels
    is ``2 <= n_labels <= n_samples - 1``.
This function returns the Silhouette Coefficient for each sample.
The best value is 1 and the worst value is -1. Values near 0 indicate
overlapping clusters.
Read more in the :ref:`User Guide <silhouette_coefficient>`.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
"precomputed" or (n_samples_a, n_features) otherwise
An array of pairwise distances between samples, or a feature array. If
        a sparse matrix is provided, CSR format should be favoured to avoid
        an additional copy.
labels : array-like of shape (n_samples,)
Label values for each sample.
metric : str or callable, default='euclidean'
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by :func:`~sklearn.metrics.pairwise_distances`.
If ``X`` is the distance array itself, use "precomputed" as the metric.
Precomputed distance matrices must have 0 along the diagonal.
**kwds : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a ``scipy.spatial.distance`` metric, the parameters are still
metric dependent. See the scipy docs for usage examples.
Returns
-------
silhouette : array-like of shape (n_samples,)
Silhouette Coefficients for each sample.
References
----------
.. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
Interpretation and Validation of Cluster Analysis". Computational
and Applied Mathematics 20: 53-65.
<https://www.sciencedirect.com/science/article/pii/0377042787901257>`_
.. [2] `Wikipedia entry on the Silhouette Coefficient
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
Examples
--------
>>> from sklearn.metrics import silhouette_samples
>>> from sklearn.datasets import make_blobs
>>> from sklearn.cluster import KMeans
>>> X, y = make_blobs(n_samples=50, random_state=42)
>>> kmeans = KMeans(n_clusters=3, random_state=42)
>>> labels = kmeans.fit_predict(X)
>>> silhouette_samples(X, labels)
array([...])
"""
X, labels = check_X_y(X, labels, accept_sparse=["csr"])
# Check for non-zero diagonal entries in precomputed distance matrix
if metric == "precomputed":
error_msg = ValueError(
"The precomputed distance matrix contains non-zero "
"elements on the diagonal. Use np.fill_diagonal(X, 0)."
)
if X.dtype.kind == "f":
atol = np.finfo(X.dtype).eps * 100
if np.any(np.abs(X.diagonal()) > atol):
raise error_msg
elif np.any(X.diagonal() != 0): # integral dtype
raise error_msg
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples = len(labels)
label_freqs = np.bincount(labels)
check_number_of_labels(len(le.classes_), n_samples)
kwds["metric"] = metric
reduce_func = functools.partial(
_silhouette_reduce, labels=labels, label_freqs=label_freqs
)
results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds))
intra_clust_dists, inter_clust_dists = results
intra_clust_dists = np.concatenate(intra_clust_dists)
inter_clust_dists = np.concatenate(inter_clust_dists)
denom = (label_freqs - 1).take(labels, mode="clip")
with np.errstate(divide="ignore", invalid="ignore"):
intra_clust_dists /= denom
sil_samples = inter_clust_dists - intra_clust_dists
with np.errstate(divide="ignore", invalid="ignore"):
sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
# nan values are for clusters of size 1, and should be 0
return xpx.nan_to_num(sil_samples)
@validate_params(
{
"X": ["array-like"],
"labels": ["array-like"],
},
prefer_skip_nested_validation=True,
)
def calinski_harabasz_score(X, labels):
"""Compute the Calinski and Harabasz score.
It is also known as the Variance Ratio Criterion.
    The score is defined as the ratio of the between-cluster dispersion to the
    within-cluster dispersion.
Read more in the :ref:`User Guide <calinski_harabasz_index>`.
Parameters
----------
X : array-like of shape (n_samples, n_features)
A list of ``n_features``-dimensional data points. Each row corresponds
to a single data point.
labels : array-like of shape (n_samples,)
Predicted labels for each sample.
Returns
-------
score : float
The resulting Calinski-Harabasz score.
References
----------
.. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
analysis". Communications in Statistics
<https://www.tandfonline.com/doi/abs/10.1080/03610927408827101>`_
Examples
--------
>>> from sklearn.datasets import make_blobs
>>> from sklearn.cluster import KMeans
>>> from sklearn.metrics import calinski_harabasz_score
>>> X, _ = make_blobs(random_state=0)
    >>> kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
>>> calinski_harabasz_score(X, kmeans.labels_)
114.8...
"""
xp, _, device_ = get_namespace_and_device(X, labels)
if _is_numpy_namespace(xp) and not is_numpy_array(X):
# This is required to handle the case where `array_api_dispatch` is False but
# we are still dealing with `X` as a non-NumPy array e.g. a PyTorch tensor.
X = _convert_to_numpy(X, xp=xp)
else:
X = xp.astype(X, _max_precision_float_dtype(xp, device_), copy=False)
X, labels = check_X_y(X, labels)
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples, _ = X.shape
n_labels = le.classes_.shape[0]
check_number_of_labels(n_labels, n_samples)
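    # The score returned below is
    #   (extra_disp / intra_disp) * (n_samples - n_labels) / (n_labels - 1),
    # i.e. between-cluster over within-cluster dispersion, rescaled by the
    # respective degrees of freedom.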
extra_disp, intra_disp = 0.0, 0.0
mean = xp.mean(X, axis=0)
for k in range(n_labels):
cluster_k = X[labels == k]
mean_k = xp.mean(cluster_k, axis=0)
extra_disp += cluster_k.shape[0] * xp.sum((mean_k - mean) ** 2)
intra_disp += xp.sum((cluster_k - mean_k) ** 2)
return float(
1.0
if intra_disp == 0.0
else extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1.0))
)
@validate_params(
{
"X": ["array-like"],
"labels": ["array-like"],
},
prefer_skip_nested_validation=True,
)
def davies_bouldin_score(X, labels):
"""Compute the Davies-Bouldin score.
The score is defined as the average similarity measure of each cluster with
its most similar cluster, where similarity is the ratio of within-cluster
distances to between-cluster distances. Thus, clusters which are farther
apart and less dispersed will result in a better score.
The minimum score is zero, with lower values indicating better clustering.
Read more in the :ref:`User Guide <davies-bouldin_index>`.
.. versionadded:: 0.20
Parameters
----------
X : array-like of shape (n_samples, n_features)
A list of ``n_features``-dimensional data points. Each row corresponds
to a single data point.
labels : array-like of shape (n_samples,)
Predicted labels for each sample.
Returns
-------
    score : float
The resulting Davies-Bouldin score.
References
----------
.. [1] Davies, David L.; Bouldin, Donald W. (1979).
`"A Cluster Separation Measure"
<https://ieeexplore.ieee.org/document/4766909>`__.
IEEE Transactions on Pattern Analysis and Machine Intelligence.
PAMI-1 (2): 224-227
Examples
--------
>>> from sklearn.metrics import davies_bouldin_score
>>> X = [[0, 1], [1, 1], [3, 4]]
>>> labels = [0, 0, 1]
>>> davies_bouldin_score(X, labels)
0.12...
"""
xp, _, device_ = get_namespace_and_device(X, labels)
X, labels = check_X_y(X, labels)
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples, _ = X.shape
n_labels = le.classes_.shape[0]
check_number_of_labels(n_labels, n_samples)
dtype = _max_precision_float_dtype(xp, device_)
intra_dists = xp.zeros(n_labels, dtype=dtype, device=device_)
centroids = xp.zeros((n_labels, X.shape[1]), dtype=dtype, device=device_)
for k in range(n_labels):
cluster_k = _safe_indexing(X, xp.nonzero(labels == k)[0])
centroid = _average(cluster_k, axis=0, xp=xp)
centroids[k, ...] = centroid
intra_dists[k] = _average(
pairwise_distances(cluster_k, xp.stack([centroid])), xp=xp
)
centroid_distances = pairwise_distances(centroids)
zero = xp.asarray(0.0, device=device_, dtype=dtype)
if xp.all(xpx.isclose(intra_dists, zero)) or xp.all(
xpx.isclose(centroid_distances, zero)
):
return 0.0
centroid_distances[centroid_distances == 0] = xp.inf
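    # Davies-Bouldin: with s_k the mean distance of cluster k's samples to
    # their centroid (intra_dists) and d_ij the distance between centroids,
    # compute R_ij = (s_i + s_j) / d_ij and average the per-cluster maxima.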
combined_intra_dists = intra_dists[:, None] + intra_dists
scores = xp.max(combined_intra_dists / centroid_distances, axis=1)
return float(_average(scores, xp=xp))

View File

@@ -0,0 +1,6 @@
py.extension_module(
'_expected_mutual_info_fast',
cython_gen.process('_expected_mutual_info_fast.pyx'),
subdir: 'sklearn/metrics/cluster',
install: true
)

View File

@@ -0,0 +1,56 @@
"""Testing for bicluster metrics module"""
import numpy as np
from sklearn.metrics import consensus_score
from sklearn.metrics.cluster._bicluster import _jaccard
from sklearn.utils._testing import assert_almost_equal
def test_jaccard():
a1 = np.array([True, True, False, False])
a2 = np.array([True, True, True, True])
a3 = np.array([False, True, True, False])
a4 = np.array([False, False, True, True])
assert _jaccard(a1, a1, a1, a1) == 1
assert _jaccard(a1, a1, a2, a2) == 0.25
assert _jaccard(a1, a1, a3, a3) == 1.0 / 7
assert _jaccard(a1, a1, a4, a4) == 0
def test_consensus_score():
a = [[True, True, False, False], [False, False, True, True]]
b = a[::-1]
assert consensus_score((a, a), (a, a)) == 1
assert consensus_score((a, a), (b, b)) == 1
assert consensus_score((a, b), (a, b)) == 1
assert consensus_score((a, b), (b, a)) == 1
assert consensus_score((a, a), (b, a)) == 0
assert consensus_score((a, a), (a, b)) == 0
assert consensus_score((b, b), (a, b)) == 0
assert consensus_score((b, b), (b, a)) == 0
def test_consensus_score_issue2445():
"""Different number of biclusters in A and B"""
a_rows = np.array(
[
[True, True, False, False],
[False, False, True, True],
[False, False, False, True],
]
)
a_cols = np.array(
[
[True, True, False, False],
[False, False, True, True],
[False, False, False, True],
]
)
idx = [0, 2]
s = consensus_score((a_rows, a_cols), (a_rows[idx], a_cols[idx]))
# B contains 2 of the 3 biclusters in A, so score should be 2/3
assert_almost_equal(s, 2.0 / 3.0)

View File

@@ -0,0 +1,279 @@
from functools import partial
from itertools import chain
import numpy as np
import pytest
from sklearn.metrics.cluster import (
adjusted_mutual_info_score,
adjusted_rand_score,
calinski_harabasz_score,
completeness_score,
davies_bouldin_score,
fowlkes_mallows_score,
homogeneity_score,
mutual_info_score,
normalized_mutual_info_score,
rand_score,
silhouette_score,
v_measure_score,
)
from sklearn.metrics.tests.test_common import check_array_api_metric
from sklearn.utils._array_api import (
_get_namespace_device_dtype_ids,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import assert_allclose
# Dictionaries of metrics
# ------------------------
# The goal of these dictionaries is to have an easy way to call a
# particular metric and associate a name with each function:
# - SUPERVISED_METRICS: all supervised cluster metrics - (when given a
# ground truth value)
# - UNSUPERVISED_METRICS: all unsupervised cluster metrics
#
# Those dictionaries will be used to test systematically some invariance
# properties, e.g. invariance to several input layouts.
#
SUPERVISED_METRICS = {
"adjusted_mutual_info_score": adjusted_mutual_info_score,
"adjusted_rand_score": adjusted_rand_score,
"rand_score": rand_score,
"completeness_score": completeness_score,
"homogeneity_score": homogeneity_score,
"mutual_info_score": mutual_info_score,
"normalized_mutual_info_score": normalized_mutual_info_score,
"v_measure_score": v_measure_score,
"fowlkes_mallows_score": fowlkes_mallows_score,
}
UNSUPERVISED_METRICS = {
"silhouette_score": silhouette_score,
"silhouette_manhattan": partial(silhouette_score, metric="manhattan"),
"calinski_harabasz_score": calinski_harabasz_score,
"davies_bouldin_score": davies_bouldin_score,
}
# Lists of metrics with common properties
# ---------------------------------------
# Lists of metrics with common properties are used to test systematically some
# functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics
# that are symmetric with respect to their input arguments y_true and y_pred.
#
# --------------------------------------------------------------------
# Symmetric with respect to their input arguments y_true and y_pred.
# Symmetry only applies to supervised metrics.
SYMMETRIC_METRICS = [
"adjusted_rand_score",
"rand_score",
"v_measure_score",
"mutual_info_score",
"adjusted_mutual_info_score",
"normalized_mutual_info_score",
"fowlkes_mallows_score",
]
NON_SYMMETRIC_METRICS = ["homogeneity_score", "completeness_score"]
# Metrics whose upper bound is 1
NORMALIZED_METRICS = [
"adjusted_rand_score",
"rand_score",
"homogeneity_score",
"completeness_score",
"v_measure_score",
"adjusted_mutual_info_score",
"fowlkes_mallows_score",
"normalized_mutual_info_score",
]
rng = np.random.RandomState(0)
y1 = rng.randint(3, size=30)
y2 = rng.randint(3, size=30)
def test_symmetric_non_symmetric_union():
assert sorted(SYMMETRIC_METRICS + NON_SYMMETRIC_METRICS) == sorted(
SUPERVISED_METRICS
)
@pytest.mark.parametrize(
"metric_name, y1, y2", [(name, y1, y2) for name in SYMMETRIC_METRICS]
)
def test_symmetry(metric_name, y1, y2):
metric = SUPERVISED_METRICS[metric_name]
assert metric(y1, y2) == pytest.approx(metric(y2, y1))
@pytest.mark.parametrize(
"metric_name, y1, y2", [(name, y1, y2) for name in NON_SYMMETRIC_METRICS]
)
def test_non_symmetry(metric_name, y1, y2):
metric = SUPERVISED_METRICS[metric_name]
assert metric(y1, y2) != pytest.approx(metric(y2, y1))
@pytest.mark.parametrize("metric_name", NORMALIZED_METRICS)
def test_normalized_output(metric_name):
upper_bound_1 = [0, 0, 0, 1, 1, 1]
upper_bound_2 = [0, 0, 0, 1, 1, 1]
metric = SUPERVISED_METRICS[metric_name]
assert metric([0, 0, 0, 1, 1], [0, 0, 0, 1, 2]) > 0.0
assert metric([0, 0, 1, 1, 2], [0, 0, 1, 1, 1]) > 0.0
assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0
assert metric(upper_bound_1, upper_bound_2) == pytest.approx(1.0)
lower_bound_1 = [0, 0, 0, 0, 0, 0]
lower_bound_2 = [0, 1, 2, 3, 4, 5]
score = np.array(
[metric(lower_bound_1, lower_bound_2), metric(lower_bound_2, lower_bound_1)]
)
assert not (score < 0).any()
@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
def test_permute_labels(metric_name):
    # All clustering metric scores should be invariant to permutations of the
    # labels, e.g. when 0 and 1 are exchanged.
y_label = np.array([0, 0, 0, 1, 1, 0, 1])
y_pred = np.array([1, 0, 1, 0, 1, 1, 0])
if metric_name in SUPERVISED_METRICS:
metric = SUPERVISED_METRICS[metric_name]
score_1 = metric(y_pred, y_label)
assert_allclose(score_1, metric(1 - y_pred, y_label))
assert_allclose(score_1, metric(1 - y_pred, 1 - y_label))
assert_allclose(score_1, metric(y_pred, 1 - y_label))
else:
metric = UNSUPERVISED_METRICS[metric_name]
X = np.random.randint(10, size=(7, 10))
score_1 = metric(X, y_pred)
assert_allclose(score_1, metric(X, 1 - y_pred))
@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
# For all clustering metrics, input parameters can be arrays or lists,
# with positive, negative or string labels
def test_format_invariance(metric_name):
y_true = [0, 0, 0, 0, 1, 1, 1, 1]
y_pred = [0, 1, 2, 3, 4, 5, 6, 7]
def generate_formats(y):
y = np.array(y)
yield y, "array of ints"
yield y.tolist(), "list of ints"
yield [str(x) + "-a" for x in y.tolist()], "list of strs"
yield (
np.array([str(x) + "-a" for x in y.tolist()], dtype=object),
"array of strs",
)
yield y - 1, "including negative ints"
yield y + 1, "strictly positive ints"
if metric_name in SUPERVISED_METRICS:
metric = SUPERVISED_METRICS[metric_name]
score_1 = metric(y_true, y_pred)
y_true_gen = generate_formats(y_true)
y_pred_gen = generate_formats(y_pred)
for (y_true_fmt, fmt_name), (y_pred_fmt, _) in zip(y_true_gen, y_pred_gen):
assert score_1 == metric(y_true_fmt, y_pred_fmt)
else:
metric = UNSUPERVISED_METRICS[metric_name]
X = np.random.randint(10, size=(8, 10))
score_1 = metric(X, y_true)
assert score_1 == metric(X.astype(float), y_true)
y_true_gen = generate_formats(y_true)
for y_true_fmt, fmt_name in y_true_gen:
assert score_1 == metric(X, y_true_fmt)
@pytest.mark.parametrize("metric", SUPERVISED_METRICS.values())
def test_single_sample(metric):
    # only the supervised metrics support single-sample input
for i, j in [(0, 0), (0, 1), (1, 0), (1, 1)]:
metric([i], [j])
@pytest.mark.parametrize(
"metric_name, metric_func", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS).items()
)
def test_inf_nan_input(metric_name, metric_func):
if metric_name in SUPERVISED_METRICS:
invalids = [
([0, 1], [np.inf, np.inf]),
([0, 1], [np.nan, np.nan]),
([0, 1], [np.nan, np.inf]),
]
else:
X = np.random.randint(10, size=(2, 10))
invalids = [(X, [np.inf, np.inf]), (X, [np.nan, np.nan]), (X, [np.nan, np.inf])]
    for args in invalids:
        with pytest.raises(ValueError, match=r"contains (NaN|infinity)"):
            metric_func(*args)
@pytest.mark.parametrize("name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
def test_returned_value_consistency(name):
"""Ensure that the returned values of all metrics are consistent.
It can only be a float. It should not be a numpy float64 or float32.
"""
rng = np.random.RandomState(0)
X = rng.randint(10, size=(20, 10))
labels_true = rng.randint(0, 3, size=(20,))
labels_pred = rng.randint(0, 3, size=(20,))
if name in SUPERVISED_METRICS:
metric = SUPERVISED_METRICS[name]
score = metric(labels_true, labels_pred)
else:
metric = UNSUPERVISED_METRICS[name]
score = metric(X, labels_pred)
assert isinstance(score, float)
assert not isinstance(score, (np.float64, np.float32))
def check_array_api_unsupervised_metric(metric, array_namespace, device, dtype_name):
y_pred = np.array([1, 0, 1, 0, 1, 1, 0])
X = np.random.randint(10, size=(7, 10))
check_array_api_metric(
metric,
array_namespace,
device,
dtype_name,
a_np=X,
b_np=y_pred,
)
array_api_metric_checkers = {
calinski_harabasz_score: [
check_array_api_unsupervised_metric,
],
davies_bouldin_score: [
check_array_api_unsupervised_metric,
],
}
def yield_metric_checker_combinations(metric_checkers=array_api_metric_checkers):
for metric, checkers in metric_checkers.items():
for checker in checkers:
yield metric, checker
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize("metric, check_func", yield_metric_checker_combinations())
def test_array_api_compliance(metric, array_namespace, device, dtype_name, check_func):
check_func(metric, array_namespace, device, dtype_name)

View File

@@ -0,0 +1,532 @@
import warnings
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal
from sklearn.base import config_context
from sklearn.metrics.cluster import (
adjusted_mutual_info_score,
adjusted_rand_score,
completeness_score,
contingency_matrix,
expected_mutual_information,
fowlkes_mallows_score,
homogeneity_completeness_v_measure,
homogeneity_score,
mutual_info_score,
normalized_mutual_info_score,
pair_confusion_matrix,
rand_score,
v_measure_score,
)
from sklearn.metrics.cluster._supervised import (
_entropy,
_generalized_average,
check_clusterings,
entropy,
)
from sklearn.utils import assert_all_finite
from sklearn.utils._array_api import (
_get_namespace_device_dtype_ids,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import _array_api_for_tests, assert_almost_equal
score_funcs = [
adjusted_rand_score,
rand_score,
homogeneity_score,
completeness_score,
v_measure_score,
adjusted_mutual_info_score,
normalized_mutual_info_score,
]
@pytest.mark.parametrize("score_func", score_funcs)
def test_error_messages_on_wrong_input(score_func):
expected = r"Found input variables with inconsistent numbers of samples: \[2, 3\]"
with pytest.raises(ValueError, match=expected):
score_func([0, 1], [1, 1, 1])
expected = r"labels_true must be 1D: shape is \(2"
with pytest.raises(ValueError, match=expected):
score_func([[0, 1], [1, 0]], [1, 1, 1])
expected = r"labels_pred must be 1D: shape is \(2"
with pytest.raises(ValueError, match=expected):
score_func([0, 1, 0], [[1, 1], [0, 0]])
def test_generalized_average():
a, b = 1, 2
methods = ["min", "geometric", "arithmetic", "max"]
means = [_generalized_average(a, b, method) for method in methods]
assert means[0] <= means[1] <= means[2] <= means[3]
c, d = 12, 12
means = [_generalized_average(c, d, method) for method in methods]
assert means[0] == means[1] == means[2] == means[3]
@pytest.mark.parametrize("score_func", score_funcs)
def test_perfect_matches(score_func):
assert score_func([], []) == pytest.approx(1.0)
assert score_func([0], [1]) == pytest.approx(1.0)
assert score_func([0, 0, 0], [0, 0, 0]) == pytest.approx(1.0)
assert score_func([0, 1, 0], [42, 7, 42]) == pytest.approx(1.0)
assert score_func([0.0, 1.0, 0.0], [42.0, 7.0, 42.0]) == pytest.approx(1.0)
assert score_func([0.0, 1.0, 2.0], [42.0, 7.0, 2.0]) == pytest.approx(1.0)
assert score_func([0, 1, 2], [42, 7, 2]) == pytest.approx(1.0)
@pytest.mark.parametrize(
"score_func",
[
normalized_mutual_info_score,
adjusted_mutual_info_score,
],
)
@pytest.mark.parametrize("average_method", ["min", "geometric", "arithmetic", "max"])
def test_perfect_matches_with_changing_means(score_func, average_method):
assert score_func([], [], average_method=average_method) == pytest.approx(1.0)
assert score_func([0], [1], average_method=average_method) == pytest.approx(1.0)
assert score_func(
[0, 0, 0], [0, 0, 0], average_method=average_method
) == pytest.approx(1.0)
assert score_func(
[0, 1, 0], [42, 7, 42], average_method=average_method
) == pytest.approx(1.0)
assert score_func(
[0.0, 1.0, 0.0], [42.0, 7.0, 42.0], average_method=average_method
) == pytest.approx(1.0)
assert score_func(
[0.0, 1.0, 2.0], [42.0, 7.0, 2.0], average_method=average_method
) == pytest.approx(1.0)
assert score_func(
[0, 1, 2], [42, 7, 2], average_method=average_method
) == pytest.approx(1.0)
# Non-regression tests for: https://github.com/scikit-learn/scikit-learn/issues/30950
assert score_func([0, 1], [0, 1], average_method=average_method) == pytest.approx(
1.0
)
assert score_func(
[0, 1, 2, 3], [0, 1, 2, 3], average_method=average_method
) == pytest.approx(1.0)
def test_homogeneous_but_not_complete_labeling():
# homogeneous but not complete clustering
h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 2, 2])
assert_almost_equal(h, 1.00, 2)
assert_almost_equal(c, 0.69, 2)
assert_almost_equal(v, 0.81, 2)
def test_complete_but_not_homogeneous_labeling():
# complete but not homogeneous clustering
h, c, v = homogeneity_completeness_v_measure([0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 1, 1])
assert_almost_equal(h, 0.58, 2)
assert_almost_equal(c, 1.00, 2)
assert_almost_equal(v, 0.73, 2)
def test_not_complete_and_not_homogeneous_labeling():
# neither complete nor homogeneous but not so bad either
h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
assert_almost_equal(h, 0.67, 2)
assert_almost_equal(c, 0.42, 2)
assert_almost_equal(v, 0.52, 2)
def test_beta_parameter():
# test for when beta passed to
# homogeneity_completeness_v_measure
# and v_measure_score
beta_test = 0.2
h_test = 0.67
c_test = 0.42
v_test = (1 + beta_test) * h_test * c_test / (beta_test * h_test + c_test)
h, c, v = homogeneity_completeness_v_measure(
[0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test
)
assert_almost_equal(h, h_test, 2)
assert_almost_equal(c, c_test, 2)
assert_almost_equal(v, v_test, 2)
v = v_measure_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test)
assert_almost_equal(v, v_test, 2)
def test_non_consecutive_labels():
# regression tests for labels with gaps
h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2])
assert_almost_equal(h, 0.67, 2)
assert_almost_equal(c, 0.42, 2)
assert_almost_equal(v, 0.52, 2)
h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
assert_almost_equal(h, 0.67, 2)
assert_almost_equal(c, 0.42, 2)
assert_almost_equal(v, 0.52, 2)
ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
assert_almost_equal(ari_1, 0.24, 2)
assert_almost_equal(ari_2, 0.24, 2)
ri_1 = rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
ri_2 = rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
assert_almost_equal(ri_1, 0.66, 2)
assert_almost_equal(ri_2, 0.66, 2)
def uniform_labelings_scores(score_func, n_samples, k_range, n_runs=10, seed=42):
# Compute score for random uniform cluster labelings
random_labels = np.random.RandomState(seed).randint
scores = np.zeros((len(k_range), n_runs))
for i, k in enumerate(k_range):
for j in range(n_runs):
labels_a = random_labels(low=0, high=k, size=n_samples)
labels_b = random_labels(low=0, high=k, size=n_samples)
scores[i, j] = score_func(labels_a, labels_b)
return scores
def test_adjustment_for_chance():
# Check that adjusted scores are almost zero on random labels
n_clusters_range = [2, 10, 50, 90]
n_samples = 100
n_runs = 10
scores = uniform_labelings_scores(
adjusted_rand_score, n_samples, n_clusters_range, n_runs
)
max_abs_scores = np.abs(scores).max(axis=1)
assert_array_almost_equal(max_abs_scores, [0.02, 0.03, 0.03, 0.02], 2)
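# Companion check (an illustrative addition, for contrast): the *unadjusted*
# Rand index on the same random labelings is far from zero and grows with the
# number of clusters, which is exactly the chance effect that
# adjusted_rand_score subtracts away.
def _unadjusted_rand_is_biased_sketch():
    scores = uniform_labelings_scores(rand_score, 100, [2, 10, 50, 90])
    return scores.mean(axis=1)  # increases toward 1.0 as k grows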
def test_adjusted_mutual_info_score():
# Compute the Adjusted Mutual Information and test against known values
labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
# Mutual information
mi = mutual_info_score(labels_a, labels_b)
assert_almost_equal(mi, 0.41022, 5)
# with provided sparse contingency
C = contingency_matrix(labels_a, labels_b, sparse=True)
mi = mutual_info_score(labels_a, labels_b, contingency=C)
assert_almost_equal(mi, 0.41022, 5)
# with provided dense contingency
C = contingency_matrix(labels_a, labels_b)
mi = mutual_info_score(labels_a, labels_b, contingency=C)
assert_almost_equal(mi, 0.41022, 5)
# Expected mutual information
n_samples = C.sum()
emi = expected_mutual_information(C, n_samples)
assert_almost_equal(emi, 0.15042, 5)
# Adjusted mutual information
ami = adjusted_mutual_info_score(labels_a, labels_b)
assert_almost_equal(ami, 0.27821, 5)
ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
assert ami == pytest.approx(1.0)
# Test with a very large array
a110 = np.array([list(labels_a) * 110]).flatten()
b110 = np.array([list(labels_b) * 110]).flatten()
ami = adjusted_mutual_info_score(a110, b110)
assert_almost_equal(ami, 0.38, 2)
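# Illustrative sketch of the adjustment tested above, assuming the standard
# definition with the default "arithmetic" normalizer:
#     AMI = (MI - EMI) / (mean(H(a), H(b)) - EMI).
# Plugging in the values above with H(labels_a) ~= 1.0951 and
# H(labels_b) ~= 1.0734 gives (0.41022 - 0.15042) / (1.0843 - 0.15042),
# which is approximately 0.27821.
def _ami_from_parts_sketch(mi, emi, h_a, h_b):
    return (mi - emi) / ((h_a + h_b) / 2 - emi)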
def test_expected_mutual_info_overflow():
# Test for regression where contingency cell exceeds 2**16
# leading to overflow in np.outer, resulting in EMI > 1
assert expected_mutual_information(np.array([[70000]]), 70000) <= 1
def test_int_overflow_mutual_info_fowlkes_mallows_score():
    # Test integer overflow in mutual_info_score and fowlkes_mallows_score
x = np.array(
[1] * (52632 + 2529)
+ [2] * (14660 + 793)
+ [3] * (3271 + 204)
+ [4] * (814 + 39)
+ [5] * (316 + 20)
)
y = np.array(
[0] * 52632
+ [1] * 2529
+ [0] * 14660
+ [1] * 793
+ [0] * 3271
+ [1] * 204
+ [0] * 814
+ [1] * 39
+ [0] * 316
+ [1] * 20
)
assert_all_finite(mutual_info_score(x, y))
assert_all_finite(fowlkes_mallows_score(x, y))
# TODO(1.10): Remove
def test_public_entropy_deprecation():
with pytest.warns(FutureWarning, match="Function entropy is deprecated"):
entropy([0, 0, 42.0])
def test_entropy():
assert_almost_equal(_entropy([0, 0, 42.0]), 0.6365141, 5)
assert_almost_equal(_entropy([]), 1)
assert _entropy([1, 1, 1, 1]) == 0
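# Worked example for the first value above (an assumption about _entropy's
# definition, consistent with the asserted values): the Shannon entropy of the
# label distribution in nats. [0, 0, 42.] has counts (2, 1), so p = (2/3, 1/3)
# and H = -(2/3)ln(2/3) - (1/3)ln(1/3) ~= 0.6365141. The library's convention
# H([]) = 1 is a special case not reproduced by this sketch.
def _entropy_sketch(labels):
    import numpy as np

    _, counts = np.unique(np.asarray(labels), return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log(p)).sum())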
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
def test_entropy_array_api(array_namespace, device, dtype_name):
xp = _array_api_for_tests(array_namespace, device)
float_labels = xp.asarray(np.asarray([0, 0, 42.0], dtype=dtype_name), device=device)
empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device)
int_labels = xp.asarray([1, 1, 1, 1], device=device)
with config_context(array_api_dispatch=True):
assert _entropy(float_labels) == pytest.approx(0.6365141, abs=1e-5)
assert _entropy(empty_int32_labels) == 1
assert _entropy(int_labels) == 0
def test_contingency_matrix():
labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
C = contingency_matrix(labels_a, labels_b)
C2 = np.histogram2d(labels_a, labels_b, bins=(np.arange(1, 5), np.arange(1, 5)))[0]
assert_array_almost_equal(C, C2)
C = contingency_matrix(labels_a, labels_b, eps=0.1)
assert_array_almost_equal(C, C2 + 0.1)
def test_contingency_matrix_sparse():
labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
C = contingency_matrix(labels_a, labels_b)
C_sparse = contingency_matrix(labels_a, labels_b, sparse=True).toarray()
assert_array_almost_equal(C, C_sparse)
with pytest.raises(ValueError, match="Cannot set 'eps' when sparse=True"):
contingency_matrix(labels_a, labels_b, eps=1e-10, sparse=True)
def test_exactly_zero_info_score():
# Check numerical stability when information is exactly zero
for i in np.logspace(1, 4, 4).astype(int):
labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int))
assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0)
assert v_measure_score(labels_a, labels_b) == pytest.approx(0.0)
assert adjusted_mutual_info_score(labels_a, labels_b) == 0.0
assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0)
for method in ["min", "geometric", "arithmetic", "max"]:
assert (
adjusted_mutual_info_score(labels_a, labels_b, average_method=method)
== 0.0
)
assert normalized_mutual_info_score(
labels_a, labels_b, average_method=method
) == pytest.approx(0.0)
def test_v_measure_and_mutual_information(seed=36):
# Check relation between v_measure, entropy and mutual information
for i in np.logspace(1, 4, 4).astype(int):
random_state = np.random.RandomState(seed)
labels_a, labels_b = (
random_state.randint(0, 10, i),
random_state.randint(0, 10, i),
)
assert_almost_equal(
v_measure_score(labels_a, labels_b),
2.0
* mutual_info_score(labels_a, labels_b)
/ (_entropy(labels_a) + _entropy(labels_b)),
0,
)
avg = "arithmetic"
assert_almost_equal(
v_measure_score(labels_a, labels_b),
normalized_mutual_info_score(labels_a, labels_b, average_method=avg),
)
def test_fowlkes_mallows_score():
# General case
score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])
assert_almost_equal(score, 4.0 / np.sqrt(12.0 * 6.0))
# Perfect match but where the label names changed
perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0])
assert_almost_equal(perfect_score, 1.0)
# Worst case
worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5])
assert_almost_equal(worst_score, 0.0)
def test_fowlkes_mallows_score_properties():
# handcrafted example
labels_a = np.array([0, 0, 0, 1, 1, 2])
labels_b = np.array([1, 1, 2, 2, 0, 0])
expected = 1.0 / np.sqrt((1.0 + 3.0) * (1.0 + 2.0))
# FMI = TP / sqrt((TP + FP) * (TP + FN))
score_original = fowlkes_mallows_score(labels_a, labels_b)
assert_almost_equal(score_original, expected)
# symmetric property
score_symmetric = fowlkes_mallows_score(labels_b, labels_a)
assert_almost_equal(score_symmetric, expected)
# permutation property
score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b)
assert_almost_equal(score_permuted, expected)
    # symmetric and permutation (both together)
score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3)
assert_almost_equal(score_both, expected)
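# Illustrative sketch of the comment above: pair_confusion_matrix counts
# ordered pairs, so with C[1, 1] = 2 * TP, C[1, 0] = 2 * FN and
# C[0, 1] = 2 * FP, the factors of 2 cancel in the FMI ratio.
def _fmi_from_pairs_sketch(labels_a, labels_b):
    (_, fp2), (fn2, tp2) = pair_confusion_matrix(labels_a, labels_b)
    return tp2 / np.sqrt((tp2 + fp2) * (tp2 + fn2))
# _fmi_from_pairs_sketch(labels_a, labels_b) reproduces `expected` above.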
@pytest.mark.parametrize(
"labels_true, labels_pred",
[
(["a"] * 6, [1, 1, 0, 0, 1, 1]),
([1] * 6, [1, 1, 0, 0, 1, 1]),
([1, 1, 0, 0, 1, 1], ["a"] * 6),
([1, 1, 0, 0, 1, 1], [1] * 6),
(["a"] * 6, ["a"] * 6),
],
)
def test_mutual_info_score_positive_constant_label(labels_true, labels_pred):
    # Check that MI = 0 when one or both labellings are constant
# non-regression test for #16355
assert mutual_info_score(labels_true, labels_pred) == 0
def test_check_clustering_error():
# Test warning message for continuous values
rng = np.random.RandomState(42)
noise = rng.rand(500)
wavelength = np.linspace(0.01, 1, 500) * 1e-6
msg = (
"Clustering metrics expects discrete values but received "
"continuous values for label, and continuous values for "
"target"
)
with pytest.warns(UserWarning, match=msg):
check_clusterings(wavelength, noise)
def test_pair_confusion_matrix_fully_dispersed():
# edge case: every element is its own cluster
N = 100
clustering1 = list(range(N))
clustering2 = clustering1
expected = np.array([[N * (N - 1), 0], [0, 0]])
assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)
def test_pair_confusion_matrix_single_cluster():
# edge case: only one cluster
N = 100
clustering1 = np.zeros((N,))
clustering2 = clustering1
expected = np.array([[0, 0], [0, N * (N - 1)]])
assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)
def test_pair_confusion_matrix():
# regular case: different non-trivial clusterings
n = 10
N = n**2
clustering1 = np.hstack([[i + 1] * n for i in range(n)])
clustering2 = np.hstack([[i + 1] * (n + 1) for i in range(n)])[:N]
# basic quadratic implementation
expected = np.zeros(shape=(2, 2), dtype=np.int64)
for i in range(len(clustering1)):
for j in range(len(clustering2)):
if i != j:
same_cluster_1 = int(clustering1[i] == clustering1[j])
same_cluster_2 = int(clustering2[i] == clustering2[j])
expected[same_cluster_1, same_cluster_2] += 1
assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)
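# Illustrative sketch: the same ordered-pair counts in O(k^2) from the
# contingency table n_ij (samples in cluster i of the first labeling and
# cluster j of the second), instead of the O(n^2) double loop above.
def _pair_confusion_from_contingency_sketch(labels_a, labels_b):
    _, ai = np.unique(labels_a, return_inverse=True)
    _, bi = np.unique(labels_b, return_inverse=True)
    n = len(ai)
    cont = np.zeros((ai.max() + 1, bi.max() + 1), dtype=np.int64)
    np.add.at(cont, (ai, bi), 1)
    sum_sq = (cont**2).sum()
    c11 = sum_sq - n  # together in both labelings
    c10 = (cont.sum(axis=1) ** 2).sum() - sum_sq  # together in `a` only
    c01 = (cont.sum(axis=0) ** 2).sum() - sum_sq  # together in `b` only
    c00 = n * (n - 1) - c11 - c10 - c01  # separated in both
    return np.array([[c00, c01], [c10, c11]])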
@pytest.mark.parametrize(
"clustering1, clustering2",
[(list(range(100)), list(range(100))), (np.zeros((100,)), np.zeros((100,)))],
)
def test_rand_score_edge_cases(clustering1, clustering2):
# edge case 1: every element is its own cluster
# edge case 2: only one cluster
assert_allclose(rand_score(clustering1, clustering2), 1.0)
def test_rand_score():
# regular case: different non-trivial clusterings
clustering1 = [0, 0, 0, 1, 1, 1]
clustering2 = [0, 1, 0, 1, 2, 2]
# pair confusion matrix
D11 = 2 * 2 # ordered pairs (1, 3), (5, 6)
D10 = 2 * 4 # ordered pairs (1, 2), (2, 3), (4, 5), (4, 6)
D01 = 2 * 1 # ordered pair (2, 4)
D00 = 5 * 6 - D11 - D01 - D10 # the remaining pairs
# rand score
expected_numerator = D00 + D11
expected_denominator = D00 + D01 + D10 + D11
expected = expected_numerator / expected_denominator
assert_allclose(rand_score(clustering1, clustering2), expected)
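# Equivalent of the arithmetic above via the pair confusion matrix (an
# illustrative sketch; ordered-pair counts, so the doubling cancels):
def _rand_from_pairs_sketch(labels_a, labels_b):
    C = pair_confusion_matrix(labels_a, labels_b)
    return (C[0, 0] + C[1, 1]) / C.sum()
# _rand_from_pairs_sketch(clustering1, clustering2) equals `expected` above.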
def test_adjusted_rand_score_overflow():
"""Check that large amount of data will not lead to overflow in
`adjusted_rand_score`.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/20305
"""
rng = np.random.RandomState(0)
y_true = rng.randint(0, 2, 100_000, dtype=np.int8)
y_pred = rng.randint(0, 2, 100_000, dtype=np.int8)
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
adjusted_rand_score(y_true, y_pred)
@pytest.mark.parametrize("average_method", ["min", "arithmetic", "geometric", "max"])
def test_normalized_mutual_info_score_bounded(average_method):
"""Check that nmi returns a score between 0 (included) and 1 (excluded
for non-perfect match)
Non-regression test for issue #13836
"""
labels1 = [0] * 469
labels2 = [1] + labels1[1:]
labels3 = [0, 1] + labels1[2:]
# labels1 is constant. The mutual info between labels1 and any other labelling is 0.
nmi = normalized_mutual_info_score(labels1, labels2, average_method=average_method)
assert nmi == 0
# non constant, non perfect matching labels
nmi = normalized_mutual_info_score(labels2, labels3, average_method=average_method)
assert 0 <= nmi < 1
# TODO(1.9): remove
@pytest.mark.parametrize("sparse", [True, False])
def test_fowlkes_mallows_sparse_deprecated(sparse):
"""Check deprecation warning for 'sparse' parameter of fowlkes_mallows_score."""
with pytest.warns(
FutureWarning, match="The 'sparse' parameter was deprecated in 1.7"
):
fowlkes_mallows_score([0, 1], [1, 1], sparse=sparse)

View File

@@ -0,0 +1,413 @@
import warnings
import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy.sparse import issparse
from sklearn import datasets
from sklearn.metrics import pairwise_distances
from sklearn.metrics.cluster import (
calinski_harabasz_score,
davies_bouldin_score,
silhouette_samples,
silhouette_score,
)
from sklearn.metrics.cluster._unsupervised import _silhouette_reduce
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import (
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
@pytest.mark.parametrize(
"sparse_container",
[None] + CSR_CONTAINERS + CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
@pytest.mark.parametrize("sample_size", [None, "half"])
def test_silhouette(sparse_container, sample_size):
# Tests the Silhouette Coefficient.
dataset = datasets.load_iris()
X, y = dataset.data, dataset.target
if sparse_container is not None:
X = sparse_container(X)
sample_size = int(X.shape[0] / 2) if sample_size == "half" else sample_size
D = pairwise_distances(X, metric="euclidean")
    # Given that the actual labels are used, the silhouette scores should be positive.
score_precomputed = silhouette_score(
D, y, metric="precomputed", sample_size=sample_size, random_state=0
)
score_euclidean = silhouette_score(
X, y, metric="euclidean", sample_size=sample_size, random_state=0
)
assert score_precomputed > 0
assert score_euclidean > 0
assert score_precomputed == pytest.approx(score_euclidean)
def test_cluster_size_1():
# Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
# (cluster 0). We also test the case where there are identical samples
# as the only members of a cluster (cluster 2). To our knowledge, this case
# is not discussed in reference material, and we choose for it a sample
# score of 1.
X = [[0.0], [1.0], [1.0], [2.0], [3.0], [3.0]]
labels = np.array([0, 1, 1, 1, 2, 2])
# Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
# Cluster 1: intra-cluster = [.5, .5, 1]
# inter-cluster = [1, 1, 1]
# silhouette = [.5, .5, 0]
# Cluster 2: intra-cluster = [0, 0]
# inter-cluster = [arbitrary, arbitrary]
# silhouette = [1., 1.]
silhouette = silhouette_score(X, labels)
assert not np.isnan(silhouette)
ss = silhouette_samples(X, labels)
assert_array_equal(ss, [0, 0.5, 0.5, 0, 1, 1])
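# Reference sketch of the conventions documented above, assuming Rousseeuw's
# definitions: a = mean intra-cluster distance, b = smallest mean distance to
# another cluster, s = (b - a) / max(a, b), with s = 0 for singleton clusters.
# On the X, labels above it reproduces [0, 0.5, 0.5, 0, 1, 1].
def _silhouette_samples_sketch(X, labels):
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    D = np.sqrt(((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=-1))
    s = np.zeros(len(X))
    for i, lab in enumerate(labels):
        same = labels == lab
        if same.sum() == 1:
            continue  # singleton cluster: s = 0 by convention
        a = D[i, same].sum() / (same.sum() - 1)  # mean intra-cluster distance
        b = min(D[i, labels == k].mean() for k in np.unique(labels) if k != lab)
        s[i] = (b - a) / max(a, b)
    return s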
def test_silhouette_paper_example():
# Explicitly check per-sample results against Rousseeuw (1987)
# Data from Table 1
lower = [
5.58,
7.00,
6.50,
7.08,
7.00,
3.83,
4.83,
5.08,
8.17,
5.83,
2.17,
5.75,
6.67,
6.92,
4.92,
6.42,
5.00,
5.58,
6.00,
4.67,
6.42,
3.42,
5.50,
6.42,
6.42,
5.00,
3.92,
6.17,
2.50,
4.92,
6.25,
7.33,
4.50,
2.25,
6.33,
2.75,
6.08,
6.67,
4.25,
2.67,
6.00,
6.17,
6.17,
6.92,
6.17,
5.25,
6.83,
4.50,
3.75,
5.75,
5.42,
6.08,
5.83,
6.67,
3.67,
4.75,
3.00,
6.08,
6.67,
5.00,
5.58,
4.83,
6.17,
5.67,
6.50,
6.92,
]
D = np.zeros((12, 12))
D[np.tril_indices(12, -1)] = lower
D += D.T
names = [
"BEL",
"BRA",
"CHI",
"CUB",
"EGY",
"FRA",
"IND",
"ISR",
"USA",
"USS",
"YUG",
"ZAI",
]
# Data from Figure 2
labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
expected1 = {
"USA": 0.43,
"BEL": 0.39,
"FRA": 0.35,
"ISR": 0.30,
"BRA": 0.22,
"EGY": 0.20,
"ZAI": 0.19,
"CUB": 0.40,
"USS": 0.34,
"CHI": 0.33,
"YUG": 0.26,
"IND": -0.04,
}
score1 = 0.28
# Data from Figure 3
labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
expected2 = {
"USA": 0.47,
"FRA": 0.44,
"BEL": 0.42,
"ISR": 0.37,
"EGY": 0.02,
"ZAI": 0.28,
"BRA": 0.25,
"IND": 0.17,
"CUB": 0.48,
"USS": 0.44,
"YUG": 0.31,
"CHI": 0.31,
}
score2 = 0.33
for labels, expected, score in [
(labels1, expected1, score1),
(labels2, expected2, score2),
]:
expected = [expected[name] for name in names]
# we check to 2dp because that's what's in the paper
        assert silhouette_samples(
            D, np.array(labels), metric="precomputed"
        ) == pytest.approx(expected, abs=1e-2)
        assert silhouette_score(
            D, np.array(labels), metric="precomputed"
        ) == pytest.approx(score, abs=1e-2)
def test_correct_labelsize():
# Assert 1 < n_labels < n_samples
dataset = datasets.load_iris()
X = dataset.data
# n_labels = n_samples
y = np.arange(X.shape[0])
err_msg = (
r"Number of labels is %d\. Valid values are 2 "
r"to n_samples - 1 \(inclusive\)" % len(np.unique(y))
)
with pytest.raises(ValueError, match=err_msg):
silhouette_score(X, y)
# n_labels = 1
y = np.zeros(X.shape[0])
err_msg = (
r"Number of labels is %d\. Valid values are 2 "
r"to n_samples - 1 \(inclusive\)" % len(np.unique(y))
)
with pytest.raises(ValueError, match=err_msg):
silhouette_score(X, y)
def test_non_encoded_labels():
dataset = datasets.load_iris()
X = dataset.data
labels = dataset.target
assert silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels)
assert_array_equal(
silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels)
)
def test_non_numpy_labels():
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
assert silhouette_score(list(X), list(y)) == silhouette_score(X, y)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_silhouette_nonzero_diag(dtype):
# Make sure silhouette_samples requires diagonal to be zero.
# Non-regression test for #12178
# Construct a zero-diagonal matrix
dists = pairwise_distances(
np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]], dtype=dtype).T
)
labels = [0, 0, 0, 1, 1, 1]
# small values on the diagonal are OK
dists[2][2] = np.finfo(dists.dtype).eps * 10
silhouette_samples(dists, labels, metric="precomputed")
# values bigger than eps * 100 are not
dists[2][2] = np.finfo(dists.dtype).eps * 1000
with pytest.raises(ValueError, match="contains non-zero"):
silhouette_samples(dists, labels, metric="precomputed")
@pytest.mark.parametrize(
"sparse_container",
CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_precomputed_sparse(sparse_container):
"""Check that silhouette_samples works for sparse matrices correctly."""
X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
y = [0, 0, 0, 0, 1, 1, 1, 1]
pdist_dense = pairwise_distances(X)
pdist_sparse = sparse_container(pdist_dense)
assert issparse(pdist_sparse)
output_with_sparse_input = silhouette_samples(pdist_sparse, y, metric="precomputed")
output_with_dense_input = silhouette_samples(pdist_dense, y, metric="precomputed")
assert_allclose(output_with_sparse_input, output_with_dense_input)
@pytest.mark.parametrize(
"sparse_container",
CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_euclidean_sparse(sparse_container):
"""Check that silhouette_samples works for sparse matrices correctly."""
X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
y = [0, 0, 0, 0, 1, 1, 1, 1]
pdist_dense = pairwise_distances(X)
pdist_sparse = sparse_container(pdist_dense)
assert issparse(pdist_sparse)
output_with_sparse_input = silhouette_samples(pdist_sparse, y)
output_with_dense_input = silhouette_samples(pdist_dense, y)
assert_allclose(output_with_sparse_input, output_with_dense_input)
@pytest.mark.parametrize(
"sparse_container", CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS
)
def test_silhouette_reduce(sparse_container):
"""Check for non-CSR input to private method `_silhouette_reduce`."""
X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
pdist_dense = pairwise_distances(X)
pdist_sparse = sparse_container(pdist_dense)
y = [0, 0, 0, 0, 1, 1, 1, 1]
label_freqs = np.bincount(y)
with pytest.raises(
TypeError,
match="Expected CSR matrix. Please pass sparse matrix in CSR format.",
):
_silhouette_reduce(pdist_sparse, start=0, labels=y, label_freqs=label_freqs)
def assert_raises_on_only_one_label(func):
"""Assert message when there is only one label"""
rng = np.random.RandomState(seed=0)
with pytest.raises(ValueError, match="Number of labels is"):
func(rng.rand(10, 2), np.zeros(10))
def assert_raises_on_all_points_same_cluster(func):
"""Assert message when all point are in different clusters"""
rng = np.random.RandomState(seed=0)
with pytest.raises(ValueError, match="Number of labels is"):
func(rng.rand(10, 2), np.arange(10))
def test_calinski_harabasz_score():
assert_raises_on_only_one_label(calinski_harabasz_score)
assert_raises_on_all_points_same_cluster(calinski_harabasz_score)
    # Assert the value is 1. when all samples are equal
assert 1.0 == calinski_harabasz_score(np.ones((10, 2)), [0] * 5 + [1] * 5)
    # Assert the value is 0. when all cluster means are equal
assert 0.0 == calinski_harabasz_score([[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10)
    # General case (with non-numpy arrays)
X = (
[[0, 0], [1, 1]] * 5
+ [[3, 3], [4, 4]] * 5
+ [[0, 4], [1, 3]] * 5
+ [[3, 1], [4, 0]] * 5
)
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert calinski_harabasz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1))
    )
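# Illustrative sketch of the statistic behind the expected value above,
# assuming the usual definition via traces of the scatter matrices:
#     CH = (between-dispersion / (k - 1)) / (within-dispersion / (n - k)).
def _calinski_harabasz_sketch(X, labels):
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    n, mean = len(X), X.mean(axis=0)
    groups = [X[labels == k] for k in np.unique(labels)]
    between = sum(len(g) * ((g.mean(axis=0) - mean) ** 2).sum() for g in groups)
    within = sum(((g - g.mean(axis=0)) ** 2).sum() for g in groups)
    return (between / (len(groups) - 1)) / (within / (n - len(groups)))
# On the X, labels above this gives 108.0 == 45 * (40 - 4) / (5 * (4 - 1)).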
def test_davies_bouldin_score():
assert_raises_on_only_one_label(davies_bouldin_score)
assert_raises_on_all_points_same_cluster(davies_bouldin_score)
    # Assert the value is 0. when all samples are equal
assert davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5) == pytest.approx(
0.0
)
    # Assert the value is 0. when all cluster means are equal
assert davies_bouldin_score(
[[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10
) == pytest.approx(0.0)
    # General case (with non-numpy arrays)
X = (
[[0, 0], [1, 1]] * 5
+ [[3, 3], [4, 4]] * 5
+ [[0, 4], [1, 3]] * 5
+ [[3, 1], [4, 0]] * 5
)
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert davies_bouldin_score(X, labels) == pytest.approx(2 * np.sqrt(0.5) / 3)
# Ensure divide by zero warning is not raised in general case
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
davies_bouldin_score(X, labels)
    # General case - some clusters have a single sample
X = [[0, 0], [2, 2], [3, 3], [5, 5]]
labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5.0 / 4) / 3)
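# Illustrative sketch of the index behind the expected values above, assuming
# the usual definition: with s_i the mean distance of cluster i's members to
# its centroid and d_ij the distance between centroids,
#     DB = mean_i max_{j != i} (s_i + s_j) / d_ij.
def _davies_bouldin_sketch(X, labels):
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    uniq = np.unique(labels)
    cents = np.array([X[labels == k].mean(axis=0) for k in uniq])
    s = np.array(
        [np.linalg.norm(X[labels == k] - c, axis=1).mean()
         for k, c in zip(uniq, cents)]
    )
    worst = [
        max((s[i] + s[j]) / np.linalg.norm(cents[i] - cents[j])
            for j in range(len(uniq)) if j != i)
        for i in range(len(uniq))
    ]
    return float(np.mean(worst))
# On the single-sample-cluster case above this returns (5.0 / 4) / 3.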
def test_silhouette_score_integer_precomputed():
"""Check that silhouette_score works for precomputed metrics that are integers.
Non-regression test for #22107.
"""
result = silhouette_score(
[[0, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed"
)
assert result == pytest.approx(1 / 6)
# non-zero on diagonal for ints raises an error
with pytest.raises(ValueError, match="contains non-zero"):
silhouette_score(
[[1, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed"
)

View File

@@ -0,0 +1,49 @@
# Metrics is cimported from other subpackages so this is needed for the cimport
# to work
metrics_cython_tree = [
fs.copyfile('__init__.py')
]
# Some metrics code cimports code from utils, so we may as well copy all the necessary files
metrics_cython_tree += utils_cython_tree
_dist_metrics_pxd = custom_target(
'_dist_metrics_pxd',
output: '_dist_metrics.pxd',
input: '_dist_metrics.pxd.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
    # Need to install the generated pxd because it is needed in other
    # subpackages' Cython code, e.g. sklearn.cluster
install_dir: sklearn_dir / 'metrics',
install: true,
)
metrics_cython_tree += [_dist_metrics_pxd]
_dist_metrics_pyx = custom_target(
'_dist_metrics_pyx',
output: '_dist_metrics.pyx',
input: '_dist_metrics.pyx.tp',
command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
    # TODO in principle this should go in py.extension_module below. This is a
    # temporary work-around for a dependency issue with .pyx.tp files. For more
    # details, see https://github.com/mesonbuild/meson/issues/13212
depends: metrics_cython_tree,
)
_dist_metrics = py.extension_module(
'_dist_metrics',
cython_gen.process(_dist_metrics_pyx),
dependencies: [np_dep],
subdir: 'sklearn/metrics',
install: true
)
py.extension_module(
'_pairwise_fast',
[cython_gen.process('_pairwise_fast.pyx'), metrics_cython_tree],
dependencies: [openmp_dep],
subdir: 'sklearn/metrics',
install: true
)
subdir('_pairwise_distances_reduction')
subdir('cluster')

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff