Videre
This commit is contained in:
@@ -0,0 +1,190 @@
|
||||
"""Score functions, performance metrics, pairwise metrics and distance computations."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from sklearn.metrics import cluster
|
||||
from sklearn.metrics._classification import (
|
||||
accuracy_score,
|
||||
balanced_accuracy_score,
|
||||
brier_score_loss,
|
||||
class_likelihood_ratios,
|
||||
classification_report,
|
||||
cohen_kappa_score,
|
||||
confusion_matrix,
|
||||
d2_brier_score,
|
||||
d2_log_loss_score,
|
||||
f1_score,
|
||||
fbeta_score,
|
||||
hamming_loss,
|
||||
hinge_loss,
|
||||
jaccard_score,
|
||||
log_loss,
|
||||
matthews_corrcoef,
|
||||
multilabel_confusion_matrix,
|
||||
precision_recall_fscore_support,
|
||||
precision_score,
|
||||
recall_score,
|
||||
zero_one_loss,
|
||||
)
|
||||
from sklearn.metrics._dist_metrics import DistanceMetric
|
||||
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
|
||||
from sklearn.metrics._plot.det_curve import DetCurveDisplay
|
||||
from sklearn.metrics._plot.precision_recall_curve import PrecisionRecallDisplay
|
||||
from sklearn.metrics._plot.regression import PredictionErrorDisplay
|
||||
from sklearn.metrics._plot.roc_curve import RocCurveDisplay
|
||||
from sklearn.metrics._ranking import (
|
||||
auc,
|
||||
average_precision_score,
|
||||
confusion_matrix_at_thresholds,
|
||||
coverage_error,
|
||||
dcg_score,
|
||||
det_curve,
|
||||
label_ranking_average_precision_score,
|
||||
label_ranking_loss,
|
||||
ndcg_score,
|
||||
precision_recall_curve,
|
||||
roc_auc_score,
|
||||
roc_curve,
|
||||
top_k_accuracy_score,
|
||||
)
|
||||
from sklearn.metrics._regression import (
|
||||
d2_absolute_error_score,
|
||||
d2_pinball_score,
|
||||
d2_tweedie_score,
|
||||
explained_variance_score,
|
||||
max_error,
|
||||
mean_absolute_error,
|
||||
mean_absolute_percentage_error,
|
||||
mean_gamma_deviance,
|
||||
mean_pinball_loss,
|
||||
mean_poisson_deviance,
|
||||
mean_squared_error,
|
||||
mean_squared_log_error,
|
||||
mean_tweedie_deviance,
|
||||
median_absolute_error,
|
||||
r2_score,
|
||||
root_mean_squared_error,
|
||||
root_mean_squared_log_error,
|
||||
)
|
||||
from sklearn.metrics._scorer import (
|
||||
check_scoring,
|
||||
get_scorer,
|
||||
get_scorer_names,
|
||||
make_scorer,
|
||||
)
|
||||
from sklearn.metrics.cluster import (
|
||||
adjusted_mutual_info_score,
|
||||
adjusted_rand_score,
|
||||
calinski_harabasz_score,
|
||||
completeness_score,
|
||||
consensus_score,
|
||||
davies_bouldin_score,
|
||||
fowlkes_mallows_score,
|
||||
homogeneity_completeness_v_measure,
|
||||
homogeneity_score,
|
||||
mutual_info_score,
|
||||
normalized_mutual_info_score,
|
||||
pair_confusion_matrix,
|
||||
rand_score,
|
||||
silhouette_samples,
|
||||
silhouette_score,
|
||||
v_measure_score,
|
||||
)
|
||||
from sklearn.metrics.pairwise import (
|
||||
euclidean_distances,
|
||||
nan_euclidean_distances,
|
||||
pairwise_distances,
|
||||
pairwise_distances_argmin,
|
||||
pairwise_distances_argmin_min,
|
||||
pairwise_distances_chunked,
|
||||
pairwise_kernels,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ConfusionMatrixDisplay",
|
||||
"DetCurveDisplay",
|
||||
"DistanceMetric",
|
||||
"PrecisionRecallDisplay",
|
||||
"PredictionErrorDisplay",
|
||||
"RocCurveDisplay",
|
||||
"accuracy_score",
|
||||
"adjusted_mutual_info_score",
|
||||
"adjusted_rand_score",
|
||||
"auc",
|
||||
"average_precision_score",
|
||||
"balanced_accuracy_score",
|
||||
"brier_score_loss",
|
||||
"calinski_harabasz_score",
|
||||
"check_scoring",
|
||||
"class_likelihood_ratios",
|
||||
"classification_report",
|
||||
"cluster",
|
||||
"cohen_kappa_score",
|
||||
"completeness_score",
|
||||
"confusion_matrix",
|
||||
"confusion_matrix_at_thresholds",
|
||||
"consensus_score",
|
||||
"coverage_error",
|
||||
"d2_absolute_error_score",
|
||||
"d2_brier_score",
|
||||
"d2_log_loss_score",
|
||||
"d2_pinball_score",
|
||||
"d2_tweedie_score",
|
||||
"davies_bouldin_score",
|
||||
"dcg_score",
|
||||
"det_curve",
|
||||
"euclidean_distances",
|
||||
"explained_variance_score",
|
||||
"f1_score",
|
||||
"fbeta_score",
|
||||
"fowlkes_mallows_score",
|
||||
"get_scorer",
|
||||
"get_scorer_names",
|
||||
"hamming_loss",
|
||||
"hinge_loss",
|
||||
"homogeneity_completeness_v_measure",
|
||||
"homogeneity_score",
|
||||
"jaccard_score",
|
||||
"label_ranking_average_precision_score",
|
||||
"label_ranking_loss",
|
||||
"log_loss",
|
||||
"make_scorer",
|
||||
"matthews_corrcoef",
|
||||
"max_error",
|
||||
"mean_absolute_error",
|
||||
"mean_absolute_percentage_error",
|
||||
"mean_gamma_deviance",
|
||||
"mean_pinball_loss",
|
||||
"mean_poisson_deviance",
|
||||
"mean_squared_error",
|
||||
"mean_squared_log_error",
|
||||
"mean_tweedie_deviance",
|
||||
"median_absolute_error",
|
||||
"multilabel_confusion_matrix",
|
||||
"mutual_info_score",
|
||||
"nan_euclidean_distances",
|
||||
"ndcg_score",
|
||||
"normalized_mutual_info_score",
|
||||
"pair_confusion_matrix",
|
||||
"pairwise_distances",
|
||||
"pairwise_distances_argmin",
|
||||
"pairwise_distances_argmin_min",
|
||||
"pairwise_distances_chunked",
|
||||
"pairwise_kernels",
|
||||
"precision_recall_curve",
|
||||
"precision_recall_fscore_support",
|
||||
"precision_score",
|
||||
"r2_score",
|
||||
"rand_score",
|
||||
"recall_score",
|
||||
"roc_auc_score",
|
||||
"roc_curve",
|
||||
"root_mean_squared_error",
|
||||
"root_mean_squared_log_error",
|
||||
"silhouette_samples",
|
||||
"silhouette_score",
|
||||
"top_k_accuracy_score",
|
||||
"v_measure_score",
|
||||
"zero_one_loss",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
Common code for all metrics.
|
||||
|
||||
"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from itertools import combinations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils import check_array, check_consistent_length
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
|
||||
|
||||
def _average_binary_score(binary_metric, y_true, y_score, average, sample_weight=None):
    """Average a binary metric for multilabel classification.

    Parameters
    ----------
    binary_metric : callable, returns shape [n_classes]
        The binary metric function to use.

    y_true : array, shape = [n_samples] or [n_samples, n_classes]
        True binary labels in binary label indicators.

    y_score : array, shape = [n_samples] or [n_samples, n_classes]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or binary decisions.

    average : {None, 'micro', 'macro', 'samples', 'weighted'}, default='macro'
        If ``None``, the scores for each class are returned. Otherwise,
        this determines the type of averaging performed on the data:

        ``'micro'``:
            Calculate metrics globally by considering each element of the label
            indicator matrix as a label.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean. This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label).
        ``'samples'``:
            Calculate metrics for each instance, and find their average.

        Will be ignored when ``y_true`` is binary.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    score : float or array of shape [n_classes]
        If not ``None``, average the score, else return the score for each
        classes.

    """
    # Reject unknown averaging strategies before touching the data.
    average_options = (None, "micro", "macro", "weighted", "samples")
    if average not in average_options:
        raise ValueError("average has to be one of {0}".format(average_options))

    y_type = type_of_target(y_true)
    if y_type not in ("binary", "multilabel-indicator"):
        raise ValueError("{0} format is not supported".format(y_type))

    # Plain binary targets need no per-class decomposition: delegate straight
    # to the metric (``average`` is ignored in this case, as documented).
    if y_type == "binary":
        return binary_metric(y_true, y_score, sample_weight=sample_weight)

    check_consistent_length(y_true, y_score, sample_weight)
    y_true = check_array(y_true)
    y_score = check_array(y_score)

    # By default the metric is applied per column (axis 1 holds the classes).
    not_average_axis = 1
    score_weight = sample_weight
    average_weight = None

    if average == "micro":
        # Flatten the label-indicator matrix so every (sample, label) cell
        # counts as one binary decision; replicate sample weights to match.
        if score_weight is not None:
            score_weight = np.repeat(score_weight, y_true.shape[1])
        y_true = y_true.ravel()
        y_score = y_score.ravel()

    elif average == "weighted":
        # Weight each class score by its (possibly sample-weighted) support.
        if score_weight is not None:
            average_weight = np.sum(
                np.multiply(y_true, np.reshape(score_weight, (-1, 1))), axis=0
            )
        else:
            average_weight = np.sum(y_true, axis=0)
        # No positive labels at all: the weighted average is undefined,
        # return 0 by convention.
        if np.isclose(average_weight.sum(), 0.0):
            return 0

    elif average == "samples":
        # swap average_weight <-> score_weight
        average_weight = score_weight
        score_weight = None
        # Average over rows (samples) instead of columns (classes).
        not_average_axis = 0

    if y_true.ndim == 1:
        y_true = y_true.reshape((-1, 1))

    if y_score.ndim == 1:
        y_score = y_score.reshape((-1, 1))

    # Apply the binary metric once per class (or once per sample when
    # ``average='samples'`` flipped the axis above).
    n_classes = y_score.shape[not_average_axis]
    score = np.zeros((n_classes,))
    for c in range(n_classes):
        y_true_c = y_true.take([c], axis=not_average_axis).ravel()
        y_score_c = y_score.take([c], axis=not_average_axis).ravel()
        score[c] = binary_metric(y_true_c, y_score_c, sample_weight=score_weight)

    # Average the results
    if average is not None:
        if average_weight is not None:
            # Scores with 0 weights are forced to be 0, preventing the average
            # score from being affected by 0-weighted NaN elements.
            average_weight = np.asarray(average_weight)
            score[average_weight == 0] = 0
        return float(np.average(score, weights=average_weight))
    else:
        return score
|
||||
|
||||
|
||||
def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average="macro"):
    """Average one-versus-one scores for multiclass classification.

    Uses the binary metric for one-vs-one multiclass classification,
    where the score is computed according to the Hand & Till (2001) algorithm.

    Parameters
    ----------
    binary_metric : callable
        The binary metric function to use that accepts the following as input:
            y_true_target : array, shape = [n_samples_target]
                Some sub-array of y_true for a pair of classes designated
                positive and negative in the one-vs-one scheme.
            y_score_target : array, shape = [n_samples_target]
                Scores corresponding to the probability estimates
                of a sample belonging to the designated positive class label

    y_true : array-like of shape (n_samples,)
        True multiclass labels.

    y_score : array-like of shape (n_samples, n_classes)
        Target scores corresponding to probability estimates of a sample
        belonging to a particular class.

    average : {'macro', 'weighted'}, default='macro'
        Determines the type of averaging performed on the pairwise binary
        metric scores:
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean. This does not take label imbalance into account. Classes
            are assumed to be uniformly distributed.
        ``'weighted'``:
            Calculate metrics for each label, taking into account the
            prevalence of the classes.

    Returns
    -------
    score : float
        Average of the pairwise binary metric scores.
    """
    check_consistent_length(y_true, y_score)

    labels = np.unique(y_true)
    n_labels = labels.shape[0]
    n_pairs = n_labels * (n_labels - 1) // 2

    use_prevalence = average == "weighted"
    pair_scores = np.empty(n_pairs)
    # Prevalence weights are only needed for the 'weighted' flavour.
    prevalence = np.empty(n_pairs) if use_prevalence else None

    # For every unordered label pair, score the pair twice: once with the
    # first label as the positive class and once with the second, then keep
    # the mean of the two directions (Hand & Till).
    for idx, (pos, neg) in enumerate(combinations(labels, 2)):
        pos_mask = y_true == pos
        neg_mask = y_true == neg
        pair_mask = pos_mask | neg_mask

        if use_prevalence:
            # Fraction of all samples belonging to this pair of classes.
            prevalence[idx] = np.average(pair_mask)

        pos_in_pair = pos_mask[pair_mask]
        neg_in_pair = neg_mask[pair_mask]

        score_pos = binary_metric(pos_in_pair, y_score[pair_mask, pos])
        score_neg = binary_metric(neg_in_pair, y_score[pair_mask, neg])
        pair_scores[idx] = (score_pos + score_neg) / 2

    # weights=None yields the unweighted ('macro') mean.
    return np.average(pair_scores, weights=prevalence)
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,268 @@
|
||||
from libc.math cimport sqrt, exp
|
||||
|
||||
from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t
|
||||
|
||||
cdef class DistanceMetric:
    # Dtype-agnostic base type: both DistanceMetric64 and DistanceMetric32
    # below inherit from it, so code can hold either behind one static type.
    pass
|
||||
|
||||
######################################################################
# Inline distance functions
#
# We use these for the default (euclidean) case so that they can be
# inlined. This leads to faster computation for the most common case

# Euclidean distance between two dense float64 vectors of length `size`.
cdef inline float64_t euclidean_dist64(
    const float64_t* x1,
    const float64_t* x2,
    intp_t size,
) except -1 nogil:
    cdef float64_t tmp, d=0
    cdef intp_t j
    for j in range(size):
        tmp = <float64_t> (x1[j] - x2[j])
        d += tmp * tmp
    return sqrt(d)
|
||||
|
||||
|
||||
# "Reduced" euclidean distance: the squared distance (no sqrt). Cheaper than
# euclidean_dist64 and preserves ordering, so it can be used for comparisons.
cdef inline float64_t euclidean_rdist64(
    const float64_t* x1,
    const float64_t* x2,
    intp_t size,
) except -1 nogil:
    cdef float64_t tmp, d=0
    cdef intp_t j
    for j in range(size):
        tmp = <float64_t>(x1[j] - x2[j])
        d += tmp * tmp
    return d
||||
|
||||
|
||||
# Convert a true euclidean distance to its reduced (squared) form.
cdef inline float64_t euclidean_dist_to_rdist64(const float64_t dist) except -1 nogil:
    return dist * dist
||||
|
||||
|
||||
# Convert a reduced (squared) euclidean distance back to a true distance.
cdef inline float64_t euclidean_rdist_to_dist64(const float64_t dist) except -1 nogil:
    return sqrt(dist)
||||
|
||||
|
||||
######################################################################
# DistanceMetric64 base class
cdef class DistanceMetric64(DistanceMetric):
    # The following attributes are required for a few of the subclasses.
    # we must define them here so that cython's limited polymorphism will work.
    # Because we don't expect to instantiate a lot of these objects, the
    # extra memory overhead of this setup should not be an issue.
    cdef float64_t p                     # scalar metric parameter (presumably Minkowski p — confirm in .pyx)
    cdef const float64_t[::1] vec        # per-feature vector parameter used by some subclasses
    cdef const float64_t[:, ::1] mat     # matrix parameter used by some subclasses
    cdef intp_t size                     # expected vector length
    cdef object func                     # user-supplied callable (for a python-defined metric — confirm)
    cdef object kwargs                   # extra keyword arguments for `func`

    # True distance between two dense vectors of length `size`.
    cdef float64_t dist(
        self,
        const float64_t* x1,
        const float64_t* x2,
        intp_t size,
    ) except -1 nogil

    # Reduced distance: a cheaper surrogate of `dist` (squared distance in
    # the euclidean case, cf. euclidean_rdist64 / _rdist_to_dist below).
    cdef float64_t rdist(
        self,
        const float64_t* x1,
        const float64_t* x2,
        intp_t size,
    ) except -1 nogil

    # Distance between two CSR rows, each given as (data, indices) arrays
    # with the row slices delimited by [x1_start, x1_end) / [x2_start, x2_end).
    cdef float64_t dist_csr(
        self,
        const float64_t* x1_data,
        const int32_t* x1_indices,
        const float64_t* x2_data,
        const int32_t* x2_indices,
        const int32_t x1_start,
        const int32_t x1_end,
        const int32_t x2_start,
        const int32_t x2_end,
        const intp_t size,
    ) except -1 nogil

    # Reduced-distance variant of `dist_csr`.
    cdef float64_t rdist_csr(
        self,
        const float64_t* x1_data,
        const int32_t* x1_indices,
        const float64_t* x2_data,
        const int32_t* x2_indices,
        const int32_t x1_start,
        const int32_t x1_end,
        const int32_t x2_start,
        const int32_t x2_end,
        const intp_t size,
    ) except -1 nogil

    # Fill D with pairwise distances between the rows of a single dense X.
    cdef int pdist(
        self,
        const float64_t[:, ::1] X,
        float64_t[:, ::1] D,
    ) except -1

    # Fill D with distances between rows of dense X and rows of dense Y.
    cdef int cdist(
        self,
        const float64_t[:, ::1] X,
        const float64_t[:, ::1] Y,
        float64_t[:, ::1] D,
    ) except -1

    # CSR-input variant of `pdist` (single sparse matrix given as
    # data / indices / indptr).
    cdef int pdist_csr(
        self,
        const float64_t* x1_data,
        const int32_t[::1] x1_indices,
        const int32_t[::1] x1_indptr,
        const intp_t size,
        float64_t[:, ::1] D,
    ) except -1 nogil

    # CSR-input variant of `cdist` (two sparse matrices).
    cdef int cdist_csr(
        self,
        const float64_t* x1_data,
        const int32_t[::1] x1_indices,
        const int32_t[::1] x1_indptr,
        const float64_t* x2_data,
        const int32_t[::1] x2_indices,
        const int32_t[::1] x2_indptr,
        const intp_t size,
        float64_t[:, ::1] D,
    ) except -1 nogil

    # Conversions between the reduced and the true distance.
    cdef float64_t _rdist_to_dist(self, float64_t rdist) except -1 nogil

    cdef float64_t _dist_to_rdist(self, float64_t dist) except -1 nogil
|
||||
|
||||
######################################################################
# Inline distance functions
#
# We use these for the default (euclidean) case so that they can be
# inlined. This leads to faster computation for the most common case

# float32 counterpart of euclidean_dist64; accumulates in float64 for accuracy.
cdef inline float64_t euclidean_dist32(
    const float32_t* x1,
    const float32_t* x2,
    intp_t size,
) except -1 nogil:
    cdef float64_t tmp, d=0
    cdef intp_t j
    for j in range(size):
        tmp = <float64_t> (x1[j] - x2[j])
        d += tmp * tmp
    return sqrt(d)
|
||||
|
||||
|
||||
# float32 counterpart of euclidean_rdist64: squared distance, no sqrt.
cdef inline float64_t euclidean_rdist32(
    const float32_t* x1,
    const float32_t* x2,
    intp_t size,
) except -1 nogil:
    cdef float64_t tmp, d=0
    cdef intp_t j
    for j in range(size):
        tmp = <float64_t>(x1[j] - x2[j])
        d += tmp * tmp
    return d
|
||||
|
||||
|
||||
# Convert a true euclidean distance to its reduced (squared) form (float32 input).
cdef inline float64_t euclidean_dist_to_rdist32(const float32_t dist) except -1 nogil:
    return dist * dist
||||
|
||||
|
||||
# Convert a reduced (squared) euclidean distance back to a true distance (float32 input).
cdef inline float64_t euclidean_rdist_to_dist32(const float32_t dist) except -1 nogil:
    return sqrt(dist)
||||
|
||||
|
||||
######################################################################
# DistanceMetric32 base class
cdef class DistanceMetric32(DistanceMetric):
    # float32 counterpart of DistanceMetric64: inputs and distance results are
    # float32, while the auxiliary parameters stay float64.
    # The following attributes are required for a few of the subclasses.
    # we must define them here so that cython's limited polymorphism will work.
    # Because we don't expect to instantiate a lot of these objects, the
    # extra memory overhead of this setup should not be an issue.
    cdef float64_t p                     # scalar metric parameter (presumably Minkowski p — confirm in .pyx)
    cdef const float64_t[::1] vec        # per-feature vector parameter used by some subclasses
    cdef const float64_t[:, ::1] mat     # matrix parameter used by some subclasses
    cdef intp_t size                     # expected vector length
    cdef object func                     # user-supplied callable (for a python-defined metric — confirm)
    cdef object kwargs                   # extra keyword arguments for `func`

    # True distance between two dense float32 vectors of length `size`.
    cdef float32_t dist(
        self,
        const float32_t* x1,
        const float32_t* x2,
        intp_t size,
    ) except -1 nogil

    # Reduced distance: cheaper surrogate of `dist` (squared in the
    # euclidean case, cf. euclidean_rdist32 / _rdist_to_dist below).
    cdef float32_t rdist(
        self,
        const float32_t* x1,
        const float32_t* x2,
        intp_t size,
    ) except -1 nogil

    # Distance between two CSR rows, each given as (data, indices) arrays
    # with the row slices delimited by [x1_start, x1_end) / [x2_start, x2_end).
    cdef float32_t dist_csr(
        self,
        const float32_t* x1_data,
        const int32_t* x1_indices,
        const float32_t* x2_data,
        const int32_t* x2_indices,
        const int32_t x1_start,
        const int32_t x1_end,
        const int32_t x2_start,
        const int32_t x2_end,
        const intp_t size,
    ) except -1 nogil

    # Reduced-distance variant of `dist_csr`.
    cdef float32_t rdist_csr(
        self,
        const float32_t* x1_data,
        const int32_t* x1_indices,
        const float32_t* x2_data,
        const int32_t* x2_indices,
        const int32_t x1_start,
        const int32_t x1_end,
        const int32_t x2_start,
        const int32_t x2_end,
        const intp_t size,
    ) except -1 nogil

    # Fill D with pairwise distances between the rows of a single dense X.
    cdef int pdist(
        self,
        const float32_t[:, ::1] X,
        float32_t[:, ::1] D,
    ) except -1

    # Fill D with distances between rows of dense X and rows of dense Y.
    cdef int cdist(
        self,
        const float32_t[:, ::1] X,
        const float32_t[:, ::1] Y,
        float32_t[:, ::1] D,
    ) except -1

    # CSR-input variant of `pdist`.
    cdef int pdist_csr(
        self,
        const float32_t* x1_data,
        const int32_t[::1] x1_indices,
        const int32_t[::1] x1_indptr,
        const intp_t size,
        float32_t[:, ::1] D,
    ) except -1 nogil

    # CSR-input variant of `cdist`.
    cdef int cdist_csr(
        self,
        const float32_t* x1_data,
        const int32_t[::1] x1_indices,
        const int32_t[::1] x1_indptr,
        const float32_t* x2_data,
        const int32_t[::1] x2_indices,
        const int32_t[::1] x2_indptr,
        const intp_t size,
        float32_t[:, ::1] D,
    ) except -1 nogil

    # Conversions between the reduced and the true distance.
    cdef float32_t _rdist_to_dist(self, float32_t rdist) except -1 nogil

    cdef float32_t _dist_to_rdist(self, float32_t dist) except -1 nogil
|
||||
@@ -0,0 +1,152 @@
|
||||
{{py:

# Tempita template: expanded once per entry below to generate the float64
# and float32 declaration variants in the compiled .pxd.
implementation_specific_values = [
    # Values are the following ones:
    #
    # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
    ('64', 'float64_t', 'np.float64'),
    ('32', 'float32_t', 'np.float32')
]

}}
from libc.math cimport sqrt, exp

from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t

cdef class DistanceMetric:
    # Dtype-agnostic base type shared by both generated specializations.
    pass

{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}

######################################################################
# Inline distance functions
#
# We use these for the default (euclidean) case so that they can be
# inlined. This leads to faster computation for the most common case

# Euclidean distance between two dense vectors; accumulates in float64.
cdef inline float64_t euclidean_dist{{name_suffix}}(
    const {{INPUT_DTYPE_t}}* x1,
    const {{INPUT_DTYPE_t}}* x2,
    intp_t size,
) except -1 nogil:
    cdef float64_t tmp, d=0
    cdef intp_t j
    for j in range(size):
        tmp = <float64_t> (x1[j] - x2[j])
        d += tmp * tmp
    return sqrt(d)


# "Reduced" euclidean distance: the squared distance (no sqrt).
cdef inline float64_t euclidean_rdist{{name_suffix}}(
    const {{INPUT_DTYPE_t}}* x1,
    const {{INPUT_DTYPE_t}}* x2,
    intp_t size,
) except -1 nogil:
    cdef float64_t tmp, d=0
    cdef intp_t j
    for j in range(size):
        tmp = <float64_t>(x1[j] - x2[j])
        d += tmp * tmp
    return d


# Convert a true euclidean distance to its reduced (squared) form.
cdef inline float64_t euclidean_dist_to_rdist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) except -1 nogil:
    return dist * dist


# Convert a reduced (squared) euclidean distance back to a true distance.
cdef inline float64_t euclidean_rdist_to_dist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) except -1 nogil:
    return sqrt(dist)


######################################################################
# DistanceMetric{{name_suffix}} base class
cdef class DistanceMetric{{name_suffix}}(DistanceMetric):
    # The following attributes are required for a few of the subclasses.
    # we must define them here so that cython's limited polymorphism will work.
    # Because we don't expect to instantiate a lot of these objects, the
    # extra memory overhead of this setup should not be an issue.
    cdef float64_t p                     # scalar metric parameter (presumably Minkowski p — confirm in .pyx)
    cdef const float64_t[::1] vec        # per-feature vector parameter used by some subclasses
    cdef const float64_t[:, ::1] mat     # matrix parameter used by some subclasses
    cdef intp_t size                     # expected vector length
    cdef object func                     # user-supplied callable (for a python-defined metric — confirm)
    cdef object kwargs                   # extra keyword arguments for `func`

    # True distance between two dense vectors of length `size`.
    cdef {{INPUT_DTYPE_t}} dist(
        self,
        const {{INPUT_DTYPE_t}}* x1,
        const {{INPUT_DTYPE_t}}* x2,
        intp_t size,
    ) except -1 nogil

    # Reduced distance: cheaper surrogate of `dist` (squared in the
    # euclidean case, see the conversions below).
    cdef {{INPUT_DTYPE_t}} rdist(
        self,
        const {{INPUT_DTYPE_t}}* x1,
        const {{INPUT_DTYPE_t}}* x2,
        intp_t size,
    ) except -1 nogil

    # Distance between two CSR rows given as (data, indices) arrays with the
    # row slices delimited by [x1_start, x1_end) / [x2_start, x2_end).
    cdef {{INPUT_DTYPE_t}} dist_csr(
        self,
        const {{INPUT_DTYPE_t}}* x1_data,
        const int32_t* x1_indices,
        const {{INPUT_DTYPE_t}}* x2_data,
        const int32_t* x2_indices,
        const int32_t x1_start,
        const int32_t x1_end,
        const int32_t x2_start,
        const int32_t x2_end,
        const intp_t size,
    ) except -1 nogil

    # Reduced-distance variant of `dist_csr`.
    cdef {{INPUT_DTYPE_t}} rdist_csr(
        self,
        const {{INPUT_DTYPE_t}}* x1_data,
        const int32_t* x1_indices,
        const {{INPUT_DTYPE_t}}* x2_data,
        const int32_t* x2_indices,
        const int32_t x1_start,
        const int32_t x1_end,
        const int32_t x2_start,
        const int32_t x2_end,
        const intp_t size,
    ) except -1 nogil

    # Fill D with pairwise distances between rows of a single dense X.
    cdef int pdist(
        self,
        const {{INPUT_DTYPE_t}}[:, ::1] X,
        {{INPUT_DTYPE_t}}[:, ::1] D,
    ) except -1

    # Fill D with distances between rows of dense X and rows of dense Y.
    cdef int cdist(
        self,
        const {{INPUT_DTYPE_t}}[:, ::1] X,
        const {{INPUT_DTYPE_t}}[:, ::1] Y,
        {{INPUT_DTYPE_t}}[:, ::1] D,
    ) except -1

    # CSR-input variant of `pdist`.
    cdef int pdist_csr(
        self,
        const {{INPUT_DTYPE_t}}* x1_data,
        const int32_t[::1] x1_indices,
        const int32_t[::1] x1_indptr,
        const intp_t size,
        {{INPUT_DTYPE_t}}[:, ::1] D,
    ) except -1 nogil

    # CSR-input variant of `cdist`.
    cdef int cdist_csr(
        self,
        const {{INPUT_DTYPE_t}}* x1_data,
        const int32_t[::1] x1_indices,
        const int32_t[::1] x1_indptr,
        const {{INPUT_DTYPE_t}}* x2_data,
        const int32_t[::1] x2_indices,
        const int32_t[::1] x2_indptr,
        const intp_t size,
        {{INPUT_DTYPE_t}}[:, ::1] D,
    ) except -1 nogil

    # Conversions between the reduced and the true distance.
    cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil

    cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil

{{endfor}}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,112 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
#
|
||||
# Pairwise Distances Reductions
|
||||
# =============================
|
||||
#
|
||||
# Overview
|
||||
# --------
|
||||
#
|
||||
# This module provides routines to compute pairwise distances between a set
|
||||
# of row vectors of X and another set of row vectors of Y and apply a
|
||||
# reduction on top. The canonical example is the brute-force computation
|
||||
# of the top k nearest neighbors by leveraging the arg-k-min reduction.
|
||||
#
|
||||
# The reduction takes a matrix of pairwise distances between rows of X and Y
|
||||
# as input and outputs an aggregate data-structure for each row of X. The
|
||||
# aggregate values are typically smaller than the number of rows in Y, hence
|
||||
# the term reduction.
|
||||
#
|
||||
# For computational reasons, the reduction are performed on the fly on chunks
|
||||
# of rows of X and Y so as to keep intermediate data-structures in CPU cache
|
||||
# and avoid unnecessary round trips of large distance arrays with the RAM
|
||||
# that would otherwise severely degrade the speed by making the overall
|
||||
# processing memory-bound.
|
||||
#
|
||||
# Finally, the routines follow a generic parallelization template to process
|
||||
# chunks of data with OpenMP loops (via Cython prange), either on rows of X
|
||||
# or rows of Y depending on their respective sizes.
|
||||
#
|
||||
#
|
||||
# Dispatching to specialized implementations
|
||||
# ------------------------------------------
|
||||
#
|
||||
# Dispatchers are meant to be used in the Python code. Under the hood, a
|
||||
# dispatcher must only define the logic to choose at runtime to the correct
|
||||
# dtype-specialized :class:`BaseDistancesReductionDispatcher` implementation based
|
||||
# on the dtype of X and of Y.
|
||||
#
|
||||
#
|
||||
# High-level diagram
|
||||
# ------------------
|
||||
#
|
||||
# Legend:
|
||||
#
|
||||
# A ---⊳ B: A inherits from B
|
||||
# A ---x B: A dispatches to B
|
||||
#
|
||||
#
|
||||
# (base dispatcher)
|
||||
# BaseDistancesReductionDispatcher
|
||||
# ∆
|
||||
# |
|
||||
# |
|
||||
# +------------------+---------------+---------------+------------------+
|
||||
# | | | |
|
||||
# | (dispatcher) (dispatcher) |
|
||||
# | ArgKmin RadiusNeighbors |
|
||||
# | | | |
|
||||
# | | | |
|
||||
# | | (float{32,64} implem.) | |
|
||||
# | | BaseDistancesReduction{32,64} | |
|
||||
# | | ∆ | |
|
||||
# (dispatcher) | | | (dispatcher)
|
||||
# ArgKminClassMode | | | RadiusNeighborsClassMode
|
||||
# | | +----------+----------+ | |
|
||||
# | | | | | |
|
||||
# | | | | | |
|
||||
# | x | | x |
|
||||
# | +-------⊳ ArgKmin{32,64} RadiusNeighbors{32,64} ⊲---+ |
|
||||
# x | | ∆ ∆ | | x
|
||||
# ArgKminClassMode{32,64} | | | | RadiusNeighborsClassMode{32,64}
|
||||
# ===================================== Specializations ============================================
|
||||
# | | | |
|
||||
# | | | |
|
||||
# x | | x
|
||||
# EuclideanArgKmin{32,64} EuclideanRadiusNeighbors{32,64}
|
||||
#
|
||||
#
|
||||
# For instance :class:`ArgKmin` dispatches to:
|
||||
# - :class:`ArgKmin64` if X and Y are two `float64` array-likes
|
||||
# - :class:`ArgKmin32` if X and Y are two `float32` array-likes
|
||||
#
|
||||
# In addition, if the metric parameter is set to "euclidean" or "sqeuclidean",
|
||||
# then some direct subclass of `BaseDistancesReduction{32,64}` further dispatches
|
||||
# to one of their subclass for euclidean-specialized implementation. For instance,
|
||||
# :class:`ArgKmin64` dispatches to :class:`EuclideanArgKmin64`.
|
||||
#
|
||||
# Those Euclidean-specialized implementations relies on optimal implementations of
|
||||
# a decomposition of the squared euclidean distance matrix into a sum of three terms
|
||||
# (see :class:`MiddleTermComputer{32,64}`).
|
||||
#
|
||||
|
||||
from sklearn.metrics._pairwise_distances_reduction._dispatcher import (
|
||||
ArgKmin,
|
||||
ArgKminClassMode,
|
||||
BaseDistancesReductionDispatcher,
|
||||
RadiusNeighbors,
|
||||
RadiusNeighborsClassMode,
|
||||
sqeuclidean_row_norms,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ArgKmin",
|
||||
"ArgKminClassMode",
|
||||
"BaseDistancesReductionDispatcher",
|
||||
"RadiusNeighbors",
|
||||
"RadiusNeighborsClassMode",
|
||||
"sqeuclidean_row_norms",
|
||||
]
|
||||
|
||||
# ruff: noqa: E501
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,31 @@
|
||||
from sklearn.utils._typedefs cimport intp_t, float64_t

{{for name_suffix in ['64', '32']}}

from sklearn.metrics._pairwise_distances_reduction._base cimport BaseDistancesReduction{{name_suffix}}
from sklearn.metrics._pairwise_distances_reduction._middle_term_computer cimport MiddleTermComputer{{name_suffix}}

cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
    """float{{name_suffix}} implementation of the ArgKmin."""

    cdef:
        # Number of closest candidates retained per row of X
        # (presumably the `k` of k-nearest-neighbors — confirm in the .pyx).
        intp_t k

        # Per-row results: indices into Y and the matching distances.
        intp_t[:, ::1] argkmin_indices
        float64_t[:, ::1] argkmin_distances

        # Used as array of pointers to private datastructures used in threads.
        float64_t ** heaps_r_distances_chunks
        intp_t ** heaps_indices_chunks


cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
    """EuclideanDistance-specialisation of ArgKmin{{name_suffix}}."""
    cdef:
        # Computes the -2 X.Y^T middle term of the squared-euclidean
        # decomposition — TODO confirm against MiddleTermComputer docs.
        MiddleTermComputer{{name_suffix}} middle_term_computer
        const float64_t[::1] X_norm_squared
        const float64_t[::1] Y_norm_squared

        # Whether to report squared distances instead of true distances.
        bint use_squared_distances

{{endfor}}
|
||||
@@ -0,0 +1,512 @@
|
||||
from libc.stdlib cimport free, malloc
|
||||
from libc.float cimport DBL_MAX
|
||||
from cython cimport final
|
||||
from cython.parallel cimport parallel, prange
|
||||
|
||||
from sklearn.utils._heap cimport heap_push
|
||||
from sklearn.utils._sorting cimport simultaneous_sort
|
||||
from sklearn.utils._typedefs cimport intp_t, float64_t
|
||||
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
from numbers import Integral
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.utils import check_array, check_scalar
|
||||
from sklearn.utils.fixes import _in_unstable_openblas_configuration
|
||||
from sklearn.utils.parallel import _get_threadpool_controller
|
||||
|
||||
{{for name_suffix in ['64', '32']}}
|
||||
|
||||
from sklearn.metrics._pairwise_distances_reduction._base cimport (
|
||||
BaseDistancesReduction{{name_suffix}},
|
||||
_sqeuclidean_row_norms{{name_suffix}},
|
||||
)
|
||||
|
||||
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
|
||||
|
||||
from sklearn.metrics._pairwise_distances_reduction._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
|
||||
|
||||
|
||||
cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
    """float{{name_suffix}} implementation of the ArgKmin."""

    @classmethod
    def compute(
        cls,
        X,
        Y,
        intp_t k,
        metric="euclidean",
        chunk_size=None,
        dict metric_kwargs=None,
        str strategy=None,
        bint return_distance=False,
    ):
        """Compute the argkmin reduction.

        This classmethod is responsible for introspecting the arguments
        values to dispatch to the most appropriate implementation of
        :class:`ArgKmin{{name_suffix}}`.

        This allows decoupling the API entirely from the implementation details
        whilst maintaining RAII: all temporarily allocated datastructures necessary
        for the concrete implementation are therefore freed when this classmethod
        returns.

        No instance should directly be created outside of this class method.
        """
        # Limit the number of threads in second level of nested parallelism for BLAS
        # to avoid threads over-subscription (in DOT or GEMM for instance).
        with _get_threadpool_controller().limit(limits=1, user_api='blas'):
            if metric in ("euclidean", "sqeuclidean"):
                # Specialized implementation of ArgKmin for the Euclidean distance
                # for the dense-dense and sparse-sparse cases.
                # This implementation computes the distances by chunk using
                # a decomposition of the Squared Euclidean distance.
                # This specialisation has an improved arithmetic intensity for both
                # the dense and sparse settings, allowing in most case speed-ups of
                # several orders of magnitude compared to the generic ArgKmin
                # implementation.
                # Note that squared norms of X and Y are precomputed in the
                # constructor of this class by issuing BLAS calls that may use
                # multithreading (depending on the BLAS implementation), hence calling
                # the constructor needs to be protected under the threadpool_limits
                # context, along with the main calls to _parallel_on_Y and
                # _parallel_on_X.
                # For more information see MiddleTermComputer.
                use_squared_distances = metric == "sqeuclidean"
                pda = EuclideanArgKmin{{name_suffix}}(
                    X=X, Y=Y, k=k,
                    use_squared_distances=use_squared_distances,
                    chunk_size=chunk_size,
                    strategy=strategy,
                    metric_kwargs=metric_kwargs,
                )
            else:
                # Fall back on a generic implementation that handles most scipy
                # metrics by computing the distances between 2 vectors at a time.
                pda = ArgKmin{{name_suffix}}(
                    datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
                    k=k,
                    chunk_size=chunk_size,
                    strategy=strategy,
                )

            # Run the reduction with the parallelization strategy chosen by the
            # base class (over chunks of Y or over chunks of X).
            if pda.execute_in_parallel_on_Y:
                pda._parallel_on_Y()
            else:
                pda._parallel_on_X()

        return pda._finalize_results(return_distance)

    def __init__(
        self,
        DatasetsPair{{name_suffix}} datasets_pair,
        chunk_size=None,
        strategy=None,
        intp_t k=1,
    ):
        super().__init__(
            datasets_pair=datasets_pair,
            chunk_size=chunk_size,
            strategy=strategy,
        )
        # k must be a positive integer.
        self.k = check_scalar(k, "k", Integral, min_val=1)

        # Allocating pointers to datastructures but not the datastructures themselves.
        # There are as many pointers as effective threads.
        #
        # For the sake of explicitness:
        #   - when parallelizing on X, the pointers of those heaps are referencing
        #     (with proper offsets) addresses of the two main heaps (see below)
        #   - when parallelizing on Y, the pointers of those heaps are referencing
        #     small heaps which are thread-wise-allocated and whose content will be
        #     merged with the main heaps'.
        self.heaps_r_distances_chunks = <float64_t **> malloc(
            sizeof(float64_t *) * self.chunks_n_threads
        )
        self.heaps_indices_chunks = <intp_t **> malloc(
            sizeof(intp_t *) * self.chunks_n_threads
        )

        # Main heaps which will be returned as results by `ArgKmin{{name_suffix}}.compute`.
        # Distances start at DBL_MAX so any real distance displaces them.
        self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=np.intp)
        self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=np.float64)

    def __dealloc__(self):
        # Free the arrays of per-thread heap pointers allocated in __init__.
        if self.heaps_indices_chunks is not NULL:
            free(self.heaps_indices_chunks)

        if self.heaps_r_distances_chunks is not NULL:
            free(self.heaps_r_distances_chunks)

    cdef void _compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        # Generic kernel: pairwise surrogate distances for one (X, Y) chunk pair.
        cdef:
            intp_t i, j
            intp_t n_samples_X = X_end - X_start
            intp_t n_samples_Y = Y_end - Y_start
            float64_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
            intp_t *heaps_indices = self.heaps_indices_chunks[thread_num]

        # Pushing the distances and their associated indices on a heap
        # which by construction will keep track of the argkmin.
        for i in range(n_samples_X):
            for j in range(n_samples_Y):
                heap_push(
                    values=heaps_r_distances + i * self.k,
                    indices=heaps_indices + i * self.k,
                    size=self.k,
                    val=self.datasets_pair.surrogate_dist(X_start + i, Y_start + j),
                    val_idx=Y_start + j,
                )

    cdef void _parallel_on_X_init_chunk(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        # As this strategy is embarrassingly parallel, we can set each
        # thread's heaps pointer to the proper position on the main heaps.
        self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0]
        self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0]

    cdef void _parallel_on_X_prange_iter_finalize(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        cdef:
            intp_t idx

        # Sorting the main heaps portion associated to `X[X_start:X_end]`
        # in ascending order w.r.t the distances.
        for idx in range(X_end - X_start):
            simultaneous_sort(
                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
                self.heaps_indices_chunks[thread_num] + idx * self.k,
                self.k
            )

    cdef void _parallel_on_Y_init(
        self,
    ) noexcept nogil:
        cdef:
            # Maximum number of scalar elements (the last chunks can be smaller)
            intp_t heaps_size = self.X_n_samples_chunk * self.k
            intp_t thread_num

        # The allocation is done in parallel for data locality purposes: this way
        # the heaps used in each threads are allocated in pages which are closer
        # to the CPU core used by the thread.
        # See comments about First Touch Placement Policy:
        # https://www.openmp.org/wp-content/uploads/openmp-webinar-vanderPas-20210318.pdf #noqa
        for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True,
                                 num_threads=self.chunks_n_threads):
            # As chunks of X are shared across threads, so must their
            # heaps. To solve this, each thread has its own heaps
            # which are then synchronised back in the main ones.
            self.heaps_r_distances_chunks[thread_num] = <float64_t *> malloc(
                heaps_size * sizeof(float64_t)
            )
            self.heaps_indices_chunks[thread_num] = <intp_t *> malloc(
                heaps_size * sizeof(intp_t)
            )

    cdef void _parallel_on_Y_parallel_init(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        # Initialising heaps (memset can't be used here):
        # DBL_MAX sentinel distances paired with -1 ("no neighbour yet") indices.
        for idx in range(self.X_n_samples_chunk * self.k):
            self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX
            self.heaps_indices_chunks[thread_num][idx] = -1

    @final
    cdef void _parallel_on_Y_synchronize(
        self,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        cdef:
            intp_t idx, jdx, thread_num
        with nogil, parallel(num_threads=self.effective_n_threads):
            # Synchronising the thread heaps with the main heaps.
            # This is done in parallel sample-wise (no need for locks).
            #
            # This might break each thread's data locality as each heap which
            # was allocated in a thread is being now being used in several threads.
            #
            # Still, this parallel pattern has shown to be efficient in practice.
            for idx in prange(X_end - X_start, schedule="static"):
                for thread_num in range(self.chunks_n_threads):
                    for jdx in range(self.k):
                        heap_push(
                            values=&self.argkmin_distances[X_start + idx, 0],
                            indices=&self.argkmin_indices[X_start + idx, 0],
                            size=self.k,
                            val=self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx],
                            val_idx=self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
                        )

    cdef void _parallel_on_Y_finalize(
        self,
    ) noexcept nogil:
        cdef:
            intp_t idx, thread_num

        with nogil, parallel(num_threads=self.chunks_n_threads):
            # Deallocating temporary datastructures
            for thread_num in prange(self.chunks_n_threads, schedule='static'):
                free(self.heaps_r_distances_chunks[thread_num])
                free(self.heaps_indices_chunks[thread_num])

            # Sorting the main heaps in ascending order w.r.t the distances.
            # This is done in parallel sample-wise (no need for locks).
            for idx in prange(self.n_samples_X, schedule='static'):
                simultaneous_sort(
                    &self.argkmin_distances[idx, 0],
                    &self.argkmin_indices[idx, 0],
                    self.k,
                )
        return

    cdef void compute_exact_distances(self) noexcept nogil:
        # Convert the surrogate (rank-preserving) distances stored in the heaps
        # into exact distances, in parallel over the samples of X.
        cdef:
            intp_t i, j
            float64_t[:, ::1] distances = self.argkmin_distances
        for i in prange(self.n_samples_X, schedule='static', nogil=True,
                        num_threads=self.effective_n_threads):
            for j in range(self.k):
                distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist(
                    # Guard against potential -0., causing nan production.
                    max(distances[i, j], 0.)
                )

    def _finalize_results(self, bint return_distance=False):
        """Return the reduction results as NumPy arrays.

        Returns (distances, indices) when `return_distance` is True,
        otherwise only the indices.
        """
        if return_distance:
            # We need to recompute distances because we relied on
            # surrogate distances for the reduction.
            self.compute_exact_distances()

            # Values are returned identically to the way `KNeighborsMixin.kneighbors`
            # returns values. This is counter-intuitive but this allows not using
            # complex adaptations where `ArgKmin.compute` is called.
            return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)

        return np.asarray(self.argkmin_indices)
|
||||
|
||||
|
||||
cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
    """EuclideanDistance-specialisation of ArgKmin{{name_suffix}}."""

    @classmethod
    def is_usable_for(cls, X, Y, metric) -> bool:
        # On top of the generic usability check, avoid OpenBLAS configurations
        # known to be numerically unstable for the underlying BLAS calls.
        return (ArgKmin{{name_suffix}}.is_usable_for(X, Y, metric) and
                not _in_unstable_openblas_configuration())

    def __init__(
        self,
        X,
        Y,
        intp_t k,
        bint use_squared_distances=False,
        chunk_size=None,
        strategy=None,
        metric_kwargs=None,
    ):
        # Only the precomputed-norm keys are meaningful here; warn about the rest.
        if (
            isinstance(metric_kwargs, dict) and
            (metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"})
        ):
            warnings.warn(
                f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
                f"usable for this case (EuclideanArgKmin64) and will be ignored.",
                UserWarning,
                stacklevel=3,
            )

        super().__init__(
            # The datasets pair here is used for exact distances computations
            datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric="euclidean"),
            chunk_size=chunk_size,
            strategy=strategy,
            k=k,
        )
        cdef:
            intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk

        # Helper computing the middle term of the squared euclidean
        # decomposition, chunk by chunk.
        self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
            X,
            Y,
            self.effective_n_threads,
            self.chunks_n_threads,
            dist_middle_terms_chunks_size,
            n_features=X.shape[1],
            chunk_size=self.chunk_size,
        )

        # Reuse caller-provided squared norms when given, else compute them.
        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
            self.Y_norm_squared = check_array(
                metric_kwargs.pop("Y_norm_squared"),
                ensure_2d=False,
                input_name="Y_norm_squared",
                dtype=np.float64,
            )
        else:
            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(
                Y,
                self.effective_n_threads,
            )

        if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
            self.X_norm_squared = check_array(
                metric_kwargs.pop("X_norm_squared"),
                ensure_2d=False,
                input_name="X_norm_squared",
                dtype=np.float64,
            )
        else:
            # Do not recompute norms if datasets are identical.
            self.X_norm_squared = (
                self.Y_norm_squared if X is Y else
                _sqeuclidean_row_norms{{name_suffix}}(
                    X,
                    self.effective_n_threads,
                )
            )

        self.use_squared_distances = use_squared_distances

    @final
    cdef void compute_exact_distances(self) noexcept nogil:
        # When squared distances were requested, the heap values are already
        # the final result; otherwise defer to the generic conversion.
        if not self.use_squared_distances:
            ArgKmin{{name_suffix}}.compute_exact_distances(self)

    @final
    cdef void _parallel_on_X_parallel_init(
        self,
        intp_t thread_num,
    ) noexcept nogil:
        # Chain the parent's hook with the middle-term computer's.
        ArgKmin{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num)
        self.middle_term_computer._parallel_on_X_parallel_init(thread_num)

    @final
    cdef void _parallel_on_X_init_chunk(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        ArgKmin{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
        self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)

    @final
    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        ArgKmin{{name_suffix}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
            self,
            X_start, X_end,
            Y_start, Y_end,
            thread_num,
        )
        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
            X_start, X_end, Y_start, Y_end, thread_num,
        )

    @final
    cdef void _parallel_on_Y_init(
        self,
    ) noexcept nogil:
        ArgKmin{{name_suffix}}._parallel_on_Y_init(self)
        self.middle_term_computer._parallel_on_Y_init()

    @final
    cdef void _parallel_on_Y_parallel_init(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        ArgKmin{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
        self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)

    @final
    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        ArgKmin{{name_suffix}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
            self,
            X_start, X_end,
            Y_start, Y_end,
            thread_num,
        )
        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
            X_start, X_end, Y_start, Y_end, thread_num
        )

    @final
    cdef void _compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        # Specialised kernel: reconstruct squared euclidean distances from the
        # precomputed row norms and the chunked middle terms.
        cdef:
            intp_t i, j
            float64_t sqeuclidean_dist_i_j
            intp_t n_X = X_end - X_start
            intp_t n_Y = Y_end - Y_start
            float64_t * dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms(
                X_start, X_end, Y_start, Y_end, thread_num
            )
            float64_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
            intp_t * heaps_indices = self.heaps_indices_chunks[thread_num]

        # Pushing the distance and their associated indices on heaps
        # which keep tracks of the argkmin.
        for i in range(n_X):
            for j in range(n_Y):
                sqeuclidean_dist_i_j = (
                    self.X_norm_squared[i + X_start] +
                    dist_middle_terms[i * n_Y + j] +
                    self.Y_norm_squared[j + Y_start]
                )

                # Catastrophic cancellation might cause -0. to be present,
                # e.g. when computing d(x_i, y_i) when X is Y.
                sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j)

                heap_push(
                    values=heaps_r_distances + i * self.k,
                    indices=heaps_indices + i * self.k,
                    size=self.k,
                    val=sqeuclidean_dist_i_j,
                    val_idx=j + Y_start,
                )
|
||||
|
||||
{{endfor}}
|
||||
Binary file not shown.
@@ -0,0 +1,182 @@
|
||||
from cython cimport floating, integral
|
||||
from cython.parallel cimport parallel, prange
|
||||
from libcpp.map cimport map as cpp_map, pair as cpp_pair
|
||||
from libc.stdlib cimport free
|
||||
|
||||
from sklearn.utils._typedefs cimport intp_t, float64_t
|
||||
from sklearn.utils.parallel import _get_threadpool_controller
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.metrics._pairwise_distances_reduction._classmode cimport WeightingStrategy
|
||||
|
||||
{{for name_suffix in ["32", "64"]}}
|
||||
from sklearn.metrics._pairwise_distances_reduction._argkmin cimport ArgKmin{{name_suffix}}
|
||||
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
|
||||
|
||||
cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}):
    """
    {{name_suffix}}bit implementation of ArgKminClassMode.
    """
    cdef:
        # Class label of each sample of Y.
        const intp_t[:] Y_labels,
        const intp_t[:] unique_Y_labels
        # Per-sample accumulated score for each class (histogram buffer).
        float64_t[:, :] class_scores
        cpp_map[intp_t, intp_t] labels_to_index
        WeightingStrategy weight_type

    @classmethod
    def compute(
        cls,
        X,
        Y,
        intp_t k,
        weights,
        Y_labels,
        unique_Y_labels,
        str metric="euclidean",
        chunk_size=None,
        dict metric_kwargs=None,
        str strategy=None,
    ):
        """Compute the argkmin reduction with Y_labels.

        This classmethod is responsible for introspecting the arguments
        values to dispatch to the most appropriate implementation of
        :class:`ArgKminClassMode{{name_suffix}}`.

        This allows decoupling the API entirely from the implementation details
        whilst maintaining RAII: all temporarily allocated datastructures necessary
        for the concrete implementation are therefore freed when this classmethod
        returns.

        No instance should directly be created outside of this class method.
        """
        # Use a generic implementation that handles most scipy
        # metrics by computing the distances between 2 vectors at a time.
        pda = ArgKminClassMode{{name_suffix}}(
            datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
            k=k,
            chunk_size=chunk_size,
            strategy=strategy,
            weights=weights,
            Y_labels=Y_labels,
            unique_Y_labels=unique_Y_labels,
        )

        # Limit the number of threads in second level of nested parallelism for BLAS
        # to avoid threads over-subscription (in GEMM for instance).
        with _get_threadpool_controller().limit(limits=1, user_api="blas"):
            if pda.execute_in_parallel_on_Y:
                pda._parallel_on_Y()
            else:
                pda._parallel_on_X()

        return pda._finalize_results()

    def __init__(
        self,
        DatasetsPair{{name_suffix}} datasets_pair,
        const intp_t[:] Y_labels,
        const intp_t[:] unique_Y_labels,
        chunk_size=None,
        strategy=None,
        intp_t k=1,
        weights=None,
    ):
        super().__init__(
            datasets_pair=datasets_pair,
            chunk_size=chunk_size,
            strategy=strategy,
            k=k,
        )

        # Map the `weights` argument to the internal weighting strategy;
        # anything other than the two known strings is treated as a callable.
        if weights == "uniform":
            self.weight_type = WeightingStrategy.uniform
        elif weights == "distance":
            self.weight_type = WeightingStrategy.distance
        else:
            self.weight_type = WeightingStrategy.callable
        self.Y_labels = Y_labels

        self.unique_Y_labels = unique_Y_labels

        cdef intp_t idx, neighbor_class_idx
        # Map from set of unique labels to their indices in `class_scores`
        # Buffer used in building a histogram for one-pass weighted mode
        self.class_scores = np.zeros(
            (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64,
        )

    def _finalize_results(self):
        """Normalize the accumulated class scores into per-sample probabilities."""
        probabilities = np.asarray(self.class_scores)
        probabilities /= probabilities.sum(axis=1, keepdims=True)
        return probabilities

    cdef inline void weighted_histogram_mode(
        self,
        intp_t sample_index,
        intp_t* indices,
        float64_t* distances,
    ) noexcept nogil:
        # Accumulate (possibly distance-weighted) votes of the k nearest
        # neighbours of one sample into `class_scores`.
        cdef:
            intp_t neighbor_idx, neighbor_class_idx, label_index, multi_output_index
            float64_t score_incr = 1
            # TODO: Implement other WeightingStrategy values
            bint use_distance_weighting = (
                self.weight_type == WeightingStrategy.distance
            )

        # Iterate through the sample k-nearest neighbours
        for neighbor_rank in range(self.k):
            # Absolute indice of the neighbor_rank-th Nearest Neighbors
            # in range [0, n_samples_Y)
            # TODO: inspect if it worth permuting this condition
            # and the for-loop above for improved branching.
            if use_distance_weighting:
                score_incr = 1 / distances[neighbor_rank]
            neighbor_idx = indices[neighbor_rank]
            neighbor_class_idx = self.Y_labels[neighbor_idx]
            self.class_scores[sample_index][neighbor_class_idx] += score_incr
        return

    cdef void _parallel_on_X_prange_iter_finalize(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        cdef:
            intp_t idx, sample_index
        for idx in range(X_end - X_start):
            # One-pass top-one weighted mode
            # Compute the absolute index in [0, n_samples_X)
            sample_index = X_start + idx
            self.weighted_histogram_mode(
                sample_index,
                &self.heaps_indices_chunks[thread_num][idx * self.k],
                &self.heaps_r_distances_chunks[thread_num][idx * self.k],
            )
        return

    cdef void _parallel_on_Y_finalize(
        self,
    ) noexcept nogil:
        cdef:
            intp_t sample_index, thread_num

        with nogil, parallel(num_threads=self.chunks_n_threads):
            # Deallocating temporary datastructures
            for thread_num in prange(self.chunks_n_threads, schedule='static'):
                free(self.heaps_r_distances_chunks[thread_num])
                free(self.heaps_indices_chunks[thread_num])

            # Accumulate the votes from the synchronised main heaps,
            # in parallel sample-wise (no need for locks).
            for sample_index in prange(self.n_samples_X, schedule='static'):
                self.weighted_histogram_mode(
                    sample_index,
                    &self.argkmin_indices[sample_index][0],
                    &self.argkmin_distances[sample_index][0],
                )
        return
|
||||
|
||||
{{endfor}}
|
||||
Binary file not shown.
@@ -0,0 +1,135 @@
|
||||
from cython cimport final
|
||||
|
||||
from sklearn.utils._typedefs cimport intp_t, float64_t
|
||||
|
||||
{{for name_suffix in ['64', '32']}}

from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}


# Parallel computation of the squared euclidean norms of the rows of X
# (dense array or sparse matrix); implemented in the matching .pyx file.
cpdef float64_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
    X,
    intp_t num_threads,
)

cdef class BaseDistancesReduction{{name_suffix}}:
    """
    Base float{{name_suffix}} implementation template of the pairwise-distances
    reduction backends.

    Implementations inherit from this template and may override the several
    defined hooks as needed in order to easily extend functionality with
    minimal redundant code.
    """

    cdef:
        readonly DatasetsPair{{name_suffix}} datasets_pair

        # The number of threads that can be used is stored in effective_n_threads.
        #
        # The number of threads to use in the parallelization strategy
        # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads:
        # for small datasets, fewer threads might be needed to loop over pair of chunks.
        #
        # Hence, the number of threads that _will_ be used for looping over chunks
        # is stored in chunks_n_threads, allowing solely using what we need.
        #
        # Thus, an invariant is:
        #
        #                 chunks_n_threads <= effective_n_threads
        #
        intp_t effective_n_threads
        intp_t chunks_n_threads

        intp_t n_samples_chunk, chunk_size

        # Chunking geometry for X and Y (last chunk may be smaller).
        intp_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
        intp_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk

        # Whether the reduction parallelizes over chunks of Y (True) or X (False).
        bint execute_in_parallel_on_Y

    @final
    cdef void _parallel_on_X(self) noexcept nogil

    @final
    cdef void _parallel_on_Y(self) noexcept nogil

    # Placeholder methods which have to be implemented

    cdef void _compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil


    # Placeholder methods which can be implemented

    cdef void compute_exact_distances(self) noexcept nogil

    cdef void _parallel_on_X_parallel_init(
        self,
        intp_t thread_num,
    ) noexcept nogil

    cdef void _parallel_on_X_init_chunk(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil

    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil

    cdef void _parallel_on_X_prange_iter_finalize(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil

    cdef void _parallel_on_X_parallel_finalize(
        self,
        intp_t thread_num
    ) noexcept nogil

    cdef void _parallel_on_Y_init(
        self,
    ) noexcept nogil

    cdef void _parallel_on_Y_parallel_init(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil

    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil

    cdef void _parallel_on_Y_synchronize(
        self,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil

    cdef void _parallel_on_Y_finalize(
        self,
    ) noexcept nogil

{{endfor}}
|
||||
@@ -0,0 +1,505 @@
|
||||
from cython cimport final
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.parallel cimport parallel, prange
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from sklearn.utils._cython_blas cimport _dot
|
||||
from sklearn.utils._openmp_helpers cimport omp_get_thread_num
|
||||
from sklearn.utils._typedefs cimport intp_t, float32_t, float64_t, int32_t
|
||||
|
||||
from sklearn import get_config
|
||||
from sklearn.utils import check_scalar
|
||||
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
|
||||
|
||||
#####################
|
||||
|
||||
cdef float64_t[::1] _sqeuclidean_row_norms64_dense(
    const float64_t[:, ::1] X,
    intp_t num_threads,
):
    """Compute the squared euclidean norm of the rows of X in parallel.

    This is faster than using np.einsum("ij, ij->i") even when using a single thread.
    """
    cdef:
        # Casting for X to remove the const qualifier is needed because APIs
        # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
        # const qualifier.
        # See: https://github.com/scipy/scipy/issues/14262
        float64_t * X_ptr = <float64_t *> &X[0, 0]
        intp_t idx = 0
        intp_t n = X.shape[0]
        intp_t d = X.shape[1]
        float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64)

    # Each row norm is an independent BLAS dot-product of the row with itself.
    for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
        squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1)

    return squared_row_norms
|
||||
|
||||
|
||||
cdef float64_t[::1] _sqeuclidean_row_norms32_dense(
    const float32_t[:, ::1] X,
    intp_t num_threads,
):
    """Compute the squared euclidean norm of the rows of X in parallel.

    This is faster than using np.einsum("ij, ij->i") even when using a single thread.

    Rows are upcast to float64 before the dot-product so the accumulation
    happens in double precision.
    """
    cdef:
        # Casting for X to remove the const qualifier is needed because APIs
        # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
        # const qualifier.
        # See: https://github.com/scipy/scipy/issues/14262
        float32_t * X_ptr = <float32_t *> &X[0, 0]
        intp_t i = 0, j = 0
        intp_t thread_num
        intp_t n = X.shape[0]
        intp_t d = X.shape[1]
        float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64)

        # To upcast the i-th row of X from float32 to float64: one scratch
        # buffer per thread to avoid sharing between threads.
        vector[vector[float64_t]] X_i_upcast = vector[vector[float64_t]](
            num_threads, vector[float64_t](d)
        )

    with nogil, parallel(num_threads=num_threads):
        thread_num = omp_get_thread_num()

        for i in prange(n, schedule='static'):
            # Upcasting the i-th row of X from float32 to float64
            for j in range(d):
                X_i_upcast[thread_num][j] = <float64_t> deref(X_ptr + i * d + j)

            squared_row_norms[i] = _dot(
                d, X_i_upcast[thread_num].data(), 1,
                X_i_upcast[thread_num].data(), 1,
            )

    return squared_row_norms
|
||||
|
||||
|
||||
cdef float64_t[::1] _sqeuclidean_row_norms64_sparse(
    const float64_t[:] X_data,
    const int32_t[:] X_indptr,
    intp_t num_threads,
):
    """Compute the squared euclidean norm of the rows of a sparse X in parallel.

    `X_data` and `X_indptr` are the CSR-style data/indptr arrays of X; only
    stored values contribute to each row's norm, so rows are processed by
    summing the squares of their stored entries.
    """
    cdef:
        # Number of rows: indptr has one extra boundary entry.
        intp_t n = X_indptr.shape[0] - 1
        int32_t X_i_ptr, idx = 0
        float64_t[::1] squared_row_norms = np.zeros(n, dtype=np.float64)

    for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
        for X_i_ptr in range(X_indptr[idx], X_indptr[idx+1]):
            squared_row_norms[idx] += X_data[X_i_ptr] * X_data[X_i_ptr]

    return squared_row_norms
|
||||
|
||||
|
||||
{{for name_suffix in ["64", "32"]}}
|
||||
|
||||
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
|
||||
|
||||
|
||||
cpdef float64_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
    X,
    intp_t num_threads,
):
    """Dispatch the squared row norms computation on X's representation.

    Parameters
    ----------
    X : ndarray or CSR sparse matrix of shape (n_samples, n_features)
        Input whose per-row squared euclidean norms are computed.
    num_threads : intp_t
        Number of OpenMP threads to use.

    Returns
    -------
    float64 contiguous memoryview of shape (n_samples,)
        The squared euclidean norm of each row of X, always in float64.
    """
    if issparse(X):
        # TODO: remove this instruction which is a cast in the float32 case
        # by moving squared row norms computations in MiddleTermComputer.
        X_data = np.asarray(X.data, dtype=np.float64)
        X_indptr = np.asarray(X.indptr, dtype=np.int32)
        return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads)
    else:
        return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads)
|
||||
|
||||
|
||||
cdef class BaseDistancesReduction{{name_suffix}}:
|
||||
"""
|
||||
Base float{{name_suffix}} implementation template of the pairwise-distances
|
||||
reduction backends.
|
||||
|
||||
Implementations inherit from this template and may override the several
|
||||
defined hooks as needed in order to easily extend functionality with
|
||||
minimal redundant code.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
DatasetsPair{{name_suffix}} datasets_pair,
|
||||
chunk_size=None,
|
||||
strategy=None,
|
||||
):
|
||||
cdef:
|
||||
intp_t X_n_full_chunks, Y_n_full_chunks
|
||||
|
||||
if chunk_size is None:
|
||||
chunk_size = get_config().get("pairwise_dist_chunk_size", 256)
|
||||
|
||||
self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20)
|
||||
|
||||
self.effective_n_threads = _openmp_effective_n_threads()
|
||||
|
||||
self.datasets_pair = datasets_pair
|
||||
|
||||
self.n_samples_X = datasets_pair.n_samples_X()
|
||||
self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size)
|
||||
X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk
|
||||
X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk
|
||||
self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0)
|
||||
|
||||
if X_n_samples_remainder != 0:
|
||||
self.X_n_samples_last_chunk = X_n_samples_remainder
|
||||
else:
|
||||
self.X_n_samples_last_chunk = self.X_n_samples_chunk
|
||||
|
||||
self.n_samples_Y = datasets_pair.n_samples_Y()
|
||||
self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size)
|
||||
Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk
|
||||
Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk
|
||||
self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0)
|
||||
|
||||
if Y_n_samples_remainder != 0:
|
||||
self.Y_n_samples_last_chunk = Y_n_samples_remainder
|
||||
else:
|
||||
self.Y_n_samples_last_chunk = self.Y_n_samples_chunk
|
||||
|
||||
if strategy is None:
|
||||
strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto')
|
||||
|
||||
if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'):
|
||||
raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', "
|
||||
f"or 'auto', but currently strategy='{self.strategy}'.")
|
||||
|
||||
if strategy == 'auto':
|
||||
# This is a simple heuristic whose constant for the
|
||||
# comparison has been chosen based on experiments.
|
||||
# parallel_on_X has less synchronization overhead than
|
||||
# parallel_on_Y and should therefore be used whenever
|
||||
# n_samples_X is large enough to not starve any of the
|
||||
# available hardware threads.
|
||||
if self.n_samples_Y < self.n_samples_X:
|
||||
# No point to even consider parallelizing on Y in this case. This
|
||||
# is in particular important to do this on machines with a large
|
||||
# number of hardware threads.
|
||||
strategy = 'parallel_on_X'
|
||||
elif 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X:
|
||||
# If Y is larger than X, but X is still large enough to allow for
|
||||
# parallelism, we might still want to favor parallelizing on X.
|
||||
strategy = 'parallel_on_X'
|
||||
else:
|
||||
strategy = 'parallel_on_Y'
|
||||
|
||||
self.execute_in_parallel_on_Y = strategy == "parallel_on_Y"
|
||||
|
||||
# Not using less, not using more.
|
||||
self.chunks_n_threads = min(
|
||||
self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks,
|
||||
self.effective_n_threads,
|
||||
)
|
||||
|
||||
    @final
    cdef void _parallel_on_X(self) noexcept nogil:
        """Perform computation and reduction in parallel on chunks of X.

        This strategy dispatches tasks statically on threads. Each task
        processes exactly only one chunk of X, computing and reducing
        distances matrices between vectors of this chunk and vectors of all
        chunks of Y, one chunk of Y at a time.

        This strategy is embarrassingly parallel with no intermediate data
        structures synchronization at all.

        Private datastructures are modified internally by threads.

        Private template methods can be implemented on subclasses to
        interact with those datastructures at various stages.
        """
        cdef:
            intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx
            intp_t thread_num

        with nogil, parallel(num_threads=self.chunks_n_threads):
            thread_num = omp_get_thread_num()

            # Allocating thread datastructures
            self._parallel_on_X_parallel_init(thread_num)

            for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
                X_start = X_chunk_idx * self.X_n_samples_chunk
                # The last chunk may hold fewer samples than the others.
                if X_chunk_idx == self.X_n_chunks - 1:
                    X_end = X_start + self.X_n_samples_last_chunk
                else:
                    X_end = X_start + self.X_n_samples_chunk

                # Reinitializing thread datastructures for the new X chunk
                self._parallel_on_X_init_chunk(thread_num, X_start, X_end)

                # Sequential pass over all Y chunks for this X chunk.
                for Y_chunk_idx in range(self.Y_n_chunks):
                    Y_start = Y_chunk_idx * self.Y_n_samples_chunk
                    if Y_chunk_idx == self.Y_n_chunks - 1:
                        Y_end = Y_start + self.Y_n_samples_last_chunk
                    else:
                        Y_end = Y_start + self.Y_n_samples_chunk

                    self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
                        X_start, X_end,
                        Y_start, Y_end,
                        thread_num,
                    )

                    self._compute_and_reduce_distances_on_chunks(
                        X_start, X_end,
                        Y_start, Y_end,
                        thread_num,
                    )

                # Adjusting thread datastructures on the full pass on Y
                self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end)

            # end: for X_chunk_idx

            # Deallocating thread datastructures
            self._parallel_on_X_parallel_finalize(thread_num)

        # end: with nogil, parallel
        return
|
||||
|
||||
    @final
    cdef void _parallel_on_Y(self) noexcept nogil:
        """Perform computation and reduction in parallel on chunks of Y.

        This strategy is a sequence of embarrassingly parallel subtasks:
        chunks of X are iterated over sequentially, and for each chunk of X,
        tasks are dispatched statically on threads. Each task processes one
        and only one chunk of Y, computing and reducing distances matrices
        between vectors of the chunk of X and vectors of the Y.

        It comes with lock-free and parallelized intermediate data structures
        that synchronize at each iteration of the sequential outer loop on X
        chunks.

        Private datastructures are modified internally by threads.

        Private template methods can be implemented on subclasses to
        interact with those datastructures at various stages.
        """
        cdef:
            intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx
            intp_t thread_num

        # Allocating datastructures shared by all threads
        self._parallel_on_Y_init()

        # Sequential outer loop over X chunks.
        for X_chunk_idx in range(self.X_n_chunks):
            X_start = X_chunk_idx * self.X_n_samples_chunk
            # The last chunk may hold fewer samples than the others.
            if X_chunk_idx == self.X_n_chunks - 1:
                X_end = X_start + self.X_n_samples_last_chunk
            else:
                X_end = X_start + self.X_n_samples_chunk

            with nogil, parallel(num_threads=self.chunks_n_threads):
                thread_num = omp_get_thread_num()

                # Initializing datastructures used in this thread
                self._parallel_on_Y_parallel_init(thread_num, X_start, X_end)

                for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
                    Y_start = Y_chunk_idx * self.Y_n_samples_chunk
                    if Y_chunk_idx == self.Y_n_chunks - 1:
                        Y_end = Y_start + self.Y_n_samples_last_chunk
                    else:
                        Y_end = Y_start + self.Y_n_samples_chunk

                    self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
                        X_start, X_end,
                        Y_start, Y_end,
                        thread_num,
                    )

                    self._compute_and_reduce_distances_on_chunks(
                        X_start, X_end,
                        Y_start, Y_end,
                        thread_num,
                    )
                # end: prange

            # end: with nogil, parallel

            # Synchronizing the thread datastructures with the main ones
            self._parallel_on_Y_synchronize(X_start, X_end)

        # end: for X_chunk_idx
        # Deallocating temporary datastructures and adjusting main datastructures
        self._parallel_on_Y_finalize()
        return
|
||||
|
||||
    # Placeholder methods which have to be implemented

    cdef void _compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        """Compute the pairwise distances on two chunks of X and Y and reduce them.

        This is THE core computational method of BaseDistancesReduction{{name_suffix}}.
        This must be implemented in subclasses agnostically from the parallelization
        strategies.
        """
        return

    def _finalize_results(self, bint return_distance):
        """Callback adapting datastructures before returning results.

        This must be implemented in subclasses.
        """
        return None

    # Placeholder methods which can be implemented

    cdef void compute_exact_distances(self) noexcept nogil:
        """Convert rank-preserving distances to exact distances or recompute them."""
        return

    cdef void _parallel_on_X_parallel_init(
        self,
        intp_t thread_num,
    ) noexcept nogil:
        """Allocate datastructures used in a thread given its number."""
        return

    cdef void _parallel_on_X_init_chunk(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        """Initialize datastructures used in a thread given its number.

        In this method, EuclideanDistance specialisations of subclass of
        BaseDistancesReduction _must_ call:

        self.middle_term_computer._parallel_on_X_init_chunk(
            thread_num, X_start, X_end,
        )

        to ensure the proper upcast of X[X_start:X_end] to float64 prior
        to the reduction with float64 accumulator buffers when X.dtype is
        float32.
        """
        return

    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        """Initialize datastructures just before the _compute_and_reduce_distances_on_chunks.

        In this method, EuclideanDistance specialisations of subclass of
        BaseDistancesReduction _must_ call:

        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
            X_start, X_end, Y_start, Y_end, thread_num,
        )

        to ensure the proper upcast of Y[Y_start:Y_end] to float64 prior
        to the reduction with float64 accumulator buffers when Y.dtype is
        float32.
        """
        return

    cdef void _parallel_on_X_prange_iter_finalize(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        """Interact with datastructures after a reduction on chunks."""
        return

    cdef void _parallel_on_X_parallel_finalize(
        self,
        intp_t thread_num
    ) noexcept nogil:
        """Interact with datastructures after executing all the reductions."""
        return

    cdef void _parallel_on_Y_init(
        self,
    ) noexcept nogil:
        """Allocate datastructures used in all threads."""
        return

    cdef void _parallel_on_Y_parallel_init(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        """Initialize datastructures used in a thread given its number.

        In this method, EuclideanDistance specialisations of subclass of
        BaseDistancesReduction _must_ call:

        self.middle_term_computer._parallel_on_Y_parallel_init(
            thread_num, X_start, X_end,
        )

        to ensure the proper upcast of X[X_start:X_end] to float64 prior
        to the reduction with float64 accumulator buffers when X.dtype is
        float32.
        """
        return

    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        """Initialize datastructures just before the _compute_and_reduce_distances_on_chunks.

        In this method, EuclideanDistance specialisations of subclass of
        BaseDistancesReduction _must_ call:

        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
            X_start, X_end, Y_start, Y_end, thread_num,
        )

        to ensure the proper upcast of Y[Y_start:Y_end] to float64 prior
        to the reduction with float64 accumulator buffers when Y.dtype is
        float32.
        """
        return

    cdef void _parallel_on_Y_synchronize(
        self,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        """Update thread datastructures before leaving a parallel region."""
        return

    cdef void _parallel_on_Y_finalize(
        self,
    ) noexcept nogil:
        """Update datastructures after executing all the reductions."""
        return
|
||||
|
||||
{{endfor}}
|
||||
@@ -0,0 +1,5 @@
|
||||
cpdef enum WeightingStrategy:
    # Strategy used to weight neighbors' contributions in class-mode
    # reductions. Only `uniform` is currently handled; see the TODO below.
    uniform = 0
    # TODO: Implement the following options in weighted_histogram_mode
    distance = 1
    callable = 2
|
||||
Binary file not shown.
@@ -0,0 +1,67 @@
|
||||
{{py:
|
||||
|
||||
implementation_specific_values = [
|
||||
# Values are the following ones:
|
||||
#
|
||||
# name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
|
||||
('64', 'DistanceMetric64', 'float64_t'),
|
||||
('32', 'DistanceMetric32', 'float32_t')
|
||||
]
|
||||
|
||||
}}
|
||||
from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t
|
||||
from sklearn.metrics._dist_metrics cimport DistanceMetric64, DistanceMetric32, DistanceMetric
|
||||
|
||||
{{for name_suffix, DistanceMetric, INPUT_DTYPE_t in implementation_specific_values}}


cdef class DatasetsPair{{name_suffix}}:
    # Abstract wrapper over a pair of datasets (X, Y); exposes the distance
    # between one row of X and one row of Y given their indices.
    cdef:
        {{DistanceMetric}} distance_metric
        intp_t n_features

    cdef intp_t n_samples_X(self) noexcept nogil

    cdef intp_t n_samples_Y(self) noexcept nogil

    cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil

    cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil


cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
    # Both X and Y are C-contiguous dense arrays.
    cdef:
        const {{INPUT_DTYPE_t}}[:, ::1] X
        const {{INPUT_DTYPE_t}}[:, ::1] Y


cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
    # Both X and Y are CSR matrices, held as their (data, indices, indptr).
    cdef:
        const {{INPUT_DTYPE_t}}[:] X_data
        const int32_t[::1] X_indices
        const int32_t[::1] X_indptr

        const {{INPUT_DTYPE_t}}[:] Y_data
        const int32_t[::1] Y_indices
        const int32_t[::1] Y_indptr


cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
    # X is a CSR matrix; Y is a dense array accessed through an implicit CSR
    # representation (no Y_indptr is materialized — see the .pyx for details).
    cdef:
        const {{INPUT_DTYPE_t}}[:] X_data
        const int32_t[::1] X_indices
        const int32_t[::1] X_indptr

        const {{INPUT_DTYPE_t}}[:] Y_data
        const int32_t[::1] Y_indices
        intp_t n_Y


cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
    cdef:
        # As distance metrics are commutative, we can simply rely
        # on the implementation of SparseDenseDatasetsPair and
        # swap arguments.
        DatasetsPair{{name_suffix}} datasets_pair

{{endfor}}
|
||||
@@ -0,0 +1,406 @@
|
||||
import copy
|
||||
|
||||
{{py:
|
||||
|
||||
implementation_specific_values = [
|
||||
# Values are the following ones:
|
||||
#
|
||||
# name_suffix, INPUT_DTYPE_t, INPUT_DTYPE
|
||||
('64', 'DistanceMetric64', 'float64_t', 'np.float64'),
|
||||
('32', 'DistanceMetric32', 'float32_t', 'np.float32')
|
||||
]
|
||||
|
||||
}}
|
||||
import numpy as np
|
||||
|
||||
from cython cimport final
|
||||
|
||||
from sklearn.utils._typedefs cimport float64_t, float32_t, intp_t
|
||||
|
||||
from scipy.sparse import issparse, csr_matrix
|
||||
|
||||
{{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
|
||||
|
||||
cdef class DatasetsPair{{name_suffix}}:
    """Abstract class which wraps a pair of datasets (X, Y).

    This class allows computing distances between a single pair of rows of
    of X and Y at a time given the pair of their indices (i, j). This class is
    specialized for each metric thanks to the :func:`get_for` factory classmethod.

    The handling of parallelization over chunks to compute the distances
    and aggregation for several rows at a time is done in dedicated
    subclasses of :class:`BaseDistancesReductionDispatcher` that in-turn rely on
    subclasses of :class:`DatasetsPair` for each pair of rows in the data. The
    goal is to make it possible to decouple the generic parallelization and
    aggregation logic from metric-specific computation as much as possible.

    X and Y can be stored as C-contiguous np.ndarrays or CSR matrices
    in subclasses.

    This class avoids the overhead of dispatching distance computations
    to :class:`sklearn.metrics.DistanceMetric` based on the physical
    representation of the vectors (sparse vs. dense). It makes use of
    cython.final to remove the overhead of dispatching method calls.

    Parameters
    ----------
    distance_metric: {{DistanceMetric}}
        The distance metric responsible for computing distances
        between two vectors of (X, Y).
    """

    @classmethod
    def get_for(
        cls,
        X,
        Y,
        metric="euclidean",
        dict metric_kwargs=None,
    ) -> DatasetsPair{{name_suffix}}:
        """Return the DatasetsPair implementation for the given arguments.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
            Input data.
            If provided as an ndarray, it must be C-contiguous.
            If provided as a sparse matrix, it must be in CSR format.

        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
            Input data.
            If provided as an ndarray, it must be C-contiguous.
            If provided as a sparse matrix, it must be in CSR format.

        metric : str or DistanceMetric object, default='euclidean'
            The distance metric to compute between rows of X and Y.
            The default metric is a fast implementation of the Euclidean
            metric. For a list of available metrics, see the documentation
            of :class:`~sklearn.metrics.DistanceMetric`.

        metric_kwargs : dict, default=None
            Keyword arguments to pass to specified metric function.

        Returns
        -------
        datasets_pair: DatasetsPair{{name_suffix}}
            The suited DatasetsPair{{name_suffix}} implementation.
        """
        # X_norm_squared and Y_norm_squared might be propagated
        # down to DatasetsPairs via metrics_kwargs when the Euclidean
        # specialisations can't be used.
        # To prevent X_norm_squared and Y_norm_squared to be passed
        # down to DistanceMetrics (whose constructors would raise
        # a RuntimeError), we pop them here.
        if metric_kwargs is not None:
            # Copying metric_kwargs not to pop "X_norm_squared"
            # and "Y_norm_squared" where they are used
            metric_kwargs = copy.copy(metric_kwargs)
            metric_kwargs.pop("X_norm_squared", None)
            metric_kwargs.pop("Y_norm_squared", None)
        cdef:
            {{DistanceMetric}} distance_metric = DistanceMetric.get_metric(
                metric,
                {{INPUT_DTYPE}},
                **(metric_kwargs or {})
            )

        # Metric-specific checks that do not replace nor duplicate `check_array`.
        distance_metric._validate_data(X)
        distance_metric._validate_data(Y)

        X_is_sparse = issparse(X)
        Y_is_sparse = issparse(Y)

        # Dispatch on the four dense/sparse combinations of (X, Y).
        if not X_is_sparse and not Y_is_sparse:
            return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)

        if X_is_sparse and Y_is_sparse:
            return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric)

        if X_is_sparse and not Y_is_sparse:
            return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)

        return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric)

    @classmethod
    def unpack_csr_matrix(cls, X: csr_matrix):
        """Ensure that the CSR matrix is indexed with np.int32."""
        X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}})
        X_indices = np.asarray(X.indices, dtype=np.int32)
        X_indptr = np.asarray(X.indptr, dtype=np.int32)
        return X_data, X_indices, X_indptr

    def __init__(self, {{DistanceMetric}} distance_metric, intp_t n_features):
        self.distance_metric = distance_metric
        self.n_features = n_features

    cdef intp_t n_samples_X(self) noexcept nogil:
        """Number of samples in X."""
        # This is an abstract method.
        # This _must_ always be overwritten in subclasses.
        # TODO: add "with gil: raise" here when supporting Cython 3.0
        return -999

    cdef intp_t n_samples_Y(self) noexcept nogil:
        """Number of samples in Y."""
        # This is an abstract method.
        # This _must_ always be overwritten in subclasses.
        # TODO: add "with gil: raise" here when supporting Cython 3.0
        return -999

    cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
        # Default surrogate: the true distance itself. Subclasses may return
        # a cheaper rank-preserving quantity (e.g. the squared distance).
        return self.dist(i, j)

    cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:
        # This is an abstract method.
        # This _must_ always be overwritten in subclasses.
        # TODO: add "with gil: raise" here when supporting Cython 3.0
        return -1
|
||||
|
||||
@final
cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
    """Compute distances between row vectors of two arrays.

    Parameters
    ----------
    X: ndarray of shape (n_samples_X, n_features)
        Rows represent vectors. Must be C-contiguous.

    Y: ndarray of shape (n_samples_Y, n_features)
        Rows represent vectors. Must be C-contiguous.

    distance_metric: DistanceMetric
        The distance metric responsible for computing distances
        between two row vectors of (X, Y).
    """

    def __init__(
        self,
        const {{INPUT_DTYPE_t}}[:, ::1] X,
        const {{INPUT_DTYPE_t}}[:, ::1] Y,
        {{DistanceMetric}} distance_metric,
    ):
        super().__init__(distance_metric, n_features=X.shape[1])
        # Arrays have already been checked
        self.X = X
        self.Y = Y

    @final
    cdef intp_t n_samples_X(self) noexcept nogil:
        return self.X.shape[0]

    @final
    cdef intp_t n_samples_Y(self) noexcept nogil:
        return self.Y.shape[0]

    @final
    cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
        # rdist is the metric's rank-preserving surrogate distance.
        return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.n_features)

    @final
    cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:
        return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.n_features)
|
||||
|
||||
|
||||
@final
cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
    """Compute distances between vectors of two CSR matrices.

    Parameters
    ----------
    X: sparse matrix of shape (n_samples_X, n_features)
        Rows represent vectors. Must be in CSR format.

    Y: sparse matrix of shape (n_samples_Y, n_features)
        Rows represent vectors. Must be in CSR format.

    distance_metric: DistanceMetric
        The distance metric responsible for computing distances
        between two vectors of (X, Y).
    """

    def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
        super().__init__(distance_metric, n_features=X.shape[1])

        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)

    @final
    cdef intp_t n_samples_X(self) noexcept nogil:
        # A CSR matrix has one index pointer per row plus a trailing sentinel.
        return self.X_indptr.shape[0] - 1

    @final
    cdef intp_t n_samples_Y(self) noexcept nogil:
        return self.Y_indptr.shape[0] - 1

    @final
    cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
        return self.distance_metric.rdist_csr(
            x1_data=&self.X_data[0],
            x1_indices=&self.X_indices[0],
            x2_data=&self.Y_data[0],
            x2_indices=&self.Y_indices[0],
            x1_start=self.X_indptr[i],
            x1_end=self.X_indptr[i + 1],
            x2_start=self.Y_indptr[j],
            x2_end=self.Y_indptr[j + 1],
            size=self.n_features,
        )

    @final
    cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:
        return self.distance_metric.dist_csr(
            x1_data=&self.X_data[0],
            x1_indices=&self.X_indices[0],
            x2_data=&self.Y_data[0],
            x2_indices=&self.Y_indices[0],
            x1_start=self.X_indptr[i],
            x1_end=self.X_indptr[i + 1],
            x2_start=self.Y_indptr[j],
            x2_end=self.Y_indptr[j + 1],
            size=self.n_features,
        )
|
||||
|
||||
|
||||
@final
cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
    """Compute distances between vectors of a CSR matrix and a dense array.

    Parameters
    ----------
    X: sparse matrix of shape (n_samples_X, n_features)
        Rows represent vectors. Must be in CSR format.

    Y: ndarray of shape (n_samples_Y, n_features)
        Rows represent vectors. Must be C-contiguous.

    distance_metric: DistanceMetric
        The distance metric responsible for computing distances
        between two vectors of (X, Y).
    """

    def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
        super().__init__(distance_metric, n_features=X.shape[1])

        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)

        # We support the sparse-dense case by using the sparse-sparse interfaces
        # of `DistanceMetric` (namely `DistanceMetric.{dist_csr,rdist_csr}`) to
        # avoid introducing a new complex set of interfaces. In this case, we
        # need to convert `Y` (the dense array) into a CSR matrix.
        #
        # Here we motive using another simpler CSR representation to use for `Y`.
        #
        # If we were to use the usual CSR representation for `Y`, storing all
        # the columns indices in `indices` would have required allocating an
        # array of n_samples × n_features elements with repeated contiguous
        # integers from 0 to n_features - 1. This would have been very wasteful
        # from a memory point of view. This alternative representation just uses
        # the necessary amount of information needed and only necessitates
        # shifting the address of `data` before calling the CSR × CSR routines.
        #
        # In this representation:
        #
        #  - the `data` array is the original dense array, `Y`, whose first
        #    element's address is shifted before calling the CSR × CSR routine
        #
        #  - the `indices` array is a single row of `n_features` elements:
        #
        #                      [0, 1, ..., n_features-1]
        #
        #  - the `indptr` array is not materialised as the indices pointers'
        #    offset is constant (the offset equals `n_features`). Moreover, as
        #    `data` is shifted, constant `start` and `end` indices pointers
        #    respectively equalling 0 and n_features are used.

        # Y array already has been checked here
        self.n_Y = Y.shape[0]
        self.Y_data = np.ravel(Y)
        self.Y_indices = np.arange(self.n_features, dtype=np.int32)

    @final
    cdef intp_t n_samples_X(self) noexcept nogil:
        return self.X_indptr.shape[0] - 1

    @final
    cdef intp_t n_samples_Y(self) noexcept nogil:
        return self.n_Y

    @final
    cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
        return self.distance_metric.rdist_csr(
            x1_data=&self.X_data[0],
            x1_indices=&self.X_indices[0],
            # Increment the data pointer such that x2_start=0 is aligned with the
            # j-th row
            x2_data=&self.Y_data[0] + j * self.n_features,
            x2_indices=&self.Y_indices[0],
            x1_start=self.X_indptr[i],
            x1_end=self.X_indptr[i + 1],
            x2_start=0,
            x2_end=self.n_features,
            size=self.n_features,
        )

    @final
    cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:

        return self.distance_metric.dist_csr(
            x1_data=&self.X_data[0],
            x1_indices=&self.X_indices[0],
            # Increment the data pointer such that x2_start=0 is aligned with the
            # j-th row
            x2_data=&self.Y_data[0] + j * self.n_features,
            x2_indices=&self.Y_indices[0],
            x1_start=self.X_indptr[i],
            x1_end=self.X_indptr[i + 1],
            x2_start=0,
            x2_end=self.n_features,
            size=self.n_features,
        )
|
||||
|
||||
|
||||
@final
cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
    """Compute distances between vectors of a dense array and a CSR matrix.

    All work is delegated to a SparseDenseDatasetsPair built with (Y, X):
    distance metrics are commutative, so dist(i, j) here is dist(j, i) there.

    Parameters
    ----------
    X: ndarray of shape (n_samples_X, n_features)
        Rows represent vectors. Must be C-contiguous.

    Y: sparse matrix of shape (n_samples_Y, n_features)
        Rows represent vectors. Must be in CSR format.

    distance_metric: DistanceMetric
        The distance metric responsible for computing distances
        between two vectors of (X, Y).
    """

    def __init__(self, X, Y, {{DistanceMetric}} distance_metric):
        super().__init__(distance_metric, n_features=X.shape[1])
        # Swapping arguments on the constructor
        self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric)

    @final
    cdef intp_t n_samples_X(self) noexcept nogil:
        # Swapping interface
        return self.datasets_pair.n_samples_Y()

    @final
    cdef intp_t n_samples_Y(self) noexcept nogil:
        # Swapping interface
        return self.datasets_pair.n_samples_X()

    @final
    cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil:
        # Swapping arguments on the same interface
        return self.datasets_pair.surrogate_dist(j, i)

    @final
    cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil:
        # Swapping arguments on the same interface
        return self.datasets_pair.dist(j, i)
|
||||
|
||||
{{endfor}}
|
||||
@@ -0,0 +1,763 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from abc import abstractmethod
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from sklearn import get_config
|
||||
from sklearn.metrics._dist_metrics import BOOL_METRICS, METRIC_MAPPING64, DistanceMetric
|
||||
from sklearn.metrics._pairwise_distances_reduction._argkmin import ArgKmin32, ArgKmin64
|
||||
from sklearn.metrics._pairwise_distances_reduction._argkmin_classmode import (
|
||||
ArgKminClassMode32,
|
||||
ArgKminClassMode64,
|
||||
)
|
||||
from sklearn.metrics._pairwise_distances_reduction._base import (
|
||||
_sqeuclidean_row_norms32,
|
||||
_sqeuclidean_row_norms64,
|
||||
)
|
||||
from sklearn.metrics._pairwise_distances_reduction._radius_neighbors import (
|
||||
RadiusNeighbors32,
|
||||
RadiusNeighbors64,
|
||||
)
|
||||
from sklearn.metrics._pairwise_distances_reduction._radius_neighbors_classmode import (
|
||||
RadiusNeighborsClassMode32,
|
||||
RadiusNeighborsClassMode64,
|
||||
)
|
||||
|
||||
|
||||
def sqeuclidean_row_norms(X, num_threads):
    """Compute the squared euclidean norm of the rows of X in parallel.

    Parameters
    ----------
    X : ndarray or CSR matrix of shape (n_samples, n_features)
        Input data. Must be c-contiguous.

    num_threads : int
        The number of OpenMP threads to use.

    Returns
    -------
    sqeuclidean_row_norms : ndarray of shape (n_samples,)
        Arrays containing the squared euclidean norm of each row of X.

    Raises
    ------
    ValueError
        If `X.dtype` is neither float64 nor float32.
    """
    # Dispatch on dtype to the matching Cython specialization.
    dtype_specialized_impls = {
        np.dtype(np.float64): _sqeuclidean_row_norms64,
        np.dtype(np.float32): _sqeuclidean_row_norms32,
    }
    impl = dtype_specialized_impls.get(np.dtype(X.dtype))
    if impl is None:
        raise ValueError(
            "Only float64 or float32 datasets are supported at this time, "
            f"got: X.dtype={X.dtype}."
        )
    return np.asarray(impl(X, num_threads))
|
||||
|
||||
|
||||
class BaseDistancesReductionDispatcher:
    """Abstract base dispatcher for pairwise distance computation & reduction.

    Every concrete dispatcher extending this base class must implement the
    :meth:`compute` classmethod.
    """

    @classmethod
    def valid_metrics(cls) -> List[str]:
        unsupported = {
            # PyFunc cannot be supported because it necessitates interacting with
            # the CPython interpreter to call user defined functions.
            "pyfunc",
            "mahalanobis",  # is numerically unstable
            # In order to support discrete distance metrics, we need to have a
            # stable simultaneous sort which preserves the order of the indices
            # because there generally is a lot of occurrences for a given values
            # of distances in this case.
            # TODO: implement a stable simultaneous_sort.
            "hamming",
            *BOOL_METRICS,
        }
        supported = {"sqeuclidean"} | set(METRIC_MAPPING64.keys())
        return sorted(supported - unsupported)

    @classmethod
    def is_usable_for(cls, X, Y, metric) -> bool:
        """Return True if the dispatcher can be used for the
        given parameters.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
            Input data.

        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
            Input data.

        metric : str, default='euclidean'
            The distance metric to use.
            For a list of available metrics, see the documentation of
            :class:`~sklearn.metrics.DistanceMetric`.

        Returns
        -------
        True if the dispatcher can be used, else False.
        """
        # FIXME: the current Cython implementation is too slow for a large number of
        # features. We temporarily disable it to fallback on SciPy's implementation.
        # See: https://github.com/scikit-learn/scikit-learn/issues/28191
        if (
            issparse(X)
            and issparse(Y)
            and isinstance(metric, str)
            and "euclidean" in metric
        ):
            return False

        def c_ordered_dense(array):
            # A dense input must be a C-contiguous ndarray.
            return hasattr(array, "flags") and getattr(
                array.flags, "c_contiguous", False
            )

        def supported_csr(array):
            # TODO: support CSR matrices without non-zeros elements
            # TODO: support CSR matrices with int64 indices and indptr
            # See: https://github.com/scikit-learn/scikit-learn/issues/23653
            return (
                issparse(array)
                and array.format == "csr"
                and array.nnz > 0
                and array.indices.dtype == array.indptr.dtype == np.int32
            )

        return (
            get_config().get("enable_cython_pairwise_dist", True)
            and (c_ordered_dense(X) or supported_csr(X))
            and (c_ordered_dense(Y) or supported_csr(Y))
            and X.dtype == Y.dtype
            and X.dtype in (np.float32, np.float64)
            and (metric in cls.valid_metrics() or isinstance(metric, DistanceMetric))
        )

    @classmethod
    @abstractmethod
    def compute(
        cls,
        X,
        Y,
        **kwargs,
    ):
        """Compute the reduction.

        Parameters
        ----------
        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
            Input data.

        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
            Input data.

        **kwargs : additional parameters for the reduction

        Notes
        -----
        This method is an abstract class method: it has to be implemented
        for all subclasses.
        """
|
||||
|
||||
|
||||
class ArgKmin(BaseDistancesReductionDispatcher):
    """Dispatcher computing, for every row of X, its k closest rows in Y.

    This is the building block of bruteforce k-nearest neighbors queries:
    for each row vector X[i], the indices (and optionally the distances) of
    the k rows of Y with the smallest distances to X[i] are returned.

    This class is not meant to be instantiated; only its :meth:`compute`
    classmethod should be used, which handles allocation and deallocation
    consistently.
    """

    @classmethod
    def compute(
        cls,
        X,
        Y,
        k,
        metric="euclidean",
        chunk_size=None,
        metric_kwargs=None,
        strategy=None,
        return_distance=False,
    ):
        """Compute the argkmin reduction.

        Parameters
        ----------
        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
            Input data.

        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
            Input data.

        k : int
            The k for the argkmin reduction.

        metric : str, default='euclidean'
            The distance metric to use for argkmin.
            For a list of available metrics, see the documentation of
            :class:`~sklearn.metrics.DistanceMetric`.

        chunk_size : int, default=None
            The number of vectors per chunk. If None (default) looks-up in
            scikit-learn configuration for `pairwise_dist_chunk_size`,
            and use 256 if it is not set.

        metric_kwargs : dict, default=None
            Keyword arguments to pass to specified metric function.

        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
            The chunking strategy defining which dataset parallelization are
            made on. Both strategies compute with two nested loops, on chunks
            of X and chunks of Y respectively; they differ on which loop runs
            in parallel via the Cython `prange` construct:

            - 'parallel_on_X' dispatches chunks of X uniformly on threads;
              each thread then iterates on all the chunks of Y. This is
              embarrassingly parallel and needs no datastructure
              synchronisation.

            - 'parallel_on_Y' dispatches chunks of Y uniformly on threads;
              each thread processes all the chunks of X in turn, with
              intermediate datastructure synchronisation at each iteration
              of the sequential outer loop on X chunks.

            - 'auto' relies on a simple heuristic: when `X.shape[0]` is large
              enough, 'parallel_on_X' is usually the most efficient strategy;
              when `X.shape[0]` is small but `Y.shape[0]` is large,
              'parallel_on_Y' brings more opportunity for parallelism and is
              therefore more efficient.

            - None (default) looks-up in scikit-learn configuration for
              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not
              set.

        return_distance : boolean, default=False
            Return distances between each X vector and its
            argkmin if set to True.

        Returns
        -------
        If return_distance=False:
          - argkmin_indices : ndarray of shape (n_samples_X, k)
            Indices of the argkmin for each vector in X.

        If return_distance=True:
          - argkmin_distances : ndarray of shape (n_samples_X, k)
            Distances to the argkmin for each vector in X.
          - argkmin_indices : ndarray of shape (n_samples_X, k)
            Indices of the argkmin for each vector in X.

        Notes
        -----
        This classmethod inspects the dtypes of X and Y to dispatch to the
        dtype-specialized implementation of :class:`ArgKmin`. This keeps the
        API decoupled from the implementation details whilst maintaining
        RAII: all temporarily allocated datastructures necessary for the
        concrete implementation are freed when this classmethod returns.
        """
        # Select the dtype-specialized backend; both operands must share the
        # same floating point dtype.
        if X.dtype != Y.dtype:
            implementation = None
        elif X.dtype == np.float64:
            implementation = ArgKmin64
        elif X.dtype == np.float32:
            implementation = ArgKmin32
        else:
            implementation = None

        if implementation is None:
            raise ValueError(
                "Only float64 or float32 datasets pairs are supported at this time, "
                f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
            )

        return implementation.compute(
            X=X,
            Y=Y,
            k=k,
            metric=metric,
            chunk_size=chunk_size,
            metric_kwargs=metric_kwargs,
            strategy=strategy,
            return_distance=return_distance,
        )
|
||||
|
||||
|
||||
class RadiusNeighbors(BaseDistancesReductionDispatcher):
    """Dispatcher computing radius-based neighbors for two sets of vectors.

    For each row-vector X[i] of the queries X, find all the indices j of
    row-vectors in Y such that:

      dist(X[i], Y[j]) <= radius

    where `dist` is determined by the `metric` and `metric_kwargs`
    parameters.

    This class is not meant to be instantiated; only its :meth:`compute`
    classmethod should be used, which handles allocation and deallocation
    consistently.
    """

    @classmethod
    def compute(
        cls,
        X,
        Y,
        radius,
        metric="euclidean",
        chunk_size=None,
        metric_kwargs=None,
        strategy=None,
        return_distance=False,
        sort_results=False,
    ):
        """Return the results of the reduction for the given arguments.

        Parameters
        ----------
        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
            Input data.

        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
            Input data.

        radius : float
            The radius defining the neighborhood.

        metric : str, default='euclidean'
            The distance metric to use.
            For a list of available metrics, see the documentation of
            :class:`~sklearn.metrics.DistanceMetric`.

        chunk_size : int, default=None
            The number of vectors per chunk. If None (default) looks-up in
            scikit-learn configuration for `pairwise_dist_chunk_size`,
            and use 256 if it is not set.

        metric_kwargs : dict, default=None
            Keyword arguments to pass to specified metric function.

        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
            The chunking strategy defining which dataset parallelization are
            made on. Both strategies compute with two nested loops, on chunks
            of X and chunks of Y respectively; they differ on which loop runs
            in parallel via the Cython `prange` construct:

            - 'parallel_on_X' dispatches chunks of X uniformly on threads;
              each thread then iterates on all the chunks of Y. This is
              embarrassingly parallel and needs no datastructure
              synchronisation.

            - 'parallel_on_Y' dispatches chunks of Y uniformly on threads;
              each thread processes all the chunks of X in turn, with
              intermediate datastructure synchronisation at each iteration
              of the sequential outer loop on X chunks.

            - 'auto' relies on a simple heuristic: when `X.shape[0]` is large
              enough, 'parallel_on_X' is usually the most efficient strategy;
              when `X.shape[0]` is small but `Y.shape[0]` is large,
              'parallel_on_Y' brings more opportunity for parallelism and is
              therefore more efficient despite the synchronization step at
              each iteration of the outer loop on chunks of `X`.

            - None (default) looks-up in scikit-learn configuration for
              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not
              set.

        return_distance : boolean, default=False
            Return distances between each X vector and its neighbors if set
            to True.

        sort_results : boolean, default=False
            Sort results with respect to distances between each X vector and
            its neighbors if set to True.

        Returns
        -------
        If return_distance=False:
          - neighbors_indices : ndarray of n_samples_X ndarray
            Indices of the neighbors for each vector in X.

        If return_distance=True:
          - neighbors_indices : ndarray of n_samples_X ndarray
            Indices of the neighbors for each vector in X.
          - neighbors_distances : ndarray of n_samples_X ndarray
            Distances to the neighbors for each vector in X.

        Notes
        -----
        This classmethod inspects the dtypes of X and Y to dispatch to the
        dtype-specialized implementation of :class:`RadiusNeighbors`. This
        keeps the API decoupled from the implementation details whilst
        maintaining RAII: all temporarily allocated datastructures necessary
        for the concrete implementation are freed when this classmethod
        returns.
        """
        # Select the dtype-specialized backend; both operands must share the
        # same floating point dtype.
        if X.dtype != Y.dtype:
            implementation = None
        elif X.dtype == np.float64:
            implementation = RadiusNeighbors64
        elif X.dtype == np.float32:
            implementation = RadiusNeighbors32
        else:
            implementation = None

        if implementation is None:
            raise ValueError(
                "Only float64 or float32 datasets pairs are supported at this time, "
                f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
            )

        return implementation.compute(
            X=X,
            Y=Y,
            radius=radius,
            metric=metric,
            chunk_size=chunk_size,
            metric_kwargs=metric_kwargs,
            strategy=strategy,
            sort_results=sort_results,
            return_distance=return_distance,
        )
|
||||
|
||||
|
||||
class ArgKminClassMode(BaseDistancesReductionDispatcher):
    """Compute the argkmin of row vectors of X on the ones of Y with labels.

    For each row vector of X, computes the indices of k first the rows
    vectors of Y with the smallest distances. Computes weighted mode of labels.

    ArgKminClassMode is typically used to perform bruteforce k-nearest neighbors
    queries when the weighted mode of the labels for the k-nearest neighbors
    are required, such as in `predict` methods.

    This class is not meant to be instantiated, one should only use
    its :meth:`compute` classmethod which handles allocation and
    deallocation consistently.
    """

    @classmethod
    def valid_metrics(cls) -> List[str]:
        excluded = {
            # Euclidean is technically usable for ArgKminClassMode
            # but its current implementation would not be competitive.
            # TODO: implement Euclidean specialization using GEMM.
            "euclidean",
            "sqeuclidean",
        }
        # Return a sorted list rather than `list(set(...))`: iterating a set
        # of strings yields a run-dependent order (string hashing is
        # randomized per process), and the sibling dispatchers
        # (BaseDistancesReductionDispatcher, RadiusNeighborsClassMode)
        # return sorted metric names.
        return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded)

    @classmethod
    def compute(
        cls,
        X,
        Y,
        k,
        weights,
        Y_labels,
        unique_Y_labels,
        metric="euclidean",
        chunk_size=None,
        metric_kwargs=None,
        strategy=None,
    ):
        """Compute the argkmin reduction.

        Parameters
        ----------
        X : ndarray of shape (n_samples_X, n_features)
            The input array to be labelled.

        Y : ndarray of shape (n_samples_Y, n_features)
            The input array whose class membership are provided through the
            `Y_labels` parameter.

        k : int
            The number of nearest neighbors to consider.

        weights : ndarray
            The weights applied over the `Y_labels` of `Y` when computing the
            weighted mode of the labels.

        Y_labels : ndarray
            An array containing the index of the class membership of the
            associated samples in `Y`. This is used in labeling `X`.

        unique_Y_labels : ndarray
            An array containing all unique indices contained in the
            corresponding `Y_labels` array.

        metric : str, default='euclidean'
            The distance metric to use. For a list of available metrics, see
            the documentation of :class:`~sklearn.metrics.DistanceMetric`.
            Currently does not support `'precomputed'`.

        chunk_size : int, default=None
            The number of vectors per chunk. If None (default) looks-up in
            scikit-learn configuration for `pairwise_dist_chunk_size`,
            and use 256 if it is not set.

        metric_kwargs : dict, default=None
            Keyword arguments to pass to specified metric function.

        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
            The chunking strategy defining which dataset parallelization are
            made on. Both strategies compute with two nested loops, on chunks
            of X and chunks of Y respectively; they differ on which loop runs
            in parallel via the Cython `prange` construct:

            - 'parallel_on_X' dispatches chunks of X uniformly on threads;
              each thread then iterates on all the chunks of Y. This is
              embarrassingly parallel and needs no datastructure
              synchronisation.

            - 'parallel_on_Y' dispatches chunks of Y uniformly on threads;
              each thread processes all the chunks of X in turn, with
              intermediate datastructure synchronisation at each iteration
              of the sequential outer loop on X chunks.

            - 'auto' relies on a simple heuristic: when `X.shape[0]` is large
              enough, 'parallel_on_X' is usually the most efficient strategy;
              when `X.shape[0]` is small but `Y.shape[0]` is large,
              'parallel_on_Y' brings more opportunity for parallelism and is
              therefore more efficient despite the synchronization step at
              each iteration of the outer loop on chunks of `X`.

            - None (default) looks-up in scikit-learn configuration for
              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not
              set.

        Returns
        -------
        probabilities : ndarray of shape (n_samples_X, n_classes)
            An array containing the class probabilities for each sample.

        Notes
        -----
        This classmethod is responsible for introspecting the arguments
        values to dispatch to the most appropriate implementation of
        :class:`ArgKminClassMode`.

        This allows decoupling the API entirely from the implementation details
        whilst maintaining RAII: all temporarily allocated datastructures necessary
        for the concrete implementation are therefore freed when this classmethod
        returns.
        """
        if weights not in {"uniform", "distance"}:
            raise ValueError(
                "Only the 'uniform' or 'distance' weights options are supported"
                f" at this time. Got: {weights=}."
            )
        if X.dtype == Y.dtype == np.float64:
            return ArgKminClassMode64.compute(
                X=X,
                Y=Y,
                k=k,
                weights=weights,
                # The Cython implementations expect intp label arrays.
                Y_labels=np.array(Y_labels, dtype=np.intp),
                unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
                metric=metric,
                chunk_size=chunk_size,
                metric_kwargs=metric_kwargs,
                strategy=strategy,
            )

        if X.dtype == Y.dtype == np.float32:
            return ArgKminClassMode32.compute(
                X=X,
                Y=Y,
                k=k,
                weights=weights,
                Y_labels=np.array(Y_labels, dtype=np.intp),
                unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
                metric=metric,
                chunk_size=chunk_size,
                metric_kwargs=metric_kwargs,
                strategy=strategy,
            )

        raise ValueError(
            "Only float64 or float32 datasets pairs are supported at this time, "
            f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
        )
|
||||
|
||||
|
||||
class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher):
    """Compute radius-based class modes of row vectors of X using the
    those of Y.

    For each row-vector X[i] of the queries X, find all the indices j of
    row-vectors in Y such that:

      dist(X[i], Y[j]) <= radius

    RadiusNeighborsClassMode is typically used to perform bruteforce radius
    neighbors queries when the weighted mode of the labels for the nearest
    neighbors within the specified radius are required, such as in `predict`
    methods.

    This class is not meant to be instantiated; only its :meth:`compute`
    classmethod should be used, which handles allocation and deallocation
    consistently.
    """

    @classmethod
    def valid_metrics(cls) -> List[str]:
        # Euclidean is technically usable for RadiusNeighborsClassMode
        # but it would not be competitive.
        # TODO: implement Euclidean specialization using GEMM.
        not_competitive = ("euclidean", "sqeuclidean")
        return sorted(
            name
            for name in BaseDistancesReductionDispatcher.valid_metrics()
            if name not in not_competitive
        )

    @classmethod
    def compute(
        cls,
        X,
        Y,
        radius,
        weights,
        Y_labels,
        unique_Y_labels,
        outlier_label,
        metric="euclidean",
        chunk_size=None,
        metric_kwargs=None,
        strategy=None,
    ):
        """Return the results of the reduction for the given arguments.

        Parameters
        ----------
        X : ndarray of shape (n_samples_X, n_features)
            The input array to be labelled.
        Y : ndarray of shape (n_samples_Y, n_features)
            The input array whose class membership is provided through
            the `Y_labels` parameter.
        radius : float
            The radius defining the neighborhood.
        weights : ndarray
            The weights applied to the `Y_labels` when computing the
            weighted mode of the labels.
        Y_labels : ndarray
            An array containing the index of the class membership of the
            associated samples in `Y`. This is used in labeling `X`.
        unique_Y_labels : ndarray
            An array containing all unique class labels.
        outlier_label : int, default=None
            Label for outlier samples (samples with no neighbors in given
            radius). In the default case when the value is None if any
            outlier is detected, a ValueError will be raised. The outlier
            label should be selected from among the unique 'Y' labels. If
            it is specified with a different value a warning will be raised
            and all class probabilities of outliers will be assigned to be 0.
        metric : str, default='euclidean'
            The distance metric to use. For a list of available metrics, see
            the documentation of :class:`~sklearn.metrics.DistanceMetric`.
            Currently does not support `'precomputed'`.
        chunk_size : int, default=None
            The number of vectors per chunk. If None (default) looks-up in
            scikit-learn configuration for `pairwise_dist_chunk_size`,
            and use 256 if it is not set.
        metric_kwargs : dict, default=None
            Keyword arguments to pass to specified metric function.
        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
            The chunking strategy defining which dataset parallelization are
            made on. Both strategies compute with two nested loops, on chunks
            of X and chunks of Y respectively; they differ on which loop runs
            in parallel via the Cython `prange` construct:
            - 'parallel_on_X' dispatches chunks of X uniformly on threads;
              each thread then iterates on all the chunks of Y. This is
              embarrassingly parallel and needs no datastructure
              synchronisation.
            - 'parallel_on_Y' dispatches chunks of Y uniformly on threads;
              each thread processes all the chunks of X in turn, with
              intermediate datastructure synchronisation at each iteration
              of the sequential outer loop on X chunks.
            - 'auto' relies on a simple heuristic: when `X.shape[0]` is large
              enough, 'parallel_on_X' is usually the most efficient strategy;
              when `X.shape[0]` is small but `Y.shape[0]` is large,
              'parallel_on_Y' brings more opportunity for parallelism and is
              therefore more efficient despite the synchronization step at
              each iteration of the outer loop on chunks of `X`.
            - None (default) looks-up in scikit-learn configuration for
              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not
              set.

        Returns
        -------
        probabilities : ndarray of shape (n_samples_X, n_classes)
            An array containing the class probabilities for each sample.
        """
        if weights not in {"uniform", "distance"}:
            raise ValueError(
                "Only the 'uniform' or 'distance' weights options are supported"
                f" at this time. Got: {weights=}."
            )

        # Select the dtype-specialized backend; both operands must share the
        # same floating point dtype.
        if X.dtype != Y.dtype:
            implementation = None
        elif X.dtype == np.float64:
            implementation = RadiusNeighborsClassMode64
        elif X.dtype == np.float32:
            implementation = RadiusNeighborsClassMode32
        else:
            implementation = None

        if implementation is None:
            raise ValueError(
                "Only float64 or float32 datasets pairs are supported at this time, "
                f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
            )

        return implementation.compute(
            X=X,
            Y=Y,
            radius=radius,
            weights=weights,
            # The Cython implementations expect intp label arrays.
            Y_labels=np.array(Y_labels, dtype=np.intp),
            unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
            outlier_label=outlier_label,
            metric=metric,
            chunk_size=chunk_size,
            metric_kwargs=metric_kwargs,
            strategy=strategy,
        )
|
||||
Binary file not shown.
@@ -0,0 +1,228 @@
|
||||
{{py:
|
||||
|
||||
implementation_specific_values = [
|
||||
# Values are the following ones:
|
||||
#
|
||||
# name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE
|
||||
#
|
||||
# We also use the float64 dtype and C-type names as defined in
|
||||
# `sklearn.utils._typedefs` to maintain consistency.
|
||||
#
|
||||
('64', False, 'float64_t', 'np.float64'),
|
||||
('32', True, 'float32_t', 'np.float32')
|
||||
]
|
||||
|
||||
}}
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t
|
||||
|
||||
|
||||
# Declaration of the sparse-sparse "middle term" kernel: for the CSR row
# ranges X[X_start:X_end] and Y[Y_start:Y_end], it writes the dot-product
# middle-term contributions into the caller-provided buffer D.
# (Implementation lives in the matching .pyx file.)
cdef void _middle_term_sparse_sparse_64(
    const float64_t[:] X_data,
    const int32_t[:] X_indices,
    const int32_t[:] X_indptr,
    intp_t X_start,
    intp_t X_end,
    const float64_t[:] Y_data,
    const int32_t[:] Y_indices,
    const int32_t[:] Y_indptr,
    intp_t Y_start,
    intp_t Y_end,
    float64_t * D,
) noexcept nogil
|
||||
|
||||
|
||||
{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
|
||||
|
||||
|
||||
cdef class MiddleTermComputer{{name_suffix}}:
    # Base class declaration for helpers computing the "middle term" of
    # chunked (squared) euclidean distance decompositions. Subclasses
    # specialize on the dense/sparse layout of X and Y.
    cdef:
        intp_t effective_n_threads
        intp_t chunks_n_threads
        intp_t dist_middle_terms_chunks_size
        intp_t n_features
        intp_t chunk_size

        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
        vector[vector[float64_t]] dist_middle_terms_chunks

    # Hooks mirroring the parallel_on_X / parallel_on_Y chunking protocols:
    # per-thread initialization and per-(X chunk, Y chunk) computation.
    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil

    cdef void _parallel_on_X_parallel_init(self, intp_t thread_num) noexcept nogil

    cdef void _parallel_on_X_init_chunk(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil

    cdef void _parallel_on_Y_init(self) noexcept nogil

    cdef void _parallel_on_Y_parallel_init(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil

    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num
    ) noexcept nogil

    # Returns a pointer to the middle-term buffer for the given chunk pair.
    cdef float64_t * _compute_dist_middle_terms(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil
|
||||
|
||||
|
||||
cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
    # Specialization for two dense, C-contiguous operands.
    cdef:
        const {{INPUT_DTYPE_t}}[:, ::1] X
        const {{INPUT_DTYPE_t}}[:, ::1] Y

    {{if upcast_to_float64}}
    # Buffers for upcasting chunks of X and Y from 32bit to 64bit
    vector[vector[float64_t]] X_c_upcast
    vector[vector[float64_t]] Y_c_upcast
    {{endif}}

    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil

    cdef void _parallel_on_X_init_chunk(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil

    cdef void _parallel_on_Y_parallel_init(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil

    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num
    ) noexcept nogil

    cdef float64_t * _compute_dist_middle_terms(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil
|
||||
|
||||
|
||||
cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
|
||||
cdef:
|
||||
const float64_t[:] X_data
|
||||
const int32_t[:] X_indices
|
||||
const int32_t[:] X_indptr
|
||||
|
||||
const float64_t[:] Y_data
|
||||
const int32_t[:] Y_indices
|
||||
const int32_t[:] Y_indptr
|
||||
|
||||
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
|
||||
self,
|
||||
intp_t X_start,
|
||||
intp_t X_end,
|
||||
intp_t Y_start,
|
||||
intp_t Y_end,
|
||||
intp_t thread_num
|
||||
) noexcept nogil
|
||||
|
||||
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
|
||||
self,
|
||||
intp_t X_start,
|
||||
intp_t X_end,
|
||||
intp_t Y_start,
|
||||
intp_t Y_end,
|
||||
intp_t thread_num
|
||||
) noexcept nogil
|
||||
|
||||
cdef float64_t * _compute_dist_middle_terms(
|
||||
self,
|
||||
intp_t X_start,
|
||||
intp_t X_end,
|
||||
intp_t Y_start,
|
||||
intp_t Y_end,
|
||||
intp_t thread_num,
|
||||
) noexcept nogil
|
||||
|
||||
|
||||
cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
|
||||
cdef:
|
||||
const float64_t[:] X_data
|
||||
const int32_t[:] X_indices
|
||||
const int32_t[:] X_indptr
|
||||
|
||||
const {{INPUT_DTYPE_t}}[:, ::1] Y
|
||||
|
||||
# We treat the dense-sparse case with the sparse-dense case by simply
|
||||
# treating the dist_middle_terms as F-ordered and by swapping arguments.
|
||||
# This attribute is meant to encode the case and adapt the logic
|
||||
# accordingly.
|
||||
bint c_ordered_middle_term
|
||||
|
||||
cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
|
||||
self,
|
||||
intp_t X_start,
|
||||
intp_t X_end,
|
||||
intp_t Y_start,
|
||||
intp_t Y_end,
|
||||
intp_t thread_num
|
||||
) noexcept nogil
|
||||
|
||||
cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
|
||||
self,
|
||||
intp_t X_start,
|
||||
intp_t X_end,
|
||||
intp_t Y_start,
|
||||
intp_t Y_end,
|
||||
intp_t thread_num
|
||||
) noexcept nogil
|
||||
|
||||
cdef float64_t * _compute_dist_middle_terms(
|
||||
self,
|
||||
intp_t X_start,
|
||||
intp_t X_end,
|
||||
intp_t Y_start,
|
||||
intp_t Y_end,
|
||||
intp_t thread_num,
|
||||
) noexcept nogil
|
||||
|
||||
{{endfor}}
|
||||
@@ -0,0 +1,633 @@
|
||||
{{py:
|
||||
|
||||
implementation_specific_values = [
|
||||
# Values are the following ones:
|
||||
#
|
||||
# name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE
|
||||
#
|
||||
# We also use the float64 dtype and C-type names as defined in
|
||||
# `sklearn.utils._typedefs` to maintain consistency.
|
||||
#
|
||||
('64', False, 'float64_t', 'np.float64'),
|
||||
('32', True, 'float32_t', 'np.float32')
|
||||
]
|
||||
|
||||
}}
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.algorithm cimport fill
|
||||
|
||||
from sklearn.utils._cython_blas cimport (
|
||||
BLAS_Order,
|
||||
BLAS_Trans,
|
||||
NoTrans,
|
||||
RowMajor,
|
||||
Trans,
|
||||
_gemm,
|
||||
)
|
||||
from sklearn.utils._typedefs cimport float64_t, float32_t, int32_t, intp_t
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse, csr_matrix
|
||||
|
||||
|
||||
cdef void _middle_term_sparse_sparse_64(
    const float64_t[:] X_data,
    const int32_t[:] X_indices,
    const int32_t[:] X_indptr,
    intp_t X_start,
    intp_t X_end,
    const float64_t[:] Y_data,
    const int32_t[:] Y_indices,
    const int32_t[:] Y_indptr,
    intp_t Y_start,
    intp_t Y_end,
    float64_t * D,
) noexcept nogil:
    # Accumulate the `-2 * X_c @ Y_c.T` middle term for a pair of CSR chunks.
    #
    # This routine assumes that D points to the first element of a
    # zeroed buffer of length at least equal to n_X × n_Y, conceptually
    # representing a 2-d C-ordered array.
    cdef:
        intp_t i, j, k
        intp_t n_X = X_end - X_start
        intp_t n_Y = Y_end - Y_start
        intp_t row_offset
        intp_t x_col, x_ptr, y_col, y_ptr

    for i in range(n_X):
        # Offset of row i within the flat, C-ordered output buffer.
        row_offset = i * n_Y
        for x_ptr in range(X_indptr[X_start + i], X_indptr[X_start + i + 1]):
            x_col = X_indices[x_ptr]
            for j in range(n_Y):
                k = row_offset + j
                for y_ptr in range(Y_indptr[Y_start + j], Y_indptr[Y_start + j + 1]):
                    y_col = Y_indices[y_ptr]
                    # Only matching column indices contribute to the dot product.
                    if x_col == y_col:
                        D[k] += -2 * X_data[x_ptr] * Y_data[y_ptr]
|
||||
|
||||
|
||||
{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
|
||||
|
||||
cdef void _middle_term_sparse_dense_{{name_suffix}}(
    const float64_t[:] X_data,
    const int32_t[:] X_indices,
    const int32_t[:] X_indptr,
    intp_t X_start,
    intp_t X_end,
    const {{INPUT_DTYPE_t}}[:, ::1] Y,
    intp_t Y_start,
    intp_t Y_end,
    bint c_ordered_middle_term,
    float64_t * dist_middle_terms,
) noexcept nogil:
    # Accumulate the `-2 * X_c @ Y_c.T` middle term for a CSR chunk of X
    # against a dense chunk of Y.
    #
    # This routine assumes that dist_middle_terms is a pointer to the first element
    # of a buffer filled with zeros of length at least equal to n_X × n_Y, conceptually
    # representing a 2-d C-ordered or F-ordered array.
    cdef:
        intp_t i, j, k
        intp_t n_X = X_end - X_start
        intp_t n_Y = Y_end - Y_start
        intp_t X_i_col_idx, X_i_ptr

    for i in range(n_X):
        for j in range(n_Y):
            # The buffer is C-ordered for the sparse-dense case and F-ordered
            # for the (swapped) dense-sparse case.
            k = i * n_Y + j if c_ordered_middle_term else j * n_X + i
            for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
                X_i_col_idx = X_indices[X_i_ptr]
                dist_middle_terms[k] += -2 * X_data[X_i_ptr] * Y[Y_start + j, X_i_col_idx]
|
||||
|
||||
|
||||
cdef class MiddleTermComputer{{name_suffix}}:
    """Helper class to compute a Euclidean distance matrix in chunks.

    This is an abstract base class that is further specialized depending
    on the type of data (dense or sparse).

    `EuclideanDistance` subclasses rely on the squared Euclidean
    distances between chunks of vectors X_c and Y_c using the
    following decomposition for the (i, j) pair:


         ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²


    This helper class is in charge of wrapping the common logic to compute
    the middle term, i.e. `- 2 X_c_i.Y_c_j^T`.
    """

    @classmethod
    def get_for(
        cls,
        X,
        Y,
        effective_n_threads,
        chunks_n_threads,
        dist_middle_terms_chunks_size,
        n_features,
        chunk_size,
    ) -> MiddleTermComputer{{name_suffix}}:
        """Return the MiddleTermComputer implementation for the given arguments.

        Parameters
        ----------
        X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features)
            Input data.
            If provided as an ndarray, it must be C-contiguous.

        Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features)
            Input data.
            If provided as an ndarray, it must be C-contiguous.

        Returns
        -------
        middle_term_computer: MiddleTermComputer{{name_suffix}}
            The suited MiddleTermComputer{{name_suffix}} implementation.
        """
        X_is_sparse = issparse(X)
        Y_is_sparse = issparse(Y)

        if not X_is_sparse and not Y_is_sparse:
            return DenseDenseMiddleTermComputer{{name_suffix}}(
                X,
                Y,
                effective_n_threads,
                chunks_n_threads,
                dist_middle_terms_chunks_size,
                n_features,
                chunk_size,
            )
        elif X_is_sparse and Y_is_sparse:
            return SparseSparseMiddleTermComputer{{name_suffix}}(
                X,
                Y,
                effective_n_threads,
                chunks_n_threads,
                dist_middle_terms_chunks_size,
                n_features,
                chunk_size,
            )
        elif X_is_sparse and not Y_is_sparse:
            return SparseDenseMiddleTermComputer{{name_suffix}}(
                X,
                Y,
                effective_n_threads,
                chunks_n_threads,
                dist_middle_terms_chunks_size,
                n_features,
                chunk_size,
                c_ordered_middle_term=True,
            )
        elif not X_is_sparse and Y_is_sparse:
            # NOTE: The Dense-Sparse case is implemented via the Sparse-Dense case.
            #
            # To do so:
            #    - X (dense) and Y (sparse) are swapped
            #    - the distance middle term is seen as F-ordered for consistency
            #      (c_ordered_middle_term = False)
            return SparseDenseMiddleTermComputer{{name_suffix}}(
                # Mind that X and Y are swapped here.
                Y,
                X,
                effective_n_threads,
                chunks_n_threads,
                dist_middle_terms_chunks_size,
                n_features,
                chunk_size,
                c_ordered_middle_term=False,
            )
        # Defensive: all four sparsity combinations are handled above.
        raise NotImplementedError(
            "X and Y must be CSR sparse matrices or numpy arrays."
        )

    @classmethod
    def unpack_csr_matrix(cls, X: csr_matrix):
        """Ensure that the CSR matrix is indexed with np.int32."""
        X_data = np.asarray(X.data, dtype=np.float64)
        X_indices = np.asarray(X.indices, dtype=np.int32)
        X_indptr = np.asarray(X.indptr, dtype=np.int32)
        return X_data, X_indices, X_indptr

    def __init__(
        self,
        intp_t effective_n_threads,
        intp_t chunks_n_threads,
        intp_t dist_middle_terms_chunks_size,
        intp_t n_features,
        intp_t chunk_size,
    ):
        self.effective_n_threads = effective_n_threads
        self.chunks_n_threads = chunks_n_threads
        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
        self.n_features = n_features
        self.chunk_size = chunk_size

        # One middle-term buffer per thread; sized lazily in the init hooks.
        self.dist_middle_terms_chunks = vector[vector[float64_t]](self.effective_n_threads)

    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        # No-op in the base class; overridden by subclasses when needed.
        return

    cdef void _parallel_on_X_parallel_init(self, intp_t thread_num) noexcept nogil:
        # Allocate this thread's middle-term buffer.
        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)

    cdef void _parallel_on_X_init_chunk(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        # No-op in the base class; overridden by subclasses when needed.
        return

    cdef void _parallel_on_Y_init(self) noexcept nogil:
        # Allocate one middle-term buffer per participating thread.
        for thread_num in range(self.chunks_n_threads):
            self.dist_middle_terms_chunks[thread_num].resize(
                self.dist_middle_terms_chunks_size
            )

    cdef void _parallel_on_Y_parallel_init(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        # No-op in the base class; overridden by subclasses when needed.
        return

    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        # No-op in the base class; overridden by subclasses when needed.
        return

    cdef float64_t * _compute_dist_middle_terms(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        # Abstract: concrete implementations return a pointer to the buffer
        # holding `-2 * X_c @ Y_c.T` for the given chunk pair.
        return NULL
|
||||
|
||||
|
||||
cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
    """Computes the middle term of the Euclidean distance between two chunked
    dense matrices X_c and Y_c.

        dist_middle_terms = - 2 X_c_i.Y_c_j^T

    This class uses the BLAS gemm routine to perform the dot product of each
    chunk of the distance matrix with improved arithmetic intensity and vector
    instructions (SIMD).
    """

    def __init__(
        self,
        const {{INPUT_DTYPE_t}}[:, ::1] X,
        const {{INPUT_DTYPE_t}}[:, ::1] Y,
        intp_t effective_n_threads,
        intp_t chunks_n_threads,
        intp_t dist_middle_terms_chunks_size,
        intp_t n_features,
        intp_t chunk_size,
    ):
        super().__init__(
            effective_n_threads,
            chunks_n_threads,
            dist_middle_terms_chunks_size,
            n_features,
            chunk_size,
        )
        self.X = X
        self.Y = Y

        {{if upcast_to_float64}}
        # We populate the buffer for upcasting chunks of X and Y from float32 to float64.
        self.X_c_upcast = vector[vector[float64_t]](self.effective_n_threads)
        self.Y_c_upcast = vector[vector[float64_t]](self.effective_n_threads)

        upcast_buffer_n_elements = self.chunk_size * n_features

        for thread_num in range(self.effective_n_threads):
            self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
            self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
        {{endif}}

    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        {{if upcast_to_float64}}
        cdef:
            intp_t i, j
            intp_t n_chunk_samples = Y_end - Y_start

        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
        for i in range(n_chunk_samples):
            for j in range(self.n_features):
                self.Y_c_upcast[thread_num][i * self.n_features + j] = <float64_t> self.Y[Y_start + i, j]
        {{else}}
        return
        {{endif}}

    cdef void _parallel_on_X_init_chunk(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        {{if upcast_to_float64}}
        cdef:
            intp_t i, j
            intp_t n_chunk_samples = X_end - X_start

        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
        for i in range(n_chunk_samples):
            for j in range(self.n_features):
                self.X_c_upcast[thread_num][i * self.n_features + j] = <float64_t> self.X[X_start + i, j]
        {{else}}
        return
        {{endif}}

    cdef void _parallel_on_Y_parallel_init(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        {{if upcast_to_float64}}
        cdef:
            intp_t i, j
            intp_t n_chunk_samples = X_end - X_start

        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
        for i in range(n_chunk_samples):
            for j in range(self.n_features):
                self.X_c_upcast[thread_num][i * self.n_features + j] = <float64_t> self.X[X_start + i, j]
        {{else}}
        return
        {{endif}}

    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        {{if upcast_to_float64}}
        cdef:
            intp_t i, j
            intp_t n_chunk_samples = Y_end - Y_start

        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
        for i in range(n_chunk_samples):
            for j in range(self.n_features):
                self.Y_c_upcast[thread_num][i * self.n_features + j] = <float64_t> self.Y[Y_start + i, j]
        {{else}}
        return
        {{endif}}

    cdef float64_t * _compute_dist_middle_terms(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        cdef:
            float64_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()

            # Careful: LDA, LDB and LDC are given for F-ordered arrays
            # in BLAS documentations, for instance:
            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
            #
            # Here, we use their counterpart values to work with C-ordered arrays.
            BLAS_Order order = RowMajor
            BLAS_Trans ta = NoTrans
            BLAS_Trans tb = Trans
            intp_t m = X_end - X_start
            intp_t n = Y_end - Y_start
            intp_t K = self.n_features
            float64_t alpha = - 2.
            {{if upcast_to_float64}}
            float64_t * A = self.X_c_upcast[thread_num].data()
            float64_t * B = self.Y_c_upcast[thread_num].data()
            {{else}}
            # Casting for A and B to remove the const is needed because APIs exposed via
            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
            # See: https://github.com/scipy/scipy/issues/14262
            float64_t * A = <float64_t *> &self.X[X_start, 0]
            float64_t * B = <float64_t *> &self.Y[Y_start, 0]
            {{endif}}
            intp_t lda = self.n_features
            intp_t ldb = self.n_features
            float64_t beta = 0.
            intp_t ldc = Y_end - Y_start

        # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T`
        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)

        return dist_middle_terms
|
||||
|
||||
|
||||
cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
    """Middle term of the Euclidean distance between two chunked CSR matrices.

    The result is returned as a contiguous array.

        dist_middle_terms = - 2 X_c_i.Y_c_j^T

    The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64.
    This routine iterates over the data, indices and indptr arrays of the sparse matrices
    without densifying them.
    """

    def __init__(
        self,
        X,
        Y,
        intp_t effective_n_threads,
        intp_t chunks_n_threads,
        intp_t dist_middle_terms_chunks_size,
        intp_t n_features,
        intp_t chunk_size,
    ):
        super().__init__(
            effective_n_threads,
            chunks_n_threads,
            dist_middle_terms_chunks_size,
            n_features,
            chunk_size,
        )
        # Keep the int32-indexed CSR components of both operands.
        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)

    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        # Flush the thread dist_middle_terms_chunks to 0.0 because
        # _middle_term_sparse_sparse_64 accumulates into the buffer.
        fill(
            self.dist_middle_terms_chunks[thread_num].begin(),
            self.dist_middle_terms_chunks[thread_num].end(),
            0.0,
        )

    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        # Flush the thread dist_middle_terms_chunks to 0.0 because
        # _middle_term_sparse_sparse_64 accumulates into the buffer.
        fill(
            self.dist_middle_terms_chunks[thread_num].begin(),
            self.dist_middle_terms_chunks[thread_num].end(),
            0.0,
        )

    cdef float64_t * _compute_dist_middle_terms(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        cdef:
            float64_t *dist_middle_terms = (
                self.dist_middle_terms_chunks[thread_num].data()
            )

        _middle_term_sparse_sparse_64(
            self.X_data,
            self.X_indices,
            self.X_indptr,
            X_start,
            X_end,
            self.Y_data,
            self.Y_indices,
            self.Y_indptr,
            Y_start,
            Y_end,
            dist_middle_terms,
        )

        return dist_middle_terms
|
||||
|
||||
cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
    """Middle term of the Euclidean distance between chunks of a CSR matrix and an np.ndarray.

    The logic of the computation is wrapped in the routine _middle_term_sparse_dense_{{name_suffix}}.
    This routine iterates over the data, indices and indptr arrays of the sparse matrices
    without densifying them.
    """

    def __init__(
        self,
        X,
        Y,
        intp_t effective_n_threads,
        intp_t chunks_n_threads,
        intp_t dist_middle_terms_chunks_size,
        intp_t n_features,
        intp_t chunk_size,
        bint c_ordered_middle_term,
    ):
        super().__init__(
            effective_n_threads,
            chunks_n_threads,
            dist_middle_terms_chunks_size,
            n_features,
            chunk_size,
        )
        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
        self.Y = Y
        self.c_ordered_middle_term = c_ordered_middle_term

    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        # Fill the thread's dist_middle_terms_chunks with 0.0 before
        # computing its elements in _compute_dist_middle_terms.
        fill(
            self.dist_middle_terms_chunks[thread_num].begin(),
            self.dist_middle_terms_chunks[thread_num].end(),
            0.0,
        )

    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        # Fill the thread's dist_middle_terms_chunks with 0.0 before
        # computing its elements in _compute_dist_middle_terms.
        fill(
            self.dist_middle_terms_chunks[thread_num].begin(),
            self.dist_middle_terms_chunks[thread_num].end(),
            0.0,
        )

    cdef float64_t * _compute_dist_middle_terms(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        cdef:
            float64_t *dist_middle_terms = (
                self.dist_middle_terms_chunks[thread_num].data()
            )

        # The dense-sparse case reuses the sparse-dense case with
        # dist_middle_terms seen as F-ordered, hence the chunk bounds
        # are swapped here.
        if not self.c_ordered_middle_term:
            X_start, Y_start = Y_start, X_start
            X_end, Y_end = Y_end, X_end

        _middle_term_sparse_dense_{{name_suffix}}(
            self.X_data,
            self.X_indices,
            self.X_indptr,
            X_start,
            X_end,
            self.Y,
            Y_start,
            Y_end,
            self.c_ordered_middle_term,
            dist_middle_terms,
        )

        return dist_middle_terms
|
||||
|
||||
{{endfor}}
|
||||
Binary file not shown.
@@ -0,0 +1,90 @@
|
||||
cimport numpy as cnp
|
||||
|
||||
from libcpp.memory cimport shared_ptr
|
||||
from libcpp.vector cimport vector
|
||||
from cython cimport final
|
||||
|
||||
from sklearn.utils._typedefs cimport intp_t, float64_t
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
######################
|
||||
## std::vector to np.ndarray coercion
|
||||
# As type covariance is not supported for C++ containers via Cython,
|
||||
# we need to redefine fused types.
|
||||
ctypedef fused vector_double_intp_t:
    vector[intp_t]
    vector[float64_t]


ctypedef fused vector_vector_double_intp_t:
    vector[vector[intp_t]]
    vector[vector[float64_t]]


cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
    shared_ptr[vector_vector_double_intp_t] vecs
)
|
||||
|
||||
#####################
|
||||
{{for name_suffix in ['64', '32']}}
|
||||
|
||||
from sklearn.metrics._pairwise_distances_reduction._base cimport BaseDistancesReduction{{name_suffix}}
|
||||
from sklearn.metrics._pairwise_distances_reduction._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
|
||||
|
||||
cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
    """float{{name_suffix}} implementation of the RadiusNeighbors."""

    cdef:
        float64_t radius

        # DistanceMetric{{name_suffix}} computes rank-preserving surrogate
        # distances via rdist, which are proxies necessitating fewer
        # computations. We get the equivalent for the radius to be able to
        # compare it against vectors' rank-preserving surrogate distances.
        float64_t r_radius

        # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays.
        #
        # For this implementation, we want resizable buffers which we will wrap
        # into numpy arrays at the end. std::vector comes as a handy container
        # for interacting efficiently with resizable buffers.
        #
        # Though it is possible to access their buffer address with
        # std::vector::data, they can't be stolen: buffers lifetime
        # is tied to their std::vector and are deallocated when
        # std::vectors are.
        #
        # To solve this, we dynamically allocate std::vectors and then
        # encapsulate them in a StdVectorSentinel responsible for
        # freeing them when the associated np.ndarray is freed.
        #
        # Shared pointers (defined via shared_ptr) are used for safer memory
        # management. Unique pointers (defined via unique_ptr) can't be used as
        # datastructures are shared across threads for parallel_on_X; see
        # _parallel_on_X_init_chunk.
        shared_ptr[vector[vector[intp_t]]] neigh_indices
        shared_ptr[vector[vector[float64_t]]] neigh_distances

        # Used as array of pointers to private datastructures used in threads.
        vector[shared_ptr[vector[vector[intp_t]]]] neigh_indices_chunks
        vector[shared_ptr[vector[vector[float64_t]]]] neigh_distances_chunks

        bint sort_results

    @final
    cdef void _merge_vectors(
        self,
        intp_t idx,
        intp_t num_threads,
    ) noexcept nogil
|
||||
|
||||
|
||||
cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}):
    """EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}."""
    cdef:
        MiddleTermComputer{{name_suffix}} middle_term_computer
        const float64_t[::1] X_norm_squared
        const float64_t[::1] Y_norm_squared

        bint use_squared_distances
|
||||
|
||||
{{endfor}}
|
||||
@@ -0,0 +1,514 @@
|
||||
cimport numpy as cnp
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
from libcpp.memory cimport shared_ptr, make_shared
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.algorithm cimport move
|
||||
from cython cimport final
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.parallel cimport parallel, prange
|
||||
|
||||
from sklearn.utils._sorting cimport simultaneous_sort
|
||||
from sklearn.utils._typedefs cimport intp_t, float64_t
|
||||
from sklearn.utils._vector_sentinel cimport vector_to_nd_array
|
||||
|
||||
from numbers import Real
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.utils import check_array, check_scalar
|
||||
from sklearn.utils.fixes import _in_unstable_openblas_configuration
|
||||
from sklearn.utils.parallel import _get_threadpool_controller
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
######################
|
||||
|
||||
cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
    shared_ptr[vector_vector_double_intp_t] vecs
):
    """Coerce a std::vector of std::vector to an ndarray of ndarrays."""
    cdef:
        # Explicitly type the loop index so the loop stays a C loop
        # (the original left `i` untyped).
        intp_t i
        intp_t n = deref(vecs).size()
        cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray)

    for i in range(n):
        # Each inner std::vector is wrapped (not copied) into an ndarray whose
        # lifetime is managed by a sentinel; see vector_to_nd_array.
        nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i]))

    return nd_arrays_of_nd_arrays
|
||||
|
||||
#####################
|
||||
{{for name_suffix in ['64', '32']}}
|
||||
|
||||
from sklearn.metrics._pairwise_distances_reduction._base cimport (
|
||||
BaseDistancesReduction{{name_suffix}},
|
||||
_sqeuclidean_row_norms{{name_suffix}}
|
||||
)
|
||||
|
||||
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
|
||||
|
||||
from sklearn.metrics._pairwise_distances_reduction._middle_term_computer cimport MiddleTermComputer{{name_suffix}}
|
||||
|
||||
|
||||
cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
|
||||
"""float{{name_suffix}} implementation of the RadiusNeighbors."""
|
||||
|
||||
@classmethod
|
||||
def compute(
|
||||
cls,
|
||||
X,
|
||||
Y,
|
||||
float64_t radius,
|
||||
str metric="euclidean",
|
||||
chunk_size=None,
|
||||
dict metric_kwargs=None,
|
||||
str strategy=None,
|
||||
bint return_distance=False,
|
||||
bint sort_results=False,
|
||||
):
|
||||
"""Compute the radius-neighbors reduction.
|
||||
|
||||
This classmethod is responsible for introspecting the arguments
|
||||
values to dispatch to the most appropriate implementation of
|
||||
:class:`RadiusNeighbors{{name_suffix}}`.
|
||||
|
||||
This allows decoupling the API entirely from the implementation details
|
||||
whilst maintaining RAII: all temporarily allocated datastructures necessary
|
||||
for the concrete implementation are therefore freed when this classmethod
|
||||
returns.
|
||||
|
||||
No instance should directly be created outside of this class method.
|
||||
"""
|
||||
if metric in ("euclidean", "sqeuclidean"):
|
||||
# Specialized implementation of RadiusNeighbors for the Euclidean
|
||||
# distance for the dense-dense and sparse-sparse cases.
|
||||
# This implementation computes the distances by chunk using
|
||||
# a decomposition of the Squared Euclidean distance.
|
||||
# This specialisation has an improved arithmetic intensity for both
|
||||
# the dense and sparse settings, allowing in most case speed-ups of
|
||||
# several orders of magnitude compared to the generic RadiusNeighbors
|
||||
# implementation.
|
||||
# For more information see MiddleTermComputer.
|
||||
use_squared_distances = metric == "sqeuclidean"
|
||||
pda = EuclideanRadiusNeighbors{{name_suffix}}(
|
||||
X=X, Y=Y, radius=radius,
|
||||
use_squared_distances=use_squared_distances,
|
||||
chunk_size=chunk_size,
|
||||
strategy=strategy,
|
||||
sort_results=sort_results,
|
||||
metric_kwargs=metric_kwargs,
|
||||
)
|
||||
else:
|
||||
# Fall back on a generic implementation that handles most scipy
|
||||
# metrics by computing the distances between 2 vectors at a time.
|
||||
pda = RadiusNeighbors{{name_suffix}}(
|
||||
datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
|
||||
radius=radius,
|
||||
chunk_size=chunk_size,
|
||||
strategy=strategy,
|
||||
sort_results=sort_results,
|
||||
)
|
||||
|
||||
# Limit the number of threads in second level of nested parallelism for BLAS
|
||||
# to avoid threads over-subscription (in GEMM for instance).
|
||||
with _get_threadpool_controller().limit(limits=1, user_api="blas"):
|
||||
if pda.execute_in_parallel_on_Y:
|
||||
pda._parallel_on_Y()
|
||||
else:
|
||||
pda._parallel_on_X()
|
||||
|
||||
return pda._finalize_results(return_distance)
|
||||
|
||||
|
||||
def __init__(
    self,
    DatasetsPair{{name_suffix}} datasets_pair,
    float64_t radius,
    chunk_size=None,
    strategy=None,
    sort_results=False,
):
    super().__init__(
        datasets_pair=datasets_pair,
        chunk_size=chunk_size,
        strategy=strategy,
    )

    # Validate the radius: it must be a non-negative real scalar.
    self.radius = check_scalar(radius, "radius", Real, min_val=0)
    # Pre-convert the radius to the metric's rank-preserving ("reduced")
    # distance so that chunk kernels can compare surrogate distances
    # directly against it without converting each one back.
    self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius)
    self.sort_results = sort_results

    # Allocating pointers to datastructures but not the datastructures themselves.
    # There are as many pointers as effective threads.
    #
    # For the sake of explicitness:
    #   - when parallelizing on X, the pointers of those heaps are referencing
    #     self.neigh_distances and self.neigh_indices
    #   - when parallelizing on Y, the pointers of those heaps are referencing
    #     std::vectors of std::vectors which are thread-wise-allocated and whose
    #     content will be merged into self.neigh_distances and self.neigh_indices.
    self.neigh_distances_chunks = vector[shared_ptr[vector[vector[float64_t]]]](
        self.chunks_n_threads
    )
    self.neigh_indices_chunks = vector[shared_ptr[vector[vector[intp_t]]]](
        self.chunks_n_threads
    )

    # Temporary datastructures which will be coerced to numpy arrays before
    # RadiusNeighbors.compute returns, and then freed (shared_ptr-managed).
    self.neigh_distances = make_shared[vector[vector[float64_t]]](self.n_samples_X)
    self.neigh_indices = make_shared[vector[vector[intp_t]]](self.n_samples_X)
|
||||
|
||||
cdef void _compute_and_reduce_distances_on_chunks(
    self,
    intp_t X_start,
    intp_t X_end,
    intp_t Y_start,
    intp_t Y_end,
    intp_t thread_num,
) noexcept nogil:
    # Push every Y index in [Y_start, Y_end) whose surrogate distance to a
    # given X index in [X_start, X_end) is within the reduced radius into
    # this thread's per-sample result vectors.
    cdef:
        intp_t i, j
        float64_t r_dist_i_j

    for i in range(X_start, X_end):
        for j in range(Y_start, Y_end):
            # Rank-preserving surrogate distance: cheaper than the exact
            # distance but ordered identically, so comparing it against
            # self.r_radius (the radius mapped by _dist_to_rdist) is valid.
            r_dist_i_j = self.datasets_pair.surrogate_dist(i, j)
            if r_dist_i_j <= self.r_radius:
                deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j)
                deref(self.neigh_indices_chunks[thread_num])[i].push_back(j)
|
||||
|
||||
def _finalize_results(self, bint return_distance=False):
    """Coerce the C++ result vectors to numpy object arrays.

    When ``return_distance`` is True, the surrogate distances accumulated
    during the reduction are first converted back to exact distances, and
    a ``(distances, indices)`` pair is returned; otherwise only the
    neighbor indices are returned.
    """
    if not return_distance:
        return coerce_vectors_to_nd_arrays(self.neigh_indices)

    # The reduction relied on surrogate distances; recompute the exact
    # distances before exposing them to the caller.
    self.compute_exact_distances()
    return (
        coerce_vectors_to_nd_arrays(self.neigh_distances),
        coerce_vectors_to_nd_arrays(self.neigh_indices),
    )
|
||||
|
||||
cdef void _parallel_on_X_init_chunk(
    self,
    intp_t thread_num,
    intp_t X_start,
    intp_t X_end,
) noexcept nogil:
    # As this strategy is embarrassingly parallel (each thread owns a
    # disjoint slice of X), we can set the thread vectors' pointers
    # directly to the main vectors' — no per-thread copies are needed.
    self.neigh_distances_chunks[thread_num] = self.neigh_distances
    self.neigh_indices_chunks[thread_num] = self.neigh_indices
|
||||
|
||||
@final
cdef void _parallel_on_X_prange_iter_finalize(
    self,
    intp_t thread_num,
    intp_t X_start,
    intp_t X_end,
) noexcept nogil:
    cdef:
        intp_t idx

    # Sorting neighbors for each query vector of X, in ascending order of
    # distance, keeping each index aligned with its distance.
    if self.sort_results:
        for idx in range(X_start, X_end):
            simultaneous_sort(
                deref(self.neigh_distances)[idx].data(),
                deref(self.neigh_indices)[idx].data(),
                deref(self.neigh_indices)[idx].size()
            )
|
||||
|
||||
cdef void _parallel_on_Y_init(
    self,
) noexcept nogil:
    cdef:
        intp_t thread_num
    # As chunks of X are shared across threads, so must datastructures to avoid race
    # conditions: each thread has its own vectors of n_samples_X vectors which are
    # then merged back in the main n_samples_X vectors (see _merge_vectors).
    for thread_num in range(self.chunks_n_threads):
        self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[float64_t]]](self.n_samples_X)
        self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[intp_t]]](self.n_samples_X)
|
||||
|
||||
@final
cdef void _merge_vectors(
    self,
    intp_t idx,
    intp_t num_threads,
) noexcept nogil:
    # Merge the per-thread neighbor vectors for sample `idx` into the
    # main self.neigh_distances / self.neigh_indices vectors.
    cdef:
        intp_t thread_num
        intp_t idx_n_elements = 0
        # Insertion cursor: start appending after any elements already
        # present in the main vector for this sample.
        intp_t last_element_idx = deref(self.neigh_indices)[idx].size()

    # Resizing buffers only once for the given total number of elements.
    for thread_num in range(num_threads):
        idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size()

    deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements)
    deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements)

    # Moving the elements by range using the range first element
    # as the reference for the insertion. std::move leaves the source
    # vectors in a valid but unspecified state; they are not read again.
    for thread_num in range(num_threads):
        move(
            deref(self.neigh_distances_chunks[thread_num])[idx].begin(),
            deref(self.neigh_distances_chunks[thread_num])[idx].end(),
            deref(self.neigh_distances)[idx].begin() + last_element_idx
        )
        move(
            deref(self.neigh_indices_chunks[thread_num])[idx].begin(),
            deref(self.neigh_indices_chunks[thread_num])[idx].end(),
            deref(self.neigh_indices)[idx].begin() + last_element_idx
        )
        # Advance the cursor past the range just moved in.
        last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size()
|
||||
|
||||
cdef void _parallel_on_Y_finalize(
    self,
) noexcept nogil:
    cdef:
        intp_t idx

    with nogil, parallel(num_threads=self.effective_n_threads):
        # Merge vectors used in threads into the main ones.
        # This is done in parallel sample-wise (no need for locks:
        # each `idx` is owned by exactly one prange iteration).
        for idx in prange(self.n_samples_X, schedule='static'):
            self._merge_vectors(idx, self.chunks_n_threads)

        # The content of the vectors has been std::moved.
        # Hence they can't be used anymore and can be deleted.
        # Their deletion is carried out automatically as the
        # implementation relies on shared pointers.

        # Sort in parallel in ascending order w.r.t the distances if requested.
        if self.sort_results:
            for idx in prange(self.n_samples_X, schedule='static'):
                simultaneous_sort(
                    deref(self.neigh_distances)[idx].data(),
                    deref(self.neigh_indices)[idx].data(),
                    deref(self.neigh_indices)[idx].size()
                )

    return
|
||||
|
||||
cdef void compute_exact_distances(self) noexcept nogil:
    """Convert rank-preserving distances to pairwise distances in parallel."""
    cdef:
        intp_t i
        vector[intp_t].size_type j

    for i in prange(self.n_samples_X, nogil=True, schedule='static',
                    num_threads=self.effective_n_threads):
        for j in range(deref(self.neigh_indices)[i].size()):
            # In-place conversion: each stored surrogate distance is
            # mapped back to the true distance via _rdist_to_dist.
            deref(self.neigh_distances)[i][j] = (
                self.datasets_pair.distance_metric._rdist_to_dist(
                    # Guard against potential -0., causing nan production.
                    max(deref(self.neigh_distances)[i][j], 0.)
                )
            )
|
||||
|
||||
|
||||
cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}):
    """EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}.

    Chunk distances are obtained from the decomposition
    ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2, with the cross terms computed
    by self.middle_term_computer; the overridden parallel hooks below keep
    that computer's internal state in sync with the base class's chunking.
    """

    @classmethod
    def is_usable_for(cls, X, Y, metric) -> bool:
        # Usable whenever the generic implementation is, except under an
        # OpenBLAS configuration known to be unstable.
        return (RadiusNeighbors{{name_suffix}}.is_usable_for(X, Y, metric)
                and not _in_unstable_openblas_configuration())

    def __init__(
        self,
        X,
        Y,
        float64_t radius,
        bint use_squared_distances=False,
        chunk_size=None,
        strategy=None,
        sort_results=False,
        metric_kwargs=None,
    ):
        # Only the precomputed squared norms are meaningful here; warn on
        # any other metric_kwargs so they are not silently dropped.
        if (
            isinstance(metric_kwargs, dict) and
            (metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"})
        ):
            warnings.warn(
                f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
                f"usable for this case (EuclideanRadiusNeighbors64) and will be ignored.",
                UserWarning,
                stacklevel=3,
            )

        super().__init__(
            # The datasets pair here is used for exact distances computations
            datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric="euclidean"),
            radius=radius,
            chunk_size=chunk_size,
            strategy=strategy,
            sort_results=sort_results,
        )
        cdef:
            # One buffer entry per (X chunk row, Y chunk row) pair of the GEMM.
            intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk

        self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for(
            X,
            Y,
            self.effective_n_threads,
            self.chunks_n_threads,
            dist_middle_terms_chunks_size,
            n_features=X.shape[1],
            chunk_size=self.chunk_size,
        )

        # Reuse caller-provided squared norms when available, otherwise
        # compute them in parallel.
        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
            self.Y_norm_squared = check_array(
                metric_kwargs.pop("Y_norm_squared"),
                ensure_2d=False,
                input_name="Y_norm_squared",
                dtype=np.float64,
            )
        else:
            self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}(
                Y,
                self.effective_n_threads,
            )

        if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
            self.X_norm_squared = check_array(
                metric_kwargs.pop("X_norm_squared"),
                ensure_2d=False,
                input_name="X_norm_squared",
                dtype=np.float64,
            )
        else:
            # Do not recompute norms if datasets are identical.
            self.X_norm_squared = (
                self.Y_norm_squared if X is Y else
                _sqeuclidean_row_norms{{name_suffix}}(
                    X,
                    self.effective_n_threads,
                )
            )

        self.use_squared_distances = use_squared_distances

        if use_squared_distances:
            # In this specialisation and this setup, the value passed to the radius is
            # already considered to be the adapted radius, so we overwrite it.
            self.r_radius = radius

    @final
    cdef void _parallel_on_X_parallel_init(
        self,
        intp_t thread_num,
    ) noexcept nogil:
        # Delegate to the base class, then initialise the middle-term
        # computer's per-thread state (same pattern for all hooks below).
        RadiusNeighbors{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num)
        self.middle_term_computer._parallel_on_X_parallel_init(thread_num)

    @final
    cdef void _parallel_on_X_init_chunk(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        RadiusNeighbors{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
        self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)

    @final
    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        RadiusNeighbors{{name_suffix}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
            self,
            X_start, X_end,
            Y_start, Y_end,
            thread_num,
        )
        self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
            X_start, X_end, Y_start, Y_end, thread_num,
        )

    @final
    cdef void _parallel_on_Y_init(
        self,
    ) noexcept nogil:
        RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self)
        self.middle_term_computer._parallel_on_Y_init()

    @final
    cdef void _parallel_on_Y_parallel_init(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        RadiusNeighbors{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
        self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)

    @final
    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        RadiusNeighbors{{name_suffix}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
            self,
            X_start, X_end,
            Y_start, Y_end,
            thread_num,
        )
        self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
            X_start, X_end, Y_start, Y_end, thread_num
        )

    @final
    cdef void compute_exact_distances(self) noexcept nogil:
        # With "sqeuclidean" the stored squared distances are already the
        # requested exact distances, so no conversion is needed.
        if not self.use_squared_distances:
            RadiusNeighbors{{name_suffix}}.compute_exact_distances(self)

    @final
    cdef void _compute_and_reduce_distances_on_chunks(
        self,
        intp_t X_start,
        intp_t X_end,
        intp_t Y_start,
        intp_t Y_end,
        intp_t thread_num,
    ) noexcept nogil:
        cdef:
            intp_t i, j
            float64_t sqeuclidean_dist_i_j
            intp_t n_X = X_end - X_start
            intp_t n_Y = Y_end - Y_start
            # Cross terms -2 x.y for this chunk pair, laid out row-major
            # (n_X rows of n_Y values).
            float64_t *dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms(
                X_start, X_end, Y_start, Y_end, thread_num
            )

        # Pushing the distances and their associated indices in vectors.
        for i in range(n_X):
            for j in range(n_Y):
                # ||x||^2 - 2 x.y + ||y||^2
                sqeuclidean_dist_i_j = (
                    self.X_norm_squared[i + X_start]
                    + dist_middle_terms[i * n_Y + j]
                    + self.Y_norm_squared[j + Y_start]
                )

                # Catastrophic cancellation might cause -0. to be present,
                # e.g. when computing d(x_i, y_i) when X is Y.
                sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j)

                if sqeuclidean_dist_i_j <= self.r_radius:
                    deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(sqeuclidean_dist_i_j)
                    deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start)
|
||||
|
||||
{{endfor}}
|
||||
Binary file not shown.
@@ -0,0 +1,217 @@
|
||||
import warnings
|
||||
|
||||
from cython cimport floating, final, integral
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.parallel cimport parallel, prange
|
||||
from sklearn.metrics._pairwise_distances_reduction._classmode cimport WeightingStrategy
|
||||
from sklearn.utils._typedefs cimport intp_t, float64_t, uint8_t
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.utils.parallel import _get_threadpool_controller
|
||||
|
||||
|
||||
{{for name_suffix in ["32", "64"]}}
|
||||
from sklearn.metrics._pairwise_distances_reduction._radius_neighbors cimport RadiusNeighbors{{name_suffix}}
|
||||
from sklearn.metrics._pairwise_distances_reduction._datasets_pair cimport DatasetsPair{{name_suffix}}
|
||||
|
||||
cdef class RadiusNeighborsClassMode{{name_suffix}}(RadiusNeighbors{{name_suffix}}):
    """
    {{name_suffix}}bit implementation of RadiusNeighborsClassMode.

    On top of the radius-neighbors reduction, accumulates per-sample,
    per-class (optionally distance-weighted) scores and returns them as
    normalized class probabilities.
    """
    cdef:
        const intp_t[::1] Y_labels          # class label (as index) of each Y sample
        const intp_t[::1] unique_Y_labels   # sorted set of known labels
        intp_t outlier_label_index          # column of outlier_label in class_scores, -1 if absent
        bint outlier_label_exists
        bint outliers_exist                 # True once any query sample has no neighbor
        uint8_t[::1] outliers               # per-query-sample "has no neighbor" flags
        object outlier_label                # label assigned to neighborless samples (may be None)
        float64_t[:, ::1] class_scores      # (n_samples_X, n_classes) score accumulator
        WeightingStrategy weight_type

    @classmethod
    def compute(
        cls,
        X,
        Y,
        float64_t radius,
        weights,
        Y_labels,
        unique_Y_labels,
        outlier_label=None,
        str metric="euclidean",
        chunk_size=None,
        dict metric_kwargs=None,
        str strategy=None,
    ):
        # Use a generic implementation that handles most scipy
        # metrics by computing the distances between 2 vectors at a time.
        pda = RadiusNeighborsClassMode{{name_suffix}}(
            datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs),
            radius=radius,
            chunk_size=chunk_size,
            strategy=strategy,
            weights=weights,
            Y_labels=Y_labels,
            unique_Y_labels=unique_Y_labels,
            outlier_label=outlier_label,
        )

        # Limit the number of threads in second level of nested parallelism for BLAS
        # to avoid threads over-subscription (in GEMM for instance).
        with _get_threadpool_controller().limit(limits=1, user_api="blas"):
            if pda.execute_in_parallel_on_Y:
                pda._parallel_on_Y()
            else:
                pda._parallel_on_X()

        return pda._finalize_results()

    def __init__(
        self,
        DatasetsPair{{name_suffix}} datasets_pair,
        const intp_t[::1] Y_labels,
        const intp_t[::1] unique_Y_labels,
        float64_t radius,
        chunk_size=None,
        strategy=None,
        weights=None,
        outlier_label=None,
    ):
        super().__init__(
            datasets_pair=datasets_pair,
            chunk_size=chunk_size,
            strategy=strategy,
            radius=radius,
        )

        # Anything that is neither "uniform" nor "distance" (e.g. a user
        # callable) is treated as the callable strategy.
        if weights == "uniform":
            self.weight_type = WeightingStrategy.uniform
        elif weights == "distance":
            self.weight_type = WeightingStrategy.distance
        else:
            self.weight_type = WeightingStrategy.callable

        self.Y_labels = Y_labels
        self.unique_Y_labels = unique_Y_labels
        self.outlier_label_index = -1
        self.outliers_exist = False
        self.outlier_label = outlier_label
        self.outliers = np.zeros(self.n_samples_X, dtype=np.bool_)

        cdef intp_t idx
        # Locate the outlier label's column in class_scores, if it is one
        # of the known classes.
        if self.outlier_label is not None:
            for idx in range(self.unique_Y_labels.shape[0]):
                if self.unique_Y_labels[idx] == outlier_label:
                    self.outlier_label_index = idx

        # Map from set of unique labels to their indices in `class_scores`
        # Buffer used in building a histogram for one-pass weighted mode
        self.class_scores = np.zeros(
            (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64,
        )

    cdef inline void weighted_histogram_mode(
        self,
        intp_t sample_index,
        intp_t sample_n_neighbors,
        intp_t* indices,
        float64_t* distances,
    ) noexcept nogil:
        # Accumulate (optionally 1/distance-weighted) votes of the given
        # neighbors into class_scores[sample_index].
        cdef:
            intp_t neighbor_idx, neighbor_class_idx, label_index
            float64_t score_incr = 1
            bint use_distance_weighting = (
                self.weight_type == WeightingStrategy.distance
            )

        # No neighbor within the radius: mark the sample as an outlier and,
        # when the outlier label is a known class, give it the full score.
        if sample_n_neighbors == 0:
            self.outliers_exist = True
            self.outliers[sample_index] = True
            if self.outlier_label_index >= 0:
                self.class_scores[sample_index][self.outlier_label_index] = score_incr

            return

        # Iterate over the neighbors. This can be different for
        # each of the samples as they are based on the radius.
        for neighbor_rank in range(sample_n_neighbors):
            if use_distance_weighting:
                score_incr = 1 / distances[neighbor_rank]

            neighbor_idx = indices[neighbor_rank]
            neighbor_class_idx = self.Y_labels[neighbor_idx]
            self.class_scores[sample_index][neighbor_class_idx] += score_incr

        return

    @final
    cdef void _parallel_on_X_prange_iter_finalize(
        self,
        intp_t thread_num,
        intp_t X_start,
        intp_t X_end,
    ) noexcept nogil:
        cdef:
            intp_t idx

        # Each prange iteration owns a disjoint slice of X, so scores can
        # be accumulated without locks.
        for idx in range(X_start, X_end):
            self.weighted_histogram_mode(
                sample_index=idx,
                sample_n_neighbors=deref(self.neigh_indices)[idx].size(),
                indices=deref(self.neigh_indices)[idx].data(),
                distances=deref(self.neigh_distances)[idx].data(),
            )

        return

    @final
    cdef void _parallel_on_Y_finalize(
        self,
    ) noexcept nogil:
        cdef:
            intp_t idx

        with nogil, parallel(num_threads=self.effective_n_threads):
            # Merge vectors used in threads into the main ones.
            # This is done in parallel sample-wise (no need for locks).
            for idx in prange(self.n_samples_X, schedule='static'):
                self._merge_vectors(idx, self.chunks_n_threads)

            # Once merged, accumulate the class scores, again sample-wise.
            for idx in prange(self.n_samples_X, schedule='static'):
                self.weighted_histogram_mode(
                    sample_index=idx,
                    sample_n_neighbors=deref(self.neigh_indices)[idx].size(),
                    indices=deref(self.neigh_indices)[idx].data(),
                    distances=deref(self.neigh_distances)[idx].data(),
                )

        return

    def _finalize_results(self):
        # Neighborless samples are an error unless an outlier label was given.
        if self.outliers_exist and self.outlier_label is None:
            raise ValueError(
                "No neighbors found for test samples %r, "
                "you can try using larger radius, "
                "giving a label for outliers, "
                "or considering removing them from your dataset."
                % np.where(self.outliers)[0]
            )

        # Outlier label given but not among the training classes: those
        # samples end up with all-zero scores.
        if self.outliers_exist and self.outlier_label_index < 0:
            warnings.warn(
                "Outlier label %s is not in training "
                "classes. All class probabilities of "
                "outliers will be assigned with 0."
                % self.outlier_label
            )

        # Normalize scores row-wise into probabilities; rows summing to 0
        # (outliers with unknown label) are left as all zeros.
        probabilities = np.asarray(self.class_scores)
        normalizer = probabilities.sum(axis=1, keepdims=True)
        normalizer[normalizer == 0.0] = 1.0
        probabilities /= normalizer
        return probabilities
|
||||
|
||||
{{endfor}}
|
||||
@@ -0,0 +1,193 @@
|
||||
# Note: the dependencies between different Cython files in
|
||||
# _pairwise_distances_reduction is probably one of the most involved in
|
||||
# scikit-learn. If you change this file make sure you build from scratch:
|
||||
# rm -rf build; make dev-meson
|
||||
# run a command like this:
|
||||
# ninja -C build/cp312 -t missingdeps
|
||||
# and make sure that the output is something like:
|
||||
# No missing dependencies on generated files found.
|
||||
|
||||
# _pairwise_distances_reduction is cimported from other subpackages so this is
# needed for the cimport to work
_pairwise_distances_reduction_cython_tree = [
  fs.copyfile('__init__.py'),
  # We are in a sub-module of metrics, so we always need to have
  # sklearn/metrics/__init__.py copied to the build directory to avoid the
  # error:
  # relative cimport beyond main package is not allowed
  metrics_cython_tree
]

_classmode_pxd = fs.copyfile('_classmode.pxd')

# Each .pxd.tp/.pyx.tp pair below is first expanded by tempita, then the
# generated .pyx is cythonized to C++ and built as an extension module.
_datasets_pair_pxd = custom_target(
  '_datasets_pair_pxd',
  output: '_datasets_pair.pxd',
  input: '_datasets_pair.pxd.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_datasets_pair_pyx = custom_target(
  '_datasets_pair_pyx',
  output: '_datasets_pair.pyx',
  input: '_datasets_pair.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is
  # temporary work-around for dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: [_datasets_pair_pxd, _pairwise_distances_reduction_cython_tree, utils_cython_tree],
)
_datasets_pair = py.extension_module(
  '_datasets_pair',
  cython_gen_cpp.process(_datasets_pair_pyx),
  dependencies: [np_dep],
  subdir: 'sklearn/metrics/_pairwise_distances_reduction',
  install: true
)

_base_pxd = custom_target(
  '_base_pxd',
  output: '_base.pxd',
  input: '_base.pxd.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_base_pyx = custom_target(
  '_base_pyx',
  output: '_base.pyx',
  input: '_base.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is
  # temporary work-around for dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: [_base_pxd, _pairwise_distances_reduction_cython_tree,
            _datasets_pair_pxd, utils_cython_tree],
)
_base = py.extension_module(
  '_base',
  cython_gen_cpp.process(_base_pyx),
  dependencies: [np_dep, openmp_dep],
  subdir: 'sklearn/metrics/_pairwise_distances_reduction',
  install: true
)

_middle_term_computer_pxd = custom_target(
  '_middle_term_computer_pxd',
  output: '_middle_term_computer.pxd',
  input: '_middle_term_computer.pxd.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_middle_term_computer_pyx = custom_target(
  '_middle_term_computer_pyx',
  output: '_middle_term_computer.pyx',
  input: '_middle_term_computer.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is
  # temporary work-around for dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: [_middle_term_computer_pxd,
            _pairwise_distances_reduction_cython_tree,
            utils_cython_tree],
)
_middle_term_computer = py.extension_module(
  '_middle_term_computer',
  cython_gen_cpp.process(_middle_term_computer_pyx),
  dependencies: [np_dep],
  subdir: 'sklearn/metrics/_pairwise_distances_reduction',
  install: true
)

_argkmin_pxd = custom_target(
  '_argkmin_pxd',
  output: '_argkmin.pxd',
  input: '_argkmin.pxd.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_argkmin_pyx = custom_target(
  '_argkmin_pyx',
  output: '_argkmin.pyx',
  input: '_argkmin.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is
  # temporary work-around for dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: [_argkmin_pxd,
            _pairwise_distances_reduction_cython_tree,
            _datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd],
)
_argkmin = py.extension_module(
  '_argkmin',
  cython_gen_cpp.process(_argkmin_pyx),
  dependencies: [np_dep, openmp_dep],
  subdir: 'sklearn/metrics/_pairwise_distances_reduction',
  install: true
)

_radius_neighbors_pxd = custom_target(
  '_radius_neighbors_pxd',
  output: '_radius_neighbors.pxd',
  input: '_radius_neighbors.pxd.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@']
)
_radius_neighbors_pyx = custom_target(
  '_radius_neighbors_pyx',
  output: '_radius_neighbors.pyx',
  input: '_radius_neighbors.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is
  # temporary work-around for dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: [_radius_neighbors_pxd,
            _datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd,
            _pairwise_distances_reduction_cython_tree, utils_cython_tree],
)
_radius_neighbors = py.extension_module(
  '_radius_neighbors',
  cython_gen_cpp.process(_radius_neighbors_pyx),
  dependencies: [np_dep, openmp_dep],
  subdir: 'sklearn/metrics/_pairwise_distances_reduction',
  install: true
)

# The classmode variants only have .pyx.tp sources (no generated .pxd).
_argkmin_classmode_pyx = custom_target(
  '_argkmin_classmode_pyx',
  output: '_argkmin_classmode.pyx',
  input: '_argkmin_classmode.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is
  # temporary work-around for dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: [_classmode_pxd,
            _argkmin_pxd, _pairwise_distances_reduction_cython_tree,
            _datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd, utils_cython_tree],
)
_argkmin_classmode = py.extension_module(
  '_argkmin_classmode',
  cython_gen_cpp.process(_argkmin_classmode_pyx),
  dependencies: [np_dep, openmp_dep],
  # XXX: for some reason -fno-sized-deallocation is needed otherwise there is
  # an error with undefined symbol _ZdlPv at import time in manylinux wheels.
  # See https://github.com/scikit-learn/scikit-learn/issues/28596 for more details.
  cpp_args: ['-fno-sized-deallocation'],
  subdir: 'sklearn/metrics/_pairwise_distances_reduction',
  install: true
)

_radius_neighbors_classmode_pyx = custom_target(
  '_radius_neighbors_classmode_pyx',
  output: '_radius_neighbors_classmode.pyx',
  input: '_radius_neighbors_classmode.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is
  # temporary work-around for dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: [_classmode_pxd,
            _middle_term_computer_pxd, _radius_neighbors_pxd,
            _pairwise_distances_reduction_cython_tree,
            _datasets_pair_pxd, _base_pxd, utils_cython_tree],
)
_radius_neighbors_classmode = py.extension_module(
  '_radius_neighbors_classmode',
  cython_gen_cpp.process(_radius_neighbors_classmode_pyx),
  dependencies: [np_dep, openmp_dep],
  subdir: 'sklearn/metrics/_pairwise_distances_reduction',
  install: true
)
||||
)
|
||||
Binary file not shown.
@@ -0,0 +1,107 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from cython cimport floating
|
||||
from cython.parallel cimport prange
|
||||
from libc.math cimport fabs
|
||||
|
||||
from sklearn.utils._typedefs cimport intp_t
|
||||
|
||||
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
|
||||
|
||||
|
||||
def _chi2_kernel_fast(floating[:, :] X,
                      floating[:, :] Y,
                      floating[:, :] result):
    """Fill ``result`` with the (negated) additive chi-squared statistic.

    result[i, j] = -sum_k (X[i, k] - Y[j, k])**2 / (X[i, k] + Y[j, k]),
    where features with a zero denominator are skipped.

    ``result`` must be preallocated with shape (X.shape[0], Y.shape[0]);
    it is written in place and nothing is returned.
    """
    cdef intp_t i, j, k
    cdef intp_t n_samples_X = X.shape[0]
    cdef intp_t n_samples_Y = Y.shape[0]
    cdef intp_t n_features = X.shape[1]
    # Renamed from the previous `denom`/`nom`, which had the two roles
    # swapped: `diff` is the (signed) square root of each term's numerator
    # and `denom` its denominator.
    cdef double res, diff, denom

    with nogil:
        for i in range(n_samples_X):
            for j in range(n_samples_Y):
                res = 0
                for k in range(n_features):
                    diff = X[i, k] - Y[j, k]
                    denom = X[i, k] + Y[j, k]
                    # Skip to avoid 0/0 when both entries are zero.
                    if denom != 0:
                        res += diff * diff / denom
                result[i, j] = -res
|
||||
|
||||
|
||||
def _sparse_manhattan(
    const floating[::1] X_data,
    const int[:] X_indices,
    const int[:] X_indptr,
    const floating[::1] Y_data,
    const int[:] Y_indices,
    const int[:] Y_indptr,
    double[:, ::1] D,
):
    """Pairwise L1 distances for CSR matrices.

    The result is written into the preallocated output array ``D`` of
    shape ``(X.shape[0], Y.shape[0])``; nothing is returned.  The outer
    loop over rows of X is parallelized with OpenMP.

    Usage:
    >>> D = np.zeros((X.shape[0], Y.shape[0]))
    >>> _sparse_manhattan(X.data, X.indices, X.indptr,
    ...                   Y.data, Y.indices, Y.indptr,
    ...                   D)
    """
    cdef intp_t px, py, i, j, ix, iy
    cdef double d = 0.0

    # Output dimensions: number of rows in X and in Y respectively.
    cdef int m = D.shape[0]
    cdef int n = D.shape[1]

    cdef int X_indptr_end = 0
    cdef int Y_indptr_end = 0

    cdef int num_threads = _openmp_effective_n_threads()

    # We scan the matrices row by row.
    # Given row px in X and row py in Y, we find the positions (i and j
    # respectively), in .indices where the indices for the two rows start.
    # If the indices (ix and iy) are the same, the corresponding data values
    # are processed and the cursors i and j are advanced.
    # If not, the lowest index is considered. Its associated data value is
    # processed and its cursor is advanced.
    # We proceed like this until one of the cursors hits the end for its row.
    # Then we process all remaining data values in the other row.

    # Below the avoidance of inplace operators is intentional.
    # When prange is used, the inplace operator has a special meaning, i.e. it
    # signals a "reduction"

    for px in prange(m, nogil=True, num_threads=num_threads):
        X_indptr_end = X_indptr[px + 1]
        for py in range(n):
            Y_indptr_end = Y_indptr[py + 1]
            i = X_indptr[px]
            j = Y_indptr[py]
            d = 0.0
            # Merge-walk the two sorted column-index lists of the rows.
            while i < X_indptr_end and j < Y_indptr_end:
                ix = X_indices[i]
                iy = Y_indices[j]

                if ix == iy:
                    d = d + fabs(X_data[i] - Y_data[j])
                    i = i + 1
                    j = j + 1
                elif ix < iy:
                    # Column ix is implicitly zero in Y's row.
                    d = d + fabs(X_data[i])
                    i = i + 1
                else:
                    # Column iy is implicitly zero in X's row.
                    d = d + fabs(Y_data[j])
                    j = j + 1

            # Flush the tail of whichever row still has entries left.
            if i == X_indptr_end:
                while j < Y_indptr_end:
                    d = d + fabs(Y_data[j])
                    j = j + 1
            else:
                while i < X_indptr_end:
                    d = d + fabs(X_data[i])
                    i = i + 1

            D[px, py] = d
|
||||
@@ -0,0 +1,2 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,499 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.base import is_classifier
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.utils._optional_dependencies import check_matplotlib_support
|
||||
from sklearn.utils._plotting import _validate_style_kwargs
|
||||
from sklearn.utils.multiclass import unique_labels
|
||||
|
||||
|
||||
class ConfusionMatrixDisplay:
|
||||
"""Confusion Matrix visualization.
|
||||
|
||||
It is recommended to use
|
||||
:func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator` or
|
||||
:func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` to
|
||||
create a :class:`ConfusionMatrixDisplay`. All parameters are stored as
|
||||
attributes.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools, see
|
||||
the :ref:`Visualization Guide <visualizations>`.
|
||||
For guidance on interpreting these plots, refer to the
|
||||
:ref:`Model Evaluation Guide <confusion_matrix>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
confusion_matrix : ndarray of shape (n_classes, n_classes)
|
||||
Confusion matrix.
|
||||
|
||||
display_labels : ndarray of shape (n_classes,), default=None
|
||||
Display labels for plot. If None, display labels are set from 0 to
|
||||
`n_classes - 1`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
im_ : matplotlib AxesImage
|
||||
Image representing the confusion matrix.
|
||||
|
||||
text_ : ndarray of shape (n_classes, n_classes), dtype=matplotlib Text, \
|
||||
or None
|
||||
Array of matplotlib axes. `None` if `include_values` is false.
|
||||
|
||||
ax_ : matplotlib Axes
|
||||
Axes with confusion matrix.
|
||||
|
||||
figure_ : matplotlib Figure
|
||||
Figure containing the confusion matrix.
|
||||
|
||||
See Also
|
||||
--------
|
||||
confusion_matrix : Compute Confusion Matrix to evaluate the accuracy of a
|
||||
classification.
|
||||
ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix
|
||||
given an estimator, the data, and the label.
|
||||
ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix
|
||||
given the true and predicted labels.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> X, y = make_classification(random_state=0)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(X, y,
|
||||
... random_state=0)
|
||||
>>> clf = SVC(random_state=0)
|
||||
>>> clf.fit(X_train, y_train)
|
||||
SVC(random_state=0)
|
||||
>>> predictions = clf.predict(X_test)
|
||||
>>> cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
|
||||
>>> disp = ConfusionMatrixDisplay(confusion_matrix=cm,
|
||||
... display_labels=clf.classes_)
|
||||
>>> disp.plot()
|
||||
<...>
|
||||
>>> plt.show()
|
||||
"""
|
||||
|
||||
def __init__(self, confusion_matrix, *, display_labels=None):
|
||||
self.confusion_matrix = confusion_matrix
|
||||
self.display_labels = display_labels
|
||||
|
||||
def plot(
|
||||
self,
|
||||
*,
|
||||
include_values=True,
|
||||
cmap="viridis",
|
||||
xticks_rotation="horizontal",
|
||||
values_format=None,
|
||||
ax=None,
|
||||
colorbar=True,
|
||||
im_kw=None,
|
||||
text_kw=None,
|
||||
):
|
||||
"""Plot visualization.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
include_values : bool, default=True
|
||||
Includes values in confusion matrix.
|
||||
|
||||
cmap : str or matplotlib Colormap, default='viridis'
|
||||
Colormap recognized by matplotlib.
|
||||
|
||||
xticks_rotation : {'vertical', 'horizontal'} or float, \
|
||||
default='horizontal'
|
||||
Rotation of xtick labels.
|
||||
|
||||
values_format : str, default=None
|
||||
Format specification for values in confusion matrix. If `None`,
|
||||
the format specification is 'd' or '.2g' whichever is shorter.
|
||||
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
colorbar : bool, default=True
|
||||
Whether or not to add a colorbar to the plot.
|
||||
|
||||
im_kw : dict, default=None
|
||||
Dict with keywords passed to `matplotlib.pyplot.imshow` call.
|
||||
|
||||
text_kw : dict, default=None
|
||||
Dict with keywords passed to `matplotlib.pyplot.text` call.
|
||||
|
||||
.. versionadded:: 1.2
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`
|
||||
Returns a :class:`~sklearn.metrics.ConfusionMatrixDisplay` instance
|
||||
that contains all the information to plot the confusion matrix.
|
||||
"""
|
||||
check_matplotlib_support("ConfusionMatrixDisplay.plot")
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
if ax is None:
|
||||
fig, ax = plt.subplots()
|
||||
else:
|
||||
fig = ax.figure
|
||||
|
||||
cm = self.confusion_matrix
|
||||
n_classes = cm.shape[0]
|
||||
|
||||
default_im_kw = dict(interpolation="nearest", cmap=cmap)
|
||||
im_kw = im_kw or {}
|
||||
im_kw = _validate_style_kwargs(default_im_kw, im_kw)
|
||||
text_kw = text_kw or {}
|
||||
|
||||
self.im_ = ax.imshow(cm, **im_kw)
|
||||
self.text_ = None
|
||||
cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(1.0)
|
||||
|
||||
if include_values:
|
||||
self.text_ = np.empty_like(cm, dtype=object)
|
||||
|
||||
# print text with appropriate color depending on background
|
||||
thresh = (cm.max() + cm.min()) / 2.0
|
||||
|
||||
for i, j in product(range(n_classes), range(n_classes)):
|
||||
color = cmap_max if cm[i, j] < thresh else cmap_min
|
||||
|
||||
if values_format is None:
|
||||
text_cm = format(cm[i, j], ".2g")
|
||||
if cm.dtype.kind != "f":
|
||||
text_d = format(cm[i, j], "d")
|
||||
if len(text_d) < len(text_cm):
|
||||
text_cm = text_d
|
||||
else:
|
||||
text_cm = format(cm[i, j], values_format)
|
||||
|
||||
default_text_kwargs = dict(ha="center", va="center", color=color)
|
||||
text_kwargs = _validate_style_kwargs(default_text_kwargs, text_kw)
|
||||
|
||||
self.text_[i, j] = ax.text(j, i, text_cm, **text_kwargs)
|
||||
|
||||
if self.display_labels is None:
|
||||
display_labels = np.arange(n_classes)
|
||||
else:
|
||||
display_labels = self.display_labels
|
||||
if colorbar:
|
||||
fig.colorbar(self.im_, ax=ax)
|
||||
ax.set(
|
||||
xticks=np.arange(n_classes),
|
||||
yticks=np.arange(n_classes),
|
||||
xticklabels=display_labels,
|
||||
yticklabels=display_labels,
|
||||
ylabel="True label",
|
||||
xlabel="Predicted label",
|
||||
)
|
||||
|
||||
ax.set_ylim((n_classes - 0.5, -0.5))
|
||||
plt.setp(ax.get_xticklabels(), rotation=xticks_rotation)
|
||||
|
||||
self.figure_ = fig
|
||||
self.ax_ = ax
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def from_estimator(
|
||||
cls,
|
||||
estimator,
|
||||
X,
|
||||
y,
|
||||
*,
|
||||
labels=None,
|
||||
sample_weight=None,
|
||||
normalize=None,
|
||||
display_labels=None,
|
||||
include_values=True,
|
||||
xticks_rotation="horizontal",
|
||||
values_format=None,
|
||||
cmap="viridis",
|
||||
ax=None,
|
||||
colorbar=True,
|
||||
im_kw=None,
|
||||
text_kw=None,
|
||||
):
|
||||
"""Plot Confusion Matrix given an estimator and some data.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools, see
|
||||
the :ref:`Visualization Guide <visualizations>`.
|
||||
For guidance on interpreting these plots, refer to the
|
||||
:ref:`Model Evaluation Guide <confusion_matrix>`.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
estimator : estimator instance
|
||||
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
|
||||
in which the last estimator is a classifier.
|
||||
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Input values.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
Target values.
|
||||
|
||||
labels : array-like of shape (n_classes,), default=None
|
||||
List of labels to index the confusion matrix. This may be used to
|
||||
reorder or select a subset of labels. If `None` is given, those
|
||||
that appear at least once in `y_true` or `y_pred` are used in
|
||||
sorted order.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Sample weights.
|
||||
|
||||
normalize : {'true', 'pred', 'all'}, default=None
|
||||
Either to normalize the counts display in the matrix:
|
||||
|
||||
- if `'true'`, the confusion matrix is normalized over the true
|
||||
conditions (e.g. rows);
|
||||
- if `'pred'`, the confusion matrix is normalized over the
|
||||
predicted conditions (e.g. columns);
|
||||
- if `'all'`, the confusion matrix is normalized by the total
|
||||
number of samples;
|
||||
- if `None` (default), the confusion matrix will not be normalized.
|
||||
|
||||
display_labels : array-like of shape (n_classes,), default=None
|
||||
Target names used for plotting. By default, `labels` will be used
|
||||
if it is defined, otherwise the unique labels of `y_true` and
|
||||
`y_pred` will be used.
|
||||
|
||||
include_values : bool, default=True
|
||||
Includes values in confusion matrix.
|
||||
|
||||
xticks_rotation : {'vertical', 'horizontal'} or float, \
|
||||
default='horizontal'
|
||||
Rotation of xtick labels.
|
||||
|
||||
values_format : str, default=None
|
||||
Format specification for values in confusion matrix. If `None`, the
|
||||
format specification is 'd' or '.2g' whichever is shorter.
|
||||
|
||||
cmap : str or matplotlib Colormap, default='viridis'
|
||||
Colormap recognized by matplotlib.
|
||||
|
||||
ax : matplotlib Axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
colorbar : bool, default=True
|
||||
Whether or not to add a colorbar to the plot.
|
||||
|
||||
im_kw : dict, default=None
|
||||
Dict with keywords passed to `matplotlib.pyplot.imshow` call.
|
||||
|
||||
text_kw : dict, default=None
|
||||
Dict with keywords passed to `matplotlib.pyplot.text` call.
|
||||
|
||||
.. versionadded:: 1.2
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`
|
||||
|
||||
See Also
|
||||
--------
|
||||
ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix
|
||||
given the true and predicted labels.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> from sklearn.metrics import ConfusionMatrixDisplay
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> X, y = make_classification(random_state=0)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(
|
||||
... X, y, random_state=0)
|
||||
>>> clf = SVC(random_state=0)
|
||||
>>> clf.fit(X_train, y_train)
|
||||
SVC(random_state=0)
|
||||
>>> ConfusionMatrixDisplay.from_estimator(
|
||||
... clf, X_test, y_test)
|
||||
<...>
|
||||
>>> plt.show()
|
||||
|
||||
For a detailed example of using a confusion matrix to evaluate a
|
||||
Support Vector Classifier, please see
|
||||
:ref:`sphx_glr_auto_examples_model_selection_plot_confusion_matrix.py`
|
||||
"""
|
||||
method_name = f"{cls.__name__}.from_estimator"
|
||||
check_matplotlib_support(method_name)
|
||||
if not is_classifier(estimator):
|
||||
raise ValueError(f"{method_name} only supports classifiers")
|
||||
y_pred = estimator.predict(X)
|
||||
|
||||
return cls.from_predictions(
|
||||
y,
|
||||
y_pred,
|
||||
sample_weight=sample_weight,
|
||||
labels=labels,
|
||||
normalize=normalize,
|
||||
display_labels=display_labels,
|
||||
include_values=include_values,
|
||||
cmap=cmap,
|
||||
ax=ax,
|
||||
xticks_rotation=xticks_rotation,
|
||||
values_format=values_format,
|
||||
colorbar=colorbar,
|
||||
im_kw=im_kw,
|
||||
text_kw=text_kw,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_predictions(
|
||||
cls,
|
||||
y_true,
|
||||
y_pred,
|
||||
*,
|
||||
labels=None,
|
||||
sample_weight=None,
|
||||
normalize=None,
|
||||
display_labels=None,
|
||||
include_values=True,
|
||||
xticks_rotation="horizontal",
|
||||
values_format=None,
|
||||
cmap="viridis",
|
||||
ax=None,
|
||||
colorbar=True,
|
||||
im_kw=None,
|
||||
text_kw=None,
|
||||
):
|
||||
"""Plot Confusion Matrix given true and predicted labels.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools, see
|
||||
the :ref:`Visualization Guide <visualizations>`.
|
||||
For guidance on interpreting these plots, refer to the
|
||||
:ref:`Model Evaluation Guide <confusion_matrix>`.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
y_true : array-like of shape (n_samples,)
|
||||
True labels.
|
||||
|
||||
y_pred : array-like of shape (n_samples,)
|
||||
The predicted labels given by the method `predict` of an
|
||||
classifier.
|
||||
|
||||
labels : array-like of shape (n_classes,), default=None
|
||||
List of labels to index the confusion matrix. This may be used to
|
||||
reorder or select a subset of labels. If `None` is given, those
|
||||
that appear at least once in `y_true` or `y_pred` are used in
|
||||
sorted order.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Sample weights.
|
||||
|
||||
normalize : {'true', 'pred', 'all'}, default=None
|
||||
Either to normalize the counts display in the matrix:
|
||||
|
||||
- if `'true'`, the confusion matrix is normalized over the true
|
||||
conditions (e.g. rows);
|
||||
- if `'pred'`, the confusion matrix is normalized over the
|
||||
predicted conditions (e.g. columns);
|
||||
- if `'all'`, the confusion matrix is normalized by the total
|
||||
number of samples;
|
||||
- if `None` (default), the confusion matrix will not be normalized.
|
||||
|
||||
display_labels : array-like of shape (n_classes,), default=None
|
||||
Target names used for plotting. By default, `labels` will be used
|
||||
if it is defined, otherwise the unique labels of `y_true` and
|
||||
`y_pred` will be used.
|
||||
|
||||
include_values : bool, default=True
|
||||
Includes values in confusion matrix.
|
||||
|
||||
xticks_rotation : {'vertical', 'horizontal'} or float, \
|
||||
default='horizontal'
|
||||
Rotation of xtick labels.
|
||||
|
||||
values_format : str, default=None
|
||||
Format specification for values in confusion matrix. If `None`, the
|
||||
format specification is 'd' or '.2g' whichever is shorter.
|
||||
|
||||
cmap : str or matplotlib Colormap, default='viridis'
|
||||
Colormap recognized by matplotlib.
|
||||
|
||||
ax : matplotlib Axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
colorbar : bool, default=True
|
||||
Whether or not to add a colorbar to the plot.
|
||||
|
||||
im_kw : dict, default=None
|
||||
Dict with keywords passed to `matplotlib.pyplot.imshow` call.
|
||||
|
||||
text_kw : dict, default=None
|
||||
Dict with keywords passed to `matplotlib.pyplot.text` call.
|
||||
|
||||
.. versionadded:: 1.2
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`
|
||||
|
||||
See Also
|
||||
--------
|
||||
ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix
|
||||
given an estimator, the data, and the label.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> from sklearn.metrics import ConfusionMatrixDisplay
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> X, y = make_classification(random_state=0)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(
|
||||
... X, y, random_state=0)
|
||||
>>> clf = SVC(random_state=0)
|
||||
>>> clf.fit(X_train, y_train)
|
||||
SVC(random_state=0)
|
||||
>>> y_pred = clf.predict(X_test)
|
||||
>>> ConfusionMatrixDisplay.from_predictions(
|
||||
... y_test, y_pred)
|
||||
<...>
|
||||
>>> plt.show()
|
||||
"""
|
||||
check_matplotlib_support(f"{cls.__name__}.from_predictions")
|
||||
|
||||
if display_labels is None:
|
||||
if labels is None:
|
||||
display_labels = unique_labels(y_true, y_pred)
|
||||
else:
|
||||
display_labels = labels
|
||||
|
||||
cm = confusion_matrix(
|
||||
y_true,
|
||||
y_pred,
|
||||
sample_weight=sample_weight,
|
||||
labels=labels,
|
||||
normalize=normalize,
|
||||
)
|
||||
|
||||
disp = cls(confusion_matrix=cm, display_labels=display_labels)
|
||||
|
||||
return disp.plot(
|
||||
include_values=include_values,
|
||||
cmap=cmap,
|
||||
ax=ax,
|
||||
xticks_rotation=xticks_rotation,
|
||||
values_format=values_format,
|
||||
colorbar=colorbar,
|
||||
im_kw=im_kw,
|
||||
text_kw=text_kw,
|
||||
)
|
||||
@@ -0,0 +1,388 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numpy as np
|
||||
import scipy as sp
|
||||
|
||||
from sklearn.metrics._ranking import det_curve
|
||||
from sklearn.utils._plotting import (
|
||||
_BinaryClassifierCurveDisplayMixin,
|
||||
_deprecate_y_pred_parameter,
|
||||
)
|
||||
|
||||
|
||||
class DetCurveDisplay(_BinaryClassifierCurveDisplayMixin):
|
||||
"""Detection Error Tradeoff (DET) curve visualization.
|
||||
|
||||
It is recommended to use :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`
|
||||
or :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` to create a
|
||||
visualizer. All parameters are stored as attributes.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools, see
|
||||
the :ref:`Visualization Guide <visualizations>`.
|
||||
For guidance on interpreting these plots, refer to the
|
||||
:ref:`Model Evaluation Guide <det_curve>`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fpr : ndarray
|
||||
False positive rate.
|
||||
|
||||
fnr : ndarray
|
||||
False negative rate.
|
||||
|
||||
estimator_name : str, default=None
|
||||
Name of estimator. If None, the estimator name is not shown.
|
||||
|
||||
pos_label : int, float, bool or str, default=None
|
||||
The label of the positive class. If not `None`, this value is displayed in
|
||||
the x- and y-axes labels.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
line_ : matplotlib Artist
|
||||
DET Curve.
|
||||
|
||||
ax_ : matplotlib Axes
|
||||
Axes with DET Curve.
|
||||
|
||||
figure_ : matplotlib Figure
|
||||
Figure containing the curve.
|
||||
|
||||
See Also
|
||||
--------
|
||||
det_curve : Compute error rates for different probability thresholds.
|
||||
DetCurveDisplay.from_estimator : Plot DET curve given an estimator and
|
||||
some data.
|
||||
DetCurveDisplay.from_predictions : Plot DET curve given the true and
|
||||
predicted labels.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> from sklearn.metrics import det_curve, DetCurveDisplay
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> X, y = make_classification(n_samples=1000, random_state=0)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(
|
||||
... X, y, test_size=0.4, random_state=0)
|
||||
>>> clf = SVC(random_state=0).fit(X_train, y_train)
|
||||
>>> y_score = clf.decision_function(X_test)
|
||||
>>> fpr, fnr, _ = det_curve(y_test, y_score)
|
||||
>>> display = DetCurveDisplay(
|
||||
... fpr=fpr, fnr=fnr, estimator_name="SVC"
|
||||
... )
|
||||
>>> display.plot()
|
||||
<...>
|
||||
>>> plt.show()
|
||||
"""
|
||||
|
||||
def __init__(self, *, fpr, fnr, estimator_name=None, pos_label=None):
|
||||
self.fpr = fpr
|
||||
self.fnr = fnr
|
||||
self.estimator_name = estimator_name
|
||||
self.pos_label = pos_label
|
||||
|
||||
@classmethod
|
||||
def from_estimator(
|
||||
cls,
|
||||
estimator,
|
||||
X,
|
||||
y,
|
||||
*,
|
||||
sample_weight=None,
|
||||
drop_intermediate=True,
|
||||
response_method="auto",
|
||||
pos_label=None,
|
||||
name=None,
|
||||
ax=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Plot DET curve given an estimator and data.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools, see
|
||||
the :ref:`Visualization Guide <visualizations>`.
|
||||
For guidance on interpreting these plots, refer to the
|
||||
:ref:`Model Evaluation Guide <det_curve>`.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
estimator : estimator instance
|
||||
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
|
||||
in which the last estimator is a classifier.
|
||||
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Input values.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
Target values.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Sample weights.
|
||||
|
||||
drop_intermediate : bool, default=True
|
||||
Whether to drop thresholds where true positives (tp) do not change
|
||||
from the previous or subsequent threshold. All points with the same
|
||||
tp value have the same `fnr` and thus same y coordinate.
|
||||
|
||||
.. versionadded:: 1.7
|
||||
|
||||
response_method : {'predict_proba', 'decision_function', 'auto'} \
|
||||
default='auto'
|
||||
Specifies whether to use :term:`predict_proba` or
|
||||
:term:`decision_function` as the predicted target response. If set
|
||||
to 'auto', :term:`predict_proba` is tried first and if it does not
|
||||
exist :term:`decision_function` is tried next.
|
||||
|
||||
pos_label : int, float, bool or str, default=None
|
||||
The label of the positive class. By default, `estimators.classes_[1]`
|
||||
is considered as the positive class.
|
||||
|
||||
name : str, default=None
|
||||
Name of DET curve for labeling. If `None`, use the name of the
|
||||
estimator.
|
||||
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
**kwargs : dict
|
||||
Additional keywords arguments passed to matplotlib `plot` function.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.DetCurveDisplay`
|
||||
Object that stores computed values.
|
||||
|
||||
See Also
|
||||
--------
|
||||
det_curve : Compute error rates for different probability thresholds.
|
||||
DetCurveDisplay.from_predictions : Plot DET curve given the true and
|
||||
predicted labels.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> from sklearn.metrics import DetCurveDisplay
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> X, y = make_classification(n_samples=1000, random_state=0)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(
|
||||
... X, y, test_size=0.4, random_state=0)
|
||||
>>> clf = SVC(random_state=0).fit(X_train, y_train)
|
||||
>>> DetCurveDisplay.from_estimator(
|
||||
... clf, X_test, y_test)
|
||||
<...>
|
||||
>>> plt.show()
|
||||
"""
|
||||
y_score, pos_label, name = cls._validate_and_get_response_values(
|
||||
estimator,
|
||||
X,
|
||||
y,
|
||||
response_method=response_method,
|
||||
pos_label=pos_label,
|
||||
name=name,
|
||||
)
|
||||
|
||||
return cls.from_predictions(
|
||||
y_true=y,
|
||||
y_score=y_score,
|
||||
sample_weight=sample_weight,
|
||||
drop_intermediate=drop_intermediate,
|
||||
name=name,
|
||||
ax=ax,
|
||||
pos_label=pos_label,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_predictions(
|
||||
cls,
|
||||
y_true,
|
||||
y_score=None,
|
||||
*,
|
||||
sample_weight=None,
|
||||
drop_intermediate=True,
|
||||
pos_label=None,
|
||||
name=None,
|
||||
ax=None,
|
||||
y_pred="deprecated",
|
||||
**kwargs,
|
||||
):
|
||||
"""Plot the DET curve given the true and predicted labels.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools, see
|
||||
the :ref:`Visualization Guide <visualizations>`.
|
||||
For guidance on interpreting these plots, refer to the
|
||||
:ref:`Model Evaluation Guide <det_curve>`.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
y_true : array-like of shape (n_samples,)
|
||||
True labels.
|
||||
|
||||
y_score : array-like of shape (n_samples,)
|
||||
Target scores, can either be probability estimates of the positive
|
||||
class, confidence values, or non-thresholded measure of decisions
|
||||
(as returned by `decision_function` on some classifiers).
|
||||
|
||||
.. versionadded:: 1.8
|
||||
`y_pred` has been renamed to `y_score`.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Sample weights.
|
||||
|
||||
drop_intermediate : bool, default=True
|
||||
Whether to drop thresholds where true positives (tp) do not change
|
||||
from the previous or subsequent threshold. All points with the same
|
||||
tp value have the same `fnr` and thus same y coordinate.
|
||||
|
||||
.. versionadded:: 1.7
|
||||
|
||||
pos_label : int, float, bool or str, default=None
|
||||
The label of the positive class. When `pos_label=None`, if `y_true`
|
||||
is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an
|
||||
error will be raised.
|
||||
|
||||
name : str, default=None
|
||||
Name of DET curve for labeling. If `None`, name will be set to
|
||||
`"Classifier"`.
|
||||
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
y_pred : array-like of shape (n_samples,)
|
||||
Target scores, can either be probability estimates of the positive
|
||||
class, confidence values, or non-thresholded measure of decisions
|
||||
(as returned by “decision_function” on some classifiers).
|
||||
|
||||
.. deprecated:: 1.8
|
||||
`y_pred` is deprecated and will be removed in 1.10. Use
|
||||
`y_score` instead.
|
||||
|
||||
**kwargs : dict
|
||||
Additional keywords arguments passed to matplotlib `plot` function.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.DetCurveDisplay`
|
||||
Object that stores computed values.
|
||||
|
||||
See Also
|
||||
--------
|
||||
det_curve : Compute error rates for different probability thresholds.
|
||||
DetCurveDisplay.from_estimator : Plot DET curve given an estimator and
|
||||
some data.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> from sklearn.metrics import DetCurveDisplay
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> X, y = make_classification(n_samples=1000, random_state=0)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(
|
||||
... X, y, test_size=0.4, random_state=0)
|
||||
>>> clf = SVC(random_state=0).fit(X_train, y_train)
|
||||
>>> y_score = clf.decision_function(X_test)
|
||||
>>> DetCurveDisplay.from_predictions(
|
||||
... y_test, y_score)
|
||||
<...>
|
||||
>>> plt.show()
|
||||
"""
|
||||
y_score = _deprecate_y_pred_parameter(y_score, y_pred, "1.8")
|
||||
pos_label_validated, name = cls._validate_from_predictions_params(
|
||||
y_true, y_score, sample_weight=sample_weight, pos_label=pos_label, name=name
|
||||
)
|
||||
|
||||
fpr, fnr, _ = det_curve(
|
||||
y_true,
|
||||
y_score,
|
||||
pos_label=pos_label,
|
||||
sample_weight=sample_weight,
|
||||
drop_intermediate=drop_intermediate,
|
||||
)
|
||||
|
||||
viz = cls(
|
||||
fpr=fpr,
|
||||
fnr=fnr,
|
||||
estimator_name=name,
|
||||
pos_label=pos_label_validated,
|
||||
)
|
||||
|
||||
return viz.plot(ax=ax, name=name, **kwargs)
|
||||
|
||||
def plot(self, ax=None, *, name=None, **kwargs):
|
||||
"""Plot visualization.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
name : str, default=None
|
||||
Name of DET curve for labeling. If `None`, use `estimator_name` if
|
||||
it is not `None`, otherwise no labeling is shown.
|
||||
|
||||
**kwargs : dict
|
||||
Additional keywords arguments passed to matplotlib `plot` function.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.DetCurveDisplay`
|
||||
Object that stores computed values.
|
||||
"""
|
||||
self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name)
|
||||
|
||||
line_kwargs = {} if name is None else {"label": name}
|
||||
line_kwargs.update(**kwargs)
|
||||
|
||||
# We have the following bounds:
|
||||
# sp.stats.norm.ppf(0.0) = -np.inf
|
||||
# sp.stats.norm.ppf(1.0) = np.inf
|
||||
# We therefore clip to eps and 1 - eps to not provide infinity to matplotlib.
|
||||
eps = np.finfo(self.fpr.dtype).eps
|
||||
self.fpr = self.fpr.clip(eps, 1 - eps)
|
||||
self.fnr = self.fnr.clip(eps, 1 - eps)
|
||||
|
||||
(self.line_,) = self.ax_.plot(
|
||||
sp.stats.norm.ppf(self.fpr),
|
||||
sp.stats.norm.ppf(self.fnr),
|
||||
**line_kwargs,
|
||||
)
|
||||
info_pos_label = (
|
||||
f" (Positive label: {self.pos_label})" if self.pos_label is not None else ""
|
||||
)
|
||||
|
||||
xlabel = "False Positive Rate" + info_pos_label
|
||||
ylabel = "False Negative Rate" + info_pos_label
|
||||
self.ax_.set(xlabel=xlabel, ylabel=ylabel)
|
||||
|
||||
if "label" in line_kwargs:
|
||||
self.ax_.legend(loc="lower right")
|
||||
|
||||
ticks = [0.001, 0.01, 0.05, 0.20, 0.5, 0.80, 0.95, 0.99, 0.999]
|
||||
tick_locations = sp.stats.norm.ppf(ticks)
|
||||
tick_labels = [
|
||||
"{:.0%}".format(s) if (100 * s).is_integer() else "{:.1%}".format(s)
|
||||
for s in ticks
|
||||
]
|
||||
self.ax_.set_xticks(tick_locations)
|
||||
self.ax_.set_xticklabels(tick_labels)
|
||||
self.ax_.set_xlim(-3, 3)
|
||||
self.ax_.set_yticks(tick_locations)
|
||||
self.ax_.set_yticklabels(tick_labels)
|
||||
self.ax_.set_ylim(-3, 3)
|
||||
|
||||
return self
|
||||
@@ -0,0 +1,582 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from collections import Counter
|
||||
|
||||
from sklearn.metrics._ranking import average_precision_score, precision_recall_curve
|
||||
from sklearn.utils._plotting import (
|
||||
_BinaryClassifierCurveDisplayMixin,
|
||||
_deprecate_estimator_name,
|
||||
_deprecate_y_pred_parameter,
|
||||
_despine,
|
||||
_validate_style_kwargs,
|
||||
)
|
||||
|
||||
|
||||
class PrecisionRecallDisplay(_BinaryClassifierCurveDisplayMixin):
    """Precision Recall visualization.

    It is recommended to use
    :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator` or
    :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` to create
    a :class:`~sklearn.metrics.PrecisionRecallDisplay`. All parameters are
    stored as attributes.

    For general information regarding `scikit-learn` visualization tools, see
    the :ref:`Visualization Guide <visualizations>`.
    For guidance on interpreting these plots, refer to the :ref:`Model
    Evaluation Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    precision : ndarray
        Precision values.

    recall : ndarray
        Recall values.

    average_precision : float, default=None
        Average precision. If None, the average precision is not shown.

    name : str, default=None
        Name of estimator. If None, then the estimator name is not shown.

        .. versionchanged:: 1.8
            `estimator_name` was deprecated in favor of `name`.

    pos_label : int, float, bool or str, default=None
        The class considered the positive class when precision and recall metrics
        computed. If not `None`, this value is displayed in the x- and y-axes labels.

        .. versionadded:: 0.24

    prevalence_pos_label : float, default=None
        The prevalence of the positive label. It is used for plotting the
        chance level line. If None, the chance level line will not be plotted
        even if `plot_chance_level` is set to True when plotting.

        .. versionadded:: 1.3

    estimator_name : str, default=None
        Name of estimator. If None, the estimator name is not shown.

        .. deprecated:: 1.8
            `estimator_name` is deprecated and will be removed in 1.10. Use `name`
            instead.

    Attributes
    ----------
    line_ : matplotlib Artist
        Precision recall curve.

    chance_level_ : matplotlib Artist or None
        The chance level line. It is `None` if the chance level is not plotted.

        .. versionadded:: 1.3

    ax_ : matplotlib Axes
        Axes with precision recall curve.

    figure_ : matplotlib Figure
        Figure containing the curve.

    See Also
    --------
    precision_recall_curve : Compute precision-recall pairs for different
        probability thresholds.
    PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given
        a binary classifier.
    PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve
        using predictions from a binary classifier.

    Notes
    -----
    The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) in
    scikit-learn is computed without any interpolation. To be consistent with
    this metric, the precision-recall curve is plotted without any
    interpolation as well (step-wise style).

    You can change this style by passing the keyword argument
    `drawstyle="default"` in :meth:`plot`, :meth:`from_estimator`, or
    :meth:`from_predictions`. However, the curve will not be strictly
    consistent with the reported average precision.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.metrics import (precision_recall_curve,
    ...                              PrecisionRecallDisplay)
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.svm import SVC
    >>> X, y = make_classification(random_state=0)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=0)
    >>> clf = SVC(random_state=0)
    >>> clf.fit(X_train, y_train)
    SVC(random_state=0)
    >>> predictions = clf.predict(X_test)
    >>> precision, recall, _ = precision_recall_curve(y_test, predictions)
    >>> disp = PrecisionRecallDisplay(precision=precision, recall=recall)
    >>> disp.plot()
    <...>
    >>> plt.show()
    """

    def __init__(
        self,
        precision,
        recall,
        *,
        average_precision=None,
        name=None,
        pos_label=None,
        prevalence_pos_label=None,
        estimator_name="deprecated",
    ):
        # `_deprecate_estimator_name` resolves the legacy `estimator_name`
        # parameter against the new `name` parameter (warning on legacy use).
        self.name = _deprecate_estimator_name(estimator_name, name, "1.8")
        self.precision = precision
        self.recall = recall
        self.average_precision = average_precision
        self.pos_label = pos_label
        self.prevalence_pos_label = prevalence_pos_label

    def plot(
        self,
        ax=None,
        *,
        name=None,
        plot_chance_level=False,
        chance_level_kw=None,
        despine=False,
        **kwargs,
    ):
        """Plot visualization.

        Extra keyword arguments will be passed to matplotlib's `plot`.

        Parameters
        ----------
        ax : Matplotlib Axes, default=None
            Axes object to plot on. If `None`, a new figure and axes is
            created.

        name : str, default=None
            Name of precision recall curve for labeling. If `None`, use
            `name` if not `None`, otherwise no labeling is shown.

        plot_chance_level : bool, default=False
            Whether to plot the chance level. The chance level is the prevalence
            of the positive label computed from the data passed during
            :meth:`from_estimator` or :meth:`from_predictions` call.

            .. versionadded:: 1.3

        chance_level_kw : dict, default=None
            Keyword arguments to be passed to matplotlib's `plot` for rendering
            the chance level line.

            .. versionadded:: 1.3

        despine : bool, default=False
            Whether to remove the top and right spines from the plot.

            .. versionadded:: 1.6

        **kwargs : dict
            Keyword arguments to be passed to matplotlib's `plot`.

        Returns
        -------
        display : :class:`~sklearn.metrics.PrecisionRecallDisplay`
            Object that stores computed values.

        Notes
        -----
        The average precision (cf. :func:`~sklearn.metrics.average_precision_score`)
        in scikit-learn is computed without any interpolation. To be consistent
        with this metric, the precision-recall curve is plotted without any
        interpolation as well (step-wise style).

        You can change this style by passing the keyword argument
        `drawstyle="default"`. However, the curve will not be strictly
        consistent with the reported average precision.
        """
        self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name)

        # "steps-post" keeps the curve consistent with the non-interpolated
        # average precision metric (see the Notes section).
        default_line_kwargs = {"drawstyle": "steps-post"}
        # Legend label precedence: name + AP > AP alone > name alone > none.
        if self.average_precision is not None and name is not None:
            default_line_kwargs["label"] = (
                f"{name} (AP = {self.average_precision:0.2f})"
            )
        elif self.average_precision is not None:
            default_line_kwargs["label"] = f"AP = {self.average_precision:0.2f}"
        elif name is not None:
            default_line_kwargs["label"] = name

        line_kwargs = _validate_style_kwargs(default_line_kwargs, kwargs)

        (self.line_,) = self.ax_.plot(self.recall, self.precision, **line_kwargs)

        info_pos_label = (
            f" (Positive label: {self.pos_label})" if self.pos_label is not None else ""
        )

        xlabel = "Recall" + info_pos_label
        ylabel = "Precision" + info_pos_label
        self.ax_.set(
            xlabel=xlabel,
            xlim=(-0.01, 1.01),
            ylabel=ylabel,
            ylim=(-0.01, 1.01),
            aspect="equal",
        )

        if plot_chance_level:
            # The chance level requires the positive-class prevalence, which is
            # only known when the display was built from data.
            if self.prevalence_pos_label is None:
                raise ValueError(
                    "You must provide prevalence_pos_label when constructing the "
                    "PrecisionRecallDisplay object in order to plot the chance "
                    "level line. Alternatively, you may use "
                    "PrecisionRecallDisplay.from_estimator or "
                    "PrecisionRecallDisplay.from_predictions "
                    "to automatically set prevalence_pos_label"
                )

            default_chance_level_line_kw = {
                "label": f"Chance level (AP = {self.prevalence_pos_label:0.2f})",
                "color": "k",
                "linestyle": "--",
            }

            if chance_level_kw is None:
                chance_level_kw = {}

            chance_level_line_kw = _validate_style_kwargs(
                default_chance_level_line_kw, chance_level_kw
            )

            # A no-skill classifier has constant precision equal to the
            # prevalence of the positive class.
            (self.chance_level_,) = self.ax_.plot(
                (0, 1),
                (self.prevalence_pos_label, self.prevalence_pos_label),
                **chance_level_line_kw,
            )
        else:
            self.chance_level_ = None

        if despine:
            _despine(self.ax_)

        if "label" in line_kwargs or plot_chance_level:
            self.ax_.legend(loc="lower right")

        return self

    @classmethod
    def from_estimator(
        cls,
        estimator,
        X,
        y,
        *,
        sample_weight=None,
        drop_intermediate=False,
        response_method="auto",
        pos_label=None,
        name=None,
        ax=None,
        plot_chance_level=False,
        chance_level_kw=None,
        despine=False,
        **kwargs,
    ):
        """Plot precision-recall curve given an estimator and some data.

        For general information regarding `scikit-learn` visualization tools, see
        the :ref:`Visualization Guide <visualizations>`.
        For guidance on interpreting these plots, refer to the :ref:`Model
        Evaluation Guide <precision_recall_f_measure_metrics>`.

        Parameters
        ----------
        estimator : estimator instance
            Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
            in which the last estimator is a classifier.

        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input values.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        drop_intermediate : bool, default=False
            Whether to drop some suboptimal thresholds which would not appear
            on a plotted precision-recall curve. This is useful in order to
            create lighter precision-recall curves.

            .. versionadded:: 1.3

        response_method : {'predict_proba', 'decision_function', 'auto'}, \
                default='auto'
            Specifies whether to use :term:`predict_proba` or
            :term:`decision_function` as the target response. If set to 'auto',
            :term:`predict_proba` is tried first and if it does not exist
            :term:`decision_function` is tried next.

        pos_label : int, float, bool or str, default=None
            The class considered as the positive class when computing the
            precision and recall metrics. By default, `estimators.classes_[1]`
            is considered as the positive class.

        name : str, default=None
            Name for labeling curve. If `None`, no name is used.

        ax : matplotlib axes, default=None
            Axes object to plot on. If `None`, a new figure and axes is created.

        plot_chance_level : bool, default=False
            Whether to plot the chance level. The chance level is the prevalence
            of the positive label computed from the data passed during
            :meth:`from_estimator` or :meth:`from_predictions` call.

            .. versionadded:: 1.3

        chance_level_kw : dict, default=None
            Keyword arguments to be passed to matplotlib's `plot` for rendering
            the chance level line.

            .. versionadded:: 1.3

        despine : bool, default=False
            Whether to remove the top and right spines from the plot.

            .. versionadded:: 1.6

        **kwargs : dict
            Keyword arguments to be passed to matplotlib's `plot`.

        Returns
        -------
        display : :class:`~sklearn.metrics.PrecisionRecallDisplay`

        See Also
        --------
        PrecisionRecallDisplay.from_predictions : Plot precision-recall curve
            using estimated probabilities or output of decision function.

        Notes
        -----
        The average precision (cf. :func:`~sklearn.metrics.average_precision_score`)
        in scikit-learn is computed without any interpolation. To be consistent
        with this metric, the precision-recall curve is plotted without any
        interpolation as well (step-wise style).

        You can change this style by passing the keyword argument
        `drawstyle="default"`. However, the curve will not be strictly
        consistent with the reported average precision.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> from sklearn.datasets import make_classification
        >>> from sklearn.metrics import PrecisionRecallDisplay
        >>> from sklearn.model_selection import train_test_split
        >>> from sklearn.linear_model import LogisticRegression
        >>> X, y = make_classification(random_state=0)
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     X, y, random_state=0)
        >>> clf = LogisticRegression()
        >>> clf.fit(X_train, y_train)
        LogisticRegression()
        >>> PrecisionRecallDisplay.from_estimator(
        ...     clf, X_test, y_test)
        <...>
        >>> plt.show()
        """
        # Resolve the response values (probabilities or decision scores),
        # the positive label, and the display name from the estimator.
        y_score, pos_label, name = cls._validate_and_get_response_values(
            estimator,
            X,
            y,
            response_method=response_method,
            pos_label=pos_label,
            name=name,
        )

        # Delegate curve computation and rendering to `from_predictions`.
        return cls.from_predictions(
            y,
            y_score,
            sample_weight=sample_weight,
            name=name,
            pos_label=pos_label,
            drop_intermediate=drop_intermediate,
            ax=ax,
            plot_chance_level=plot_chance_level,
            chance_level_kw=chance_level_kw,
            despine=despine,
            **kwargs,
        )

    @classmethod
    def from_predictions(
        cls,
        y_true,
        y_score=None,
        *,
        sample_weight=None,
        drop_intermediate=False,
        pos_label=None,
        name=None,
        ax=None,
        plot_chance_level=False,
        chance_level_kw=None,
        despine=False,
        y_pred="deprecated",
        **kwargs,
    ):
        """Plot precision-recall curve given binary class predictions.

        For general information regarding `scikit-learn` visualization tools, see
        the :ref:`Visualization Guide <visualizations>`.
        For guidance on interpreting these plots, refer to the :ref:`Model
        Evaluation Guide <precision_recall_f_measure_metrics>`.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            True binary labels.

        y_score : array-like of shape (n_samples,)
            Estimated probabilities or output of decision function.

            .. versionadded:: 1.8
                `y_pred` has been renamed to `y_score`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        drop_intermediate : bool, default=False
            Whether to drop some suboptimal thresholds which would not appear
            on a plotted precision-recall curve. This is useful in order to
            create lighter precision-recall curves.

            .. versionadded:: 1.3

        pos_label : int, float, bool or str, default=None
            The class considered as the positive class when computing the
            precision and recall metrics. When `pos_label=None`, if `y_true` is
            in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an error
            will be raised.

        name : str, default=None
            Name for labeling curve. If `None`, name will be set to
            `"Classifier"`.

        ax : matplotlib axes, default=None
            Axes object to plot on. If `None`, a new figure and axes is created.

        plot_chance_level : bool, default=False
            Whether to plot the chance level. The chance level is the prevalence
            of the positive label computed from the data passed during
            :meth:`from_estimator` or :meth:`from_predictions` call.

            .. versionadded:: 1.3

        chance_level_kw : dict, default=None
            Keyword arguments to be passed to matplotlib's `plot` for rendering
            the chance level line.

            .. versionadded:: 1.3

        despine : bool, default=False
            Whether to remove the top and right spines from the plot.

            .. versionadded:: 1.6

        y_pred : array-like of shape (n_samples,)
            Estimated probabilities or output of decision function.

            .. deprecated:: 1.8
                `y_pred` is deprecated and will be removed in 1.10. Use
                `y_score` instead.

        **kwargs : dict
            Keyword arguments to be passed to matplotlib's `plot`.

        Returns
        -------
        display : :class:`~sklearn.metrics.PrecisionRecallDisplay`

        See Also
        --------
        PrecisionRecallDisplay.from_estimator : Plot precision-recall curve
            using an estimator.

        Notes
        -----
        The average precision (cf. :func:`~sklearn.metrics.average_precision_score`)
        in scikit-learn is computed without any interpolation. To be consistent
        with this metric, the precision-recall curve is plotted without any
        interpolation as well (step-wise style).

        You can change this style by passing the keyword argument
        `drawstyle="default"`. However, the curve will not be strictly
        consistent with the reported average precision.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> from sklearn.datasets import make_classification
        >>> from sklearn.metrics import PrecisionRecallDisplay
        >>> from sklearn.model_selection import train_test_split
        >>> from sklearn.linear_model import LogisticRegression
        >>> X, y = make_classification(random_state=0)
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     X, y, random_state=0)
        >>> clf = LogisticRegression()
        >>> clf.fit(X_train, y_train)
        LogisticRegression()
        >>> y_score = clf.predict_proba(X_test)[:, 1]
        >>> PrecisionRecallDisplay.from_predictions(
        ...     y_test, y_score)
        <...>
        >>> plt.show()
        """
        # Resolve the deprecated `y_pred` alias and validate the inputs.
        y_score = _deprecate_y_pred_parameter(y_score, y_pred, "1.8")
        pos_label, name = cls._validate_from_predictions_params(
            y_true, y_score, sample_weight=sample_weight, pos_label=pos_label, name=name
        )

        precision, recall, _ = precision_recall_curve(
            y_true,
            y_score,
            pos_label=pos_label,
            sample_weight=sample_weight,
            drop_intermediate=drop_intermediate,
        )
        average_precision = average_precision_score(
            y_true, y_score, pos_label=pos_label, sample_weight=sample_weight
        )

        # Prevalence of the positive class; this is the AP of a chance-level
        # (no-skill) classifier and feeds the optional chance level line.
        class_count = Counter(y_true)
        prevalence_pos_label = class_count[pos_label] / sum(class_count.values())

        viz = cls(
            precision=precision,
            recall=recall,
            average_precision=average_precision,
            name=name,
            pos_label=pos_label,
            prevalence_pos_label=prevalence_pos_label,
        )

        return viz.plot(
            ax=ax,
            name=name,
            plot_chance_level=plot_chance_level,
            chance_level_kw=chance_level_kw,
            despine=despine,
            **kwargs,
        )
|
||||
@@ -0,0 +1,413 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numbers
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils import _safe_indexing, check_random_state
|
||||
from sklearn.utils._optional_dependencies import check_matplotlib_support
|
||||
from sklearn.utils._plotting import _validate_style_kwargs
|
||||
|
||||
|
||||
class PredictionErrorDisplay:
|
||||
"""Visualization of the prediction error of a regression model.
|
||||
|
||||
This tool can display "residuals vs predicted" or "actual vs predicted"
|
||||
using scatter plots to qualitatively assess the behavior of a regressor,
|
||||
preferably on held-out data points.
|
||||
|
||||
See the details in the docstrings of
|
||||
:func:`~sklearn.metrics.PredictionErrorDisplay.from_estimator` or
|
||||
:func:`~sklearn.metrics.PredictionErrorDisplay.from_predictions` to
|
||||
create a visualizer. All parameters are stored as attributes.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools, read
|
||||
more in the :ref:`Visualization Guide <visualizations>`.
|
||||
For details regarding interpreting these plots, refer to the
|
||||
:ref:`Model Evaluation Guide <visualization_regression_evaluation>`.
|
||||
|
||||
.. versionadded:: 1.2
|
||||
|
||||
Parameters
|
||||
----------
|
||||
y_true : ndarray of shape (n_samples,)
|
||||
True values.
|
||||
|
||||
y_pred : ndarray of shape (n_samples,)
|
||||
Prediction values.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
line_ : matplotlib Artist
|
||||
Optimal line representing `y_true == y_pred`. Therefore, it is a
|
||||
diagonal line for `kind="predictions"` and a horizontal line for
|
||||
`kind="residuals"`.
|
||||
|
||||
errors_lines_ : matplotlib Artist or None
|
||||
Residual lines. If `with_errors=False`, then it is set to `None`.
|
||||
|
||||
scatter_ : matplotlib Artist
|
||||
Scatter data points.
|
||||
|
||||
ax_ : matplotlib Axes
|
||||
Axes with the different matplotlib axis.
|
||||
|
||||
figure_ : matplotlib Figure
|
||||
Figure containing the scatter and lines.
|
||||
|
||||
See Also
|
||||
--------
|
||||
PredictionErrorDisplay.from_estimator : Prediction error visualization
|
||||
given an estimator and some data.
|
||||
PredictionErrorDisplay.from_predictions : Prediction error visualization
|
||||
given the true and predicted targets.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import load_diabetes
|
||||
>>> from sklearn.linear_model import Ridge
|
||||
>>> from sklearn.metrics import PredictionErrorDisplay
|
||||
>>> X, y = load_diabetes(return_X_y=True)
|
||||
>>> ridge = Ridge().fit(X, y)
|
||||
>>> y_pred = ridge.predict(X)
|
||||
>>> display = PredictionErrorDisplay(y_true=y, y_pred=y_pred)
|
||||
>>> display.plot()
|
||||
<...>
|
||||
>>> plt.show()
|
||||
"""
|
||||
|
||||
def __init__(self, *, y_true, y_pred):
|
||||
self.y_true = y_true
|
||||
self.y_pred = y_pred
|
||||
|
||||
def plot(
|
||||
self,
|
||||
ax=None,
|
||||
*,
|
||||
kind="residual_vs_predicted",
|
||||
scatter_kwargs=None,
|
||||
line_kwargs=None,
|
||||
):
|
||||
"""Plot visualization.
|
||||
|
||||
Extra keyword arguments will be passed to matplotlib's ``plot``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
kind : {"actual_vs_predicted", "residual_vs_predicted"}, \
|
||||
default="residual_vs_predicted"
|
||||
The type of plot to draw:
|
||||
|
||||
- "actual_vs_predicted" draws the observed values (y-axis) vs.
|
||||
the predicted values (x-axis).
|
||||
- "residual_vs_predicted" draws the residuals, i.e. difference
|
||||
between observed and predicted values, (y-axis) vs. the predicted
|
||||
values (x-axis).
|
||||
|
||||
scatter_kwargs : dict, default=None
|
||||
Dictionary with keywords passed to the `matplotlib.pyplot.scatter`
|
||||
call.
|
||||
|
||||
line_kwargs : dict, default=None
|
||||
Dictionary with keyword passed to the `matplotlib.pyplot.plot`
|
||||
call to draw the optimal line.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.PredictionErrorDisplay`
|
||||
|
||||
Object that stores computed values.
|
||||
"""
|
||||
check_matplotlib_support(f"{self.__class__.__name__}.plot")
|
||||
|
||||
expected_kind = ("actual_vs_predicted", "residual_vs_predicted")
|
||||
if kind not in expected_kind:
|
||||
raise ValueError(
|
||||
f"`kind` must be one of {', '.join(expected_kind)}. "
|
||||
f"Got {kind!r} instead."
|
||||
)
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
if scatter_kwargs is None:
|
||||
scatter_kwargs = {}
|
||||
if line_kwargs is None:
|
||||
line_kwargs = {}
|
||||
|
||||
default_scatter_kwargs = {"color": "tab:blue", "alpha": 0.8}
|
||||
default_line_kwargs = {"color": "black", "alpha": 0.7, "linestyle": "--"}
|
||||
|
||||
scatter_kwargs = _validate_style_kwargs(default_scatter_kwargs, scatter_kwargs)
|
||||
line_kwargs = _validate_style_kwargs(default_line_kwargs, line_kwargs)
|
||||
|
||||
scatter_kwargs = {**default_scatter_kwargs, **scatter_kwargs}
|
||||
line_kwargs = {**default_line_kwargs, **line_kwargs}
|
||||
|
||||
if ax is None:
|
||||
_, ax = plt.subplots()
|
||||
|
||||
if kind == "actual_vs_predicted":
|
||||
max_value = max(np.max(self.y_true), np.max(self.y_pred))
|
||||
min_value = min(np.min(self.y_true), np.min(self.y_pred))
|
||||
self.line_ = ax.plot(
|
||||
[min_value, max_value], [min_value, max_value], **line_kwargs
|
||||
)[0]
|
||||
|
||||
x_data, y_data = self.y_pred, self.y_true
|
||||
xlabel, ylabel = "Predicted values", "Actual values"
|
||||
|
||||
self.scatter_ = ax.scatter(x_data, y_data, **scatter_kwargs)
|
||||
|
||||
# force to have a squared axis
|
||||
ax.set_aspect("equal", adjustable="datalim")
|
||||
ax.set_xticks(np.linspace(min_value, max_value, num=5))
|
||||
ax.set_yticks(np.linspace(min_value, max_value, num=5))
|
||||
else: # kind == "residual_vs_predicted"
|
||||
self.line_ = ax.plot(
|
||||
[np.min(self.y_pred), np.max(self.y_pred)],
|
||||
[0, 0],
|
||||
**line_kwargs,
|
||||
)[0]
|
||||
self.scatter_ = ax.scatter(
|
||||
self.y_pred, self.y_true - self.y_pred, **scatter_kwargs
|
||||
)
|
||||
xlabel, ylabel = "Predicted values", "Residuals (actual - predicted)"
|
||||
|
||||
ax.set(xlabel=xlabel, ylabel=ylabel)
|
||||
|
||||
self.ax_ = ax
|
||||
self.figure_ = ax.figure
|
||||
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def from_estimator(
|
||||
cls,
|
||||
estimator,
|
||||
X,
|
||||
y,
|
||||
*,
|
||||
kind="residual_vs_predicted",
|
||||
subsample=1_000,
|
||||
random_state=None,
|
||||
ax=None,
|
||||
scatter_kwargs=None,
|
||||
line_kwargs=None,
|
||||
):
|
||||
"""Plot the prediction error given a regressor and some data.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools,
|
||||
read more in the :ref:`Visualization Guide <visualizations>`.
|
||||
For details regarding interpreting these plots, refer to the
|
||||
:ref:`Model Evaluation Guide <visualization_regression_evaluation>`.
|
||||
|
||||
.. versionadded:: 1.2
|
||||
|
||||
Parameters
|
||||
----------
|
||||
estimator : estimator instance
|
||||
Fitted regressor or a fitted :class:`~sklearn.pipeline.Pipeline`
|
||||
in which the last estimator is a regressor.
|
||||
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Input values.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
Target values.
|
||||
|
||||
kind : {"actual_vs_predicted", "residual_vs_predicted"}, \
|
||||
default="residual_vs_predicted"
|
||||
The type of plot to draw:
|
||||
|
||||
- "actual_vs_predicted" draws the observed values (y-axis) vs.
|
||||
the predicted values (x-axis).
|
||||
- "residual_vs_predicted" draws the residuals, i.e. difference
|
||||
between observed and predicted values, (y-axis) vs. the predicted
|
||||
values (x-axis).
|
||||
|
||||
subsample : float, int or None, default=1_000
|
||||
Sampling the samples to be shown on the scatter plot. If `float`,
|
||||
it should be between 0 and 1 and represents the proportion of the
|
||||
original dataset. If `int`, it represents the number of samples
|
||||
display on the scatter plot. If `None`, no subsampling will be
|
||||
applied. by default, 1000 samples or less will be displayed.
|
||||
|
||||
random_state : int or RandomState, default=None
|
||||
Controls the randomness when `subsample` is not `None`.
|
||||
See :term:`Glossary <random_state>` for details.
|
||||
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
scatter_kwargs : dict, default=None
|
||||
Dictionary with keywords passed to the `matplotlib.pyplot.scatter`
|
||||
call.
|
||||
|
||||
line_kwargs : dict, default=None
|
||||
Dictionary with keyword passed to the `matplotlib.pyplot.plot`
|
||||
call to draw the optimal line.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.PredictionErrorDisplay`
|
||||
Object that stores the computed values.
|
||||
|
||||
See Also
|
||||
--------
|
||||
PredictionErrorDisplay : Prediction error visualization for regression.
|
||||
PredictionErrorDisplay.from_predictions : Prediction error visualization
|
||||
given the true and predicted targets.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import load_diabetes
|
||||
>>> from sklearn.linear_model import Ridge
|
||||
>>> from sklearn.metrics import PredictionErrorDisplay
|
||||
>>> X, y = load_diabetes(return_X_y=True)
|
||||
>>> ridge = Ridge().fit(X, y)
|
||||
>>> disp = PredictionErrorDisplay.from_estimator(ridge, X, y)
|
||||
>>> plt.show()
|
||||
"""
|
||||
check_matplotlib_support(f"{cls.__name__}.from_estimator")
|
||||
|
||||
y_pred = estimator.predict(X)
|
||||
|
||||
return cls.from_predictions(
|
||||
y_true=y,
|
||||
y_pred=y_pred,
|
||||
kind=kind,
|
||||
subsample=subsample,
|
||||
random_state=random_state,
|
||||
ax=ax,
|
||||
scatter_kwargs=scatter_kwargs,
|
||||
line_kwargs=line_kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_predictions(
|
||||
cls,
|
||||
y_true,
|
||||
y_pred,
|
||||
*,
|
||||
kind="residual_vs_predicted",
|
||||
subsample=1_000,
|
||||
random_state=None,
|
||||
ax=None,
|
||||
scatter_kwargs=None,
|
||||
line_kwargs=None,
|
||||
):
|
||||
"""Plot the prediction error given the true and predicted targets.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools,
|
||||
read more in the :ref:`Visualization Guide <visualizations>`.
|
||||
For details regarding interpreting these plots, refer to the
|
||||
:ref:`Model Evaluation Guide <visualization_regression_evaluation>`.
|
||||
|
||||
.. versionadded:: 1.2
|
||||
|
||||
Parameters
|
||||
----------
|
||||
y_true : array-like of shape (n_samples,)
|
||||
True target values.
|
||||
|
||||
y_pred : array-like of shape (n_samples,)
|
||||
Predicted target values.
|
||||
|
||||
kind : {"actual_vs_predicted", "residual_vs_predicted"}, \
|
||||
default="residual_vs_predicted"
|
||||
The type of plot to draw:
|
||||
|
||||
- "actual_vs_predicted" draws the observed values (y-axis) vs.
|
||||
the predicted values (x-axis).
|
||||
- "residual_vs_predicted" draws the residuals, i.e. difference
|
||||
between observed and predicted values, (y-axis) vs. the predicted
|
||||
values (x-axis).
|
||||
|
||||
subsample : float, int or None, default=1_000
|
||||
Sampling the samples to be shown on the scatter plot. If `float`,
|
||||
it should be between 0 and 1 and represents the proportion of the
|
||||
original dataset. If `int`, it represents the number of samples
|
||||
display on the scatter plot. If `None`, no subsampling will be
|
||||
applied. by default, 1000 samples or less will be displayed.
|
||||
|
||||
random_state : int or RandomState, default=None
|
||||
Controls the randomness when `subsample` is not `None`.
|
||||
See :term:`Glossary <random_state>` for details.
|
||||
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
scatter_kwargs : dict, default=None
|
||||
Dictionary with keywords passed to the `matplotlib.pyplot.scatter`
|
||||
call.
|
||||
|
||||
line_kwargs : dict, default=None
|
||||
Dictionary with keyword passed to the `matplotlib.pyplot.plot`
|
||||
call to draw the optimal line.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.PredictionErrorDisplay`
|
||||
Object that stores the computed values.
|
||||
|
||||
See Also
|
||||
--------
|
||||
PredictionErrorDisplay : Prediction error visualization for regression.
|
||||
PredictionErrorDisplay.from_estimator : Prediction error visualization
|
||||
given an estimator and some data.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import load_diabetes
|
||||
>>> from sklearn.linear_model import Ridge
|
||||
>>> from sklearn.metrics import PredictionErrorDisplay
|
||||
>>> X, y = load_diabetes(return_X_y=True)
|
||||
>>> ridge = Ridge().fit(X, y)
|
||||
>>> y_pred = ridge.predict(X)
|
||||
>>> disp = PredictionErrorDisplay.from_predictions(y_true=y, y_pred=y_pred)
|
||||
>>> plt.show()
|
||||
"""
|
||||
check_matplotlib_support(f"{cls.__name__}.from_predictions")
|
||||
|
||||
random_state = check_random_state(random_state)
|
||||
|
||||
n_samples = len(y_true)
|
||||
if isinstance(subsample, numbers.Integral):
|
||||
if subsample <= 0:
|
||||
raise ValueError(
|
||||
f"When an integer, subsample={subsample} should be positive."
|
||||
)
|
||||
elif isinstance(subsample, numbers.Real):
|
||||
if subsample <= 0 or subsample >= 1:
|
||||
raise ValueError(
|
||||
f"When a floating-point, subsample={subsample} should"
|
||||
" be in the (0, 1) range."
|
||||
)
|
||||
subsample = int(n_samples * subsample)
|
||||
|
||||
if subsample is not None and subsample < n_samples:
|
||||
indices = random_state.choice(np.arange(n_samples), size=subsample)
|
||||
y_true = _safe_indexing(y_true, indices, axis=0)
|
||||
y_pred = _safe_indexing(y_pred, indices, axis=0)
|
||||
|
||||
viz = cls(
|
||||
y_true=y_true,
|
||||
y_pred=y_pred,
|
||||
)
|
||||
|
||||
return viz.plot(
|
||||
ax=ax,
|
||||
kind=kind,
|
||||
scatter_kwargs=scatter_kwargs,
|
||||
line_kwargs=line_kwargs,
|
||||
)
|
||||
@@ -0,0 +1,787 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.metrics._ranking import auc, roc_curve
|
||||
from sklearn.utils import _safe_indexing
|
||||
from sklearn.utils._plotting import (
|
||||
_BinaryClassifierCurveDisplayMixin,
|
||||
_check_param_lengths,
|
||||
_convert_to_list_leaving_none,
|
||||
_deprecate_estimator_name,
|
||||
_deprecate_y_pred_parameter,
|
||||
_despine,
|
||||
_validate_style_kwargs,
|
||||
)
|
||||
from sklearn.utils._response import _get_response_values_binary
|
||||
|
||||
|
||||
class RocCurveDisplay(_BinaryClassifierCurveDisplayMixin):
    """ROC Curve visualization.

    It is recommended to use
    :func:`~sklearn.metrics.RocCurveDisplay.from_estimator`,
    :func:`~sklearn.metrics.RocCurveDisplay.from_predictions` or
    :func:`~sklearn.metrics.RocCurveDisplay.from_cv_results` to create a
    :class:`~sklearn.metrics.RocCurveDisplay`. All parameters are stored as
    attributes.

    For general information regarding `scikit-learn` visualization tools, see
    the :ref:`Visualization Guide <visualizations>`.
    For guidance on interpreting these plots, refer to the :ref:`Model
    Evaluation Guide <roc_metrics>`.

    Parameters
    ----------
    fpr : ndarray or list of ndarrays
        False positive rates, one ndarray per curve. When plotting multiple
        curves, the list must be of the same length as `tpr`.

        .. versionchanged:: 1.7
            Now accepts a list for plotting multiple curves.

    tpr : ndarray or list of ndarrays
        True positive rates, one ndarray per curve. When plotting multiple
        curves, the list must be of the same length as `fpr`.

        .. versionchanged:: 1.7
            Now accepts a list for plotting multiple curves.

    roc_auc : float or list of floats, default=None
        Area under the ROC curve, used for labeling each curve in the legend.
        For multiple curves, a list of the same length as `fpr` and `tpr`.
        If `None`, ROC AUC scores are not shown in the legend.

        .. versionchanged:: 1.7
            Now accepts a list for plotting multiple curves.

    name : str or list of str, default=None
        Name for labeling legend entries. The number of legend entries is
        determined by the `curve_kwargs` passed to `plot`, and is not affected
        by `name`. A list of strings labels each curve individually; a list
        cannot be used together with `curve_kwargs` being a dictionary or
        None. A single string labels either the single legend entry, or each
        individual curve with the same name. If `None`, no name is shown in
        the legend.

        .. versionchanged:: 1.7
            `estimator_name` was deprecated in favor of `name`.

    pos_label : int, float, bool or str, default=None
        The class considered the positive class when ROC AUC metrics were
        computed. If not `None`, this value is displayed in the x- and y-axes
        labels.

        .. versionadded:: 0.24

    estimator_name : str, default=None
        Name of estimator. If None, the estimator name is not shown.

        .. deprecated:: 1.7
            `estimator_name` is deprecated and will be removed in 1.9. Use
            `name` instead.

    Attributes
    ----------
    line_ : matplotlib Artist or list of matplotlib Artists
        ROC Curves.

        .. versionchanged:: 1.7
            This attribute can now be a list of Artists, for when multiple
            curves are plotted.

    chance_level_ : matplotlib Artist or None
        The chance level line. It is `None` if the chance level is not
        plotted.

        .. versionadded:: 1.3

    ax_ : matplotlib Axes
        Axes with ROC Curve.

    figure_ : matplotlib Figure
        Figure containing the curve.

    See Also
    --------
    roc_curve : Compute Receiver operating characteristic (ROC) curve.
    RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic
        (ROC) curve given an estimator and some data.
    RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic
        (ROC) curve given the true and predicted values.
    RocCurveDisplay.from_cv_results : Plot multi-fold ROC curves given
        cross-validation results.
    roc_auc_score : Compute the area under the ROC curve.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> from sklearn import metrics
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
    >>> fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)
    >>> roc_auc = metrics.auc(fpr, tpr)
    >>> display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
    ...                                   name='example estimator')
    >>> display.plot()
    <...>
    >>> plt.show()
    """

    def __init__(
        self,
        *,
        fpr,
        tpr,
        roc_auc=None,
        name=None,
        pos_label=None,
        estimator_name="deprecated",
    ):
        # Display objects follow the scikit-learn convention of storing
        # constructor arguments verbatim; validation is deferred to `plot`
        # (via `_validate_plot_params`).
        self.fpr = fpr
        self.tpr = tpr
        self.roc_auc = roc_auc
        # Resolve the deprecated `estimator_name` alias into `name`.
        self.name = _deprecate_estimator_name(estimator_name, name, "1.7")
        self.pos_label = pos_label
|
||||
|
||||
def _validate_plot_params(self, *, ax, name):
|
||||
self.ax_, self.figure_, name = super()._validate_plot_params(ax=ax, name=name)
|
||||
|
||||
fpr = _convert_to_list_leaving_none(self.fpr)
|
||||
tpr = _convert_to_list_leaving_none(self.tpr)
|
||||
roc_auc = _convert_to_list_leaving_none(self.roc_auc)
|
||||
name = _convert_to_list_leaving_none(name)
|
||||
|
||||
optional = {"self.roc_auc": roc_auc}
|
||||
if isinstance(name, list) and len(name) != 1:
|
||||
optional.update({"'name' (or self.name)": name})
|
||||
_check_param_lengths(
|
||||
required={"self.fpr": fpr, "self.tpr": tpr},
|
||||
optional=optional,
|
||||
class_name="RocCurveDisplay",
|
||||
)
|
||||
return fpr, tpr, roc_auc, name
|
||||
|
||||
def plot(
|
||||
self,
|
||||
ax=None,
|
||||
*,
|
||||
name=None,
|
||||
curve_kwargs=None,
|
||||
plot_chance_level=False,
|
||||
chance_level_kw=None,
|
||||
despine=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""Plot visualization.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
name : str or list of str, default=None
|
||||
Name for labeling legend entries. The number of legend entries
|
||||
is determined by `curve_kwargs`, and is not affected by `name`.
|
||||
To label each curve, provide a list of strings. To avoid labeling
|
||||
individual curves that have the same appearance, a list cannot be used in
|
||||
conjunction with `curve_kwargs` being a dictionary or None. If a
|
||||
string is provided, it will be used to either label the single legend entry
|
||||
or if there are multiple legend entries, label each individual curve with
|
||||
the same name. If `None`, set to `name` provided at `RocCurveDisplay`
|
||||
initialization. If still `None`, no name is shown in the legend.
|
||||
|
||||
.. versionadded:: 1.7
|
||||
|
||||
curve_kwargs : dict or list of dict, default=None
|
||||
Keywords arguments to be passed to matplotlib's `plot` function
|
||||
to draw individual ROC curves. For single curve plotting, should be
|
||||
a dictionary. For multi-curve plotting, if a list is provided the
|
||||
parameters are applied to the ROC curves of each CV fold
|
||||
sequentially and a legend entry is added for each curve.
|
||||
If a single dictionary is provided, the same parameters are applied
|
||||
to all ROC curves and a single legend entry for all curves is added,
|
||||
labeled with the mean ROC AUC score.
|
||||
|
||||
.. versionadded:: 1.7
|
||||
|
||||
plot_chance_level : bool, default=False
|
||||
Whether to plot the chance level.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
chance_level_kw : dict, default=None
|
||||
Keyword arguments to be passed to matplotlib's `plot` for rendering
|
||||
the chance level line.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
despine : bool, default=False
|
||||
Whether to remove the top and right spines from the plot.
|
||||
|
||||
.. versionadded:: 1.6
|
||||
|
||||
**kwargs : dict
|
||||
Keyword arguments to be passed to matplotlib's `plot`.
|
||||
|
||||
.. deprecated:: 1.7
|
||||
kwargs is deprecated and will be removed in 1.9. Pass matplotlib
|
||||
arguments to `curve_kwargs` as a dictionary instead.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.RocCurveDisplay`
|
||||
Object that stores computed values.
|
||||
"""
|
||||
fpr, tpr, roc_auc, name = self._validate_plot_params(ax=ax, name=name)
|
||||
n_curves = len(fpr)
|
||||
if not isinstance(curve_kwargs, list) and n_curves > 1:
|
||||
if roc_auc:
|
||||
legend_metric = {"mean": np.mean(roc_auc), "std": np.std(roc_auc)}
|
||||
else:
|
||||
legend_metric = {"mean": None, "std": None}
|
||||
else:
|
||||
roc_auc = roc_auc if roc_auc is not None else [None] * n_curves
|
||||
legend_metric = {"metric": roc_auc}
|
||||
|
||||
curve_kwargs = self._validate_curve_kwargs(
|
||||
n_curves,
|
||||
name,
|
||||
legend_metric,
|
||||
"AUC",
|
||||
curve_kwargs=curve_kwargs,
|
||||
default_multi_curve_kwargs={
|
||||
"alpha": 0.5,
|
||||
"linestyle": "--",
|
||||
"color": "blue",
|
||||
},
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
default_chance_level_line_kw = {
|
||||
"label": "Chance level (AUC = 0.5)",
|
||||
"color": "k",
|
||||
"linestyle": "--",
|
||||
}
|
||||
|
||||
if chance_level_kw is None:
|
||||
chance_level_kw = {}
|
||||
|
||||
chance_level_kw = _validate_style_kwargs(
|
||||
default_chance_level_line_kw, chance_level_kw
|
||||
)
|
||||
|
||||
self.line_ = []
|
||||
for fpr, tpr, line_kw in zip(fpr, tpr, curve_kwargs):
|
||||
self.line_.extend(self.ax_.plot(fpr, tpr, **line_kw))
|
||||
# Return single artist if only one curve is plotted
|
||||
if len(self.line_) == 1:
|
||||
self.line_ = self.line_[0]
|
||||
|
||||
info_pos_label = (
|
||||
f" (Positive label: {self.pos_label})" if self.pos_label is not None else ""
|
||||
)
|
||||
|
||||
xlabel = "False Positive Rate" + info_pos_label
|
||||
ylabel = "True Positive Rate" + info_pos_label
|
||||
self.ax_.set(
|
||||
xlabel=xlabel,
|
||||
xlim=(-0.01, 1.01),
|
||||
ylabel=ylabel,
|
||||
ylim=(-0.01, 1.01),
|
||||
aspect="equal",
|
||||
)
|
||||
|
||||
if plot_chance_level:
|
||||
(self.chance_level_,) = self.ax_.plot((0, 1), (0, 1), **chance_level_kw)
|
||||
else:
|
||||
self.chance_level_ = None
|
||||
|
||||
if despine:
|
||||
_despine(self.ax_)
|
||||
|
||||
if curve_kwargs[0].get("label") is not None or (
|
||||
plot_chance_level and chance_level_kw.get("label") is not None
|
||||
):
|
||||
self.ax_.legend(loc="lower right")
|
||||
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def from_estimator(
|
||||
cls,
|
||||
estimator,
|
||||
X,
|
||||
y,
|
||||
*,
|
||||
sample_weight=None,
|
||||
drop_intermediate=True,
|
||||
response_method="auto",
|
||||
pos_label=None,
|
||||
name=None,
|
||||
ax=None,
|
||||
curve_kwargs=None,
|
||||
plot_chance_level=False,
|
||||
chance_level_kw=None,
|
||||
despine=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""Create a ROC Curve display from an estimator.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools,
|
||||
see the :ref:`Visualization Guide <visualizations>`.
|
||||
For guidance on interpreting these plots, refer to the :ref:`Model
|
||||
Evaluation Guide <roc_metrics>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
estimator : estimator instance
|
||||
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
|
||||
in which the last estimator is a classifier.
|
||||
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Input values.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
Target values.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Sample weights.
|
||||
|
||||
drop_intermediate : bool, default=True
|
||||
Whether to drop thresholds where the resulting point is collinear
|
||||
with its neighbors in ROC space. This has no effect on the ROC AUC
|
||||
or visual shape of the curve, but reduces the number of plotted
|
||||
points.
|
||||
|
||||
response_method : {'predict_proba', 'decision_function', 'auto'} \
|
||||
default='auto'
|
||||
Specifies whether to use :term:`predict_proba` or
|
||||
:term:`decision_function` as the target response. If set to 'auto',
|
||||
:term:`predict_proba` is tried first and if it does not exist
|
||||
:term:`decision_function` is tried next.
|
||||
|
||||
pos_label : int, float, bool or str, default=None
|
||||
The class considered as the positive class when computing the ROC AUC.
|
||||
By default, `estimators.classes_[1]` is considered
|
||||
as the positive class.
|
||||
|
||||
name : str, default=None
|
||||
Name of ROC Curve for labeling. If `None`, use the name of the
|
||||
estimator.
|
||||
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is created.
|
||||
|
||||
curve_kwargs : dict, default=None
|
||||
Keywords arguments to be passed to matplotlib's `plot` function.
|
||||
|
||||
.. versionadded:: 1.7
|
||||
|
||||
plot_chance_level : bool, default=False
|
||||
Whether to plot the chance level.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
chance_level_kw : dict, default=None
|
||||
Keyword arguments to be passed to matplotlib's `plot` for rendering
|
||||
the chance level line.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
despine : bool, default=False
|
||||
Whether to remove the top and right spines from the plot.
|
||||
|
||||
.. versionadded:: 1.6
|
||||
|
||||
**kwargs : dict
|
||||
Keyword arguments to be passed to matplotlib's `plot`.
|
||||
|
||||
.. deprecated:: 1.7
|
||||
kwargs is deprecated and will be removed in 1.9. Pass matplotlib
|
||||
arguments to `curve_kwargs` as a dictionary instead.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.RocCurveDisplay`
|
||||
The ROC Curve display.
|
||||
|
||||
See Also
|
||||
--------
|
||||
roc_curve : Compute Receiver operating characteristic (ROC) curve.
|
||||
RocCurveDisplay.from_predictions : ROC Curve visualization given the
|
||||
probabilities of scores of a classifier.
|
||||
RocCurveDisplay.from_cv_results : Plot multi-fold ROC curves given
|
||||
cross-validation results.
|
||||
roc_auc_score : Compute the area under the ROC curve.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> from sklearn.metrics import RocCurveDisplay
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> X, y = make_classification(random_state=0)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(
|
||||
... X, y, random_state=0)
|
||||
>>> clf = SVC(random_state=0).fit(X_train, y_train)
|
||||
>>> RocCurveDisplay.from_estimator(
|
||||
... clf, X_test, y_test)
|
||||
<...>
|
||||
>>> plt.show()
|
||||
"""
|
||||
y_score, pos_label, name = cls._validate_and_get_response_values(
|
||||
estimator,
|
||||
X,
|
||||
y,
|
||||
response_method=response_method,
|
||||
pos_label=pos_label,
|
||||
name=name,
|
||||
)
|
||||
|
||||
return cls.from_predictions(
|
||||
y_true=y,
|
||||
y_score=y_score,
|
||||
sample_weight=sample_weight,
|
||||
drop_intermediate=drop_intermediate,
|
||||
pos_label=pos_label,
|
||||
name=name,
|
||||
ax=ax,
|
||||
curve_kwargs=curve_kwargs,
|
||||
plot_chance_level=plot_chance_level,
|
||||
chance_level_kw=chance_level_kw,
|
||||
despine=despine,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_predictions(
|
||||
cls,
|
||||
y_true,
|
||||
y_score=None,
|
||||
*,
|
||||
sample_weight=None,
|
||||
drop_intermediate=True,
|
||||
pos_label=None,
|
||||
name=None,
|
||||
ax=None,
|
||||
curve_kwargs=None,
|
||||
plot_chance_level=False,
|
||||
chance_level_kw=None,
|
||||
despine=False,
|
||||
y_pred="deprecated",
|
||||
**kwargs,
|
||||
):
|
||||
"""Plot ROC curve given the true and predicted values.
|
||||
|
||||
For general information regarding `scikit-learn` visualization tools,
|
||||
see the :ref:`Visualization Guide <visualizations>`.
|
||||
For guidance on interpreting these plots, refer to the :ref:`Model
|
||||
Evaluation Guide <roc_metrics>`.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
y_true : array-like of shape (n_samples,)
|
||||
True labels.
|
||||
|
||||
y_score : array-like of shape (n_samples,)
|
||||
Target scores, can either be probability estimates of the positive
|
||||
class, confidence values, or non-thresholded measure of decisions
|
||||
(as returned by “decision_function” on some classifiers).
|
||||
|
||||
.. versionadded:: 1.7
|
||||
`y_pred` has been renamed to `y_score`.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Sample weights.
|
||||
|
||||
drop_intermediate : bool, default=True
|
||||
Whether to drop thresholds where the resulting point is collinear
|
||||
with its neighbors in ROC space. This has no effect on the ROC AUC
|
||||
or visual shape of the curve, but reduces the number of plotted
|
||||
points.
|
||||
|
||||
pos_label : int, float, bool or str, default=None
|
||||
The label of the positive class when computing the ROC AUC.
|
||||
When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label`
|
||||
is set to 1, otherwise an error will be raised.
|
||||
|
||||
name : str, default=None
|
||||
Name of ROC curve for legend labeling. If `None`, name will be set to
|
||||
`"Classifier"`.
|
||||
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
curve_kwargs : dict, default=None
|
||||
Keywords arguments to be passed to matplotlib's `plot` function.
|
||||
|
||||
.. versionadded:: 1.7
|
||||
|
||||
plot_chance_level : bool, default=False
|
||||
Whether to plot the chance level.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
chance_level_kw : dict, default=None
|
||||
Keyword arguments to be passed to matplotlib's `plot` for rendering
|
||||
the chance level line.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
despine : bool, default=False
|
||||
Whether to remove the top and right spines from the plot.
|
||||
|
||||
.. versionadded:: 1.6
|
||||
|
||||
y_pred : array-like of shape (n_samples,)
|
||||
Target scores, can either be probability estimates of the positive
|
||||
class, confidence values, or non-thresholded measure of decisions
|
||||
(as returned by “decision_function” on some classifiers).
|
||||
|
||||
.. deprecated:: 1.7
|
||||
`y_pred` is deprecated and will be removed in 1.9. Use
|
||||
`y_score` instead.
|
||||
|
||||
**kwargs : dict
|
||||
Additional keywords arguments passed to matplotlib `plot` function.
|
||||
|
||||
.. deprecated:: 1.7
|
||||
kwargs is deprecated and will be removed in 1.9. Pass matplotlib
|
||||
arguments to `curve_kwargs` as a dictionary instead.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.RocCurveDisplay`
|
||||
Object that stores computed values.
|
||||
|
||||
See Also
|
||||
--------
|
||||
roc_curve : Compute Receiver operating characteristic (ROC) curve.
|
||||
RocCurveDisplay.from_estimator : ROC Curve visualization given an
|
||||
estimator and some data.
|
||||
RocCurveDisplay.from_cv_results : Plot multi-fold ROC curves given
|
||||
cross-validation results.
|
||||
roc_auc_score : Compute the area under the ROC curve.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> from sklearn.metrics import RocCurveDisplay
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> X, y = make_classification(random_state=0)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(
|
||||
... X, y, random_state=0)
|
||||
>>> clf = SVC(random_state=0).fit(X_train, y_train)
|
||||
>>> y_score = clf.decision_function(X_test)
|
||||
>>> RocCurveDisplay.from_predictions(y_test, y_score)
|
||||
<...>
|
||||
>>> plt.show()
|
||||
"""
|
||||
y_score = _deprecate_y_pred_parameter(y_score, y_pred, "1.7")
|
||||
pos_label_validated, name = cls._validate_from_predictions_params(
|
||||
y_true, y_score, sample_weight=sample_weight, pos_label=pos_label, name=name
|
||||
)
|
||||
|
||||
fpr, tpr, _ = roc_curve(
|
||||
y_true,
|
||||
y_score,
|
||||
pos_label=pos_label,
|
||||
sample_weight=sample_weight,
|
||||
drop_intermediate=drop_intermediate,
|
||||
)
|
||||
roc_auc = auc(fpr, tpr)
|
||||
|
||||
viz = cls(
|
||||
fpr=fpr,
|
||||
tpr=tpr,
|
||||
roc_auc=roc_auc,
|
||||
name=name,
|
||||
pos_label=pos_label_validated,
|
||||
)
|
||||
|
||||
return viz.plot(
|
||||
ax=ax,
|
||||
curve_kwargs=curve_kwargs,
|
||||
plot_chance_level=plot_chance_level,
|
||||
chance_level_kw=chance_level_kw,
|
||||
despine=despine,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_cv_results(
|
||||
cls,
|
||||
cv_results,
|
||||
X,
|
||||
y,
|
||||
*,
|
||||
sample_weight=None,
|
||||
drop_intermediate=True,
|
||||
response_method="auto",
|
||||
pos_label=None,
|
||||
ax=None,
|
||||
name=None,
|
||||
curve_kwargs=None,
|
||||
plot_chance_level=False,
|
||||
chance_level_kwargs=None,
|
||||
despine=False,
|
||||
):
|
||||
"""Create a multi-fold ROC curve display given cross-validation results.
|
||||
|
||||
.. versionadded:: 1.7
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cv_results : dict
|
||||
Dictionary as returned by :func:`~sklearn.model_selection.cross_validate`
|
||||
using `return_estimator=True` and `return_indices=True` (i.e., dictionary
|
||||
should contain the keys "estimator" and "indices").
|
||||
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Input values.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
Target values.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Sample weights.
|
||||
|
||||
drop_intermediate : bool, default=True
|
||||
Whether to drop some suboptimal thresholds which would not appear
|
||||
on a plotted ROC curve. This is useful in order to create lighter
|
||||
ROC curves.
|
||||
|
||||
response_method : {'predict_proba', 'decision_function', 'auto'} \
|
||||
default='auto'
|
||||
Specifies whether to use :term:`predict_proba` or
|
||||
:term:`decision_function` as the target response. If set to 'auto',
|
||||
:term:`predict_proba` is tried first and if it does not exist
|
||||
:term:`decision_function` is tried next.
|
||||
|
||||
pos_label : int, float, bool or str, default=None
|
||||
The class considered as the positive class when computing the ROC AUC
|
||||
metrics. By default, `estimator.classes_[1]` (using `estimator` from
|
||||
`cv_results`) is considered as the positive class.
|
||||
|
||||
ax : matplotlib axes, default=None
|
||||
Axes object to plot on. If `None`, a new figure and axes is
|
||||
created.
|
||||
|
||||
name : str or list of str, default=None
|
||||
Name for labeling legend entries. The number of legend entries
|
||||
is determined by `curve_kwargs`, and is not affected by `name`.
|
||||
To label each curve, provide a list of strings. To avoid labeling
|
||||
individual curves that have the same appearance, a list cannot be used in
|
||||
conjunction with `curve_kwargs` being a dictionary or None. If a
|
||||
string is provided, it will be used to either label the single legend entry
|
||||
or if there are multiple legend entries, label each individual curve with
|
||||
the same name. If `None`, no name is shown in the legend.
|
||||
|
||||
curve_kwargs : dict or list of dict, default=None
|
||||
Keywords arguments to be passed to matplotlib's `plot` function
|
||||
to draw individual ROC curves. If a list is provided the
|
||||
parameters are applied to the ROC curves of each CV fold
|
||||
sequentially and a legend entry is added for each curve.
|
||||
If a single dictionary is provided, the same parameters are applied
|
||||
to all ROC curves and a single legend entry for all curves is added,
|
||||
labeled with the mean ROC AUC score.
|
||||
|
||||
plot_chance_level : bool, default=False
|
||||
Whether to plot the chance level.
|
||||
|
||||
chance_level_kwargs : dict, default=None
|
||||
Keyword arguments to be passed to matplotlib's `plot` for rendering
|
||||
the chance level line.
|
||||
|
||||
despine : bool, default=False
|
||||
Whether to remove the top and right spines from the plot.
|
||||
|
||||
Returns
|
||||
-------
|
||||
display : :class:`~sklearn.metrics.RocCurveDisplay`
|
||||
The multi-fold ROC curve display.
|
||||
|
||||
See Also
|
||||
--------
|
||||
roc_curve : Compute Receiver operating characteristic (ROC) curve.
|
||||
RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic
|
||||
(ROC) curve given an estimator and some data.
|
||||
RocCurveDisplay.from_predictions : ROC Curve visualization given the
|
||||
probabilities of scores of a classifier.
|
||||
roc_auc_score : Compute the area under the ROC curve.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> from sklearn.metrics import RocCurveDisplay
|
||||
>>> from sklearn.model_selection import cross_validate
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> X, y = make_classification(random_state=0)
|
||||
>>> clf = SVC(random_state=0)
|
||||
>>> cv_results = cross_validate(
|
||||
... clf, X, y, cv=3, return_estimator=True, return_indices=True)
|
||||
>>> RocCurveDisplay.from_cv_results(cv_results, X, y)
|
||||
<...>
|
||||
>>> plt.show()
|
||||
"""
|
||||
cls._validate_from_cv_results_params(
|
||||
cv_results,
|
||||
X,
|
||||
y,
|
||||
sample_weight=sample_weight,
|
||||
)
|
||||
|
||||
fpr_folds, tpr_folds, auc_folds = [], [], []
|
||||
for estimator, test_indices in zip(
|
||||
cv_results["estimator"], cv_results["indices"]["test"]
|
||||
):
|
||||
y_true = _safe_indexing(y, test_indices)
|
||||
y_pred, pos_label_ = _get_response_values_binary(
|
||||
estimator,
|
||||
_safe_indexing(X, test_indices),
|
||||
response_method=response_method,
|
||||
pos_label=pos_label,
|
||||
)
|
||||
sample_weight_fold = (
|
||||
None
|
||||
if sample_weight is None
|
||||
else _safe_indexing(sample_weight, test_indices)
|
||||
)
|
||||
fpr, tpr, _ = roc_curve(
|
||||
y_true,
|
||||
y_pred,
|
||||
pos_label=pos_label_,
|
||||
sample_weight=sample_weight_fold,
|
||||
drop_intermediate=drop_intermediate,
|
||||
)
|
||||
roc_auc = auc(fpr, tpr)
|
||||
|
||||
fpr_folds.append(fpr)
|
||||
tpr_folds.append(tpr)
|
||||
auc_folds.append(roc_auc)
|
||||
|
||||
viz = cls(
|
||||
fpr=fpr_folds,
|
||||
tpr=tpr_folds,
|
||||
roc_auc=auc_folds,
|
||||
name=name,
|
||||
pos_label=pos_label_,
|
||||
)
|
||||
return viz.plot(
|
||||
ax=ax,
|
||||
curve_kwargs=curve_kwargs,
|
||||
plot_chance_level=plot_chance_level,
|
||||
chance_level_kw=chance_level_kwargs,
|
||||
despine=despine,
|
||||
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,321 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin, clone
|
||||
from sklearn.calibration import CalibrationDisplay
|
||||
from sklearn.compose import make_column_transformer
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import (
|
||||
ConfusionMatrixDisplay,
|
||||
DetCurveDisplay,
|
||||
PrecisionRecallDisplay,
|
||||
PredictionErrorDisplay,
|
||||
RocCurveDisplay,
|
||||
)
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def data():
    """Module-scoped iris dataset as an ``(X, y)`` pair."""
    X_y = load_iris(return_X_y=True)
    return X_y
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def data_binary(data):
    """Restrict the iris dataset to its first two classes (binary problem)."""
    X, y = data
    mask = y < 2
    return X[mask], y[mask]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Display",
    [CalibrationDisplay, DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay],
)
def test_display_curve_error_classifier(pyplot, data, data_binary, Display):
    """Check that a proper error is raised when only binary classification is
    supported."""
    X_multi, y_multi = data
    X_bin, y_bin = data_binary

    multiclass_clf = DecisionTreeClassifier().fit(X_multi, y_multi)

    msg = "Expected 'estimator' to be a binary classifier. Got 3 classes instead."
    # Case 1: multiclass classifier with multiclass target
    with pytest.raises(ValueError, match=msg):
        Display.from_estimator(multiclass_clf, X_multi, y_multi)

    # Case 2: multiclass classifier with binary target
    with pytest.raises(ValueError, match=msg):
        Display.from_estimator(multiclass_clf, X_bin, y_bin)

    # Case 3: binary classifier with multiclass target
    binary_clf = DecisionTreeClassifier().fit(X_bin, y_bin)
    msg = "The target y is not binary. Got multiclass type of target."
    with pytest.raises(ValueError, match=msg):
        Display.from_estimator(binary_clf, X_multi, y_multi)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Display",
    [CalibrationDisplay, DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay],
)
def test_display_curve_error_regression(pyplot, data_binary, Display):
    """Check that we raise an error with regressor."""
    X, y = data_binary

    # Case 1: a regressor is not an acceptable estimator
    regressor = DecisionTreeRegressor().fit(X, y)
    msg = "Expected 'estimator' to be a binary classifier. Got DecisionTreeRegressor"
    with pytest.raises(ValueError, match=msg):
        Display.from_estimator(regressor, X, y)

    # Case 2: a continuous target is not acceptable either
    classifier = DecisionTreeClassifier().fit(X, y)
    # Force `y_true` to be seen as a regression problem
    y = y + 0.5
    msg = "The target y is not binary. Got continuous type of target."
    with pytest.raises(ValueError, match=msg):
        Display.from_estimator(classifier, X, y)
    with pytest.raises(ValueError, match=msg):
        Display.from_predictions(y, regressor.fit(X, y).predict(X))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "response_method, msg",
    [
        (
            "predict_proba",
            "MyClassifier has none of the following attributes: predict_proba.",
        ),
        (
            "decision_function",
            "MyClassifier has none of the following attributes: decision_function.",
        ),
        (
            "auto",
            (
                "MyClassifier has none of the following attributes: predict_proba,"
                " decision_function."
            ),
        ),
        (
            "bad_method",
            "MyClassifier has none of the following attributes: bad_method.",
        ),
    ],
)
@pytest.mark.parametrize(
    "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
def test_display_curve_error_no_response(
    pyplot,
    data_binary,
    response_method,
    msg,
    Display,
):
    """Check that a proper error is raised when the response method requested
    is not defined for the given trained classifier."""
    X, y = data_binary

    class MyClassifier(ClassifierMixin, BaseEstimator):
        # Minimal classifier on purpose: exposes no prediction method at all,
        # so any requested response method is missing.
        def fit(self, X, y):
            self.classes_ = [0, 1]
            return self

    estimator = MyClassifier().fit(X, y)

    with pytest.raises(AttributeError, match=msg):
        Display.from_estimator(estimator, X, y, response_method=response_method)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_display_curve_estimator_name_multiple_calls(
    pyplot,
    data_binary,
    Display,
    constructor_name,
):
    """Check that passing `name` when calling `plot` will overwrite the original name
    in the legend."""
    X, y = data_binary
    original_name = "my hand-crafted name"
    clf = LogisticRegression().fit(X, y)
    y_pred = clf.predict_proba(X)[:, 1]

    # safe guard for the binary if/else construction
    assert constructor_name in ("from_estimator", "from_predictions")

    if constructor_name == "from_estimator":
        disp = Display.from_estimator(clf, X, y, name=original_name)
    else:
        disp = Display.from_predictions(y, y_pred, name=original_name)

    # TODO: Clean-up once `estimator_name` deprecated in all displays
    stored_name = (
        disp.name
        if Display in (PrecisionRecallDisplay, RocCurveDisplay)
        else disp.estimator_name
    )
    assert stored_name == original_name

    pyplot.close("all")
    disp.plot()
    assert original_name in disp.line_.get_label()

    pyplot.close("all")
    overriding_name = "another_name"
    disp.plot(name=overriding_name)
    assert overriding_name in disp.line_.get_label()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "clf",
    [
        LogisticRegression(),
        make_pipeline(StandardScaler(), LogisticRegression()),
        make_pipeline(
            make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression()
        ),
    ],
)
@pytest.mark.parametrize(
    "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
def test_display_curve_not_fitted_errors_old_name(pyplot, data_binary, clf, Display):
    """Check that a proper error is raised when the classifier is not
    fitted."""
    X, y = data_binary
    # clone since we parametrize the test and the classifier will be fitted
    # when testing the second and subsequent plotting function
    model = clone(clf)

    with pytest.raises(NotFittedError):
        Display.from_estimator(model, X, y)

    model.fit(X, y)
    disp = Display.from_estimator(model, X, y)

    expected_name = model.__class__.__name__
    assert expected_name in disp.line_.get_label()
    # TODO: Clean-up once `estimator_name` deprecated in all displays
    if Display in (PrecisionRecallDisplay, RocCurveDisplay):
        assert disp.name == expected_name
    else:
        assert disp.estimator_name == expected_name
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "clf",
    [
        LogisticRegression(),
        make_pipeline(StandardScaler(), LogisticRegression()),
        make_pipeline(
            make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression()
        ),
    ],
)
@pytest.mark.parametrize("Display", [RocCurveDisplay])
def test_display_curve_not_fitted_errors(pyplot, data_binary, clf, Display):
    """Check that a proper error is raised when the classifier is not fitted."""
    X, y = data_binary
    # clone since we parametrize the test and the classifier will be fitted
    # when testing the second and subsequent plotting function
    model = clone(clf)

    with pytest.raises(NotFittedError):
        Display.from_estimator(model, X, y)

    model.fit(X, y)
    disp = Display.from_estimator(model, X, y)

    expected_name = model.__class__.__name__
    assert expected_name in disp.line_.get_label()
    assert disp.name == expected_name
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
def test_display_curve_n_samples_consistency(pyplot, data_binary, Display):
    """Check the error raised when `y_pred` or `sample_weight` have inconsistent
    length."""
    X, y = data_binary
    classifier = DecisionTreeClassifier().fit(X, y)

    msg = "Found input variables with inconsistent numbers of samples"
    # Each call truncates (or mis-sizes) exactly one of the inputs.
    with pytest.raises(ValueError, match=msg):
        Display.from_estimator(classifier, X[:-2], y)
    with pytest.raises(ValueError, match=msg):
        Display.from_estimator(classifier, X, y[:-2])
    with pytest.raises(ValueError, match=msg):
        Display.from_estimator(classifier, X, y, sample_weight=np.ones(X.shape[0] - 2))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay]
)
def test_display_curve_error_pos_label(pyplot, data_binary, Display):
    """Check consistence of error message when `pos_label` should be specified."""
    X, y = data_binary
    # Shift the labels away from {0, 1} so no default positive label exists.
    y = y + 10

    classifier = DecisionTreeClassifier().fit(X, y)
    y_pred = classifier.predict_proba(X)[:, -1]

    msg = r"y_true takes value in {10, 11} and pos_label is not specified"
    with pytest.raises(ValueError, match=msg):
        Display.from_predictions(y, y_pred)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Display",
    [
        CalibrationDisplay,
        DetCurveDisplay,
        PrecisionRecallDisplay,
        RocCurveDisplay,
        PredictionErrorDisplay,
        ConfusionMatrixDisplay,
    ],
)
@pytest.mark.parametrize(
    "constructor",
    ["from_predictions", "from_estimator"],
)
def test_classifier_display_curve_named_constructor_return_type(
    pyplot, data_binary, Display, constructor
):
    """Check that named constructors return the correct type when subclassed.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/pull/27675
    """
    X, y = data_binary

    # This can be anything - we just need to check the named constructor return
    # type so the only requirement here is instantiating the class without error
    y_pred = y

    classifier = LogisticRegression().fit(X, y)

    class SubclassOfDisplay(Display):
        pass

    if constructor == "from_predictions":
        curve = SubclassOfDisplay.from_predictions(y, y_pred)
    else:  # constructor == "from_estimator"
        curve = SubclassOfDisplay.from_estimator(classifier, X, y)

    assert isinstance(curve, SubclassOfDisplay)
|
||||
|
||||
|
||||
# TODO(1.10): Remove once deprecated in all Displays
@pytest.mark.parametrize(
    "Display, display_kwargs",
    [
        # TODO(1.10): Remove
        (
            PrecisionRecallDisplay,
            {"precision": np.array([1, 0.5, 0]), "recall": np.array([0, 0.5, 1])},
        ),
        # TODO(1.9): Remove
        (RocCurveDisplay, {"fpr": np.array([0, 0.5, 1]), "tpr": np.array([0, 0.5, 1])}),
    ],
)
def test_display_estimator_name_deprecation(pyplot, Display, display_kwargs):
    """Check deprecation of `estimator_name`."""
    # Passing the old keyword must raise the deprecation warning.
    with pytest.warns(FutureWarning, match="`estimator_name` is deprecated in"):
        Display(**display_kwargs, estimator_name="test")
|
||||
@@ -0,0 +1,374 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import (
|
||||
assert_allclose,
|
||||
assert_array_equal,
|
||||
)
|
||||
|
||||
from sklearn.compose import make_column_transformer
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import SVC, SVR
|
||||
|
||||
|
||||
def test_confusion_matrix_display_validation(pyplot):
    """Check that we raise the proper error when validating parameters."""
    X, y = make_classification(
        n_samples=100, n_informative=5, n_classes=5, random_state=0
    )

    # Unfitted estimators are rejected up front.
    with pytest.raises(NotFittedError):
        ConfusionMatrixDisplay.from_estimator(SVC(), X, y)

    regressor = SVR().fit(X, y)
    y_pred_regressor = regressor.predict(X)
    y_pred_classifier = SVC().fit(X, y).predict(X)

    err_msg = "ConfusionMatrixDisplay.from_estimator only supports classifiers"
    with pytest.raises(ValueError, match=err_msg):
        ConfusionMatrixDisplay.from_estimator(regressor, X, y)

    err_msg = "Mix type of y not allowed, got types"
    with pytest.raises(ValueError, match=err_msg):
        # Force `y_true` to be seen as a regression problem
        ConfusionMatrixDisplay.from_predictions(y + 0.5, y_pred_classifier)
    with pytest.raises(ValueError, match=err_msg):
        ConfusionMatrixDisplay.from_predictions(y, y_pred_regressor)

    err_msg = "Found input variables with inconsistent numbers of samples"
    with pytest.raises(ValueError, match=err_msg):
        ConfusionMatrixDisplay.from_predictions(y, y_pred_classifier[::2])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("with_labels", [True, False])
@pytest.mark.parametrize("with_display_labels", [True, False])
def test_confusion_matrix_display_custom_labels(
    pyplot, constructor_name, with_labels, with_display_labels
):
    """Check the resulting plot when labels are given."""
    n_classes = 5
    X, y = make_classification(
        n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
    )
    classifier = SVC().fit(X, y)
    y_pred = classifier.predict(X)

    # safe guard for the binary if/else construction
    assert constructor_name in ("from_estimator", "from_predictions")

    ax = pyplot.gca()
    labels = [2, 1, 0, 3, 4] if with_labels else None
    display_labels = ["b", "d", "a", "e", "f"] if with_display_labels else None

    expected_cm = confusion_matrix(y, y_pred, labels=labels)
    common_kwargs = {
        "ax": ax,
        "display_labels": display_labels,
        "labels": labels,
    }
    if constructor_name == "from_estimator":
        disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs)
    else:
        disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs)
    assert_allclose(disp.confusion_matrix, expected_cm)

    # Precedence: explicit display labels > reordered labels > class indices.
    if with_display_labels:
        expected_display_labels = display_labels
    elif with_labels:
        expected_display_labels = labels
    else:
        expected_display_labels = list(range(n_classes))

    expected_display_labels_str = [str(name) for name in expected_display_labels]

    x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()]
    y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()]

    assert_array_equal(disp.display_labels, expected_display_labels)
    assert_array_equal(x_ticks, expected_display_labels_str)
    assert_array_equal(y_ticks, expected_display_labels_str)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("normalize", ["true", "pred", "all", None])
@pytest.mark.parametrize("include_values", [True, False])
def test_confusion_matrix_display_plotting(
    pyplot,
    constructor_name,
    normalize,
    include_values,
):
    """Check the overall plotting rendering."""
    n_classes = 5
    X, y = make_classification(
        n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
    )
    classifier = SVC().fit(X, y)
    y_pred = classifier.predict(X)

    # safe guard for the binary if/else construction
    assert constructor_name in ("from_estimator", "from_predictions")

    ax = pyplot.gca()
    cmap = "plasma"

    cm = confusion_matrix(y, y_pred)
    common_kwargs = {
        "normalize": normalize,
        "cmap": cmap,
        "ax": ax,
        "include_values": include_values,
    }
    if constructor_name == "from_estimator":
        disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs)
    else:
        disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs)

    assert disp.ax_ == ax

    # Recompute the expected normalization on the raw counts.
    if normalize == "true":
        cm = cm / cm.sum(axis=1, keepdims=True)
    elif normalize == "pred":
        cm = cm / cm.sum(axis=0, keepdims=True)
    elif normalize == "all":
        cm = cm / cm.sum()

    assert_allclose(disp.confusion_matrix, cm)
    import matplotlib as mpl

    assert isinstance(disp.im_, mpl.image.AxesImage)
    assert disp.im_.get_cmap().name == cmap
    assert isinstance(disp.ax_, pyplot.Axes)
    assert isinstance(disp.figure_, pyplot.Figure)

    assert disp.ax_.get_ylabel() == "True label"
    assert disp.ax_.get_xlabel() == "Predicted label"

    x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()]
    y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()]

    expected_display_labels = list(range(n_classes))
    expected_display_labels_str = [str(name) for name in expected_display_labels]

    assert_array_equal(disp.display_labels, expected_display_labels)
    assert_array_equal(x_ticks, expected_display_labels_str)
    assert_array_equal(y_ticks, expected_display_labels_str)

    image_data = disp.im_.get_array().data
    assert_allclose(image_data, cm)

    if include_values:
        assert disp.text_.shape == (n_classes, n_classes)
        fmt = ".2g"
        expected_text = np.array([format(v, fmt) for v in cm.ravel(order="C")])
        text_text = np.array([t.get_text() for t in disp.text_.ravel(order="C")])
        assert_array_equal(expected_text, text_text)
    else:
        assert disp.text_ is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_confusion_matrix_display(pyplot, constructor_name):
    """Check the behaviour of the default constructor without using the class
    methods."""
    n_classes = 5
    X, y = make_classification(
        n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
    )
    classifier = SVC().fit(X, y)
    y_pred = classifier.predict(X)

    # safe guard for the binary if/else construction
    assert constructor_name in ("from_estimator", "from_predictions")

    cm = confusion_matrix(y, y_pred)
    common_kwargs = {
        "normalize": None,
        "include_values": True,
        "cmap": "viridis",
        "xticks_rotation": 45.0,
    }
    if constructor_name == "from_estimator":
        disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs)
    else:
        disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs)

    assert_allclose(disp.confusion_matrix, cm)
    assert disp.text_.shape == (n_classes, n_classes)

    rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()]
    assert_allclose(rotations, 45.0)

    image_data = disp.im_.get_array().data
    assert_allclose(image_data, cm)

    # Re-plotting with new options must take effect on the same display.
    disp.plot(cmap="plasma")
    assert disp.im_.get_cmap().name == "plasma"

    disp.plot(include_values=False)
    assert disp.text_ is None

    disp.plot(xticks_rotation=90.0)
    rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()]
    assert_allclose(rotations, 90.0)

    disp.plot(values_format="e")
    expected_text = np.array([format(v, "e") for v in cm.ravel(order="C")])
    text_text = np.array([t.get_text() for t in disp.text_.ravel(order="C")])
    assert_array_equal(expected_text, text_text)
|
||||
|
||||
|
||||
def test_confusion_matrix_contrast(pyplot):
    """Check that the text color is appropriate depending on background.

    Fix: the comments after the ``gray_r`` plot were attached to the wrong
    assertion groups (the "diagonal is white" comment sat above the
    off-diagonal checks and vice versa); the assertions themselves are kept
    and regrouped under the comment that actually describes them.
    """
    cm = np.eye(2) / 2
    disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])

    disp.plot(cmap=pyplot.cm.gray)
    # diagonal text is black
    assert_allclose(disp.text_[0, 0].get_color(), [0.0, 0.0, 0.0, 1.0])
    assert_allclose(disp.text_[1, 1].get_color(), [0.0, 0.0, 0.0, 1.0])

    # off-diagonal text is white
    assert_allclose(disp.text_[0, 1].get_color(), [1.0, 1.0, 1.0, 1.0])
    assert_allclose(disp.text_[1, 0].get_color(), [1.0, 1.0, 1.0, 1.0])

    disp.plot(cmap=pyplot.cm.gray_r)
    # diagonal text is white (dark background under the reversed colormap)
    assert_allclose(disp.text_[0, 0].get_color(), [1.0, 1.0, 1.0, 1.0])
    assert_allclose(disp.text_[1, 1].get_color(), [1.0, 1.0, 1.0, 1.0])

    # off-diagonal text is black (light background)
    assert_allclose(disp.text_[0, 1].get_color(), [0.0, 0.0, 0.0, 1.0])
    assert_allclose(disp.text_[1, 0].get_color(), [0.0, 0.0, 0.0, 1.0])

    # Regression test for #15920
    cm = np.array([[19, 34], [32, 58]])
    disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])

    disp.plot(cmap=pyplot.cm.Blues)
    min_color = pyplot.cm.Blues(0)
    max_color = pyplot.cm.Blues(255)
    assert_allclose(disp.text_[0, 0].get_color(), max_color)
    assert_allclose(disp.text_[0, 1].get_color(), max_color)
    assert_allclose(disp.text_[1, 0].get_color(), max_color)
    assert_allclose(disp.text_[1, 1].get_color(), min_color)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "clf",
    [
        LogisticRegression(),
        make_pipeline(StandardScaler(), LogisticRegression()),
        make_pipeline(
            make_column_transformer((StandardScaler(), [0, 1])),
            LogisticRegression(),
        ),
    ],
    ids=["clf", "pipeline-clf", "pipeline-column_transformer-clf"],
)
def test_confusion_matrix_pipeline(pyplot, clf):
    """Check the behaviour of the plotting with more complex pipeline."""
    n_classes = 5
    X, y = make_classification(
        n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
    )

    # Unfitted pipelines/estimators must be rejected first.
    with pytest.raises(NotFittedError):
        ConfusionMatrixDisplay.from_estimator(clf, X, y)

    clf.fit(X, y)
    y_pred = clf.predict(X)

    disp = ConfusionMatrixDisplay.from_estimator(clf, X, y)
    expected_cm = confusion_matrix(y, y_pred)

    assert_allclose(disp.confusion_matrix, expected_cm)
    assert disp.text_.shape == (n_classes, n_classes)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_confusion_matrix_with_unknown_labels(pyplot, constructor_name):
    """Check that when labels=None, the unique values in `y_pred` and `y_true`
    will be used.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/pull/18405
    """
    n_classes = 5
    X, y = make_classification(
        n_samples=100, n_informative=5, n_classes=n_classes, random_state=0
    )
    classifier = SVC().fit(X, y)
    y_pred = classifier.predict(X)
    # create unseen labels in `y_true` not seen during fitting and not present
    # in 'classifier.classes_'
    y = y + 1

    # safe guard for the binary if/else construction
    assert constructor_name in ("from_estimator", "from_predictions")

    common_kwargs = {"labels": None}
    if constructor_name == "from_estimator":
        disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs)
    else:
        disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs)

    shown_labels = [tick.get_text() for tick in disp.ax_.get_xticklabels()]
    # Union of predicted classes {0..4} and shifted true classes {1..5}.
    expected_labels = [str(i) for i in range(n_classes + 1)]
    assert_array_equal(expected_labels, shown_labels)
|
||||
|
||||
|
||||
def test_colormap_max(pyplot):
    """Check that the max color is used for the color of the text."""
    gray = pyplot.get_cmap("gray", 1024)
    confusion_matrix = np.array([[1.0, 0.0], [0.0, 1.0]])

    disp = ConfusionMatrixDisplay(confusion_matrix)
    disp.plot(cmap=gray)

    # The zero-valued cell sits on the darkest background, so its text must
    # use the colormap's maximum (white).
    assert_allclose(disp.text_[1, 0].get_color(), [1.0, 1.0, 1.0, 1.0])
|
||||
|
||||
|
||||
def test_im_kw_adjust_vmin_vmax(pyplot):
    """Check that im_kw passes kwargs to imshow"""
    confusion_matrix = np.array([[0.48, 0.04], [0.08, 0.4]])
    disp = ConfusionMatrixDisplay(confusion_matrix)
    disp.plot(im_kw=dict(vmin=0.0, vmax=0.8))

    lower, upper = disp.im_.get_clim()
    assert lower == pytest.approx(0.0)
    assert upper == pytest.approx(0.8)
|
||||
|
||||
|
||||
def test_confusion_matrix_text_kw(pyplot):
    """Check that text_kw is passed to the text call."""
    font_size = 15.0
    X, y = make_classification(random_state=0)
    classifier = SVC().fit(X, y)

    def assert_font_size(display, expected_size):
        # Every cell annotation must carry the requested font size.
        for text in display.text_.reshape(-1):
            assert text.get_fontsize() == expected_size

    # from_estimator passes the font size
    disp = ConfusionMatrixDisplay.from_estimator(
        classifier, X, y, text_kw={"fontsize": font_size}
    )
    assert_font_size(disp, font_size)

    # plot adjusts plot to new font size
    new_font_size = 20.0
    disp.plot(text_kw={"fontsize": new_font_size})
    assert_font_size(disp, new_font_size)

    # from_predictions passes the font size
    y_pred = classifier.predict(X)
    disp = ConfusionMatrixDisplay.from_predictions(
        y, y_pred, text_kw={"fontsize": font_size}
    )
    assert_font_size(disp, font_size)
|
||||
@@ -0,0 +1,131 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import DetCurveDisplay, det_curve
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize("with_sample_weight", [True, False])
@pytest.mark.parametrize("drop_intermediate", [True, False])
@pytest.mark.parametrize("with_strings", [True, False])
def test_det_curve_display(
    pyplot,
    constructor_name,
    response_method,
    with_sample_weight,
    drop_intermediate,
    with_strings,
):
    """Check that the DET curve display matches `det_curve` outputs and plot
    attributes for all constructor / option combinations."""
    X, y = load_iris(return_X_y=True)
    # Binarize the data with only the two first classes
    X, y = X[y < 2], y[y < 2]

    pos_label = None
    if with_strings:
        y = np.array(["c", "b"])[y]
        pos_label = "c"

    if with_sample_weight:
        rng = np.random.RandomState(42)
        sample_weight = rng.randint(1, 4, size=(X.shape[0]))
    else:
        sample_weight = None

    lr = LogisticRegression().fit(X, y)
    y_score = getattr(lr, response_method)(X)
    if y_score.ndim == 2:
        # keep only the positive-class column
        y_score = y_score[:, 1]

    # safe guard for the binary if/else construction
    assert constructor_name in ("from_estimator", "from_predictions")

    common_kwargs = {
        "name": lr.__class__.__name__,
        "alpha": 0.8,
        "sample_weight": sample_weight,
        "drop_intermediate": drop_intermediate,
        "pos_label": pos_label,
    }
    if constructor_name == "from_estimator":
        disp = DetCurveDisplay.from_estimator(lr, X, y, **common_kwargs)
    else:
        disp = DetCurveDisplay.from_predictions(y, y_score, **common_kwargs)

    # The stored curve must match a direct `det_curve` computation.
    fpr, fnr, _ = det_curve(
        y,
        y_score,
        sample_weight=sample_weight,
        drop_intermediate=drop_intermediate,
        pos_label=pos_label,
    )
    assert_allclose(disp.fpr, fpr, atol=1e-7)
    assert_allclose(disp.fnr, fnr, atol=1e-7)

    assert disp.estimator_name == "LogisticRegression"

    # cannot fail thanks to pyplot fixture
    import matplotlib as mpl

    assert isinstance(disp.line_, mpl.lines.Line2D)
    assert disp.line_.get_alpha() == 0.8
    assert isinstance(disp.ax_, mpl.axes.Axes)
    assert isinstance(disp.figure_, mpl.figure.Figure)
    assert disp.line_.get_label() == "LogisticRegression"

    expected_pos_label = 1 if pos_label is None else pos_label
    expected_ylabel = f"False Negative Rate (Positive label: {expected_pos_label})"
    expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})"
    assert disp.ax_.get_ylabel() == expected_ylabel
    assert disp.ax_.get_xlabel() == expected_xlabel
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "constructor_name, expected_clf_name",
    [
        ("from_estimator", "LogisticRegression"),
        ("from_predictions", "Classifier"),
    ],
)
def test_det_curve_display_default_name(
    pyplot,
    constructor_name,
    expected_clf_name,
):
    """Check the default name shown in the figure when `name` is not provided."""
    X, y = load_iris(return_X_y=True)
    # Binarize the data with only the two first classes
    X, y = X[y < 2], y[y < 2]

    lr = LogisticRegression().fit(X, y)
    y_score = lr.predict_proba(X)[:, 1]

    disp = (
        DetCurveDisplay.from_estimator(lr, X, y)
        if constructor_name == "from_estimator"
        else DetCurveDisplay.from_predictions(y, y_score)
    )

    assert disp.estimator_name == expected_clf_name
    assert disp.line_.get_label() == expected_clf_name
|
||||
|
||||
|
||||
# TODO(1.10): remove
def test_y_score_and_y_pred_specified_error(pyplot):
    """1. Check that an error is raised when both y_score and y_pred are specified.
    2. Check that a warning is raised when y_pred is specified.
    """
    labels = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.35, 0.8])
    other_scores = np.array([0.2, 0.3, 0.5, 0.1])

    # Both keywords at once is ambiguous and must be rejected.
    with pytest.raises(
        ValueError, match="`y_pred` and `y_score` cannot be both specified"
    ):
        DetCurveDisplay.from_predictions(labels, y_score=scores, y_pred=other_scores)

    # The legacy keyword alone still works but warns.
    with pytest.warns(FutureWarning, match="y_pred was deprecated in 1.8"):
        DetCurveDisplay.from_predictions(labels, y_pred=scores)
|
||||
@@ -0,0 +1,400 @@
|
||||
from collections import Counter
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.integrate import trapezoid
|
||||
|
||||
from sklearn.compose import make_column_transformer
|
||||
from sklearn.datasets import load_breast_cancer, make_classification
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import (
|
||||
PrecisionRecallDisplay,
|
||||
average_precision_score,
|
||||
precision_recall_curve,
|
||||
)
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.utils import shuffle
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize("drop_intermediate", [True, False])
def test_precision_recall_display_plotting(
    pyplot, constructor_name, response_method, drop_intermediate
):
    """Check the overall plotting rendering."""
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
    pos_label = 1

    # A single fit is enough; the original code refit the already-fitted
    # classifier a second time for no benefit.
    classifier = LogisticRegression().fit(X, y)

    y_score = getattr(classifier, response_method)(X)
    # `decision_function` returns 1d scores; `predict_proba` returns a 2d
    # array from which we keep the positive-class column.
    y_score = y_score if y_score.ndim == 1 else y_score[:, pos_label]

    # safe guard for the binary if/else construction
    assert constructor_name in ("from_estimator", "from_predictions")

    if constructor_name == "from_estimator":
        display = PrecisionRecallDisplay.from_estimator(
            classifier,
            X,
            y,
            response_method=response_method,
            drop_intermediate=drop_intermediate,
        )
    else:
        display = PrecisionRecallDisplay.from_predictions(
            y, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate
        )

    # The display must expose the same values as the functional API.
    precision, recall, _ = precision_recall_curve(
        y, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate
    )
    average_precision = average_precision_score(y, y_score, pos_label=pos_label)

    np.testing.assert_allclose(display.precision, precision)
    np.testing.assert_allclose(display.recall, recall)
    assert display.average_precision == pytest.approx(average_precision)

    import matplotlib as mpl

    assert isinstance(display.line_, mpl.lines.Line2D)
    assert isinstance(display.ax_, mpl.axes.Axes)
    assert isinstance(display.figure_, mpl.figure.Figure)

    assert display.ax_.get_xlabel() == "Recall (Positive label: 1)"
    assert display.ax_.get_ylabel() == "Precision (Positive label: 1)"
    assert display.ax_.get_adjustable() == "box"
    assert display.ax_.get_aspect() in ("equal", 1.0)
    assert display.ax_.get_xlim() == display.ax_.get_ylim() == (-0.01, 1.01)

    # plotting passing some new parameters
    display.plot(alpha=0.8, name="MySpecialEstimator")
    expected_label = f"MySpecialEstimator (AP = {average_precision:0.2f})"
    assert display.line_.get_label() == expected_label
    assert display.line_.get_alpha() == pytest.approx(0.8)

    # Check that the chance level line is not plotted by default
    assert display.chance_level_ is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chance_level_kw", [None, {"color": "r"}, {"c": "r"}])
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_precision_recall_chance_level_line(
    pyplot,
    chance_level_kw,
    constructor_name,
):
    """Check the chance level line plotting behavior."""
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
    # For a precision-recall curve the chance level is the prevalence of
    # the positive class in `y`.
    pos_prevalence = Counter(y)[1] / len(y)

    lr = LogisticRegression()
    y_score = lr.fit(X, y).predict_proba(X)[:, 1]

    if constructor_name == "from_estimator":
        display = PrecisionRecallDisplay.from_estimator(
            lr,
            X,
            y,
            plot_chance_level=True,
            chance_level_kw=chance_level_kw,
        )
    else:
        display = PrecisionRecallDisplay.from_predictions(
            y,
            y_score,
            plot_chance_level=True,
            chance_level_kw=chance_level_kw,
        )

    import matplotlib as mpl

    # The chance level must be a horizontal line spanning [0, 1] at the
    # positive-class prevalence.
    assert isinstance(display.chance_level_, mpl.lines.Line2D)
    assert tuple(display.chance_level_.get_xdata()) == (0, 1)
    assert tuple(display.chance_level_.get_ydata()) == (pos_prevalence, pos_prevalence)

    # Checking for chance level line styles
    if chance_level_kw is None:
        # Default style is a black line.
        assert display.chance_level_.get_color() == "k"
    else:
        # Both the "color" and "c" aliases must be honored.
        assert display.chance_level_.get_color() == "r"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "constructor_name, default_label",
    [
        ("from_estimator", "LogisticRegression (AP = {:.2f})"),
        ("from_predictions", "Classifier (AP = {:.2f})"),
    ],
)
def test_precision_recall_display_name(pyplot, constructor_name, default_label):
    """Check the behaviour of the name parameters"""
    X, y = make_classification(n_classes=2, n_samples=100, random_state=0)
    pos_label = 1

    # A single fit is enough; the original code refit the already-fitted
    # classifier a second time for no benefit.
    classifier = LogisticRegression().fit(X, y)

    y_score = classifier.predict_proba(X)[:, pos_label]

    # safe guard for the binary if/else construction
    assert constructor_name in ("from_estimator", "from_predictions")

    if constructor_name == "from_estimator":
        display = PrecisionRecallDisplay.from_estimator(classifier, X, y)
    else:
        display = PrecisionRecallDisplay.from_predictions(
            y, y_score, pos_label=pos_label
        )

    average_precision = average_precision_score(y, y_score, pos_label=pos_label)

    # check that the default name is used
    assert display.line_.get_label() == default_label.format(average_precision)

    # check that the name can be set
    display.plot(name="MySpecialEstimator")
    assert (
        display.line_.get_label()
        == f"MySpecialEstimator (AP = {average_precision:.2f})"
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "clf",
    [
        make_pipeline(StandardScaler(), LogisticRegression()),
        make_pipeline(
            make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression()
        ),
    ],
)
def test_precision_recall_display_pipeline(pyplot, clf):
    """Check `from_estimator` with pipelines: an unfitted pipeline raises and
    the default display name is the pipeline class name."""
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
    # An unfitted estimator must raise instead of silently plotting.
    with pytest.raises(NotFittedError):
        PrecisionRecallDisplay.from_estimator(clf, X, y)
    clf.fit(X, y)
    display = PrecisionRecallDisplay.from_estimator(clf, X, y)
    assert display.name == clf.__class__.__name__
|
||||
|
||||
|
||||
def test_precision_recall_display_string_labels(pyplot):
    """Check the display works with string targets and explicit `pos_label`."""
    # regression test #15738
    cancer = load_breast_cancer()
    X, y = cancer.data, cancer.target_names[cancer.target]

    lr = make_pipeline(StandardScaler(), LogisticRegression())
    lr.fit(X, y)
    for klass in cancer.target_names:
        assert klass in lr.classes_
    display = PrecisionRecallDisplay.from_estimator(lr, X, y)

    y_score = lr.predict_proba(X)[:, 1]
    avg_prec = average_precision_score(y, y_score, pos_label=lr.classes_[1])

    assert display.average_precision == pytest.approx(avg_prec)
    assert display.name == lr.__class__.__name__

    # With string labels, `from_predictions` cannot infer the positive class
    # and must raise when `pos_label` is not provided.
    err_msg = r"y_true takes value in {'benign', 'malignant'}"
    with pytest.raises(ValueError, match=err_msg):
        PrecisionRecallDisplay.from_predictions(y, y_score)

    display = PrecisionRecallDisplay.from_predictions(
        y, y_score, pos_label=lr.classes_[1]
    )
    assert display.average_precision == pytest.approx(avg_prec)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "average_precision, name, expected_label",
    [
        (0.9, None, "AP = 0.90"),
        (None, "my_est", "my_est"),
        (0.8, "my_est2", "my_est2 (AP = 0.80)"),
    ],
)
def test_default_labels(pyplot, average_precision, name, expected_label):
    """Check the legend label composed from `name` and `average_precision`."""
    display = PrecisionRecallDisplay(
        np.array([1, 0.5, 0]),
        np.array([0, 0.5, 1]),
        average_precision=average_precision,
        name=name,
    )
    display.plot()
    assert display.line_.get_label() == expected_label
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_method):
    """Check that providing `pos_label` displays the statistics of that class."""
    # check that we can provide the positive label and display the proper
    # statistics
    X, y = load_breast_cancer(return_X_y=True)
    # create a highly imbalanced version of the breast cancer dataset
    idx_positive = np.flatnonzero(y == 1)
    idx_negative = np.flatnonzero(y == 0)
    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
    X, y = X[idx_selected], y[idx_selected]
    X, y = shuffle(X, y, random_state=42)
    # only use 2 features to make the problem even harder
    X = X[:, :2]
    y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        random_state=0,
    )

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # sanity check to be sure the positive class is classes_[0] and that we
    # are betrayed by the class imbalance
    assert classifier.classes_.tolist() == ["cancer", "not cancer"]

    y_score = getattr(classifier, response_method)(X_test)
    # we select the corresponding probability columns or reverse the decision
    # function otherwise
    y_score_cancer = -1 * y_score if y_score.ndim == 1 else y_score[:, 0]
    y_score_not_cancer = y_score if y_score.ndim == 1 else y_score[:, 1]

    if constructor_name == "from_estimator":
        display = PrecisionRecallDisplay.from_estimator(
            classifier,
            X_test,
            y_test,
            pos_label="cancer",
            response_method=response_method,
        )
    else:
        display = PrecisionRecallDisplay.from_predictions(
            y_test,
            y_score_cancer,
            pos_label="cancer",
        )
    # we should obtain the statistics of the "cancer" class
    avg_prec_limit = 0.65
    assert display.average_precision < avg_prec_limit
    # `-trapezoid` approximates the area under the PR curve (recall is
    # decreasing, hence the sign flip).
    assert -trapezoid(display.precision, display.recall) < avg_prec_limit

    # otherwise we should obtain the statistics of the "not cancer" class
    if constructor_name == "from_estimator":
        display = PrecisionRecallDisplay.from_estimator(
            classifier,
            X_test,
            y_test,
            response_method=response_method,
            pos_label="not cancer",
        )
    else:
        display = PrecisionRecallDisplay.from_predictions(
            y_test,
            y_score_not_cancer,
            pos_label="not cancer",
        )
    avg_prec_limit = 0.95
    assert display.average_precision > avg_prec_limit
    assert -trapezoid(display.precision, display.recall) > avg_prec_limit
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_precision_recall_prevalence_pos_label_reusable(pyplot, constructor_name):
    """Check the chance level can be plotted on a later `plot` call."""
    # Check that even if one passes plot_chance_level=False the first time
    # one can still call disp.plot with plot_chance_level=True and get the
    # chance level line
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)

    lr = LogisticRegression()
    y_score = lr.fit(X, y).predict_proba(X)[:, 1]

    if constructor_name == "from_estimator":
        display = PrecisionRecallDisplay.from_estimator(
            lr, X, y, plot_chance_level=False
        )
    else:
        display = PrecisionRecallDisplay.from_predictions(
            y, y_score, plot_chance_level=False
        )
    assert display.chance_level_ is None

    import matplotlib as mpl

    # When calling from_estimator or from_predictions,
    # prevalence_pos_label should have been set, so that directly
    # calling plot_chance_level=True should plot the chance level line
    display.plot(plot_chance_level=True)
    assert isinstance(display.chance_level_, mpl.lines.Line2D)
|
||||
|
||||
|
||||
def test_precision_recall_raise_no_prevalence(pyplot):
    """Check the error raised when plotting the chance level without prevalence."""
    # Check that raises correctly when plotting chance level with
    # no prevalence_pos_label provided
    precision = np.array([1, 0.5, 0])
    recall = np.array([0, 0.5, 1])
    display = PrecisionRecallDisplay(precision, recall)

    msg = (
        "You must provide prevalence_pos_label when constructing the "
        "PrecisionRecallDisplay object in order to plot the chance "
        "level line. Alternatively, you may use "
        "PrecisionRecallDisplay.from_estimator or "
        "PrecisionRecallDisplay.from_predictions "
        "to automatically set prevalence_pos_label"
    )

    with pytest.raises(ValueError, match=msg):
        display.plot(plot_chance_level=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("despine", [True, False])
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_plot_precision_recall_despine(pyplot, despine, constructor_name):
    # Check that the despine keyword is working correctly
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)

    # A single fit is enough; the original code refit the already-fitted
    # classifier a second time for no benefit.
    clf = LogisticRegression().fit(X, y)

    y_score = clf.decision_function(X)

    # safe guard for the binary if/else construction
    assert constructor_name in ("from_estimator", "from_predictions")

    if constructor_name == "from_estimator":
        display = PrecisionRecallDisplay.from_estimator(clf, X, y, despine=despine)
    else:
        display = PrecisionRecallDisplay.from_predictions(y, y_score, despine=despine)

    # Top/right spines are hidden when despined, visible otherwise.
    for s in ["top", "right"]:
        assert display.ax_.spines[s].get_visible() is not despine

    if despine:
        # The remaining spines are truncated to the data range [0, 1].
        for s in ["bottom", "left"]:
            assert display.ax_.spines[s].get_bounds() == (0, 1)
|
||||
|
||||
|
||||
# TODO(1.10): remove
def test_y_score_and_y_pred_specified_error(pyplot):
    """Check the deprecation handling of `y_pred` in `from_predictions`.

    1. Passing both `y_score` and `y_pred` raises a `ValueError`.
    2. Passing only `y_pred` emits a `FutureWarning`.
    """
    labels = np.array([0, 1, 1, 0])
    scores = np.array([0.1, 0.4, 0.35, 0.8])
    other_scores = np.array([0.2, 0.3, 0.5, 0.1])

    expected_error = "`y_pred` and `y_score` cannot be both specified"
    with pytest.raises(ValueError, match=expected_error):
        PrecisionRecallDisplay.from_predictions(
            labels, y_score=scores, y_pred=other_scores
        )

    with pytest.warns(FutureWarning, match="y_pred was deprecated in 1.8"):
        PrecisionRecallDisplay.from_predictions(labels, y_pred=scores)
|
||||
@@ -0,0 +1,169 @@
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from sklearn.datasets import load_diabetes
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.linear_model import Ridge
|
||||
from sklearn.metrics import PredictionErrorDisplay
|
||||
|
||||
X, y = load_diabetes(return_X_y=True)
|
||||
|
||||
|
||||
@pytest.fixture
def regressor_fitted():
    # A Ridge regressor fitted on the module-level diabetes data, shared by
    # the tests below.
    return Ridge().fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "regressor, params, err_type, err_msg",
    [
        (
            Ridge().fit(X, y),
            {"subsample": -1},
            ValueError,
            "When an integer, subsample=-1 should be",
        ),
        (
            Ridge().fit(X, y),
            {"subsample": 20.0},
            ValueError,
            "When a floating-point, subsample=20.0 should be",
        ),
        (
            Ridge().fit(X, y),
            {"subsample": -20.0},
            ValueError,
            "When a floating-point, subsample=-20.0 should be",
        ),
        (
            Ridge().fit(X, y),
            {"kind": "xxx"},
            ValueError,
            "`kind` must be one of",
        ),
    ],
)
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
def test_prediction_error_display_raise_error(
    pyplot, class_method, regressor, params, err_type, err_msg
):
    """Check that we raise the proper error when validating parameters."""
    with pytest.raises(err_type, match=err_msg):
        if class_method == "from_estimator":
            PredictionErrorDisplay.from_estimator(regressor, X, y, **params)
        else:
            y_pred = regressor.predict(X)
            PredictionErrorDisplay.from_predictions(y_true=y, y_pred=y_pred, **params)
|
||||
|
||||
|
||||
def test_from_estimator_not_fitted(pyplot):
    """`from_estimator` must reject a regressor that was never fitted."""
    with pytest.raises(NotFittedError, match="is not fitted yet."):
        PredictionErrorDisplay.from_estimator(Ridge(), X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize("kind", ["actual_vs_predicted", "residual_vs_predicted"])
def test_prediction_error_display(pyplot, regressor_fitted, class_method, kind):
    """Check the default behaviour of the display."""
    if class_method == "from_estimator":
        display = PredictionErrorDisplay.from_estimator(
            regressor_fitted, X, y, kind=kind
        )
    else:
        y_pred = regressor_fitted.predict(X)
        display = PredictionErrorDisplay.from_predictions(
            y_true=y, y_pred=y_pred, kind=kind
        )

    if kind == "actual_vs_predicted":
        # The reference line is the identity (perfect predictions).
        assert_allclose(display.line_.get_xdata(), display.line_.get_ydata())
        assert display.ax_.get_xlabel() == "Predicted values"
        assert display.ax_.get_ylabel() == "Actual values"
        assert display.line_ is not None
    else:
        assert display.ax_.get_xlabel() == "Predicted values"
        assert display.ax_.get_ylabel() == "Residuals (actual - predicted)"
        assert display.line_ is not None

    # No legend is drawn by default.
    assert display.ax_.get_legend() is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize(
    "subsample, expected_size",
    # int -> absolute count, float -> fraction of samples, None -> all samples
    [(5, 5), (0.1, int(X.shape[0] * 0.1)), (None, X.shape[0])],
)
def test_plot_prediction_error_subsample(
    pyplot, regressor_fitted, class_method, subsample, expected_size
):
    """Check the behaviour of `subsample`."""
    if class_method == "from_estimator":
        display = PredictionErrorDisplay.from_estimator(
            regressor_fitted, X, y, subsample=subsample
        )
    else:
        y_pred = regressor_fitted.predict(X)
        display = PredictionErrorDisplay.from_predictions(
            y_true=y, y_pred=y_pred, subsample=subsample
        )
    assert len(display.scatter_.get_offsets()) == expected_size
|
||||
|
||||
|
||||
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
def test_plot_prediction_error_ax(pyplot, regressor_fitted, class_method):
    """A user-supplied matplotlib axis must be reused by the display."""
    _, provided_ax = pyplot.subplots()
    if class_method == "from_estimator":
        display = PredictionErrorDisplay.from_estimator(
            regressor_fitted, X, y, ax=provided_ax
        )
    else:
        display = PredictionErrorDisplay.from_predictions(
            y_true=y, y_pred=regressor_fitted.predict(X), ax=provided_ax
        )
    assert display.ax_ is provided_ax
|
||||
|
||||
|
||||
@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"])
@pytest.mark.parametrize(
    "scatter_kwargs",
    [None, {"color": "blue", "alpha": 0.9}, {"c": "blue", "alpha": 0.9}],
)
@pytest.mark.parametrize(
    "line_kwargs", [None, {"color": "red", "linestyle": "-"}, {"c": "red", "ls": "-"}]
)
def test_prediction_error_custom_artist(
    pyplot, regressor_fitted, class_method, scatter_kwargs, line_kwargs
):
    """Check that we can tune the style of the line and the scatter."""
    extra_params = {
        "kind": "actual_vs_predicted",
        "scatter_kwargs": scatter_kwargs,
        "line_kwargs": line_kwargs,
    }
    if class_method == "from_estimator":
        display = PredictionErrorDisplay.from_estimator(
            regressor_fitted, X, y, **extra_params
        )
    else:
        y_pred = regressor_fitted.predict(X)
        display = PredictionErrorDisplay.from_predictions(
            y_true=y, y_pred=y_pred, **extra_params
        )

    # Both the long ("color"/"linestyle") and short ("c"/"ls") matplotlib
    # aliases must be honored; otherwise the defaults apply.
    if line_kwargs is not None:
        assert display.line_.get_linestyle() == "-"
        assert display.line_.get_color() == "red"
    else:
        assert display.line_.get_linestyle() == "--"
        assert display.line_.get_color() == "black"
        assert display.line_.get_alpha() == 0.7

    if scatter_kwargs is not None:
        assert_allclose(display.scatter_.get_facecolor(), [[0.0, 0.0, 1.0, 0.9]])
        assert_allclose(display.scatter_.get_edgecolor(), [[0.0, 0.0, 1.0, 0.9]])
    else:
        assert display.scatter_.get_alpha() == 0.8
|
||||
@@ -0,0 +1,989 @@
|
||||
from collections.abc import Mapping
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
from scipy.integrate import trapezoid
|
||||
|
||||
from sklearn import clone
|
||||
from sklearn.compose import make_column_transformer
|
||||
from sklearn.datasets import load_breast_cancer, make_classification
|
||||
from sklearn.exceptions import NotFittedError, UndefinedMetricWarning
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import RocCurveDisplay, auc, roc_curve
|
||||
from sklearn.model_selection import cross_validate, train_test_split
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.utils import _safe_indexing, shuffle
|
||||
from sklearn.utils._response import _get_response_values_binary
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def data_binary():
    # A fixed binary classification problem shared by all ROC display tests.
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=5,
        n_redundant=2,
        flip_y=0.1,
        class_sep=0.8,
        random_state=42,
    )
    return X, y
|
||||
|
||||
|
||||
def _check_figure_axes_and_labels(display, pos_label):
    """Check mpl axes and figure defaults are correct.

    `pos_label=None` is expected to fall back to the default positive
    label ``1`` in the axis labels.
    """
    import matplotlib as mpl

    assert isinstance(display.ax_, mpl.axes.Axes)
    assert isinstance(display.figure_, mpl.figure.Figure)
    assert display.ax_.get_adjustable() == "box"
    assert display.ax_.get_aspect() in ("equal", 1.0)
    assert display.ax_.get_xlim() == display.ax_.get_ylim() == (-0.01, 1.01)

    expected_pos_label = 1 if pos_label is None else pos_label
    expected_ylabel = f"True Positive Rate (Positive label: {expected_pos_label})"
    expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})"

    assert display.ax_.get_ylabel() == expected_ylabel
    assert display.ax_.get_xlabel() == expected_xlabel
|
||||
|
||||
|
||||
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize("with_sample_weight", [True, False])
@pytest.mark.parametrize("drop_intermediate", [True, False])
@pytest.mark.parametrize("with_strings", [True, False])
@pytest.mark.parametrize(
    "constructor_name, default_name",
    [
        ("from_estimator", "LogisticRegression"),
        ("from_predictions", "Classifier"),
    ],
)
def test_roc_curve_display_plotting(
    pyplot,
    response_method,
    data_binary,
    with_sample_weight,
    drop_intermediate,
    with_strings,
    constructor_name,
    default_name,
):
    """Check the overall plotting behaviour for single curve."""
    X, y = data_binary

    pos_label = None
    if with_strings:
        # Map 0/1 to string labels; "c" (mapped from 1) is the positive class.
        y = np.array(["c", "b"])[y]
        pos_label = "c"

    if with_sample_weight:
        rng = np.random.RandomState(42)
        sample_weight = rng.randint(1, 4, size=(X.shape[0]))
    else:
        sample_weight = None

    lr = LogisticRegression()
    lr.fit(X, y)

    y_score = getattr(lr, response_method)(X)
    # `decision_function` gives 1d scores; `predict_proba` gives a 2d array
    # from which we keep the positive-class column.
    y_score = y_score if y_score.ndim == 1 else y_score[:, 1]

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(
            lr,
            X,
            y,
            sample_weight=sample_weight,
            drop_intermediate=drop_intermediate,
            pos_label=pos_label,
            curve_kwargs={"alpha": 0.8},
        )
    else:
        display = RocCurveDisplay.from_predictions(
            y,
            y_score,
            sample_weight=sample_weight,
            drop_intermediate=drop_intermediate,
            pos_label=pos_label,
            curve_kwargs={"alpha": 0.8},
        )

    # The display must expose the same values as the functional API.
    fpr, tpr, _ = roc_curve(
        y,
        y_score,
        sample_weight=sample_weight,
        drop_intermediate=drop_intermediate,
        pos_label=pos_label,
    )

    assert_allclose(display.roc_auc, auc(fpr, tpr))
    assert_allclose(display.fpr, fpr)
    assert_allclose(display.tpr, tpr)

    assert display.name == default_name

    import matplotlib as mpl

    _check_figure_axes_and_labels(display, pos_label)
    assert isinstance(display.line_, mpl.lines.Line2D)
    assert display.line_.get_alpha() == 0.8

    expected_label = f"{default_name} (AUC = {display.roc_auc:.2f})"
    assert display.line_.get_label() == expected_label
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, err_msg",
    [
        (
            {
                "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
                "tpr": [np.array([0, 0.5, 1])],
                "roc_auc": None,
                "name": None,
            },
            "self.fpr and self.tpr from `RocCurveDisplay` initialization,",
        ),
        (
            {
                "fpr": [np.array([0, 0.5, 1])],
                "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
                "roc_auc": [0.8, 0.9],
                "name": None,
            },
            "self.fpr, self.tpr and self.roc_auc from `RocCurveDisplay`",
        ),
        (
            {
                "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
                "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
                "roc_auc": [0.8],
                "name": None,
            },
            "Got: self.fpr: 2, self.tpr: 2, self.roc_auc: 1",
        ),
        (
            {
                "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
                "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
                "roc_auc": [0.8, 0.9],
                "name": ["curve1", "curve2", "curve3"],
            },
            r"self.fpr, self.tpr, self.roc_auc and 'name' \(or self.name\)",
        ),
        (
            {
                "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
                "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])],
                "roc_auc": [0.8, 0.9],
                # List of length 1 is always allowed
                "name": ["curve1"],
            },
            None,
        ),
    ],
)
def test_roc_curve_plot_parameter_length_validation(pyplot, params, err_msg):
    """Check `plot` parameter length validation performed correctly."""
    display = RocCurveDisplay(**params)
    if err_msg:
        # Mismatched list lengths between fpr/tpr/roc_auc/name must raise.
        with pytest.raises(ValueError, match=err_msg):
            display.plot()
    else:
        # No error should be raised
        display.plot()
|
||||
|
||||
|
||||
def test_validate_plot_params(pyplot):
    """Check `_validate_plot_params` returns the correct variables.

    Scalar/1-element inputs are expected to be normalized to lists of
    length one, and a scalar `name` to a one-element list.
    """
    fpr = np.array([0, 0.5, 1])
    tpr = [np.array([0, 0.5, 1])]
    roc_auc = None
    name = "test_curve"

    # Initialize display with test inputs
    display = RocCurveDisplay(
        fpr=fpr,
        tpr=tpr,
        roc_auc=roc_auc,
        name=name,
        pos_label=None,
    )
    fpr_out, tpr_out, roc_auc_out, name_out = display._validate_plot_params(
        ax=None, name=None
    )

    assert isinstance(fpr_out, list)
    assert isinstance(tpr_out, list)
    assert len(fpr_out) == 1
    assert len(tpr_out) == 1
    assert roc_auc_out is None
    assert name_out == ["test_curve"]
|
||||
|
||||
|
||||
def test_roc_curve_from_cv_results_param_validation(pyplot, data_binary):
    """Check parameter validation is correct."""
    X, y = data_binary

    # `cv_results` missing key: build one result without the fitted
    # estimators and one without the train/test indices. (The original code
    # built both with `return_estimator=True, return_indices=False`, so the
    # "missing estimator" case was never actually exercised.)
    cv_results_no_est = cross_validate(
        LogisticRegression(), X, y, cv=3, return_estimator=False, return_indices=True
    )
    cv_results_no_indices = cross_validate(
        LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=False
    )
    for cv_results in (cv_results_no_est, cv_results_no_indices):
        with pytest.raises(
            ValueError,
            match="`cv_results` does not contain one of the following required",
        ):
            RocCurveDisplay.from_cv_results(cv_results, X, y)

    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
    )

    # `X` wrong length
    with pytest.raises(ValueError, match="`X` does not contain the correct"):
        RocCurveDisplay.from_cv_results(cv_results, X[:10, :], y)

    # `y` not binary
    y_multi = y.copy()
    y_multi[0] = 2
    with pytest.raises(ValueError, match="The target `y` is not binary."):
        RocCurveDisplay.from_cv_results(cv_results, X, y_multi)

    # input inconsistent length
    with pytest.raises(ValueError, match="Found input variables with inconsistent"):
        RocCurveDisplay.from_cv_results(cv_results, X, y[:10])
    with pytest.raises(ValueError, match="Found input variables with inconsistent"):
        RocCurveDisplay.from_cv_results(cv_results, X, y, sample_weight=[1, 2])

    # `pos_label` inconsistency: relabel the positive class so it no longer
    # matches the class seen by the fitted estimators.
    y_multi[y_multi == 1] = 2
    with pytest.warns(UndefinedMetricWarning, match="No positive samples in y_true"):
        RocCurveDisplay.from_cv_results(cv_results, X, y_multi)

    # `name` is list while `curve_kwargs` is None or dict
    for curve_kwargs in (None, {"alpha": 0.2}):
        with pytest.raises(ValueError, match="To avoid labeling individual curves"):
            RocCurveDisplay.from_cv_results(
                cv_results,
                X,
                y,
                name=["one", "two", "three"],
                curve_kwargs=curve_kwargs,
            )

    # `curve_kwargs` incorrect length
    with pytest.raises(ValueError, match="`curve_kwargs` must be None, a dictionary"):
        RocCurveDisplay.from_cv_results(cv_results, X, y, curve_kwargs=[{"alpha": 1}])

    # `curve_kwargs` both alias provided
    with pytest.raises(TypeError, match="Got both c and"):
        RocCurveDisplay.from_cv_results(
            cv_results, X, y, curve_kwargs={"c": "blue", "color": "red"}
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "curve_kwargs",
    [None, {"alpha": 0.2}, [{"alpha": 0.2}, {"alpha": 0.3}, {"alpha": 0.4}]],
)
def test_roc_curve_display_from_cv_results_curve_kwargs(
    pyplot, data_binary, curve_kwargs
):
    """Check `curve_kwargs` correctly passed."""
    X, y = data_binary
    n_cv = 3
    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True
    )
    display = RocCurveDisplay.from_cv_results(
        cv_results,
        X,
        y,
        curve_kwargs=curve_kwargs,
    )
    if curve_kwargs is None:
        # Default `alpha` used
        assert all(line.get_alpha() == 0.5 for line in display.line_)
    elif isinstance(curve_kwargs, Mapping):
        # `alpha` from dict used for all curves
        assert all(line.get_alpha() == 0.2 for line in display.line_)
    else:
        # Different `alpha` used for each curve
        assert all(
            line.get_alpha() == curve_kwargs[i]["alpha"]
            for i, line in enumerate(display.line_)
        )
    # Other default kwargs should be the same
    for line in display.line_:
        assert line.get_linestyle() == "--"
        assert line.get_color() == "blue"
|
||||
|
||||
|
||||
# TODO(1.9): Remove in 1.9
@pytest.mark.parametrize(
    "constructor_name", ["from_estimator", "from_predictions", "plot"]
)
def test_roc_curve_display_kwargs_deprecation(pyplot, data_binary, constructor_name):
    """Check **kwargs deprecated correctly in favour of `curve_kwargs`."""
    X, y = data_binary
    lr = LogisticRegression()
    lr.fit(X, y)
    fpr = np.array([0, 0.5, 1])
    tpr = np.array([0, 0.5, 1])

    def _call_constructor(**kwargs):
        # Dispatch to the constructor under test, forwarding `kwargs`.
        if constructor_name == "from_estimator":
            RocCurveDisplay.from_estimator(lr, X, y, **kwargs)
        elif constructor_name == "from_predictions":
            RocCurveDisplay.from_predictions(y, y, **kwargs)
        else:
            RocCurveDisplay(fpr=fpr, tpr=tpr).plot(**kwargs)

    # Error when both `curve_kwargs` and `**kwargs` provided
    with pytest.raises(ValueError, match="Cannot provide both `curve_kwargs`"):
        _call_constructor(curve_kwargs={"alpha": 1}, label="test")

    # Warning when only `**kwargs` provided
    with pytest.warns(FutureWarning, match=r"`\*\*kwargs` is deprecated and will be"):
        _call_constructor(label="test")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "curve_kwargs",
    [
        None,
        {"color": "blue"},
        [{"color": "blue"}, {"color": "green"}, {"color": "red"}],
    ],
)
@pytest.mark.parametrize("drop_intermediate", [True, False])
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize("with_sample_weight", [True, False])
@pytest.mark.parametrize("with_strings", [True, False])
def test_roc_curve_display_plotting_from_cv_results(
    pyplot,
    data_binary,
    with_strings,
    with_sample_weight,
    response_method,
    drop_intermediate,
    curve_kwargs,
):
    """Check overall plotting of `from_cv_results`."""
    X, y = data_binary

    pos_label = None
    if with_strings:
        y = np.array(["c", "b"])[y]
        pos_label = "c"

    if with_sample_weight:
        rng = np.random.RandomState(42)
        sample_weight = rng.randint(1, 4, size=(X.shape[0]))
    else:
        sample_weight = None

    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
    )
    display = RocCurveDisplay.from_cv_results(
        cv_results,
        X,
        y,
        sample_weight=sample_weight,
        drop_intermediate=drop_intermediate,
        response_method=response_method,
        pos_label=pos_label,
        curve_kwargs=curve_kwargs,
    )

    # Re-compute the per-fold ROC curves independently and check the display
    # stored the same fpr/tpr/AUC values.
    for idx, (estimator, test_indices) in enumerate(
        zip(cv_results["estimator"], cv_results["indices"]["test"])
    ):
        y_true = _safe_indexing(y, test_indices)
        y_pred = _get_response_values_binary(
            estimator,
            _safe_indexing(X, test_indices),
            response_method=response_method,
            pos_label=pos_label,
        )[0]
        sample_weight_fold = (
            None
            if sample_weight is None
            else _safe_indexing(sample_weight, test_indices)
        )
        fpr, tpr, _ = roc_curve(
            y_true,
            y_pred,
            sample_weight=sample_weight_fold,
            drop_intermediate=drop_intermediate,
            pos_label=pos_label,
        )
        assert_allclose(display.roc_auc[idx], auc(fpr, tpr))
        assert_allclose(display.fpr[idx], fpr)
        assert_allclose(display.tpr[idx], tpr)

    assert display.name is None

    import matplotlib as mpl

    _check_figure_axes_and_labels(display, pos_label)
    if with_sample_weight:
        aggregate_expected_labels = ["AUC = 0.64 +/- 0.04", "_child1", "_child2"]
    else:
        aggregate_expected_labels = ["AUC = 0.61 +/- 0.05", "_child1", "_child2"]
    for idx, line in enumerate(display.line_):
        assert isinstance(line, mpl.lines.Line2D)
        # Default alpha for `from_cv_results`.
        # Bug fix: this was previously a bare `line.get_alpha() == 0.5`
        # expression, so the check was silently never enforced.
        assert line.get_alpha() == 0.5
        if isinstance(curve_kwargs, list):
            # Each individual curve labelled
            assert line.get_label() == f"AUC = {display.roc_auc[idx]:.2f}"
        else:
            # Single aggregate label
            assert line.get_label() == aggregate_expected_labels[idx]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("roc_auc", [[1.0, 1.0, 1.0], None])
@pytest.mark.parametrize(
    "curve_kwargs",
    [None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]],
)
@pytest.mark.parametrize("name", [None, "single", ["one", "two", "three"]])
def test_roc_curve_plot_legend_label(pyplot, data_binary, name, curve_kwargs, roc_auc):
    """Check legend label correct with all `curve_kwargs`, `name` combinations."""
    # Three identical synthetic curves; with these data every AUC is 1.00, so
    # the expected legend text can be hard-coded below.
    fpr = [np.array([0, 0.5, 1]), np.array([0, 0.5, 1]), np.array([0, 0.5, 1])]
    tpr = [np.array([0, 0.5, 1]), np.array([0, 0.5, 1]), np.array([0, 0.5, 1])]
    if not isinstance(curve_kwargs, list) and isinstance(name, list):
        # A list of names with non-list `curve_kwargs` is an invalid combination.
        with pytest.raises(ValueError, match="To avoid labeling individual curves"):
            RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot(
                name=name, curve_kwargs=curve_kwargs
            )

    else:
        display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot(
            name=name, curve_kwargs=curve_kwargs
        )
        legend = display.ax_.get_legend()
        if legend is None:
            # No legend is created, exit test early
            assert name is None
            assert roc_auc is None
            return
        else:
            legend_labels = [text.get_text() for text in legend.get_texts()]

        if isinstance(curve_kwargs, list):
            # Multiple labels in legend: one entry per curve.
            assert len(legend_labels) == 3
            for idx, label in enumerate(legend_labels):
                if name is None:
                    expected_label = "AUC = 1.00" if roc_auc else None
                    assert label == expected_label
                elif isinstance(name, str):
                    # The single name is repeated on every per-curve entry.
                    expected_label = "single (AUC = 1.00)" if roc_auc else "single"
                    assert label == expected_label
                else:
                    # `name` is a list of different strings
                    expected_label = (
                        f"{name[idx]} (AUC = 1.00)" if roc_auc else f"{name[idx]}"
                    )
                    assert label == expected_label
        else:
            # Single label in legend: AUC is aggregated as mean +/- std.
            assert len(legend_labels) == 1
            if name is None:
                expected_label = "AUC = 1.00 +/- 0.00" if roc_auc else None
                assert legend_labels[0] == expected_label
            else:
                # name is single string
                expected_label = "single (AUC = 1.00 +/- 0.00)" if roc_auc else "single"
                assert legend_labels[0] == expected_label
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "curve_kwargs",
    [None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]],
)
@pytest.mark.parametrize("name", [None, "single", ["one", "two", "three"]])
def test_roc_curve_from_cv_results_legend_label(
    pyplot, data_binary, name, curve_kwargs
):
    """Check legend label correct with all `curve_kwargs`, `name` combinations."""
    X, y = data_binary
    n_cv = 3
    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True
    )

    if not isinstance(curve_kwargs, list) and isinstance(name, list):
        # A list of names requires per-curve `curve_kwargs`; anything else is
        # rejected.
        with pytest.raises(ValueError, match="To avoid labeling individual curves"):
            RocCurveDisplay.from_cv_results(
                cv_results, X, y, name=name, curve_kwargs=curve_kwargs
            )
    else:
        display = RocCurveDisplay.from_cv_results(
            cv_results, X, y, name=name, curve_kwargs=curve_kwargs
        )

        legend = display.ax_.get_legend()
        legend_labels = [text.get_text() for text in legend.get_texts()]
        if isinstance(curve_kwargs, list):
            # Multiple labels in legend
            assert len(legend_labels) == 3
            # Hard-coded per-fold AUCs for the `data_binary` fixture.
            # NOTE(review): these depend on the fixture data and the CV split —
            # confirm against the fixture if they drift.
            auc = ["0.62", "0.66", "0.55"]
            for idx, label in enumerate(legend_labels):
                if name is None:
                    assert label == f"AUC = {auc[idx]}"
                elif isinstance(name, str):
                    assert label == f"single (AUC = {auc[idx]})"
                else:
                    # `name` is a list of different strings
                    assert label == f"{name[idx]} (AUC = {auc[idx]})"
        else:
            # Single label in legend: AUC aggregated as mean +/- std.
            assert len(legend_labels) == 1
            if name is None:
                assert legend_labels[0] == "AUC = 0.61 +/- 0.05"
            else:
                # name is single string
                assert legend_labels[0] == "single (AUC = 0.61 +/- 0.05)"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "curve_kwargs",
    [None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]],
)
def test_roc_curve_from_cv_results_curve_kwargs(pyplot, data_binary, curve_kwargs):
    """Check line kwargs passed correctly in `from_cv_results`."""

    X, y = data_binary
    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
    )
    display = RocCurveDisplay.from_cv_results(
        cv_results, X, y, curve_kwargs=curve_kwargs
    )

    # Derive the color each fold curve should have been drawn with.
    if curve_kwargs is None:
        # Default color
        expected_colors = ["blue"] * 3
    elif isinstance(curve_kwargs, Mapping):
        # The single dict applies to every curve
        expected_colors = ["red"] * 3
    else:
        # Per-curve dicts, each with its own color
        expected_colors = [kwargs["c"] for kwargs in curve_kwargs]
    for line, expected_color in zip(display.line_, expected_colors):
        assert line.get_color() == expected_color
|
||||
|
||||
|
||||
def test_roc_curve_from_cv_results_pos_label_inferred(pyplot, data_binary):
    """Check `pos_label` inferred correctly by `from_cv_results(pos_label=None)`."""
    X, y = data_binary
    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
    )

    display = RocCurveDisplay.from_cv_results(cv_results, X, y, pos_label=None)
    # With `pos_label=None`, the display falls back to `estimator.classes_[1]`
    assert display.pos_label == 1
|
||||
|
||||
|
||||
def _check_chance_level(plot_chance_level, chance_level_kw, display):
    """Check chance level line and line styles correct."""
    import matplotlib as mpl

    if not plot_chance_level:
        # No chance-level line requested: the attribute must be unset.
        assert display.chance_level_ is None
        return

    chance_line = display.chance_level_
    assert isinstance(chance_line, mpl.lines.Line2D)
    # The chance level is always the (0, 0) -> (1, 1) diagonal.
    assert tuple(chance_line.get_xdata()) == (0, 1)
    assert tuple(chance_line.get_ydata()) == (0, 1)

    # Checking for chance level line styles
    if chance_level_kw is None:
        # Defaults: black dashed line with the standard label.
        assert chance_line.get_color() == "k"
        assert chance_line.get_linestyle() == "--"
        assert chance_line.get_label() == "Chance level (AUC = 0.5)"
    else:
        # Either matplotlib alias ("c"/"color", "lw"/"linewidth",
        # "ls"/"linestyle") may have been used in the kwargs.
        expected_color = (
            chance_level_kw["c"] if "c" in chance_level_kw else chance_level_kw["color"]
        )
        assert chance_line.get_color() == expected_color
        expected_lw = (
            chance_level_kw["lw"]
            if "lw" in chance_level_kw
            else chance_level_kw["linewidth"]
        )
        assert chance_line.get_linewidth() == expected_lw
        expected_ls = (
            chance_level_kw["ls"]
            if "ls" in chance_level_kw
            else chance_level_kw["linestyle"]
        )
        assert chance_line.get_linestyle() == expected_ls
|
||||
|
||||
|
||||
@pytest.mark.parametrize("plot_chance_level", [True, False])
@pytest.mark.parametrize("label", [None, "Test Label"])
@pytest.mark.parametrize(
    "chance_level_kw",
    [
        None,
        {"linewidth": 1, "color": "red", "linestyle": "-", "label": "DummyEstimator"},
        {"lw": 1, "c": "red", "ls": "-", "label": "DummyEstimator"},
        {"lw": 1, "color": "blue", "ls": "-", "label": None},
    ],
)
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_roc_curve_chance_level_line(
    pyplot,
    data_binary,
    plot_chance_level,
    chance_level_kw,
    label,
    constructor_name,
):
    """Check chance level plotting behavior of `from_predictions`, `from_estimator`."""
    X, y = data_binary

    lr = LogisticRegression()
    lr.fit(X, y)

    # Idiom fix: direct attribute access instead of
    # `getattr(lr, "predict_proba")` with a constant string.
    y_score = lr.predict_proba(X)
    # Keep only the positive-class column for 2D probability output.
    y_score = y_score if y_score.ndim == 1 else y_score[:, 1]

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(
            lr,
            X,
            y,
            curve_kwargs={"alpha": 0.8, "label": label},
            plot_chance_level=plot_chance_level,
            chance_level_kw=chance_level_kw,
        )
    else:
        display = RocCurveDisplay.from_predictions(
            y,
            y_score,
            curve_kwargs={"alpha": 0.8, "label": label},
            plot_chance_level=plot_chance_level,
            chance_level_kw=chance_level_kw,
        )

    import matplotlib as mpl

    assert isinstance(display.line_, mpl.lines.Line2D)
    assert display.line_.get_alpha() == 0.8
    assert isinstance(display.ax_, mpl.axes.Axes)
    assert isinstance(display.figure_, mpl.figure.Figure)

    _check_chance_level(plot_chance_level, chance_level_kw, display)

    # Checking for legend behaviour
    if plot_chance_level and chance_level_kw is not None:
        if label is not None or chance_level_kw.get("label") is not None:
            legend = display.ax_.get_legend()
            assert legend is not None  # Legend should be present if any label is set
            legend_labels = [text.get_text() for text in legend.get_texts()]
            if label is not None:
                assert label in legend_labels
            if chance_level_kw.get("label") is not None:
                assert chance_level_kw["label"] in legend_labels
    else:
        assert display.ax_.get_legend() is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("plot_chance_level", [True, False])
@pytest.mark.parametrize(
    "chance_level_kw",
    [
        None,
        {"linewidth": 1, "color": "red", "linestyle": "-", "label": "DummyEstimator"},
        {"lw": 1, "c": "red", "ls": "-", "label": "DummyEstimator"},
        {"lw": 1, "color": "blue", "ls": "-", "label": None},
    ],
)
@pytest.mark.parametrize("curve_kwargs", [None, {"alpha": 0.8}])
def test_roc_curve_chance_level_line_from_cv_results(
    pyplot,
    data_binary,
    plot_chance_level,
    chance_level_kw,
    curve_kwargs,
):
    """Check chance level plotting behavior with `from_cv_results`."""
    X, y = data_binary
    n_cv = 3
    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True
    )

    display = RocCurveDisplay.from_cv_results(
        cv_results,
        X,
        y,
        plot_chance_level=plot_chance_level,
        chance_level_kwargs=chance_level_kw,
        curve_kwargs=curve_kwargs,
    )

    import matplotlib as mpl

    for line in display.line_:
        assert isinstance(line, mpl.lines.Line2D)
        # Ensure the curve line kwargs were passed through correctly as well
        if curve_kwargs:
            assert line.get_alpha() == 0.8
    assert isinstance(display.ax_, mpl.axes.Axes)
    assert isinstance(display.figure_, mpl.figure.Figure)

    _check_chance_level(plot_chance_level, chance_level_kw, display)

    # There is always a legend, to indicate each 'Fold' curve
    legend = display.ax_.get_legend()
    assert legend is not None
    legend_labels = [text.get_text() for text in legend.get_texts()]
    if plot_chance_level and chance_level_kw is not None:
        if chance_level_kw.get("label") is not None:
            assert chance_level_kw["label"] in legend_labels
        else:
            assert len(legend_labels) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "clf",
    [
        LogisticRegression(),
        make_pipeline(StandardScaler(), LogisticRegression()),
        make_pipeline(
            make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression()
        ),
    ],
)
@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_roc_curve_display_complex_pipeline(pyplot, data_binary, clf, constructor_name):
    """Check the behaviour with complex pipeline."""
    X, y = data_binary

    # Clone so parametrized estimator instances are not shared across runs.
    estimator = clone(clf)

    from_estimator = constructor_name == "from_estimator"
    if from_estimator:
        # An unfitted estimator must be rejected
        with pytest.raises(NotFittedError):
            RocCurveDisplay.from_estimator(estimator, X, y)

    estimator.fit(X, y)

    if from_estimator:
        display = RocCurveDisplay.from_estimator(estimator, X, y)
        expected_name = estimator.__class__.__name__
    else:
        display = RocCurveDisplay.from_predictions(y, y)
        expected_name = "Classifier"

    assert expected_name in display.line_.get_label()
    assert display.name == expected_name
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "roc_auc, name, curve_kwargs, expected_labels",
    [
        ([0.9, 0.8], None, None, ["AUC = 0.85 +/- 0.05", "_child1"]),
        ([0.9, 0.8], "Est name", None, ["Est name (AUC = 0.85 +/- 0.05)", "_child1"]),
        (
            [0.8, 0.7],
            ["fold1", "fold2"],
            [{"c": "blue"}, {"c": "red"}],
            ["fold1 (AUC = 0.80)", "fold2 (AUC = 0.70)"],
        ),
        (None, ["fold1", "fold2"], [{"c": "blue"}, {"c": "red"}], ["fold1", "fold2"]),
    ],
)
def test_roc_curve_display_default_labels(
    pyplot, roc_auc, name, curve_kwargs, expected_labels
):
    """Check the default labels used in the display."""
    # Two small synthetic curves are enough to exercise label construction.
    fpr = [np.array([0, 0.5, 1]), np.array([0, 0.3, 1])]
    tpr = [np.array([0, 0.5, 1]), np.array([0, 0.3, 1])]
    display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, name=name).plot(
        curve_kwargs=curve_kwargs
    )
    for line, expected_label in zip(display.line_, expected_labels):
        assert line.get_label() == expected_label
|
||||
|
||||
|
||||
def _check_auc(display, constructor_name):
    """Assert the displayed AUC values match the known reference values."""
    roc_auc_limit = 0.95679
    roc_auc_limit_multi = [0.97007, 0.985915, 0.980952]

    if constructor_name == "from_cv_results":
        # One AUC per CV fold.
        for computed, expected in zip(display.roc_auc, roc_auc_limit_multi):
            assert computed == pytest.approx(expected)
    else:
        assert display.roc_auc == pytest.approx(roc_auc_limit)
        # The stored AUC must agree with direct trapezoidal integration of the
        # stored curve.
        assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"])
@pytest.mark.parametrize(
    "constructor_name", ["from_estimator", "from_predictions", "from_cv_results"]
)
def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name):
    # check that we can provide the positive label and display the proper
    # statistics
    X, y = load_breast_cancer(return_X_y=True)
    # create a highly imbalanced version of the breast cancer dataset
    idx_positive = np.flatnonzero(y == 1)
    idx_negative = np.flatnonzero(y == 0)
    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
    X, y = X[idx_selected], y[idx_selected]
    X, y = shuffle(X, y, random_state=42)
    # only use 2 features to make the problem even harder
    X = X[:, :2]
    y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        random_state=0,
    )

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
    )

    # Sanity check to be sure the positive class is `classes_[0]`
    # Class imbalance ensures a large difference in prediction values between classes,
    # allowing us to catch errors when we switch `pos_label`
    assert classifier.classes_.tolist() == ["cancer", "not cancer"]

    y_score = getattr(classifier, response_method)(X_test)
    # we select the corresponding probability columns or reverse the decision
    # function otherwise
    y_score_cancer = -1 * y_score if y_score.ndim == 1 else y_score[:, 0]
    y_score_not_cancer = y_score if y_score.ndim == 1 else y_score[:, 1]

    def _make_display(pos_label, y_score):
        # Build the display via the constructor under test; previously this
        # three-way dispatch was duplicated verbatim for each `pos_label`.
        if constructor_name == "from_estimator":
            return RocCurveDisplay.from_estimator(
                classifier,
                X_test,
                y_test,
                pos_label=pos_label,
                response_method=response_method,
            )
        elif constructor_name == "from_predictions":
            return RocCurveDisplay.from_predictions(
                y_test,
                y_score,
                pos_label=pos_label,
            )
        else:
            return RocCurveDisplay.from_cv_results(
                cv_results,
                X,
                y,
                response_method=response_method,
                pos_label=pos_label,
            )

    # The AUC reference values must hold for either choice of positive class.
    _check_auc(_make_display("cancer", y_score_cancer), constructor_name)
    _check_auc(_make_display("not cancer", y_score_not_cancer), constructor_name)
|
||||
|
||||
|
||||
# TODO(1.9): remove
def test_y_score_and_y_pred_specified_error(pyplot):
    """1. Check that an error is raised when both y_score and y_pred are specified.

    2. Check that a warning is raised when y_pred is specified.
    """
    y_true = np.array([0, 1, 1, 0])
    y_score = np.array([0.1, 0.4, 0.35, 0.8])
    y_pred = np.array([0.2, 0.3, 0.5, 0.1])

    with pytest.raises(
        ValueError, match="`y_pred` and `y_score` cannot be both specified"
    ):
        RocCurveDisplay.from_predictions(y_true, y_score=y_score, y_pred=y_pred)

    with pytest.warns(FutureWarning, match="y_pred was deprecated in 1.7"):
        display_y_pred = RocCurveDisplay.from_predictions(y_true, y_pred=y_score)
    # `roc_curve` returns (fpr, tpr, thresholds); the second value was
    # previously bound to a variable misleadingly named `desired_fnr` even
    # though it holds the TPR — renamed for clarity.
    desired_fpr, desired_tpr, _ = roc_curve(y_true, y_score)
    assert_allclose(display_y_pred.fpr, desired_fpr)
    assert_allclose(display_y_pred.tpr, desired_tpr)

    display_y_score = RocCurveDisplay.from_predictions(y_true, y_score)
    assert_allclose(display_y_score.fpr, desired_fpr)
    assert_allclose(display_y_score.tpr, desired_tpr)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("despine", [True, False])
@pytest.mark.parametrize(
    "constructor_name", ["from_estimator", "from_predictions", "from_cv_results"]
)
def test_plot_roc_curve_despine(pyplot, data_binary, despine, constructor_name):
    # Check that the despine keyword is working correctly
    X, y = data_binary

    # Fit once; previously the estimator was fitted a second time on the next
    # line (`lr.fit(X, y)`), which was redundant work.
    lr = LogisticRegression().fit(X, y)
    cv_results = cross_validate(
        LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True
    )

    y_pred = lr.decision_function(X)

    # safe guard for the if/else construction
    assert constructor_name in ("from_estimator", "from_predictions", "from_cv_results")

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(lr, X, y, despine=despine)
    elif constructor_name == "from_predictions":
        display = RocCurveDisplay.from_predictions(y, y_pred, despine=despine)
    else:
        display = RocCurveDisplay.from_cv_results(cv_results, X, y, despine=despine)

    # Top/right spines are hidden when despined, visible otherwise.
    for s in ["top", "right"]:
        assert display.ax_.spines[s].get_visible() is not despine

    if despine:
        # Remaining spines are clipped to the [0, 1] data range.
        for s in ["bottom", "left"]:
            assert display.ax_.spines[s].get_bounds() == (0, 1)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,56 @@
|
||||
"""Evaluation metrics for cluster analysis results.
|
||||
|
||||
- Supervised evaluation uses a ground truth class values for each sample.
|
||||
- Unsupervised evaluation does not use ground truths and measures the "quality" of the
|
||||
model itself.
|
||||
"""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from sklearn.metrics.cluster._bicluster import consensus_score
|
||||
from sklearn.metrics.cluster._supervised import (
|
||||
adjusted_mutual_info_score,
|
||||
adjusted_rand_score,
|
||||
completeness_score,
|
||||
contingency_matrix,
|
||||
entropy,
|
||||
expected_mutual_information,
|
||||
fowlkes_mallows_score,
|
||||
homogeneity_completeness_v_measure,
|
||||
homogeneity_score,
|
||||
mutual_info_score,
|
||||
normalized_mutual_info_score,
|
||||
pair_confusion_matrix,
|
||||
rand_score,
|
||||
v_measure_score,
|
||||
)
|
||||
from sklearn.metrics.cluster._unsupervised import (
|
||||
calinski_harabasz_score,
|
||||
davies_bouldin_score,
|
||||
silhouette_samples,
|
||||
silhouette_score,
|
||||
)
|
||||
|
||||
# Public API of `sklearn.metrics.cluster`.
__all__ = [
    "adjusted_mutual_info_score",
    "adjusted_rand_score",
    "calinski_harabasz_score",
    "completeness_score",
    "consensus_score",
    "contingency_matrix",
    "davies_bouldin_score",
    # TODO(1.10): Remove
    # NOTE(review): the TODO presumably covers both "entropy" and
    # "expected_mutual_information" below — confirm which re-exports are
    # slated for removal in 1.10.
    "entropy",
    "expected_mutual_information",
    "fowlkes_mallows_score",
    "homogeneity_completeness_v_measure",
    "homogeneity_score",
    "mutual_info_score",
    "normalized_mutual_info_score",
    "pair_confusion_matrix",
    "rand_score",
    "silhouette_samples",
    "silhouette_score",
    "v_measure_score",
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,114 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numpy as np
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
|
||||
from sklearn.utils._param_validation import StrOptions, validate_params
|
||||
from sklearn.utils.validation import check_array, check_consistent_length
|
||||
|
||||
__all__ = ["consensus_score"]
|
||||
|
||||
|
||||
def _check_rows_and_columns(a, b):
    """Unpacks the row and column arrays and checks their shape."""
    check_consistent_length(*a)
    check_consistent_length(*b)

    # Named helper instead of assigning a lambda to a name (PEP 8 E731).
    def checks(x):
        return check_array(x, ensure_2d=False)

    a_rows, a_cols = map(checks, a)
    b_rows, b_cols = map(checks, b)
    return a_rows, a_cols, b_rows, b_cols
|
||||
|
||||
|
||||
def _jaccard(a_rows, a_cols, b_rows, b_cols):
|
||||
"""Jaccard coefficient on the elements of the two biclusters."""
|
||||
intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum()
|
||||
|
||||
a_size = a_rows.sum() * a_cols.sum()
|
||||
b_size = b_rows.sum() * b_cols.sum()
|
||||
|
||||
return intersection / (a_size + b_size - intersection)
|
||||
|
||||
|
||||
def _pairwise_similarity(a, b, similarity):
    """Computes pairwise similarity matrix.

    result[i, j] is the Jaccard coefficient of a's bicluster i and b's
    bicluster j.

    """
    a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b)
    n_b = b_rows.shape[0]
    # One row of similarities per bicluster of `a`.
    rows = []
    for i in range(a_rows.shape[0]):
        rows.append(
            [similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j]) for j in range(n_b)]
        )
    return np.array(rows)
|
||||
|
||||
|
||||
@validate_params(
    {
        "a": [tuple],
        "b": [tuple],
        "similarity": [callable, StrOptions({"jaccard"})],
    },
    prefer_skip_nested_validation=True,
)
def consensus_score(a, b, *, similarity="jaccard"):
    """The similarity of two sets of biclusters.

    Similarity between individual biclusters is computed. Then the best
    matching between sets is found by solving a linear sum assignment problem,
    using a modified Jonker-Volgenant algorithm.
    The final score is the sum of similarities divided by the size of
    the larger set.

    Read more in the :ref:`User Guide <biclustering>`.

    Parameters
    ----------
    a : tuple (rows, columns)
        Tuple of row and column indicators for a set of biclusters.

    b : tuple (rows, columns)
        Another set of biclusters like ``a``.

    similarity : 'jaccard' or callable, default='jaccard'
        May be the string "jaccard" to use the Jaccard coefficient, or
        any function that takes four arguments, each of which is a 1d
        indicator vector: (a_rows, a_columns, b_rows, b_columns).

    Returns
    -------
    consensus_score : float
        Consensus score, a non-negative value, sum of similarities
        divided by size of larger set.

    See Also
    --------
    scipy.optimize.linear_sum_assignment : Solve the linear sum assignment problem.

    References
    ----------
    * Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis
      for bicluster acquisition
      <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2881408/>`__.

    Examples
    --------
    >>> from sklearn.metrics import consensus_score
    >>> a = ([[True, False], [False, True]], [[False, True], [True, False]])
    >>> b = ([[False, True], [True, False]], [[True, False], [False, True]])
    >>> consensus_score(a, b, similarity='jaccard')
    1.0
    """
    # Resolve the string shortcut to the concrete similarity function.
    sim_fn = _jaccard if similarity == "jaccard" else similarity
    matrix = _pairwise_similarity(a, b, sim_fn)
    # Maximizing total similarity == minimizing total (1 - similarity) cost.
    best_rows, best_cols = linear_sum_assignment(1.0 - matrix)
    larger_set_size = max(len(a[0]), len(b[0]))
    return float(matrix[best_rows, best_cols].sum() / larger_set_size)
|
||||
Binary file not shown.
@@ -0,0 +1,69 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from libc.math cimport exp, lgamma
|
||||
|
||||
from sklearn.utils._typedefs cimport float64_t, int64_t
|
||||
|
||||
import numpy as np
|
||||
from scipy.special import gammaln
|
||||
|
||||
|
||||
def expected_mutual_information(contingency, int64_t n_samples):
    """Calculate the expected mutual information for two labelings.

    For every cell (i, j) of the contingency table, sums over the feasible
    cell counts ``nij`` the product of three terms:
    ``nij / N``, ``log(N * nij / (a_i * b_j))`` and a combinatorial
    probability term, the last computed in log space (``gammaln`` / ``lgamma``)
    to avoid overflowing the factorials for large counts.

    Parameters
    ----------
    contingency : 2D array-like of int counts
        Contingency table of the two labelings; only ``.shape`` and
        ``.sum(axis=...)`` are used, so sparse inputs with those methods work.
    n_samples : int64_t
        Total number of samples N (the grand total of the table).

    Returns
    -------
    float
        The expected mutual information; 0.0 when either labeling has a
        single class (zero entropy).
    """
    cdef:
        float64_t emi = 0
        int64_t n_rows, n_cols
        float64_t term2, term3, gln
        int64_t[::1] a_view, b_view
        float64_t[::1] term1
        float64_t[::1] gln_a, gln_b, gln_Na, gln_Nb, gln_Nnij, log_Nnij
        float64_t[::1] log_a, log_b
        Py_ssize_t i, j, nij
        int64_t start, end

    n_rows, n_cols = contingency.shape
    # a: row marginals, b: column marginals of the contingency table
    a = np.ravel(contingency.sum(axis=1).astype(np.int64, copy=False))
    b = np.ravel(contingency.sum(axis=0).astype(np.int64, copy=False))
    a_view = a
    b_view = b

    # any labelling with zero entropy implies EMI = 0
    if a.size == 1 or b.size == 1:
        return 0.0

    # There are three major terms to the EMI equation, which are multiplied to
    # and then summed over varying nij values.
    # While nijs[0] will never be used, having it simplifies the indexing.
    nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float')
    nijs[0] = 1  # Stops divide by zero warnings. As its not used, no issue.
    # term1 is nij / N
    term1 = nijs / n_samples
    # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b)
    log_a = np.log(a)
    log_b = np.log(b)
    # term2 uses log(N * nij) = log(N) + log(nij)
    log_Nnij = np.log(n_samples) + np.log(nijs)
    # term3 is large, and involved many factorials. Calculate these in log
    # space to stop overflows.
    gln_a = gammaln(a + 1)
    gln_b = gammaln(b + 1)
    gln_Na = gammaln(n_samples - a + 1)
    gln_Nb = gammaln(n_samples - b + 1)
    gln_Nnij = gammaln(nijs + 1) + gammaln(n_samples + 1)

    # emi itself is a summation over the various values.
    for i in range(n_rows):
        for j in range(n_cols):
            # feasible nij values: max(1, a_i + b_j - N) .. min(a_i, b_j);
            # nij = 0 is skipped since it contributes nothing to the sum
            start = max(1, a_view[i] - n_samples + b_view[j])
            end = min(a_view[i], b_view[j]) + 1
            for nij in range(start, end):
                term2 = log_Nnij[nij] - log_a[i] - log_b[j]
                # Numerators are positive, denominators are negative.
                gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j]
                       - gln_Nnij[nij] - lgamma(a_view[i] - nij + 1)
                       - lgamma(b_view[j] - nij + 1)
                       - lgamma(n_samples - a_view[i] - b_view[j] + nij + 1))
                term3 = exp(gln)
                emi += (term1[nij] * term2 * term3)
    return emi
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,487 @@
|
||||
"""Unsupervised evaluation metrics."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import functools
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from sklearn.externals.array_api_compat import is_numpy_array
|
||||
from sklearn.metrics.pairwise import (
|
||||
_VALID_METRICS,
|
||||
pairwise_distances,
|
||||
pairwise_distances_chunked,
|
||||
)
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.utils import _safe_indexing, check_random_state, check_X_y
|
||||
from sklearn.utils._array_api import (
|
||||
_average,
|
||||
_convert_to_numpy,
|
||||
_is_numpy_namespace,
|
||||
_max_precision_float_dtype,
|
||||
get_namespace_and_device,
|
||||
xpx,
|
||||
)
|
||||
from sklearn.utils._param_validation import Interval, StrOptions, validate_params
|
||||
|
||||
|
||||
def check_number_of_labels(n_labels, n_samples):
    """Validate that a label count allows a clustering metric to be defined.

    Parameters
    ----------
    n_labels : int
        Number of distinct labels.

    n_samples : int
        Number of samples.

    Raises
    ------
    ValueError
        If ``n_labels`` is not strictly between 1 and ``n_samples``.
    """
    if n_labels <= 1 or n_labels >= n_samples:
        raise ValueError(
            "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
            % n_labels
        )
|
||||
|
||||
|
||||
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "labels": ["array-like"],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
        "sample_size": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def silhouette_score(
    X, labels, *, metric="euclidean", sample_size=None, random_state=None, **kwds
):
    """Compute the mean Silhouette Coefficient of all samples.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
    b)``. To clarify, ``b`` is the distance between a sample and the nearest
    cluster that the sample is not a part of.
    Note that Silhouette Coefficient is only defined if number of labels
    is ``2 <= n_labels <= n_samples - 1``.

    This function returns the mean Silhouette Coefficient over all samples.
    To obtain the values for each sample, use :func:`silhouette_samples`.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters. Negative values generally indicate that a sample has
    been assigned to the wrong cluster, as a different cluster is more similar.

    Read more in the :ref:`User Guide <silhouette_coefficient>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
            "precomputed" or (n_samples_a, n_features) otherwise
        An array of pairwise distances between samples, or a feature array.

    labels : array-like of shape (n_samples,)
        Predicted labels for each sample.

    metric : str or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by :func:`~sklearn.metrics.pairwise_distances`. If ``X`` is
        the distance array itself, use ``metric="precomputed"``.

    sample_size : int, default=None
        The size of the sample to use when computing the Silhouette Coefficient
        on a random subset of the data.
        If ``sample_size is None``, no sampling is used.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for selecting a subset of samples.
        Used when ``sample_size is not None``.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------

    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <https://www.sciencedirect.com/science/article/pii/0377042787901257>`_

    .. [2] `Wikipedia entry on the Silhouette Coefficient
           <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> from sklearn.cluster import KMeans
    >>> from sklearn.metrics import silhouette_score
    >>> X, y = make_blobs(random_state=42)
    >>> kmeans = KMeans(n_clusters=2, random_state=42)
    >>> silhouette_score(X, kmeans.fit_predict(X))
    0.49...
    """
    if sample_size is not None:
        X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"])
        rng = check_random_state(random_state)
        subset = rng.permutation(X.shape[0])[:sample_size]
        labels = labels[subset]
        if metric == "precomputed":
            # subsample both rows and columns of the distance matrix
            X = X[subset].T[subset].T
        else:
            X = X[subset]
    sample_scores = silhouette_samples(X, labels, metric=metric, **kwds)
    return float(np.mean(sample_scores))
|
||||
|
||||
|
||||
def _silhouette_reduce(D_chunk, start, labels, label_freqs):
|
||||
"""Accumulate silhouette statistics for vertical chunk of X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
D_chunk : {array-like, sparse matrix} of shape (n_chunk_samples, n_samples)
|
||||
Precomputed distances for a chunk. If a sparse matrix is provided,
|
||||
only CSR format is accepted.
|
||||
start : int
|
||||
First index in the chunk.
|
||||
labels : array-like of shape (n_samples,)
|
||||
Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.
|
||||
label_freqs : array-like
|
||||
Distribution of cluster labels in ``labels``.
|
||||
"""
|
||||
n_chunk_samples = D_chunk.shape[0]
|
||||
# accumulate distances from each sample to each cluster
|
||||
cluster_distances = np.zeros(
|
||||
(n_chunk_samples, len(label_freqs)), dtype=D_chunk.dtype
|
||||
)
|
||||
|
||||
if issparse(D_chunk):
|
||||
if D_chunk.format != "csr":
|
||||
raise TypeError(
|
||||
"Expected CSR matrix. Please pass sparse matrix in CSR format."
|
||||
)
|
||||
for i in range(n_chunk_samples):
|
||||
indptr = D_chunk.indptr
|
||||
indices = D_chunk.indices[indptr[i] : indptr[i + 1]]
|
||||
sample_weights = D_chunk.data[indptr[i] : indptr[i + 1]]
|
||||
sample_labels = np.take(labels, indices)
|
||||
cluster_distances[i] += np.bincount(
|
||||
sample_labels, weights=sample_weights, minlength=len(label_freqs)
|
||||
)
|
||||
else:
|
||||
for i in range(n_chunk_samples):
|
||||
sample_weights = D_chunk[i]
|
||||
sample_labels = labels
|
||||
cluster_distances[i] += np.bincount(
|
||||
sample_labels, weights=sample_weights, minlength=len(label_freqs)
|
||||
)
|
||||
|
||||
# intra_index selects intra-cluster distances within cluster_distances
|
||||
end = start + n_chunk_samples
|
||||
intra_index = (np.arange(n_chunk_samples), labels[start:end])
|
||||
# intra_cluster_distances are averaged over cluster size outside this function
|
||||
intra_cluster_distances = cluster_distances[intra_index]
|
||||
# of the remaining distances we normalise and extract the minimum
|
||||
cluster_distances[intra_index] = np.inf
|
||||
cluster_distances /= label_freqs
|
||||
inter_cluster_distances = cluster_distances.min(axis=1)
|
||||
return intra_cluster_distances, inter_cluster_distances
|
||||
|
||||
|
||||
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "labels": ["array-like"],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
    },
    prefer_skip_nested_validation=True,
)
def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
    """Compute the Silhouette Coefficient for each sample.

    The Silhouette Coefficient is a measure of how well samples are clustered
    with samples that are similar to themselves. Clustering models with a high
    Silhouette Coefficient are said to be dense, where samples in the same
    cluster are similar to each other, and well separated, where samples in
    different clusters are not very similar to each other.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
    b)``.
    Note that Silhouette Coefficient is only defined if number of labels
    is 2 ``<= n_labels <= n_samples - 1``.

    This function returns the Silhouette Coefficient for each sample.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters.

    Read more in the :ref:`User Guide <silhouette_coefficient>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
            "precomputed" or (n_samples_a, n_features) otherwise
        An array of pairwise distances between samples, or a feature array. If
        a sparse matrix is provided, CSR format should be favoured avoiding
        an additional copy.

    labels : array-like of shape (n_samples,)
        Label values for each sample.

    metric : str or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by :func:`~sklearn.metrics.pairwise_distances`.
        If ``X`` is the distance array itself, use "precomputed" as the metric.
        Precomputed distance matrices must have 0 along the diagonal.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a ``scipy.spatial.distance`` metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    silhouette : array-like of shape (n_samples,)
        Silhouette Coefficients for each sample.

    References
    ----------

    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <https://www.sciencedirect.com/science/article/pii/0377042787901257>`_

    .. [2] `Wikipedia entry on the Silhouette Coefficient
       <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from sklearn.metrics import silhouette_samples
    >>> from sklearn.datasets import make_blobs
    >>> from sklearn.cluster import KMeans
    >>> X, y = make_blobs(n_samples=50, random_state=42)
    >>> kmeans = KMeans(n_clusters=3, random_state=42)
    >>> labels = kmeans.fit_predict(X)
    >>> silhouette_samples(X, labels)
    array([...])
    """
    X, labels = check_X_y(X, labels, accept_sparse=["csr"])

    if metric == "precomputed":
        # A valid precomputed distance matrix has a (numerically) zero diagonal.
        diag = X.diagonal()
        if X.dtype.kind == "f":
            tol = np.finfo(X.dtype).eps * 100
            bad_diagonal = np.any(np.abs(diag) > tol)
        else:  # integral dtype: exact comparison
            bad_diagonal = np.any(diag != 0)
        if bad_diagonal:
            raise ValueError(
                "The precomputed distance matrix contains non-zero "
                "elements on the diagonal. Use np.fill_diagonal(X, 0)."
            )

    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)
    n_samples = len(labels)
    label_freqs = np.bincount(labels)
    check_number_of_labels(len(encoder.classes_), n_samples)

    kwds["metric"] = metric
    reduce_func = functools.partial(
        _silhouette_reduce, labels=labels, label_freqs=label_freqs
    )
    chunked = pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds)
    intra_clust_dists, inter_clust_dists = zip(*chunked)
    intra_clust_dists = np.concatenate(intra_clust_dists)
    inter_clust_dists = np.concatenate(inter_clust_dists)

    # average intra-cluster distance: divide by (cluster size - 1)
    denom = (label_freqs - 1).take(labels, mode="clip")
    with np.errstate(divide="ignore", invalid="ignore"):
        intra_clust_dists /= denom

    sil_samples = inter_clust_dists - intra_clust_dists
    with np.errstate(divide="ignore", invalid="ignore"):
        sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
    # nan values are for clusters of size 1, and should be 0
    return xpx.nan_to_num(sil_samples)
|
||||
|
||||
|
||||
@validate_params(
    {
        "X": ["array-like"],
        "labels": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def calinski_harabasz_score(X, labels):
    """Compute the Calinski and Harabasz score.

    It is also known as the Variance Ratio Criterion.

    The score is defined as ratio of the sum of between-cluster dispersion and
    of within-cluster dispersion.

    Read more in the :ref:`User Guide <calinski_harabasz_index>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        A list of ``n_features``-dimensional data points. Each row corresponds
        to a single data point.

    labels : array-like of shape (n_samples,)
        Predicted labels for each sample.

    Returns
    -------
    score : float
        The resulting Calinski-Harabasz score.

    References
    ----------
    .. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
       analysis". Communications in Statistics
       <https://www.tandfonline.com/doi/abs/10.1080/03610927408827101>`_

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> from sklearn.cluster import KMeans
    >>> from sklearn.metrics import calinski_harabasz_score
    >>> X, _ = make_blobs(random_state=0)
    >>> kmeans = KMeans(n_clusters=3, random_state=0,).fit(X)
    >>> calinski_harabasz_score(X, kmeans.labels_)
    114.8...
    """
    xp, _, device_ = get_namespace_and_device(X, labels)

    if _is_numpy_namespace(xp) and not is_numpy_array(X):
        # `array_api_dispatch` is disabled but `X` is still a non-NumPy array,
        # e.g. a PyTorch tensor: coerce it to NumPy first.
        X = _convert_to_numpy(X, xp=xp)
    else:
        X = xp.astype(X, _max_precision_float_dtype(xp, device_), copy=False)
    X, labels = check_X_y(X, labels)
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)

    n_samples = X.shape[0]
    n_labels = encoder.classes_.shape[0]
    check_number_of_labels(n_labels, n_samples)

    between_disp = 0.0
    within_disp = 0.0
    overall_mean = xp.mean(X, axis=0)
    for k in range(n_labels):
        members = X[labels == k]
        center = xp.mean(members, axis=0)
        between_disp += members.shape[0] * xp.sum((center - overall_mean) ** 2)
        within_disp += xp.sum((members - center) ** 2)

    # a zero within-cluster dispersion yields the conventional score of 1.0
    if within_disp == 0.0:
        return 1.0
    return float(
        between_disp * (n_samples - n_labels) / (within_disp * (n_labels - 1.0))
    )
|
||||
|
||||
|
||||
@validate_params(
    {
        "X": ["array-like"],
        "labels": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def davies_bouldin_score(X, labels):
    """Compute the Davies-Bouldin score.

    The score is defined as the average similarity measure of each cluster with
    its most similar cluster, where similarity is the ratio of within-cluster
    distances to between-cluster distances. Thus, clusters which are farther
    apart and less dispersed will result in a better score.

    The minimum score is zero, with lower values indicating better clustering.

    Read more in the :ref:`User Guide <davies-bouldin_index>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        A list of ``n_features``-dimensional data points. Each row corresponds
        to a single data point.

    labels : array-like of shape (n_samples,)
        Predicted labels for each sample.

    Returns
    -------
    score: float
        The resulting Davies-Bouldin score.

    References
    ----------
    .. [1] Davies, David L.; Bouldin, Donald W. (1979).
       `"A Cluster Separation Measure"
       <https://ieeexplore.ieee.org/document/4766909>`__.
       IEEE Transactions on Pattern Analysis and Machine Intelligence.
       PAMI-1 (2): 224-227

    Examples
    --------
    >>> from sklearn.metrics import davies_bouldin_score
    >>> X = [[0, 1], [1, 1], [3, 4]]
    >>> labels = [0, 0, 1]
    >>> davies_bouldin_score(X, labels)
    0.12...
    """
    xp, _, device_ = get_namespace_and_device(X, labels)
    X, labels = check_X_y(X, labels)
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)
    n_samples = X.shape[0]
    n_labels = encoder.classes_.shape[0]
    check_number_of_labels(n_labels, n_samples)

    dtype = _max_precision_float_dtype(xp, device_)
    intra_dists = xp.zeros(n_labels, dtype=dtype, device=device_)
    centroids = xp.zeros((n_labels, X.shape[1]), dtype=dtype, device=device_)
    for k in range(n_labels):
        members = _safe_indexing(X, xp.nonzero(labels == k)[0])
        center = _average(members, axis=0, xp=xp)
        centroids[k, ...] = center
        # mean distance of the cluster's points to their own centroid
        intra_dists[k] = _average(
            pairwise_distances(members, xp.stack([center])), xp=xp
        )

    centroid_distances = pairwise_distances(centroids)

    # degenerate cases (all points coincide / all centroids coincide) score 0
    zero = xp.asarray(0.0, device=device_, dtype=dtype)
    all_intra_zero = xp.all(xpx.isclose(intra_dists, zero))
    all_centroid_zero = xp.all(xpx.isclose(centroid_distances, zero))
    if all_intra_zero or all_centroid_zero:
        return 0.0

    # avoid division by zero for coincident centroid pairs
    centroid_distances[centroid_distances == 0] = xp.inf
    combined_intra = intra_dists[:, None] + intra_dists
    scores = xp.max(combined_intra / centroid_distances, axis=1)
    return float(_average(scores, xp=xp))
|
||||
@@ -0,0 +1,6 @@
|
||||
# Compile the Cython expected-mutual-information kernel into an extension
# module installed under sklearn/metrics/cluster.
py.extension_module(
  '_expected_mutual_info_fast',
  cython_gen.process('_expected_mutual_info_fast.pyx'),
  subdir: 'sklearn/metrics/cluster',
  install: true
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,56 @@
|
||||
"""Testing for bicluster metrics module"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.metrics import consensus_score
|
||||
from sklearn.metrics.cluster._bicluster import _jaccard
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
|
||||
|
||||
def test_jaccard():
    """Spot-check the Jaccard similarity between pairs of bicluster indicators."""
    first = np.array([True, True, False, False])
    full = np.array([True, True, True, True])
    middle = np.array([False, True, True, False])
    last = np.array([False, False, True, True])

    # identical biclusters are maximally similar
    assert _jaccard(first, first, first, first) == 1
    # partial overlaps
    assert _jaccard(first, first, full, full) == 0.25
    assert _jaccard(first, first, middle, middle) == 1.0 / 7
    # disjoint biclusters have zero similarity
    assert _jaccard(first, first, last, last) == 0
|
||||
|
||||
|
||||
def test_consensus_score():
    """Matching bicluster sets score 1; completely swapped ones score 0."""
    rows = [[True, True, False, False], [False, False, True, True]]
    swapped = rows[::-1]

    perfect_pairs = [
        ((rows, rows), (rows, rows)),
        ((rows, rows), (swapped, swapped)),
        ((rows, swapped), (rows, swapped)),
        ((rows, swapped), (swapped, rows)),
    ]
    for left, right in perfect_pairs:
        assert consensus_score(left, right) == 1

    disjoint_pairs = [
        ((rows, rows), (swapped, rows)),
        ((rows, rows), (rows, swapped)),
        ((swapped, swapped), (rows, swapped)),
        ((swapped, swapped), (swapped, rows)),
    ]
    for left, right in disjoint_pairs:
        assert consensus_score(left, right) == 0
|
||||
|
||||
|
||||
def test_consensus_score_issue2445():
    """Different number of biclusters in A and B"""
    indicator = np.array(
        [
            [True, True, False, False],
            [False, False, True, True],
            [False, False, False, True],
        ]
    )
    a_rows = indicator
    a_cols = indicator.copy()
    subset = [0, 2]
    score = consensus_score((a_rows, a_cols), (a_rows[subset], a_cols[subset]))
    # B contains 2 of the 3 biclusters in A, so score should be 2/3
    assert_almost_equal(score, 2.0 / 3.0)
|
||||
@@ -0,0 +1,279 @@
|
||||
from functools import partial
|
||||
from itertools import chain
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.metrics.cluster import (
|
||||
adjusted_mutual_info_score,
|
||||
adjusted_rand_score,
|
||||
calinski_harabasz_score,
|
||||
completeness_score,
|
||||
davies_bouldin_score,
|
||||
fowlkes_mallows_score,
|
||||
homogeneity_score,
|
||||
mutual_info_score,
|
||||
normalized_mutual_info_score,
|
||||
rand_score,
|
||||
silhouette_score,
|
||||
v_measure_score,
|
||||
)
|
||||
from sklearn.metrics.tests.test_common import check_array_api_metric
|
||||
from sklearn.utils._array_api import (
|
||||
_get_namespace_device_dtype_ids,
|
||||
yield_namespace_device_dtype_combinations,
|
||||
)
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
# Dictionaries of metrics
|
||||
# ------------------------
|
||||
# The goal of having those dictionaries is to have an easy way to call a
|
||||
# particular metric and associate a name to each function:
|
||||
# - SUPERVISED_METRICS: all supervised cluster metrics - (when given a
|
||||
# ground truth value)
|
||||
# - UNSUPERVISED_METRICS: all unsupervised cluster metrics
|
||||
#
|
||||
# Those dictionaries will be used to test systematically some invariance
|
||||
# properties, e.g. invariance toward several input layout.
|
||||
#
|
||||
|
||||
# Metrics that compare a predicted labeling against a ground-truth labeling.
SUPERVISED_METRICS = {
    "adjusted_mutual_info_score": adjusted_mutual_info_score,
    "adjusted_rand_score": adjusted_rand_score,
    "rand_score": rand_score,
    "completeness_score": completeness_score,
    "homogeneity_score": homogeneity_score,
    "mutual_info_score": mutual_info_score,
    "normalized_mutual_info_score": normalized_mutual_info_score,
    "v_measure_score": v_measure_score,
    "fowlkes_mallows_score": fowlkes_mallows_score,
}

# Metrics evaluated from the data matrix and a labeling alone (no ground truth).
UNSUPERVISED_METRICS = {
    "silhouette_score": silhouette_score,
    "silhouette_manhattan": partial(silhouette_score, metric="manhattan"),
    "calinski_harabasz_score": calinski_harabasz_score,
    "davies_bouldin_score": davies_bouldin_score,
}

# Lists of metrics with common properties
# ---------------------------------------
# Lists of metrics with common properties are used to test systematically some
# functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics
# that are symmetric with respect to their input argument y_true and y_pred.
#
# --------------------------------------------------------------------
# Symmetric with respect to their input arguments y_true and y_pred.
# Symmetric metrics only apply to supervised clusters.
SYMMETRIC_METRICS = [
    "adjusted_rand_score",
    "rand_score",
    "v_measure_score",
    "mutual_info_score",
    "adjusted_mutual_info_score",
    "normalized_mutual_info_score",
    "fowlkes_mallows_score",
]

NON_SYMMETRIC_METRICS = ["homogeneity_score", "completeness_score"]

# Metrics whose upper bound is 1
NORMALIZED_METRICS = [
    "adjusted_rand_score",
    "rand_score",
    "homogeneity_score",
    "completeness_score",
    "v_measure_score",
    "adjusted_mutual_info_score",
    "fowlkes_mallows_score",
    "normalized_mutual_info_score",
]


# Two fixed random labelings with 3 classes, shared by the (non-)symmetry tests.
rng = np.random.RandomState(0)
y1 = rng.randint(3, size=30)
y2 = rng.randint(3, size=30)
|
||||
|
||||
|
||||
def test_symmetric_non_symmetric_union():
    """The symmetric and non-symmetric lists must exactly cover SUPERVISED_METRICS."""
    combined = SYMMETRIC_METRICS + NON_SYMMETRIC_METRICS
    assert sorted(combined) == sorted(SUPERVISED_METRICS)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "metric_name, y1, y2", [(name, y1, y2) for name in SYMMETRIC_METRICS]
)
def test_symmetry(metric_name, y1, y2):
    """Swapping the two labelings must not change a symmetric metric."""
    metric = SUPERVISED_METRICS[metric_name]
    forward = metric(y1, y2)
    backward = metric(y2, y1)
    assert forward == pytest.approx(backward)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "metric_name, y1, y2", [(name, y1, y2) for name in NON_SYMMETRIC_METRICS]
)
def test_non_symmetry(metric_name, y1, y2):
    """Swapping the labelings must change a non-symmetric metric."""
    metric = SUPERVISED_METRICS[metric_name]
    forward = metric(y1, y2)
    backward = metric(y2, y1)
    assert forward != pytest.approx(backward)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric_name", NORMALIZED_METRICS)
def test_normalized_output(metric_name):
    """Normalized metrics stay within [0, 1] and reach 1 on identical labelings."""
    metric = SUPERVISED_METRICS[metric_name]

    # partially agreeing labelings land strictly inside the interval
    assert metric([0, 0, 0, 1, 1], [0, 0, 0, 1, 2]) > 0.0
    assert metric([0, 0, 1, 1, 2], [0, 0, 1, 1, 1]) > 0.0
    assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0
    assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0

    identical = [0, 0, 0, 1, 1, 1]
    assert metric(identical, list(identical)) == pytest.approx(1.0)

    constant = [0, 0, 0, 0, 0, 0]
    all_distinct = [0, 1, 2, 3, 4, 5]
    scores = np.array(
        [metric(constant, all_distinct), metric(all_distinct, constant)]
    )
    assert not (scores < 0).any()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
def test_permute_labels(metric_name):
    """Exchanging the 0/1 label names must leave every clustering score unchanged."""
    y_label = np.array([0, 0, 0, 1, 1, 0, 1])
    y_pred = np.array([1, 0, 1, 0, 1, 1, 0])
    if metric_name in SUPERVISED_METRICS:
        metric = SUPERVISED_METRICS[metric_name]
        reference = metric(y_pred, y_label)
        # flip either labeling, or both; the score must be invariant
        for pred, label in [
            (1 - y_pred, y_label),
            (1 - y_pred, 1 - y_label),
            (y_pred, 1 - y_label),
        ]:
            assert_allclose(reference, metric(pred, label))
    else:
        metric = UNSUPERVISED_METRICS[metric_name]
        X = np.random.randint(10, size=(7, 10))
        reference = metric(X, y_pred)
        assert_allclose(reference, metric(X, 1 - y_pred))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
def test_format_invariance(metric_name):
    """Scores must be identical across array/list/str/shifted-int label encodings."""
    y_true = [0, 0, 0, 0, 1, 1, 1, 1]
    y_pred = [0, 1, 2, 3, 4, 5, 6, 7]

    def generate_formats(y):
        y = np.array(y)
        yield y, "array of ints"
        yield y.tolist(), "list of ints"
        yield [str(x) + "-a" for x in y.tolist()], "list of strs"
        yield (
            np.array([str(x) + "-a" for x in y.tolist()], dtype=object),
            "array of strs",
        )
        yield y - 1, "including negative ints"
        yield y + 1, "strictly positive ints"

    if metric_name in SUPERVISED_METRICS:
        metric = SUPERVISED_METRICS[metric_name]
        expected = metric(y_true, y_pred)
        paired_formats = zip(generate_formats(y_true), generate_formats(y_pred))
        for (true_fmt, _), (pred_fmt, _) in paired_formats:
            assert expected == metric(true_fmt, pred_fmt)
    else:
        metric = UNSUPERVISED_METRICS[metric_name]
        X = np.random.randint(10, size=(8, 10))
        expected = metric(X, y_true)
        # also invariant to the dtype of the data matrix
        assert expected == metric(X.astype(float), y_true)
        for true_fmt, _ in generate_formats(y_true):
            assert expected == metric(X, true_fmt)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("metric", SUPERVISED_METRICS.values())
def test_single_sample(metric):
    """Supervised metrics must accept single-sample inputs without error."""
    # only the supervised metrics support single sample
    for true_label, pred_label in [(0, 0), (0, 1), (1, 0), (1, 1)]:
        metric([true_label], [pred_label])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "metric_name, metric_func", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS).items()
)
def test_inf_nan_input(metric_name, metric_func):
    """Check that every metric rejects non-finite label inputs.

    Each invalid input must raise a ValueError mentioning NaN or infinity.
    """
    if metric_name in SUPERVISED_METRICS:
        invalids = [
            ([0, 1], [np.inf, np.inf]),
            ([0, 1], [np.nan, np.nan]),
            ([0, 1], [np.nan, np.inf]),
        ]
    else:
        X = np.random.randint(10, size=(2, 10))
        invalids = [(X, [np.inf, np.inf]), (X, [np.nan, np.nan]), (X, [np.nan, np.inf])]
    # BUGFIX: the loop must wrap `pytest.raises`, not the other way around.
    # With the context manager outside the loop, the first raising call exits
    # the `with` block and the remaining invalid inputs are never exercised.
    for args in invalids:
        with pytest.raises(ValueError, match=r"contains (NaN|infinity)"):
            metric_func(*args)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
def test_returned_value_consistency(name):
    """Ensure that the returned values of all metrics are consistent.

    It can only be a float. It should not be a numpy float64 or float32.
    """

    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(20, 10))
    labels_true = rng.randint(0, 3, size=(20,))
    labels_pred = rng.randint(0, 3, size=(20,))

    if name in SUPERVISED_METRICS:
        metric = SUPERVISED_METRICS[name]
        score = metric(labels_true, labels_pred)
    else:
        metric = UNSUPERVISED_METRICS[name]
        score = metric(X, labels_pred)

    # np.float64 subclasses the builtin float, so the isinstance check alone
    # would pass for numpy scalars; the second assert rules them out.
    assert isinstance(score, float)
    assert not isinstance(score, (np.float64, np.float32))
|
||||
|
||||
|
||||
def check_array_api_unsupervised_metric(metric, array_namespace, device, dtype_name):
    """Run the shared array-API compliance check on an unsupervised metric.

    Fixed labels and random features are fed through `check_array_api_metric`,
    which compares the metric's result across namespaces/devices/dtypes.
    """
    y_pred = np.array([1, 0, 1, 0, 1, 1, 0])
    X = np.random.randint(10, size=(7, 10))

    check_array_api_metric(
        metric,
        array_namespace,
        device,
        dtype_name,
        a_np=X,
        b_np=y_pred,
    )
|
||||
|
||||
|
||||
# Mapping from metric callable to the list of array-API compliance checkers
# that should be run against it (consumed by yield_metric_checker_combinations).
array_api_metric_checkers = {
    calinski_harabasz_score: [
        check_array_api_unsupervised_metric,
    ],
    davies_bouldin_score: [
        check_array_api_unsupervised_metric,
    ],
}
|
||||
|
||||
|
||||
def yield_metric_checker_combinations(metric_checkers=array_api_metric_checkers):
    """Yield every (metric, checker) pair from a metric -> checkers mapping."""
    for metric, checker_list in metric_checkers.items():
        yield from ((metric, checker) for checker in checker_list)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_namespace, device, dtype_name",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize("metric, check_func", yield_metric_checker_combinations())
def test_array_api_compliance(metric, array_namespace, device, dtype_name, check_func):
    # Delegate to the checker registered for this metric.
    check_func(metric, array_namespace, device, dtype_name)
|
||||
@@ -0,0 +1,532 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal
|
||||
|
||||
from sklearn.base import config_context
|
||||
from sklearn.metrics.cluster import (
|
||||
adjusted_mutual_info_score,
|
||||
adjusted_rand_score,
|
||||
completeness_score,
|
||||
contingency_matrix,
|
||||
expected_mutual_information,
|
||||
fowlkes_mallows_score,
|
||||
homogeneity_completeness_v_measure,
|
||||
homogeneity_score,
|
||||
mutual_info_score,
|
||||
normalized_mutual_info_score,
|
||||
pair_confusion_matrix,
|
||||
rand_score,
|
||||
v_measure_score,
|
||||
)
|
||||
from sklearn.metrics.cluster._supervised import (
|
||||
_entropy,
|
||||
_generalized_average,
|
||||
check_clusterings,
|
||||
entropy,
|
||||
)
|
||||
from sklearn.utils import assert_all_finite
|
||||
from sklearn.utils._array_api import (
|
||||
_get_namespace_device_dtype_ids,
|
||||
yield_namespace_device_dtype_combinations,
|
||||
)
|
||||
from sklearn.utils._testing import _array_api_for_tests, assert_almost_equal
|
||||
|
||||
# Supervised clustering metrics whose shared invariants (input validation,
# perfect-match behavior, ...) are exercised by the parametrized tests below.
score_funcs = [
    adjusted_rand_score,
    rand_score,
    homogeneity_score,
    completeness_score,
    v_measure_score,
    adjusted_mutual_info_score,
    normalized_mutual_info_score,
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("score_func", score_funcs)
def test_error_messages_on_wrong_input(score_func):
    """Check the error messages for mismatched lengths and non-1D labels."""
    expected = r"Found input variables with inconsistent numbers of samples: \[2, 3\]"
    with pytest.raises(ValueError, match=expected):
        score_func([0, 1], [1, 1, 1])

    expected = r"labels_true must be 1D: shape is \(2"
    with pytest.raises(ValueError, match=expected):
        score_func([[0, 1], [1, 0]], [1, 1, 1])

    expected = r"labels_pred must be 1D: shape is \(2"
    with pytest.raises(ValueError, match=expected):
        score_func([0, 1, 0], [[1, 1], [0, 0]])
|
||||
|
||||
|
||||
def test_generalized_average():
    """_generalized_average must be monotone in the averaging method."""
    a, b = 1, 2
    methods = ["min", "geometric", "arithmetic", "max"]
    # min <= geometric mean <= arithmetic mean <= max
    means = [_generalized_average(a, b, method) for method in methods]
    assert means[0] <= means[1] <= means[2] <= means[3]
    c, d = 12, 12
    # all averaging methods coincide when both operands are equal
    means = [_generalized_average(c, d, method) for method in methods]
    assert means[0] == means[1] == means[2] == means[3]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("score_func", score_funcs)
def test_perfect_matches(score_func):
    """Any labeling identical up to renaming must score exactly 1."""
    assert score_func([], []) == pytest.approx(1.0)
    assert score_func([0], [1]) == pytest.approx(1.0)
    assert score_func([0, 0, 0], [0, 0, 0]) == pytest.approx(1.0)
    assert score_func([0, 1, 0], [42, 7, 42]) == pytest.approx(1.0)
    assert score_func([0.0, 1.0, 0.0], [42.0, 7.0, 42.0]) == pytest.approx(1.0)
    assert score_func([0.0, 1.0, 2.0], [42.0, 7.0, 2.0]) == pytest.approx(1.0)
    assert score_func([0, 1, 2], [42, 7, 2]) == pytest.approx(1.0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "score_func",
    [
        normalized_mutual_info_score,
        adjusted_mutual_info_score,
    ],
)
@pytest.mark.parametrize("average_method", ["min", "geometric", "arithmetic", "max"])
def test_perfect_matches_with_changing_means(score_func, average_method):
    """Perfect matches must score 1 for every averaging method.

    The last two cases are non-regression tests for:
    https://github.com/scikit-learn/scikit-learn/issues/30950
    """
    perfect_cases = [
        ([], []),
        ([0], [1]),
        ([0, 0, 0], [0, 0, 0]),
        ([0, 1, 0], [42, 7, 42]),
        ([0.0, 1.0, 0.0], [42.0, 7.0, 42.0]),
        ([0.0, 1.0, 2.0], [42.0, 7.0, 2.0]),
        ([0, 1, 2], [42, 7, 2]),
        ([0, 1], [0, 1]),
        ([0, 1, 2, 3], [0, 1, 2, 3]),
    ]
    for labels_true, labels_pred in perfect_cases:
        assert score_func(
            labels_true, labels_pred, average_method=average_method
        ) == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_homogeneous_but_not_complete_labeling():
    """Clusters contain a single class, but one class is split across clusters."""
    # homogeneous but not complete clustering
    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 2, 2])
    assert_almost_equal(h, 1.00, 2)
    assert_almost_equal(c, 0.69, 2)
    assert_almost_equal(v, 0.81, 2)


def test_complete_but_not_homogeneous_labeling():
    """Each class sits in one cluster, but a cluster mixes classes."""
    # complete but not homogeneous clustering
    h, c, v = homogeneity_completeness_v_measure([0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 1, 1])
    assert_almost_equal(h, 0.58, 2)
    assert_almost_equal(c, 1.00, 2)
    assert_almost_equal(v, 0.73, 2)


def test_not_complete_and_not_homogeneous_labeling():
    """Mixed case: neither homogeneous nor complete."""
    # neither complete nor homogeneous but not so bad either
    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)
|
||||
|
||||
|
||||
def test_beta_parameter():
    """Check the weighted V-measure formula when beta != 1."""
    # test for when beta passed to
    # homogeneity_completeness_v_measure
    # and v_measure_score
    beta_test = 0.2
    h_test = 0.67
    c_test = 0.42
    # weighted harmonic mean of homogeneity and completeness
    v_test = (1 + beta_test) * h_test * c_test / (beta_test * h_test + c_test)

    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test
    )
    assert_almost_equal(h, h_test, 2)
    assert_almost_equal(c, c_test, 2)
    assert_almost_equal(v, v_test, 2)

    v = v_measure_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test)
    assert_almost_equal(v, v_test, 2)
|
||||
|
||||
|
||||
def test_non_consecutive_labels():
    """Scores must be invariant to gaps in the label values."""
    # regression tests for labels with gaps
    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)

    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)

    # ARI with gap-free vs gapped predicted labels must agree
    ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(ari_1, 0.24, 2)
    assert_almost_equal(ari_2, 0.24, 2)

    ri_1 = rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    ri_2 = rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(ri_1, 0.66, 2)
    assert_almost_equal(ri_2, 0.66, 2)
|
||||
|
||||
|
||||
def uniform_labelings_scores(score_func, n_samples, k_range, n_runs=10, seed=42):
    """Score pairs of random uniform cluster labelings.

    Returns a float array of shape (len(k_range), n_runs) whose (i, j) entry
    is `score_func` applied to two independent uniform labelings with
    k_range[i] distinct labels on run j.
    """
    draw = np.random.RandomState(seed).randint
    rows = []
    for k in k_range:
        row = []
        for _ in range(n_runs):
            labels_a = draw(low=0, high=k, size=n_samples)
            labels_b = draw(low=0, high=k, size=n_samples)
            row.append(score_func(labels_a, labels_b))
        rows.append(row)
    # float dtype matches the original np.zeros-based accumulator
    return np.asarray(rows, dtype=float)
|
||||
|
||||
|
||||
def test_adjustment_for_chance():
    """Adjusted scores must be near zero for random labelings."""
    # Check that adjusted scores are almost zero on random labels
    n_clusters_range = [2, 10, 50, 90]
    n_samples = 100
    n_runs = 10

    scores = uniform_labelings_scores(
        adjusted_rand_score, n_samples, n_clusters_range, n_runs
    )

    # worst observed deviation per cluster count stays tiny
    max_abs_scores = np.abs(scores).max(axis=1)
    assert_array_almost_equal(max_abs_scores, [0.02, 0.03, 0.03, 0.02], 2)
|
||||
|
||||
|
||||
def test_adjusted_mutual_info_score():
    """Check MI, EMI and AMI against precomputed reference values."""
    # Compute the Adjusted Mutual Information and test against known values
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    # Mutual information
    mi = mutual_info_score(labels_a, labels_b)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided sparse contingency
    C = contingency_matrix(labels_a, labels_b, sparse=True)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided dense contingency
    C = contingency_matrix(labels_a, labels_b)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # Expected mutual information
    n_samples = C.sum()
    emi = expected_mutual_information(C, n_samples)
    assert_almost_equal(emi, 0.15042, 5)
    # Adjusted mutual information
    ami = adjusted_mutual_info_score(labels_a, labels_b)
    assert_almost_equal(ami, 0.27821, 5)
    # exact match up to label renaming scores 1
    ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert ami == pytest.approx(1.0)
    # Test with a very large array
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110)
    assert_almost_equal(ami, 0.38, 2)
|
||||
|
||||
|
||||
def test_expected_mutual_info_overflow():
    """EMI must stay <= 1 for very large contingency cells."""
    # Test for regression where contingency cell exceeds 2**16
    # leading to overflow in np.outer, resulting in EMI > 1
    assert expected_mutual_information(np.array([[70000]]), 70000) <= 1
|
||||
|
||||
|
||||
def test_int_overflow_mutual_info_fowlkes_mallows_score():
    """Non-regression: very large cluster counts must not overflow to NaN/inf."""
    # Five clusters in `x`; each cluster splits into a large group (label 0)
    # and a small group (label 1) in `y`.
    big = [52632, 14660, 3271, 814, 316]
    small = [2529, 793, 204, 39, 20]
    x = np.repeat(np.arange(1, 6), [b + s for b, s in zip(big, small)])
    y = np.concatenate([np.repeat([0, 1], [b, s]) for b, s in zip(big, small)])

    assert_all_finite(mutual_info_score(x, y))
    assert_all_finite(fowlkes_mallows_score(x, y))
|
||||
|
||||
|
||||
# TODO(1.10): Remove
def test_public_entropy_deprecation():
    """Calling the public `entropy` alias must emit a FutureWarning."""
    with pytest.warns(FutureWarning, match="Function entropy is deprecated"):
        entropy([0, 0, 42.0])
|
||||
|
||||
|
||||
def test_entropy():
    """Spot-check _entropy on small label sets."""
    assert_almost_equal(_entropy([0, 0, 42.0]), 0.6365141, 5)
    # empty labeling is defined to yield entropy 1 here
    assert_almost_equal(_entropy([]), 1)
    # a constant labeling carries no information
    assert _entropy([1, 1, 1, 1]) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_namespace, device, dtype_name",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
def test_entropy_array_api(array_namespace, device, dtype_name):
    """_entropy must give the same results under array-API dispatch."""
    xp = _array_api_for_tests(array_namespace, device)
    float_labels = xp.asarray(np.asarray([0, 0, 42.0], dtype=dtype_name), device=device)
    empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device)
    int_labels = xp.asarray([1, 1, 1, 1], device=device)
    with config_context(array_api_dispatch=True):
        # same expected values as test_entropy, on xp arrays
        assert _entropy(float_labels) == pytest.approx(0.6365141, abs=1e-5)
        assert _entropy(empty_int32_labels) == 1
        assert _entropy(int_labels) == 0
|
||||
|
||||
|
||||
def test_contingency_matrix():
    """contingency_matrix must match a 2D histogram of the labels."""
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    C = contingency_matrix(labels_a, labels_b)
    C2 = np.histogram2d(labels_a, labels_b, bins=(np.arange(1, 5), np.arange(1, 5)))[0]
    assert_array_almost_equal(C, C2)
    # eps is added uniformly to every cell
    C = contingency_matrix(labels_a, labels_b, eps=0.1)
    assert_array_almost_equal(C, C2 + 0.1)


def test_contingency_matrix_sparse():
    """Sparse and dense contingency matrices must agree; eps forbids sparse."""
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    C = contingency_matrix(labels_a, labels_b)
    C_sparse = contingency_matrix(labels_a, labels_b, sparse=True).toarray()
    assert_array_almost_equal(C, C_sparse)
    with pytest.raises(ValueError, match="Cannot set 'eps' when sparse=True"):
        contingency_matrix(labels_a, labels_b, eps=1e-10, sparse=True)
|
||||
|
||||
|
||||
def test_exactly_zero_info_score():
    """Check numerical stability when the shared information is exactly zero.

    A constant labeling against an all-distinct labeling carries zero mutual
    information; every normalization scheme must return exactly 0.
    """
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int))
        assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0)
        assert v_measure_score(labels_a, labels_b) == pytest.approx(0.0)
        assert adjusted_mutual_info_score(labels_a, labels_b) == 0.0
        # (removed an exact duplicate of the NMI assertion above)
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert (
                adjusted_mutual_info_score(labels_a, labels_b, average_method=method)
                == 0.0
            )
            assert normalized_mutual_info_score(
                labels_a, labels_b, average_method=method
            ) == pytest.approx(0.0)
|
||||
|
||||
|
||||
def test_v_measure_and_mutual_information(seed=36):
    """V-measure equals 2*MI / (H(a) + H(b)), i.e. arithmetic-mean NMI."""
    # Check relation between v_measure, entropy and mutual information
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (
            random_state.randint(0, 10, i),
            random_state.randint(0, 10, i),
        )
        assert_almost_equal(
            v_measure_score(labels_a, labels_b),
            2.0
            * mutual_info_score(labels_a, labels_b)
            / (_entropy(labels_a) + _entropy(labels_b)),
            0,
        )
        avg = "arithmetic"
        assert_almost_equal(
            v_measure_score(labels_a, labels_b),
            normalized_mutual_info_score(labels_a, labels_b, average_method=avg),
        )
|
||||
|
||||
|
||||
def test_fowlkes_mallows_score():
    """FMI on a general case, a renamed perfect match, and a worst case."""
    # General case
    score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])
    assert_almost_equal(score, 4.0 / np.sqrt(12.0 * 6.0))

    # Perfect match but where the label names changed
    perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0])
    assert_almost_equal(perfect_score, 1.0)

    # Worst case
    worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5])
    assert_almost_equal(worst_score, 0.0)
|
||||
|
||||
|
||||
def test_fowlkes_mallows_score_properties():
    """FMI must be symmetric and invariant to label permutations."""
    # handcrafted example
    labels_a = np.array([0, 0, 0, 1, 1, 2])
    labels_b = np.array([1, 1, 2, 2, 0, 0])
    expected = 1.0 / np.sqrt((1.0 + 3.0) * (1.0 + 2.0))
    # FMI = TP / sqrt((TP + FP) * (TP + FN))

    score_original = fowlkes_mallows_score(labels_a, labels_b)
    assert_almost_equal(score_original, expected)

    # symmetric property
    score_symmetric = fowlkes_mallows_score(labels_b, labels_a)
    assert_almost_equal(score_symmetric, expected)

    # permutation property
    score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b)
    assert_almost_equal(score_permuted, expected)

    # symmetric and permutation(both together)
    score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3)
    assert_almost_equal(score_both, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "labels_true, labels_pred",
    [
        (["a"] * 6, [1, 1, 0, 0, 1, 1]),
        ([1] * 6, [1, 1, 0, 0, 1, 1]),
        ([1, 1, 0, 0, 1, 1], ["a"] * 6),
        ([1, 1, 0, 0, 1, 1], [1] * 6),
        (["a"] * 6, ["a"] * 6),
    ],
)
def test_mutual_info_score_positive_constant_label(labels_true, labels_pred):
    """MI must be exactly 0 when either labeling is constant."""
    # Check that MI = 0 when one or both labelling are constant
    # non-regression test for #16355
    assert mutual_info_score(labels_true, labels_pred) == 0
|
||||
|
||||
|
||||
def test_check_clustering_error():
    """Continuous-valued labels must trigger a UserWarning."""
    # Test warning message for continuous values
    rng = np.random.RandomState(42)
    noise = rng.rand(500)
    wavelength = np.linspace(0.01, 1, 500) * 1e-6
    msg = (
        "Clustering metrics expects discrete values but received "
        "continuous values for label, and continuous values for "
        "target"
    )

    with pytest.warns(UserWarning, match=msg):
        check_clusterings(wavelength, noise)
|
||||
|
||||
|
||||
def test_pair_confusion_matrix_fully_dispersed():
    """Edge case: every element is its own cluster."""
    # edge case: every element is its own cluster
    N = 100
    clustering1 = list(range(N))
    clustering2 = clustering1
    # all N*(N-1) ordered pairs are "different cluster" in both labelings
    expected = np.array([[N * (N - 1), 0], [0, 0]])
    assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)


def test_pair_confusion_matrix_single_cluster():
    """Edge case: one cluster contains all elements."""
    # edge case: only one cluster
    N = 100
    clustering1 = np.zeros((N,))
    clustering2 = clustering1
    # all N*(N-1) ordered pairs are "same cluster" in both labelings
    expected = np.array([[0, 0], [0, N * (N - 1)]])
    assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)
|
||||
|
||||
|
||||
def test_pair_confusion_matrix():
    """Compare against a brute-force quadratic reference over ordered pairs."""
    # regular case: different non-trivial clusterings
    n = 10
    N = n**2
    clustering1 = np.hstack([[i + 1] * n for i in range(n)])
    clustering2 = np.hstack([[i + 1] * (n + 1) for i in range(n)])[:N]
    # basic quadratic implementation
    expected = np.zeros(shape=(2, 2), dtype=np.int64)
    for i in range(len(clustering1)):
        for j in range(len(clustering2)):
            if i != j:
                # index by same-cluster indicator in each labeling
                same_cluster_1 = int(clustering1[i] == clustering1[j])
                same_cluster_2 = int(clustering2[i] == clustering2[j])
                expected[same_cluster_1, same_cluster_2] += 1
    assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "clustering1, clustering2",
    [(list(range(100)), list(range(100))), (np.zeros((100,)), np.zeros((100,)))],
)
def test_rand_score_edge_cases(clustering1, clustering2):
    """Identical degenerate clusterings must score a perfect 1.0."""
    # edge case 1: every element is its own cluster
    # edge case 2: only one cluster
    assert_allclose(rand_score(clustering1, clustering2), 1.0)
|
||||
|
||||
|
||||
def test_rand_score():
    """Check rand_score against a hand-computed pair confusion matrix."""
    # regular case: different non-trivial clusterings
    clustering1 = [0, 0, 0, 1, 1, 1]
    clustering2 = [0, 1, 0, 1, 2, 2]
    # pair confusion matrix
    D11 = 2 * 2  # ordered pairs (1, 3), (5, 6)
    D10 = 2 * 4  # ordered pairs (1, 2), (2, 3), (4, 5), (4, 6)
    D01 = 2 * 1  # ordered pair (2, 4)
    D00 = 5 * 6 - D11 - D01 - D10  # the remaining pairs
    # rand score = fraction of pairs both labelings agree on
    expected_numerator = D00 + D11
    expected_denominator = D00 + D01 + D10 + D11
    expected = expected_numerator / expected_denominator
    assert_allclose(rand_score(clustering1, clustering2), expected)
|
||||
|
||||
|
||||
def test_adjusted_rand_score_overflow():
    """Check that large amount of data will not lead to overflow in
    `adjusted_rand_score`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20305
    """
    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, 100_000, dtype=np.int8)
    y_pred = rng.randint(0, 2, 100_000, dtype=np.int8)
    # escalate any RuntimeWarning (e.g. overflow) to a failure
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        adjusted_rand_score(y_true, y_pred)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("average_method", ["min", "arithmetic", "geometric", "max"])
def test_normalized_mutual_info_score_bounded(average_method):
    """Check that nmi returns a score between 0 (included) and 1 (excluded
    for non-perfect match)

    Non-regression test for issue #13836
    """
    labels1 = [0] * 469
    labels2 = [1] + labels1[1:]
    labels3 = [0, 1] + labels1[2:]

    # labels1 is constant. The mutual info between labels1 and any other labelling is 0.
    nmi = normalized_mutual_info_score(labels1, labels2, average_method=average_method)
    assert nmi == 0

    # non constant, non perfect matching labels
    nmi = normalized_mutual_info_score(labels2, labels3, average_method=average_method)
    assert 0 <= nmi < 1
|
||||
|
||||
|
||||
# TODO(1.9): remove
@pytest.mark.parametrize("sparse", [True, False])
def test_fowlkes_mallows_sparse_deprecated(sparse):
    """Check deprecation warning for 'sparse' parameter of fowlkes_mallows_score."""
    # passing the parameter at all (either value) must warn
    with pytest.warns(
        FutureWarning, match="The 'sparse' parameter was deprecated in 1.7"
    ):
        fowlkes_mallows_score([0, 1], [1, 1], sparse=sparse)
|
||||
@@ -0,0 +1,413 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn.metrics import pairwise_distances
|
||||
from sklearn.metrics.cluster import (
|
||||
calinski_harabasz_score,
|
||||
davies_bouldin_score,
|
||||
silhouette_samples,
|
||||
silhouette_score,
|
||||
)
|
||||
from sklearn.metrics.cluster._unsupervised import _silhouette_reduce
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils.fixes import (
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
DOK_CONTAINERS,
|
||||
LIL_CONTAINERS,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "sparse_container",
    [None] + CSR_CONTAINERS + CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
@pytest.mark.parametrize("sample_size", [None, "half"])
def test_silhouette(sparse_container, sample_size):
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X, y = dataset.data, dataset.target
    if sparse_container is not None:
        X = sparse_container(X)
    # "half" sentinel -> subsample half of the dataset
    sample_size = int(X.shape[0] / 2) if sample_size == "half" else sample_size

    D = pairwise_distances(X, metric="euclidean")
    # Given that the actual labels are used, we can assume that S would be positive.
    score_precomputed = silhouette_score(
        D, y, metric="precomputed", sample_size=sample_size, random_state=0
    )
    score_euclidean = silhouette_score(
        X, y, metric="euclidean", sample_size=sample_size, random_state=0
    )
    assert score_precomputed > 0
    assert score_euclidean > 0
    # precomputed distances and on-the-fly euclidean must agree
    assert score_precomputed == pytest.approx(score_euclidean)
|
||||
|
||||
|
||||
def test_cluster_size_1():
    """Check the conventions for singleton and all-identical clusters."""
    # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
    # (cluster 0). We also test the case where there are identical samples
    # as the only members of a cluster (cluster 2). To our knowledge, this case
    # is not discussed in reference material, and we choose for it a sample
    # score of 1.
    X = [[0.0], [1.0], [1.0], [2.0], [3.0], [3.0]]
    labels = np.array([0, 1, 1, 1, 2, 2])

    # Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
    # Cluster 1: intra-cluster = [.5, .5, 1]
    #            inter-cluster = [1, 1, 1]
    #            silhouette    = [.5, .5, 0]
    # Cluster 2: intra-cluster = [0, 0]
    #            inter-cluster = [arbitrary, arbitrary]
    #            silhouette    = [1., 1.]

    silhouette = silhouette_score(X, labels)
    assert not np.isnan(silhouette)
    ss = silhouette_samples(X, labels)
    assert_array_equal(ss, [0, 0.5, 0.5, 0, 1, 1])
|
||||
|
||||
|
||||
def test_silhouette_paper_example():
    """Check per-sample silhouettes against Rousseeuw (1987).

    BUGFIX: the original code called ``pytest.approx(expected, actual, ...)``
    without an ``assert`` — the comparison object was built and discarded (and
    the actual value landed in the ``rel`` slot), so the test could never
    fail. The comparisons are now real assertions.
    """
    # Data from Table 1 (lower triangle of the 12x12 dissimilarity matrix)
    # fmt: off
    lower = [
        5.58, 7.00, 6.50, 7.08, 7.00, 3.83, 4.83, 5.08, 8.17, 5.83, 2.17,
        5.75, 6.67, 6.92, 4.92, 6.42, 5.00, 5.58, 6.00, 4.67, 6.42, 3.42,
        5.50, 6.42, 6.42, 5.00, 3.92, 6.17, 2.50, 4.92, 6.25, 7.33, 4.50,
        2.25, 6.33, 2.75, 6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92,
        6.17, 5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67,
        4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92,
    ]
    # fmt: on
    D = np.zeros((12, 12))
    D[np.tril_indices(12, -1)] = lower
    D += D.T

    names = [
        "BEL", "BRA", "CHI", "CUB", "EGY", "FRA",
        "IND", "ISR", "USA", "USS", "YUG", "ZAI",
    ]

    # Data from Figure 2
    labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
    expected1 = {
        "USA": 0.43, "BEL": 0.39, "FRA": 0.35, "ISR": 0.30, "BRA": 0.22,
        "EGY": 0.20, "ZAI": 0.19, "CUB": 0.40, "USS": 0.34, "CHI": 0.33,
        "YUG": 0.26, "IND": -0.04,
    }
    score1 = 0.28

    # Data from Figure 3
    labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
    expected2 = {
        "USA": 0.47, "FRA": 0.44, "BEL": 0.42, "ISR": 0.37, "EGY": 0.02,
        "ZAI": 0.28, "BRA": 0.25, "IND": 0.17, "CUB": 0.48, "USS": 0.44,
        "YUG": 0.31, "CHI": 0.31,
    }
    score2 = 0.33

    for labels, expected, score in [
        (labels1, expected1, score1),
        (labels2, expected2, score2),
    ]:
        expected = [expected[name] for name in names]
        # we check to 2dp because that's what's in the paper
        assert silhouette_samples(
            D, np.array(labels), metric="precomputed"
        ) == pytest.approx(expected, abs=1e-2)
        assert silhouette_score(
            D, np.array(labels), metric="precomputed"
        ) == pytest.approx(score, abs=1e-2)
|
||||
|
||||
|
||||
def test_correct_labelsize():
    """silhouette_score must reject labelings with n_labels outside [2, n-1]."""
    X = datasets.load_iris().data

    # degenerate labelings: every sample its own cluster, then a single cluster
    for y in (np.arange(X.shape[0]), np.zeros(X.shape[0])):
        err_msg = (
            r"Number of labels is %d\. Valid values are 2 "
            r"to n_samples - 1 \(inclusive\)" % len(np.unique(y))
        )
        with pytest.raises(ValueError, match=err_msg):
            silhouette_score(X, y)
|
||||
|
||||
|
||||
def test_non_encoded_labels():
    """Silhouette must be invariant to affine relabeling of the clusters."""
    dataset = datasets.load_iris()
    X = dataset.data
    labels = dataset.target
    # labels * 2 + 10 keeps the same partition with different label values
    assert silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels)
    assert_array_equal(
        silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels)
    )
|
||||
|
||||
|
||||
def test_non_numpy_labels():
    """Plain Python lists must give the same score as numpy arrays."""
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    assert silhouette_score(list(X), list(y)) == silhouette_score(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_silhouette_nonzero_diag(dtype):
    # Make sure silhouette_samples requires diagonal to be zero.
    # Non-regression test for #12178

    # Construct a zero-diagonal matrix
    dists = pairwise_distances(
        np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]], dtype=dtype).T
    )
    labels = [0, 0, 0, 1, 1, 1]

    # small values on the diagonal are OK
    dists[2][2] = np.finfo(dists.dtype).eps * 10
    silhouette_samples(dists, labels, metric="precomputed")

    # values bigger than eps * 100 are not
    dists[2][2] = np.finfo(dists.dtype).eps * 1000
    with pytest.raises(ValueError, match="contains non-zero"):
        silhouette_samples(dists, labels, metric="precomputed")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "sparse_container",
    CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_precomputed_sparse(sparse_container):
    """Check that silhouette_samples works for sparse matrices correctly."""
    points = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
    labels = [0, 0, 0, 0, 1, 1, 1, 1]
    dense_dists = pairwise_distances(points)
    sparse_dists = sparse_container(dense_dists)
    assert issparse(sparse_dists)
    # Sparse and dense precomputed inputs must yield the same per-sample scores.
    from_sparse = silhouette_samples(sparse_dists, labels, metric="precomputed")
    from_dense = silhouette_samples(dense_dists, labels, metric="precomputed")
    assert_allclose(from_sparse, from_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "sparse_container",
    CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_euclidean_sparse(sparse_container):
    """Check that silhouette_samples works for sparse matrices correctly."""
    points = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
    labels = [0, 0, 0, 0, 1, 1, 1, 1]
    dense_dists = pairwise_distances(points)
    sparse_dists = sparse_container(dense_dists)
    assert issparse(sparse_dists)
    # With the default (euclidean) metric, sparse input must match dense input.
    from_sparse = silhouette_samples(sparse_dists, labels)
    from_dense = silhouette_samples(dense_dists, labels)
    assert_allclose(from_sparse, from_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "sparse_container", CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS
)
def test_silhouette_reduce(sparse_container):
    """Check for non-CSR input to private method `_silhouette_reduce`."""
    points = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
    labels = [0, 0, 0, 0, 1, 1, 1, 1]
    # Every parametrized container here is sparse but deliberately NOT CSR.
    non_csr_dists = sparse_container(pairwise_distances(points))
    freqs = np.bincount(labels)
    with pytest.raises(
        TypeError,
        match="Expected CSR matrix. Please pass sparse matrix in CSR format.",
    ):
        _silhouette_reduce(non_csr_dists, start=0, labels=labels, label_freqs=freqs)
|
||||
|
||||
|
||||
def assert_raises_on_only_one_label(func):
    """Assert that `func` rejects an input where every sample shares one label."""
    rng = np.random.RandomState(seed=0)
    single_label = np.zeros(10)
    with pytest.raises(ValueError, match="Number of labels is"):
        func(rng.rand(10, 2), single_label)
|
||||
|
||||
|
||||
def assert_raises_on_all_points_same_cluster(func):
    """Assert that `func` rejects an input where every sample is its own cluster."""
    rng = np.random.RandomState(seed=0)
    one_label_per_sample = np.arange(10)
    with pytest.raises(ValueError, match="Number of labels is"):
        func(rng.rand(10, 2), one_label_per_sample)
|
||||
|
||||
|
||||
def test_calinski_harabasz_score():
    """Check error conditions and known values of `calinski_harabasz_score`."""
    assert_raises_on_only_one_label(calinski_harabasz_score)

    assert_raises_on_all_points_same_cluster(calinski_harabasz_score)

    # Assert the value is 1. when all samples are equals
    assert 1.0 == calinski_harabasz_score(np.ones((10, 2)), [0] * 5 + [1] * 5)

    # Assert the value is 0. when all the mean cluster are equal
    assert 0.0 == calinski_harabasz_score([[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10)

    # General case (with non numpy arrays)
    X = (
        [[0, 0], [1, 1]] * 5
        + [[3, 3], [4, 4]] * 5
        + [[0, 4], [1, 3]] * 5
        + [[3, 1], [4, 0]] * 5
    )
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # BUG FIX: the original line called `pytest.approx(score, expected)` without
    # an assert — `approx`'s second positional argument is `rel` (a tolerance),
    # and the discarded return value means nothing was checked at all.
    assert calinski_harabasz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1))
    )
|
||||
|
||||
|
||||
def test_davies_bouldin_score():
    """Check error conditions and known values of `davies_bouldin_score`."""
    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equals
    assert davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5) == pytest.approx(
        0.0
    )

    # Assert the value is 0. when all the mean cluster are equal
    assert davies_bouldin_score(
        [[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10
    ) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = (
        [[0, 0], [1, 1]] * 5
        + [[3, 3], [4, 4]] * 5
        + [[0, 4], [1, 3]] * 5
        + [[3, 1], [4, 0]] * 5
    )
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # BUG FIX: `pytest.approx(score, expected)` was called without an assert —
    # its second positional argument is `rel` (a tolerance) and the return
    # value was discarded, so the comparison was never performed.
    assert davies_bouldin_score(X, labels) == pytest.approx(2 * np.sqrt(0.5) / 3)

    # Ensure divide by zero warning is not raised in general case
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        davies_bouldin_score(X, labels)

    # General case - cluster have one sample
    X = [[0, 0], [2, 2], [3, 3], [5, 5]]
    labels = [0, 0, 1, 2]
    # BUG FIX: same no-op `pytest.approx(...)` call turned into a real assert.
    assert davies_bouldin_score(X, labels) == pytest.approx((5.0 / 4) / 3)
|
||||
|
||||
|
||||
def test_silhouette_score_integer_precomputed():
    """Check that silhouette_score works for precomputed metrics that are integers.

    Non-regression test for #22107.
    """
    int_dists = [[0, 1, 2], [1, 0, 1], [2, 1, 0]]
    labels = [0, 0, 1]
    score = silhouette_score(int_dists, labels, metric="precomputed")
    assert score == pytest.approx(1 / 6)

    # non-zero on diagonal for ints raises an error
    bad_dists = [[1, 1, 2], [1, 0, 1], [2, 1, 0]]
    with pytest.raises(ValueError, match="contains non-zero"):
        silhouette_score(bad_dists, labels, metric="precomputed")
|
||||
@@ -0,0 +1,49 @@
|
||||
# Meson build rules for the sklearn/metrics subpackage (Cython extensions).

# Metrics is cimported from other subpackages so this is needed for the cimport
# to work
metrics_cython_tree = [
  fs.copyfile('__init__.py')
]
# Some metrics code cimports code from utils, we may as well copy all the necessary files
metrics_cython_tree += utils_cython_tree

# Generate _dist_metrics.pxd from its Tempita template.
_dist_metrics_pxd = custom_target(
  '_dist_metrics_pxd',
  output: '_dist_metrics.pxd',
  input: '_dist_metrics.pxd.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # Need to install the generated pxd because it is needed in other subpackages
  # Cython code, e.g. sklearn.cluster
  install_dir: sklearn_dir / 'metrics',
  install: true,
)
metrics_cython_tree += [_dist_metrics_pxd]

# Generate _dist_metrics.pyx from its Tempita template.
_dist_metrics_pyx = custom_target(
  '_dist_metrics_pyx',
  output: '_dist_metrics.pyx',
  input: '_dist_metrics.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is
  # temporary work-around for dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: metrics_cython_tree,
)

# Compile the generated _dist_metrics.pyx into an extension module.
_dist_metrics = py.extension_module(
  '_dist_metrics',
  cython_gen.process(_dist_metrics_pyx),
  dependencies: [np_dep],
  subdir: 'sklearn/metrics',
  install: true
)

py.extension_module(
  '_pairwise_fast',
  [cython_gen.process('_pairwise_fast.pyx'), metrics_cython_tree],
  dependencies: [openmp_dep],
  subdir: 'sklearn/metrics',
  install: true
)

# Recurse into the nested subpackages that carry their own meson.build files.
subdir('_pairwise_distances_reduction')
subdir('cluster')
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user