@@ -0,0 +1,268 @@
"""Test the label propagation module."""

import warnings

import numpy as np
import pytest
from scipy.sparse import issparse

from sklearn.datasets import make_classification
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.semi_supervised import _label_propagation as label_propagation
from sklearn.utils._testing import (
    _convert_container,
    assert_allclose,
    assert_array_equal,
)

SPARSE_TYPES = ("sparse_csr", "sparse_csc", "sparse_csr_array", "sparse_csc_array")
CONSTRUCTOR_TYPES = ("array",) + SPARSE_TYPES

ESTIMATORS = [
    (label_propagation.LabelPropagation, {"kernel": "rbf"}),
    (label_propagation.LabelPropagation, {"kernel": "knn", "n_neighbors": 2}),
    (
        label_propagation.LabelPropagation,
        {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)},
    ),
    (label_propagation.LabelSpreading, {"kernel": "rbf"}),
    (label_propagation.LabelSpreading, {"kernel": "knn", "n_neighbors": 2}),
    (
        label_propagation.LabelSpreading,
        {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)},
    ),
]

LP_ESTIMATORS = [
    (klass, params)
    for (klass, params) in ESTIMATORS
    if klass == label_propagation.LabelPropagation
]


@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_fit_transduction(global_dtype, Estimator, parameters):
    samples = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype)
    labels = [0, 1, -1]
    clf = Estimator(**parameters).fit(samples, labels)
    assert clf.transduction_[2] == 1


@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_distribution(global_dtype, Estimator, parameters):
    if parameters["kernel"] == "knn":
        pytest.skip(
            "Unstable test for this configuration: changes in k-NN ordering break it."
        )
    samples = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=global_dtype)
    labels = [0, 1, -1]
    clf = Estimator(**parameters).fit(samples, labels)
    assert_allclose(clf.label_distributions_[2], [0.5, 0.5], atol=1e-2)


@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_predict(global_dtype, Estimator, parameters):
    samples = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype)
    labels = [0, 1, -1]
    clf = Estimator(**parameters).fit(samples, labels)
    assert_array_equal(clf.predict([[0.5, 2.5]]), np.array([1]))


@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_predict_proba(global_dtype, Estimator, parameters):
    samples = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], dtype=global_dtype)
    labels = [0, 1, -1]
    clf = Estimator(**parameters).fit(samples, labels)
    # [1.0, 1.0] is equidistant from the two labeled points, so both classes
    # should be equally likely.
    assert_allclose(clf.predict_proba([[1.0, 1.0]]), np.array([[0.5, 0.5]]))


@pytest.mark.parametrize("alpha", [0.1, 0.3, 0.5, 0.7, 0.9])
def test_label_spreading_closed_form(global_dtype, alpha):
    # Note: the body does not depend on the ESTIMATORS parametrization, so it
    # is not applied here.
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0)
    X = X.astype(global_dtype, copy=False)
    y[::3] = -1  # mark every third sample as unlabeled

    gamma = 0.1
    clf = label_propagation.LabelSpreading(gamma=gamma).fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()
    Y = np.zeros((len(y), n_classes + 1), dtype=X.dtype)
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]

    expected = np.dot(np.linalg.inv(np.eye(len(S), dtype=S.dtype) - alpha * S), Y)
    expected /= expected.sum(axis=1)[:, np.newaxis]

    clf = label_propagation.LabelSpreading(
        max_iter=100, alpha=alpha, tol=1e-10, gamma=gamma
    )
    clf.fit(X, y)

    assert_allclose(expected, clf.label_distributions_)
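
# For reference, the closed form checked above comes from Zhou et al. (2004),
# "Learning with Local and Global Consistency": with the normalized graph
# S = D^{-1/2} W D^{-1/2}, the stationary label matrix is
#
#   F^* = (1 - alpha) * (I - alpha * S)^{-1} Y
#
# The scalar factor (1 - alpha) drops out under the row normalization applied
# to `expected` above.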


def test_label_propagation_closed_form(global_dtype):
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0)
    X = X.astype(global_dtype, copy=False)
    y[::3] = -1
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    unlabelled_idx = Y[:, (-1,)].nonzero()[0]
    labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0]

    clf = label_propagation.LabelPropagation(max_iter=100, tol=1e-10, gamma=0.1)
    clf.fit(X, y)
    # adopting notation from Zhu et al 2002
    T_bar = clf._build_graph()
    Tuu = T_bar[tuple(np.meshgrid(unlabelled_idx, unlabelled_idx, indexing="ij"))]
    Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx, indexing="ij"))]
    Y = Y[:, :-1]
    Y_l = Y[labelled_idx, :]
    Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l)

    expected = Y.copy()
    expected[unlabelled_idx, :] = Y_u
    expected /= expected.sum(axis=1)[:, np.newaxis]

    assert_allclose(expected, clf.label_distributions_, atol=1e-4)
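
# For reference, the closed form above is from Zhu & Ghahramani (2002): with
# the row-normalized transition matrix T_bar split into labeled (l) and
# unlabeled (u) blocks, the fixed point for the unlabeled rows is
#
#   Y_u = (I - T_uu)^{-1} T_ul Y_l
#
# which is exactly what the test computes before re-normalizing the rows.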


@pytest.mark.parametrize("accepted_sparse_type", SPARSE_TYPES)
@pytest.mark.parametrize("index_dtype", [np.int32, np.int64])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS)
def test_sparse_input_types(
    accepted_sparse_type, index_dtype, dtype, Estimator, parameters
):
    # This is a non-regression test for #17085
    X = _convert_container([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], accepted_sparse_type)
    X.data = X.data.astype(dtype, copy=False)
    X.indices = X.indices.astype(index_dtype, copy=False)
    X.indptr = X.indptr.astype(index_dtype, copy=False)
    labels = [0, 1, -1]
    clf = Estimator(**parameters).fit(X, labels)
    assert_array_equal(clf.predict([[0.5, 2.5]]), np.array([1]))


@pytest.mark.parametrize("constructor", CONSTRUCTOR_TYPES)
@pytest.mark.parametrize("Estimator, parameters", LP_ESTIMATORS)
def test_label_propagation_build_graph_normalized(constructor, Estimator, parameters):
    # required but unused X and labels values
    X = np.array([[1.0, 0.0], [1.0, 1.0], [1.0, 3.0]])
    labels = [0, 1, -1]

    # test normalization of an affinity matrix: each row of `expected` is the
    # corresponding row of `aff_matrix` divided by that row's sum.
    aff_matrix = np.array([[1.0, 1.0, 0.0], [2.0, 1.0, 1.0], [0.0, 1.0, 3.0]])
    expected = np.array([[0.5, 0.5, 0.0], [0.5, 0.25, 0.25], [0.0, 0.25, 0.75]])

    def kernel_affinity_matrix(x, y=None):
        return _convert_container(aff_matrix, constructor)

    clf = Estimator(kernel=kernel_affinity_matrix).fit(X, labels)
    graph = clf._build_graph()
    assert_allclose(graph.sum(axis=1), 1)  # normalized rows

    if issparse(graph):
        graph = graph.toarray()
    assert_allclose(graph, expected)


@pytest.mark.parametrize("constructor_type", CONSTRUCTOR_TYPES)
def test_convergence_speed(constructor_type):
    # This is a non-regression test for #5774
    X = _convert_container([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], constructor_type)
    y = np.array([0, 1, -1])
    mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=5000)
    mdl.fit(X, y)

    # this should converge quickly:
    assert mdl.n_iter_ < 10
    assert_array_equal(mdl.predict(X), [0, 1, 1])
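
# Background for the convergence tests above and below: both estimators
# repeat a propagation step until `tol` is met or `max_iter` is reached. In
# outline (details live in `_label_propagation.py`), LabelSpreading iterates
#
#   Y^(t+1) = alpha * S @ Y^(t) + (1 - alpha) * Y^(0)
#
# while LabelPropagation iterates Y^(t+1) = T_bar @ Y^(t), renormalizes the
# rows, and clamps the rows of the originally labeled samples back to their
# one-hot labels.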


def test_convergence_warning():
    # This is a non-regression test for #5774
    X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]])
    y = np.array([0, 1, -1])
    mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=1)
    warn_msg = "max_iter=1 was reached without convergence."
    with pytest.warns(ConvergenceWarning, match=warn_msg):
        mdl.fit(X, y)
    assert mdl.n_iter_ == mdl.max_iter

    mdl = label_propagation.LabelPropagation(kernel="rbf", max_iter=1)
    with pytest.warns(ConvergenceWarning, match=warn_msg):
        mdl.fit(X, y)
    assert mdl.n_iter_ == mdl.max_iter

    # With a generous iteration budget, no ConvergenceWarning should be raised.
    mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=500)
    with warnings.catch_warnings():
        warnings.simplefilter("error", ConvergenceWarning)
        mdl.fit(X, y)

    mdl = label_propagation.LabelPropagation(kernel="rbf", max_iter=500)
    with warnings.catch_warnings():
        warnings.simplefilter("error", ConvergenceWarning)
        mdl.fit(X, y)


@pytest.mark.parametrize(
    "LabelPropagationCls",
    [label_propagation.LabelSpreading, label_propagation.LabelPropagation],
)
def test_label_propagation_non_zero_normalizer(LabelPropagationCls):
    # check that we don't divide by zero in case of null normalizer
    # non-regression test for
    # https://github.com/scikit-learn/scikit-learn/pull/15946
    # https://github.com/scikit-learn/scikit-learn/issues/9292
    X = np.array([[100.0, 100.0], [100.0, 100.0], [0.0, 0.0], [0.0, 0.0]])
    y = np.array([0, 1, -1, -1])
    mdl = LabelPropagationCls(kernel="knn", max_iter=100, n_neighbors=1)
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        mdl.fit(X, y)


def test_predict_sparse_callable_kernel(global_dtype):
    # This is a non-regression test for #15866

    # Custom sparse kernel (top-K RBF)
    def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean", n_jobs=2)
        nn.fit(X)
        # Squared-distance k-NN graph turned into RBF weights, kept sparse.
        W = -1 * nn.kneighbors_graph(Y, mode="distance").power(2) * gamma
        np.exp(W.data, out=W.data)
        assert issparse(W)
        return W.T

    n_classes = 4
    n_samples = 500
    n_test = 10
    X, y = make_classification(
        n_classes=n_classes,
        n_samples=n_samples,
        n_features=20,
        n_informative=20,
        n_redundant=0,
        n_repeated=0,
        random_state=0,
    )
    X = X.astype(global_dtype)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_test, random_state=0
    )

    model = label_propagation.LabelSpreading(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9

    model = label_propagation.LabelPropagation(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9
@@ -0,0 +1,380 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from math import ceil

import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sklearn.base import clone
from sklearn.datasets import load_iris, make_blobs
from sklearn.ensemble import StackingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC
from sklearn.tests.test_pipeline import SimpleEstimator
from sklearn.tree import DecisionTreeClassifier

# load the iris dataset and randomly permute it
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)

n_labeled_samples = 50

y_train_missing_labels = y_train.copy()
y_train_missing_labels[n_labeled_samples:] = -1
mapping = {0: "A", 1: "B", 2: "C", -1: "-1"}
y_train_missing_strings = np.vectorize(mapping.get)(y_train_missing_labels).astype(
    object
)
y_train_missing_strings[y_train_missing_labels == -1] = -1
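

# For orientation, a minimal sketch of the threshold-based self-training loop
# that these tests exercise. This is an illustration only, not the actual
# `SelfTrainingClassifier` implementation (which additionally supports the
# "k_best" criterion, verbosity, and termination-condition bookkeeping); the
# -1 sentinel marks unlabeled samples, as in the fixtures above.
def _self_training_sketch(estimator, X, y, threshold=0.75, max_iter=10):
    y = np.asarray(y).copy()
    for _ in range(max_iter):
        labeled = y != -1
        if labeled.all():
            break  # nothing left to pseudo-label ("all_labeled")
        estimator.fit(X[labeled], y[labeled])
        proba = estimator.predict_proba(X[~labeled])
        confident = proba.max(axis=1) > threshold
        if not confident.any():
            break  # no prediction is confident enough ("no_change")
        # Pseudo-label the confident samples and iterate.
        new_labels = estimator.classes_[proba.argmax(axis=1)]
        unlabeled_idx = np.flatnonzero(~labeled)
        y[unlabeled_idx[confident]] = new_labels[confident]
    # Final fit on everything that ended up labeled.
    return estimator.fit(X[y != -1], y[y != -1])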


def test_warns_k_best():
    st = SelfTrainingClassifier(KNeighborsClassifier(), criterion="k_best", k_best=1000)
    with pytest.warns(UserWarning, match="k_best is larger than"):
        st.fit(X_train, y_train_missing_labels)

    assert st.termination_condition_ == "all_labeled"


@pytest.mark.parametrize(
    "estimator",
    [KNeighborsClassifier(), LogisticRegression()],
)
@pytest.mark.parametrize("selection_crit", ["threshold", "k_best"])
def test_classification(estimator, selection_crit):
    estimator = clone(estimator)  # Avoid side effects from previous tests.
    # Check classification for various parameter settings.
    # Also assert that predictions for string and numerical labels are equal.
    threshold = 0.75
    max_iter = 10
    st = SelfTrainingClassifier(
        estimator, max_iter=max_iter, threshold=threshold, criterion=selection_crit
    )
    st.fit(X_train, y_train_missing_labels)
    pred = st.predict(X_test)
    proba = st.predict_proba(X_test)

    st_string = SelfTrainingClassifier(
        estimator, max_iter=max_iter, criterion=selection_crit, threshold=threshold
    )
    st_string.fit(X_train, y_train_missing_strings)
    pred_string = st_string.predict(X_test)
    proba_string = st_string.predict_proba(X_test)

    assert_array_equal(np.vectorize(mapping.get)(pred), pred_string)
    assert_array_equal(proba, proba_string)

    assert st.termination_condition_ == st_string.termination_condition_
    # Check consistency between labeled_iter_, n_iter_ and max_iter
    labeled = y_train_missing_labels != -1
    # assert that labeled samples have labeled_iter = 0
    assert_array_equal(st.labeled_iter_ == 0, labeled)
    # assert that labeled samples do not change label during training
    assert_array_equal(y_train_missing_labels[labeled], st.transduction_[labeled])

    # assert that the max of the iterations does not exceed the total number
    # of iterations
    assert np.max(st.labeled_iter_) <= st.n_iter_ <= max_iter
    assert np.max(st_string.labeled_iter_) <= st_string.n_iter_ <= max_iter

    # check shapes
    assert st.labeled_iter_.shape == st.transduction_.shape
    assert st_string.labeled_iter_.shape == st_string.transduction_.shape


def test_k_best():
    st = SelfTrainingClassifier(
        KNeighborsClassifier(n_neighbors=1),
        criterion="k_best",
        k_best=10,
        max_iter=None,
    )
    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1
    n_samples = y_train.shape[0]

    n_expected_iter = ceil((n_samples - 1) / 10)
    st.fit(X_train, y_train_only_one_label)
    assert st.n_iter_ == n_expected_iter

    # Check labeled_iter_
    assert np.sum(st.labeled_iter_ == 0) == 1
    for i in range(1, n_expected_iter):
        assert np.sum(st.labeled_iter_ == i) == 10
    assert np.sum(st.labeled_iter_ == n_expected_iter) == (n_samples - 1) % 10
    assert st.termination_condition_ == "all_labeled"
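
# Worked example of the arithmetic above, assuming the default 25% test split
# of iris (150 samples, so X_train has 112 rows): with one labeled sample,
# 111 remain unlabeled, n_expected_iter = ceil(111 / 10) = 12, the first 11
# k_best iterations each add 10 labels, and the final one adds 111 % 10 = 1.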


def test_sanity_classification():
    estimator = SVC(gamma="scale", probability=True)
    estimator.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:])

    st = SelfTrainingClassifier(estimator)
    st.fit(X_train, y_train_missing_labels)

    pred1, pred2 = estimator.predict(X_test), st.predict(X_test)
    assert not np.array_equal(pred1, pred2)
    score_supervised = accuracy_score(estimator.predict(X_test), y_test)
    score_self_training = accuracy_score(st.predict(X_test), y_test)

    assert score_self_training > score_supervised


def test_none_iter():
    # Check that all samples were labeled after a 'reasonable' number of
    # iterations.
    st = SelfTrainingClassifier(KNeighborsClassifier(), threshold=0.55, max_iter=None)
    st.fit(X_train, y_train_missing_labels)

    assert st.n_iter_ < 10
    assert st.termination_condition_ == "all_labeled"


@pytest.mark.parametrize(
    "estimator",
    [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)],
)
@pytest.mark.parametrize("y", [y_train_missing_labels, y_train_missing_strings])
def test_zero_iterations(estimator, y):
    estimator = clone(estimator)  # Avoid side effects from previous tests.
    # Check classification for zero iterations.
    # Fitting a SelfTrainingClassifier with zero iterations should give the
    # same results as fitting a supervised classifier.
    # This also asserts that string arrays work as expected.

    clf1 = SelfTrainingClassifier(estimator, max_iter=0)

    clf1.fit(X_train, y)

    clf2 = estimator.fit(X_train[:n_labeled_samples], y[:n_labeled_samples])

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
    assert clf1.termination_condition_ == "max_iter"


def test_prefitted_throws_error():
    # Test that passing a pre-fitted classifier and calling predict throws an
    # error.
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    st = SelfTrainingClassifier(knn)
    with pytest.raises(
        NotFittedError,
        match="This SelfTrainingClassifier instance is not fitted yet",
    ):
        st.predict(X_train)


@pytest.mark.parametrize("max_iter", range(1, 5))
def test_labeled_iter(max_iter):
    # Check that the number of datapoints labeled in iteration 0 is equal to
    # the number of labeled datapoints we passed.
    st = SelfTrainingClassifier(KNeighborsClassifier(), max_iter=max_iter)

    st.fit(X_train, y_train_missing_labels)
    amount_iter_0 = len(st.labeled_iter_[st.labeled_iter_ == 0])
    assert amount_iter_0 == n_labeled_samples
    # Check that the max of the iterations does not exceed the total number
    # of iterations.
    assert np.max(st.labeled_iter_) <= st.n_iter_ <= max_iter


def test_no_unlabeled():
    # Test that training on a fully labeled dataset produces the same results
    # as training the classifier by itself.
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    st = SelfTrainingClassifier(knn)
    with pytest.warns(UserWarning, match="y contains no unlabeled samples"):
        st.fit(X_train, y_train)
    assert_array_equal(knn.predict(X_test), st.predict(X_test))
    # Assert that all samples were labeled in iteration 0 (since there were no
    # unlabeled samples).
    assert np.all(st.labeled_iter_ == 0)
    assert st.termination_condition_ == "all_labeled"


def test_early_stopping():
    svc = SVC(gamma="scale", probability=True)
    st = SelfTrainingClassifier(svc)
    X_train_easy = [[1], [0], [1], [0.5]]
    y_train_easy = [1, 0, -1, -1]
    # X = [[0.5]] cannot be predicted on with a high confidence, so training
    # stops early.
    st.fit(X_train_easy, y_train_easy)
    assert st.n_iter_ == 1
    assert st.termination_condition_ == "no_change"


def test_strings_dtype():
    clf = SelfTrainingClassifier(KNeighborsClassifier())
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    labels_multiclass = ["one", "two", "three"]

    y_strings = np.take(labels_multiclass, y)

    with pytest.raises(ValueError, match="dtype"):
        clf.fit(X, y_strings)


@pytest.mark.parametrize("verbose", [True, False])
def test_verbose(capsys, verbose):
    clf = SelfTrainingClassifier(KNeighborsClassifier(), verbose=verbose)
    clf.fit(X_train, y_train_missing_labels)

    captured = capsys.readouterr()

    if verbose:
        assert "iteration" in captured.out
    else:
        assert "iteration" not in captured.out


def test_verbose_k_best(capsys):
    st = SelfTrainingClassifier(
        KNeighborsClassifier(n_neighbors=1),
        criterion="k_best",
        k_best=10,
        verbose=True,
        max_iter=None,
    )

    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1
    n_samples = y_train.shape[0]

    n_expected_iter = ceil((n_samples - 1) / 10)
    st.fit(X_train, y_train_only_one_label)

    captured = capsys.readouterr()

    msg = "End of iteration {}, added {} new labels."
    for i in range(1, n_expected_iter):
        assert msg.format(i, 10) in captured.out

    assert msg.format(n_expected_iter, (n_samples - 1) % 10) in captured.out


def test_k_best_selects_best():
    # Tests that the labels added by st really are the 10 best labels.
    est = LogisticRegression(random_state=0)
    st = SelfTrainingClassifier(est, criterion="k_best", max_iter=1, k_best=10)
    has_label = y_train_missing_labels != -1
    st.fit(X_train, y_train_missing_labels)

    got_label = ~has_label & (st.transduction_ != -1)

    est.fit(X_train[has_label], y_train_missing_labels[has_label])
    pred = est.predict_proba(X_train[~has_label])
    max_proba = np.max(pred, axis=1)

    most_confident_est = X_train[~has_label][np.argsort(max_proba)[-10:]]
    added_by_st = X_train[np.where(got_label)].tolist()

    for row in most_confident_est.tolist():
        assert row in added_by_st


def test_estimator_meta_estimator():
    # Check that a meta-estimator relying on an estimator implementing
    # `predict_proba` will work even if it does not expose this method before
    # being fitted.
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/19119

    estimator = StackingClassifier(
        estimators=[
            ("svc_1", SVC(probability=True)),
            ("svc_2", SVC(probability=True)),
        ],
        final_estimator=SVC(probability=True),
        cv=2,
    )

    assert hasattr(estimator, "predict_proba")
    clf = SelfTrainingClassifier(estimator=estimator)
    clf.fit(X_train, y_train_missing_labels)
    clf.predict_proba(X_test)

    estimator = StackingClassifier(
        estimators=[
            ("svc_1", SVC(probability=False)),
            ("svc_2", SVC(probability=False)),
        ],
        final_estimator=SVC(probability=False),
        cv=2,
    )

    assert not hasattr(estimator, "predict_proba")
    clf = SelfTrainingClassifier(estimator=estimator)
    with pytest.raises(AttributeError):
        clf.fit(X_train, y_train_missing_labels)


def test_self_training_estimator_attribute_error():
    """Check that we raise the proper AttributeErrors when the `estimator`
    does not implement the `predict_proba` method, which is called from within
    `fit`, or `decision_function`, which is decorated with `available_if`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28108
    """
    # `SVC` with `probability=False` does not implement `predict_proba`, which
    # is required internally in `fit` of `SelfTrainingClassifier`. We expect
    # an AttributeError to be raised.
    estimator = SVC(probability=False, gamma="scale")
    self_training = SelfTrainingClassifier(estimator)

    with pytest.raises(AttributeError, match="has no attribute 'predict_proba'"):
        self_training.fit(X_train, y_train_missing_labels)

    # `DecisionTreeClassifier` does not implement `decision_function` and
    # should raise an AttributeError.
    self_training = SelfTrainingClassifier(estimator=DecisionTreeClassifier())

    outer_msg = "This 'SelfTrainingClassifier' has no attribute 'decision_function'"
    inner_msg = "'DecisionTreeClassifier' object has no attribute 'decision_function'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        self_training.fit(X_train, y_train_missing_labels).decision_function(X_train)
    assert isinstance(exec_info.value.__cause__, AttributeError)
    assert inner_msg in str(exec_info.value.__cause__)


# Metadata routing tests
# =================================================================


@pytest.mark.filterwarnings("ignore:y contains no unlabeled samples:UserWarning")
@pytest.mark.parametrize(
    "method", ["decision_function", "predict_log_proba", "predict_proba", "predict"]
)
def test_routing_passed_metadata_not_supported(method):
    """Test that the right error message is raised when metadata is passed
    while not supported, i.e. when `enable_metadata_routing=False`."""
    est = SelfTrainingClassifier(estimator=SimpleEstimator())
    with pytest.raises(
        ValueError, match="is only supported if enable_metadata_routing=True"
    ):
        est.fit([[1], [1]], [1, 1], sample_weight=[1], prop="a")

    est = SelfTrainingClassifier(estimator=SimpleEstimator())
    with pytest.raises(
        ValueError, match="is only supported if enable_metadata_routing=True"
    ):
        # make sure that the estimator thinks it is already fitted
        est.fitted_params_ = True
        getattr(est, method)([[1]], sample_weight=[1], prop="a")


# End of routing tests
# ====================