2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions


@@ -0,0 +1,109 @@
"""
Testing for the base module (sklearn.ensemble.base).
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from collections import OrderedDict
import numpy as np
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
def test_base():
# Check BaseEnsemble methods.
ensemble = BaggingClassifier(
estimator=Perceptron(random_state=None), n_estimators=3
)
iris = load_iris()
ensemble.fit(iris.data, iris.target)
ensemble.estimators_ = [] # empty the list and create estimators manually
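# _make_estimator clones `estimator`, seeds it via _set_random_states when a
# random_state is given, and appends the clone to estimators_ unless append=False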
ensemble._make_estimator()
random_state = np.random.RandomState(3)
ensemble._make_estimator(random_state=random_state)
ensemble._make_estimator(random_state=random_state)
ensemble._make_estimator(append=False)
assert 3 == len(ensemble)
assert 3 == len(ensemble.estimators_)
assert isinstance(ensemble[0], Perceptron)
assert ensemble[0].random_state is None
assert isinstance(ensemble[1].random_state, int)
assert isinstance(ensemble[2].random_state, int)
assert ensemble[1].random_state != ensemble[2].random_state
np_int_ensemble = BaggingClassifier(
estimator=Perceptron(), n_estimators=np.int32(3)
)
np_int_ensemble.fit(iris.data, iris.target)
def test_set_random_states():
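# _set_random_states draws integer seeds from the given random_state and assigns
# them to every (possibly nested) `random_state` parameter exposed by get_params()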
# LinearDiscriminantAnalysis has no random_state parameter: smoke test
_set_random_states(LinearDiscriminantAnalysis(), random_state=17)
clf1 = Perceptron(random_state=None)
assert clf1.random_state is None
# check that passing random_state=None still sets an integer random_state
_set_random_states(clf1, None)
assert isinstance(clf1.random_state, int)
# check that a fixed random_state results in consistent initialisation
_set_random_states(clf1, 3)
assert isinstance(clf1.random_state, int)
clf2 = Perceptron(random_state=None)
_set_random_states(clf2, 3)
assert clf1.random_state == clf2.random_state
# nested random_state
def make_steps():
return [
("sel", SelectFromModel(Perceptron(random_state=None))),
("clf", Perceptron(random_state=None)),
]
est1 = Pipeline(make_steps())
_set_random_states(est1, 3)
assert isinstance(est1.steps[0][1].estimator.random_state, int)
assert isinstance(est1.steps[1][1].random_state, int)
assert (
est1.get_params()["sel__estimator__random_state"]
!= est1.get_params()["clf__random_state"]
)
# ensure multiple random_state parameters are invariant to get_params()
# iteration order
class AlphaParamPipeline(Pipeline):
def get_params(self, *args, **kwargs):
params = Pipeline.get_params(self, *args, **kwargs).items()
return OrderedDict(sorted(params))
class RevParamPipeline(Pipeline):
def get_params(self, *args, **kwargs):
params = Pipeline.get_params(self, *args, **kwargs).items()
return OrderedDict(sorted(params, reverse=True))
for cls in [AlphaParamPipeline, RevParamPipeline]:
est2 = cls(make_steps())
_set_random_states(est2, 3)
assert (
est1.get_params()["sel__estimator__random_state"]
== est2.get_params()["sel__estimator__random_state"]
)
assert (
est1.get_params()["clf__random_state"]
== est2.get_params()["clf__random_state"]
)


@@ -0,0 +1,263 @@
import numpy as np
import pytest
from sklearn.base import ClassifierMixin, clone, is_classifier
from sklearn.datasets import (
load_diabetes,
load_iris,
make_classification,
make_regression,
)
from sklearn.ensemble import (
RandomForestClassifier,
RandomForestRegressor,
StackingClassifier,
StackingRegressor,
VotingClassifier,
VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, SVR, LinearSVC
X, y = load_iris(return_X_y=True)
X_r, y_r = load_diabetes(return_X_y=True)
@pytest.mark.parametrize(
"X, y, estimator",
[
(
*make_classification(n_samples=10),
StackingClassifier(
estimators=[
("lr", LogisticRegression()),
("svm", LinearSVC()),
("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
],
cv=2,
),
),
(
*make_classification(n_samples=10),
VotingClassifier(
estimators=[
("lr", LogisticRegression()),
("svm", LinearSVC()),
("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
]
),
),
(
*make_regression(n_samples=10),
StackingRegressor(
estimators=[
("lr", LinearRegression()),
("svm", SVR(kernel="linear")),
("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
],
cv=2,
),
),
(
*make_regression(n_samples=10),
VotingRegressor(
estimators=[
("lr", LinearRegression()),
("svm", SVR(kernel="linear")),
("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
]
),
),
],
ids=[
"stacking-classifier",
"voting-classifier",
"stacking-regressor",
"voting-regressor",
],
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
# check that the behavior of `estimators`, `estimators_`,
# `named_estimators`, `named_estimators_` is consistent across all
# ensemble classes and when using `set_params()`.
estimator = clone(estimator) # Avoid side effects from shared instances
# before fit
assert "svm" in estimator.named_estimators
assert estimator.named_estimators.svm is estimator.estimators[1][1]
assert estimator.named_estimators.svm is estimator.named_estimators["svm"]
# check fitted attributes
estimator.fit(X, y)
assert len(estimator.named_estimators) == 3
assert len(estimator.named_estimators_) == 3
assert sorted(list(estimator.named_estimators_.keys())) == sorted(
["lr", "svm", "rf"]
)
# check that set_params() does not add a new attribute
estimator_new_params = clone(estimator)
svm_estimator = SVC() if is_classifier(estimator) else SVR()
estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
assert not hasattr(estimator_new_params, "svm")
assert (
estimator_new_params.named_estimators.lr.get_params()
== estimator.named_estimators.lr.get_params()
)
assert (
estimator_new_params.named_estimators.rf.get_params()
== estimator.named_estimators.rf.get_params()
)
# check the behavior when setting and dropping an estimator
estimator_dropped = clone(estimator)
estimator_dropped.set_params(svm="drop")
estimator_dropped.fit(X, y)
assert len(estimator_dropped.named_estimators) == 3
assert estimator_dropped.named_estimators.svm == "drop"
assert len(estimator_dropped.named_estimators_) == 3
assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted(
["lr", "svm", "rf"]
)
for sub_est in estimator_dropped.named_estimators_:
# check that the correspondence is correct
assert not isinstance(sub_est, type(estimator.named_estimators.svm))
# check that we can set the parameters of the underlying classifier
estimator.set_params(svm__C=10.0)
estimator.set_params(rf__max_depth=5)
assert (
estimator.get_params()["svm__C"]
== estimator.get_params()["svm"].get_params()["C"]
)
assert (
estimator.get_params()["rf__max_depth"]
== estimator.get_params()["rf"].get_params()["max_depth"]
)
@pytest.mark.parametrize(
"Ensemble",
[VotingClassifier, StackingRegressor, VotingRegressor],
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
# check that the ensemble fails during validation if the underlying
# estimators are not of the required type (i.e. classifier or regressor)
# StackingClassifier can have an underlying regressor so it's not checked
if issubclass(Ensemble, ClassifierMixin):
X, y = make_classification(n_samples=10)
estimators = [("lr", LinearRegression())]
ensemble_type = "classifier"
else:
X, y = make_regression(n_samples=10)
estimators = [("lr", LogisticRegression())]
ensemble_type = "regressor"
ensemble = Ensemble(estimators=estimators)
err_msg = "should be a {}".format(ensemble_type)
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
@pytest.mark.parametrize(
"X, y, Ensemble",
[
(*make_classification(n_samples=10), StackingClassifier),
(*make_classification(n_samples=10), VotingClassifier),
(*make_regression(n_samples=10), StackingRegressor),
(*make_regression(n_samples=10), VotingRegressor),
],
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
# raise an error when the name contains dunder
if issubclass(Ensemble, ClassifierMixin):
estimators = [("lr__", LogisticRegression())]
else:
estimators = [("lr__", LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = r"Estimator names must not contain __: got \['lr__'\]"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
# raise an error when the name is not unique
if issubclass(Ensemble, ClassifierMixin):
estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())]
else:
estimators = [("lr", LinearRegression()), ("lr", LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
# raise an error when the name conflicts with the parameters
if issubclass(Ensemble, ClassifierMixin):
estimators = [("estimators", LogisticRegression())]
else:
estimators = [("estimators", LinearRegression())]
ensemble = Ensemble(estimators=estimators)
err_msg = "Estimator names conflict with constructor arguments"
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
@pytest.mark.parametrize(
"X, y, estimator",
[
(
*make_classification(n_samples=10),
StackingClassifier(estimators=[("lr", LogisticRegression())]),
),
(
*make_classification(n_samples=10),
VotingClassifier(estimators=[("lr", LogisticRegression())]),
),
(
*make_regression(n_samples=10),
StackingRegressor(estimators=[("lr", LinearRegression())]),
),
(
*make_regression(n_samples=10),
VotingRegressor(estimators=[("lr", LinearRegression())]),
),
],
ids=[
"stacking-classifier",
"voting-classifier",
"stacking-regressor",
"voting-regressor",
],
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
# check that we raise a consistent error when all estimators are
# dropped
estimator.set_params(lr="drop")
with pytest.raises(ValueError, match="All estimators are dropped."):
estimator.fit(X, y)
@pytest.mark.parametrize(
"Ensemble, Estimator, X, y",
[
(StackingClassifier, LogisticRegression, X, y),
(StackingRegressor, LinearRegression, X_r, y_r),
(VotingClassifier, LogisticRegression, X, y),
(VotingRegressor, LinearRegression, X_r, y_r),
],
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
# check that the Voting and Stacking predictors delegate the missing values
# validation to the underlying estimator.
X = X.copy()
mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
X[mask] = np.nan
pipe = make_pipeline(SimpleImputer(), Estimator())
ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)])
ensemble.fit(X, y).score(X, y)


@@ -0,0 +1,395 @@
"""
Testing for Isolation Forest algorithm (sklearn.ensemble.iforest).
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from unittest.mock import Mock, patch
import numpy as np
import pytest
from joblib import parallel_backend
from sklearn.datasets import load_diabetes, load_iris, make_classification
from sklearn.ensemble import IsolationForest
from sklearn.ensemble._iforest import _average_path_length
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
ignore_warnings,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
# load iris & diabetes dataset
iris = load_iris()
diabetes = load_diabetes()
def test_iforest(global_random_seed):
"""Check Isolation Forest for various parameter settings."""
X_train = np.array([[0, 1], [1, 2]])
X_test = np.array([[2, 1], [1, 1]])
grid = ParameterGrid(
{"n_estimators": [3], "max_samples": [0.5, 1.0, 3], "bootstrap": [True, False]}
)
with ignore_warnings():
for params in grid:
IsolationForest(random_state=global_random_seed, **params).fit(
X_train
).predict(X_test)
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_iforest_sparse(global_random_seed, sparse_container):
"""Check IForest for various parameter settings on sparse input."""
rng = check_random_state(global_random_seed)
X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]})
X_train_sparse = sparse_container(X_train)
X_test_sparse = sparse_container(X_test)
for params in grid:
# Trained on sparse format
sparse_classifier = IsolationForest(
n_estimators=10, random_state=global_random_seed, **params
).fit(X_train_sparse)
sparse_results = sparse_classifier.predict(X_test_sparse)
# Trained on dense format
dense_classifier = IsolationForest(
n_estimators=10, random_state=global_random_seed, **params
).fit(X_train)
dense_results = dense_classifier.predict(X_test)
assert_array_equal(sparse_results, dense_results)
def test_iforest_error():
"""Test that it gives proper exception on deficient input."""
X = iris.data
# The dataset has fewer than 256 samples, so explicitly setting
# max_samples > n_samples should result in a warning. If max_samples is not
# set explicitly, there should be no warning
warn_msg = "max_samples will be set to n_samples for estimation"
with pytest.warns(UserWarning, match=warn_msg):
IsolationForest(max_samples=1000).fit(X)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
IsolationForest(max_samples="auto").fit(X)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
IsolationForest(max_samples=np.int64(2)).fit(X)
# test X_test n_features match X_train one:
with pytest.raises(ValueError):
IsolationForest().fit(X).predict(X[:, 1:])
def test_recalculate_max_depth():
"""Check max_depth recalculation when max_samples is reset to n_samples"""
X = iris.data
clf = IsolationForest().fit(X)
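# with the default max_samples="auto", max_samples_ = min(256, n_samples) = 150
# for iris, so every tree should be grown with max_depth = ceil(log2(150)) = 8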
for est in clf.estimators_:
assert est.max_depth == int(np.ceil(np.log2(X.shape[0])))
def test_max_samples_attribute():
X = iris.data
clf = IsolationForest().fit(X)
assert clf.max_samples_ == X.shape[0]
clf = IsolationForest(max_samples=500)
warn_msg = "max_samples will be set to n_samples for estimation"
with pytest.warns(UserWarning, match=warn_msg):
clf.fit(X)
assert clf.max_samples_ == X.shape[0]
clf = IsolationForest(max_samples=0.4).fit(X)
assert clf.max_samples_ == 0.4 * X.shape[0]
def test_iforest_parallel_regression(global_random_seed):
"""Check parallel regression."""
rng = check_random_state(global_random_seed)
X_train, X_test = train_test_split(diabetes.data, random_state=rng)
ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train)
ensemble.set_params(n_jobs=1)
y1 = ensemble.predict(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y2)
ensemble = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(X_train)
y3 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y3)
def test_iforest_performance(global_random_seed):
"""Test Isolation Forest performs well"""
# Generate train/test data
rng = check_random_state(global_random_seed)
X = 0.3 * rng.randn(600, 2)
X = rng.permutation(np.vstack((X + 2, X - 2)))
X_train = X[:1000]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-1, high=1, size=(200, 2))
X_test = np.vstack((X[1000:], X_outliers))
y_test = np.array([0] * 200 + [1] * 200)
# fit the model
clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
# predict scores (the lower, the more normal)
y_pred = -clf.decision_function(X_test)
# check that the anomaly scores separate inliers from outliers (high ROC AUC)
assert roc_auc_score(y_test, y_pred) > 0.98
@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_works(contamination, global_random_seed):
# toy sample (the last two samples are outliers)
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]
# Test IsolationForest
clf = IsolationForest(random_state=global_random_seed, contamination=contamination)
clf.fit(X)
decision_func = -clf.decision_function(X)
pred = clf.predict(X)
# check that the outliers are detected:
assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
assert_array_equal(pred, 6 * [1] + 2 * [-1])
def test_max_samples_consistency():
# Make sure validated max_samples in iforest and BaseBagging are identical
X = iris.data
clf = IsolationForest().fit(X)
assert clf.max_samples_ == clf._max_samples
def test_iforest_subsampled_features():
# Non-regression test for #5732, which failed at predict time.
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data[:50], diabetes.target[:50], random_state=rng
)
clf = IsolationForest(max_features=0.8)
clf.fit(X_train, y_train)
clf.predict(X_test)
def test_iforest_average_path_length():
# Non-regression test for #8549, which used the wrong formula for the
# average path length in the integer case
# Updated to check the average path length when the input is <= 2 (issue #11839)
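# expected values follow the average path length of an unsuccessful search in a
# binary search tree: c(n) = 2 * (ln(n - 1) + Euler-Mascheroni constant) - 2 * (n - 1) / n,
# written out below for n = 5 and n = 999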
result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
assert_allclose(_average_path_length([0]), [0.0])
assert_allclose(_average_path_length([1]), [0.0])
assert_allclose(_average_path_length([2]), [1.0])
assert_allclose(_average_path_length([5]), [result_one])
assert_allclose(_average_path_length([999]), [result_two])
assert_allclose(
_average_path_length(np.array([1, 2, 5, 999])),
[0.0, 1.0, result_one, result_two],
)
# _average_path_length is increasing
avg_path_length = _average_path_length(np.arange(5))
assert_array_equal(avg_path_length, np.sort(avg_path_length))
def test_score_samples():
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = IsolationForest(contamination=0.1).fit(X_train)
clf2 = IsolationForest().fit(X_train)
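# score_samples should equal decision_function plus offset_; offset_ is -0.5 for
# contamination="auto" and a quantile of the training scores otherwise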
assert_array_equal(
clf1.score_samples([[2.0, 2.0]]),
clf1.decision_function([[2.0, 2.0]]) + clf1.offset_,
)
assert_array_equal(
clf2.score_samples([[2.0, 2.0]]),
clf2.decision_function([[2.0, 2.0]]) + clf2.offset_,
)
assert_array_equal(
clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]])
)
def test_iforest_warm_start():
"""Test iterative addition of iTrees to an iForest"""
rng = check_random_state(0)
X = rng.randn(20, 2)
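# warm_start=True makes fit() keep the previously trained trees and only add the
# missing ones when n_estimators is increased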
# fit first 10 trees
clf = IsolationForest(
n_estimators=10, max_samples=20, random_state=rng, warm_start=True
)
clf.fit(X)
# remember the 1st tree
tree_1 = clf.estimators_[0]
# fit another 10 trees
clf.set_params(n_estimators=20)
clf.fit(X)
# expecting 20 fitted trees and no overwritten trees
assert len(clf.estimators_) == 20
assert clf.estimators_[0] is tree_1
# mock get_chunk_n_rows to actually test more than one chunk (here one
# chunk has 3 rows):
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 3}),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
@pytest.mark.thread_unsafe # monkeypatched code
def test_iforest_chunks_works1(
mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
test_iforest_works(contamination, global_random_seed)
assert mocked_get_chunk.call_count == n_predict_calls
# idem with chunk_size = 10 rows
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 10}),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
@pytest.mark.thread_unsafe # monkeypatched code
def test_iforest_chunks_works2(
mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
test_iforest_works(contamination, global_random_seed)
assert mocked_get_chunk.call_count == n_predict_calls
def test_iforest_with_uniform_data():
"""Test whether iforest predicts inliers when using uniform data"""
# 2-d array of all 1s
X = np.ones((100, 10))
iforest = IsolationForest()
iforest.fit(X)
rng = np.random.RandomState(0)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(X + 1) == 1)
assert all(iforest.predict(X - 1) == 1)
# 2-d array where columns contain the same value across rows
X = np.repeat(rng.randn(1, 10), 100, 0)
iforest = IsolationForest()
iforest.fit(X)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)
# Single row
X = rng.randn(1, 10)
iforest = IsolationForest()
iforest.fit(X)
assert all(iforest.predict(X) == 1)
assert all(iforest.predict(rng.randn(100, 10)) == 1)
assert all(iforest.predict(np.ones((100, 10))) == 1)
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_iforest_with_n_jobs_does_not_segfault(csc_container):
"""Check that Isolation Forest does not segfault with n_jobs=2
Non-regression test for #23252
"""
X, _ = make_classification(n_samples=85_000, n_features=100, random_state=0)
X = csc_container(X)
IsolationForest(n_estimators=10, max_samples=256, n_jobs=2).fit(X)
def test_iforest_preserve_feature_names():
"""Check that feature names are preserved when contamination is not "auto".
Feature names are required for consistency checks during scoring.
Non-regression test for Issue #25844
"""
pd = pytest.importorskip("pandas")
rng = np.random.RandomState(0)
X = pd.DataFrame(data=rng.randn(4), columns=["a"])
model = IsolationForest(random_state=0, contamination=0.05)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
model.fit(X)
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_iforest_sparse_input_float_contamination(sparse_container):
"""Check that `IsolationForest` accepts sparse matrix input and float value for
contamination.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/27626
"""
X, _ = make_classification(n_samples=50, n_features=4, random_state=0)
X = sparse_container(X)
X.sort_indices()
contamination = 0.1
iforest = IsolationForest(
n_estimators=5, contamination=contamination, random_state=0
).fit(X)
X_decision = iforest.decision_function(X)
assert (X_decision < 0).sum() / X.shape[0] == pytest.approx(contamination)
@pytest.mark.parametrize("n_jobs", [1, 2])
@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_predict_parallel(global_random_seed, contamination, n_jobs):
"""Check that `IsolationForest.predict` is parallelized."""
# toy sample (the last two samples are outliers)
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]
# Test IsolationForest
clf = IsolationForest(
random_state=global_random_seed, contamination=contamination, n_jobs=None
)
clf.fit(X)
decision_func = -clf.decision_function(X)
pred = clf.predict(X)
# check that the outliers are detected:
assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
assert_array_equal(pred, 6 * [1] + 2 * [-1])
clf_parallel = IsolationForest(
random_state=global_random_seed, contamination=contamination, n_jobs=-1
)
clf_parallel.fit(X)
with parallel_backend("threading", n_jobs=n_jobs):
pred_parallel = clf_parallel.predict(X)
# check that the parallel results match the non-parallel ones
assert_array_equal(pred, pred_parallel)


@@ -0,0 +1,795 @@
"""Testing for the VotingClassifier and VotingRegressor"""
import re
import numpy as np
import pytest
from sklearn import config_context, datasets
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_multilabel_classification
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
RandomForestClassifier,
RandomForestRegressor,
VotingClassifier,
VotingRegressor,
)
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tests.metadata_routing_common import (
ConsumingClassifier,
ConsumingRegressor,
_Registry,
check_recorded_metadata,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
# Scale the data to avoid the ConvergenceWarning thrown by LogisticRegression
X_scaled = StandardScaler().fit_transform(X)
X_r, y_r = datasets.load_diabetes(return_X_y=True)
@pytest.mark.parametrize(
"params, err_msg",
[
(
{"estimators": []},
"Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
),
(
{"estimators": [LogisticRegression()]},
"Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
),
(
{"estimators": [(213, LogisticRegression())]},
"Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
),
(
{"estimators": [("lr", LogisticRegression())], "weights": [1, 2]},
"Number of `estimators` and weights must be equal",
),
],
)
def test_voting_classifier_estimator_init(params, err_msg):
ensemble = VotingClassifier(**params)
with pytest.raises(ValueError, match=err_msg):
ensemble.fit(X, y)
def test_predictproba_hardvoting():
eclf = VotingClassifier(
estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
voting="hard",
)
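# predict_proba is exposed through a conditional attribute (available_if), so merely
# accessing it raises AttributeError when voting="hard", with the inner message
# attached as __cause__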
inner_msg = "predict_proba is not available when voting='hard'"
outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
eclf.predict_proba
assert isinstance(exec_info.value.__cause__, AttributeError)
assert inner_msg in str(exec_info.value.__cause__)
assert not hasattr(eclf, "predict_proba")
eclf.fit(X_scaled, y)
assert not hasattr(eclf, "predict_proba")
def test_notfitted():
eclf = VotingClassifier(
estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
voting="soft",
)
ereg = VotingRegressor([("dr", DummyRegressor())])
msg = (
"This %s instance is not fitted yet. Call 'fit'"
" with appropriate arguments before using this estimator."
)
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
eclf.predict(X)
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
eclf.predict_proba(X)
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
eclf.transform(X)
with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
ereg.predict(X_r)
with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
ereg.transform(X_r)
def test_majority_label_iris(global_random_seed):
"""Check classification by majority label on dataset iris."""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
)
scores = cross_val_score(eclf, X, y, scoring="accuracy")
assert scores.mean() >= 0.9
def test_tie_situation():
"""Check voting classifier selects smaller class label in tie situation."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
eclf = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)], voting="hard")
assert clf1.fit(X, y).predict(X)[52] == 2
assert clf2.fit(X, y).predict(X)[52] == 1
assert eclf.fit(X, y).predict(X)[52] == 1
def test_weights_iris(global_random_seed):
"""Check classification by average probabilities on dataset iris."""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
weights=[1, 2, 10],
)
scores = cross_val_score(eclf, X_scaled, y, scoring="accuracy")
assert scores.mean() >= 0.9
def test_weights_regressor():
"""Check weighted average regression prediction on diabetes dataset."""
reg1 = DummyRegressor(strategy="mean")
reg2 = DummyRegressor(strategy="median")
reg3 = DummyRegressor(strategy="quantile", quantile=0.2)
ereg = VotingRegressor(
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 2, 10]
)
X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
X_r, y_r, test_size=0.25
)
reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)
avg = np.average(
np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0, weights=[1, 2, 10]
)
assert_almost_equal(ereg_pred, avg, decimal=2)
ereg_weights_none = VotingRegressor(
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=None
)
ereg_weights_equal = VotingRegressor(
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 1, 1]
)
ereg_weights_none.fit(X_r_train, y_r_train)
ereg_weights_equal.fit(X_r_train, y_r_train)
ereg_none_pred = ereg_weights_none.predict(X_r_test)
ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
def test_predict_on_toy_problem(global_random_seed):
"""Manually check predicted class labels for toy dataset."""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
X = np.array(
[[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]]
)
y = np.array([1, 1, 1, 2, 2, 2])
assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="hard",
weights=[1, 1, 1],
)
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
weights=[1, 1, 1],
)
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
def test_predict_proba_on_toy_problem():
"""Calculate predicted probabilities on toy dataset."""
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
clf1_res = np.array(
[
[0.59790391, 0.40209609],
[0.57622162, 0.42377838],
[0.50728456, 0.49271544],
[0.40241774, 0.59758226],
]
)
clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]])
clf3_res = np.array(
[[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0.0, 1.0], [0.0, 1.0]]
)
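# soft voting averages the per-classifier probabilities with weights [2, 1, 1];
# the divisor 4 below is the sum of those weights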
t00 = (2 * clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
t11 = (2 * clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
t21 = (2 * clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
t31 = (2 * clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
weights=[2, 1, 1],
)
eclf_res = eclf.fit(X, y).predict_proba(X)
assert_almost_equal(t00, eclf_res[0][0], decimal=1)
assert_almost_equal(t11, eclf_res[1][1], decimal=1)
assert_almost_equal(t21, eclf_res[2][1], decimal=1)
assert_almost_equal(t31, eclf_res[3][1], decimal=1)
inner_msg = "predict_proba is not available when voting='hard'"
outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
)
eclf.fit(X, y).predict_proba(X)
assert isinstance(exec_info.value.__cause__, AttributeError)
assert inner_msg in str(exec_info.value.__cause__)
def test_multilabel():
"""Check if error is raised for multilabel classification."""
X, y = make_multilabel_classification(
n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123
)
clf = OneVsRestClassifier(SVC(kernel="linear"))
eclf = VotingClassifier(estimators=[("ovr", clf)], voting="hard")
try:
eclf.fit(X, y)
except NotImplementedError:
return
def test_gridsearch():
"""Check GridSearch support."""
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1, n_estimators=3)
clf3 = GaussianNB()
eclf = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
)
params = {
"lr__C": [1.0, 100.0],
"voting": ["soft", "hard"],
"weights": [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]],
}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=2)
grid.fit(X_scaled, y)
def test_parallel_fit(global_random_seed):
"""Check parallel backend of VotingClassifier on toy dataset."""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1
).fit(X, y)
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2
).fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
def test_sample_weight(global_random_seed):
"""Tests sample_weight parameter of VotingClassifier"""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = CalibratedClassifierCV(SVC(random_state=global_random_seed), ensemble=False)
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
).fit(X_scaled, y, sample_weight=np.ones((len(y),)))
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
).fit(X_scaled, y)
assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
assert_array_almost_equal(
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
)
sample_weight = np.random.RandomState(global_random_seed).uniform(size=(len(y),))
eclf3 = VotingClassifier(estimators=[("lr", clf1)], voting="soft")
eclf3.fit(X_scaled, y, sample_weight=sample_weight)
clf1.fit(X_scaled, y, sample_weight)
assert_array_equal(eclf3.predict(X_scaled), clf1.predict(X_scaled))
assert_array_almost_equal(
eclf3.predict_proba(X_scaled), clf1.predict_proba(X_scaled)
)
# check that an informative error is raised if sample_weight is not
# supported.
clf4 = KNeighborsClassifier()
eclf3 = VotingClassifier(
estimators=[("lr", clf1), ("svc", clf3), ("knn", clf4)], voting="soft"
)
msg = "Underlying estimator KNeighborsClassifier does not support sample weights."
with pytest.raises(TypeError, match=msg):
eclf3.fit(X_scaled, y, sample_weight=sample_weight)
# check that _fit_single_estimator raises the right error:
# it should re-raise the original error when it is not related to sample_weight
class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
def fit(self, X_scaled, y, sample_weight):
raise TypeError("Error unrelated to sample_weight.")
clf = ClassifierErrorFit()
with pytest.raises(TypeError, match="Error unrelated to sample_weight"):
clf.fit(X_scaled, y, sample_weight=sample_weight)
def test_sample_weight_kwargs():
"""Check that VotingClassifier passes sample_weight as kwargs"""
class MockClassifier(ClassifierMixin, BaseEstimator):
"""Mock Classifier to check that sample_weight is received as kwargs"""
def fit(self, X, y, *args, **sample_weight):
assert "sample_weight" in sample_weight
clf = MockClassifier()
eclf = VotingClassifier(estimators=[("mock", clf)], voting="soft")
# Should not raise an error.
eclf.fit(X, y, sample_weight=np.ones((len(y),)))
def test_voting_classifier_set_params(global_random_seed):
# check equivalence in the output when setting underlying estimators
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(
n_estimators=10, random_state=global_random_seed, max_depth=None
)
clf3 = GaussianNB()
eclf1 = VotingClassifier(
[("lr", clf1), ("rf", clf2)], voting="soft", weights=[1, 2]
).fit(X_scaled, y)
eclf2 = VotingClassifier(
[("lr", clf1), ("nb", clf3)], voting="soft", weights=[1, 2]
)
eclf2.set_params(nb=clf2).fit(X_scaled, y)
assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
assert_array_almost_equal(
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
)
assert eclf2.estimators[0][1].get_params() == clf1.get_params()
assert eclf2.estimators[1][1].get_params() == clf2.get_params()
def test_set_estimator_drop():
# VotingClassifier set_params should be able to set estimators as drop
# Test predict
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
clf3 = GaussianNB()
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
voting="hard",
weights=[1, 0, 0.5],
).fit(X, y)
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
voting="hard",
weights=[1, 1, 0.5],
)
eclf2.set_params(rf="drop").fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert dict(eclf2.estimators)["rf"] == "drop"
assert len(eclf2.estimators_) == 2
assert all(
isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_
)
assert eclf2.get_params()["rf"] == "drop"
eclf1.set_params(voting="soft").fit(X, y)
eclf2.set_params(voting="soft").fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
msg = "All estimators are dropped. At least one is required"
with pytest.raises(ValueError, match=msg):
eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)
# Test soft voting transform
X1 = np.array([[1], [2]])
y1 = np.array([1, 2])
eclf1 = VotingClassifier(
estimators=[("rf", clf2), ("nb", clf3)],
voting="soft",
weights=[0, 0.5],
flatten_transform=False,
).fit(X1, y1)
eclf2 = VotingClassifier(
estimators=[("rf", clf2), ("nb", clf3)],
voting="soft",
weights=[1, 0.5],
flatten_transform=False,
)
eclf2.set_params(rf="drop").fit(X1, y1)
assert_array_almost_equal(
eclf1.transform(X1),
np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
)
assert_array_almost_equal(eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]]))
eclf1.set_params(voting="hard")
eclf2.set_params(voting="hard")
assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_estimator_weights_format(global_random_seed):
# Test estimator weights inputs as list and array
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2)], weights=[1, 2], voting="soft"
)
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2)], weights=np.array((1, 2)), voting="soft"
)
eclf1.fit(X_scaled, y)
eclf2.fit(X_scaled, y)
assert_array_almost_equal(
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
)
def test_transform(global_random_seed):
"""Check transform method of VotingClassifier on toy dataset."""
clf1 = LogisticRegression(random_state=global_random_seed)
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
clf3 = GaussianNB()
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
eclf1 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
).fit(X, y)
eclf2 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
flatten_transform=True,
).fit(X, y)
eclf3 = VotingClassifier(
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
voting="soft",
flatten_transform=False,
).fit(X, y)
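# with flatten_transform=True (the default) transform returns an array of shape
# (n_samples, n_classifiers * n_classes) = (4, 6); with flatten_transform=False it
# returns (n_classifiers, n_samples, n_classes) = (3, 4, 2)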
assert_array_equal(eclf1.transform(X).shape, (4, 6))
assert_array_equal(eclf2.transform(X).shape, (4, 6))
assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
assert_array_almost_equal(
eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X)
)
@pytest.mark.parametrize(
"X, y, voter",
[
(
X,
y,
VotingClassifier(
[
("lr", LogisticRegression()),
("rf", RandomForestClassifier(n_estimators=5)),
]
),
),
(
X_r,
y_r,
VotingRegressor(
[
("lr", LinearRegression()),
("rf", RandomForestRegressor(n_estimators=5)),
]
),
),
],
)
def test_none_estimator_with_weights(X, y, voter):
# check that an estimator can be set to 'drop' and passing some weight
# regression test for
# https://github.com/scikit-learn/scikit-learn/issues/13777
voter = clone(voter)
# Scale the data to avoid the ConvergenceWarning thrown by LogisticRegression
X_scaled = StandardScaler().fit_transform(X)
voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
voter.set_params(lr="drop")
voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
y_pred = voter.predict(X_scaled)
assert y_pred.shape == y.shape
@pytest.mark.parametrize(
"est",
[
VotingRegressor(
estimators=[
("lr", LinearRegression()),
("tree", DecisionTreeRegressor(random_state=0)),
]
),
VotingClassifier(
estimators=[
("lr", LogisticRegression(random_state=0)),
("tree", DecisionTreeClassifier(random_state=0)),
]
),
],
ids=["VotingRegressor", "VotingClassifier"],
)
def test_n_features_in(est):
est = clone(est)
X = [[1, 2], [3, 4], [5, 6]]
y = [0, 1, 2]
assert not hasattr(est, "n_features_in_")
est.fit(X, y)
assert est.n_features_in_ == 2
@pytest.mark.parametrize(
"estimator",
[
VotingRegressor(
estimators=[
("lr", LinearRegression()),
("rf", RandomForestRegressor(random_state=123)),
],
verbose=True,
),
VotingClassifier(
estimators=[
("lr", LogisticRegression(random_state=123)),
("rf", RandomForestClassifier(random_state=123)),
],
verbose=True,
),
],
)
def test_voting_verbose(estimator, capsys):
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
pattern = (
r"\[Voting\].*\(1 of 2\) Processing lr, total=.*\n"
r"\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$"
)
clone(estimator).fit(X, y)
assert re.match(pattern, capsys.readouterr()[0])
def test_get_features_names_out_regressor():
"""Check get_feature_names_out output for regressor."""
X = [[1, 2], [3, 4], [5, 6]]
y = [0, 1, 2]
voting = VotingRegressor(
estimators=[
("lr", LinearRegression()),
("tree", DecisionTreeRegressor(random_state=0)),
("ignore", "drop"),
]
)
voting.fit(X, y)
names_out = voting.get_feature_names_out()
expected_names = ["votingregressor_lr", "votingregressor_tree"]
assert_array_equal(names_out, expected_names)
@pytest.mark.parametrize(
"kwargs, expected_names",
[
(
{"voting": "soft", "flatten_transform": True},
[
"votingclassifier_lr0",
"votingclassifier_lr1",
"votingclassifier_lr2",
"votingclassifier_tree0",
"votingclassifier_tree1",
"votingclassifier_tree2",
],
),
({"voting": "hard"}, ["votingclassifier_lr", "votingclassifier_tree"]),
],
)
def test_get_features_names_out_classifier(kwargs, expected_names):
"""Check get_feature_names_out for classifier for different settings."""
X = [[1, 2], [3, 4], [5, 6], [1, 1.2]]
y = [0, 1, 2, 0]
voting = VotingClassifier(
estimators=[
("lr", LogisticRegression(random_state=0)),
("tree", DecisionTreeClassifier(random_state=0)),
],
**kwargs,
)
voting.fit(X, y)
X_trans = voting.transform(X)
names_out = voting.get_feature_names_out()
assert X_trans.shape[1] == len(expected_names)
assert_array_equal(names_out, expected_names)
def test_get_features_names_out_classifier_error():
"""Check that error is raised when voting="soft" and flatten_transform=False."""
X = [[1, 2], [3, 4], [5, 6]]
y = [0, 1, 2]
voting = VotingClassifier(
estimators=[
("lr", LogisticRegression(random_state=0)),
("tree", DecisionTreeClassifier(random_state=0)),
],
voting="soft",
flatten_transform=False,
)
voting.fit(X, y)
msg = (
"get_feature_names_out is not supported when `voting='soft'` and "
"`flatten_transform=False`"
)
with pytest.raises(ValueError, match=msg):
voting.get_feature_names_out()
# Metadata Routing Tests
# ======================
@pytest.mark.parametrize(
"Estimator, Child",
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
def test_routing_passed_metadata_not_supported(Estimator, Child):
"""Test that the right error message is raised when metadata is passed while
not supported when `enable_metadata_routing=False`."""
X = np.array([[0, 1], [2, 2], [4, 6]])
y = [1, 2, 3]
with pytest.raises(
ValueError, match="is only supported if enable_metadata_routing=True"
):
Estimator(["clf", Child()]).fit(X, y, sample_weight=[1, 1, 1], metadata="a")
@pytest.mark.parametrize(
"Estimator, Child",
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@config_context(enable_metadata_routing=True)
def test_get_metadata_routing_without_fit(Estimator, Child):
# Test that get_metadata_routing() doesn't raise when called before fit.
est = Estimator([("sub_est", Child())])
est.get_metadata_routing()
@pytest.mark.parametrize(
"Estimator, Child",
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@pytest.mark.parametrize("prop", ["sample_weight", "metadata"])
@config_context(enable_metadata_routing=True)
def test_metadata_routing_for_voting_estimators(Estimator, Child, prop):
"""Test that metadata is routed correctly for Voting*."""
X = np.array([[0, 1], [2, 2], [4, 6]])
y = [1, 2, 3]
sample_weight, metadata = [1, 1, 1], "a"
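# set_fit_request(**{prop: True}) marks the metadata as requested so that the
# Voting* router forwards it to each sub-estimator's fit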
est = Estimator(
[
(
"sub_est1",
Child(registry=_Registry()).set_fit_request(**{prop: True}),
),
(
"sub_est2",
Child(registry=_Registry()).set_fit_request(**{prop: True}),
),
]
)
est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata})
for estimator in est.estimators:
if prop == "sample_weight":
kwargs = {prop: sample_weight}
else:
kwargs = {prop: metadata}
# access sub-estimator in (name, est) with estimator[1]
registry = estimator[1].registry
assert len(registry)
for sub_est in registry:
check_recorded_metadata(obj=sub_est, method="fit", parent="fit", **kwargs)
@pytest.mark.parametrize(
"Estimator, Child",
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@config_context(enable_metadata_routing=True)
def test_metadata_routing_error_for_voting_estimators(Estimator, Child):
"""Test that the right error is raised when metadata is not requested."""
X = np.array([[0, 1], [2, 2], [4, 6]])
y = [1, 2, 3]
sample_weight, metadata = [1, 1, 1], "a"
est = Estimator([("sub_est", Child())])
error_message = (
"[sample_weight, metadata] are passed but are not explicitly set as requested"
f" or not requested for {Child.__name__}.fit"
)
with pytest.raises(ValueError, match=re.escape(error_message)):
est.fit(X, y, sample_weight=sample_weight, metadata=metadata)
# End of Metadata Routing Tests
# =============================


@@ -0,0 +1,602 @@
"""Testing for the boost module (sklearn.ensemble.boost)."""
import re
import numpy as np
import pytest
from sklearn import datasets
from sklearn.base import BaseEstimator, clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.utils._mocking import NoSampleWeightWrapper
from sklearn.utils._testing import (
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
# Common random state
rng = np.random.RandomState(0)
# Toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels
y_regr = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]
# Load the iris dataset and randomly permute it
iris = datasets.load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)
# Load the diabetes dataset and randomly permute it
diabetes = datasets.load_diabetes()
diabetes.data, diabetes.target = shuffle(
diabetes.data, diabetes.target, random_state=rng
)
def test_oneclass_adaboost_proba():
# Test predict_proba robustness for one class label input.
# In response to issue #7501
# https://github.com/scikit-learn/scikit-learn/issues/7501
y_t = np.ones(len(X))
clf = AdaBoostClassifier().fit(X, y_t)
assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))
def test_classification_toy():
# Check classification on a toy dataset.
clf = AdaBoostClassifier(random_state=0)
clf.fit(X, y_class)
assert_array_equal(clf.predict(T), y_t_class)
assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
assert clf.predict_proba(T).shape == (len(T), 2)
assert clf.decision_function(T).shape == (len(T),)
def test_regression_toy():
# Check regression on a toy dataset.
clf = AdaBoostRegressor(random_state=0)
clf.fit(X, y_regr)
assert_array_equal(clf.predict(T), y_t_regr)
def test_iris():
# Check consistency on the iris dataset.
classes = np.unique(iris.target)
clf = AdaBoostClassifier()
clf.fit(iris.data, iris.target)
assert_array_equal(classes, clf.classes_)
proba = clf.predict_proba(iris.data)
assert proba.shape[1] == len(classes)
assert clf.decision_function(iris.data).shape[1] == len(classes)
score = clf.score(iris.data, iris.target)
assert score > 0.9, f"Failed with {score = }"
# Check we used multiple estimators
assert len(clf.estimators_) > 1
# Check for distinct random states (see issue #7408)
assert len(set(est.random_state for est in clf.estimators_)) == len(clf.estimators_)
@pytest.mark.parametrize("loss", ["linear", "square", "exponential"])
def test_diabetes(loss):
# Check consistency on the diabetes dataset.
reg = AdaBoostRegressor(loss=loss, random_state=0)
reg.fit(diabetes.data, diabetes.target)
score = reg.score(diabetes.data, diabetes.target)
assert score > 0.55
# Check we used multiple estimators
assert len(reg.estimators_) > 1
# Check for distinct random states (see issue #7408)
assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_)
def test_staged_predict():
# Check staged predictions.
rng = np.random.RandomState(0)
iris_weights = rng.randint(10, size=iris.target.shape)
diabetes_weights = rng.randint(10, size=diabetes.target.shape)
clf = AdaBoostClassifier(n_estimators=10)
clf.fit(iris.data, iris.target, sample_weight=iris_weights)
predictions = clf.predict(iris.data)
staged_predictions = [p for p in clf.staged_predict(iris.data)]
proba = clf.predict_proba(iris.data)
staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
staged_scores = [
s for s in clf.staged_score(iris.data, iris.target, sample_weight=iris_weights)
]
assert len(staged_predictions) == 10
assert_array_almost_equal(predictions, staged_predictions[-1])
assert len(staged_probas) == 10
assert_array_almost_equal(proba, staged_probas[-1])
assert len(staged_scores) == 10
assert_array_almost_equal(score, staged_scores[-1])
# AdaBoost regression
clf = AdaBoostRegressor(n_estimators=10, random_state=0)
clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
predictions = clf.predict(diabetes.data)
staged_predictions = [p for p in clf.staged_predict(diabetes.data)]
score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
staged_scores = [
s
for s in clf.staged_score(
diabetes.data, diabetes.target, sample_weight=diabetes_weights
)
]
assert len(staged_predictions) == 10
assert_array_almost_equal(predictions, staged_predictions[-1])
assert len(staged_scores) == 10
assert_array_almost_equal(score, staged_scores[-1])
def test_gridsearch():
# Check that base trees can be grid-searched.
# AdaBoost classification
boost = AdaBoostClassifier(estimator=DecisionTreeClassifier())
parameters = {
"n_estimators": (1, 2),
"estimator__max_depth": (1, 2),
}
clf = GridSearchCV(boost, parameters)
clf.fit(iris.data, iris.target)
# AdaBoost regression
boost = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0)
parameters = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)}
clf = GridSearchCV(boost, parameters)
clf.fit(diabetes.data, diabetes.target)
def test_pickle():
# Check pickability.
import pickle
# Adaboost classifier
obj = AdaBoostClassifier()
obj.fit(iris.data, iris.target)
score = obj.score(iris.data, iris.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert type(obj2) == obj.__class__
score2 = obj2.score(iris.data, iris.target)
assert score == score2
# Adaboost regressor
obj = AdaBoostRegressor(random_state=0)
obj.fit(diabetes.data, diabetes.target)
score = obj.score(diabetes.data, diabetes.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert type(obj2) == obj.__class__
score2 = obj2.score(diabetes.data, diabetes.target)
assert score == score2
def test_importances():
# Check variable importances.
X, y = datasets.make_classification(
n_samples=2000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
shuffle=False,
random_state=1,
)
clf = AdaBoostClassifier()
clf.fit(X, y)
importances = clf.feature_importances_
assert importances.shape[0] == 10
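# with shuffle=False the 3 informative features come first, so their importances
# should dominate those of the 7 uninformative ones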
assert (importances[:3, np.newaxis] >= importances[3:]).all()
def test_adaboost_classifier_sample_weight_error():
# Test that it gives proper exception on incorrect sample weight.
clf = AdaBoostClassifier()
msg = re.escape("sample_weight.shape == (1,), expected (6,)")
with pytest.raises(ValueError, match=msg):
clf.fit(X, y_class, sample_weight=np.asarray([-1]))
def test_estimator():
# Test different estimators.
from sklearn.ensemble import RandomForestClassifier
# XXX doesn't work with y_class because RF doesn't support classes_
# Shouldn't AdaBoost run a LabelBinarizer?
clf = AdaBoostClassifier(RandomForestClassifier())
clf.fit(X, y_regr)
clf = AdaBoostClassifier(SVC())
clf.fit(X, y_class)
from sklearn.ensemble import RandomForestRegressor
clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
clf.fit(X, y_regr)
clf = AdaBoostRegressor(SVR(), random_state=0)
clf.fit(X, y_regr)
# Check that an empty discrete ensemble fails in fit, not predict.
X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
y_fail = ["foo", "bar", 1, 2]
clf = AdaBoostClassifier(SVC())
with pytest.raises(ValueError, match="worse than random"):
clf.fit(X_fail, y_fail)
def test_sample_weights_infinite():
msg = "Sample weights have reached infinite values"
clf = AdaBoostClassifier(n_estimators=30, learning_rate=23.0)
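# an extremely large learning_rate makes the boosting sample weights blow up,
# which should trigger the UserWarning checked below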
with pytest.warns(UserWarning, match=msg):
clf.fit(iris.data, iris.target)
@pytest.mark.parametrize(
"sparse_container, expected_internal_type",
zip(
[
*CSC_CONTAINERS,
*CSR_CONTAINERS,
*LIL_CONTAINERS,
*COO_CONTAINERS,
*DOK_CONTAINERS,
],
CSC_CONTAINERS + 4 * CSR_CONTAINERS,
),
)
def test_sparse_classification(sparse_container, expected_internal_type):
# Check classification with sparse input.
class CustomSVC(SVC):
"""SVC variant that records the nature of the training set."""
def fit(self, X, y, sample_weight=None):
"""Modification on fit caries data type for later verification."""
super().fit(X, y, sample_weight=sample_weight)
self.data_type_ = type(X)
return self
X, y = datasets.make_multilabel_classification(
n_classes=1, n_samples=15, n_features=5, random_state=42
)
# Flatten y to a 1d array
y = np.ravel(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train_sparse = sparse_container(X_train)
X_test_sparse = sparse_container(X_test)
# Trained on sparse format
sparse_classifier = AdaBoostClassifier(
estimator=CustomSVC(probability=True),
random_state=1,
).fit(X_train_sparse, y_train)
# Trained on dense format
dense_classifier = AdaBoostClassifier(
estimator=CustomSVC(probability=True),
random_state=1,
).fit(X_train, y_train)
# predict
sparse_clf_results = sparse_classifier.predict(X_test_sparse)
dense_clf_results = dense_classifier.predict(X_test)
assert_array_equal(sparse_clf_results, dense_clf_results)
# decision_function
sparse_clf_results = sparse_classifier.decision_function(X_test_sparse)
dense_clf_results = dense_classifier.decision_function(X_test)
assert_array_almost_equal(sparse_clf_results, dense_clf_results)
# predict_log_proba
sparse_clf_results = sparse_classifier.predict_log_proba(X_test_sparse)
dense_clf_results = dense_classifier.predict_log_proba(X_test)
assert_array_almost_equal(sparse_clf_results, dense_clf_results)
# predict_proba
sparse_clf_results = sparse_classifier.predict_proba(X_test_sparse)
dense_clf_results = dense_classifier.predict_proba(X_test)
assert_array_almost_equal(sparse_clf_results, dense_clf_results)
# score
sparse_clf_results = sparse_classifier.score(X_test_sparse, y_test)
dense_clf_results = dense_classifier.score(X_test, y_test)
assert_array_almost_equal(sparse_clf_results, dense_clf_results)
# staged_decision_function
sparse_clf_results = sparse_classifier.staged_decision_function(X_test_sparse)
dense_clf_results = dense_classifier.staged_decision_function(X_test)
for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
assert_array_almost_equal(sparse_clf_res, dense_clf_res)
# staged_predict
sparse_clf_results = sparse_classifier.staged_predict(X_test_sparse)
dense_clf_results = dense_classifier.staged_predict(X_test)
for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
assert_array_equal(sparse_clf_res, dense_clf_res)
# staged_predict_proba
sparse_clf_results = sparse_classifier.staged_predict_proba(X_test_sparse)
dense_clf_results = dense_classifier.staged_predict_proba(X_test)
for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
assert_array_almost_equal(sparse_clf_res, dense_clf_res)
# staged_score
sparse_clf_results = sparse_classifier.staged_score(X_test_sparse, y_test)
dense_clf_results = dense_classifier.staged_score(X_test, y_test)
for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
assert_array_equal(sparse_clf_res, dense_clf_res)
# Verify sparsity of data is maintained during training
types = [i.data_type_ for i in sparse_classifier.estimators_]
assert all([t == expected_internal_type for t in types])
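

# Same expected format conversion as in test_sparse_classification above: CSC stays
# CSC, all other sparse formats are converted to CSR.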
@pytest.mark.parametrize(
"sparse_container, expected_internal_type",
zip(
[
*CSC_CONTAINERS,
*CSR_CONTAINERS,
*LIL_CONTAINERS,
*COO_CONTAINERS,
*DOK_CONTAINERS,
],
CSC_CONTAINERS + 4 * CSR_CONTAINERS,
),
)
def test_sparse_regression(sparse_container, expected_internal_type):
# Check regression with sparse input.
    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Record the training data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(
n_samples=15, n_features=50, n_targets=1, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train_sparse = sparse_container(X_train)
X_test_sparse = sparse_container(X_test)
# Trained on sparse format
sparse_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
X_train_sparse, y_train
)
# Trained on dense format
dense_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
X_train, y_train
)
# predict
sparse_regr_results = sparse_regressor.predict(X_test_sparse)
dense_regr_results = dense_regressor.predict(X_test)
assert_array_almost_equal(sparse_regr_results, dense_regr_results)
# staged_predict
sparse_regr_results = sparse_regressor.staged_predict(X_test_sparse)
dense_regr_results = dense_regressor.staged_predict(X_test)
for sparse_regr_res, dense_regr_res in zip(sparse_regr_results, dense_regr_results):
assert_array_almost_equal(sparse_regr_res, dense_regr_res)
types = [i.data_type_ for i in sparse_regressor.estimators_]
assert all([t == expected_internal_type for t in types])


def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work even when the base estimator does not support
    sample_weight: the random weighted sampling is done internally in the _boost
    method of AdaBoostRegressor.
    """
    class DummyEstimator(BaseEstimator):
        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])
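
    # DummyEstimator.fit accepts no sample_weight, so AdaBoostRegressor has to rely
    # on its internal weighted resampling instead of passing weights through.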
boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
boost.fit(X, y_regr)
assert len(boost.estimator_weights_) == len(boost.estimator_errors_)


def test_multidimensional_X():
    """
    Check that the AdaBoost estimators can work with an n-dimensional
    data matrix.
    """
rng = np.random.RandomState(0)
X = rng.randn(51, 3, 3)
yc = rng.choice([0, 1], 51)
yr = rng.randn(51)
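
    # Both dummy estimators ignore the feature values, so this only checks that
    # AdaBoost accepts a 3-d X and passes it through without erroring.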
boost = AdaBoostClassifier(DummyClassifier(strategy="most_frequent"))
boost.fit(X, yc)
boost.predict(X)
boost.predict_proba(X)
boost = AdaBoostRegressor(DummyRegressor())
boost.fit(X, yr)
boost.predict(X)


def test_adaboostclassifier_without_sample_weight():
X, y = iris.data, iris.target
estimator = NoSampleWeightWrapper(DummyClassifier())
clf = AdaBoostClassifier(estimator=estimator)
err_msg = "{} doesn't support sample_weight".format(estimator.__class__.__name__)
with pytest.raises(ValueError, match=err_msg):
clf.fit(X, y)


def test_adaboostregressor_sample_weight():
    # Check that giving sample weights has an influence on the error computed
    # for a weak learner.
rng = np.random.RandomState(42)
X = np.linspace(0, 100, num=1000)
y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
X = X.reshape(-1, 1)
# add an arbitrary outlier
X[-1] *= 10
y[-1] = 10000
    # random_state=0 ensures that the underlying bootstrap will use the outlier
regr_no_outlier = AdaBoostRegressor(
estimator=LinearRegression(), n_estimators=1, random_state=0
)
regr_with_weight = clone(regr_no_outlier)
regr_with_outlier = clone(regr_no_outlier)
# fit 3 models:
# - a model containing the outlier
# - a model without the outlier
# - a model containing the outlier but with a null sample-weight
regr_with_outlier.fit(X, y)
regr_no_outlier.fit(X[:-1], y[:-1])
sample_weight = np.ones_like(y)
sample_weight[-1] = 0
regr_with_weight.fit(X, y, sample_weight=sample_weight)
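
    # Evaluate on the inlier samples only: the model fitted with the outlier should
    # score worse, while zero-weighting the outlier should match dropping it.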
score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
score_with_weight = regr_with_weight.score(X[:-1], y[:-1])
assert score_with_outlier < score_no_outlier
assert score_with_outlier < score_with_weight
assert score_no_outlier == pytest.approx(score_with_weight)


def test_adaboost_consistent_predict():
# check that predict_proba and predict give consistent results
# regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/14084
X_train, X_test, y_train, y_test = train_test_split(
*datasets.load_digits(return_X_y=True), random_state=42
)
model = AdaBoostClassifier(random_state=42)
model.fit(X_train, y_train)
assert_array_equal(
np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test)
)


@pytest.mark.parametrize(
"model, X, y",
[
(AdaBoostClassifier(), iris.data, iris.target),
(AdaBoostRegressor(), diabetes.data, diabetes.target),
],
)
def test_adaboost_negative_weight_error(model, X, y):
sample_weight = np.ones_like(y)
sample_weight[-1] = -10
err_msg = "Negative values in data passed to `sample_weight`"
with pytest.raises(ValueError, match=err_msg):
model.fit(X, y, sample_weight=sample_weight)


def test_adaboost_numerically_stable_feature_importance_with_small_weights():
    """Check that we don't create NaN feature importances with numerically
    unstable inputs.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20320
    """
rng = np.random.RandomState(42)
X = rng.normal(size=(1000, 10))
y = rng.choice([0, 1], size=1000)
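    # Extremely small weights can underflow during boosting and previously produced
    # NaN feature importances (see the linked issue).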
sample_weight = np.ones_like(y) * 1e-263
tree = DecisionTreeClassifier(max_depth=10, random_state=12)
ada_model = AdaBoostClassifier(estimator=tree, n_estimators=20, random_state=12)
ada_model.fit(X, y, sample_weight=sample_weight)
assert np.isnan(ada_model.feature_importances_).sum() == 0


def test_adaboost_decision_function(global_random_seed):
    """Check that the decision function respects the symmetric constraint for weak
    learners.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/26520
    """
n_classes = 3
X, y = datasets.make_classification(
n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed
)
clf = AdaBoostClassifier(n_estimators=1, random_state=global_random_seed).fit(X, y)
y_score = clf.decision_function(X)
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
# With a single learner, we expect to have a decision function in
# {1, - 1 / (n_classes - 1)}.
assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}
# We can assert the same for staged_decision_function since we have a single learner
for y_score in clf.staged_decision_function(X):
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
# With a single learner, we expect to have a decision function in
# {1, - 1 / (n_classes - 1)}.
assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}
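
    # With several weak learners the scores are no longer restricted to two levels,
    # but the symmetric constraint (rows summing to zero) must still hold.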
clf.set_params(n_estimators=5).fit(X, y)
y_score = clf.decision_function(X)
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
for y_score in clf.staged_decision_function(X):
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)