@@ -0,0 +1,109 @@
"""
Testing for the base module (sklearn.ensemble.base).
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from collections import OrderedDict

import numpy as np

from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline


def test_base():
    # Check BaseEnsemble methods.
    ensemble = BaggingClassifier(
        estimator=Perceptron(random_state=None), n_estimators=3
    )

    iris = load_iris()
    ensemble.fit(iris.data, iris.target)
    ensemble.estimators_ = []  # empty the list and create estimators manually

    ensemble._make_estimator()
    random_state = np.random.RandomState(3)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(append=False)

    assert 3 == len(ensemble)
    assert 3 == len(ensemble.estimators_)

    assert isinstance(ensemble[0], Perceptron)
    assert ensemble[0].random_state is None
    assert isinstance(ensemble[1].random_state, int)
    assert isinstance(ensemble[2].random_state, int)
    assert ensemble[1].random_state != ensemble[2].random_state

    np_int_ensemble = BaggingClassifier(
        estimator=Perceptron(), n_estimators=np.int32(3)
    )
    np_int_ensemble.fit(iris.data, iris.target)


def test_set_random_states():
    # Linear Discriminant Analysis doesn't have random state: smoke test
    _set_random_states(LinearDiscriminantAnalysis(), random_state=17)

    clf1 = Perceptron(random_state=None)
    assert clf1.random_state is None
    # check random_state is None still sets
    _set_random_states(clf1, None)
    assert isinstance(clf1.random_state, int)

    # check random_state fixes results in consistent initialisation
    _set_random_states(clf1, 3)
    assert isinstance(clf1.random_state, int)
    clf2 = Perceptron(random_state=None)
    _set_random_states(clf2, 3)
    assert clf1.random_state == clf2.random_state

    # nested random_state

    def make_steps():
        return [
            ("sel", SelectFromModel(Perceptron(random_state=None))),
            ("clf", Perceptron(random_state=None)),
        ]

    est1 = Pipeline(make_steps())
    _set_random_states(est1, 3)
    assert isinstance(est1.steps[0][1].estimator.random_state, int)
    assert isinstance(est1.steps[1][1].random_state, int)
    assert (
        est1.get_params()["sel__estimator__random_state"]
        != est1.get_params()["clf__random_state"]
    )

    # ensure multiple random_state parameters are invariant to get_params()
    # iteration order

    class AlphaParamPipeline(Pipeline):
        def get_params(self, *args, **kwargs):
            params = Pipeline.get_params(self, *args, **kwargs).items()
            return OrderedDict(sorted(params))

    class RevParamPipeline(Pipeline):
        def get_params(self, *args, **kwargs):
            params = Pipeline.get_params(self, *args, **kwargs).items()
            return OrderedDict(sorted(params, reverse=True))

    for cls in [AlphaParamPipeline, RevParamPipeline]:
        est2 = cls(make_steps())
        _set_random_states(est2, 3)
        assert (
            est1.get_params()["sel__estimator__random_state"]
            == est2.get_params()["sel__estimator__random_state"]
        )
        assert (
            est1.get_params()["clf__random_state"]
            == est2.get_params()["clf__random_state"]
        )
@@ -0,0 +1,263 @@
import numpy as np
import pytest

from sklearn.base import ClassifierMixin, clone, is_classifier
from sklearn.datasets import (
    load_diabetes,
    load_iris,
    make_classification,
    make_regression,
)
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    StackingClassifier,
    StackingRegressor,
    VotingClassifier,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, SVR, LinearSVC

X, y = load_iris(return_X_y=True)

X_r, y_r = load_diabetes(return_X_y=True)


@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC()),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC()),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", SVR(kernel="linear")),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", SVR(kernel="linear")),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
    # check that the behavior of `estimators`, `estimators_`,
    # `named_estimators`, `named_estimators_` is consistent across all
    # ensemble classes and when using `set_params()`.
    estimator = clone(estimator)  # Avoid side effects from shared instances

    # before fit
    assert "svm" in estimator.named_estimators
    assert estimator.named_estimators.svm is estimator.estimators[1][1]
    assert estimator.named_estimators.svm is estimator.named_estimators["svm"]

    # check fitted attributes
    estimator.fit(X, y)
    assert len(estimator.named_estimators) == 3
    assert len(estimator.named_estimators_) == 3
    assert sorted(list(estimator.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )

    # check that set_params() does not add a new attribute
    estimator_new_params = clone(estimator)
    svm_estimator = SVC() if is_classifier(estimator) else SVR()
    estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
    assert not hasattr(estimator_new_params, "svm")
    assert (
        estimator_new_params.named_estimators.lr.get_params()
        == estimator.named_estimators.lr.get_params()
    )
    assert (
        estimator_new_params.named_estimators.rf.get_params()
        == estimator.named_estimators.rf.get_params()
    )

    # check the behavior when setting and dropping an estimator
    estimator_dropped = clone(estimator)
    estimator_dropped.set_params(svm="drop")
    estimator_dropped.fit(X, y)
    assert len(estimator_dropped.named_estimators) == 3
    assert estimator_dropped.named_estimators.svm == "drop"
    assert len(estimator_dropped.named_estimators_) == 3
    assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )
    for sub_est in estimator_dropped.named_estimators_:
        # check that the correspondence is correct
        assert not isinstance(sub_est, type(estimator.named_estimators.svm))

    # check that we can set the parameters of the underlying classifier
    estimator.set_params(svm__C=10.0)
    estimator.set_params(rf__max_depth=5)
    assert (
        estimator.get_params()["svm__C"]
        == estimator.get_params()["svm"].get_params()["C"]
    )
    assert (
        estimator.get_params()["rf__max_depth"]
        == estimator.get_params()["rf"].get_params()["max_depth"]
    )


@pytest.mark.parametrize(
    "Ensemble",
    [VotingClassifier, StackingRegressor, VotingRegressor],
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
    # check that ensemble will fail during validation if the underlying
    # estimators are not of the same type (i.e. classifier or regressor)
    # StackingClassifier can have an underlying regressor so it's not checked
    if issubclass(Ensemble, ClassifierMixin):
        X, y = make_classification(n_samples=10)
        estimators = [("lr", LinearRegression())]
        ensemble_type = "classifier"
    else:
        X, y = make_regression(n_samples=10)
        estimators = [("lr", LogisticRegression())]
        ensemble_type = "regressor"
    ensemble = Ensemble(estimators=estimators)

    err_msg = "should be a {}".format(ensemble_type)
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)


@pytest.mark.parametrize(
    "X, y, Ensemble",
    [
        (*make_classification(n_samples=10), StackingClassifier),
        (*make_classification(n_samples=10), VotingClassifier),
        (*make_regression(n_samples=10), StackingRegressor),
        (*make_regression(n_samples=10), VotingRegressor),
    ],
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
    # raise an error when the name contains dunder
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("lr__", LogisticRegression())]
    else:
        estimators = [("lr__", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)

    err_msg = r"Estimator names must not contain __: got \['lr__'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the name is not unique
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())]
    else:
        estimators = [("lr", LinearRegression()), ("lr", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)

    err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the name conflicts with the parameters
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("estimators", LogisticRegression())]
    else:
        estimators = [("estimators", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)

    err_msg = "Estimator names conflict with constructor arguments"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)


@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(estimators=[("lr", LinearRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(estimators=[("lr", LinearRegression())]),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
    # check that we raise a consistent error when all estimators are
    # dropped
    estimator.set_params(lr="drop")
    with pytest.raises(ValueError, match="All estimators are dropped."):
        estimator.fit(X, y)


@pytest.mark.parametrize(
    "Ensemble, Estimator, X, y",
    [
        (StackingClassifier, LogisticRegression, X, y),
        (StackingRegressor, LinearRegression, X_r, y_r),
        (VotingClassifier, LogisticRegression, X, y),
        (VotingRegressor, LinearRegression, X_r, y_r),
    ],
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
    # check that Voting and Stacking predictor delegate the missing values
    # validation to the underlying estimator.
    X = X.copy()
    mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), Estimator())
    ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)])
    ensemble.fit(X, y).score(X, y)
@@ -0,0 +1,395 @@
"""
Testing for Isolation Forest algorithm (sklearn.ensemble.iforest).
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from unittest.mock import Mock, patch

import numpy as np
import pytest
from joblib import parallel_backend

from sklearn.datasets import load_diabetes, load_iris, make_classification
from sklearn.ensemble import IsolationForest
from sklearn.ensemble._iforest import _average_path_length
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
    ignore_warnings,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS

# load iris & diabetes dataset
iris = load_iris()
diabetes = load_diabetes()


def test_iforest(global_random_seed):
    """Check Isolation Forest for various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid(
        {"n_estimators": [3], "max_samples": [0.5, 1.0, 3], "bootstrap": [True, False]}
    )

    with ignore_warnings():
        for params in grid:
            IsolationForest(random_state=global_random_seed, **params).fit(
                X_train
            ).predict(X_test)


@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_iforest_sparse(global_random_seed, sparse_container):
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(global_random_seed)
    X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]})

    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)

    for params in grid:
        # Trained on sparse format
        sparse_classifier = IsolationForest(
            n_estimators=10, random_state=global_random_seed, **params
        ).fit(X_train_sparse)
        sparse_results = sparse_classifier.predict(X_test_sparse)

        # Trained on dense format
        dense_classifier = IsolationForest(
            n_estimators=10, random_state=global_random_seed, **params
        ).fit(X_train)
        dense_results = dense_classifier.predict(X_test)

        assert_array_equal(sparse_results, dense_results)


def test_iforest_error():
    """Test that it gives proper exception on deficient input."""
    X = iris.data

    # The dataset has less than 256 samples, explicitly setting
    # max_samples > n_samples should result in a warning. If not set
    # explicitly there should be no warning
    warn_msg = "max_samples will be set to n_samples for estimation"
    with pytest.warns(UserWarning, match=warn_msg):
        IsolationForest(max_samples=1000).fit(X)
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        IsolationForest(max_samples="auto").fit(X)
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        IsolationForest(max_samples=np.int64(2)).fit(X)

    # test X_test n_features match X_train one:
    with pytest.raises(ValueError):
        IsolationForest().fit(X).predict(X[:, 1:])


def test_recalculate_max_depth():
    """Check max_depth recalculation when max_samples is reset to n_samples"""
    X = iris.data
    clf = IsolationForest().fit(X)
    for est in clf.estimators_:
        assert est.max_depth == int(np.ceil(np.log2(X.shape[0])))


def test_max_samples_attribute():
    X = iris.data
    clf = IsolationForest().fit(X)
    assert clf.max_samples_ == X.shape[0]

    clf = IsolationForest(max_samples=500)
    warn_msg = "max_samples will be set to n_samples for estimation"
    with pytest.warns(UserWarning, match=warn_msg):
        clf.fit(X)
    assert clf.max_samples_ == X.shape[0]

    clf = IsolationForest(max_samples=0.4).fit(X)
    assert clf.max_samples_ == 0.4 * X.shape[0]


def test_iforest_parallel_regression(global_random_seed):
    """Check parallel regression."""
    rng = check_random_state(global_random_seed)

    X_train, X_test = train_test_split(diabetes.data, random_state=rng)

    ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(X_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)


def test_iforest_performance(global_random_seed):
    """Test Isolation Forest performs well"""

    # Generate train/test data
    rng = check_random_state(global_random_seed)
    X = 0.3 * rng.randn(600, 2)
    X = rng.permutation(np.vstack((X + 2, X - 2)))
    X_train = X[:1000]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-1, high=1, size=(200, 2))
    X_test = np.vstack((X[1000:], X_outliers))
    y_test = np.array([0] * 200 + [1] * 200)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = -clf.decision_function(X_test)

    # check that there is at most 6 errors (false positive or false negative)
    assert roc_auc_score(y_test, y_pred) > 0.98


@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_works(contamination, global_random_seed):
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]

    # Test IsolationForest
    clf = IsolationForest(random_state=global_random_seed, contamination=contamination)
    clf.fit(X)
    decision_func = -clf.decision_function(X)
    pred = clf.predict(X)
    # assert detect outliers:
    assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
    assert_array_equal(pred, 6 * [1] + 2 * [-1])


def test_max_samples_consistency():
    # Make sure validated max_samples in iforest and BaseBagging are identical
    X = iris.data
    clf = IsolationForest().fit(X)
    assert clf.max_samples_ == clf._max_samples


def test_iforest_subsampled_features():
    # It tests non-regression for #5732 which failed at predict.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data[:50], diabetes.target[:50], random_state=rng
    )
    clf = IsolationForest(max_features=0.8)
    clf.fit(X_train, y_train)
    clf.predict(X_test)


def test_iforest_average_path_length():
    # It tests non-regression for #8549 which used the wrong formula
    # for average path length, strictly for the integer case
    # Updated to check average path length when input is <= 2 (issue #11839)
    result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
    result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
    assert_allclose(_average_path_length([0]), [0.0])
    assert_allclose(_average_path_length([1]), [0.0])
    assert_allclose(_average_path_length([2]), [1.0])
    assert_allclose(_average_path_length([5]), [result_one])
    assert_allclose(_average_path_length([999]), [result_two])
    assert_allclose(
        _average_path_length(np.array([1, 2, 5, 999])),
        [0.0, 1.0, result_one, result_two],
    )
    # _average_path_length is increasing
    avg_path_length = _average_path_length(np.arange(5))
    assert_array_equal(avg_path_length, np.sort(avg_path_length))


def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = IsolationForest(contamination=0.1).fit(X_train)
    clf2 = IsolationForest().fit(X_train)
    assert_array_equal(
        clf1.score_samples([[2.0, 2.0]]),
        clf1.decision_function([[2.0, 2.0]]) + clf1.offset_,
    )
    assert_array_equal(
        clf2.score_samples([[2.0, 2.0]]),
        clf2.decision_function([[2.0, 2.0]]) + clf2.offset_,
    )
    assert_array_equal(
        clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]])
    )


def test_iforest_warm_start():
    """Test iterative addition of iTrees to an iForest"""

    rng = check_random_state(0)
    X = rng.randn(20, 2)

    # fit first 10 trees
    clf = IsolationForest(
        n_estimators=10, max_samples=20, random_state=rng, warm_start=True
    )
    clf.fit(X)
    # remember the 1st tree
    tree_1 = clf.estimators_[0]
    # fit another 10 trees
    clf.set_params(n_estimators=20)
    clf.fit(X)
    # expecting 20 fitted trees and no overwritten trees
    assert len(clf.estimators_) == 20
    assert clf.estimators_[0] is tree_1


# mock get_chunk_n_rows to actually test more than one chunk (here one
# chunk has 3 rows):
@patch(
    "sklearn.ensemble._iforest.get_chunk_n_rows",
    side_effect=Mock(**{"return_value": 3}),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
@pytest.mark.thread_unsafe  # monkeypatched code
def test_iforest_chunks_works1(
    mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
    test_iforest_works(contamination, global_random_seed)
    assert mocked_get_chunk.call_count == n_predict_calls


# idem with chunk_size = 10 rows
@patch(
    "sklearn.ensemble._iforest.get_chunk_n_rows",
    side_effect=Mock(**{"return_value": 10}),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
@pytest.mark.thread_unsafe  # monkeypatched code
def test_iforest_chunks_works2(
    mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
    test_iforest_works(contamination, global_random_seed)
    assert mocked_get_chunk.call_count == n_predict_calls


def test_iforest_with_uniform_data():
    """Test whether iforest predicts inliers when using uniform data"""

    # 2-d array of all 1s
    X = np.ones((100, 10))
    iforest = IsolationForest()
    iforest.fit(X)

    rng = np.random.RandomState(0)

    assert all(iforest.predict(X) == 1)
    assert all(iforest.predict(rng.randn(100, 10)) == 1)
    assert all(iforest.predict(X + 1) == 1)
    assert all(iforest.predict(X - 1) == 1)

    # 2-d array where columns contain the same value across rows
    X = np.repeat(rng.randn(1, 10), 100, 0)
    iforest = IsolationForest()
    iforest.fit(X)

    assert all(iforest.predict(X) == 1)
    assert all(iforest.predict(rng.randn(100, 10)) == 1)
    assert all(iforest.predict(np.ones((100, 10))) == 1)

    # Single row
    X = rng.randn(1, 10)
    iforest = IsolationForest()
    iforest.fit(X)

    assert all(iforest.predict(X) == 1)
    assert all(iforest.predict(rng.randn(100, 10)) == 1)
    assert all(iforest.predict(np.ones((100, 10))) == 1)


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_iforest_with_n_jobs_does_not_segfault(csc_container):
    """Check that Isolation Forest does not segfault with n_jobs=2

    Non-regression test for #23252
    """
    X, _ = make_classification(n_samples=85_000, n_features=100, random_state=0)
    X = csc_container(X)
    IsolationForest(n_estimators=10, max_samples=256, n_jobs=2).fit(X)


def test_iforest_preserve_feature_names():
    """Check that feature names are preserved when contamination is not "auto".

    Feature names are required for consistency checks during scoring.

    Non-regression test for Issue #25844
    """
    pd = pytest.importorskip("pandas")
    rng = np.random.RandomState(0)

    X = pd.DataFrame(data=rng.randn(4), columns=["a"])
    model = IsolationForest(random_state=0, contamination=0.05)

    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        model.fit(X)


@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_iforest_sparse_input_float_contamination(sparse_container):
    """Check that `IsolationForest` accepts sparse matrix input and float value for
    contamination.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27626
    """
    X, _ = make_classification(n_samples=50, n_features=4, random_state=0)
    X = sparse_container(X)
    X.sort_indices()
    contamination = 0.1
    iforest = IsolationForest(
        n_estimators=5, contamination=contamination, random_state=0
    ).fit(X)

    X_decision = iforest.decision_function(X)
    assert (X_decision < 0).sum() / X.shape[0] == pytest.approx(contamination)


@pytest.mark.parametrize("n_jobs", [1, 2])
@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_predict_parallel(global_random_seed, contamination, n_jobs):
    """Check that `IsolationForest.predict` is parallelized."""
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]

    # Test IsolationForest
    clf = IsolationForest(
        random_state=global_random_seed, contamination=contamination, n_jobs=None
    )
    clf.fit(X)
    decision_func = -clf.decision_function(X)
    pred = clf.predict(X)

    # assert detect outliers:
    assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
    assert_array_equal(pred, 6 * [1] + 2 * [-1])

    clf_parallel = IsolationForest(
        random_state=global_random_seed, contamination=contamination, n_jobs=-1
    )
    clf_parallel.fit(X)
    with parallel_backend("threading", n_jobs=n_jobs):
        pred_paralell = clf_parallel.predict(X)

    # assert the same results as non-parallel
    assert_array_equal(pred, pred_paralell)
@@ -0,0 +1,795 @@
|
||||
"""Testing for the VotingClassifier and VotingRegressor"""
|
||||
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import config_context, datasets
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin, clone
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.datasets import make_multilabel_classification
|
||||
from sklearn.dummy import DummyRegressor
|
||||
from sklearn.ensemble import (
|
||||
RandomForestClassifier,
|
||||
RandomForestRegressor,
|
||||
VotingClassifier,
|
||||
VotingRegressor,
|
||||
)
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.linear_model import LinearRegression, LogisticRegression
|
||||
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tests.metadata_routing_common import (
|
||||
ConsumingClassifier,
|
||||
ConsumingRegressor,
|
||||
_Registry,
|
||||
check_recorded_metadata,
|
||||
)
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.utils._testing import (
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
|
||||
# Load datasets
|
||||
iris = datasets.load_iris()
|
||||
X, y = iris.data[:, 1:3], iris.target
|
||||
# Scaled to solve ConvergenceWarning throw by Logistic Regression
|
||||
X_scaled = StandardScaler().fit_transform(X)
|
||||
|
||||
X_r, y_r = datasets.load_diabetes(return_X_y=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, err_msg",
|
||||
[
|
||||
(
|
||||
{"estimators": []},
|
||||
"Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
|
||||
),
|
||||
(
|
||||
{"estimators": [LogisticRegression()]},
|
||||
"Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
|
||||
),
|
||||
(
|
||||
{"estimators": [(213, LogisticRegression())]},
|
||||
"Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
|
||||
),
|
||||
(
|
||||
{"estimators": [("lr", LogisticRegression())], "weights": [1, 2]},
|
||||
"Number of `estimators` and weights must be equal",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_voting_classifier_estimator_init(params, err_msg):
|
||||
ensemble = VotingClassifier(**params)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
ensemble.fit(X, y)
|
||||
|
||||
|
||||
def test_predictproba_hardvoting():
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
|
||||
voting="hard",
|
||||
)
|
||||
|
||||
inner_msg = "predict_proba is not available when voting='hard'"
|
||||
outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
|
||||
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
|
||||
eclf.predict_proba
|
||||
assert isinstance(exec_info.value.__cause__, AttributeError)
|
||||
assert inner_msg in str(exec_info.value.__cause__)
|
||||
|
||||
assert not hasattr(eclf, "predict_proba")
|
||||
eclf.fit(X_scaled, y)
|
||||
assert not hasattr(eclf, "predict_proba")
|
||||
|
||||
|
||||
def test_notfitted():
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
|
||||
voting="soft",
|
||||
)
|
||||
ereg = VotingRegressor([("dr", DummyRegressor())])
|
||||
msg = (
|
||||
"This %s instance is not fitted yet. Call 'fit'"
|
||||
" with appropriate arguments before using this estimator."
|
||||
)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
|
||||
eclf.predict(X)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
|
||||
eclf.predict_proba(X)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
|
||||
eclf.transform(X)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
|
||||
ereg.predict(X_r)
|
||||
with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
|
||||
ereg.transform(X_r)
|
||||
|
||||
|
||||
def test_majority_label_iris(global_random_seed):
|
||||
"""Check classification by majority label on dataset iris."""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
|
||||
)
|
||||
scores = cross_val_score(eclf, X, y, scoring="accuracy")
|
||||
|
||||
assert scores.mean() >= 0.9
|
||||
|
||||
|
||||
def test_tie_situation():
|
||||
"""Check voting classifier selects smaller class label in tie situation."""
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
eclf = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)], voting="hard")
|
||||
assert clf1.fit(X, y).predict(X)[52] == 2
|
||||
assert clf2.fit(X, y).predict(X)[52] == 1
|
||||
assert eclf.fit(X, y).predict(X)[52] == 1
|
||||
|
||||
|
||||
def test_weights_iris(global_random_seed):
|
||||
"""Check classification by average probabilities on dataset iris."""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
weights=[1, 2, 10],
|
||||
)
|
||||
scores = cross_val_score(eclf, X_scaled, y, scoring="accuracy")
|
||||
assert scores.mean() >= 0.9
|
||||
|
||||
|
||||
def test_weights_regressor():
|
||||
"""Check weighted average regression prediction on diabetes dataset."""
|
||||
reg1 = DummyRegressor(strategy="mean")
|
||||
reg2 = DummyRegressor(strategy="median")
|
||||
reg3 = DummyRegressor(strategy="quantile", quantile=0.2)
|
||||
ereg = VotingRegressor(
|
||||
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 2, 10]
|
||||
)
|
||||
|
||||
X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
|
||||
X_r, y_r, test_size=0.25
|
||||
)
|
||||
|
||||
reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)
|
||||
|
||||
avg = np.average(
|
||||
np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0, weights=[1, 2, 10]
|
||||
)
|
||||
assert_almost_equal(ereg_pred, avg, decimal=2)
|
||||
|
||||
ereg_weights_none = VotingRegressor(
|
||||
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=None
|
||||
)
|
||||
ereg_weights_equal = VotingRegressor(
|
||||
[("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 1, 1]
|
||||
)
|
||||
ereg_weights_none.fit(X_r_train, y_r_train)
|
||||
ereg_weights_equal.fit(X_r_train, y_r_train)
|
||||
ereg_none_pred = ereg_weights_none.predict(X_r_test)
|
||||
ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
|
||||
assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
|
||||
|
||||
|
||||
def test_predict_on_toy_problem(global_random_seed):
|
||||
"""Manually check predicted class labels for toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
|
||||
X = np.array(
|
||||
[[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]]
|
||||
)
|
||||
|
||||
y = np.array([1, 1, 1, 2, 2, 2])
|
||||
|
||||
assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="hard",
|
||||
weights=[1, 1, 1],
|
||||
)
|
||||
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
weights=[1, 1, 1],
|
||||
)
|
||||
assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
|
||||
|
||||
|
||||
def test_predict_proba_on_toy_problem():
|
||||
"""Calculate predicted probabilities on toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
clf1_res = np.array(
|
||||
[
|
||||
[0.59790391, 0.40209609],
|
||||
[0.57622162, 0.42377838],
|
||||
[0.50728456, 0.49271544],
|
||||
[0.40241774, 0.59758226],
|
||||
]
|
||||
)
|
||||
|
||||
clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]])
|
||||
|
||||
clf3_res = np.array(
|
||||
[[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0.0, 1.0], [0.0, 1.0]]
|
||||
)
|
||||
|
||||
t00 = (2 * clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
|
||||
t11 = (2 * clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
|
||||
t21 = (2 * clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
|
||||
t31 = (2 * clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4
|
||||
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
weights=[2, 1, 1],
|
||||
)
|
||||
eclf_res = eclf.fit(X, y).predict_proba(X)
|
||||
|
||||
assert_almost_equal(t00, eclf_res[0][0], decimal=1)
|
||||
assert_almost_equal(t11, eclf_res[1][1], decimal=1)
|
||||
assert_almost_equal(t21, eclf_res[2][1], decimal=1)
|
||||
assert_almost_equal(t31, eclf_res[3][1], decimal=1)
|
||||
|
||||
inner_msg = "predict_proba is not available when voting='hard'"
|
||||
outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
|
||||
with pytest.raises(AttributeError, match=outer_msg) as exec_info:
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
|
||||
)
|
||||
eclf.fit(X, y).predict_proba(X)
|
||||
|
||||
assert isinstance(exec_info.value.__cause__, AttributeError)
|
||||
assert inner_msg in str(exec_info.value.__cause__)
|
||||
|
||||
|
||||
def test_multilabel():
|
||||
"""Check if error is raised for multilabel classification."""
|
||||
X, y = make_multilabel_classification(
|
||||
n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123
|
||||
)
|
||||
clf = OneVsRestClassifier(SVC(kernel="linear"))
|
||||
|
||||
eclf = VotingClassifier(estimators=[("ovr", clf)], voting="hard")
|
||||
|
||||
try:
|
||||
eclf.fit(X, y)
|
||||
except NotImplementedError:
|
||||
return
|
||||
|
||||
|
||||
def test_gridsearch():
|
||||
"""Check GridSearch support."""
|
||||
clf1 = LogisticRegression(random_state=1)
|
||||
clf2 = RandomForestClassifier(random_state=1, n_estimators=3)
|
||||
clf3 = GaussianNB()
|
||||
eclf = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
|
||||
)
|
||||
|
||||
params = {
|
||||
"lr__C": [1.0, 100.0],
|
||||
"voting": ["soft", "hard"],
|
||||
"weights": [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]],
|
||||
}
|
||||
|
||||
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=2)
|
||||
grid.fit(X_scaled, y)
|
||||
|
||||
|
||||
def test_parallel_fit(global_random_seed):
|
||||
"""Check parallel backend of VotingClassifier on toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1
|
||||
).fit(X, y)
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2
|
||||
).fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
|
||||
|
||||
|
||||
def test_sample_weight(global_random_seed):
|
||||
"""Tests sample_weight parameter of VotingClassifier"""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = CalibratedClassifierCV(SVC(random_state=global_random_seed), ensemble=False)
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
|
||||
).fit(X_scaled, y, sample_weight=np.ones((len(y),)))
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
|
||||
).fit(X_scaled, y)
|
||||
assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
|
||||
assert_array_almost_equal(
|
||||
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
|
||||
)
|
||||
sample_weight = np.random.RandomState(global_random_seed).uniform(size=(len(y),))
|
||||
eclf3 = VotingClassifier(estimators=[("lr", clf1)], voting="soft")
|
||||
eclf3.fit(X_scaled, y, sample_weight=sample_weight)
|
||||
clf1.fit(X_scaled, y, sample_weight)
|
||||
assert_array_equal(eclf3.predict(X_scaled), clf1.predict(X_scaled))
|
||||
assert_array_almost_equal(
|
||||
eclf3.predict_proba(X_scaled), clf1.predict_proba(X_scaled)
|
||||
)
|
||||
|
||||
# check that an error is raised and indicative if sample_weight is not
|
||||
# supported.
|
||||
clf4 = KNeighborsClassifier()
|
||||
eclf3 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("svc", clf3), ("knn", clf4)], voting="soft"
|
||||
)
|
||||
msg = "Underlying estimator KNeighborsClassifier does not support sample weights."
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
eclf3.fit(X_scaled, y, sample_weight=sample_weight)
|
||||
|
||||
# check that _fit_single_estimator will raise the right error
|
||||
# it should raise the original error if this is not linked to sample_weight
|
||||
class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
|
||||
def fit(self, X_scaled, y, sample_weight):
|
||||
raise TypeError("Error unrelated to sample_weight.")
|
||||
|
||||
clf = ClassifierErrorFit()
|
||||
with pytest.raises(TypeError, match="Error unrelated to sample_weight"):
|
||||
clf.fit(X_scaled, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
def test_sample_weight_kwargs():
|
||||
"""Check that VotingClassifier passes sample_weight as kwargs"""
|
||||
|
||||
class MockClassifier(ClassifierMixin, BaseEstimator):
|
||||
"""Mock Classifier to check that sample_weight is received as kwargs"""
|
||||
|
||||
def fit(self, X, y, *args, **sample_weight):
|
||||
assert "sample_weight" in sample_weight
|
||||
|
||||
clf = MockClassifier()
|
||||
eclf = VotingClassifier(estimators=[("mock", clf)], voting="soft")
|
||||
|
||||
# Should not raise an error.
|
||||
eclf.fit(X, y, sample_weight=np.ones((len(y),)))
|
||||
|
||||
|
||||
def test_voting_classifier_set_params(global_random_seed):
|
||||
# check equivalence in the output when setting underlying estimators
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(
|
||||
n_estimators=10, random_state=global_random_seed, max_depth=None
|
||||
)
|
||||
clf3 = GaussianNB()
|
||||
|
||||
eclf1 = VotingClassifier(
|
||||
[("lr", clf1), ("rf", clf2)], voting="soft", weights=[1, 2]
|
||||
).fit(X_scaled, y)
|
||||
eclf2 = VotingClassifier(
|
||||
[("lr", clf1), ("nb", clf3)], voting="soft", weights=[1, 2]
|
||||
)
|
||||
eclf2.set_params(nb=clf2).fit(X_scaled, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
|
||||
assert_array_almost_equal(
|
||||
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
|
||||
)
|
||||
assert eclf2.estimators[0][1].get_params() == clf1.get_params()
|
||||
assert eclf2.estimators[1][1].get_params() == clf2.get_params()
|
||||
|
||||
|
||||
def test_set_estimator_drop():
|
||||
# VotingClassifier set_params should be able to set estimators as drop
|
||||
# Test predict
|
||||
clf1 = LogisticRegression(random_state=123)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
|
||||
clf3 = GaussianNB()
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
|
||||
voting="hard",
|
||||
weights=[1, 0, 0.5],
|
||||
).fit(X, y)
|
||||
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
|
||||
voting="hard",
|
||||
weights=[1, 1, 0.5],
|
||||
)
|
||||
eclf2.set_params(rf="drop").fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
|
||||
assert dict(eclf2.estimators)["rf"] == "drop"
|
||||
assert len(eclf2.estimators_) == 2
|
||||
assert all(
|
||||
isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_
|
||||
)
|
||||
assert eclf2.get_params()["rf"] == "drop"
|
||||
|
||||
eclf1.set_params(voting="soft").fit(X, y)
|
||||
eclf2.set_params(voting="soft").fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
|
||||
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
|
||||
msg = "All estimators are dropped. At least one is required"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)
|
||||
|
||||
# Test soft voting transform
|
||||
X1 = np.array([[1], [2]])
|
||||
y1 = np.array([1, 2])
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("rf", clf2), ("nb", clf3)],
|
||||
voting="soft",
|
||||
weights=[0, 0.5],
|
||||
flatten_transform=False,
|
||||
).fit(X1, y1)
|
||||
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("rf", clf2), ("nb", clf3)],
|
||||
voting="soft",
|
||||
weights=[1, 0.5],
|
||||
flatten_transform=False,
|
||||
)
|
||||
eclf2.set_params(rf="drop").fit(X1, y1)
|
||||
assert_array_almost_equal(
|
||||
eclf1.transform(X1),
|
||||
np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
|
||||
)
|
||||
assert_array_almost_equal(eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]]))
|
||||
eclf1.set_params(voting="hard")
|
||||
eclf2.set_params(voting="hard")
|
||||
assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
|
||||
assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
|
||||
|
||||
|
||||
def test_estimator_weights_format(global_random_seed):
|
||||
# Test estimator weights inputs as list and array
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2)], weights=[1, 2], voting="soft"
|
||||
)
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2)], weights=np.array((1, 2)), voting="soft"
|
||||
)
|
||||
eclf1.fit(X_scaled, y)
|
||||
eclf2.fit(X_scaled, y)
|
||||
assert_array_almost_equal(
|
||||
eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
|
||||
)
|
||||
|
||||
|
||||
def test_transform(global_random_seed):
|
||||
"""Check transform method of VotingClassifier on toy dataset."""
|
||||
clf1 = LogisticRegression(random_state=global_random_seed)
|
||||
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
|
||||
clf3 = GaussianNB()
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
eclf1 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
|
||||
).fit(X, y)
|
||||
eclf2 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
flatten_transform=True,
|
||||
).fit(X, y)
|
||||
eclf3 = VotingClassifier(
|
||||
estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
|
||||
voting="soft",
|
||||
flatten_transform=False,
|
||||
).fit(X, y)
|
||||
|
||||
assert_array_equal(eclf1.transform(X).shape, (4, 6))
|
||||
assert_array_equal(eclf2.transform(X).shape, (4, 6))
|
||||
assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
|
||||
assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
|
||||
assert_array_almost_equal(
|
||||
eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"X, y, voter",
|
||||
[
|
||||
(
|
||||
X,
|
||||
y,
|
||||
VotingClassifier(
|
||||
[
|
||||
("lr", LogisticRegression()),
|
||||
("rf", RandomForestClassifier(n_estimators=5)),
|
||||
]
|
||||
),
|
||||
),
|
||||
(
|
||||
X_r,
|
||||
y_r,
|
||||
VotingRegressor(
|
||||
[
|
||||
("lr", LinearRegression()),
|
||||
("rf", RandomForestRegressor(n_estimators=5)),
|
||||
]
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_none_estimator_with_weights(X, y, voter):
|
||||
# check that an estimator can be set to 'drop' and passing some weight
|
||||
# regression test for
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/13777
|
||||
voter = clone(voter)
|
||||
# Scaled to solve ConvergenceWarning throw by Logistic Regression
|
||||
X_scaled = StandardScaler().fit_transform(X)
|
||||
voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
|
||||
voter.set_params(lr="drop")
|
||||
voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
|
||||
y_pred = voter.predict(X_scaled)
|
||||
assert y_pred.shape == y.shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"est",
|
||||
[
|
||||
VotingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("tree", DecisionTreeRegressor(random_state=0)),
|
||||
]
|
||||
),
|
||||
VotingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression(random_state=0)),
|
||||
("tree", DecisionTreeClassifier(random_state=0)),
|
||||
]
|
||||
),
|
||||
],
|
||||
ids=["VotingRegressor", "VotingClassifier"],
|
||||
)
|
||||
def test_n_features_in(est):
|
||||
est = clone(est)
|
||||
X = [[1, 2], [3, 4], [5, 6]]
|
||||
y = [0, 1, 2]
|
||||
|
||||
assert not hasattr(est, "n_features_in_")
|
||||
est.fit(X, y)
|
||||
assert est.n_features_in_ == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"estimator",
|
||||
[
|
||||
VotingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("rf", RandomForestRegressor(random_state=123)),
|
||||
],
|
||||
verbose=True,
|
||||
),
|
||||
VotingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression(random_state=123)),
|
||||
("rf", RandomForestClassifier(random_state=123)),
|
||||
],
|
||||
verbose=True,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_voting_verbose(estimator, capsys):
|
||||
X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
|
||||
y = np.array([1, 1, 2, 2])
|
||||
|
||||
pattern = (
|
||||
r"\[Voting\].*\(1 of 2\) Processing lr, total=.*\n"
|
||||
r"\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$"
|
||||
)
|
||||
clone(estimator).fit(X, y)
|
||||
assert re.match(pattern, capsys.readouterr()[0])
|
||||
|
||||
|
||||
def test_get_features_names_out_regressor():
|
||||
"""Check get_feature_names_out output for regressor."""
|
||||
|
||||
X = [[1, 2], [3, 4], [5, 6]]
|
||||
y = [0, 1, 2]
|
||||
|
||||
voting = VotingRegressor(
|
||||
estimators=[
|
||||
("lr", LinearRegression()),
|
||||
("tree", DecisionTreeRegressor(random_state=0)),
|
||||
("ignore", "drop"),
|
||||
]
|
||||
)
|
||||
voting.fit(X, y)
|
||||
|
||||
names_out = voting.get_feature_names_out()
|
||||
expected_names = ["votingregressor_lr", "votingregressor_tree"]
|
||||
assert_array_equal(names_out, expected_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs, expected_names",
|
||||
[
|
||||
(
|
||||
{"voting": "soft", "flatten_transform": True},
|
||||
[
|
||||
"votingclassifier_lr0",
|
||||
"votingclassifier_lr1",
|
||||
"votingclassifier_lr2",
|
||||
"votingclassifier_tree0",
|
||||
"votingclassifier_tree1",
|
||||
"votingclassifier_tree2",
|
||||
],
|
||||
),
|
||||
({"voting": "hard"}, ["votingclassifier_lr", "votingclassifier_tree"]),
|
||||
],
|
||||
)
|
||||
def test_get_features_names_out_classifier(kwargs, expected_names):
|
||||
"""Check get_feature_names_out for classifier for different settings."""
|
||||
X = [[1, 2], [3, 4], [5, 6], [1, 1.2]]
|
||||
y = [0, 1, 2, 0]
|
||||
|
||||
voting = VotingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression(random_state=0)),
|
||||
("tree", DecisionTreeClassifier(random_state=0)),
|
||||
],
|
||||
**kwargs,
|
||||
)
|
||||
voting.fit(X, y)
|
||||
X_trans = voting.transform(X)
|
||||
names_out = voting.get_feature_names_out()
|
||||
|
||||
assert X_trans.shape[1] == len(expected_names)
|
||||
assert_array_equal(names_out, expected_names)
|
||||
|
||||
|
||||
def test_get_features_names_out_classifier_error():
|
||||
"""Check that error is raised when voting="soft" and flatten_transform=False."""
|
||||
X = [[1, 2], [3, 4], [5, 6]]
|
||||
y = [0, 1, 2]
|
||||
|
||||
voting = VotingClassifier(
|
||||
estimators=[
|
||||
("lr", LogisticRegression(random_state=0)),
|
||||
("tree", DecisionTreeClassifier(random_state=0)),
|
||||
],
|
||||
voting="soft",
|
||||
flatten_transform=False,
|
||||
)
|
||||
voting.fit(X, y)
|
||||
|
||||
msg = (
|
||||
"get_feature_names_out is not supported when `voting='soft'` and "
|
||||
"`flatten_transform=False`"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
voting.get_feature_names_out()
|
||||
|
||||
|
||||
# Metadata Routing Tests
|
||||
# ======================
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"Estimator, Child",
|
||||
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
|
||||
)
|
||||
def test_routing_passed_metadata_not_supported(Estimator, Child):
|
||||
"""Test that the right error message is raised when metadata is passed while
|
||||
not supported when `enable_metadata_routing=False`."""
|
||||
|
||||
X = np.array([[0, 1], [2, 2], [4, 6]])
|
||||
y = [1, 2, 3]
|
||||
|
||||
with pytest.raises(
|
||||
ValueError, match="is only supported if enable_metadata_routing=True"
|
||||
):
|
||||
Estimator(["clf", Child()]).fit(X, y, sample_weight=[1, 1, 1], metadata="a")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"Estimator, Child",
|
||||
[(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
|
||||
)
|
||||
@config_context(enable_metadata_routing=True)
|
||||
def test_get_metadata_routing_without_fit(Estimator, Child):
|
||||
# Test that metadata_routing() doesn't raise when called before fit.
|
||||
est = Estimator([("sub_est", Child())])
|
||||
est.get_metadata_routing()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@pytest.mark.parametrize("prop", ["sample_weight", "metadata"])
@config_context(enable_metadata_routing=True)
def test_metadata_routing_for_voting_estimators(Estimator, Child, prop):
    """Test that metadata is routed correctly for Voting*."""
    X = np.array([[0, 1], [2, 2], [4, 6]])
    y = [1, 2, 3]
    sample_weight, metadata = [1, 1, 1], "a"

    est = Estimator(
        [
            (
                "sub_est1",
                Child(registry=_Registry()).set_fit_request(**{prop: True}),
            ),
            (
                "sub_est2",
                Child(registry=_Registry()).set_fit_request(**{prop: True}),
            ),
        ]
    )

    est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata})

    for estimator in est.estimators:
        if prop == "sample_weight":
            kwargs = {prop: sample_weight}
        else:
            kwargs = {prop: metadata}
        # access sub-estimator in (name, est) with estimator[1]
        registry = estimator[1].registry
        assert len(registry)
        for sub_est in registry:
            check_recorded_metadata(obj=sub_est, method="fit", parent="fit", **kwargs)


@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@config_context(enable_metadata_routing=True)
def test_metadata_routing_error_for_voting_estimators(Estimator, Child):
    """Test that the right error is raised when metadata is not requested."""
    X = np.array([[0, 1], [2, 2], [4, 6]])
    y = [1, 2, 3]
    sample_weight, metadata = [1, 1, 1], "a"

    est = Estimator([("sub_est", Child())])

    error_message = (
        "[sample_weight, metadata] are passed but are not explicitly set as requested"
        f" or not requested for {Child.__name__}.fit"
    )

    with pytest.raises(ValueError, match=re.escape(error_message)):
        est.fit(X, y, sample_weight=sample_weight, metadata=metadata)


# End of Metadata Routing Tests
# =============================
@@ -0,0 +1,602 @@
"""Testing for the boost module (sklearn.ensemble.boost)."""

import re

import numpy as np
import pytest

from sklearn import datasets
from sklearn.base import BaseEstimator, clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.utils._mocking import NoSampleWeightWrapper
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import (
    COO_CONTAINERS,
    CSC_CONTAINERS,
    CSR_CONTAINERS,
    DOK_CONTAINERS,
    LIL_CONTAINERS,
)

# Common random state
rng = np.random.RandomState(0)

# Toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo", "foo", "foo", 1, 1, 1]  # test string class labels
y_regr = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]

# Load the iris dataset and randomly permute it
iris = datasets.load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)

# Load the diabetes dataset and randomly permute it
diabetes = datasets.load_diabetes()
diabetes.data, diabetes.target = shuffle(
    diabetes.data, diabetes.target, random_state=rng
)


def test_oneclass_adaboost_proba():
    # Test predict_proba robustness for one class label input.
    # In response to issue #7501
    # https://github.com/scikit-learn/scikit-learn/issues/7501
    y_t = np.ones(len(X))
    clf = AdaBoostClassifier().fit(X, y_t)
    assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))


def test_classification_toy():
    # Check classification on a toy dataset.
    clf = AdaBoostClassifier(random_state=0)
    clf.fit(X, y_class)
    assert_array_equal(clf.predict(T), y_t_class)
    assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
    assert clf.predict_proba(T).shape == (len(T), 2)
    assert clf.decision_function(T).shape == (len(T),)


def test_regression_toy():
    # Check regression on a toy dataset.
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)


def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)

    clf = AdaBoostClassifier()
    clf.fit(iris.data, iris.target)

    assert_array_equal(classes, clf.classes_)
    proba = clf.predict_proba(iris.data)

    assert proba.shape[1] == len(classes)
    assert clf.decision_function(iris.data).shape[1] == len(classes)

    score = clf.score(iris.data, iris.target)
    assert score > 0.9, f"Failed with {score = }"

    # Check we used multiple estimators
    assert len(clf.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert len(set(est.random_state for est in clf.estimators_)) == len(clf.estimators_)


@pytest.mark.parametrize("loss", ["linear", "square", "exponential"])
|
||||
def test_diabetes(loss):
|
||||
# Check consistency on dataset diabetes.
|
||||
reg = AdaBoostRegressor(loss=loss, random_state=0)
|
||||
reg.fit(diabetes.data, diabetes.target)
|
||||
score = reg.score(diabetes.data, diabetes.target)
|
||||
assert score > 0.55
|
||||
|
||||
# Check we used multiple estimators
|
||||
assert len(reg.estimators_) > 1
|
||||
# Check for distinct random states (see issue #7408)
|
||||
assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_)
|
||||
|
||||
|
||||
def test_staged_predict():
    # Check staged predictions.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    diabetes_weights = rng.randint(10, size=diabetes.target.shape)

    clf = AdaBoostClassifier(n_estimators=10)
    clf.fit(iris.data, iris.target, sample_weight=iris_weights)

    predictions = clf.predict(iris.data)
    staged_predictions = [p for p in clf.staged_predict(iris.data)]
    proba = clf.predict_proba(iris.data)
    staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
    score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
    staged_scores = [
        s for s in clf.staged_score(iris.data, iris.target, sample_weight=iris_weights)
    ]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_probas) == 10
    assert_array_almost_equal(proba, staged_probas[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights)

    predictions = clf.predict(diabetes.data)
    staged_predictions = [p for p in clf.staged_predict(diabetes.data)]
    score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
    staged_scores = [
        s
        for s in clf.staged_score(
            diabetes.data, diabetes.target, sample_weight=diabetes_weights
        )
    ]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])


def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(estimator=DecisionTreeClassifier())
    parameters = {
        "n_estimators": (1, 2),
        "estimator__max_depth": (1, 2),
    }
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0)
    parameters = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(diabetes.data, diabetes.target)


def test_pickle():
    # Check picklability.
    import pickle

    # Adaboost classifier
    obj = AdaBoostClassifier()
    obj.fit(iris.data, iris.target)
    score = obj.score(iris.data, iris.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert type(obj2) == obj.__class__
    score2 = obj2.score(iris.data, iris.target)
    assert score == score2

    # Adaboost regressor
    obj = AdaBoostRegressor(random_state=0)
    obj.fit(diabetes.data, diabetes.target)
    score = obj.score(diabetes.data, diabetes.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert type(obj2) == obj.__class__
    score2 = obj2.score(diabetes.data, diabetes.target)
    assert score == score2


def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(
        n_samples=2000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=1,
    )

    clf = AdaBoostClassifier()

    clf.fit(X, y)
    importances = clf.feature_importances_

    assert importances.shape[0] == 10
    assert (importances[:3, np.newaxis] >= importances[3:]).all()


def test_adaboost_classifier_sample_weight_error():
    # Test that a proper exception is raised for an incorrectly shaped sample_weight.
    clf = AdaBoostClassifier()
    msg = re.escape("sample_weight.shape == (1,), expected (6,)")
    with pytest.raises(ValueError, match=msg):
        clf.fit(X, y_class, sample_weight=np.asarray([-1]))


def test_estimator():
    # Test different estimators.
    from sklearn.ensemble import RandomForestClassifier

    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    clf = AdaBoostClassifier(RandomForestClassifier())
    clf.fit(X, y_regr)

    clf = AdaBoostClassifier(SVC())
    clf.fit(X, y_class)

    from sklearn.ensemble import RandomForestRegressor

    clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
    clf.fit(X, y_regr)

    clf = AdaBoostRegressor(SVR(), random_state=0)
    clf.fit(X, y_regr)

    # Check that an empty discrete ensemble fails in fit, not predict.
    X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
    y_fail = ["foo", "bar", 1, 2]
    clf = AdaBoostClassifier(SVC())
    with pytest.raises(ValueError, match="worse than random"):
        clf.fit(X_fail, y_fail)


def test_sample_weights_infinite():
    msg = "Sample weights have reached infinite values"
    clf = AdaBoostClassifier(n_estimators=30, learning_rate=23.0)
    with pytest.warns(UserWarning, match=msg):
        clf.fit(iris.data, iris.target)


@pytest.mark.parametrize(
    "sparse_container, expected_internal_type",
    zip(
        [
            *CSC_CONTAINERS,
            *CSR_CONTAINERS,
            *LIL_CONTAINERS,
            *COO_CONTAINERS,
            *DOK_CONTAINERS,
        ],
        CSC_CONTAINERS + 4 * CSR_CONTAINERS,
    ),
)
def test_sparse_classification(sparse_container, expected_internal_type):
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit carries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(
        n_classes=1, n_samples=15, n_features=5, random_state=42
    )
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)

    # Trained on sparse format
    sparse_classifier = AdaBoostClassifier(
        estimator=CustomSVC(probability=True),
        random_state=1,
    ).fit(X_train_sparse, y_train)

    # Trained on dense format
    dense_classifier = AdaBoostClassifier(
        estimator=CustomSVC(probability=True),
        random_state=1,
    ).fit(X_train, y_train)

    # predict
    sparse_clf_results = sparse_classifier.predict(X_test_sparse)
    dense_clf_results = dense_classifier.predict(X_test)
    assert_array_equal(sparse_clf_results, dense_clf_results)

    # decision_function
    sparse_clf_results = sparse_classifier.decision_function(X_test_sparse)
    dense_clf_results = dense_classifier.decision_function(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # predict_log_proba
    sparse_clf_results = sparse_classifier.predict_log_proba(X_test_sparse)
    dense_clf_results = dense_classifier.predict_log_proba(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # predict_proba
    sparse_clf_results = sparse_classifier.predict_proba(X_test_sparse)
    dense_clf_results = dense_classifier.predict_proba(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # score
    sparse_clf_results = sparse_classifier.score(X_test_sparse, y_test)
    dense_clf_results = dense_classifier.score(X_test, y_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # staged_decision_function
    sparse_clf_results = sparse_classifier.staged_decision_function(X_test_sparse)
    dense_clf_results = dense_classifier.staged_decision_function(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_almost_equal(sparse_clf_res, dense_clf_res)

    # staged_predict
    sparse_clf_results = sparse_classifier.staged_predict(X_test_sparse)
    dense_clf_results = dense_classifier.staged_predict(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_equal(sparse_clf_res, dense_clf_res)

    # staged_predict_proba
    sparse_clf_results = sparse_classifier.staged_predict_proba(X_test_sparse)
    dense_clf_results = dense_classifier.staged_predict_proba(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_almost_equal(sparse_clf_res, dense_clf_res)

    # staged_score
    sparse_clf_results = sparse_classifier.staged_score(X_test_sparse, y_test)
    dense_clf_results = dense_classifier.staged_score(X_test, y_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_equal(sparse_clf_res, dense_clf_res)

    # Verify sparsity of data is maintained during training
    types = [i.data_type_ for i in sparse_classifier.estimators_]

    assert all([t == expected_internal_type for t in types])


@pytest.mark.parametrize(
    "sparse_container, expected_internal_type",
    zip(
        [
            *CSC_CONTAINERS,
            *CSR_CONTAINERS,
            *LIL_CONTAINERS,
            *COO_CONTAINERS,
            *DOK_CONTAINERS,
        ],
        CSC_CONTAINERS + 4 * CSR_CONTAINERS,
    ),
)
def test_sparse_regression(sparse_container, expected_internal_type):
    # Check regression with sparse input.

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit carries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(
        n_samples=15, n_features=50, n_targets=1, random_state=42
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)

    # Trained on sparse format
    sparse_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
        X_train_sparse, y_train
    )

    # Trained on dense format
    dense_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
        X_train, y_train
    )

    # predict
    sparse_regr_results = sparse_regressor.predict(X_test_sparse)
    dense_regr_results = dense_regressor.predict(X_test)
    assert_array_almost_equal(sparse_regr_results, dense_regr_results)

    # staged_predict
    sparse_regr_results = sparse_regressor.staged_predict(X_test_sparse)
    dense_regr_results = dense_regressor.staged_predict(X_test)
    for sparse_regr_res, dense_regr_res in zip(sparse_regr_results, dense_regr_results):
        assert_array_almost_equal(sparse_regr_res, dense_regr_res)

    types = [i.data_type_ for i in sparse_regressor.estimators_]

    assert all([t == expected_internal_type for t in types])


def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work without sample_weights in the base estimator.
    The random weighted sampling is done internally in the _boost method in
    AdaBoostRegressor.
    """

    class DummyEstimator(BaseEstimator):
        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    boost.fit(X, y_regr)
    assert len(boost.estimator_weights_) == len(boost.estimator_errors_)


def test_multidimensional_X():
    """
    Check that the AdaBoost estimators can work with an n-dimensional
    data matrix.
    """
    rng = np.random.RandomState(0)

    X = rng.randn(51, 3, 3)
    yc = rng.choice([0, 1], 51)
    yr = rng.randn(51)

    boost = AdaBoostClassifier(DummyClassifier(strategy="most_frequent"))
    boost.fit(X, yc)
    boost.predict(X)
    boost.predict_proba(X)

    boost = AdaBoostRegressor(DummyRegressor())
    boost.fit(X, yr)
    boost.predict(X)


def test_adaboostclassifier_without_sample_weight():
    X, y = iris.data, iris.target
    estimator = NoSampleWeightWrapper(DummyClassifier())
    clf = AdaBoostClassifier(estimator=estimator)
    err_msg = "{} doesn't support sample_weight".format(estimator.__class__.__name__)
    with pytest.raises(ValueError, match=err_msg):
        clf.fit(X, y)


def test_adaboostregressor_sample_weight():
    # check that giving weights will have an influence on the error computed
    # for a weak learner
    rng = np.random.RandomState(42)
    X = np.linspace(0, 100, num=1000)
    y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
    X = X.reshape(-1, 1)

    # add an arbitrary outlier
    X[-1] *= 10
    y[-1] = 10000

    # random_state=0 ensures that the underlying bootstrap will use the outlier
    regr_no_outlier = AdaBoostRegressor(
        estimator=LinearRegression(), n_estimators=1, random_state=0
    )
    regr_with_weight = clone(regr_no_outlier)
    regr_with_outlier = clone(regr_no_outlier)

    # fit 3 models:
    # - a model containing the outlier
    # - a model without the outlier
    # - a model containing the outlier but with a null sample-weight
    regr_with_outlier.fit(X, y)
    regr_no_outlier.fit(X[:-1], y[:-1])
    sample_weight = np.ones_like(y)
    sample_weight[-1] = 0
    regr_with_weight.fit(X, y, sample_weight=sample_weight)

    score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
    score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
    score_with_weight = regr_with_weight.score(X[:-1], y[:-1])

    assert score_with_outlier < score_no_outlier
    assert score_with_outlier < score_with_weight
    assert score_no_outlier == pytest.approx(score_with_weight)


def test_adaboost_consistent_predict():
    # check that predict_proba and predict give consistent results
    # regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/14084
    X_train, X_test, y_train, y_test = train_test_split(
        *datasets.load_digits(return_X_y=True), random_state=42
    )
    model = AdaBoostClassifier(random_state=42)
    model.fit(X_train, y_train)

    assert_array_equal(
        np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test)
    )


@pytest.mark.parametrize(
    "model, X, y",
    [
        (AdaBoostClassifier(), iris.data, iris.target),
        (AdaBoostRegressor(), diabetes.data, diabetes.target),
    ],
)
def test_adaboost_negative_weight_error(model, X, y):
    sample_weight = np.ones_like(y)
    sample_weight[-1] = -10

    err_msg = "Negative values in data passed to `sample_weight`"
    with pytest.raises(ValueError, match=err_msg):
        model.fit(X, y, sample_weight=sample_weight)


def test_adaboost_numerically_stable_feature_importance_with_small_weights():
    """Check that we don't create NaN feature importances with numerically
    unstable inputs.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20320
    """
    rng = np.random.RandomState(42)
    X = rng.normal(size=(1000, 10))
    y = rng.choice([0, 1], size=1000)
    sample_weight = np.ones_like(y) * 1e-263
    tree = DecisionTreeClassifier(max_depth=10, random_state=12)
    ada_model = AdaBoostClassifier(estimator=tree, n_estimators=20, random_state=12)
    ada_model.fit(X, y, sample_weight=sample_weight)
    assert np.isnan(ada_model.feature_importances_).sum() == 0


def test_adaboost_decision_function(global_random_seed):
    """Check that the decision function respects the symmetric constraint for weak
    learners.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/26520
    """
    n_classes = 3
    X, y = datasets.make_classification(
        n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed
    )
    clf = AdaBoostClassifier(n_estimators=1, random_state=global_random_seed).fit(X, y)

    y_score = clf.decision_function(X)
    assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)

    # With a single learner, we expect to have a decision function in
    # {1, - 1 / (n_classes - 1)}.
    assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}

    # We can assert the same for staged_decision_function since we have a single learner
    for y_score in clf.staged_decision_function(X):
        assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)

        # With a single learner, we expect to have a decision function in
        # {1, - 1 / (n_classes - 1)}.
        assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}

    clf.set_params(n_estimators=5).fit(X, y)

    y_score = clf.decision_function(X)
    assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)

    for y_score in clf.staged_decision_function(X):
        assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)