@@ -0,0 +1,584 @@
import inspect
from collections import defaultdict
from functools import partial

import numpy as np
from numpy.testing import assert_array_equal

from sklearn.base import (
    BaseEstimator,
    ClassifierMixin,
    MetaEstimatorMixin,
    RegressorMixin,
    TransformerMixin,
    clone,
)
from sklearn.metrics._scorer import _Scorer, mean_squared_error
from sklearn.model_selection import BaseCrossValidator
from sklearn.model_selection._split import GroupsConsumerMixin
from sklearn.utils._metadata_requests import (
    SIMPLE_METHODS,
)
from sklearn.utils.metadata_routing import (
    MetadataRouter,
    MethodMapping,
    process_routing,
)
from sklearn.utils.multiclass import _check_partial_fit_first_call


def record_metadata(obj, record_default=True, **kwargs):
    """Utility function to store metadata passed to a method of obj.

    If record_default is False, kwargs whose values are "default" are skipped.
    This is so that checks on keyword arguments whose default was not changed
    are skipped.
    """
    stack = inspect.stack()
    callee = stack[1].function
    caller = stack[2].function
    if not hasattr(obj, "_records"):
        obj._records = defaultdict(lambda: defaultdict(list))
    if not record_default:
        kwargs = {
            key: val
            for key, val in kwargs.items()
            if not isinstance(val, str) or (val != "default")
        }
    obj._records[callee][caller].append(kwargs)


def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs):
    """Check whether the expected metadata is passed to the object's method.

    Parameters
    ----------
    obj : estimator object
        sub-estimator to check routed params for
    method : str
        sub-estimator's method where metadata is routed to, or otherwise in
        the context of metadata routing referred to as 'callee'
    parent : str
        the parent method which should have called `method`, or otherwise in
        the context of metadata routing referred to as 'caller'
    split_params : tuple, default=empty
        specifies any parameters which are to be checked as being a subset
        of the original values
    **kwargs : dict
        passed metadata
    """
    all_records = (
        getattr(obj, "_records", dict()).get(method, dict()).get(parent, list())
    )
    for record in all_records:
        # First check that the names of the metadata passed are the same as
        # expected. The names are stored as keys in `record`.
        assert set(kwargs.keys()) == set(record.keys()), (
            f"Expected {kwargs.keys()} vs {record.keys()}"
        )
        for key, value in kwargs.items():
            recorded_value = record[key]
            # The following condition is used to check for any specified
            # parameters being a subset of the original values.
            if key in split_params and recorded_value is not None:
                assert np.isin(recorded_value, value).all()
            else:
                if isinstance(recorded_value, np.ndarray):
                    assert_array_equal(recorded_value, value)
                else:
                    assert recorded_value is value, (
                        f"Expected {recorded_value} vs {value}. Method: {method}"
                    )


record_metadata_not_default = partial(record_metadata, record_default=False)
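

# A minimal usage sketch of the two helpers above (illustrative only; the
# wrapper function and data are made up). `parent` must be the name of the
# function that invoked the method, because `record_metadata` inspects the
# call stack two frames up.
def _example_record_and_check():
    reg = ConsumingRegressor()  # defined further down in this module
    weights = np.array([1.0, 2.0])
    reg.fit(np.array([[0.0], [1.0]]), np.array([0.0, 1.0]), sample_weight=weights)
    # ``fit`` was called from ``_example_record_and_check``, hence ``parent``:
    check_recorded_metadata(
        reg, method="fit", parent="_example_record_and_check", sample_weight=weights
    )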


def assert_request_is_empty(metadata_request, exclude=None):
    """Check if a metadata request dict is empty.

    One can exclude a method or a list of methods from the check using the
    ``exclude`` parameter. If metadata_request is a MetadataRouter, then
    ``exclude`` can be of the form ``{"object": [method, ...]}``.
    """
    if isinstance(metadata_request, MetadataRouter):
        for name, route_mapping in metadata_request:
            if exclude is not None and name in exclude:
                _exclude = exclude[name]
            else:
                _exclude = None
            assert_request_is_empty(route_mapping.router, exclude=_exclude)
        return

    exclude = [] if exclude is None else exclude
    for method in SIMPLE_METHODS:
        if method in exclude:
            continue
        mmr = getattr(metadata_request, method)
        props = [
            prop
            for prop, alias in mmr.requests.items()
            if isinstance(alias, str) or alias is not None
        ]
        assert not props


def assert_request_equal(request, dictionary):
    for method, requests in dictionary.items():
        mmr = getattr(request, method)
        assert mmr.requests == requests

    empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary]
    for method in empty_methods:
        assert not len(getattr(request, method).requests)


class _Registry(list):
    # This list is used to get a reference to the sub-estimators, which are not
    # necessarily stored on the metaestimator. We need to override __deepcopy__
    # because the sub-estimators are probably cloned, which would result in a
    # new copy of the list, but we need copy and deep copy both to return the
    # same instance.
    def __deepcopy__(self, memo):
        return self

    def __copy__(self):
        return self
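

# A short sketch of why ``_Registry`` must survive ``clone`` (names below are
# illustrative): ``clone`` deep-copies constructor parameters, and the tests
# need the cloned sub-estimator to register itself in the very list the test
# still holds.
def _example_registry_survives_clone():
    registry = _Registry()
    est = ConsumingRegressor(registry=registry)
    cloned = clone(est)  # params are deep-copied, but the registry keeps its identity
    assert cloned.registry is registry
    cloned.fit(np.array([[0.0]]), np.array([0.0]))
    assert cloned in registry  # the clone appended itself during fit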


class ConsumingRegressor(RegressorMixin, BaseEstimator):
    """A regressor consuming metadata.

    Parameters
    ----------
    registry : list, default=None
        If a list, the estimator will append itself to the list in order to have
        a reference to the estimator later on. Since that reference is not
        required in all tests, registration can be skipped by leaving this value
        as None.
    """

    def __init__(self, registry=None):
        self.registry = registry

    def partial_fit(self, X, y, sample_weight="default", metadata="default"):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        return self

    def fit(self, X, y, sample_weight="default", metadata="default"):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        return self

    def predict(self, X, y=None, sample_weight="default", metadata="default"):
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        return np.zeros(shape=(len(X),))

    def score(self, X, y, sample_weight="default", metadata="default"):
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        return 1


class NonConsumingClassifier(ClassifierMixin, BaseEstimator):
    """A classifier which accepts no metadata on any method."""

    def __init__(self, alpha=0.0):
        self.alpha = alpha

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.coef_ = np.ones_like(X)
        return self

    def partial_fit(self, X, y, classes=None):
        return self

    def decision_function(self, X):
        return self.predict(X)

    def predict(self, X):
        y_pred = np.empty(shape=(len(X),))
        y_pred[: len(X) // 2] = 0
        y_pred[len(X) // 2 :] = 1
        return y_pred

    def predict_proba(self, X):
        # dummy probabilities to support predict_proba
        y_proba = np.empty(shape=(len(X), len(self.classes_)), dtype=np.float32)
        # each row sums up to 1.0:
        y_proba[:] = np.random.dirichlet(alpha=np.ones(len(self.classes_)), size=len(X))
        return y_proba

    def predict_log_proba(self, X):
        # dummy probabilities to support predict_log_proba
        return self.predict_proba(X)


class NonConsumingRegressor(RegressorMixin, BaseEstimator):
    """A regressor which accepts no metadata on any method."""

    def fit(self, X, y):
        return self

    def partial_fit(self, X, y):
        return self

    def predict(self, X):
        return np.ones(len(X))  # pragma: no cover


class ConsumingClassifier(ClassifierMixin, BaseEstimator):
    """A classifier consuming metadata.

    Parameters
    ----------
    registry : list, default=None
        If a list, the estimator will append itself to the list in order to have
        a reference to the estimator later on. Since that reference is not
        required in all tests, registration can be skipped by leaving this value
        as None.

    alpha : float, default=0
        This parameter is only used to test the ``*SearchCV`` objects, and
        doesn't do anything.
    """

    def __init__(self, registry=None, alpha=0.0):
        self.alpha = alpha
        self.registry = registry

    def partial_fit(
        self, X, y, classes=None, sample_weight="default", metadata="default"
    ):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        _check_partial_fit_first_call(self, classes)
        return self

    def fit(self, X, y, sample_weight="default", metadata="default"):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )

        self.classes_ = np.unique(y)
        self.coef_ = np.ones_like(X)
        return self

    def predict(self, X, sample_weight="default", metadata="default"):
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        y_score = np.empty(shape=(len(X),), dtype="int8")
        y_score[len(X) // 2 :] = 0
        y_score[: len(X) // 2] = 1
        return y_score

    def predict_proba(self, X, sample_weight="default", metadata="default"):
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        y_proba = np.empty(shape=(len(X), len(self.classes_)), dtype=np.float32)
        # each row sums up to 1.0:
        y_proba[:] = np.random.dirichlet(alpha=np.ones(len(self.classes_)), size=len(X))
        return y_proba

    def predict_log_proba(self, X, sample_weight="default", metadata="default"):
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        return self.predict_proba(X)

    def decision_function(self, X, sample_weight="default", metadata="default"):
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        y_score = np.empty(shape=(len(X),))
        y_score[len(X) // 2 :] = 0
        y_score[: len(X) // 2] = 1
        return y_score

    def score(self, X, y, sample_weight="default", metadata="default"):
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        return 1


class ConsumingClassifierWithoutPredictProba(ConsumingClassifier):
    """ConsumingClassifier without a predict_proba method, but with predict_log_proba.

    Used to mimic dynamic method selection such as in the `_parallel_predict_proba()`
    function called by `BaggingClassifier`.
    """

    @property
    def predict_proba(self):
        raise AttributeError("This estimator does not support predict_proba")


class ConsumingClassifierWithoutPredictLogProba(ConsumingClassifier):
    """ConsumingClassifier without a predict_log_proba method, but with predict_proba.

    Used to mimic dynamic method selection such as in
    `BaggingClassifier.predict_log_proba()`.
    """

    @property
    def predict_log_proba(self):
        raise AttributeError("This estimator does not support predict_log_proba")


class ConsumingClassifierWithOnlyPredict(ConsumingClassifier):
    """ConsumingClassifier with only a predict method.

    Used to mimic dynamic method selection such as in
    `BaggingClassifier.predict_log_proba()`.
    """

    @property
    def predict_proba(self):
        raise AttributeError("This estimator does not support predict_proba")

    @property
    def predict_log_proba(self):
        raise AttributeError("This estimator does not support predict_log_proba")


class ConsumingTransformer(TransformerMixin, BaseEstimator):
    """A transformer which accepts metadata on fit and transform.

    Parameters
    ----------
    registry : list, default=None
        If a list, the estimator will append itself to the list in order to have
        a reference to the estimator later on. Since that reference is not
        required in all tests, registration can be skipped by leaving this value
        as None.
    """

    def __init__(self, registry=None):
        self.registry = registry

    def fit(self, X, y=None, sample_weight="default", metadata="default"):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        self.fitted_ = True
        return self

    def transform(self, X, sample_weight="default", metadata="default"):
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        return X + 1

    def fit_transform(self, X, y, sample_weight="default", metadata="default"):
        # Implementing ``fit_transform`` is necessary since
        # ``TransformerMixin.fit_transform`` doesn't route any metadata to
        # ``transform``, while here we want ``transform`` to receive
        # ``sample_weight`` and ``metadata``.
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        return self.fit(X, y, sample_weight=sample_weight, metadata=metadata).transform(
            X, sample_weight=sample_weight, metadata=metadata
        )

    def inverse_transform(self, X, sample_weight=None, metadata=None):
        record_metadata_not_default(
            self, sample_weight=sample_weight, metadata=metadata
        )
        return X - 1


class ConsumingNoFitTransformTransformer(BaseEstimator):
    """A metadata consuming transformer that doesn't inherit from
    TransformerMixin, and thus doesn't implement `fit_transform`. Note that
    TransformerMixin's `fit_transform` doesn't route metadata to `transform`."""

    def __init__(self, registry=None):
        self.registry = registry

    def fit(self, X, y=None, sample_weight=None, metadata=None):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata(self, sample_weight=sample_weight, metadata=metadata)

        return self

    def transform(self, X, sample_weight=None, metadata=None):
        record_metadata(self, sample_weight=sample_weight, metadata=metadata)
        return X


class ConsumingScorer(_Scorer):
    def __init__(self, registry=None):
        super().__init__(
            score_func=mean_squared_error, sign=1, kwargs={}, response_method="predict"
        )
        self.registry = registry

    def _score(self, method_caller, clf, X, y, **kwargs):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata_not_default(self, **kwargs)

        sample_weight = kwargs.get("sample_weight", None)
        return super()._score(method_caller, clf, X, y, sample_weight=sample_weight)


class ConsumingSplitter(GroupsConsumerMixin, BaseCrossValidator):
    def __init__(self, registry=None):
        self.registry = registry

    def split(self, X, y=None, groups="default", metadata="default"):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata_not_default(self, groups=groups, metadata=metadata)

        split_index = len(X) // 2
        train_indices = list(range(0, split_index))
        test_indices = list(range(split_index, len(X)))
        yield test_indices, train_indices
        yield train_indices, test_indices

    def get_n_splits(self, X=None, y=None, groups=None, metadata=None):
        return 2

    def _iter_test_indices(self, X=None, y=None, groups=None):
        split_index = len(X) // 2
        train_indices = list(range(0, split_index))
        test_indices = list(range(split_index, len(X)))
        yield test_indices
        yield train_indices
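

# A hedged sketch of how such a splitter consumes routed metadata (the exact
# routing calls below follow scikit-learn's public routing API as an
# assumption of this sketch, not something exercised in this module):
def _example_split_with_routed_groups():
    from sklearn import config_context
    from sklearn.model_selection import cross_validate

    X = np.arange(20).reshape(10, 2)
    y = np.arange(10) % 2
    groups = np.arange(10) // 5
    with config_context(enable_metadata_routing=True):
        # ``groups`` is routed to ``ConsumingSplitter.split`` because
        # ``GroupsConsumerMixin`` requests it.
        cross_validate(
            NonConsumingClassifier(),
            X,
            y,
            cv=ConsumingSplitter(),
            params={"groups": groups},
        )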


class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator):
    """A meta-regressor which is only a router."""

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y, **fit_params):
        params = process_routing(self, "fit", **fit_params)
        self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit)
        return self

    def get_metadata_routing(self):
        router = MetadataRouter(owner=self).add(
            estimator=self.estimator,
            method_mapping=MethodMapping().add(caller="fit", callee="fit"),
        )
        return router
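

# A minimal routing sketch for the router above (illustrative): metadata only
# reaches the sub-estimator if the sub-estimator explicitly requests it, since
# routing is request-based rather than signature-based.
def _example_route_sample_weight():
    from sklearn import config_context

    with config_context(enable_metadata_routing=True):
        inner = ConsumingRegressor().set_fit_request(sample_weight=True)
        meta = MetaRegressor(estimator=inner)
        meta.fit(
            np.array([[0.0], [1.0]]),
            np.array([0.0, 1.0]),
            sample_weight=np.array([0.5, 0.5]),
        )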


class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator):
    """A meta-regressor which is also a consumer."""

    def __init__(self, estimator, registry=None):
        self.estimator = estimator
        self.registry = registry

    def fit(self, X, y, sample_weight=None, **fit_params):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata(self, sample_weight=sample_weight)
        params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params)
        self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit)
        return self

    def predict(self, X, **predict_params):
        params = process_routing(self, "predict", **predict_params)
        return self.estimator_.predict(X, **params.estimator.predict)

    def get_metadata_routing(self):
        router = (
            MetadataRouter(owner=self)
            .add_self_request(self)
            .add(
                estimator=self.estimator,
                method_mapping=MethodMapping()
                .add(caller="fit", callee="fit")
                .add(caller="predict", callee="predict"),
            )
        )
        return router


class WeightedMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
    """A meta-estimator which also consumes sample_weight itself in ``fit``."""

    def __init__(self, estimator, registry=None):
        self.estimator = estimator
        self.registry = registry

    def fit(self, X, y, sample_weight=None, **kwargs):
        if self.registry is not None:
            self.registry.append(self)

        record_metadata(self, sample_weight=sample_weight)
        params = process_routing(self, "fit", sample_weight=sample_weight, **kwargs)
        self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit)
        return self

    def get_metadata_routing(self):
        router = (
            MetadataRouter(owner=self)
            .add_self_request(self)
            .add(
                estimator=self.estimator,
                method_mapping=MethodMapping().add(caller="fit", callee="fit"),
            )
        )
        return router


class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator):
    """A simple meta-transformer."""

    def __init__(self, transformer):
        self.transformer = transformer

    def fit(self, X, y=None, **fit_params):
        params = process_routing(self, "fit", **fit_params)
        self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit)
        return self

    def transform(self, X, y=None, **transform_params):
        params = process_routing(self, "transform", **transform_params)
        return self.transformer_.transform(X, **params.transformer.transform)

    def get_metadata_routing(self):
        return MetadataRouter(owner=self).add(
            transformer=self.transformer,
            method_mapping=MethodMapping()
            .add(caller="fit", callee="fit")
            .add(caller="transform", callee="transform"),
        )
File diff suppressed because it is too large
@@ -0,0 +1,34 @@
import os
import textwrap

import pytest

from sklearn import __version__
from sklearn.utils._openmp_helpers import _openmp_parallelism_enabled


def test_openmp_parallelism_enabled():
    # Check that sklearn is built with OpenMP-based parallelism enabled.
    # This test can be skipped by setting the environment variable
    # ``SKLEARN_SKIP_OPENMP_TEST``.
    if os.getenv("SKLEARN_SKIP_OPENMP_TEST"):
        pytest.skip("test explicitly skipped (SKLEARN_SKIP_OPENMP_TEST)")

    base_url = "dev" if __version__.endswith(".dev0") else "stable"
    err_msg = textwrap.dedent(
        """
        This test fails because scikit-learn has been built without OpenMP.
        This is not recommended since some estimators will run in sequential
        mode instead of leveraging thread-based parallelism.

        You can find instructions to build scikit-learn with OpenMP at this
        address:

        https://scikit-learn.org/{}/developers/advanced_installation.html

        You can skip this test by setting the environment variable
        SKLEARN_SKIP_OPENMP_TEST to any value.
        """
    ).format(base_url)

    assert _openmp_parallelism_enabled(), err_msg
File diff suppressed because it is too large
@@ -0,0 +1,15 @@
"""
Smoke Test the check_build module
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import pytest

from sklearn.__check_build import raise_build_error


def test_raise_build_error():
    with pytest.raises(ImportError):
        raise_build_error(ImportError())
@@ -0,0 +1,401 @@
"""
General tests for all estimators in sklearn.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import os
import pkgutil
import re
import warnings
from functools import partial
from itertools import chain

import pytest
from scipy.linalg import LinAlgWarning

import sklearn
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning

# make it possible to discover experimental estimators when calling `all_estimators`
from sklearn.experimental import (
    enable_halving_search_cv,  # noqa: F401
    enable_iterative_imputer,  # noqa: F401
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    MinMaxScaler,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.utils import all_estimators
from sklearn.utils._test_common.instance_generator import (
    _get_check_estimator_ids,
    _get_expected_failed_checks,
    _tested_estimators,
    _yield_instances_for_check,
)
from sklearn.utils._testing import (
    SkipTest,
    ignore_warnings,
)
from sklearn.utils.estimator_checks import (
    check_dataframe_column_names_consistency,
    check_estimator,
    check_get_feature_names_out_error,
    check_global_output_transform_pandas,
    check_global_set_output_transform_polars,
    check_inplace_ensure_writeable,
    check_param_validation,
    check_set_output_transform,
    check_set_output_transform_pandas,
    check_set_output_transform_polars,
    check_transformer_get_feature_names_out,
    check_transformer_get_feature_names_out_pandas,
    parametrize_with_checks,
)


@pytest.mark.thread_unsafe  # import side-effects
def test_all_estimator_no_base_class():
    # Test that all_estimators doesn't find abstract classes.
    for name, Estimator in all_estimators():
        msg = (
            "Base estimators such as {0} should not be included in all_estimators"
        ).format(name)
        assert not name.lower().startswith("base"), msg


def _sample_func(x, y=1):
    pass


class CallableEstimator(BaseEstimator):
    """Dummy development stub for an estimator.

    This is to make sure a callable estimator passes common tests.
    """

    def __call__(self):
        pass  # pragma: nocover


@pytest.mark.parametrize(
    "val, expected",
    [
        (partial(_sample_func, y=1), "_sample_func(y=1)"),
        (_sample_func, "_sample_func"),
        (partial(_sample_func, "world"), "_sample_func"),
        (LogisticRegression(C=2.0), "LogisticRegression(C=2.0)"),
        (
            LogisticRegression(
                random_state=1,
                solver="newton-cg",
                class_weight="balanced",
                warm_start=True,
            ),
            (
                "LogisticRegression(class_weight='balanced',random_state=1,"
                "solver='newton-cg',warm_start=True)"
            ),
        ),
        (CallableEstimator(), "CallableEstimator()"),
    ],
)
def test_get_check_estimator_ids(val, expected):
    assert _get_check_estimator_ids(val) == expected


@parametrize_with_checks(
    list(_tested_estimators()), expected_failed_checks=_get_expected_failed_checks
)
def test_estimators(estimator, check, request):
    # Common tests for estimator instances
    with ignore_warnings(
        category=(FutureWarning, ConvergenceWarning, UserWarning, LinAlgWarning)
    ):
        check(estimator)
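

# The same decorator is the entry point scikit-learn suggests for third-party
# estimators; a hedged sketch (``MyCustomRegressor`` is hypothetical), kept as
# a comment so it is not collected here:
#
#     @parametrize_with_checks([MyCustomRegressor()])
#     def test_sklearn_compatible_estimator(estimator, check):
#         check(estimator)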


@pytest.mark.filterwarnings(
    "ignore:Since version 1.0, it is not needed to import "
    "enable_hist_gradient_boosting anymore"
)
@pytest.mark.thread_unsafe  # import side-effects
def test_import_all_consistency():
    sklearn_path = [os.path.dirname(sklearn.__file__)]
    # Smoke test to check that any name in a __all__ list is actually defined
    # in the namespace of the module or package.
    pkgs = pkgutil.walk_packages(
        path=sklearn_path, prefix="sklearn.", onerror=lambda _: None
    )
    submods = [modname for _, modname, _ in pkgs]
    for modname in submods + ["sklearn"]:
        if ".tests." in modname or "sklearn.externals" in modname:
            continue
        # Avoid test suite depending on build dependencies, for example Cython
        if "sklearn._build_utils" in modname:
            continue
        package = __import__(modname, fromlist="dummy")
        for name in getattr(package, "__all__", ()):
            assert hasattr(package, name), "Module '{0}' has no attribute '{1}'".format(
                modname, name
            )


def test_root_import_all_completeness():
    sklearn_path = [os.path.dirname(sklearn.__file__)]
    EXCEPTIONS = ("utils", "tests", "base", "conftest")
    for _, modname, _ in pkgutil.walk_packages(
        path=sklearn_path, onerror=lambda _: None
    ):
        if "." in modname or modname.startswith("_") or modname in EXCEPTIONS:
            continue
        assert modname in sklearn.__all__


@pytest.mark.thread_unsafe  # import side-effects
def test_all_tests_are_importable():
    # Ensure that for each contentful subpackage, there is a test directory
    # within it that is also a subpackage (i.e. a directory with __init__.py)

    HAS_TESTS_EXCEPTIONS = re.compile(
        r"""(?x)
        \.externals(\.|$)|
        \.tests(\.|$)|
        \._
        """
    )
    resource_modules = {
        "sklearn.datasets.data",
        "sklearn.datasets.descr",
        "sklearn.datasets.images",
    }
    sklearn_path = [os.path.dirname(sklearn.__file__)]
    lookup = {
        name: ispkg
        for _, name, ispkg in pkgutil.walk_packages(sklearn_path, prefix="sklearn.")
    }
    missing_tests = [
        name
        for name, ispkg in lookup.items()
        if ispkg
        and name not in resource_modules
        and not HAS_TESTS_EXCEPTIONS.search(name)
        and name + ".tests" not in lookup
    ]
    assert missing_tests == [], (
        "{0} do not have `tests` subpackages. "
        "Perhaps they require "
        "__init__.py or a meson.build "
        "in the parent "
        "directory".format(missing_tests)
    )


def test_class_support_removed():
    # Make sure passing classes to check_estimator or parametrize_with_checks
    # raises an error

    msg = "Passing a class was deprecated.* isn't supported anymore"
    with pytest.raises(TypeError, match=msg):
        check_estimator(LogisticRegression)

    with pytest.raises(TypeError, match=msg):
        parametrize_with_checks([LogisticRegression])


def _estimators_that_predict_in_fit():
    for estimator in _tested_estimators():
        est_params = set(estimator.get_params())
        if "oob_score" in est_params:
            yield estimator.set_params(oob_score=True, bootstrap=True)
        elif "early_stopping" in est_params:
            est = estimator.set_params(early_stopping=True, n_iter_no_change=1)
            if est.__class__.__name__ in {"MLPClassifier", "MLPRegressor"}:
                # TODO: fix MLP so that it does not validate on a held-out set
                # during fit
                yield pytest.param(
                    est, marks=pytest.mark.xfail(reason="MLP still validates in fit")
                )
            else:
                yield est
        elif "n_iter_no_change" in est_params:
            yield estimator.set_params(n_iter_no_change=1)


# NOTE: When running `check_dataframe_column_names_consistency` on a meta-estimator that
# delegates validation to a base estimator, the check is testing that the base estimator
# is checking for column name consistency.
column_name_estimators = list(
    chain(
        _tested_estimators(),
        [make_pipeline(LogisticRegression(C=1))],
        _estimators_that_predict_in_fit(),
    )
)


@pytest.mark.parametrize(
    "estimator_orig", column_name_estimators, ids=_get_check_estimator_ids
)
def test_pandas_column_name_consistency(estimator_orig):
    if isinstance(estimator_orig, ColumnTransformer):
        pytest.skip("ColumnTransformer is not tested here")
    if "check_dataframe_column_names_consistency" in _get_expected_failed_checks(
        estimator_orig
    ):
        pytest.skip(
            "Estimator does not support check_dataframe_column_names_consistency"
        )
    for estimator in _yield_instances_for_check(
        check_dataframe_column_names_consistency, estimator_orig
    ):
        with ignore_warnings(category=(FutureWarning)):
            with warnings.catch_warnings(record=True) as record:
                check_dataframe_column_names_consistency(
                    estimator.__class__.__name__, estimator
                )
            for warning in record:
                assert "was fitted without feature names" not in str(warning.message)


# TODO: As more modules support get_feature_names_out they should be removed
# from this list to be tested
GET_FEATURES_OUT_MODULES_TO_IGNORE = [
    "ensemble",
    "kernel_approximation",
]


def _include_in_get_feature_names_out_check(transformer):
    if hasattr(transformer, "get_feature_names_out"):
        return True
    module = transformer.__module__.split(".")[1]
    return module not in GET_FEATURES_OUT_MODULES_TO_IGNORE


GET_FEATURES_OUT_ESTIMATORS = [
    est
    for est in _tested_estimators("transformer")
    if _include_in_get_feature_names_out_check(est)
]


@pytest.mark.parametrize(
    "transformer", GET_FEATURES_OUT_ESTIMATORS, ids=_get_check_estimator_ids
)
def test_transformers_get_feature_names_out(transformer):
    with ignore_warnings(category=(FutureWarning)):
        check_transformer_get_feature_names_out(
            transformer.__class__.__name__, transformer
        )
        check_transformer_get_feature_names_out_pandas(
            transformer.__class__.__name__, transformer
        )


ESTIMATORS_WITH_GET_FEATURE_NAMES_OUT = [
    est for est in _tested_estimators() if hasattr(est, "get_feature_names_out")
]


@pytest.mark.parametrize(
    "estimator", ESTIMATORS_WITH_GET_FEATURE_NAMES_OUT, ids=_get_check_estimator_ids
)
def test_estimators_get_feature_names_out_error(estimator):
    estimator_name = estimator.__class__.__name__
    check_get_feature_names_out_error(estimator_name, estimator)


@pytest.mark.parametrize(
    "estimator", list(_tested_estimators()), ids=_get_check_estimator_ids
)
def test_check_param_validation(estimator):
    if isinstance(estimator, FeatureUnion):
        pytest.skip("FeatureUnion is not tested here")
    name = estimator.__class__.__name__
    check_param_validation(name, estimator)


SET_OUTPUT_ESTIMATORS = list(
    chain(
        _tested_estimators("transformer"),
        [
            make_pipeline(StandardScaler(), MinMaxScaler()),
            OneHotEncoder(sparse_output=False),
            FunctionTransformer(feature_names_out="one-to-one"),
        ],
    )
)


@pytest.mark.parametrize(
    "estimator_orig", SET_OUTPUT_ESTIMATORS, ids=_get_check_estimator_ids
)
def test_set_output_transform(estimator_orig):
    name = estimator_orig.__class__.__name__
    if not hasattr(estimator_orig, "set_output"):
        pytest.skip(
            f"Skipping check_set_output_transform for {name}: Does not support"
            " set_output API"
        )
    for estimator in _yield_instances_for_check(
        check_set_output_transform, estimator_orig
    ):
        with ignore_warnings(category=(FutureWarning)):
            check_set_output_transform(estimator.__class__.__name__, estimator)


@pytest.mark.parametrize(
    "estimator_orig", SET_OUTPUT_ESTIMATORS, ids=_get_check_estimator_ids
)
@pytest.mark.parametrize(
    "check_func",
    [
        check_set_output_transform_pandas,
        check_global_output_transform_pandas,
        check_set_output_transform_polars,
        check_global_set_output_transform_polars,
    ],
)
def test_set_output_transform_configured(estimator_orig, check_func):
    name = estimator_orig.__class__.__name__
    if not hasattr(estimator_orig, "set_output"):
        pytest.skip(
            f"Skipping {check_func.__name__} for {name}: Does not support"
            " set_output API yet"
        )
    for estimator in _yield_instances_for_check(check_func, estimator_orig):
        with ignore_warnings(category=(FutureWarning)):
            check_func(estimator.__class__.__name__, estimator)


@pytest.mark.parametrize(
    "estimator", _tested_estimators(), ids=_get_check_estimator_ids
)
def test_check_inplace_ensure_writeable(estimator):
    name = estimator.__class__.__name__

    if hasattr(estimator, "copy"):
        estimator.set_params(copy=False)
    elif hasattr(estimator, "copy_X"):
        estimator.set_params(copy_X=False)
    else:
        raise SkipTest(f"{name} doesn't require writeable input.")

    # The following estimators can work inplace only with certain settings
    if name == "HDBSCAN":
        estimator.set_params(metric="precomputed", algorithm="brute")

    if name == "PCA":
        estimator.set_params(svd_solver="full")

    if name == "KernelPCA":
        estimator.set_params(kernel="precomputed")

    check_inplace_ensure_writeable(name, estimator)
@@ -0,0 +1,168 @@
import time
from concurrent.futures import ThreadPoolExecutor

import pytest

import sklearn
from sklearn import config_context, get_config, set_config
from sklearn.utils.fixes import _IS_WASM
from sklearn.utils.parallel import Parallel, delayed


def test_config_context():
    assert get_config() == {
        "assume_finite": False,
        "working_memory": 1024,
        "print_changed_only": True,
        "display": "diagram",
        "array_api_dispatch": False,
        "pairwise_dist_chunk_size": 256,
        "enable_cython_pairwise_dist": True,
        "transform_output": "default",
        "enable_metadata_routing": False,
        "skip_parameter_validation": False,
    }

    # Not using as a context manager affects nothing
    config_context(assume_finite=True)
    assert get_config()["assume_finite"] is False

    with config_context(assume_finite=True):
        assert get_config() == {
            "assume_finite": True,
            "working_memory": 1024,
            "print_changed_only": True,
            "display": "diagram",
            "array_api_dispatch": False,
            "pairwise_dist_chunk_size": 256,
            "enable_cython_pairwise_dist": True,
            "transform_output": "default",
            "enable_metadata_routing": False,
            "skip_parameter_validation": False,
        }
    assert get_config()["assume_finite"] is False

    with config_context(assume_finite=True):
        with config_context(assume_finite=None):
            assert get_config()["assume_finite"] is True

        assert get_config()["assume_finite"] is True

        with config_context(assume_finite=False):
            assert get_config()["assume_finite"] is False

            with config_context(assume_finite=None):
                assert get_config()["assume_finite"] is False

                # global setting will not be retained outside of context that
                # did not modify this setting
                set_config(assume_finite=True)
                assert get_config()["assume_finite"] is True

            assert get_config()["assume_finite"] is False

        assert get_config()["assume_finite"] is True

    assert get_config() == {
        "assume_finite": False,
        "working_memory": 1024,
        "print_changed_only": True,
        "display": "diagram",
        "array_api_dispatch": False,
        "pairwise_dist_chunk_size": 256,
        "enable_cython_pairwise_dist": True,
        "transform_output": "default",
        "enable_metadata_routing": False,
        "skip_parameter_validation": False,
    }

    # No positional arguments
    with pytest.raises(TypeError):
        config_context(True)

    # No unknown arguments
    with pytest.raises(TypeError):
        config_context(do_something_else=True).__enter__()


def test_config_context_exception():
    assert get_config()["assume_finite"] is False
    try:
        with config_context(assume_finite=True):
            assert get_config()["assume_finite"] is True
            raise ValueError()
    except ValueError:
        pass
    assert get_config()["assume_finite"] is False


def test_set_config():
    assert get_config()["assume_finite"] is False
    set_config(assume_finite=None)
    assert get_config()["assume_finite"] is False
    set_config(assume_finite=True)
    assert get_config()["assume_finite"] is True
    set_config(assume_finite=None)
    assert get_config()["assume_finite"] is True
    set_config(assume_finite=False)
    assert get_config()["assume_finite"] is False

    # No unknown arguments
    with pytest.raises(TypeError):
        set_config(do_something_else=True)


def set_assume_finite(assume_finite, sleep_duration):
    """Return the value of assume_finite after waiting `sleep_duration`."""
    with config_context(assume_finite=assume_finite):
        time.sleep(sleep_duration)
        return get_config()["assume_finite"]


@pytest.mark.parametrize("backend", ["loky", "multiprocessing", "threading"])
def test_config_threadsafe_joblib(backend):
    """Test that the global config is threadsafe with all joblib backends.

    Two jobs are spawned and set assume_finite to two different values.
    When the job with a duration of 0.1s completes, the assume_finite value
    it reports should be the same as the value passed to the function. In
    other words, it is not influenced by the other job setting assume_finite
    to True.
    """
    assume_finites = [False, True, False, True]
    sleep_durations = [0.1, 0.2, 0.1, 0.2]

    items = Parallel(backend=backend, n_jobs=2)(
        delayed(set_assume_finite)(assume_finite, sleep_dur)
        for assume_finite, sleep_dur in zip(assume_finites, sleep_durations)
    )

    assert items == [False, True, False, True]


@pytest.mark.xfail(_IS_WASM, reason="cannot start threads")
def test_config_threadsafe():
    """Use threads directly to test that the global config does not change
    between threads. Same test as `test_config_threadsafe_joblib` but with
    `ThreadPoolExecutor`."""

    assume_finites = [False, True, False, True]
    sleep_durations = [0.1, 0.2, 0.1, 0.2]

    with ThreadPoolExecutor(max_workers=2) as e:
        items = [
            output
            for output in e.map(set_assume_finite, assume_finites, sleep_durations)
        ]

    assert items == [False, True, False, True]


def test_config_array_api_dispatch_error_scipy(monkeypatch):
    """Check error when SciPy is too old"""
    monkeypatch.setattr(sklearn.utils._array_api.scipy, "__version__", "1.13.0")

    with pytest.raises(ImportError, match="SciPy must be 1.14.0 or newer"):
        with config_context(array_api_dispatch=True):
            pass

    with pytest.raises(ImportError, match="SciPy must be 1.14.0 or newer"):
        set_config(array_api_dispatch=True)
@@ -0,0 +1,844 @@
import warnings

import numpy as np
import pytest
from scipy import linalg

from sklearn.cluster import KMeans
from sklearn.covariance import LedoitWolf, ShrunkCovariance, ledoit_wolf
from sklearn.datasets import make_blobs
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
    _cov,
)
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
    _convert_container,
    assert_allclose,
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
)

# Data is just 6 separable points in the plane
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype="f")
y = np.array([1, 1, 1, 2, 2, 2])
y3 = np.array([1, 1, 2, 2, 3, 3])

# Degenerate data with only one feature (still should be separable)
X1 = np.array(
    [[-2], [-1], [-1], [1], [1], [2]],
    dtype="f",
)

# Data is just 9 separable points in the plane
X6 = np.array(
    [[0, 0], [-2, -2], [-2, -1], [-1, -1], [-1, -2], [1, 3], [1, 2], [2, 1], [2, 2]]
)
y6 = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2])
y7 = np.array([1, 2, 3, 2, 3, 1, 2, 3, 1])

# Degenerate data with 1 feature (still should be separable)
X7 = np.array([[-3], [-2], [-1], [-1], [0], [1], [1], [2], [3]])

# Data that has zero variance in one dimension and needs regularization
X2 = np.array(
    [[-3, 0], [-2, 0], [-1, 0], [-1, 0], [0, 0], [1, 0], [1, 0], [2, 0], [3, 0]]
)

# One element class
y4 = np.array([1, 1, 1, 1, 1, 1, 1, 1, 2])

solver_shrinkage = [
    ("svd", None),
    ("lsqr", None),
    ("eigen", None),
    ("lsqr", "auto"),
    ("lsqr", 0),
    ("lsqr", 0.43),
    ("eigen", "auto"),
    ("eigen", 0),
    ("eigen", 0.43),
]


def test_lda_predict():
    # Test LDA classification.
    # This checks that LDA implements fit and predict and returns correct
    # values for simple toy data.
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, "solver %s" % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, "solver %s" % solver)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, "solver %s" % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_allclose(
            np.exp(y_log_proba_pred1),
            y_proba_pred1,
            rtol=1e-6,
            atol=1e-6,
            err_msg="solver %s" % solver,
        )

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert np.any(y_pred3 != y3), "solver %s" % solver

    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    with pytest.raises(NotImplementedError):
        clf.fit(X, y)

    clf = LinearDiscriminantAnalysis(
        solver="lsqr", shrinkage=0.1, covariance_estimator=ShrunkCovariance()
    )
    with pytest.raises(
        ValueError,
        match=(
            "covariance_estimator and shrinkage "
            "parameters are not None. "
            "Only one of the two can be set."
        ),
    ):
        clf.fit(X, y)

    # test bad solver with covariance_estimator
    clf = LinearDiscriminantAnalysis(solver="svd", covariance_estimator=LedoitWolf())
    with pytest.raises(
        ValueError, match="covariance estimator is not supported with svd"
    ):
        clf.fit(X, y)

    # test bad covariance estimator
    clf = LinearDiscriminantAnalysis(
        solver="lsqr", covariance_estimator=KMeans(n_clusters=2, n_init="auto")
    )
    with pytest.raises(ValueError):
        clf.fit(X, y)


@pytest.mark.parametrize("n_classes", [2, 3])
@pytest.mark.parametrize("solver", ["svd", "lsqr", "eigen"])
def test_lda_predict_proba(solver, n_classes):
    def generate_dataset(n_samples, centers, covariances, random_state=None):
        """Generate multivariate normal data given some centers and
        covariances."""
        rng = check_random_state(random_state)
        X = np.vstack(
            [
                rng.multivariate_normal(mean, cov, size=n_samples // len(centers))
                for mean, cov in zip(centers, covariances)
            ]
        )
        y = np.hstack(
            [[clazz] * (n_samples // len(centers)) for clazz in range(len(centers))]
        )
        return X, y

    blob_centers = np.array([[0, 0], [-10, 40], [-30, 30]])[:n_classes]
    blob_stds = np.array([[[10, 10], [10, 100]]] * len(blob_centers))
    X, y = generate_dataset(
        n_samples=90000, centers=blob_centers, covariances=blob_stds, random_state=42
    )
    lda = LinearDiscriminantAnalysis(
        solver=solver, store_covariance=True, shrinkage=None
    ).fit(X, y)
    # check that the empirical means and covariances are close enough to the
    # ones used to generate the data
    assert_allclose(lda.means_, blob_centers, atol=1e-1)
    assert_allclose(lda.covariance_, blob_stds[0], atol=1)

    # implement the method to compute the probability given in The Elements
    # of Statistical Learning (cf. p.127, Sect. 4.4.5 "Logistic Regression
    # or LDA?")
    precision = linalg.inv(blob_stds[0])
    alpha_k = []
    alpha_k_0 = []
    for clazz in range(len(blob_centers) - 1):
        alpha_k.append(
            np.dot(precision, (blob_centers[clazz] - blob_centers[-1])[:, np.newaxis])
        )
        alpha_k_0.append(
            np.dot(
                -0.5 * (blob_centers[clazz] + blob_centers[-1])[np.newaxis, :],
                alpha_k[-1],
            )
        )

    sample = np.array([[-22, 22]])

    def discriminant_func(sample, coef, intercept, clazz):
        return np.exp(intercept[clazz] + np.dot(sample, coef[clazz])).item()

    prob = np.array(
        [
            float(
                discriminant_func(sample, alpha_k, alpha_k_0, clazz)
                / (
                    1
                    + sum(
                        [
                            discriminant_func(sample, alpha_k, alpha_k_0, clazz)
                            for clazz in range(n_classes - 1)
                        ]
                    )
                )
            )
            for clazz in range(n_classes - 1)
        ]
    )

    prob_ref = 1 - np.sum(prob)

    # check the consistency of the computed probability
    # all probabilities should sum to one
    prob_ref_2 = float(
        1
        / (
            1
            + sum(
                [
                    discriminant_func(sample, alpha_k, alpha_k_0, clazz)
                    for clazz in range(n_classes - 1)
                ]
            )
        )
    )

    assert prob_ref == pytest.approx(prob_ref_2)
    # check that the probabilities of LDA are close to the theoretical
    # probabilities
    assert_allclose(
        lda.predict_proba(sample), np.hstack([prob, prob_ref])[np.newaxis], atol=1e-2
    )
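

# For reference, the closed form reconstructed in the test above (ESL,
# Sect. 4.4.5), with shared covariance Sigma, class means mu_k, class K as the
# reference class, and equal priors (which cancel the log prior ratio):
#
#   a_k  = Sigma^{-1} (mu_k - mu_K)
#   a_k0 = -(1/2) (mu_k + mu_K)^T a_k
#   P(G = k | x) = exp(a_k0 + x^T a_k) / (1 + sum_{l < K} exp(a_l0 + x^T a_l))
#
# and the reference class gets P(G = K | x) = 1 / (1 + sum_{l < K} exp(...)),
# which is exactly ``prob_ref_2`` above.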
|
||||
|
||||
def test_lda_priors():
|
||||
# Test priors (negative priors)
|
||||
priors = np.array([0.5, -0.5])
|
||||
clf = LinearDiscriminantAnalysis(priors=priors)
|
||||
msg = "priors must be non-negative"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
clf.fit(X, y)
|
||||
|
||||
# Test that priors passed as a list are correctly handled (run to see if
|
||||
# failure)
|
||||
clf = LinearDiscriminantAnalysis(priors=[0.5, 0.5])
|
||||
clf.fit(X, y)
|
||||
|
||||
# Test that priors always sum to 1
|
||||
priors = np.array([0.5, 0.6])
|
||||
prior_norm = np.array([0.45, 0.55])
|
||||
clf = LinearDiscriminantAnalysis(priors=priors)
|
||||
|
||||
with pytest.warns(UserWarning):
|
||||
clf.fit(X, y)
|
||||
|
||||
assert_array_almost_equal(clf.priors_, prior_norm, 2)
|
||||
|
||||
|
||||
def test_lda_coefs():
|
||||
# Test if the coefficients of the solvers are approximately the same.
|
||||
n_features = 2
|
||||
n_classes = 2
|
||||
n_samples = 1000
|
||||
X, y = make_blobs(
|
||||
n_samples=n_samples, n_features=n_features, centers=n_classes, random_state=11
|
||||
)
|
||||
|
||||
clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
|
||||
clf_lda_lsqr = LinearDiscriminantAnalysis(solver="lsqr")
|
||||
clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
|
||||
|
||||
clf_lda_svd.fit(X, y)
|
||||
clf_lda_lsqr.fit(X, y)
|
||||
clf_lda_eigen.fit(X, y)
|
||||
|
||||
assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_lsqr.coef_, 1)
|
||||
assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_eigen.coef_, 1)
|
||||
assert_array_almost_equal(clf_lda_eigen.coef_, clf_lda_lsqr.coef_, 1)
|
||||
|
||||
|
||||
def test_lda_transform():
|
||||
# Test LDA transform.
|
||||
clf = LinearDiscriminantAnalysis(solver="svd", n_components=1)
|
||||
X_transformed = clf.fit(X, y).transform(X)
|
||||
assert X_transformed.shape[1] == 1
|
||||
clf = LinearDiscriminantAnalysis(solver="eigen", n_components=1)
|
||||
X_transformed = clf.fit(X, y).transform(X)
|
||||
assert X_transformed.shape[1] == 1
|
||||
|
||||
clf = LinearDiscriminantAnalysis(solver="lsqr", n_components=1)
|
||||
clf.fit(X, y)
|
||||
msg = "transform not implemented for 'lsqr'"
|
||||
|
||||
with pytest.raises(NotImplementedError, match=msg):
|
||||
clf.transform(X)
|
||||
|
||||
|
||||
def test_lda_explained_variance_ratio():
|
||||
# Test if the sum of the normalized eigen vectors values equals 1,
|
||||
# Also tests whether the explained_variance_ratio_ formed by the
|
||||
# eigen solver is the same as the explained_variance_ratio_ formed
|
||||
# by the svd solver
|
||||
|
||||
state = np.random.RandomState(0)
|
||||
X = state.normal(loc=0, scale=100, size=(40, 20))
|
||||
y = state.randint(0, 3, size=(40,))
|
||||
|
||||
clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
|
||||
clf_lda_eigen.fit(X, y)
|
||||
assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)
|
||||
assert clf_lda_eigen.explained_variance_ratio_.shape == (2,), (
|
||||
"Unexpected length for explained_variance_ratio_"
|
||||
)
|
||||
|
||||
clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
|
||||
clf_lda_svd.fit(X, y)
|
||||
assert_almost_equal(clf_lda_svd.explained_variance_ratio_.sum(), 1.0, 3)
|
||||
assert clf_lda_svd.explained_variance_ratio_.shape == (2,), (
|
||||
"Unexpected length for explained_variance_ratio_"
|
||||
)
|
||||
|
||||
assert_array_almost_equal(
|
||||
clf_lda_svd.explained_variance_ratio_, clf_lda_eigen.explained_variance_ratio_
|
||||
)
|
||||
|
||||
|
||||


def test_lda_orthogonality():
    # Arrange four classes with their means in a kite-shaped pattern: the
    # longer distance should be mapped to the first component, and the
    # shorter distance to the second component.
    means = np.array([[0, 0, -1], [0, 2, 0], [0, -2, 0], [0, 0, 5]])

    # We construct perfectly symmetric distributions, so the LDA can estimate
    # precise means.
    scatter = np.array(
        [
            [0.1, 0, 0],
            [-0.1, 0, 0],
            [0, 0.1, 0],
            [0, -0.1, 0],
            [0, 0, 0.1],
            [0, 0, -0.1],
        ]
    )

    X = (means[:, np.newaxis, :] + scatter[np.newaxis, :, :]).reshape((-1, 3))
    y = np.repeat(np.arange(means.shape[0]), scatter.shape[0])

    # Fit LDA and transform the means
    clf = LinearDiscriminantAnalysis(solver="svd").fit(X, y)
    means_transformed = clf.transform(means)

    d1 = means_transformed[3] - means_transformed[0]
    d2 = means_transformed[2] - means_transformed[1]
    d1 /= np.sqrt(np.sum(d1**2))
    d2 /= np.sqrt(np.sum(d2**2))

    # the transformed within-class covariance should be the identity matrix
    assert_almost_equal(np.cov(clf.transform(scatter).T), np.eye(2))

    # the means of classes 0 and 3 should lie on the first component
    assert_almost_equal(np.abs(np.dot(d1[:2], [1, 0])), 1.0)

    # the means of classes 1 and 2 should lie on the second component
    assert_almost_equal(np.abs(np.dot(d2[:2], [0, 1])), 1.0)


def test_lda_scaling():
    # Test that classification works correctly with differently scaled features.
    n = 100
    rng = np.random.RandomState(1234)
    # use a uniform distribution of features to make sure there is absolutely
    # no overlap between classes.
    x1 = rng.uniform(-1, 1, (n, 3)) + [-10, 0, 0]
    x2 = rng.uniform(-1, 1, (n, 3)) + [10, 0, 0]
    x = np.vstack((x1, x2)) * [1, 100, 10000]
    y = [-1] * n + [1] * n

    for solver in ("svd", "lsqr", "eigen"):
        clf = LinearDiscriminantAnalysis(solver=solver)
        # should be able to separate the data perfectly
        assert clf.fit(x, y).score(x, y) == 1.0, "using covariance: %s" % solver


def test_lda_store_covariance():
    # Test the 'lsqr' and 'eigen' solvers: 'store_covariance' has no effect
    # there, the covariance is always stored.
    for solver in ("lsqr", "eigen"):
        clf = LinearDiscriminantAnalysis(solver=solver).fit(X6, y6)
        assert hasattr(clf, "covariance_")

        # Test the actual attribute:
        clf = LinearDiscriminantAnalysis(solver=solver, store_covariance=True).fit(
            X6, y6
        )
        assert hasattr(clf, "covariance_")

        assert_array_almost_equal(
            clf.covariance_, np.array([[0.422222, 0.088889], [0.088889, 0.533333]])
        )

    # Test the SVD solver: the default is to not set the covariance_ attribute
    clf = LinearDiscriminantAnalysis(solver="svd").fit(X6, y6)
    assert not hasattr(clf, "covariance_")

    # Test the actual attribute:
    clf = LinearDiscriminantAnalysis(solver="svd", store_covariance=True).fit(X6, y6)
    assert hasattr(clf, "covariance_")

    assert_array_almost_equal(
        clf.covariance_, np.array([[0.422222, 0.088889], [0.088889, 0.533333]])
    )


@pytest.mark.parametrize("seed", range(10))
def test_lda_shrinkage(seed):
    # Test that the shrunk covariance estimator and the shrinkage parameter
    # behave the same.
    rng = np.random.RandomState(seed)
    X = rng.rand(100, 10)
    y = rng.randint(3, size=(100))
    c1 = LinearDiscriminantAnalysis(store_covariance=True, shrinkage=0.5, solver="lsqr")
    c2 = LinearDiscriminantAnalysis(
        store_covariance=True,
        covariance_estimator=ShrunkCovariance(shrinkage=0.5),
        solver="lsqr",
    )
    c1.fit(X, y)
    c2.fit(X, y)
    assert_allclose(c1.means_, c2.means_)
    assert_allclose(c1.covariance_, c2.covariance_)
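

# Illustrative aside (not part of the original test file): the equivalence
# asserted above relies on the shrunk covariance formula, a convex combination
# of the empirical covariance S and a scaled identity target,
# (1 - shrinkage) * S + shrinkage * (trace(S) / p) * I. The helper name below
# is hypothetical; it sketches the formula rather than sklearn's code path.
def _shrunk_covariance_sketch(S, shrinkage):
    """Blend an empirical covariance S toward mu * I with weight shrinkage."""
    p = S.shape[0]
    mu = np.trace(S) / p
    return (1.0 - shrinkage) * S + shrinkage * mu * np.eye(p)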


def test_lda_ledoitwolf():
    # When shrinkage="auto", the current implementation uses Ledoit-Wolf
    # estimation of the covariance after standardizing the data. This checks
    # that this is indeed the case.
    class StandardizedLedoitWolf:
        def fit(self, X):
            sc = StandardScaler()  # standardize features
            X_sc = sc.fit_transform(X)
            s = ledoit_wolf(X_sc)[0]
            # rescale
            s = sc.scale_[:, np.newaxis] * s * sc.scale_[np.newaxis, :]
            self.covariance_ = s

    rng = np.random.RandomState(0)
    X = rng.rand(100, 10)
    y = rng.randint(3, size=(100,))
    c1 = LinearDiscriminantAnalysis(
        store_covariance=True, shrinkage="auto", solver="lsqr"
    )
    c2 = LinearDiscriminantAnalysis(
        store_covariance=True,
        covariance_estimator=StandardizedLedoitWolf(),
        solver="lsqr",
    )
    c1.fit(X, y)
    c2.fit(X, y)
    assert_allclose(c1.means_, c2.means_)
    assert_allclose(c1.covariance_, c2.covariance_)


@pytest.mark.parametrize("n_features", [3, 5])
@pytest.mark.parametrize("n_classes", [5, 3])
def test_lda_dimension_warning(n_classes, n_features):
    rng = check_random_state(0)
    n_samples = 10
    X = rng.randn(n_samples, n_features)
    # we create n_classes labels by repeating and truncating a
    # range(n_classes) until n_samples
    y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
    max_components = min(n_features, n_classes - 1)

    for n_components in [max_components - 1, None, max_components]:
        # if n_components <= min(n_classes - 1, n_features), no warning
        lda = LinearDiscriminantAnalysis(n_components=n_components)
        lda.fit(X, y)

    for n_components in [max_components + 1, max(n_features, n_classes - 1) + 1]:
        # if n_components > min(n_classes - 1, n_features), raise an error.
        # We test one unit higher than max_components, and then something
        # larger than both n_features and n_classes - 1 to ensure the test
        # works for any value of n_components.
        lda = LinearDiscriminantAnalysis(n_components=n_components)
        msg = "n_components cannot be larger than "
        with pytest.raises(ValueError, match=msg):
            lda.fit(X, y)
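

# Illustrative aside (not part of the original test file): the cap exercised
# above. LDA yields at most n_classes - 1 discriminant axes and never more
# than n_features, so the largest valid n_components is
# min(n_features, n_classes - 1). The helper name below is hypothetical.
def _max_lda_components_sketch(n_features, n_classes):
    """Largest n_components that LinearDiscriminantAnalysis accepts."""
    return min(n_features, n_classes - 1)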


@pytest.mark.parametrize(
    "data_type, expected_type",
    [
        (np.float32, np.float32),
        (np.float64, np.float64),
        (np.int32, np.float64),
        (np.int64, np.float64),
    ],
)
def test_lda_dtype_match(data_type, expected_type):
    for solver, shrinkage in solver_shrinkage:
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        clf.fit(X.astype(data_type), y.astype(data_type))
        assert clf.coef_.dtype == expected_type


def test_lda_numeric_consistency_float32_float64():
    for solver, shrinkage in solver_shrinkage:
        clf_32 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        clf_32.fit(X.astype(np.float32), y.astype(np.float32))
        clf_64 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        clf_64.fit(X.astype(np.float64), y.astype(np.float64))

        # Check value consistency between types
        rtol = 1e-6
        assert_allclose(clf_32.coef_, clf_64.coef_, rtol=rtol)


@pytest.mark.parametrize("solver", ["svd", "eigen"])
def test_qda(solver):
    # QDA classification.
    # This checks that QDA implements fit and predict and returns correct
    # values for a simple toy dataset.
    clf = QuadraticDiscriminantAnalysis(solver=solver)
    y_pred = clf.fit(X6, y6).predict(X6)
    assert_array_equal(y_pred, y6)

    # Ensure that it works with 1D data
    y_pred1 = clf.fit(X7, y6).predict(X7)
    assert_array_equal(y_pred1, y6)

    # Test probability estimates
    y_proba_pred1 = clf.predict_proba(X7)
    assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y6)
    y_log_proba_pred1 = clf.predict_log_proba(X7)
    assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1, 8)

    y_pred3 = clf.fit(X6, y7).predict(X6)
    # QDA shouldn't be able to separate those
    assert np.any(y_pred3 != y7)

    # Classes should have at least 2 elements
    with pytest.raises(ValueError):
        clf.fit(X6, y4)


def test_qda_covariance_estimator():
    # Test that the correct errors are raised when using inappropriate
    # covariance estimators or shrinkage parameters with QDA.
    clf = QuadraticDiscriminantAnalysis(solver="svd", shrinkage="auto")
    with pytest.raises(NotImplementedError):
        clf.fit(X, y)

    clf = QuadraticDiscriminantAnalysis(
        solver="eigen", shrinkage=0.1, covariance_estimator=ShrunkCovariance()
    )
    with pytest.raises(
        ValueError,
        match=(
            "covariance_estimator and shrinkage parameters are not None. "
            "Only one of the two can be set."
        ),
    ):
        clf.fit(X, y)

    # test a bad solver with covariance_estimator
    clf = QuadraticDiscriminantAnalysis(solver="svd", covariance_estimator=LedoitWolf())
    with pytest.raises(
        ValueError, match="covariance_estimator is not supported with solver='svd'"
    ):
        clf.fit(X, y)

    # test a bad covariance estimator
    clf = QuadraticDiscriminantAnalysis(
        solver="eigen", covariance_estimator=KMeans(n_clusters=2, n_init="auto")
    )
    with pytest.raises(ValueError):
        clf.fit(X, y)


def test_qda_ledoitwolf(global_random_seed):
    # When shrinkage="auto", the current implementation uses Ledoit-Wolf
    # estimation of the covariance after standardizing the data. This checks
    # that this is indeed the case.
    class StandardizedLedoitWolf:
        def fit(self, X):
            sc = StandardScaler()  # standardize features
            X_sc = sc.fit_transform(X)
            s = ledoit_wolf(X_sc)[0]
            # rescale
            s = sc.scale_[:, np.newaxis] * s * sc.scale_[np.newaxis, :]
            self.covariance_ = s

    rng = np.random.RandomState(global_random_seed)
    X = rng.rand(100, 10)
    y = rng.randint(3, size=(100,))
    c1 = QuadraticDiscriminantAnalysis(
        store_covariance=True, shrinkage="auto", solver="eigen"
    )
    c2 = QuadraticDiscriminantAnalysis(
        store_covariance=True,
        covariance_estimator=StandardizedLedoitWolf(),
        solver="eigen",
    )
    c1.fit(X, y)
    c2.fit(X, y)
    assert_allclose(c1.means_, c2.means_)
    assert_allclose(c1.covariance_, c2.covariance_)


def test_qda_coefs(global_random_seed):
    # Test that the coefficients of the solvers are approximately the same.
    n_features = 2
    n_classes = 2
    n_samples = 3000
    X, y = make_blobs(
        n_samples=n_samples,
        n_features=n_features,
        centers=n_classes,
        cluster_std=[1.0, 3.0],
        random_state=global_random_seed,
    )

    clf_svd = QuadraticDiscriminantAnalysis(solver="svd")
    clf_eigen = QuadraticDiscriminantAnalysis(solver="eigen")

    clf_svd.fit(X, y)
    clf_eigen.fit(X, y)

    for class_idx in range(n_classes):
        assert_allclose(
            np.abs(clf_svd.rotations_[class_idx]),
            np.abs(clf_eigen.rotations_[class_idx]),
            rtol=1e-3,
            err_msg=f"SVD and Eigen rotations differ for class {class_idx}",
        )
        assert_allclose(
            clf_svd.scalings_[class_idx],
            clf_eigen.scalings_[class_idx],
            rtol=1e-3,
            err_msg=f"SVD and Eigen scalings differ for class {class_idx}",
        )


def test_qda_priors():
    clf = QuadraticDiscriminantAnalysis()
    y_pred = clf.fit(X6, y6).predict(X6)
    n_pos = np.sum(y_pred == 2)

    neg = 1e-10
    clf = QuadraticDiscriminantAnalysis(priors=np.array([neg, 1 - neg]))
    y_pred = clf.fit(X6, y6).predict(X6)
    n_pos2 = np.sum(y_pred == 2)

    assert n_pos2 > n_pos
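

# Illustrative aside (not part of the original test file): class priors enter
# the QDA decision rule as an additive log-prior term per class, so driving
# the prior of class 2 toward 1 above can only increase how often class 2 is
# predicted. The helper below is a hypothetical sketch of that arithmetic,
# not the estimator's implementation.
def _qda_class_scores_sketch(log_likelihood_per_class, priors):
    """Per-class scores whose argmax is the prediction: log p(x|c) + log p(c)."""
    return log_likelihood_per_class + np.log(priors)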


@pytest.mark.parametrize("priors_type", ["list", "tuple", "array"])
def test_qda_prior_type(priors_type):
    """Check that priors accept array-likes."""
    priors = [0.5, 0.5]
    clf = QuadraticDiscriminantAnalysis(
        priors=_convert_container(priors, priors_type)
    ).fit(X6, y6)
    assert isinstance(clf.priors_, np.ndarray)
    assert_array_equal(clf.priors_, priors)


def test_qda_prior_copy():
    """Check that altering `priors` in place without re-fitting doesn't change
    `priors_`."""
    priors = np.array([0.5, 0.5])
    qda = QuadraticDiscriminantAnalysis(priors=priors).fit(X, y)

    # we expect the following
    assert_array_equal(qda.priors_, qda.priors)

    # altering `priors` in place without re-fitting should not change `priors_`
    priors[0] = 0.2
    assert qda.priors_[0] != qda.priors[0]


def test_qda_store_covariance():
    # The default is to not set the covariance_ attribute
    clf = QuadraticDiscriminantAnalysis().fit(X6, y6)
    assert not hasattr(clf, "covariance_")

    # Test the actual attribute:
    clf = QuadraticDiscriminantAnalysis(store_covariance=True).fit(X6, y6)
    assert hasattr(clf, "covariance_")

    assert_array_almost_equal(clf.covariance_[0], np.array([[0.7, 0.45], [0.45, 0.7]]))

    assert_array_almost_equal(
        clf.covariance_[1],
        np.array([[0.33333333, -0.33333333], [-0.33333333, 0.66666667]]),
    )


@pytest.mark.parametrize("solver", ["svd", "eigen"])
def test_qda_regularization(global_random_seed, solver):
    # The default is reg_param=0.0, which causes issues when there is a
    # constant feature.
    rng = np.random.default_rng(global_random_seed)

    # Fitting on data with a constant feature and without regularization
    # triggers a LinAlgError.
    msg = r"The covariance matrix of class .+ is not full rank."
    clf = QuadraticDiscriminantAnalysis(solver=solver)
    with pytest.raises(linalg.LinAlgError, match=msg):
        clf.fit(X2, y6)

    with pytest.raises(AttributeError):
        clf.predict(X2)

    # Adding a little regularization fixes the fit-time error.
    if solver == "svd":
        clf = QuadraticDiscriminantAnalysis(solver=solver, reg_param=0.01)
    elif solver == "eigen":
        clf = QuadraticDiscriminantAnalysis(solver=solver, shrinkage=0.01)
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        clf.fit(X2, y6)
    y_pred = clf.predict(X2)
    assert_array_equal(y_pred, y6)

    # The LinAlgError should also be raised in the
    # n_samples_in_a_class < n_features case.
    X = rng.normal(size=(9, 4))
    y = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2])

    clf = QuadraticDiscriminantAnalysis(solver=solver)
    if solver == "svd":
        msg2 = msg + " When using `solver='svd'`"
    elif solver == "eigen":
        msg2 = msg

    with pytest.raises(linalg.LinAlgError, match=msg2):
        clf.fit(X, y)

    # The error persists even with regularization for SVD, because the number
    # of singular values is limited by n_samples_in_a_class.
    if solver == "svd":
        clf = QuadraticDiscriminantAnalysis(solver=solver, reg_param=0.3)
        with pytest.raises(linalg.LinAlgError, match=msg2):
            clf.fit(X, y)
    # The error is gone for eigen with regularization, because the covariance
    # matrix becomes full rank.
    elif solver == "eigen":
        clf = QuadraticDiscriminantAnalysis(solver=solver, shrinkage=0.3)
        clf.fit(X, y)


def test_covariance():
    x, y = make_blobs(n_samples=100, n_features=5, centers=1, random_state=42)

    # make features correlated
    x = np.dot(x, np.arange(x.shape[1] ** 2).reshape(x.shape[1], x.shape[1]))

    c_e = _cov(x, "empirical")
    assert_almost_equal(c_e, c_e.T)

    c_s = _cov(x, "auto")
    assert_almost_equal(c_s, c_s.T)


@pytest.mark.parametrize("solver", ["svd", "lsqr", "eigen"])
def test_raises_value_error_on_same_number_of_classes_and_samples(solver):
    """
    Test that a ValueError is raised when the number of samples equals the
    number of classes.
    """
    X = np.array([[0.5, 0.6], [0.6, 0.5]])
    y = np.array(["a", "b"])
    clf = LinearDiscriminantAnalysis(solver=solver)
    with pytest.raises(ValueError, match="The number of samples must be more"):
        clf.fit(X, y)


@pytest.mark.parametrize("solver", ["svd", "eigen"])
def test_raises_value_error_on_one_sample_per_class(solver):
    """
    Test that a ValueError is raised when a class has a single sample.
    """
    X = np.array([[0.5, 0.6], [0.6, 0.5], [0.4, 0.4], [0.6, 0.5]])
    y = np.array(["a", "a", "a", "b"])
    clf = QuadraticDiscriminantAnalysis(solver=solver)
    with pytest.raises(ValueError, match="y has only 1 sample in class"):
        clf.fit(X, y)


def test_get_feature_names_out():
    """Check that get_feature_names_out uses the class name as prefix."""

    est = LinearDiscriminantAnalysis().fit(X, y)
    names_out = est.get_feature_names_out()

    class_name_lower = "LinearDiscriminantAnalysis".lower()
    expected_names_out = np.array(
        [
            f"{class_name_lower}{i}"
            for i in range(est.explained_variance_ratio_.shape[0])
        ],
        dtype=object,
    )
    assert_array_equal(names_out, expected_names_out)


@pytest.mark.parametrize("n_features", [25])
@pytest.mark.parametrize("train_size", [100])
@pytest.mark.parametrize("solver_no_shrinkage", ["svd", "eigen"])
def test_qda_shrinkage_performance(
    global_random_seed, n_features, train_size, solver_no_shrinkage
):
    # Test that QDA with shrinkage performs better than QDA without shrinkage
    # when the number of samples per class is small relative to the number of
    # features.
    n_samples = 1000

    rng = np.random.default_rng(global_random_seed)

    # Sample from two Gaussians with different variances and the same null
    # means.
    vars1 = rng.uniform(2.0, 3.0, size=n_features)
    vars2 = rng.uniform(0.2, 1.0, size=n_features)

    X = np.concatenate(
        [
            rng.standard_normal((n_samples // 2, n_features)) * np.sqrt(vars1),
            rng.standard_normal((n_samples // 2, n_features)) * np.sqrt(vars2),
        ],
        axis=0,
    )
    y = np.array([0] * (n_samples // 2) + [1] * (n_samples // 2))

    # Use small training sets to illustrate the regularization effect of
    # covariance shrinkage.
    cv = ShuffleSplit(n_splits=5, train_size=train_size, random_state=0)
    qda_shrinkage = QuadraticDiscriminantAnalysis(solver="eigen", shrinkage="auto")
    qda_no_shrinkage = QuadraticDiscriminantAnalysis(
        solver=solver_no_shrinkage, shrinkage=None
    )

    scores_no_shrinkage = cross_val_score(
        qda_no_shrinkage, X, y, cv=cv, scoring="d2_brier_score"
    )
    scores_shrinkage = cross_val_score(
        qda_shrinkage, X, y, cv=cv, scoring="d2_brier_score"
    )

    assert scores_shrinkage.mean() > 0.9
    assert scores_no_shrinkage.mean() < 0.6
@@ -0,0 +1,336 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import importlib
import inspect
import os
import warnings
from inspect import signature
from pkgutil import walk_packages

import numpy as np
import pytest

import sklearn
from sklearn.datasets import make_classification

# make it possible to discover experimental estimators when calling `all_estimators`
from sklearn.experimental import (
    enable_halving_search_cv,  # noqa: F401
    enable_iterative_imputer,  # noqa: F401
)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils import all_estimators
from sklearn.utils._test_common.instance_generator import _construct_instances
from sklearn.utils._testing import (
    _get_func_name,
    check_docstring_parameters,
    ignore_warnings,
)
from sklearn.utils.deprecation import _is_deprecated
from sklearn.utils.estimator_checks import (
    _enforce_estimator_tags_X,
    _enforce_estimator_tags_y,
)

# walk_packages() ignores DeprecationWarnings; here we additionally need to
# ignore FutureWarnings.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    # mypy error: Module has no attribute "__path__"
    sklearn_path = [os.path.dirname(sklearn.__file__)]
    PUBLIC_MODULES = set(
        [
            pckg[1]
            for pckg in walk_packages(prefix="sklearn.", path=sklearn_path)
            if not any(
                substr in pckg[1] for substr in ["._", ".tests.", "sklearn.externals"]
            )
        ]
    )

# functions whose args / docstrings we ignore
_DOCSTRING_IGNORES = [
    "sklearn.utils.deprecation.load_mlcomp",
    "sklearn.pipeline.make_pipeline",
    "sklearn.pipeline.make_union",
    "sklearn.utils.extmath.safe_sparse_dot",
    "HalfBinomialLoss",
]

# Methods where the y param should be ignored if y=None by default
_METHODS_IGNORE_NONE_Y = [
    "fit",
    "score",
    "fit_predict",
    "fit_transform",
    "partial_fit",
    "predict",
]


def test_docstring_parameters():
    # Test module docstring formatting

    # Skip test if numpydoc is not found
    pytest.importorskip(
        "numpydoc", reason="numpydoc is required to test the docstrings"
    )

    # XXX unreached code as of v0.22
    from numpydoc import docscrape

    incorrect = []
    for name in PUBLIC_MODULES:
        if name.endswith(".conftest"):
            # pytest tooling, not part of the scikit-learn API
            continue
        if name == "sklearn.utils.fixes":
            # We cannot always control these docstrings
            continue
        with warnings.catch_warnings(record=True):
            module = importlib.import_module(name)
        classes = inspect.getmembers(module, inspect.isclass)
        # Exclude non-scikit-learn classes
        classes = [cls for cls in classes if cls[1].__module__.startswith("sklearn")]
        for cname, cls in classes:
            this_incorrect = []
            if cname in _DOCSTRING_IGNORES or cname.startswith("_"):
                continue
            if inspect.isabstract(cls):
                continue
            with warnings.catch_warnings(record=True) as w:
                cdoc = docscrape.ClassDoc(cls)
            if len(w):
                raise RuntimeError(
                    "Error for __init__ of %s in %s:\n%s" % (cls, name, w[0])
                )

            # Skip checks on deprecated classes
            if _is_deprecated(cls.__new__):
                continue

            this_incorrect += check_docstring_parameters(cls.__init__, cdoc)

            for method_name in cdoc.methods:
                method = getattr(cls, method_name)
                if _is_deprecated(method):
                    continue
                param_ignore = None
                # Skip the docstring check for y when y=None by default,
                # for API consistency reasons.
                if method_name in _METHODS_IGNORE_NONE_Y:
                    sig = signature(method)
                    if "y" in sig.parameters and sig.parameters["y"].default is None:
                        param_ignore = ["y"]  # ignore y for fit and score
                result = check_docstring_parameters(method, ignore=param_ignore)
                this_incorrect += result

            incorrect += this_incorrect

        functions = inspect.getmembers(module, inspect.isfunction)
        # Exclude imported functions
        functions = [fn for fn in functions if fn[1].__module__ == name]
        for fname, func in functions:
            # Don't test private methods / functions
            if fname.startswith("_"):
                continue
            if fname == "configuration" and name.endswith("setup"):
                continue
            name_ = _get_func_name(func)
            if not any(d in name_ for d in _DOCSTRING_IGNORES) and not _is_deprecated(
                func
            ):
                incorrect += check_docstring_parameters(func)

    msg = "\n".join(incorrect)
    if len(incorrect) > 0:
        raise AssertionError("Docstring Error:\n" + msg)


def _construct_searchcv_instance(SearchCV):
    return SearchCV(LogisticRegression(), {"C": [0.1, 1]})


def _construct_compose_pipeline_instance(Estimator):
    # Minimal / degenerate instances: only useful to test the docstrings.
    if Estimator.__name__ == "ColumnTransformer":
        return Estimator(transformers=[("transformer", "passthrough", [0, 1])])
    elif Estimator.__name__ == "Pipeline":
        return Estimator(steps=[("clf", LogisticRegression())])
    elif Estimator.__name__ == "FeatureUnion":
        return Estimator(transformer_list=[("transformer", FunctionTransformer())])


def _construct_sparse_coder(Estimator):
    # XXX: hard-coded assumption that n_features=3
    dictionary = np.array(
        [[0, 1, 0], [-1, -1, 2], [1, 1, 1], [0, 1, 1], [0, 2, 1]],
        dtype=np.float64,
    )
    return Estimator(dictionary=dictionary)


# TODO(1.10): remove copy warning filter
@pytest.mark.filterwarnings(
    "ignore:The default value of `copy` will change from False to True in 1.10."
)
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("name, Estimator", all_estimators())
def test_fit_docstring_attributes(name, Estimator):
    pytest.importorskip("numpydoc")
    from numpydoc import docscrape

    doc = docscrape.ClassDoc(Estimator)
    attributes = doc["Attributes"]

    if Estimator.__name__ in (
        "HalvingRandomSearchCV",
        "RandomizedSearchCV",
        "HalvingGridSearchCV",
        "GridSearchCV",
    ):
        est = _construct_searchcv_instance(Estimator)
    elif Estimator.__name__ in (
        "ColumnTransformer",
        "Pipeline",
        "FeatureUnion",
    ):
        est = _construct_compose_pipeline_instance(Estimator)
    elif Estimator.__name__ == "SparseCoder":
        est = _construct_sparse_coder(Estimator)
    elif Estimator.__name__ == "FrozenEstimator":
        X, y = make_classification(n_samples=20, n_features=5, random_state=0)
        est = Estimator(LogisticRegression().fit(X, y))
    else:
        # TODO(devtools): use _tested_estimators instead of all_estimators in
        # the decorator
        est = next(_construct_instances(Estimator))

    if Estimator.__name__ == "SelectKBest":
        est.set_params(k=2)
    elif Estimator.__name__ == "DummyClassifier":
        est.set_params(strategy="stratified")
    elif Estimator.__name__ == "CCA" or Estimator.__name__.startswith("PLS"):
        # the default n_components=2 is invalid for a single target
        est.set_params(n_components=1)
    elif Estimator.__name__ in (
        "GaussianRandomProjection",
        "SparseRandomProjection",
    ):
        # the default n_components="auto" raises an error with the shape of `X`
        est.set_params(n_components=2)
    elif Estimator.__name__ == "TSNE":
        # the default raises an error, perplexity must be less than n_samples
        est.set_params(perplexity=2)
    # TODO(1.9) remove
    elif Estimator.__name__ == "KBinsDiscretizer":
        # the default raises a FutureWarning if quantile_method is left at "warn"
        est.set_params(quantile_method="averaged_inverted_cdf")
    # TODO(1.10) remove
    elif Estimator.__name__ == "MDS":
        # the default raises a FutureWarning
        est.set_params(n_init=1, init="random")
    # TODO(1.10) remove
    elif Estimator.__name__ == "LogisticRegressionCV":
        # the default `l1_ratios` value raises a FutureWarning
        est.set_params(l1_ratios=(0,))

    # Low max_iter to speed up tests: we are only interested in checking the
    # existence of fitted attributes, which should be invariant to whether the
    # estimator has converged or not.
    if "max_iter" in est.get_params():
        est.set_params(max_iter=2)
        # the minimum value for `TSNE` is 250
        if Estimator.__name__ == "TSNE":
            est.set_params(max_iter=250)

    if "random_state" in est.get_params():
        est.set_params(random_state=0)

    # In case we want to deprecate some attributes in the future
    skipped_attributes = {}

    if Estimator.__name__.endswith("Vectorizer"):
        # Vectorizers require some specific input data
        if Estimator.__name__ in (
            "CountVectorizer",
            "HashingVectorizer",
            "TfidfVectorizer",
        ):
            X = [
                "This is the first document.",
                "This document is the second document.",
                "And this is the third one.",
                "Is this the first document?",
            ]
        elif Estimator.__name__ == "DictVectorizer":
            X = [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}]
        y = None
    else:
        X, y = make_classification(
            n_samples=20,
            n_features=3,
            n_redundant=0,
            n_classes=2,
            random_state=2,
        )

        y = _enforce_estimator_tags_y(est, y)
        X = _enforce_estimator_tags_X(est, X)

    if est.__sklearn_tags__().target_tags.one_d_labels:
        est.fit(y)
    elif est.__sklearn_tags__().target_tags.two_d_labels:
        est.fit(np.c_[y, y])
    elif est.__sklearn_tags__().input_tags.three_d_array:
        est.fit(X[np.newaxis, ...], y)
    else:
        est.fit(X, y)

    for attr in attributes:
        if attr.name in skipped_attributes:
            continue
        desc = " ".join(attr.desc).lower()
        # As certain attributes are present "only" if a certain parameter is
        # provided, skip the check when the word "only" appears in the
        # description; otherwise the attribute is required to be present.
        if "only " in desc:
            continue
        # ignore deprecation warnings
        with ignore_warnings(category=FutureWarning):
            assert hasattr(est, attr.name)

    fit_attr = _get_all_fitted_attributes(est)
    fit_attr_names = [attr.name for attr in attributes]
    undocumented_attrs = set(fit_attr).difference(fit_attr_names)
    undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes)
    if undocumented_attrs:
        raise AssertionError(
            f"Undocumented attributes for {Estimator.__name__}: {undocumented_attrs}"
        )


def _get_all_fitted_attributes(estimator):
    """Get all the fitted attributes of an estimator, including properties."""
    # attributes
    fit_attr = list(estimator.__dict__.keys())

    # properties
    with warnings.catch_warnings():
        warnings.filterwarnings("error", category=FutureWarning)

        for name in dir(estimator.__class__):
            obj = getattr(estimator.__class__, name)
            if not isinstance(obj, property):
                continue

            # ignore properties that raise an AttributeError, as well as
            # deprecated properties
            try:
                getattr(estimator, name)
            except (AttributeError, FutureWarning):
                continue
            fit_attr.append(name)

    return [k for k in fit_attr if k.endswith("_") and not k.startswith("_")]
@@ -0,0 +1,113 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import pytest

from sklearn import metrics
from sklearn.ensemble import (
    BaggingClassifier,
    BaggingRegressor,
    IsolationForest,
    StackingClassifier,
    StackingRegressor,
)
from sklearn.utils._testing import assert_docstring_consistency, skip_if_no_numpydoc

CLASS_DOCSTRING_CONSISTENCY_CASES = [
    {
        "objects": [BaggingClassifier, BaggingRegressor, IsolationForest],
        "include_params": ["max_samples"],
        "exclude_params": None,
        "include_attrs": False,
        "exclude_attrs": None,
        "include_returns": False,
        "exclude_returns": None,
        "descr_regex_pattern": r"The number of samples to draw from X to train each.*",
        "ignore_types": ("max_samples",),
    },
    {
        "objects": [StackingClassifier, StackingRegressor],
        "include_params": ["cv", "n_jobs", "passthrough", "verbose"],
        "exclude_params": None,
        "include_attrs": True,
        "exclude_attrs": ["final_estimator_"],
        "include_returns": False,
        "exclude_returns": None,
        "descr_regex_pattern": None,
    },
]

FUNCTION_DOCSTRING_CONSISTENCY_CASES = [
    {
        "objects": [
            metrics.precision_recall_fscore_support,
            metrics.f1_score,
            metrics.fbeta_score,
            metrics.precision_score,
            metrics.recall_score,
        ],
        "include_params": True,
        "exclude_params": ["average", "zero_division"],
        "include_attrs": False,
        "exclude_attrs": None,
        "include_returns": False,
        "exclude_returns": None,
        "descr_regex_pattern": None,
    },
    {
        "objects": [
            metrics.precision_recall_fscore_support,
            metrics.f1_score,
            metrics.fbeta_score,
            metrics.precision_score,
            metrics.recall_score,
        ],
        "include_params": ["average"],
        "exclude_params": None,
        "include_attrs": False,
        "exclude_attrs": None,
        "include_returns": False,
        "exclude_returns": None,
        "descr_regex_pattern": " ".join(
            (
                r"""This parameter is required for multiclass/multilabel targets\.
                If ``None``, the metrics for each class are returned\. Otherwise, this
                determines the type of averaging performed on the data:
                ``'binary'``:
                    Only report results for the class specified by ``pos_label``\.
                    This is applicable only if targets \(``y_\{true,pred\}``\) are binary\.
                ``'micro'``:
                    Calculate metrics globally by counting the total true positives,
                    false negatives and false positives\.
                ``'macro'``:
                    Calculate metrics for each label, and find their unweighted
                    mean\. This does not take label imbalance into account\.
                ``'weighted'``:
                    Calculate metrics for each label, and find their average weighted
                    by support \(the number of true instances for each label\)\. This
                    alters 'macro' to account for label imbalance; it can result in an
                    F-score that is not between precision and recall\."""
                r"[\s\w]*\.*"  # optionally match an additional sentence
                r"""
                ``'samples'``:
                    Calculate metrics for each instance, and find their average \(only
                    meaningful for multilabel classification where this differs from
                    :func:`accuracy_score`\)\."""
            ).split()
        ),
    },
]
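

# Aside on the `" ".join((...).split())` idiom above (not part of the original
# file): `.split()` breaks the raw strings on any whitespace run, and joining
# on single spaces collapses those runs, so the pattern matches docstrings
# regardless of indentation and line wrapping. A minimal illustration, with a
# hypothetical helper name:
def _normalize_whitespace_sketch(text):
    """Collapse all whitespace runs in `text` to single spaces."""
    return " ".join(text.split())


# For example, _normalize_whitespace_sketch("a  b\n   c") == "a b c".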


@pytest.mark.parametrize("case", CLASS_DOCSTRING_CONSISTENCY_CASES)
@skip_if_no_numpydoc
def test_class_docstring_consistency(case):
    """Check docstring parameter consistency between related classes."""
    assert_docstring_consistency(**case)


@pytest.mark.parametrize("case", FUNCTION_DOCSTRING_CONSISTENCY_CASES)
@skip_if_no_numpydoc
def test_function_docstring_consistency(case):
    """Check docstring parameter consistency between related functions."""
    assert_docstring_consistency(**case)
@@ -0,0 +1,208 @@
import re
from inspect import signature
from typing import Optional

import pytest

# make it possible to discover experimental estimators when calling `all_estimators`
from sklearn.experimental import (
    enable_halving_search_cv,  # noqa: F401
    enable_iterative_imputer,  # noqa: F401
)
from sklearn.utils.discovery import all_displays, all_estimators, all_functions

numpydoc_validation = pytest.importorskip("numpydoc.validate")


def get_all_methods():
    estimators = all_estimators()
    displays = all_displays()
    for name, Klass in estimators + displays:
        if name.startswith("_"):
            # skip private classes
            continue
        methods = []
        for method_name in dir(Klass):
            if method_name.startswith("_"):
                continue
            method_obj = getattr(Klass, method_name)
            if hasattr(method_obj, "__call__") or isinstance(method_obj, property):
                methods.append(method_name)
        methods.append(None)

        for method in sorted(methods, key=str):
            yield Klass, method


def get_all_functions_names():
    functions = all_functions()
    for _, func in functions:
        # exclude functions from utils.fixes since they come from external packages
        if "utils.fixes" not in func.__module__:
            yield f"{func.__module__}.{func.__name__}"


def filter_errors(errors, method, Klass=None):
    """
    Ignore some errors based on the method type.

    These rules are specific to scikit-learn."""
    for code, message in errors:
        # We ignore the following error codes:
        #  - RT02: The first line of the Returns section
        #    should contain only the type, ...
        #    (as we may need to refer to the name of the returned
        #    object)
        #  - GL01: Docstring text (summary) should start in the line
        #    immediately after the opening quotes (not in the same line,
        #    or leaving a blank line in between)
        #  - GL02: If there's a blank line, it should be before the
        #    first line of the Returns section, not after (this allows
        #    short docstrings for properties).
        if code in ["RT02", "GL01", "GL02"]:
            continue

        # Ignore PR02: Unknown parameters for properties. We sometimes use
        # properties for ducktyping, i.e. SGDClassifier.predict_proba.
        # Ignore GL08: Parsing of the method signature failed, possibly because
        # this is a property. Properties are sometimes used for deprecated
        # attributes and the attribute is already documented in the class
        # docstring.
        #
        # All error codes:
        # https://numpydoc.readthedocs.io/en/latest/validation.html#built-in-validation-checks
        if code in ("PR02", "GL08") and Klass is not None and method is not None:
            method_obj = getattr(Klass, method)
            if isinstance(method_obj, property):
                continue

        # The following codes are only taken into account for the
        # top level class docstrings:
        #  - ES01: No extended summary found
        #  - SA01: See Also section not found
        #  - EX01: No examples section found
        if method is not None and code in ["EX01", "SA01", "ES01"]:
            continue
        yield code, message


def repr_errors(res, Klass=None, method: Optional[str] = None) -> str:
    """Pretty print the original docstring and the obtained errors.

    Parameters
    ----------
    res : dict
        Result of numpydoc.validate.validate.
    Klass : {Estimator, Display, None}
        Estimator object or None.
    method : str
        If estimator is not None, either the method name or None.

    Returns
    -------
    str
        String representation of the error.
    """
    if method is None:
        if hasattr(Klass, "__init__"):
            method = "__init__"
        elif Klass is None:
            raise ValueError("At least one of Klass, method should be provided")
        else:
            raise NotImplementedError

    if Klass is not None:
        obj = getattr(Klass, method)
        try:
            obj_signature = str(signature(obj))
        except TypeError:
            # In particular, we can't parse the signature of properties
            obj_signature = (
                "\nParsing of the method signature failed, "
                "possibly because this is a property."
            )

        obj_name = Klass.__name__ + "." + method
    else:
        obj_signature = ""
        obj_name = method

    msg = "\n\n" + "\n\n".join(
        [
            str(res["file"]),
            obj_name + obj_signature,
            res["docstring"],
            "# Errors",
            "\n".join(
                " - {}: {}".format(code, message) for code, message in res["errors"]
            ),
        ]
    )
    return msg


@pytest.mark.parametrize("function_name", get_all_functions_names())
def test_function_docstring(function_name, request):
    """Check function docstrings using numpydoc."""
    res = numpydoc_validation.validate(function_name)

    res["errors"] = list(filter_errors(res["errors"], method="function"))

    if res["errors"]:
        msg = repr_errors(res, method=f"Tested function: {function_name}")

        raise ValueError(msg)


@pytest.mark.parametrize("Klass, method", get_all_methods())
def test_docstring(Klass, method, request):
    base_import_path = Klass.__module__
    import_path = [base_import_path, Klass.__name__]
    if method is not None:
        import_path.append(method)

    import_path = ".".join(import_path)

    res = numpydoc_validation.validate(import_path)

    res["errors"] = list(filter_errors(res["errors"], method, Klass=Klass))

    if res["errors"]:
        msg = repr_errors(res, Klass, method)

        raise ValueError(msg)


if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Validate docstring with numpydoc.")
    parser.add_argument("import_path", help="Import path to validate")

    args = parser.parse_args()

    res = numpydoc_validation.validate(args.import_path)

    import_path_sections = args.import_path.split(".")
    # When applied to classes, detect the class method; for functions,
    # method = None.
    # TODO: this detection can be improved. Currently we assume that we have a
    # class method if the second-to-last path element is in camel case.
    if len(import_path_sections) >= 2 and re.match(
        r"(?:[A-Z][a-z]*)+", import_path_sections[-2]
    ):
        method = import_path_sections[-1]
    else:
        method = None

    res["errors"] = list(filter_errors(res["errors"], method))

    if res["errors"]:
        msg = repr_errors(res, method=args.import_path)

        print(msg)
        sys.exit(1)
    else:
        print("All docstring checks passed for {}!".format(args.import_path))
@@ -0,0 +1,715 @@
import warnings

import numpy as np
import pytest
import scipy.sparse as sp

from sklearn.base import clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import (
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import CSC_CONTAINERS
from sklearn.utils.stats import _weighted_percentile


def _check_predict_proba(clf, X, y):
    proba = clf.predict_proba(X)

    # We know that we can have a division by zero
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", "divide by zero encountered in log")
        log_proba = clf.predict_log_proba(X)

    y = np.atleast_1d(y)
    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))

    n_outputs = y.shape[1]
    n_samples = len(X)

    if n_outputs == 1:
        proba = [proba]
        log_proba = [log_proba]

    for k in range(n_outputs):
        assert proba[k].shape[0] == n_samples
        assert proba[k].shape[1] == len(np.unique(y[:, k]))
        assert_array_almost_equal(proba[k].sum(axis=1), np.ones(len(X)))
        # We know that we can have a division by zero
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", "divide by zero encountered in log")
            assert_array_almost_equal(np.log(proba[k]), log_proba[k])


def _check_behavior_2d(clf):
    # 1d case
    X = np.array([[0], [0], [0], [0]])  # ignored
    y = np.array([1, 2, 1, 1])
    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert y.shape == y_pred.shape

    # 2d case
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])
    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert y.shape == y_pred.shape


def _check_behavior_2d_for_constant(clf):
    # 2d case only
    X = np.array([[0], [0], [0], [0]])  # ignored
    y = np.array([[1, 0, 5, 4, 3], [2, 0, 1, 2, 5], [1, 0, 4, 5, 2], [1, 3, 3, 2, 0]])
    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert y.shape == y_pred.shape


def _check_equality_regressor(statistic, y_learn, y_pred_learn, y_test, y_pred_test):
    assert_array_almost_equal(np.tile(statistic, (y_learn.shape[0], 1)), y_pred_learn)
    assert_array_almost_equal(np.tile(statistic, (y_test.shape[0], 1)), y_pred_test)


def test_feature_names_in_and_n_features_in_(global_random_seed, n_samples=10):
    pd = pytest.importorskip("pandas")

    random_state = np.random.RandomState(seed=global_random_seed)

    X = pd.DataFrame([[0]] * n_samples, columns=["feature_1"])
    y = random_state.rand(n_samples)

    est = DummyRegressor().fit(X, y)
    assert hasattr(est, "feature_names_in_")
    assert hasattr(est, "n_features_in_")

    est = DummyClassifier().fit(X, y)
    assert hasattr(est, "feature_names_in_")
    assert hasattr(est, "n_features_in_")


def test_most_frequent_and_prior_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [1, 2, 1, 1]

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), np.ones(len(X)))
        _check_predict_proba(clf, X, y)

        if strategy == "prior":
            assert_array_almost_equal(
                clf.predict_proba([X[0]]), clf.class_prior_.reshape((1, -1))
            )
        else:
            assert_array_almost_equal(
                clf.predict_proba([X[0]]), clf.class_prior_.reshape((1, -1)) > 0.5
            )


def test_most_frequent_and_prior_strategy_with_2d_column_y():
    # non-regression test added in
    # https://github.com/scikit-learn/scikit-learn/pull/13545
    X = [[0], [0], [0], [0]]
    y_1d = [1, 2, 1, 1]
    y_2d = [[1], [2], [1], [1]]

    for strategy in ("most_frequent", "prior"):
        clf_1d = DummyClassifier(strategy=strategy, random_state=0)
        clf_2d = DummyClassifier(strategy=strategy, random_state=0)

        clf_1d.fit(X, y_1d)
        clf_2d.fit(X, y_2d)
        assert_array_equal(clf_1d.predict(X), clf_2d.predict(X))


def test_most_frequent_and_prior_strategy_multioutput():
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])

    n_samples = len(X)

    for strategy in ("prior", "most_frequent"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(
            clf.predict(X),
            np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]),
        )
        _check_predict_proba(clf, X, y)
        _check_behavior_2d(clf)


def test_stratified_strategy(global_random_seed):
    X = [[0]] * 5  # ignored
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="stratified", random_state=global_random_seed)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 3.0 / 5, decimal=1)
    assert_almost_equal(p[2], 2.0 / 5, decimal=1)
    _check_predict_proba(clf, X, y)
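

# Illustrative aside (not part of the original test file): the "stratified"
# strategy checked above amounts to drawing each prediction i.i.d. from the
# fitted class distribution, which is why the predicted frequencies approach
# the class prior as the number of draws grows. The helper name below is
# hypothetical.
def _stratified_draws_sketch(classes, class_prior, n_draws, random_state=0):
    """Draw labels independently from the fitted class prior."""
    rng = np.random.RandomState(random_state)
    return rng.choice(classes, size=n_draws, p=class_prior)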


def test_stratified_strategy_multioutput(global_random_seed):
    X = [[0]] * 5  # ignored
    y = np.array([[2, 1], [2, 2], [1, 1], [1, 2], [1, 1]])

    clf = DummyClassifier(strategy="stratified", random_state=global_random_seed)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 3.0 / 5, decimal=1)
        assert_almost_equal(p[2], 2.0 / 5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)


def test_uniform_strategy(global_random_seed):
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=global_random_seed)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 0.5, decimal=1)
    assert_almost_equal(p[2], 0.5, decimal=1)
    _check_predict_proba(clf, X, y)


def test_uniform_strategy_multioutput(global_random_seed):
    X = [[0]] * 4  # ignored
    y = np.array([[2, 1], [2, 2], [1, 2], [1, 1]])
    clf = DummyClassifier(strategy="uniform", random_state=global_random_seed)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 0.5, decimal=1)
        assert_almost_equal(p[2], 0.5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)


def test_string_labels():
    X = [[0]] * 5
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), ["paris"] * 5)


@pytest.mark.parametrize(
    "y,y_test",
    [
        ([2, 1, 1, 1], [2, 2, 1, 1]),
        (
            np.array([[2, 2], [1, 1], [1, 1], [1, 1]]),
            np.array([[2, 2], [2, 2], [1, 1], [1, 1]]),
        ),
    ],
)
def test_classifier_score_with_None(y, y_test):
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(None, y)
    assert clf.score(None, y_test) == 0.5


@pytest.mark.parametrize(
    "strategy", ["stratified", "most_frequent", "prior", "uniform", "constant"]
)
def test_classifier_prediction_independent_of_X(strategy, global_random_seed):
    y = [0, 2, 1, 1]
    X1 = [[0]] * 4
    clf1 = DummyClassifier(
        strategy=strategy, random_state=global_random_seed, constant=0
    )
    clf1.fit(X1, y)
    predictions1 = clf1.predict(X1)

    X2 = [[1]] * 4
    clf2 = DummyClassifier(
        strategy=strategy, random_state=global_random_seed, constant=0
    )
    clf2.fit(X2, y)
    predictions2 = clf2.predict(X2)

    assert_array_equal(predictions1, predictions2)


def test_mean_strategy_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 4  # ignored
    y = random_state.randn(4)

    reg = DummyRegressor()
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.mean(y)] * len(X))


def test_mean_strategy_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(mean, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)


def test_regressor_exceptions():
    reg = DummyRegressor()
    with pytest.raises(NotFittedError):
        reg.predict([])


def test_median_strategy_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="median")
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))


def test_median_strategy_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="median")
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(median, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)


def test_quantile_strategy_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="quantile", quantile=0.5)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.min(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=1)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.max(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0.3)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.percentile(y, q=30)] * len(X))


def test_quantile_strategy_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))
    quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.5)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(median, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.8)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        quantile_values, y_learn, y_pred_learn, y_test, y_pred_test
    )
    _check_behavior_2d(est)


def test_quantile_invalid():
    X = [[0]] * 5  # ignored
    y = [0] * 5  # ignored

    est = DummyRegressor(strategy="quantile", quantile=None)
    err_msg = (
        "When using `strategy='quantile', you have to specify the desired quantile"
    )
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X, y)


def test_quantile_strategy_empty_train():
    est = DummyRegressor(strategy="quantile", quantile=0.4)
    with pytest.raises(IndexError):
        est.fit([], [])


def test_constant_strategy_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="constant", constant=[43])
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))

    reg = DummyRegressor(strategy="constant", constant=43)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))

    # non-regression test for #22478
    assert not isinstance(reg.constant, np.ndarray)


def test_constant_strategy_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    # test with 2d array
    constants = random_state.randn(5)

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="constant", constant=constants)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(constants, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d_for_constant(est)


def test_y_mean_attribute_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]
    # when strategy = 'mean'
    est = DummyRegressor(strategy="mean")
    est.fit(X, y)

    assert est.constant_ == np.mean(y)


def test_constants_not_specified_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    est = DummyRegressor(strategy="constant")
    err_msg = "Constant target value has to be specified"
    with pytest.raises(TypeError, match=err_msg):
        est.fit(X, y)


def test_constant_size_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)
    X = random_state.randn(10, 10)
    y = random_state.randn(10, 5)

    est = DummyRegressor(strategy="constant", constant=[1, 2, 3, 4])
    err_msg = r"Constant target value should have shape \(5, 1\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X, y)


def test_constant_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [2, 1, 2, 2]

    clf = DummyClassifier(strategy="constant", random_state=0, constant=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.ones(len(X)))
    _check_predict_proba(clf, X, y)

    X = [[0], [0], [0], [0]]  # ignored
    y = ["two", "one", "two", "two"]
    clf = DummyClassifier(strategy="constant", random_state=0, constant="one")
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.array(["one"] * 4))
    _check_predict_proba(clf, X, y)


def test_constant_strategy_multioutput():
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[2, 3], [1, 3], [2, 3], [2, 0]])

    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)
    assert_array_equal(
        clf.predict(X), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
    )
    _check_predict_proba(clf, X, y)


@pytest.mark.parametrize(
    "y, params, err_msg",
    [
        ([2, 1, 2, 2], {"random_state": 0}, "Constant.*has to be specified"),
        ([2, 1, 2, 2], {"constant": [2, 0]}, "Constant.*should have shape"),
        (
            np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
            {"constant": 2},
            "Constant.*should have shape",
        ),
        (
            [2, 1, 2, 2],
            {"constant": "my-constant"},
            "constant=my-constant.*Possible values.*\\[1, 2]",
        ),
        (
            np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
            {"constant": [2, "unknown"]},
            "constant=\\[2, 'unknown'].*Possible values.*\\[1, 2]",
        ),
    ],
    ids=[
        "no-constant",
        "too-many-constant",
        "not-enough-output",
        "single-output",
        "multi-output",
    ],
)
def test_constant_strategy_exceptions(y, params, err_msg):
    X = [[0], [0], [0], [0]]

    clf = DummyClassifier(strategy="constant", **params)
    with pytest.raises(ValueError, match=err_msg):
        clf.fit(X, y)


def test_classification_sample_weight():
    X = [[0], [0], [1]]
    y = [0, 1, 0]
    sample_weight = [0.1, 1.0, 0.1]

    clf = DummyClassifier(strategy="stratified").fit(X, y, sample_weight)
    assert_array_almost_equal(clf.class_prior_, [0.2 / 1.2, 1.0 / 1.2])
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_constant_strategy_sparse_target(csc_container):
|
||||
X = [[0]] * 5 # ignored
|
||||
y = csc_container(np.array([[0, 1], [4, 0], [1, 1], [1, 4], [1, 1]]))
|
||||
|
||||
n_samples = len(X)
|
||||
|
||||
clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
|
||||
clf.fit(X, y)
|
||||
y_pred = clf.predict(X)
|
||||
assert sp.issparse(y_pred)
|
||||
assert_array_equal(
|
||||
y_pred.toarray(), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_uniform_strategy_sparse_target_warning(global_random_seed, csc_container):
|
||||
X = [[0]] * 5 # ignored
|
||||
y = csc_container(np.array([[2, 1], [2, 2], [1, 4], [4, 2], [1, 1]]))
|
||||
|
||||
clf = DummyClassifier(strategy="uniform", random_state=global_random_seed)
|
||||
with pytest.warns(UserWarning, match="the uniform strategy would not save memory"):
|
||||
clf.fit(X, y)
|
||||
|
||||
X = [[0]] * 500
|
||||
y_pred = clf.predict(X)
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
p = np.bincount(y_pred[:, k]) / float(len(X))
|
||||
assert_almost_equal(p[1], 1 / 3, decimal=1)
|
||||
assert_almost_equal(p[2], 1 / 3, decimal=1)
|
||||
assert_almost_equal(p[4], 1 / 3, decimal=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_stratified_strategy_sparse_target(global_random_seed, csc_container):
|
||||
X = [[0]] * 5 # ignored
|
||||
y = csc_container(np.array([[4, 1], [0, 0], [1, 1], [1, 4], [1, 1]]))
|
||||
|
||||
clf = DummyClassifier(strategy="stratified", random_state=global_random_seed)
|
||||
clf.fit(X, y)
|
||||
|
||||
X = [[0]] * 500
|
||||
y_pred = clf.predict(X)
|
||||
assert sp.issparse(y_pred)
|
||||
y_pred = y_pred.toarray()
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
p = np.bincount(y_pred[:, k]) / float(len(X))
|
||||
assert_almost_equal(p[1], 3.0 / 5, decimal=1)
|
||||
assert_almost_equal(p[0], 1.0 / 5, decimal=1)
|
||||
assert_almost_equal(p[4], 1.0 / 5, decimal=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_most_frequent_and_prior_strategy_sparse_target(csc_container):
|
||||
X = [[0]] * 5 # ignored
|
||||
y = csc_container(np.array([[1, 0], [1, 3], [4, 0], [0, 1], [1, 0]]))
|
||||
|
||||
n_samples = len(X)
|
||||
y_expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
|
||||
for strategy in ("most_frequent", "prior"):
|
||||
clf = DummyClassifier(strategy=strategy, random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
y_pred = clf.predict(X)
|
||||
assert sp.issparse(y_pred)
|
||||
assert_array_equal(y_pred.toarray(), y_expected)
|
||||
|
||||
|
||||
def test_dummy_regressor_sample_weight(global_random_seed, n_samples=10):
|
||||
random_state = np.random.RandomState(seed=global_random_seed)
|
||||
|
||||
X = [[0]] * n_samples
|
||||
y = random_state.rand(n_samples)
|
||||
sample_weight = random_state.rand(n_samples)
|
||||
|
||||
est = DummyRegressor(strategy="mean").fit(X, y, sample_weight)
|
||||
assert est.constant_ == np.average(y, weights=sample_weight)
|
||||
|
||||
est = DummyRegressor(strategy="median").fit(X, y, sample_weight)
|
||||
assert est.constant_ == _weighted_percentile(y, sample_weight, 50.0)
|
||||
|
||||
est = DummyRegressor(strategy="quantile", quantile=0.95).fit(X, y, sample_weight)
|
||||
assert est.constant_ == _weighted_percentile(y, sample_weight, 95.0)
|
||||
|
||||
|
||||
def test_dummy_regressor_on_3D_array():
|
||||
X = np.array([[["foo"]], [["bar"]], [["baz"]]])
|
||||
y = np.array([2, 2, 2])
|
||||
y_expected = np.array([2, 2, 2])
|
||||
cls = DummyRegressor()
|
||||
cls.fit(X, y)
|
||||
y_pred = cls.predict(X)
|
||||
assert_array_equal(y_pred, y_expected)
|
||||
|
||||
|
||||
def test_dummy_classifier_on_3D_array():
|
||||
X = np.array([[["foo"]], [["bar"]], [["baz"]]])
|
||||
y = [2, 2, 2]
|
||||
y_expected = [2, 2, 2]
|
||||
y_proba_expected = [[1], [1], [1]]
|
||||
cls = DummyClassifier(strategy="stratified")
|
||||
cls.fit(X, y)
|
||||
y_pred = cls.predict(X)
|
||||
y_pred_proba = cls.predict_proba(X)
|
||||
assert_array_equal(y_pred, y_expected)
|
||||
assert_array_equal(y_pred_proba, y_proba_expected)
|
||||
|
||||
|
||||
def test_dummy_regressor_return_std():
|
||||
X = [[0]] * 3 # ignored
|
||||
y = np.array([2, 2, 2])
|
||||
y_std_expected = np.array([0, 0, 0])
|
||||
cls = DummyRegressor()
|
||||
cls.fit(X, y)
|
||||
y_pred_list = cls.predict(X, return_std=True)
|
||||
# there should be two elements when return_std is True
|
||||
assert len(y_pred_list) == 2
|
||||
# the second element should be all zeros
|
||||
assert_array_equal(y_pred_list[1], y_std_expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"y,y_test",
|
||||
[
|
||||
([1, 1, 1, 2], [1.25] * 4),
|
||||
(np.array([[2, 2], [1, 1], [1, 1], [1, 1]]), [[1.25, 1.25]] * 4),
|
||||
],
|
||||
)
|
||||
def test_regressor_score_with_None(y, y_test):
|
||||
reg = DummyRegressor()
|
||||
reg.fit(None, y)
|
||||
assert reg.score(None, y_test) == 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strategy", ["mean", "median", "quantile", "constant"])
|
||||
def test_regressor_prediction_independent_of_X(strategy):
|
||||
y = [0, 2, 1, 1]
|
||||
X1 = [[0]] * 4
|
||||
reg1 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
|
||||
reg1.fit(X1, y)
|
||||
predictions1 = reg1.predict(X1)
|
||||
|
||||
X2 = [[1]] * 4
|
||||
reg2 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
|
||||
reg2.fit(X2, y)
|
||||
predictions2 = reg2.predict(X2)
|
||||
|
||||
assert_array_equal(predictions1, predictions2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"strategy", ["stratified", "most_frequent", "prior", "uniform", "constant"]
|
||||
)
|
||||
def test_dtype_of_classifier_probas(strategy):
|
||||
y = [0, 2, 1, 1]
|
||||
X = np.zeros(4)
|
||||
model = DummyClassifier(strategy=strategy, random_state=0, constant=0)
|
||||
probas = model.fit(X, y).predict_proba(X)
|
||||
|
||||
assert probas.dtype == np.float64
|
||||
@@ -0,0 +1,20 @@
# Basic unittests to test functioning of module's top-level


__author__ = "Yaroslav Halchenko"
__license__ = "BSD"


try:
    from sklearn import *  # noqa: F403

    _top_import_error = None
except Exception as e:
    _top_import_error = e


def test_import_skl():
    # Test whether the above import failed for some reason.
    # "import *" is discouraged outside of the module level, hence we
    # rely on setting up the variable above
    assert _top_import_error is None
@@ -0,0 +1,699 @@
import copy
import pickle
import warnings

import numpy as np
import pytest
from scipy.special import expit

import sklearn
from sklearn.datasets import make_regression
from sklearn.isotonic import (
    IsotonicRegression,
    _make_unique,
    check_increasing,
    isotonic_regression,
)
from sklearn.utils import shuffle
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.validation import check_array


def test_permutation_invariance():
    # check that fit is permutation invariant.
    # regression test of missing sorting of sample-weights
    ir = IsotonicRegression()
    x = [1, 2, 3, 4, 5, 6, 7]
    y = [1, 41, 51, 1, 2, 5, 24]
    sample_weight = [1, 2, 3, 4, 5, 6, 7]
    x_s, y_s, sample_weight_s = shuffle(x, y, sample_weight, random_state=0)
    y_transformed = ir.fit_transform(x, y, sample_weight=sample_weight)
    y_transformed_s = ir.fit(x_s, y_s, sample_weight=sample_weight_s).transform(x)

    assert_array_equal(y_transformed, y_transformed_s)


def test_check_increasing_small_number_of_samples():
    x = [0, 1, 2]
    y = [1, 1.1, 1.05]

    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        is_increasing = check_increasing(x, y)

    assert is_increasing


def test_check_increasing_up():
    x = [0, 1, 2, 3, 4, 5]
    y = [0, 1.5, 2.77, 8.99, 8.99, 50]

    # Check that we got increasing=True and no warnings
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        is_increasing = check_increasing(x, y)

    assert is_increasing


def test_check_increasing_up_extreme():
    x = [0, 1, 2, 3, 4, 5]
    y = [0, 1, 2, 3, 4, 5]

    # Check that we got increasing=True and no warnings
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        is_increasing = check_increasing(x, y)

    assert is_increasing


def test_check_increasing_down():
    x = [0, 1, 2, 3, 4, 5]
    y = [0, -1.5, -2.77, -8.99, -8.99, -50]

    # Check that we got increasing=False and no warnings
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        is_increasing = check_increasing(x, y)

    assert not is_increasing


def test_check_increasing_down_extreme():
    x = [0, 1, 2, 3, 4, 5]
    y = [0, -1, -2, -3, -4, -5]

    # Check that we got increasing=False and no warnings
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        is_increasing = check_increasing(x, y)

    assert not is_increasing


def test_check_ci_warn():
    x = [0, 1, 2, 3, 4, 5]
    y = [0, -1, 2, -3, 4, -5]

    # Check that we got increasing=False and a confidence-interval warning
    msg = "interval"
    with pytest.warns(UserWarning, match=msg):
        is_increasing = check_increasing(x, y)

    assert not is_increasing


def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    y = np.array([10, 0, 2])
    y_ = np.array([4, 4, 4])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]), ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # check we don't crash when all x are equal:
    ir = IsotonicRegression()
    assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))


def test_isotonic_regression_ties_min():
    # Setup examples with ties on minimum
    x = [1, 1, 2, 3, 4, 5]
    y = [1, 2, 3, 4, 5, 6]
    y_true = [1.5, 1.5, 3, 4, 5, 6]

    # Check that we get identical results for fit/transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(y_true, ir.fit_transform(x, y))


def test_isotonic_regression_ties_max():
    # Setup examples with ties on maximum
    x = [1, 2, 3, 4, 5, 5]
    y = [1, 2, 3, 4, 5, 6]
    y_true = [1, 2, 3, 4, 5.5, 5.5]

    # Check that we get identical results for fit/transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(y_true, ir.fit_transform(x, y))


def test_isotonic_regression_ties_secondary_():
    """
    Test isotonic regression fit, transform and fit_transform
    against the "secondary" ties method and "pituitary" data from R
    "isotone" package, as detailed in: J. de Leeuw, K. Hornik, P. Mair,
    Isotone Optimization in R: Pool-Adjacent-Violators Algorithm
    (PAVA) and Active Set Methods

    Set values based on pituitary example and
    the following R command detailed in the paper above:
    > library("isotone")
    > data("pituitary")
    > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
    > res1$x

    `isotone` version: 1.0-2, 2014-09-07
    R version: R version 3.1.1 (2014-07-10)
    """
    x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
    y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
    y_true = [
        22.22222, 22.22222, 22.22222, 22.22222, 22.22222,
        22.22222, 22.22222, 22.22222, 22.22222, 24.25, 24.25,
    ]

    # Check fit, transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_almost_equal(ir.transform(x), y_true, 4)
    assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)


def test_isotonic_regression_with_ties_in_differently_sized_groups():
    """
    Non-regression test to handle issue 9432:
    https://github.com/scikit-learn/scikit-learn/issues/9432

    Compare against output in R:
    > library("isotone")
    > x <- c(0, 1, 1, 2, 3, 4)
    > y <- c(0, 0, 1, 0, 0, 1)
    > res1 <- gpava(x, y, ties="secondary")
    > res1$x

    `isotone` version: 1.1-0, 2015-07-24
    R version: R version 3.3.2 (2016-10-31)
    """
    x = np.array([0, 1, 1, 2, 3, 4])
    y = np.array([0, 0, 1, 0, 0, 1])
    y_true = np.array([0.0, 0.25, 0.25, 0.25, 0.25, 1.0])
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_almost_equal(ir.transform(x), y_true)
    assert_array_almost_equal(ir.fit_transform(x, y), y_true)


def test_isotonic_regression_reversed():
    y = np.array([10, 9, 10, 7, 6, 6.1, 5])
    y_result = np.array([10, 9.5, 9.5, 7, 6.05, 6.05, 5])

    y_iso = isotonic_regression(y, increasing=False)
    assert_allclose(y_iso, y_result)

    y_ = IsotonicRegression(increasing=False).fit_transform(np.arange(len(y)), y)
    assert_allclose(y_, y_result)
    assert_array_equal(np.ones(y_[:-1].shape), ((y_[:-1] - y_[1:]) >= 0))


def test_isotonic_regression_auto_decreasing():
    # Set y and x for decreasing
    y = np.array([10, 9, 10, 7, 6, 6.1, 5])
    x = np.arange(len(y))

    # Create model and fit_transform
    ir = IsotonicRegression(increasing="auto")
    y_ = ir.fit_transform(x, y)
    # Check that relationship decreases
    is_increasing = y_[0] < y_[-1]
    assert not is_increasing


def test_isotonic_regression_auto_increasing():
    # Set y and x for increasing
    y = np.array([5, 6.1, 6, 7, 10, 9, 10])
    x = np.arange(len(y))

    # Create model and fit_transform
    ir = IsotonicRegression(increasing="auto")
    y_ = ir.fit_transform(x, y)

    # Check that relationship increases
    is_increasing = y_[0] < y_[-1]
    assert is_increasing


def test_assert_raises_exceptions():
    ir = IsotonicRegression()
    rng = np.random.RandomState(42)

    msg = "Found input variables with inconsistent numbers of samples"
    with pytest.raises(ValueError, match=msg):
        ir.fit([0, 1, 2], [5, 7, 3], [0.1, 0.6])

    with pytest.raises(ValueError, match=msg):
        ir.fit([0, 1, 2], [5, 7])

    msg = "X should be a 1d array"
    with pytest.raises(ValueError, match=msg):
        ir.fit(rng.randn(3, 10), [0, 1, 2])

    msg = "Isotonic regression input X should be a 1d array"
    with pytest.raises(ValueError, match=msg):
        ir.transform(rng.randn(3, 10))


def test_isotonic_sample_weight_parameter_default_value():
    # check that the default value of the sample_weight parameter is one
    ir = IsotonicRegression()
    # random test data
    rng = np.random.RandomState(42)
    n = 100
    x = np.arange(n)
    y = rng.randint(-50, 50, size=(n,)) + 50.0 * np.log(1 + np.arange(n))
    # check if value is correctly used
    weights = np.ones(n)
    y_set_value = ir.fit_transform(x, y, sample_weight=weights)
    y_default_value = ir.fit_transform(x, y)

    assert_array_equal(y_set_value, y_default_value)


def test_isotonic_min_max_boundaries():
    # check that y_min and y_max are enforced correctly
    ir = IsotonicRegression(y_min=2, y_max=4)
    n = 6
    x = np.arange(n)
    y = np.arange(n)
    y_test = [2, 2, 2, 3, 4, 4]
    y_result = np.round(ir.fit_transform(x, y))
    assert_array_equal(y_result, y_test)


def test_isotonic_sample_weight():
    ir = IsotonicRegression()
    x = [1, 2, 3, 4, 5, 6, 7]
    y = [1, 41, 51, 1, 2, 5, 24]
    sample_weight = [1, 2, 3, 4, 5, 6, 7]
    expected_y = [1, 13.95, 13.95, 13.95, 13.95, 13.95, 24]
    received_y = ir.fit_transform(x, y, sample_weight=sample_weight)

    assert_array_equal(expected_y, received_y)


def test_isotonic_regression_oob_raise():
    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing="auto", out_of_bounds="raise")
    ir.fit(x, y)

    # Check that an exception is thrown
    msg = "in x_new is below the interpolation range"
    with pytest.raises(ValueError, match=msg):
        ir.predict([min(x) - 10, max(x) + 10])


def test_isotonic_regression_oob_clip():
    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing="auto", out_of_bounds="clip")
    ir.fit(x, y)

    # Predict from training and test x and check that min/max match.
    y1 = ir.predict([min(x) - 10, max(x) + 10])
    y2 = ir.predict(x)
    assert max(y1) == max(y2)
    assert min(y1) == min(y2)


def test_isotonic_regression_oob_nan():
    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing="auto", out_of_bounds="nan")
    ir.fit(x, y)

    # Predict from training and test x and check that we have two NaNs.
    y1 = ir.predict([min(x) - 10, max(x) + 10])
    assert sum(np.isnan(y1)) == 2


def test_isotonic_regression_pickle():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing="auto", out_of_bounds="clip")
    ir.fit(x, y)

    ir_ser = pickle.dumps(ir, pickle.HIGHEST_PROTOCOL)
    ir2 = pickle.loads(ir_ser)
    np.testing.assert_array_equal(ir.predict(x), ir2.predict(x))


def test_isotonic_duplicate_min_entry():
    x = [0, 0, 1]
    y = [0, 0, 1]

    ir = IsotonicRegression(increasing=True, out_of_bounds="clip")
    ir.fit(x, y)
    all_predictions_finite = np.all(np.isfinite(ir.predict(x)))
    assert all_predictions_finite


def test_isotonic_ymin_ymax():
    # Test from @NelleV's issue:
    # https://github.com/scikit-learn/scikit-learn/issues/6921
    x = np.array(
        [
            1.263, 1.318, -0.572, 0.307, -0.707,
            -0.176, -1.599, 1.059, 1.396, 1.906,
            0.210, 0.028, -0.081, 0.444, 0.018,
            -0.377, -0.896, -0.377, -1.327, 0.180,
        ]
    )
    y = isotonic_regression(x, y_min=0.0, y_max=0.1)

    assert np.all(y >= 0)
    assert np.all(y <= 0.1)

    # Also test decreasing case since the logic there is different
    y = isotonic_regression(x, y_min=0.0, y_max=0.1, increasing=False)

    assert np.all(y >= 0)
    assert np.all(y <= 0.1)

    # Finally, test with only one bound
    y = isotonic_regression(x, y_min=0.0, increasing=False)

    assert np.all(y >= 0)


def test_isotonic_zero_weight_loop():
    # Test from @ogrisel's issue:
    # https://github.com/scikit-learn/scikit-learn/issues/4297

    # Get deterministic RNG with seed
    rng = np.random.RandomState(42)

    # Create regression and samples
    regression = IsotonicRegression()
    n_samples = 50
    x = np.linspace(-3, 3, n_samples)
    y = x + rng.uniform(size=n_samples)

    # Get some random weights and zero out
    w = rng.uniform(size=n_samples)
    w[5:8] = 0
    regression.fit(x, y, sample_weight=w)

    # This will hang in failure case.
    regression.fit(x, y, sample_weight=w)


def test_fast_predict():
    # test that the faster prediction change doesn't
    # affect out-of-sample predictions:
    # https://github.com/scikit-learn/scikit-learn/pull/6206
    rng = np.random.RandomState(123)
    n_samples = 10**3
    # X values over the -10,10 range
    X_train = 20.0 * rng.rand(n_samples) - 10
    y_train = (
        np.less(rng.rand(n_samples), expit(X_train)).astype("int64").astype("float64")
    )

    weights = rng.rand(n_samples)
    # we also want to test that everything still works when some weights are 0
    weights[rng.rand(n_samples) < 0.1] = 0

    slow_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
    fast_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")

    # Build interpolation function with ALL input data, not just the
    # non-redundant subset. The following 2 lines are taken from the
    # .fit() method, without removing unnecessary points
    X_train_fit, y_train_fit = slow_model._build_y(
        X_train, y_train, sample_weight=weights, trim_duplicates=False
    )
    slow_model._build_f(X_train_fit, y_train_fit)

    # fit with just the necessary data
    fast_model.fit(X_train, y_train, sample_weight=weights)

    X_test = 20.0 * rng.rand(n_samples) - 10
    y_pred_slow = slow_model.predict(X_test)
    y_pred_fast = fast_model.predict(X_test)

    assert_array_equal(y_pred_slow, y_pred_fast)


def test_isotonic_copy_before_fit():
    # https://github.com/scikit-learn/scikit-learn/issues/6628
    ir = IsotonicRegression()
    copy.copy(ir)


@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
def test_isotonic_dtype(dtype):
    y = [2, 1, 4, 3, 5]
    weights = np.array([0.9, 0.9, 0.9, 0.9, 0.9], dtype=np.float64)
    reg = IsotonicRegression()

    for sample_weight in (None, weights.astype(np.float32), weights):
        y_np = np.array(y, dtype=dtype)
        expected_dtype = check_array(
            y_np, dtype=[np.float64, np.float32], ensure_2d=False
        ).dtype

        res = isotonic_regression(y_np, sample_weight=sample_weight)
        assert res.dtype == expected_dtype

        X = np.arange(len(y)).astype(dtype)
        reg.fit(X, y_np, sample_weight=sample_weight)
        res = reg.predict(X)
        assert res.dtype == expected_dtype


@pytest.mark.parametrize("y_dtype", [np.int32, np.int64, np.float32, np.float64])
def test_isotonic_mismatched_dtype(y_dtype):
    # regression test for #15004
    # check that data are converted when X and y dtype differ
    reg = IsotonicRegression()
    y = np.array([2, 1, 4, 3, 5], dtype=y_dtype)
    X = np.arange(len(y), dtype=np.float32)
    reg.fit(X, y)
    assert reg.predict(X).dtype == X.dtype


def test_make_unique_dtype():
    x_list = [2, 2, 2, 3, 5]
    for dtype in (np.float32, np.float64):
        x = np.array(x_list, dtype=dtype)
        y = x.copy()
        w = np.ones_like(x)
        x, y, w = _make_unique(x, y, w)
        assert_array_equal(x, [2, 3, 5])


@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_make_unique_tolerance(dtype):
    # Check that equality takes account of np.finfo tolerance
    x = np.array([0, 1e-16, 1, 1 + 1e-14], dtype=dtype)
    y = x.copy()
    w = np.ones_like(x)
    x, y, w = _make_unique(x, y, w)
    if dtype == np.float64:
        x_out = np.array([0, 1, 1 + 1e-14])
    else:
        x_out = np.array([0, 1])
    assert_array_equal(x, x_out)


def test_isotonic_make_unique_tolerance():
    # Check that averaging of targets for duplicate X is done correctly,
    # taking into account tolerance
    X = np.array([0, 1, 1 + 1e-16, 2], dtype=np.float64)
    y = np.array([0, 1, 2, 3], dtype=np.float64)
    ireg = IsotonicRegression().fit(X, y)
    y_pred = ireg.predict([0, 0.5, 1, 1.5, 2])

    assert_array_equal(y_pred, np.array([0, 0.75, 1.5, 2.25, 3]))
    assert_array_equal(ireg.X_thresholds_, np.array([0.0, 1.0, 2.0]))
    assert_array_equal(ireg.y_thresholds_, np.array([0.0, 1.5, 3.0]))


def test_isotonic_non_regression_inf_slope():
    # Non-regression test to ensure that inf values are not returned
    # see: https://github.com/scikit-learn/scikit-learn/issues/10903
    X = np.array([0.0, 4.1e-320, 4.4e-314, 1.0])
    y = np.array([0.42, 0.42, 0.44, 0.44])
    ireg = IsotonicRegression().fit(X, y)
    y_pred = ireg.predict(np.array([0, 2.1e-319, 5.4e-316, 1e-10]))
    assert np.all(np.isfinite(y_pred))


@pytest.mark.parametrize("increasing", [True, False])
def test_isotonic_thresholds(increasing):
    rng = np.random.RandomState(42)
    n_samples = 30
    X = rng.normal(size=n_samples)
    y = rng.normal(size=n_samples)
    ireg = IsotonicRegression(increasing=increasing).fit(X, y)
    X_thresholds, y_thresholds = ireg.X_thresholds_, ireg.y_thresholds_
    assert X_thresholds.shape == y_thresholds.shape

    # Input thresholds are a strict subset of the training set (unless
    # the data is already strictly monotonic which is not the case with
    # this random data)
    assert X_thresholds.shape[0] < X.shape[0]
    assert np.isin(X_thresholds, X).all()

    # Output thresholds lie in the range of the training set:
    assert y_thresholds.max() <= y.max()
    assert y_thresholds.min() >= y.min()

    assert all(np.diff(X_thresholds) > 0)
    if increasing:
        assert all(np.diff(y_thresholds) >= 0)
    else:
        assert all(np.diff(y_thresholds) <= 0)


def test_input_shape_validation():
    # Test from #15012
    # Check that IsotonicRegression can handle 2darray with only 1 feature
    X = np.arange(10)
    X_2d = X.reshape(-1, 1)
    y = np.arange(10)

    iso_reg = IsotonicRegression().fit(X, y)
    iso_reg_2d = IsotonicRegression().fit(X_2d, y)

    assert iso_reg.X_max_ == iso_reg_2d.X_max_
    assert iso_reg.X_min_ == iso_reg_2d.X_min_
    assert iso_reg.y_max == iso_reg_2d.y_max
    assert iso_reg.y_min == iso_reg_2d.y_min
    assert_array_equal(iso_reg.X_thresholds_, iso_reg_2d.X_thresholds_)
    assert_array_equal(iso_reg.y_thresholds_, iso_reg_2d.y_thresholds_)

    y_pred1 = iso_reg.predict(X)
    y_pred2 = iso_reg_2d.predict(X_2d)
    assert_allclose(y_pred1, y_pred2)


def test_isotonic_2darray_more_than_1_feature():
    # Ensure IsotonicRegression raises error if input has more than 1 feature
    X = np.arange(10)
    X_2d = np.c_[X, X]
    y = np.arange(10)

    msg = "should be a 1d array or 2d array with 1 feature"
    with pytest.raises(ValueError, match=msg):
        IsotonicRegression().fit(X_2d, y)

    iso_reg = IsotonicRegression().fit(X, y)
    with pytest.raises(ValueError, match=msg):
        iso_reg.predict(X_2d)

    with pytest.raises(ValueError, match=msg):
        iso_reg.transform(X_2d)


def test_isotonic_regression_sample_weight_not_overwritten():
    """Check that calling the fitting functions of isotonic regression does
    not overwrite `sample_weight`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20508
    """
    X, y = make_regression(n_samples=10, n_features=1, random_state=41)
    sample_weight_original = np.ones_like(y)
    sample_weight_original[0] = 10
    sample_weight_fit = sample_weight_original.copy()

    isotonic_regression(y, sample_weight=sample_weight_fit)
    assert_allclose(sample_weight_fit, sample_weight_original)

    IsotonicRegression().fit(X, y, sample_weight=sample_weight_fit)
    assert_allclose(sample_weight_fit, sample_weight_original)


@pytest.mark.parametrize("shape", ["1d", "2d"])
def test_get_feature_names_out(shape):
    """Check `get_feature_names_out` for `IsotonicRegression`."""
    X = np.arange(10)
    if shape == "2d":
        X = X.reshape(-1, 1)
    y = np.arange(10)

    iso = IsotonicRegression().fit(X, y)
    names = iso.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(["isotonicregression0"], names)


def test_isotonic_regression_output_predict():
    """Check that `predict` returns the expected output type.

    We need to check that `transform` outputs a DataFrame while `predict`
    outputs a NumPy array when we set `transform_output` to `pandas`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/25499
    """
    pd = pytest.importorskip("pandas")
    X, y = make_regression(n_samples=10, n_features=1, random_state=42)
    regressor = IsotonicRegression()
    with sklearn.config_context(transform_output="pandas"):
        regressor.fit(X, y)
        X_trans = regressor.transform(X)
        y_pred = regressor.predict(X)

    assert isinstance(X_trans, pd.DataFrame)
    assert isinstance(y_pred, np.ndarray)
@@ -0,0 +1,495 @@
import re

import numpy as np
import pytest

from sklearn.datasets import make_classification
from sklearn.kernel_approximation import (
    AdditiveChi2Sampler,
    Nystroem,
    PolynomialCountSketch,
    RBFSampler,
    SkewedChi2Sampler,
)
from sklearn.metrics.pairwise import (
    chi2_kernel,
    kernel_metrics,
    polynomial_kernel,
    rbf_kernel,
)
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS

# generate data
rng = np.random.RandomState(0)
X = rng.random_sample(size=(300, 50))
Y = rng.random_sample(size=(300, 50))
X /= X.sum(axis=1)[:, np.newaxis]
Y /= Y.sum(axis=1)[:, np.newaxis]

# Make sure X and Y are not writable to avoid introducing dependencies between
# tests.
X.flags.writeable = False
Y.flags.writeable = False


@pytest.mark.parametrize("gamma", [0.1, 1, 2.5])
@pytest.mark.parametrize("degree, n_components", [(1, 500), (2, 500), (3, 5000)])
@pytest.mark.parametrize("coef0", [0, 2.5])
def test_polynomial_count_sketch(gamma, degree, coef0, n_components):
    # test that PolynomialCountSketch approximates the polynomial
    # kernel on random data

    # compute exact kernel
    kernel = polynomial_kernel(X, Y, gamma=gamma, degree=degree, coef0=coef0)

    # approximate kernel mapping
    ps_transform = PolynomialCountSketch(
        n_components=n_components,
        gamma=gamma,
        coef0=coef0,
        degree=degree,
        random_state=42,
    )
    X_trans = ps_transform.fit_transform(X)
    Y_trans = ps_transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    error = kernel - kernel_approx
    assert np.abs(np.mean(error)) <= 0.05  # close to unbiased
    np.abs(error, out=error)
    assert np.max(error) <= 0.1  # nothing too far off
    assert np.mean(error) <= 0.05  # mean is fairly close


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("gamma", [0.1, 1.0])
@pytest.mark.parametrize("degree", [1, 2, 3])
@pytest.mark.parametrize("coef0", [0, 2.5])
def test_polynomial_count_sketch_dense_sparse(gamma, degree, coef0, csr_container):
    """Check that PolynomialCountSketch results are the same for dense and sparse
    input.
    """
    ps_dense = PolynomialCountSketch(
        n_components=500, gamma=gamma, degree=degree, coef0=coef0, random_state=42
    )
    Xt_dense = ps_dense.fit_transform(X)
    Yt_dense = ps_dense.transform(Y)

    ps_sparse = PolynomialCountSketch(
        n_components=500, gamma=gamma, degree=degree, coef0=coef0, random_state=42
    )
    Xt_sparse = ps_sparse.fit_transform(csr_container(X))
    Yt_sparse = ps_sparse.transform(csr_container(Y))

    assert_allclose(Xt_dense, Xt_sparse)
    assert_allclose(Yt_dense, Yt_sparse)


def _linear_kernel(X, Y):
    return np.dot(X, Y.T)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_additive_chi2_sampler(csr_container):
    # test that AdditiveChi2Sampler approximates kernel on random data

    # compute exact kernel
    # abbreviations for easier formula
    X_ = X[:, np.newaxis, :].copy()
    Y_ = Y[np.newaxis, :, :].copy()

    large_kernel = 2 * X_ * Y_ / (X_ + Y_)

    # reduce to n_samples_x x n_samples_y by summing over features
    kernel = large_kernel.sum(axis=2)

    # approximate kernel mapping
    transform = AdditiveChi2Sampler(sample_steps=3)
    X_trans = transform.fit_transform(X)
    Y_trans = transform.transform(Y)

    kernel_approx = np.dot(X_trans, Y_trans.T)

    assert_array_almost_equal(kernel, kernel_approx, 1)

    X_sp_trans = transform.fit_transform(csr_container(X))
    Y_sp_trans = transform.transform(csr_container(Y))

    assert_array_equal(X_trans, X_sp_trans.toarray())
    assert_array_equal(Y_trans, Y_sp_trans.toarray())

    # test that an error is raised on negative input
    Y_neg = Y.copy()
    Y_neg[0, 0] = -1
    msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=msg):
        transform.fit(Y_neg)


@pytest.mark.parametrize("method", ["fit", "fit_transform", "transform"])
@pytest.mark.parametrize("sample_steps", range(1, 4))
def test_additive_chi2_sampler_sample_steps(method, sample_steps):
    """Check that the input sample_steps doesn't raise an error
    and that sample_interval doesn't change after fit.
    """
    transformer = AdditiveChi2Sampler(sample_steps=sample_steps)
    getattr(transformer, method)(X)

    sample_interval = 0.5
    transformer = AdditiveChi2Sampler(
        sample_steps=sample_steps,
        sample_interval=sample_interval,
    )
    getattr(transformer, method)(X)
    assert transformer.sample_interval == sample_interval


@pytest.mark.parametrize("method", ["fit", "fit_transform", "transform"])
def test_additive_chi2_sampler_wrong_sample_steps(method):
    """Check that we raise a ValueError on invalid sample_steps"""
    transformer = AdditiveChi2Sampler(sample_steps=4)
    msg = re.escape(
        "If sample_steps is not in [1, 2, 3], you need to provide sample_interval"
    )
    with pytest.raises(ValueError, match=msg):
        getattr(transformer, method)(X)


def test_skewed_chi2_sampler():
    # test that SkewedChi2Sampler approximates the skewed chi2 kernel
    # on random data

    # compute exact kernel
    c = 0.03
    # set one component negative but greater than -c to ensure that the kernel
    # approximation is valid on the interval (-c; +\infty) endowed with the
    # skewed multiplication.
    Y_ = Y.copy()
    Y_[0, 0] = -c / 2.0

    # abbreviations for easier formula
    X_c = (X + c)[:, np.newaxis, :]
    Y_c = (Y_ + c)[np.newaxis, :, :]

    # we do it in log-space in the hope that it's more stable
    # this array is n_samples_x x n_samples_y big x n_features
    log_kernel = (
        (np.log(X_c) / 2.0) + (np.log(Y_c) / 2.0) + np.log(2.0) - np.log(X_c + Y_c)
    )
    # reduce to n_samples_x x n_samples_y by summing over features in log-space
    kernel = np.exp(log_kernel.sum(axis=2))

    # approximate kernel mapping
    transform = SkewedChi2Sampler(skewedness=c, n_components=1000, random_state=42)
    X_trans = transform.fit_transform(X)
    Y_trans = transform.transform(Y_)

    kernel_approx = np.dot(X_trans, Y_trans.T)
    assert_array_almost_equal(kernel, kernel_approx, 1)
    assert np.isfinite(kernel).all(), "NaNs found in the Gram matrix"
    assert np.isfinite(kernel_approx).all(), "NaNs found in the approximate Gram matrix"

    # test that an error is raised when the input contains values smaller than -c
    Y_neg = Y_.copy()
    Y_neg[0, 0] = -c * 2.0
    msg = "X may not contain entries smaller than -skewedness"
    with pytest.raises(ValueError, match=msg):
        transform.transform(Y_neg)


def test_additive_chi2_sampler_exceptions():
    """Ensures correct error message"""
    transformer = AdditiveChi2Sampler()
    X_neg = X.copy()
    X_neg[0, 0] = -1
    with pytest.raises(ValueError, match="X in AdditiveChi2Sampler"):
        transformer.fit(X_neg)
    with pytest.raises(ValueError, match="X in AdditiveChi2Sampler"):
        transformer.fit(X)
        transformer.transform(X_neg)


def test_rbf_sampler():
    # test that RBFSampler approximates kernel on random data
    # compute exact kernel
    gamma = 10.0
    kernel = rbf_kernel(X, Y, gamma=gamma)

    # approximate kernel mapping
    rbf_transform = RBFSampler(gamma=gamma, n_components=1000, random_state=42)
    X_trans = rbf_transform.fit_transform(X)
    Y_trans = rbf_transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    error = kernel - kernel_approx
    assert np.abs(np.mean(error)) <= 0.01  # close to unbiased
    np.abs(error, out=error)
    assert np.max(error) <= 0.1  # nothing too far off
    assert np.mean(error) <= 0.05  # mean is fairly close


def test_rbf_sampler_fitted_attributes_dtype(global_dtype):
    """Check that the fitted attributes are stored accordingly to the
    data type of X."""
    rbf = RBFSampler()

    X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)

    rbf.fit(X)

    assert rbf.random_offset_.dtype == global_dtype
    assert rbf.random_weights_.dtype == global_dtype


def test_rbf_sampler_dtype_equivalence():
    """Check the equivalence of the results with 32 and 64 bits input."""
    rbf32 = RBFSampler(random_state=42)
    X32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
    rbf32.fit(X32)

    rbf64 = RBFSampler(random_state=42)
    X64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
    rbf64.fit(X64)

    assert_allclose(rbf32.random_offset_, rbf64.random_offset_)
    assert_allclose(rbf32.random_weights_, rbf64.random_weights_)


def test_rbf_sampler_gamma_scale():
    """Check the inner value computed when `gamma='scale'`."""
    X, y = [[0.0], [1.0]], [0, 1]
    rbf = RBFSampler(gamma="scale")
    rbf.fit(X, y)
    assert rbf._gamma == pytest.approx(4)


def test_skewed_chi2_sampler_fitted_attributes_dtype(global_dtype):
    """Check that the fitted attributes are stored accordingly to the
    data type of X."""
    skewed_chi2_sampler = SkewedChi2Sampler()

    X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)

    skewed_chi2_sampler.fit(X)

    assert skewed_chi2_sampler.random_offset_.dtype == global_dtype
    assert skewed_chi2_sampler.random_weights_.dtype == global_dtype


def test_skewed_chi2_sampler_dtype_equivalence():
    """Check the equivalence of the results with 32 and 64 bits input."""
    skewed_chi2_sampler_32 = SkewedChi2Sampler(random_state=42)
    X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
    skewed_chi2_sampler_32.fit(X_32)

    skewed_chi2_sampler_64 = SkewedChi2Sampler(random_state=42)
    X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
    skewed_chi2_sampler_64.fit(X_64)

    assert_allclose(
        skewed_chi2_sampler_32.random_offset_, skewed_chi2_sampler_64.random_offset_
    )
    assert_allclose(
        skewed_chi2_sampler_32.random_weights_, skewed_chi2_sampler_64.random_weights_
    )


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_input_validation(csr_container):
    # Regression test: kernel approx. transformers should work on lists
    # No assertions; the old versions would simply crash
    X = [[1, 2], [3, 4], [5, 6]]
    AdditiveChi2Sampler().fit(X).transform(X)
    SkewedChi2Sampler().fit(X).transform(X)
    RBFSampler().fit(X).transform(X)

    X = csr_container(X)
    RBFSampler().fit(X).transform(X)


def test_nystroem_approximation():
    # some basic tests
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 4))

    # With n_components = n_samples this is exact
    X_transformed = Nystroem(n_components=X.shape[0]).fit_transform(X)
    K = rbf_kernel(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)

    trans = Nystroem(n_components=2, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert X_transformed.shape == (X.shape[0], 2)

    # test callable kernel
    trans = Nystroem(n_components=2, kernel=_linear_kernel, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert X_transformed.shape == (X.shape[0], 2)

    # test that available kernels fit and transform
    kernels_available = kernel_metrics()
    for kern in kernels_available:
        trans = Nystroem(n_components=2, kernel=kern, random_state=rnd)
        X_transformed = trans.fit(X).transform(X)
        assert X_transformed.shape == (X.shape[0], 2)


def test_nystroem_default_parameters():
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(10, 4))

    # rbf kernel should behave as gamma=None by default
    # aka gamma = 1 / n_features
    nystroem = Nystroem(n_components=10)
    X_transformed = nystroem.fit_transform(X)
    K = rbf_kernel(X, gamma=None)
    K2 = np.dot(X_transformed, X_transformed.T)
    assert_array_almost_equal(K, K2)

    # chi2 kernel should behave as gamma=1 by default
    nystroem = Nystroem(kernel="chi2", n_components=10)
    X_transformed = nystroem.fit_transform(X)
    K = chi2_kernel(X, gamma=1)
    K2 = np.dot(X_transformed, X_transformed.T)
    assert_array_almost_equal(K, K2)


def test_nystroem_singular_kernel():
    # test that nystroem works with singular kernel matrix
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    X = np.vstack([X] * 2)  # duplicate samples

    gamma = 100
    N = Nystroem(gamma=gamma, n_components=X.shape[0]).fit(X)
    X_transformed = N.transform(X)

    K = rbf_kernel(X, gamma=gamma)

    assert_array_almost_equal(K, np.dot(X_transformed, X_transformed.T))
    # the transformed features should be finite despite the singular kernel
    assert np.all(np.isfinite(X_transformed))


def test_nystroem_poly_kernel_params():
    # Non-regression: Nystroem should pass other parameters beside gamma.
    rnd = np.random.RandomState(37)
    X = rnd.uniform(size=(10, 4))

    K = polynomial_kernel(X, degree=3.1, coef0=0.1)
    nystroem = Nystroem(
        kernel="polynomial", n_components=X.shape[0], degree=3.1, coef0=0.1
    )
    X_transformed = nystroem.fit_transform(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)


def test_nystroem_callable():
    # Test Nystroem on a callable.
    rnd = np.random.RandomState(42)
    n_samples = 10
    X = rnd.uniform(size=(n_samples, 4))

    def logging_histogram_kernel(x, y, log):
        """Histogram kernel that writes to a log."""
        log.append(1)
        return np.minimum(x, y).sum()

    kernel_log = []
    X = list(X)  # test input validation
    Nystroem(
        kernel=logging_histogram_kernel,
        n_components=(n_samples - 1),
        kernel_params={"log": kernel_log},
    ).fit(X)
    assert len(kernel_log) == n_samples * (n_samples - 1) / 2

    # if degree, gamma or coef0 is passed, we raise a ValueError
    msg = "Don't pass gamma, coef0 or degree to Nystroem"
    params = ({"gamma": 1}, {"coef0": 1}, {"degree": 2})
    for param in params:
        ny = Nystroem(kernel=_linear_kernel, n_components=(n_samples - 1), **param)
        with pytest.raises(ValueError, match=msg):
            ny.fit(X)


def test_nystroem_precomputed_kernel():
    # Non-regression: test Nystroem on precomputed kernel.
    # PR - 14706
    rnd = np.random.RandomState(12)
    X = rnd.uniform(size=(10, 4))

    K = polynomial_kernel(X, degree=2, coef0=0.1)
    nystroem = Nystroem(kernel="precomputed", n_components=X.shape[0])
    X_transformed = nystroem.fit_transform(K)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)

    # if degree, gamma or coef0 is passed, we raise a ValueError
    msg = "Don't pass gamma, coef0 or degree to Nystroem"
    params = ({"gamma": 1}, {"coef0": 1}, {"degree": 2})
    for param in params:
        ny = Nystroem(kernel="precomputed", n_components=X.shape[0], **param)
        with pytest.raises(ValueError, match=msg):
            ny.fit(K)


def test_nystroem_component_indices():
    """Check that `component_indices_` corresponds to the subset of
    training points used to construct the feature map.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20474
    """
    X, _ = make_classification(n_samples=100, n_features=20)
    feature_map_nystroem = Nystroem(
        n_components=10,
        random_state=0,
    )
    feature_map_nystroem.fit(X)
    assert feature_map_nystroem.component_indices_.shape == (10,)


@pytest.mark.parametrize(
    "Estimator", [PolynomialCountSketch, RBFSampler, SkewedChi2Sampler, Nystroem]
)
def test_get_feature_names_out(Estimator):
    """Check get_feature_names_out"""
    est = Estimator().fit(X)
    X_trans = est.transform(X)

    names_out = est.get_feature_names_out()
    class_name = Estimator.__name__.lower()
    expected_names = [f"{class_name}{i}" for i in range(X_trans.shape[1])]
    assert_array_equal(names_out, expected_names)


def test_additivechi2sampler_get_feature_names_out():
    """Check get_feature_names_out for AdditiveChi2Sampler."""
    rng = np.random.RandomState(0)
    X = rng.random_sample(size=(300, 3))

    chi2_sampler = AdditiveChi2Sampler(sample_steps=3).fit(X)
    input_names = ["f0", "f1", "f2"]
    suffixes = [
        "f0_sqrt", "f1_sqrt", "f2_sqrt",
        "f0_cos1", "f1_cos1", "f2_cos1",
        "f0_sin1", "f1_sin1", "f2_sin1",
        "f0_cos2", "f1_cos2", "f2_cos2",
        "f0_sin2", "f1_sin2", "f2_sin2",
    ]

    names_out = chi2_sampler.get_feature_names_out(input_features=input_names)
    expected_names = [f"additivechi2sampler_{suffix}" for suffix in suffixes]
    assert_array_equal(names_out, expected_names)
@@ -0,0 +1,80 @@
import numpy as np
import pytest

from sklearn.datasets import make_regression
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.utils._testing import assert_array_almost_equal, ignore_warnings
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS

X, y = make_regression(n_features=10, random_state=0)
Y = np.array([y, y]).T


def test_kernel_ridge():
    pred = Ridge(alpha=1, fit_intercept=False).fit(X, y).predict(X)
    pred2 = KernelRidge(kernel="linear", alpha=1).fit(X, y).predict(X)
    assert_array_almost_equal(pred, pred2)


@pytest.mark.parametrize("sparse_container", [*CSR_CONTAINERS, *CSC_CONTAINERS])
def test_kernel_ridge_sparse(sparse_container):
    X_sparse = sparse_container(X)
    pred = (
        Ridge(alpha=1, fit_intercept=False, solver="cholesky")
        .fit(X_sparse, y)
        .predict(X_sparse)
    )
    pred2 = KernelRidge(kernel="linear", alpha=1).fit(X_sparse, y).predict(X_sparse)
    assert_array_almost_equal(pred, pred2)


def test_kernel_ridge_singular_kernel():
    # alpha=0 causes a LinAlgError in computing the dual coefficients,
    # which causes a fallback to a least-squares (lstsq) solver. This is
    # tested here.
    pred = Ridge(alpha=0, fit_intercept=False).fit(X, y).predict(X)
    kr = KernelRidge(kernel="linear", alpha=0)
    ignore_warnings(kr.fit)(X, y)
    pred2 = kr.predict(X)
    assert_array_almost_equal(pred, pred2)


def test_kernel_ridge_precomputed():
    for kernel in ["linear", "rbf", "poly", "cosine"]:
        K = pairwise_kernels(X, X, metric=kernel)
        pred = KernelRidge(kernel=kernel).fit(X, y).predict(X)
        pred2 = KernelRidge(kernel="precomputed").fit(K, y).predict(K)
        assert_array_almost_equal(pred, pred2)


def test_kernel_ridge_precomputed_kernel_unchanged():
    K = np.dot(X, X.T)
    K2 = K.copy()
    KernelRidge(kernel="precomputed").fit(K, y)
    assert_array_almost_equal(K, K2)


def test_kernel_ridge_sample_weights():
    K = np.dot(X, X.T)  # precomputed kernel
    sw = np.random.RandomState(0).rand(X.shape[0])

    pred = Ridge(alpha=1, fit_intercept=False).fit(X, y, sample_weight=sw).predict(X)
    pred2 = KernelRidge(kernel="linear", alpha=1).fit(X, y, sample_weight=sw).predict(X)
    pred3 = (
        KernelRidge(kernel="precomputed", alpha=1)
        .fit(K, y, sample_weight=sw)
        .predict(K)
    )
    assert_array_almost_equal(pred, pred2)
    assert_array_almost_equal(pred, pred3)


def test_kernel_ridge_multi_output():
    pred = Ridge(alpha=1, fit_intercept=False).fit(X, Y).predict(X)
    pred2 = KernelRidge(kernel="linear", alpha=1).fit(X, Y).predict(X)
    assert_array_almost_equal(pred, pred2)

    pred3 = KernelRidge(kernel="linear", alpha=1).fit(X, y).predict(X)
    pred3 = np.array([pred3, pred3]).T
    assert_array_almost_equal(pred2, pred3)
File diff suppressed because it is too large
@@ -0,0 +1,340 @@
"""Common tests for metaestimators"""

import functools
from contextlib import suppress
from inspect import signature

import numpy as np
import pytest

from sklearn.base import BaseEstimator, clone, is_regressor
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.utils import all_estimators
from sklearn.utils._test_common.instance_generator import _construct_instances
from sklearn.utils._testing import SkipTest, set_random_state
from sklearn.utils.estimator_checks import (
    _enforce_estimator_tags_X,
    _enforce_estimator_tags_y,
)
from sklearn.utils.validation import check_is_fitted


class DelegatorData:
    def __init__(
        self,
        name,
        construct,
        skip_methods=(),
        fit_args=make_classification(random_state=0),
    ):
        self.name = name
        self.construct = construct
        self.fit_args = fit_args
        self.skip_methods = skip_methods


# For the following meta-estimators we check for the existence of relevant
# methods only if the sub-estimator also contains them. Any methods that
# are implemented in the meta-estimators themselves and do not depend
# on the sub-estimator are specified in the `skip_methods` parameter.
DELEGATING_METAESTIMATORS = [
    DelegatorData("Pipeline", lambda est: Pipeline([("est", est)])),
    DelegatorData(
        "GridSearchCV",
        lambda est: GridSearchCV(est, param_grid={"param": [5]}, cv=2),
        skip_methods=["score"],
    ),
    DelegatorData(
        "RandomizedSearchCV",
        lambda est: RandomizedSearchCV(
            est, param_distributions={"param": [5]}, cv=2, n_iter=1
        ),
        skip_methods=["score"],
    ),
    DelegatorData("RFE", RFE, skip_methods=["transform", "inverse_transform"]),
    DelegatorData(
        "RFECV", RFECV, skip_methods=["transform", "inverse_transform", "score"]
    ),
    DelegatorData(
        "BaggingClassifier",
        BaggingClassifier,
        skip_methods=[
            "transform",
            "inverse_transform",
            "score",
            "predict_proba",
            "predict_log_proba",
            "predict",
        ],
    ),
    DelegatorData(
        "SelfTrainingClassifier",
        lambda est: SelfTrainingClassifier(est),
        skip_methods=["transform", "inverse_transform", "predict_proba"],
    ),
]
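

# The duck-typing contract exercised below is what
# `sklearn.utils.metaestimators.available_if` implements for real
# meta-estimators: a delegated method exists only when the wrapped estimator
# has it. A minimal sketch (illustrative only; `ToyDelegator` is not part of
# scikit-learn or of these tests):
from sklearn.utils.metaestimators import available_if


class ToyDelegator(BaseEstimator):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None):
        self.estimator.fit(X, y)
        return self

    @available_if(lambda self: hasattr(self.estimator, "predict"))
    def predict(self, X):
        # Only exposed (and reachable) when the sub-estimator defines `predict`.
        return self.estimator.predict(X)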


def test_metaestimator_delegation():
    # Ensures specified metaestimators have methods iff subestimator does
    def hides(method):
        @property
        def wrapper(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError("%r is hidden" % obj.hidden_method)
            return functools.partial(method, obj)

        return wrapper

    class SubEstimator(BaseEstimator):
        def __init__(self, param=1, hidden_method=None):
            self.param = param
            self.hidden_method = hidden_method

        def fit(self, X, y=None, *args, **kwargs):
            self.coef_ = np.arange(X.shape[1])
            self.classes_ = []
            return True

        def _check_fit(self):
            check_is_fitted(self)

        @hides
        def inverse_transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def predict(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_log_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def decision_function(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def score(self, X, y, *args, **kwargs):
            self._check_fit()
            return 1.0

    methods = [
        k
        for k in SubEstimator.__dict__.keys()
        if not k.startswith("_") and not k.startswith("fit")
    ]
    methods.sort()

    for delegator_data in DELEGATING_METAESTIMATORS:
        delegate = SubEstimator()
        delegator = delegator_data.construct(delegate)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            assert hasattr(delegate, method)
            assert hasattr(delegator, method), (
                "%s does not have method %r when its delegate does"
                % (
                    delegator_data.name,
                    method,
                )
            )
            # delegation before fit raises a NotFittedError
            if method == "score":
                with pytest.raises(NotFittedError):
                    getattr(delegator, method)(
                        delegator_data.fit_args[0], delegator_data.fit_args[1]
                    )
            else:
                with pytest.raises(NotFittedError):
                    getattr(delegator, method)(delegator_data.fit_args[0])

        delegator.fit(*delegator_data.fit_args)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            # smoke test delegation
            if method == "score":
                getattr(delegator, method)(
                    delegator_data.fit_args[0], delegator_data.fit_args[1]
                )
            else:
                getattr(delegator, method)(delegator_data.fit_args[0])

        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            delegate = SubEstimator(hidden_method=method)
            delegator = delegator_data.construct(delegate)
            assert not hasattr(delegate, method)
            assert not hasattr(delegator, method), (
                "%s has method %r when its delegate does not"
                % (
                    delegator_data.name,
                    method,
                )
            )
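

# Note on the `hides` trick above: decorating a method with `@hides` replaces
# it with a property whose getter raises AttributeError whenever
# `hidden_method` names that method, so `hasattr(delegate, method)` becomes
# False and the delegating meta-estimator is expected to hide the method too.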


def _get_instance_with_pipeline(meta_estimator, init_params):
    """Given a single meta-estimator instance, generate an instance with a pipeline"""
    if {"estimator", "base_estimator", "regressor"} & init_params:
        if is_regressor(meta_estimator):
            estimator = make_pipeline(TfidfVectorizer(), Ridge())
            param_grid = {"ridge__alpha": [0.1, 1.0]}
        else:
            estimator = make_pipeline(TfidfVectorizer(), LogisticRegression())
            param_grid = {"logisticregression__C": [0.1, 1.0]}

        if init_params.intersection(
            {"param_grid", "param_distributions"}
        ):  # SearchCV estimators
            extra_params = {"n_iter": 2} if "n_iter" in init_params else {}
            return type(meta_estimator)(estimator, param_grid, **extra_params)
        else:
            return type(meta_estimator)(estimator)

    if "transformer_list" in init_params:
        # FeatureUnion
        transformer_list = [
            ("trans1", make_pipeline(TfidfVectorizer(), MaxAbsScaler())),
            (
                "trans2",
                make_pipeline(TfidfVectorizer(), StandardScaler(with_mean=False)),
            ),
        ]
        return type(meta_estimator)(transformer_list)

    if "estimators" in init_params:
        # stacking, voting
        if is_regressor(meta_estimator):
            estimator = [
                ("est1", make_pipeline(TfidfVectorizer(), Ridge(alpha=0.1))),
                ("est2", make_pipeline(TfidfVectorizer(), Ridge(alpha=1))),
            ]
        else:
            estimator = [
                (
                    "est1",
                    make_pipeline(TfidfVectorizer(), LogisticRegression(C=0.1)),
                ),
                ("est2", make_pipeline(TfidfVectorizer(), LogisticRegression(C=1))),
            ]
        return type(meta_estimator)(estimator)


def _generate_meta_estimator_instances_with_pipeline():
    """Generate instances of meta-estimators fed with a pipeline

    All estimators accepting one of "estimator", "base_estimator" or
    "estimators" are considered meta-estimators.
    """
    for _, Estimator in sorted(all_estimators()):
        sig = set(signature(Estimator).parameters)
        if not sig.intersection(
            {
                "estimator",
                "base_estimator",
                "regressor",
                "transformer_list",
                "estimators",
            }
        ):
            continue

        with suppress(SkipTest):
            for meta_estimator in _construct_instances(Estimator):
                yield _get_instance_with_pipeline(meta_estimator, sig)


# TODO: remove data validation for the following estimators
# They should be able to work on any data and delegate data validation to
# their inner estimator(s).
DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE = [
    "AdaBoostClassifier",
    "AdaBoostRegressor",
    "BaggingClassifier",
    "BaggingRegressor",
    "ClassifierChain",  # data validation is necessary
    "FrozenEstimator",  # this estimator cannot be tested like others.
    "IterativeImputer",
    "OneVsOneClassifier",  # input validation can't be avoided
    "RANSACRegressor",
    "RFE",
    "RFECV",
    "RegressorChain",  # data validation is necessary
    "SelfTrainingClassifier",
    "SequentialFeatureSelector",  # not applicable (2D data mandatory)
]

DATA_VALIDATION_META_ESTIMATORS = [
    est
    for est in _generate_meta_estimator_instances_with_pipeline()
    if est.__class__.__name__ not in DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE
]


def _get_meta_estimator_id(estimator):
    return estimator.__class__.__name__


@pytest.mark.parametrize(
    "estimator", DATA_VALIDATION_META_ESTIMATORS, ids=_get_meta_estimator_id
)
def test_meta_estimators_delegate_data_validation(estimator):
    # Check that meta-estimators delegate data validation to the inner
    # estimator(s).

    # clone to avoid side effects and ensure thread-safe test execution.
    estimator = clone(estimator)
    rng = np.random.RandomState(0)
    set_random_state(estimator)

    n_samples = 30
    X = rng.choice(np.array(["aa", "bb", "cc"], dtype=object), size=n_samples)

    if is_regressor(estimator):
        y = rng.normal(size=n_samples)
    else:
        y = rng.randint(3, size=n_samples)

    # We convert to lists to make sure it works on array-like
    X = _enforce_estimator_tags_X(estimator, X).tolist()
    y = _enforce_estimator_tags_y(estimator, y).tolist()

    # Calling fit should not raise any data validation exception since X is a
    # valid input data structure for the first step of the pipeline passed as
    # base estimator to the meta-estimator.
    estimator.fit(X, y)

    # n_features_in_ should not be defined since the data is not tabular.
    assert not hasattr(estimator, "n_features_in_")
@@ -0,0 +1,967 @@
import copy
import re

import numpy as np
import pytest

from sklearn import config_context
from sklearn.base import BaseEstimator, is_classifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.covariance import GraphicalLassoCV
from sklearn.ensemble import (
    AdaBoostClassifier,
    AdaBoostRegressor,
    BaggingClassifier,
    BaggingRegressor,
)
from sklearn.exceptions import UnsetMetadataPassedError
from sklearn.experimental import (
    enable_halving_search_cv,  # noqa: F401
    enable_iterative_imputer,  # noqa: F401
)
from sklearn.feature_selection import (
    RFE,
    RFECV,
    SelectFromModel,
    SequentialFeatureSelector,
)
from sklearn.impute import IterativeImputer
from sklearn.linear_model import (
    ElasticNetCV,
    LarsCV,
    LassoCV,
    LassoLarsCV,
    LogisticRegressionCV,
    MultiTaskElasticNetCV,
    MultiTaskLassoCV,
    OrthogonalMatchingPursuitCV,
    RANSACRegressor,
    RidgeClassifierCV,
    RidgeCV,
)
from sklearn.metrics._regression import mean_squared_error
from sklearn.metrics._scorer import make_scorer
from sklearn.model_selection import (
    FixedThresholdClassifier,
    GridSearchCV,
    GroupKFold,
    HalvingGridSearchCV,
    HalvingRandomSearchCV,
    RandomizedSearchCV,
    TunedThresholdClassifierCV,
    cross_validate,
)
from sklearn.multiclass import (
    OneVsOneClassifier,
    OneVsRestClassifier,
    OutputCodeClassifier,
)
from sklearn.multioutput import (
    ClassifierChain,
    MultiOutputClassifier,
    MultiOutputRegressor,
    RegressorChain,
)
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.tests.metadata_routing_common import (
    ConsumingClassifier,
    ConsumingRegressor,
    ConsumingScorer,
    ConsumingSplitter,
    NonConsumingClassifier,
    NonConsumingRegressor,
    _Registry,
    assert_request_is_empty,
    check_recorded_metadata,
)
from sklearn.utils.metadata_routing import MetadataRouter

rng = np.random.RandomState(42)
N, M = 100, 4
X = rng.rand(N, M)
y = rng.randint(0, 3, size=N)
y_binary = (y >= 1).astype(int)
classes = np.unique(y)
y_multi = rng.randint(0, 3, size=(N, 3))
classes_multi = [np.unique(y_multi[:, i]) for i in range(y_multi.shape[1])]
metadata = rng.randint(0, 10, size=N)
sample_weight = rng.rand(N)
groups = rng.randint(0, 10, size=len(y))
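
# For reference, the fixtures above give: X with shape (100, 4); y, y_binary,
# metadata, sample_weight and groups all of length 100, aligned with X's rows;
# and y_multi with shape (100, 3) for the multi-output estimators.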


METAESTIMATORS: list = [
    {
        "metaestimator": MultiOutputRegressor,
        "estimator_name": "estimator",
        "estimator": "regressor",
        "X": X,
        "y": y_multi,
        "estimator_routing_methods": ["fit", "partial_fit"],
    },
    {
        "metaestimator": MultiOutputClassifier,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y_multi,
        "estimator_routing_methods": ["fit", "partial_fit"],
        "method_args": {"partial_fit": {"classes": classes_multi}},
    },
    {
        "metaestimator": CalibratedClassifierCV,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit"],
        "preserves_metadata": "subset",
    },
    {
        "metaestimator": ClassifierChain,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y_multi,
        "estimator_routing_methods": ["fit"],
    },
    {
        "metaestimator": RegressorChain,
        "estimator_name": "estimator",
        "estimator": "regressor",
        "X": X,
        "y": y_multi,
        "estimator_routing_methods": ["fit"],
    },
    {
        "metaestimator": LogisticRegressionCV,
        "init_args": {"use_legacy_attributes": False, "l1_ratios": (0,)},
        "X": X,
        "y": y,
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit", "score"],
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": GridSearchCV,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "init_args": {"param_grid": {"alpha": [0.1, 0.2]}},
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit"],
        "preserves_metadata": "subset",
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit", "score"],
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": RandomizedSearchCV,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "init_args": {"param_distributions": {"alpha": [0.1, 0.2]}},
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit"],
        "preserves_metadata": "subset",
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit", "score"],
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": HalvingGridSearchCV,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "init_args": {"param_grid": {"alpha": [0.1, 0.2]}},
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit"],
        "preserves_metadata": "subset",
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit", "score"],
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": HalvingRandomSearchCV,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "init_args": {"param_distributions": {"alpha": [0.1, 0.2]}},
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit"],
        "preserves_metadata": "subset",
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit", "score"],
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": FixedThresholdClassifier,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y_binary,
        "estimator_routing_methods": ["fit"],
        "preserves_metadata": "subset",
    },
    {
        "metaestimator": TunedThresholdClassifierCV,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y_binary,
        "estimator_routing_methods": ["fit"],
        "preserves_metadata": "subset",
    },
    {
        "metaestimator": OneVsRestClassifier,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit", "partial_fit"],
        "method_args": {"partial_fit": {"classes": classes}},
    },
    {
        "metaestimator": OneVsOneClassifier,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit", "partial_fit"],
        "preserves_metadata": "subset",
        "method_args": {"partial_fit": {"classes": classes}},
    },
    {
        "metaestimator": OutputCodeClassifier,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "init_args": {"random_state": 42},
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit"],
    },
    {
        "metaestimator": SelectFromModel,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit", "partial_fit"],
        "method_args": {"partial_fit": {"classes": classes}},
    },
    {
        "metaestimator": OrthogonalMatchingPursuitCV,
        "X": X,
        "y": y,
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": ElasticNetCV,
        "X": X,
        "y": y,
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": LassoCV,
        "X": X,
        "y": y,
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": MultiTaskElasticNetCV,
        "X": X,
        "y": y_multi,
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": MultiTaskLassoCV,
        "X": X,
        "y": y_multi,
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": LarsCV,
        "X": X,
        "y": y,
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": LassoLarsCV,
        "X": X,
        "y": y,
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": RANSACRegressor,
        "estimator_name": "estimator",
        "estimator": "regressor",
        "init_args": {"min_samples": 0.5, "max_trials": 10},
        "X": X,
        "y": y,
        "preserves_metadata": "subset",
        "estimator_routing_methods": ["fit", "predict", "score"],
        "method_mapping": {"fit": ["fit", "score"]},
    },
    {
        "metaestimator": IterativeImputer,
        "estimator_name": "estimator",
        "estimator": "regressor",
        "init_args": {"skip_complete": False},
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit"],
    },
    {
        "metaestimator": BaggingClassifier,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y,
        "preserves_metadata": False,
        "estimator_routing_methods": [
            ("fit", ["metadata"]),
            "predict",
            "predict_proba",
            "predict_log_proba",
            "decision_function",
        ],
        "method_mapping": {
            "predict": ["predict", "predict_proba"],
            "predict_proba": ["predict", "predict_proba"],
            "predict_log_proba": ["predict", "predict_proba", "predict_log_proba"],
        },
    },
    {
        "metaestimator": BaggingRegressor,
        "estimator_name": "estimator",
        "estimator": "regressor",
        "X": X,
        "y": y,
        "preserves_metadata": False,
        "estimator_routing_methods": [("fit", ["metadata"]), "predict"],
    },
    {
        "metaestimator": RidgeCV,
        "X": X,
        "y": y,
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit"],
    },
    {
        "metaestimator": RidgeClassifierCV,
        "X": X,
        "y": y,
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit"],
    },
    {
        "metaestimator": RidgeCV,
        "X": X,
        "y": y,
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit"],
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": RidgeClassifierCV,
        "X": X,
        "y": y,
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit"],
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": GraphicalLassoCV,
        "X": X,
        "y": y,
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": TransformedTargetRegressor,
        "estimator": "regressor",
        "estimator_name": "regressor",
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit", "predict"],
    },
    {
        "metaestimator": SelfTrainingClassifier,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y,
        "preserves_metadata": True,
        "estimator_routing_methods": [
            "fit",
            "predict",
            "predict_proba",
            "predict_log_proba",
            "decision_function",
            "score",
        ],
        "method_mapping": {"fit": ["fit", "score"]},
    },
    {
        "metaestimator": SequentialFeatureSelector,
        "estimator_name": "estimator",
        "estimator": "classifier",
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit"],
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit"],
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
    },
    {
        "metaestimator": RFE,
        "estimator": "classifier",
        "estimator_name": "estimator",
        "X": X,
        "y": y,
        "estimator_routing_methods": ["fit", "predict", "score"],
    },
    {
        "metaestimator": RFECV,
        "estimator": "classifier",
        "estimator_name": "estimator",
        "estimator_routing_methods": ["fit"],
        "cv_name": "cv",
        "cv_routing_methods": ["fit"],
        "scorer_name": "scoring",
        "scorer_routing_methods": ["fit", "score"],
        "X": X,
        "y": y,
    },
]
"""List containing all metaestimators to be tested and their settings
|
||||
|
||||
The keys are as follows:
|
||||
|
||||
- metaestimator: The metaestimator to be tested
|
||||
- estimator_name: The name of the argument for the sub-estimator
|
||||
- estimator: The sub-estimator type, either "regressor" or "classifier"
|
||||
- init_args: The arguments to be passed to the metaestimator's constructor
|
||||
- X: X-data to fit and predict
|
||||
- y: y-data to fit
|
||||
- estimator_routing_methods: list of all methods to check for routing metadata
|
||||
to the sub-estimator. Each value is either a str or a tuple:
|
||||
- str: the name of the method, all metadata in this method must be routed to the
|
||||
sub-estimator
|
||||
- tuple: the name of the method, the second element is a list of metadata keys
|
||||
to be passed to the sub-estimator. This is useful if certain metadata such as
|
||||
`sample_weight` are never routed and only consumed, such as in `BaggingClassifier`
|
||||
and `BaggingRegressor`.
|
||||
- preserves_metadata:
|
||||
- True (default): the metaestimator passes the metadata to the
|
||||
sub-estimator without modification. We check that the values recorded by
|
||||
the sub-estimator are identical to what we've passed to the
|
||||
metaestimator.
|
||||
- False: no check is performed regarding values, we only check that a
|
||||
metadata with the expected names/keys are passed.
|
||||
- "subset": we check that the recorded metadata by the sub-estimator is a
|
||||
subset of what is passed to the metaestimator.
|
||||
- scorer_name: The name of the argument for the scorer
|
||||
- scorer_routing_methods: list of all methods to check for routing metadata
|
||||
to the scorer
|
||||
- cv_name: The name of the argument for the CV splitter
|
||||
- cv_routing_methods: list of all methods to check for routing metadata
|
||||
to the splitter
|
||||
- method_args: a dict of dicts, defining extra arguments needed to be passed to
|
||||
methods, such as passing `classes` to `partial_fit`.
|
||||
- method_mapping: a dict of the form `{caller: [callee1, ...]}` which signals
|
||||
which `.set_{method}_request` methods should be called to set request values.
|
||||
If not present, a one-to-one mapping is assumed.
|
||||
"""
|
||||

# IDs used by pytest to get meaningful verbose messages when running the tests
METAESTIMATOR_IDS = [str(row["metaestimator"].__name__) for row in METAESTIMATORS]

UNSUPPORTED_ESTIMATORS = [
    AdaBoostClassifier(),
    AdaBoostRegressor(),
]


def get_init_args(metaestimator_info, sub_estimator_consumes):
    """Get the init args for a metaestimator

    This is a helper function to get the init args for a metaestimator from
    the METAESTIMATORS list. It returns an empty dict if no init args are
    required.

    Parameters
    ----------
    metaestimator_info : dict
        The metaestimator info from METAESTIMATORS

    sub_estimator_consumes : bool
        Whether the sub-estimator consumes metadata or not.

    Returns
    -------
    kwargs : dict
        The init args for the metaestimator.

    (estimator, estimator_registry) : (estimator, registry)
        The sub-estimator and the corresponding registry.

    (scorer, scorer_registry) : (scorer, registry)
        The scorer and the corresponding registry.

    (cv, cv_registry) : (CV splitter, registry)
        The CV splitter and the corresponding registry.
    """
    # Avoid mutating the original init_args dict to keep the test execution
    # thread-safe.
    kwargs = metaestimator_info.get("init_args", {}).copy()
    estimator, estimator_registry = None, None
    scorer, scorer_registry = None, None
    cv, cv_registry = None, None
    if "estimator" in metaestimator_info:
        estimator_name = metaestimator_info["estimator_name"]
        estimator_registry = _Registry()
        sub_estimator_type = metaestimator_info["estimator"]
        if sub_estimator_consumes:
            if sub_estimator_type == "regressor":
                estimator = ConsumingRegressor(estimator_registry)
            elif sub_estimator_type == "classifier":
                estimator = ConsumingClassifier(estimator_registry)
            else:
                raise ValueError("Unpermitted `sub_estimator_type`.")  # pragma: nocover
        else:
            if sub_estimator_type == "regressor":
                estimator = NonConsumingRegressor()
            elif sub_estimator_type == "classifier":
                estimator = NonConsumingClassifier()
            else:
                raise ValueError("Unpermitted `sub_estimator_type`.")  # pragma: nocover
        kwargs[estimator_name] = estimator
    if "scorer_name" in metaestimator_info:
        scorer_name = metaestimator_info["scorer_name"]
        scorer_registry = _Registry()
        scorer = ConsumingScorer(registry=scorer_registry)
        kwargs[scorer_name] = scorer
    if "cv_name" in metaestimator_info:
        cv_name = metaestimator_info["cv_name"]
        cv_registry = _Registry()
        cv = ConsumingSplitter(registry=cv_registry)
        kwargs[cv_name] = cv

    return (
        kwargs,
        (estimator, estimator_registry),
        (scorer, scorer_registry),
        (cv, cv_registry),
    )


def filter_metadata_in_routing_methods(estimator_routing_methods):
    """Process estimator_routing_methods and return a dict.

    Parameters
    ----------
    estimator_routing_methods : list of str or tuple
        The estimator_routing_methods info from METAESTIMATORS.

    Returns
    -------
    routing_methods : dict
        The dictionary is of the form {"method": ["metadata", ...]}.
        It specifies the list of metadata keys for each routing method.
        By default the list includes `sample_weight` and `metadata`.
    """
    res = dict()
    for method_spec in estimator_routing_methods:
        if isinstance(method_spec, str):
            method = method_spec
            metadata = ["sample_weight", "metadata"]
        else:
            method, metadata = method_spec
        res[method] = metadata
    return res
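

# For example (illustrative), BaggingRegressor's spec
# [("fit", ["metadata"]), "predict"] is expanded to:
#
#     filter_metadata_in_routing_methods([("fit", ["metadata"]), "predict"])
#     == {"fit": ["metadata"], "predict": ["sample_weight", "metadata"]}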


def set_requests(obj, *, method_mapping, methods, metadata_name, value=True):
    """Call `set_{method}_request` on a list of methods from the sub-estimator.

    Parameters
    ----------
    obj : BaseEstimator
        The object for which `set_{method}_request` methods are called.

    method_mapping : dict
        The method mapping in the form of `{caller: [callee, ...]}`.
        If a "caller" is not present in the method mapping, a one-to-one mapping is
        assumed.

    methods : list of str
        The list of methods as "caller"s for which the request for the child should
        be set.

    metadata_name : str
        The name of the metadata to be routed, usually either `"metadata"` or
        `"sample_weight"` in our tests.

    value : None, bool, or str
        The request value to be set, by default it's `True`
    """
    for caller in methods:
        for callee in method_mapping.get(caller, [caller]):
            set_request_for_method = getattr(obj, f"set_{callee}_request")
            set_request_for_method(**{metadata_name: value})
            if (
                isinstance(obj, BaseEstimator)
                and is_classifier(obj)
                and callee == "partial_fit"
            ):
                set_request_for_method(classes=True)
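

# For example (illustrative): with RANSACRegressor's mapping
# {"fit": ["fit", "score"]}, the call
#
#     set_requests(
#         est,
#         method_mapping={"fit": ["fit", "score"]},
#         methods=["fit"],
#         metadata_name="sample_weight",
#     )
#
# invokes both est.set_fit_request(sample_weight=True) and
# est.set_score_request(sample_weight=True) on the sub-estimator.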


@pytest.mark.parametrize("estimator", UNSUPPORTED_ESTIMATORS)
@config_context(enable_metadata_routing=True)
def test_unsupported_estimators_get_metadata_routing(estimator):
    """Test that get_metadata_routing is not implemented on meta-estimators for
    which we haven't implemented routing yet."""
    with pytest.raises(NotImplementedError):
        estimator.get_metadata_routing()


@pytest.mark.parametrize("estimator", UNSUPPORTED_ESTIMATORS)
@config_context(enable_metadata_routing=True)
def test_unsupported_estimators_fit_with_metadata(estimator):
    """Test that fit raises NotImplementedError when metadata routing is
    enabled and metadata is passed on meta-estimators for which we haven't
    implemented routing yet."""
    with pytest.raises(NotImplementedError):
        try:
            estimator.fit([[1]], [1], sample_weight=[1])
        except TypeError:
            # not all meta-estimators in the list support sample_weight,
            # and for those we skip this test.
            raise NotImplementedError


@config_context(enable_metadata_routing=True)
def test_registry_copy():
    # test that _Registry is not copied into a new instance.
    a = _Registry()
    b = _Registry()
    assert a is not b
    assert a is copy.copy(a)
    assert a is copy.deepcopy(a)


@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
@config_context(enable_metadata_routing=True)
def test_default_request(metaestimator):
    # Check that by default the request is empty and of the right type
    metaestimator_class = metaestimator["metaestimator"]
    kwargs, *_ = get_init_args(metaestimator, sub_estimator_consumes=True)
    instance = metaestimator_class(**kwargs)
    if "cv_name" in metaestimator:
        # Our GroupCV splitters request groups by default, which we should
        # ignore in this test.
        exclude = {"splitter": ["split"]}
    else:
        exclude = None
    assert_request_is_empty(instance.get_metadata_routing(), exclude=exclude)
    assert isinstance(instance.get_metadata_routing(), MetadataRouter)


@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
@config_context(enable_metadata_routing=True)
def test_error_on_missing_requests_for_sub_estimator(metaestimator):
    # Test that an UnsetMetadataPassedError is raised when the sub-estimator's
    # requests are not set
    if "estimator" not in metaestimator:
        # This test only makes sense for metaestimators which have a
        # sub-estimator, e.g. MyMetaEstimator(estimator=MySubEstimator())
        return

    metaestimator_class = metaestimator["metaestimator"]
    X = metaestimator["X"]
    y = metaestimator["y"]
    routing_methods = filter_metadata_in_routing_methods(
        metaestimator["estimator_routing_methods"]
    )

    for method_name, metadata_keys in routing_methods.items():
        for key in metadata_keys:
            kwargs, (estimator, _), (scorer, _), *_ = get_init_args(
                metaestimator, sub_estimator_consumes=True
            )
            if scorer:
                scorer.set_score_request(**{key: True})
            val = {"sample_weight": sample_weight, "metadata": metadata}[key]
            method_kwargs = {key: val}
            instance = metaestimator_class(**kwargs)
            msg = (
                f"[{key}] are passed but are not explicitly set as requested or not"
                f" requested for {estimator.__class__.__name__}.{method_name}"
            )
            with pytest.raises(UnsetMetadataPassedError, match=re.escape(msg)):
                method = getattr(instance, method_name)
                if "fit" not in method_name:
                    # set request on fit
                    set_requests(
                        estimator,
                        method_mapping=metaestimator.get("method_mapping", {}),
                        methods=["fit"],
                        metadata_name=key,
                    )
                    instance.fit(X, y, **method_kwargs)
                    # Make sure the requests are unset, in case they were set as
                    # a side effect of setting them for fit. For instance, if
                    # the method mapping for fit is `"fit": ["fit", "score"]`,
                    # calling `.score` here would otherwise not raise, because
                    # we would have already set the request value for the child
                    # estimator's `score`.
                    set_requests(
                        estimator,
                        method_mapping=metaestimator.get("method_mapping", {}),
                        methods=["fit"],
                        metadata_name=key,
                        value=None,
                    )
                try:
                    # `fit`, `partial_fit` and `score` accept y; others don't.
                    method(X, y, **method_kwargs)
                except TypeError:
                    method(X, **method_kwargs)


@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
@config_context(enable_metadata_routing=True)
def test_setting_request_on_sub_estimator_removes_error(metaestimator):
    # When the metadata is explicitly requested on the sub-estimator, there
    # should be no errors.
    if "estimator" not in metaestimator:
        # This test only makes sense for metaestimators which have a
        # sub-estimator, e.g. MyMetaEstimator(estimator=MySubEstimator())
        return

    metaestimator_class = metaestimator["metaestimator"]
    X = metaestimator["X"]
    y = metaestimator["y"]
    routing_methods = filter_metadata_in_routing_methods(
        metaestimator["estimator_routing_methods"]
    )
    method_mapping = metaestimator.get("method_mapping", {})
    preserves_metadata = metaestimator.get("preserves_metadata", True)

    for method_name, metadata_keys in routing_methods.items():
        for key in metadata_keys:
            val = {"sample_weight": sample_weight, "metadata": metadata}[key]
            method_kwargs = {key: val}

            kwargs, (estimator, registry), (scorer, _), (cv, _) = get_init_args(
                metaestimator, sub_estimator_consumes=True
            )
            if scorer:
                set_requests(
                    scorer, method_mapping={}, methods=["score"], metadata_name=key
                )
            if cv:
                cv.set_split_request(groups=True, metadata=True)

            # `set_{method}_request({metadata}=True)` on the underlying objects
            set_requests(
                estimator,
                method_mapping=method_mapping,
                methods=[method_name],
                metadata_name=key,
            )

            instance = metaestimator_class(**kwargs)
            method = getattr(instance, method_name)
            extra_method_args = metaestimator.get("method_args", {}).get(
                method_name, {}
            )
            if "fit" not in method_name:
                # fit before calling method
                instance.fit(X, y)
            try:
                # `fit` and `partial_fit` accept y; others don't.
                method(X, y, **method_kwargs, **extra_method_args)
            except TypeError:
                method(X, **method_kwargs, **extra_method_args)

            # sanity check that the registry is not empty, or else the test
            # passes trivially
            assert registry
            split_params = (
                method_kwargs.keys() if preserves_metadata == "subset" else ()
            )
            for estimator in registry:
                check_recorded_metadata(
                    estimator,
                    method=method_name,
                    parent=method_name,
                    split_params=split_params,
                    **method_kwargs,
                )


@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
@config_context(enable_metadata_routing=True)
def test_non_consuming_estimator_works(metaestimator):
    # Test that when a non-consuming estimator is given, the meta-estimator
    # works w/o setting any requests.
    # Regression test for https://github.com/scikit-learn/scikit-learn/issues/28239
    if "estimator" not in metaestimator:
        # This test only makes sense for metaestimators which have a
        # sub-estimator, e.g. MyMetaEstimator(estimator=MySubEstimator())
        return

    def set_request(estimator, method_name):
        # e.g. call set_partial_fit_request(classes=True) on the estimator
        # where `classes` is required
        if is_classifier(estimator) and method_name == "partial_fit":
            estimator.set_partial_fit_request(classes=True)

    metaestimator_class = metaestimator["metaestimator"]
    X = metaestimator["X"]
    y = metaestimator["y"]
    routing_methods = filter_metadata_in_routing_methods(
        metaestimator["estimator_routing_methods"]
    )
    for method_name in routing_methods:
        kwargs, (estimator, _), (_, _), (_, _) = get_init_args(
            metaestimator, sub_estimator_consumes=False
        )
        instance = metaestimator_class(**kwargs)
        set_request(estimator, method_name)
        method = getattr(instance, method_name)
        extra_method_args = metaestimator.get("method_args", {}).get(method_name, {})
        if "fit" not in method_name:
            instance.fit(X, y, **extra_method_args)
        # The following should pass w/o raising a routing error.
        try:
            # `fit` and `partial_fit` accept y; others don't.
            method(X, y, **extra_method_args)
        except TypeError:
            method(X, **extra_method_args)


@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
@config_context(enable_metadata_routing=True)
def test_metadata_is_routed_correctly_to_scorer(metaestimator):
    """Test that any requested metadata is correctly routed to the underlying
    scorers in CV estimators.
    """
    if "scorer_name" not in metaestimator:
        # This test only makes sense for CV estimators
        return

    metaestimator_class = metaestimator["metaestimator"]
    routing_methods = metaestimator["scorer_routing_methods"]
    method_mapping = metaestimator.get("method_mapping", {})

    for method_name in routing_methods:
        kwargs, (estimator, _), (scorer, registry), (cv, _) = get_init_args(
            metaestimator, sub_estimator_consumes=True
        )
        scorer.set_score_request(sample_weight=True)
        if cv:
            cv.set_split_request(groups=True, metadata=True)
        if estimator is not None:
            set_requests(
                estimator,
                method_mapping=method_mapping,
                methods=[method_name],
                metadata_name="sample_weight",
            )
        instance = metaestimator_class(**kwargs)
        method = getattr(instance, method_name)
        method_kwargs = {"sample_weight": sample_weight}
        if "fit" not in method_name:
            instance.fit(X, y)
        method(X, y, **method_kwargs)

        assert registry
        for _scorer in registry:
            check_recorded_metadata(
                obj=_scorer,
                method="score",
                parent=method_name,
                split_params=("sample_weight",),
                **method_kwargs,
            )


@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
@config_context(enable_metadata_routing=True)
def test_metadata_is_routed_correctly_to_splitter(metaestimator):
    """Test that any requested metadata is correctly routed to the underlying
    splitters in CV estimators.
    """
    if "cv_routing_methods" not in metaestimator:
        # This test is only for metaestimators accepting a CV splitter
        return

    metaestimator_class = metaestimator["metaestimator"]
    routing_methods = metaestimator["cv_routing_methods"]
    X_ = metaestimator["X"]
    y_ = metaestimator["y"]

    for method_name in routing_methods:
        kwargs, (estimator, _), (scorer, _), (cv, registry) = get_init_args(
            metaestimator, sub_estimator_consumes=True
        )
        if estimator:
            estimator.set_fit_request(sample_weight=False, metadata=False)
        if scorer:
            scorer.set_score_request(sample_weight=False, metadata=False)
        cv.set_split_request(groups=True, metadata=True)
        instance = metaestimator_class(**kwargs)
        method_kwargs = {"groups": groups, "metadata": metadata}
        method = getattr(instance, method_name)
        method(X_, y_, **method_kwargs)
        assert registry
        for _splitter in registry:
            check_recorded_metadata(
                obj=_splitter, method="split", parent=method_name, **method_kwargs
            )


@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS)
@config_context(enable_metadata_routing=True)
def test_metadata_routed_to_group_splitter(metaestimator):
    """Test that groups are routed correctly if the group splitter of a CV
    estimator is used within cross_validate. Regression test for the issue
    described in PR #29634: check that
    `ValueError: The 'groups' parameter should not be None.` is not raised."""

    if "cv_routing_methods" not in metaestimator:
        # This test is only for metaestimators accepting a CV splitter
        return

    metaestimator_class = metaestimator["metaestimator"]
    X_ = metaestimator["X"]
    y_ = metaestimator["y"]

    kwargs, *_ = get_init_args(metaestimator, sub_estimator_consumes=True)
    # remove `ConsumingSplitter` from kwargs, so the 'cv' param isn't passed twice:
    kwargs.pop("cv", None)
    instance = metaestimator_class(cv=GroupKFold(n_splits=2), **kwargs)
    cross_validate(
        instance,
        X_,
        y_,
        params={"groups": groups},
        cv=GroupKFold(n_splits=2),
        scoring=make_scorer(mean_squared_error, response_method="predict"),
    )
@@ -0,0 +1,143 @@
"""Tests for the minimum dependencies in README.rst and pyproject.toml"""

import os
import re
from collections import defaultdict
from pathlib import Path

import pytest

import sklearn
from sklearn._min_dependencies import dependent_packages
from sklearn.utils.fixes import parse_version

min_depencies_tag_to_packages_without_version = defaultdict(list)
for package, (min_version, extras) in dependent_packages.items():
    for extra in extras.split(", "):
        min_depencies_tag_to_packages_without_version[extra].append(package)

pyproject_section_to_min_dependencies_tag = {
    "build-system.requires": "build",
    "project.dependencies": "install",
}
for tag in min_depencies_tag_to_packages_without_version:
    section = f"project.optional-dependencies.{tag}"
    pyproject_section_to_min_dependencies_tag[section] = tag
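
# For example (illustrative), an "examples" extra in _min_dependencies would
# add "project.optional-dependencies.examples" -> "examples" to this mapping,
# next to the two hard-coded build/install sections.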


def test_min_dependencies_readme():
    # Test that the minimum dependencies in the README.rst file are
    # consistent with the minimum dependencies defined in the file
    # sklearn/_min_dependencies.py

    pattern = re.compile(
        r"\.\. \|"
        r"([A-Za-z-]+)"
        r"MinVersion\| replace::"
        r"( [0-9]+\.[0-9]+(\.[0-9]+)?)"
    )
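    # The pattern matches README substitution lines such as (illustrative
    # example, not tied to a specific release):
    #
    #     .. |NumPyMinVersion| replace:: 1.19.5
    #
    # capturing the package name ("NumPy") in group 1 and the version
    # (" 1.19.5", with its leading space) in group 2.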

    readme_path = Path(sklearn.__file__).parent.parent
    readme_file = readme_path / "README.rst"

    if not os.path.exists(readme_file):
        # Skip the test if the README.rst file is not available.
        # For instance, when installing scikit-learn from wheels
        pytest.skip("The README.rst file is not available.")

    with readme_file.open("r") as f:
        for line in f:
            matched = pattern.match(line)

            if not matched:
                continue

            package, version = matched.group(1), matched.group(2)
            package = package.lower()

            if package in dependent_packages:
                version = parse_version(version)
                min_version = parse_version(dependent_packages[package][0])

                message = (
                    f"{package} has inconsistent minimum versions in README.rst and"
                    f" _min_dependencies.py: {version} != {min_version}"
                )
                assert version == min_version, message


def check_pyproject_section(
    pyproject_section, min_dependencies_tag, skip_version_check_for=None
):
    # tomllib is available in Python 3.11+
    tomllib = pytest.importorskip("tomllib")

    if skip_version_check_for is None:
        skip_version_check_for = []

    expected_packages = min_depencies_tag_to_packages_without_version[
        min_dependencies_tag
    ]

    root_directory = Path(sklearn.__file__).parent.parent
    pyproject_toml_path = root_directory / "pyproject.toml"

    if not pyproject_toml_path.exists():
        # Skip the test if the pyproject.toml file is not available.
        # For instance, when installing scikit-learn from wheels
        pytest.skip("pyproject.toml is not available.")

    with pyproject_toml_path.open("rb") as f:
        pyproject_toml = tomllib.load(f)

    pyproject_section_keys = pyproject_section.split(".")
    info = pyproject_toml
    for key in pyproject_section_keys:
        info = info[key]

    pyproject_build_min_versions = {}
    # Assuming the pyproject.toml build section has entries like "my-package>=2.3.0".
    # Warning: if you try to modify this regex, bear in mind that there can be upper
    # bounds in release branches, e.g. "my-package>=2.3.0,<2.5.0".
    pattern = r"([\w-]+)\s*[>=]=\s*([\d\w.]+)"
    for requirement in info:
        match = re.search(pattern, requirement)
        if match is None:
            raise NotImplementedError(
                f"{requirement} does not match expected regex {pattern!r}. "
                "Only >= and == are supported for version requirements"
            )

        package, version = match.group(1), match.group(2)

        pyproject_build_min_versions[package] = version

    assert sorted(pyproject_build_min_versions) == sorted(expected_packages)

    for package, version in pyproject_build_min_versions.items():
        version = parse_version(version)
        expected_min_version = parse_version(dependent_packages[package][0])
        if package in skip_version_check_for:
            continue

        message = (
            f"{package} has inconsistent minimum versions in pyproject.toml and"
            f" _min_dependencies.py: {version} != {expected_min_version}"
        )
        assert version == expected_min_version, message


@pytest.mark.parametrize(
    "pyproject_section, min_dependencies_tag",
    pyproject_section_to_min_dependencies_tag.items(),
)
def test_min_dependencies_pyproject_toml(pyproject_section, min_dependencies_tag):
    """Check that versions in pyproject.toml are consistent with _min_dependencies."""
    # NumPy is more complex because the build-time (>=1.25) and run-time (>=1.19.5)
    # requirements currently don't match.
    skip_version_check_for = ["numpy"] if min_dependencies_tag == "build" else None
    check_pyproject_section(
        pyproject_section,
        min_dependencies_tag,
        skip_version_check_for=skip_version_check_for,
    )
@@ -0,0 +1,990 @@
from re import escape

import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_allclose

from sklearn import datasets, svm
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import load_breast_cancer
from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
    ElasticNet,
    Lasso,
    LinearRegression,
    LogisticRegression,
    Perceptron,
    Ridge,
    SGDClassifier,
)
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.multiclass import (
    OneVsOneClassifier,
    OneVsRestClassifier,
    OutputCodeClassifier,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import (
    check_array,
    shuffle,
)
from sklearn.utils._mocking import CheckingClassifier
from sklearn.utils._testing import assert_almost_equal, assert_array_equal
from sklearn.utils.fixes import (
    COO_CONTAINERS,
    CSC_CONTAINERS,
    CSR_CONTAINERS,
    DOK_CONTAINERS,
    LIL_CONTAINERS,
)
from sklearn.utils.multiclass import check_classification_targets, type_of_target

iris = datasets.load_iris()
rng = np.random.RandomState(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
n_classes = 3


def test_ovr_exceptions():
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))

    # test predicting without fitting
    with pytest.raises(NotFittedError):
        ovr.predict([])

    # Fail on multioutput data
    msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=msg):
        X = np.array([[1, 0], [0, 1]])
        y = np.array([[1, 2], [3, 1]])
        OneVsRestClassifier(MultinomialNB()).fit(X, y)

    with pytest.raises(ValueError, match=msg):
        X = np.array([[1, 0], [0, 1]])
        y = np.array([[1.5, 2.4], [3.1, 0.8]])
        OneVsRestClassifier(MultinomialNB()).fit(X, y)


def test_check_classification_targets():
    # Test that check_classification_targets raises with the correct type. #5782
    y = np.array([0.0, 1.1, 2.0, 3.0])
    msg = type_of_target(y)
    with pytest.raises(ValueError, match=msg):
        check_classification_targets(y)
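
# Note: `type_of_target(y)` above evaluates to "continuous" for this y, so the
# test asserts that this exact word appears in the ValueError message raised by
# check_classification_targets.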


def test_ovr_ties():
    """Check that tie-breaking matches np.argmax behavior

    Non-regression test for issue #14124
    """

    class Dummy(BaseEstimator):
        def fit(self, X, y):
            return self

        def decision_function(self, X):
            return np.zeros(len(X))

    X = np.array([[0], [0], [0], [0]])
    y = np.array([0, 1, 2, 3])
    clf = OneVsRestClassifier(Dummy()).fit(X, y)
    assert_array_equal(clf.predict(X), np.argmax(clf.decision_function(X), axis=1))


def test_ovr_fit_predict():
    # A classifier which implements decision_function.
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert len(ovr.estimators_) == n_classes

    clf = LinearSVC(random_state=0)
    pred2 = clf.fit(iris.data, iris.target).predict(iris.data)
    assert np.mean(iris.target == pred) == np.mean(iris.target == pred2)

    # A classifier which implements predict_proba.
    ovr = OneVsRestClassifier(MultinomialNB())
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert np.mean(iris.target == pred) > 0.65


def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert len(ovr.estimators_) == len(np.unique(y))
    assert np.mean(y == pred) > 0.65

    # Test when mini-batches don't have all classes
    # with SGDClassifier
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    ovr = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0)
    )
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)
    ovr1 = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0)
    )
    pred1 = ovr1.fit(X, y).predict(X)
    assert np.mean(pred == y) == np.mean(pred1 == y)

    # test that partial_fit only exists if the estimator has it:
    ovr = OneVsRestClassifier(SVC())
    assert not hasattr(ovr, "partial_fit")


def test_ovr_partial_fit_exceptions():
    ovr = OneVsRestClassifier(MultinomialNB())
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    # If a new class that was not in the first call of partial_fit is seen,
    # it should raise ValueError
    y1 = [5] + y[7:-1]
    msg = r"Mini-batch contains \[.+\] while classes must be subset of \[.+\]"
    with pytest.raises(ValueError, match=msg):
        ovr.partial_fit(X=X[7:], y=y1)


def test_ovr_ovo_regressor():
    # test that ovr and ovo work on regressors which don't have
    # a decision_function
    ovr = OneVsRestClassifier(DecisionTreeRegressor())
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert len(ovr.estimators_) == n_classes
    assert_array_equal(np.unique(pred), [0, 1, 2])
    # we are doing something sensible
    assert np.mean(pred == iris.target) > 0.9

    ovr = OneVsOneClassifier(DecisionTreeRegressor())
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert len(ovr.estimators_) == n_classes * (n_classes - 1) / 2
    assert_array_equal(np.unique(pred), [0, 1, 2])
    # we are doing something sensible
    assert np.mean(pred == iris.target) > 0.9
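
# With n_classes = 3, OneVsOneClassifier above fits
# n_classes * (n_classes - 1) / 2 == 3 pairwise estimators, one per class pair,
# while OneVsRestClassifier fits one estimator per class.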


@pytest.mark.parametrize(
    "sparse_container",
    CSR_CONTAINERS + CSC_CONTAINERS + COO_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_ovr_fit_predict_sparse(sparse_container):
    base_clf = MultinomialNB(alpha=1)

    X, Y = datasets.make_multilabel_classification(
        n_samples=100,
        n_features=20,
        n_classes=5,
        n_labels=3,
        length=50,
        allow_unlabeled=True,
        random_state=0,
    )

    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]

    clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse_container(Y_train))
    Y_pred_sprs = clf_sprs.predict(X_test)

    assert clf.multilabel_
    assert sp.issparse(Y_pred_sprs)
    assert_array_equal(Y_pred_sprs.toarray(), Y_pred)

    # Test predict_proba
    Y_proba = clf_sprs.predict_proba(X_test)

    # predict assigns a label if the probability that the
    # sample has the label is greater than 0.5.
    pred = Y_proba > 0.5
    assert_array_equal(pred, Y_pred_sprs.toarray())

    # Test decision_function
    clf = svm.SVC()
    clf_sprs = OneVsRestClassifier(clf).fit(X_train, sparse_container(Y_train))
    dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
    assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
|
||||
|
||||
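# Reviewer note: in the multilabel setting the target Y is a binary indicator
# matrix (dense or sparse), and the OvR prediction rule used above reduces to
# simple per-label thresholding: predict_proba > 0.5 for probabilistic base
# estimators and decision_function > 0 for margin-based ones.

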
def test_ovr_always_present():
    # Test that OvR works with classes that are always present or absent.
    # Note: this tests the case where _ConstantPredictor is utilised
    X = np.ones((10, 2))
    X[:5, :] = 0

    # Build an indicator matrix where two features are always on.
    # As list of lists, it would be: [[int(i >= 5), 2, 3] for i in range(10)]
    y = np.zeros((10, 3))
    y[5:, 0] = 1
    y[:, 1] = 1
    y[:, 2] = 1

    ovr = OneVsRestClassifier(LogisticRegression())
    msg = r"Label .+ is present in all training examples"
    with pytest.warns(UserWarning, match=msg):
        ovr.fit(X, y)
    y_pred = ovr.predict(X)
    assert_array_equal(np.array(y_pred), np.array(y))
    y_pred = ovr.decision_function(X)
    assert np.unique(y_pred[:, -2:]) == 1
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.ones(X.shape[0]))

    # y has a constantly absent label
    y = np.zeros((10, 2))
    y[5:, 0] = 1  # variable label
    ovr = OneVsRestClassifier(LogisticRegression())

    msg = r"Label not 1 is present in all training examples"
    with pytest.warns(UserWarning, match=msg):
        ovr.fit(X, y)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))


def test_ovr_multiclass():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
    y = ["eggs", "spam", "ham", "eggs", "ham"]
    Y = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1], [1, 0, 0]])

    classes = set("ham eggs spam".split())

    for base_clf in (
        MultinomialNB(),
        LinearSVC(random_state=0),
        LinearRegression(),
        Ridge(),
        ElasticNet(),
    ):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        assert set(clf.classes_) == classes
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        assert_array_equal(y_pred, ["eggs"])

        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[0, 0, 4]])[0]
        assert_array_equal(y_pred, [0, 0, 1])


def test_ovr_binary():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
    y = ["eggs", "spam", "spam", "eggs", "spam"]
    Y = np.array([[0, 1, 1, 0, 1]]).T

    classes = set("eggs spam".split())

    def conduct_test(base_clf, test_predict_proba=False):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        assert set(clf.classes_) == classes
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        assert_array_equal(y_pred, ["eggs"])
        if hasattr(base_clf, "decision_function"):
            dec = clf.decision_function(X)
            assert dec.shape == (5,)

        if test_predict_proba:
            X_test = np.array([[0, 0, 4]])
            probabilities = clf.predict_proba(X_test)
            assert 2 == len(probabilities[0])
            assert clf.classes_[np.argmax(probabilities, axis=1)] == clf.predict(X_test)

        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[3, 0, 0]])[0]
        assert y_pred == 1

    for base_clf in (
        LinearSVC(random_state=0),
        LinearRegression(),
        Ridge(),
        ElasticNet(),
    ):
        conduct_test(base_clf)

    for base_clf in (MultinomialNB(), SVC(probability=True), LogisticRegression()):
        conduct_test(base_clf, test_predict_proba=True)


def test_ovr_multilabel():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1], [1, 0, 0]])

    for base_clf in (
        MultinomialNB(),
        LinearSVC(random_state=0),
        LinearRegression(),
        Ridge(),
        ElasticNet(),
        Lasso(alpha=0.5),
    ):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert clf.multilabel_


def test_ovr_fit_predict_svc():
    ovr = OneVsRestClassifier(svm.SVC())
    ovr.fit(iris.data, iris.target)
    assert len(ovr.estimators_) == 3
    assert ovr.score(iris.data, iris.target) > 0.9


def test_ovr_multilabel_dataset():
    base_clf = MultinomialNB(alpha=1)
    for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=2,
            length=50,
            allow_unlabeled=au,
            random_state=0,
        )
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        assert clf.multilabel_
        assert_almost_equal(
            precision_score(Y_test, Y_pred, average="micro"), prec, decimal=2
        )
        assert_almost_equal(
            recall_score(Y_test, Y_pred, average="micro"), recall, decimal=2
        )


def test_ovr_multilabel_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    for au in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=au,
            random_state=0,
        )
        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # Decision function only estimator.
        decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert not hasattr(decision_only, "predict_proba")

        # Estimator with predict_proba disabled, depending on parameters.
        decision_only = OneVsRestClassifier(svm.SVC(probability=False))
        assert not hasattr(decision_only, "predict_proba")
        decision_only.fit(X_train, Y_train)
        assert not hasattr(decision_only, "predict_proba")
        assert hasattr(decision_only, "decision_function")

        # Estimator which can get predict_proba enabled after fitting
        gs = GridSearchCV(
            svm.SVC(probability=False), param_grid={"probability": [True]}
        )
        proba_after_fit = OneVsRestClassifier(gs)
        assert not hasattr(proba_after_fit, "predict_proba")
        proba_after_fit.fit(X_train, Y_train)
        assert hasattr(proba_after_fit, "predict_proba")

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > 0.5
        assert_array_equal(pred, Y_pred)


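# The hasattr() assertions above rely on predict_proba being exposed
# conditionally rather than unconditionally. A minimal sketch of that pattern,
# assuming sklearn.utils.metaestimators.available_if (the wrapper class
# `_DelegatesProba` is ours, for illustration only):
from sklearn.utils.metaestimators import available_if


class _DelegatesProba:
    def __init__(self, estimator):
        self.estimator = estimator

    def _proba_available(self):
        # hasattr() on the wrapper succeeds only while the inner estimator
        # currently exposes predict_proba (e.g. SVC(probability=True)).
        return hasattr(self.estimator, "predict_proba")

    @available_if(_proba_available)
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

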
def test_ovr_single_label_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

    # Decision function only estimator.
    decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
    assert not hasattr(decision_only, "predict_proba")

    Y_pred = clf.predict(X_test)
    Y_proba = clf.predict_proba(X_test)

    assert_almost_equal(Y_proba.sum(axis=1), 1.0)
    # predict assigns the label with the greatest predictive probability.
    pred = Y_proba.argmax(axis=1)
    assert not (pred - Y_pred).any()


def test_ovr_single_label_predict_proba_zero():
    """Check that predict_proba returns all zeros when the base estimator
    never predicts the positive class.
    """

    class NaiveBinaryClassifier(BaseEstimator, ClassifierMixin):
        def fit(self, X, y):
            self.classes_ = np.unique(y)
            return self

        def predict_proba(self, X):
            proba = np.ones((len(X), 2))
            # Probability of being the positive class is always 0
            proba[:, 1] = 0
            return proba

    base_clf = NaiveBinaryClassifier()
    X, y = iris.data, iris.target  # Three-class problem with 150 samples

    clf = OneVsRestClassifier(base_clf).fit(X, y)
    y_proba = clf.predict_proba(X)

    assert_allclose(y_proba, 0.0)


def test_ovr_multilabel_decision_function():
    X, Y = datasets.make_multilabel_classification(
        n_samples=100,
        n_features=20,
        n_classes=5,
        n_labels=3,
        length=50,
        allow_unlabeled=True,
        random_state=0,
    )
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
    assert_array_equal(
        (clf.decision_function(X_test) > 0).astype(int), clf.predict(X_test)
    )


def test_ovr_single_label_decision_function():
    X, Y = datasets.make_classification(n_samples=100, n_features=20, random_state=0)
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
    assert_array_equal(clf.decision_function(X_test).ravel() > 0, clf.predict(X_test))


def test_ovr_gridsearch():
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ovr, {"estimator__C": Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert best_C in Cs


def test_ovr_pipeline():
    # Test with pipeline of length one
    # This test is needed because the multiclass estimators may fail to detect
    # the presence of predict_proba or decision_function.
    clf = Pipeline([("tree", DecisionTreeClassifier())])
    ovr_pipe = OneVsRestClassifier(clf)
    ovr_pipe.fit(iris.data, iris.target)
    ovr = OneVsRestClassifier(DecisionTreeClassifier())
    ovr.fit(iris.data, iris.target)
    assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data))


def test_ovo_exceptions():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        ovo.predict([])


def test_ovo_fit_on_list():
    # Test that OneVsOne fitting works with a list of targets and yields the
    # same output as predict from an array
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    prediction_from_array = ovo.fit(iris.data, iris.target).predict(iris.data)
    iris_data_list = [list(a) for a in iris.data]
    prediction_from_list = ovo.fit(iris_data_list, list(iris.target)).predict(
        iris_data_list
    )
    assert_array_equal(prediction_from_array, prediction_from_list)


def test_ovo_fit_predict():
    # A classifier which implements decision_function.
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    ovo.fit(iris.data, iris.target).predict(iris.data)
    assert len(ovo.estimators_) == n_classes * (n_classes - 1) / 2

    # A classifier which implements predict_proba.
    ovo = OneVsOneClassifier(MultinomialNB())
    ovo.fit(iris.data, iris.target).predict(iris.data)
    assert len(ovo.estimators_) == n_classes * (n_classes - 1) / 2


def test_ovo_partial_fit_predict():
    temp = datasets.load_iris()
    X, y = temp.data, temp.target
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert len(ovo1.estimators_) == n_classes * (n_classes - 1) / 2
    assert np.mean(y == pred1) > 0.65
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches have binary target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:60], y[:60], np.unique(y))
    ovo1.partial_fit(X[60:], y[60:])
    pred1 = ovo1.predict(X)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)

    assert_almost_equal(pred1, pred2)
    assert len(ovo1.estimators_) == len(np.unique(y))
    assert np.mean(y == pred1) > 0.65

    ovo = OneVsOneClassifier(MultinomialNB())
    X = np.random.rand(14, 2)
    y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2]
    ovo.partial_fit(X[:7], y[:7], [0, 1, 2, 3, 4])
    ovo.partial_fit(X[7:], y[7:])
    pred = ovo.predict(X)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)

    # raises error when a mini-batch contains classes outside all_classes
    ovo = OneVsOneClassifier(MultinomialNB())
    error_y = [0, 1, 2, 3, 4, 5, 2]
    message_re = escape(
        "Mini-batch contains {0} while it must be subset of {1}".format(
            np.unique(error_y), np.unique(y)
        )
    )
    with pytest.raises(ValueError, match=message_re):
        ovo.partial_fit(X[:7], error_y, np.unique(y))

    # test that partial_fit only exists if the base estimator has it:
    ovo = OneVsOneClassifier(SVC())
    assert not hasattr(ovo, "partial_fit")


def test_ovo_decision_function():
    n_samples = iris.data.shape[0]

    ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))
    # first binary
    ovo_clf.fit(iris.data, iris.target == 0)
    decisions = ovo_clf.decision_function(iris.data)
    assert decisions.shape == (n_samples,)

    # then multi-class
    ovo_clf.fit(iris.data, iris.target)
    decisions = ovo_clf.decision_function(iris.data)

    assert decisions.shape == (n_samples, n_classes)
    assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))

    # Compute the votes
    votes = np.zeros((n_samples, n_classes))

    k = 0
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            pred = ovo_clf.estimators_[k].predict(iris.data)
            votes[pred == 0, i] += 1
            votes[pred == 1, j] += 1
            k += 1

    # Extract votes and verify
    assert_array_equal(votes, np.round(decisions))

    for class_idx in range(n_classes):
        # For each sample and each class, there are only 3 possible vote
        # levels, because there are only 3 distinct class pairs and thus only
        # 3 distinct binary classifiers.
        # Therefore, sorting predictions based on votes would yield
        # mostly tied predictions:
        assert set(votes[:, class_idx]).issubset(set([0.0, 1.0, 2.0]))

        # The OvO decision function on the other hand is able to resolve
        # most of the ties on this data as it combines both the vote counts
        # and the aggregated confidence levels of the binary classifiers
        # to compute the aggregate decision function. The iris dataset
        # has 150 samples with a couple of duplicates. The OvO decisions
        # can resolve most of the ties:
        assert len(np.unique(decisions[:, class_idx])) > 146


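# Reviewer note on the assertions above (hedged, paraphrasing the behavior the
# test relies on rather than quoting the implementation): the OvO decision
# function is the per-class vote count plus a confidence correction whose
# magnitude stays strictly below 0.5,
#
#     decisions = votes + scaled_sum_of_confidences,
#
# so np.round(decisions) recovers the raw votes while the fractional part
# breaks ties between classes with equal votes.

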
def test_ovo_gridsearch():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ovo, {"estimator__C": Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert best_C in Cs


def test_ovo_ties():
    # Test that ties are broken using the decision function,
    # not defaulting to the smallest label
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4, tol=None))
    ovo_prediction = multi_clf.fit(X, y).predict(X)
    ovo_decision = multi_clf.decision_function(X)

    # Classifiers are in order 0-1, 0-2, 1-2
    # Use decision_function to compute the votes and the normalized
    # sum_of_confidences, which is used to disambiguate when there is a tie in
    # votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # For the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # For the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # For the tie, the prediction is the class with the highest score
    assert ovo_prediction[0] == normalized_confidences[0].argmax()


def test_ovo_ties2():
    # test that ties can not only be won by the first two labels
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y_ref = np.array([2, 0, 1, 2])

    # cycle through labels so that each label wins once
    for i in range(3):
        y = (y_ref + i) % 3
        multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4, tol=None))
        ovo_prediction = multi_clf.fit(X, y).predict(X)
        assert ovo_prediction[0] == (2 + i) % 3


def test_ovo_string_y():
    # Test that the OvO doesn't mess up the encoding of string labels
    X = np.eye(4)
    y = np.array(["a", "b", "c", "d"])

    ovo = OneVsOneClassifier(LinearSVC())
    ovo.fit(X, y)
    assert_array_equal(y, ovo.predict(X))


def test_ovo_one_class():
    # Test error for OvO with one class
    X = np.eye(4)
    y = np.array(["a"] * 4)

    ovo = OneVsOneClassifier(LinearSVC())
    msg = "when only one class"
    with pytest.raises(ValueError, match=msg):
        ovo.fit(X, y)


def test_ovo_float_y():
    # Test that the OvO errors on float targets
    X = iris.data
    y = iris.data[:, 0]

    ovo = OneVsOneClassifier(LinearSVC())
    msg = "Unknown label type"
    with pytest.raises(ValueError, match=msg):
        ovo.fit(X, y)


def test_ecoc_exceptions():
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        ecoc.predict([])


def test_ecoc_fit_predict():
    # A classifier which implements decision_function.
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert len(ecoc.estimators_) == n_classes * 2

    # A classifier which implements predict_proba.
    ecoc = OutputCodeClassifier(MultinomialNB(), code_size=2, random_state=0)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert len(ecoc.estimators_) == n_classes * 2


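# Reviewer note on the counts asserted above: OutputCodeClassifier assigns
# each class a binary code word of length int(n_classes * code_size) and fits
# one binary estimator per code bit, hence 3 * 2 == 6 estimators for iris with
# code_size=2. Prediction returns the class whose code word lies closest to
# the vector of binary outputs.

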
def test_ecoc_gridsearch():
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0)
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ecoc, {"estimator__C": Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert best_C in Cs


def test_ecoc_float_y():
    # Test that the OutputCodeClassifier errors on float targets
    X = iris.data
    y = iris.data[:, 0]

    ecoc = OutputCodeClassifier(LinearSVC())
    msg = "Unknown label type"
    with pytest.raises(ValueError, match=msg):
        ecoc.fit(X, y)


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_ecoc_delegate_sparse_base_estimator(csc_container):
|
||||
# Non-regression test for
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/17218
|
||||
X, y = iris.data, iris.target
|
||||
X_sp = csc_container(X)
|
||||
|
||||
# create an estimator that does not support sparse input
|
||||
base_estimator = CheckingClassifier(
|
||||
check_X=check_array,
|
||||
check_X_params={"ensure_2d": True, "accept_sparse": False},
|
||||
)
|
||||
ecoc = OutputCodeClassifier(base_estimator, random_state=0)
|
||||
|
||||
with pytest.raises(TypeError, match="Sparse data was passed"):
|
||||
ecoc.fit(X_sp, y)
|
||||
|
||||
ecoc.fit(X, y)
|
||||
with pytest.raises(TypeError, match="Sparse data was passed"):
|
||||
ecoc.predict(X_sp)
|
||||
|
||||
# smoke test to check when sparse input should be supported
|
||||
ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
|
||||
ecoc.fit(X_sp, y).predict(X_sp)
|
||||
assert len(ecoc.estimators_) == 4
|
||||
|
||||
|
||||
def test_pairwise_indices():
    clf_precomputed = svm.SVC(kernel="precomputed")
    X, y = iris.data, iris.target

    ovr_false = OneVsOneClassifier(clf_precomputed)
    linear_kernel = np.dot(X, X.T)
    ovr_false.fit(linear_kernel, y)

    n_estimators = len(ovr_false.estimators_)
    precomputed_indices = ovr_false.pairwise_indices_

    for idx in precomputed_indices:
        assert (
            idx.shape[0] * n_estimators / (n_estimators - 1) == linear_kernel.shape[0]
        )


def test_pairwise_n_features_in():
    """Check the n_features_in_ attributes of the meta and base estimators

    When the training data is a regular design matrix, everything is intuitive.
    However, when the training data is a precomputed kernel matrix, the
    multiclass strategy can resample the kernel matrix of the underlying base
    estimator both row-wise and column-wise and this has a non-trivial impact
    on the expected value for the n_features_in_ of both the meta and the base
    estimators.
    """
    X, y = iris.data, iris.target

    # Remove the last sample to make the classes not exactly balanced and make
    # the test more interesting.
    assert y[-1] == 0
    X = X[:-1]
    y = y[:-1]

    # Fitting directly on the design matrix:
    assert X.shape == (149, 4)

    clf_notprecomputed = svm.SVC(kernel="linear").fit(X, y)
    assert clf_notprecomputed.n_features_in_ == 4

    ovr_notprecomputed = OneVsRestClassifier(clf_notprecomputed).fit(X, y)
    assert ovr_notprecomputed.n_features_in_ == 4
    for est in ovr_notprecomputed.estimators_:
        assert est.n_features_in_ == 4

    ovo_notprecomputed = OneVsOneClassifier(clf_notprecomputed).fit(X, y)
    assert ovo_notprecomputed.n_features_in_ == 4
    assert ovo_notprecomputed.n_classes_ == 3
    assert len(ovo_notprecomputed.estimators_) == 3
    for est in ovo_notprecomputed.estimators_:
        assert est.n_features_in_ == 4

    # When working with precomputed kernels we have one "feature" per training
    # sample:
    K = X @ X.T
    assert K.shape == (149, 149)

    clf_precomputed = svm.SVC(kernel="precomputed").fit(K, y)
    assert clf_precomputed.n_features_in_ == 149

    ovr_precomputed = OneVsRestClassifier(clf_precomputed).fit(K, y)
    assert ovr_precomputed.n_features_in_ == 149
    assert ovr_precomputed.n_classes_ == 3
    assert len(ovr_precomputed.estimators_) == 3
    for est in ovr_precomputed.estimators_:
        assert est.n_features_in_ == 149

    # This becomes really interesting with OvO and precomputed kernel together:
    # internally, OvO will drop the samples of the classes not part of the pair
    # of classes under consideration for a given binary classifier. Since we
    # use a precomputed kernel, it will also drop the matching columns of the
    # kernel matrix, and therefore we have fewer "features" as result.
    #
    # Since class 0 has 49 samples, and class 1 and 2 have 50 samples each, a
    # single OvO binary classifier works with a sub-kernel matrix of shape
    # either (99, 99) or (100, 100).
    ovo_precomputed = OneVsOneClassifier(clf_precomputed).fit(K, y)
    assert ovo_precomputed.n_features_in_ == 149
    assert ovo_precomputed.n_classes_ == 3
    assert len(ovo_precomputed.estimators_) == 3
    assert ovo_precomputed.estimators_[0].n_features_in_ == 99  # class 0 vs class 1
    assert ovo_precomputed.estimators_[1].n_features_in_ == 99  # class 0 vs class 2
    assert ovo_precomputed.estimators_[2].n_features_in_ == 100  # class 1 vs class 2


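# A minimal sketch of the row/column sub-sampling described in the comments
# above (added for illustration; `_pair_subkernel` is our name, not sklearn
# API). With a precomputed kernel, restricting training to one class pair must
# slice the kernel matrix along both axes:
def _pair_subkernel(K, pair_indices):
    import numpy as np

    idx = np.asarray(pair_indices)
    # Rows are the retained training samples; the matching columns are the
    # retained "features", giving the (99, 99) / (100, 100) blocks above.
    return K[np.ix_(idx, idx)]

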
@pytest.mark.parametrize(
    "MultiClassClassifier", [OneVsRestClassifier, OneVsOneClassifier]
)
def test_pairwise_tag(MultiClassClassifier):
    clf_precomputed = svm.SVC(kernel="precomputed")
    clf_notprecomputed = svm.SVC()

    ovr_false = MultiClassClassifier(clf_notprecomputed)
    assert not ovr_false.__sklearn_tags__().input_tags.pairwise

    ovr_true = MultiClassClassifier(clf_precomputed)
    assert ovr_true.__sklearn_tags__().input_tags.pairwise


@pytest.mark.parametrize(
    "MultiClassClassifier", [OneVsRestClassifier, OneVsOneClassifier]
)
def test_pairwise_cross_val_score(MultiClassClassifier):
    clf_precomputed = svm.SVC(kernel="precomputed")
    clf_notprecomputed = svm.SVC(kernel="linear")

    X, y = iris.data, iris.target

    multiclass_clf_notprecomputed = MultiClassClassifier(clf_notprecomputed)
    multiclass_clf_precomputed = MultiClassClassifier(clf_precomputed)

    linear_kernel = np.dot(X, X.T)
    score_not_precomputed = cross_val_score(
        multiclass_clf_notprecomputed, X, y, error_score="raise"
    )
    score_precomputed = cross_val_score(
        multiclass_clf_precomputed, linear_kernel, y, error_score="raise"
    )
    assert_array_equal(score_precomputed, score_not_precomputed)


@pytest.mark.parametrize(
    "MultiClassClassifier", [OneVsRestClassifier, OneVsOneClassifier]
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiClassClassifier):
    # smoke test to check that the OvR and OvO classifiers delegate the
    # validation of missing values to the underlying pipeline or classifiers
    rng = np.random.RandomState(42)
    X, y = iris.data, iris.target
    X = np.copy(X)  # Copy to avoid that the original data is modified
    mask = rng.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
    X[mask] = np.nan
    lr = make_pipeline(SimpleImputer(), LogisticRegression(random_state=rng))

    MultiClassClassifier(lr).fit(X, y).score(X, y)


@pytest.mark.parametrize("make_y", [np.ones, np.zeros])
|
||||
def test_constant_int_target(make_y):
|
||||
"""Check that constant y target does not raise.
|
||||
|
||||
Non-regression test for #21869
|
||||
"""
|
||||
X = np.ones((10, 2))
|
||||
y = make_y((10, 1), dtype=np.int32)
|
||||
ovr = OneVsRestClassifier(LogisticRegression())
|
||||
|
||||
ovr.fit(X, y)
|
||||
y_pred = ovr.predict_proba(X)
|
||||
expected = np.zeros((X.shape[0], 2))
|
||||
expected[:, 0] = 1
|
||||
assert_allclose(y_pred, expected)
|
||||
|
||||
|
||||
def test_ovo_consistent_binary_classification():
    """Check that ovo is consistent with binary classifier.

    Non-regression test for #13617.
    """
    X, y = load_breast_cancer(return_X_y=True)

    clf = KNeighborsClassifier(n_neighbors=8, weights="distance")
    ovo = OneVsOneClassifier(clf)

    clf.fit(X, y)
    ovo.fit(X, y)

    assert_array_equal(clf.predict(X), ovo.predict(X))


def test_multiclass_estimator_attribute_error():
    """Check that we raise the proper AttributeError when the final estimator
    does not implement the `partial_fit` method, which is decorated with
    `available_if`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28108
    """
    iris = datasets.load_iris()

    # LogisticRegression does not implement 'partial_fit' and should raise an
    # AttributeError
    clf = OneVsRestClassifier(estimator=LogisticRegression(random_state=42))

    outer_msg = "This 'OneVsRestClassifier' has no attribute 'partial_fit'"
    inner_msg = "'LogisticRegression' object has no attribute 'partial_fit'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        clf.partial_fit(iris.data, iris.target)
    assert isinstance(exec_info.value.__cause__, AttributeError)
    assert inner_msg in str(exec_info.value.__cause__)
@@ -0,0 +1,883 @@
import re

import numpy as np
import pytest
from joblib import cpu_count

from sklearn import datasets
from sklearn.base import ClassifierMixin, clone
from sklearn.datasets import (
    load_linnerud,
    make_classification,
    make_multilabel_classification,
    make_regression,
)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestClassifier,
    StackingRegressor,
)
from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
    Lasso,
    LinearRegression,
    LogisticRegression,
    OrthogonalMatchingPursuit,
    Ridge,
    SGDClassifier,
    SGDRegressor,
)
from sklearn.metrics import jaccard_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import (
    ClassifierChain,
    MultiOutputClassifier,
    MultiOutputRegressor,
    RegressorChain,
)
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from sklearn.utils._testing import (
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import (
    BSR_CONTAINERS,
    COO_CONTAINERS,
    CSC_CONTAINERS,
    CSR_CONTAINERS,
    DOK_CONTAINERS,
    LIL_CONTAINERS,
)


def test_multi_target_regression():
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    references = np.zeros_like(y_test)
    for n in range(3):
        rgr = GradientBoostingRegressor(random_state=0)
        rgr.fit(X_train, y_train[:, n])
        references[:, n] = rgr.predict(X_test)

    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X_train, y_train)
    y_pred = rgr.predict(X_test)

    assert_almost_equal(references, y_pred)


def test_multi_target_regression_partial_fit():
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    references = np.zeros_like(y_test)
    half_index = 25
    for n in range(3):
        sgr = SGDRegressor(random_state=0, max_iter=5)
        sgr.partial_fit(X_train[:half_index], y_train[:half_index, n])
        sgr.partial_fit(X_train[half_index:], y_train[half_index:, n])
        references[:, n] = sgr.predict(X_test)

    sgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))

    sgr.partial_fit(X_train[:half_index], y_train[:half_index])
    sgr.partial_fit(X_train[half_index:], y_train[half_index:])

    y_pred = sgr.predict(X_test)
    assert_almost_equal(references, y_pred)
    assert not hasattr(MultiOutputRegressor(Lasso), "partial_fit")


def test_multi_target_regression_one_target():
    # Test that multi target regression raises on a single-target y
    X, y = datasets.make_regression(n_targets=1, random_state=0)
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    msg = "at least two dimensions"
    with pytest.raises(ValueError, match=msg):
        rgr.fit(X, y)


@pytest.mark.parametrize(
    "sparse_container",
    CSR_CONTAINERS
    + CSC_CONTAINERS
    + COO_CONTAINERS
    + LIL_CONTAINERS
    + DOK_CONTAINERS
    + BSR_CONTAINERS,
)
def test_multi_target_sparse_regression(sparse_container):
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    X_train, y_train = X[:50], y[:50]
    X_test = X[50:]

    rgr = MultiOutputRegressor(Lasso(random_state=0))
    rgr_sparse = MultiOutputRegressor(Lasso(random_state=0))

    rgr.fit(X_train, y_train)
    rgr_sparse.fit(sparse_container(X_train), y_train)

    assert_almost_equal(
        rgr.predict(X_test), rgr_sparse.predict(sparse_container(X_test))
    )


def test_multi_target_sample_weights_api():
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]
    w = [0.8, 0.6]

    rgr = MultiOutputRegressor(OrthogonalMatchingPursuit())
    msg = "does not support sample weights"
    with pytest.raises(ValueError, match=msg):
        rgr.fit(X, y, w)

    # no exception should be raised if the base estimator supports weights
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y, w)


def test_multi_target_sample_weight_partial_fit():
    # weighted regressor
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]
    w = [2.0, 1.0]
    rgr_w = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    rgr_w.partial_fit(X, y, w)

    # weighted with different weights
    w = [2.0, 2.0]
    rgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    rgr.partial_fit(X, y, w)

    assert rgr.predict(X)[0][0] != rgr_w.predict(X)[0][0]


def test_multi_target_sample_weights():
    # weighted regressor
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3.141, 2.718], [2.718, 3.141]]
    w = [2.0, 1.0]
    rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))


# Import the data
iris = datasets.load_iris()
# create multiple targets by randomly shuffling and concatenating y.
X = iris.data
y1 = iris.target
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
y = np.column_stack((y1, y2, y3))
n_samples, n_features = X.shape
n_outputs = y.shape[1]
n_classes = len(np.unique(y1))
classes = list(map(np.unique, (y1, y2, y3)))


# TODO: remove mark once loky bug is fixed:
# https://github.com/joblib/loky/issues/458
@pytest.mark.thread_unsafe
def test_multi_output_classification_partial_fit_parallelism():
    sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    if cpu_count() > 1:
        # parallelism requires this to be the case for a sane implementation
        assert est1 is not est2


# check multioutput has predict_proba
def test_hasattr_multi_output_predict_proba():
    # default SGDClassifier has loss='hinge'
    # which does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    assert not hasattr(multi_target_linear, "predict_proba")

    # case where predict_proba attribute exists
    sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    assert hasattr(multi_target_linear, "predict_proba")


# check predict_proba passes
def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    param = {"loss": ("hinge", "log_loss", "modified_huber")}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0

    grid_clf = GridSearchCV(
        sgd_linear_clf,
        param_grid=param,
        scoring=custom_scorer,
        cv=3,
        error_score="raise",
    )
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)

    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)

    inner2_msg = "probability estimates are not available for loss='hinge'"
    inner1_msg = "'SGDClassifier' has no attribute 'predict_proba'"
    outer_msg = "'MultiOutputClassifier' has no attribute 'predict_proba'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        multi_target_linear.predict_proba(X)

    assert isinstance(exec_info.value.__cause__, AttributeError)
    assert inner1_msg in str(exec_info.value.__cause__)

    assert isinstance(exec_info.value.__cause__.__cause__, AttributeError)
    assert inner2_msg in str(exec_info.value.__cause__.__cause__)


def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict

    sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert (n_samples, n_outputs) == first_predictions.shape

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert (n_samples, n_outputs) == second_predictions.shape

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i]
        )
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])


def test_multi_output_classification_partial_fit_no_first_classes_exception():
    sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    msg = "classes must be passed on the first call to partial_fit."
    with pytest.raises(ValueError, match=msg):
        multi_target_linear.partial_fit(X, y)


def test_multi_output_classification():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, predict_proba and score

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert (n_samples, n_outputs) == predictions.shape

    predict_proba = multi_target_forest.predict_proba(X)

    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert (n_samples, n_classes) == class_probabilities.shape

    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1), predictions)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert list(forest_.predict(X)) == list(predictions[:, i])
        assert_array_equal(list(forest_.predict_proba(X)), list(predict_proba[i]))


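# Reviewer note on the shape checks above: MultiOutputClassifier.predict_proba
# returns a *list* with one (n_samples, n_classes_k) array per output rather
# than a single array, which is why the test stacks the per-output arrays with
# np.dstack before taking the argmax over classes.

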
def test_multiclass_multioutput_estimator():
    # test to check meta of meta estimators
    svc = LinearSVC(random_state=0)
    multi_class_svc = OneVsRestClassifier(svc)
    multi_target_svc = MultiOutputClassifier(multi_class_svc)

    multi_target_svc.fit(X, y)

    predictions = multi_target_svc.predict(X)
    assert (n_samples, n_outputs) == predictions.shape

    # train the classifier with each column and assert that predictions are equal
    for i in range(3):
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert list(multi_class_svc_.predict(X)) == list(predictions[:, i])


def test_multiclass_multioutput_estimator_predict_proba():
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels
    y1 = np.array(["b", "a", "a", "b", "a"]).reshape(5, 1)  # 2 classes
    y2 = np.array(["d", "e", "f", "e", "d"]).reshape(5, 1)  # 3 classes

    Y = np.concatenate([y1, y2], axis=1)

    clf = MultiOutputClassifier(LogisticRegression(random_state=seed))

    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_actual = [
        np.array(
            [
                [0.31525135, 0.68474865],
                [0.81004803, 0.18995197],
                [0.65664086, 0.34335914],
                [0.38584929, 0.61415071],
                [0.83234285, 0.16765715],
            ]
        ),
        np.array(
            [
                [0.65759215, 0.20976588, 0.13264197],
                [0.14996984, 0.82591444, 0.02411571],
                [0.13111876, 0.13294966, 0.73593158],
                [0.24663053, 0.65860244, 0.09476703],
                [0.81458885, 0.1728158, 0.01259535],
            ]
        ),
    ]

    for i in range(len(y_actual)):
        assert_almost_equal(y_result[i], y_actual[i])


def test_multi_output_classification_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3, 2], [2, 3]]
    w = np.asarray([2.0, 1.0])
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf_w = MultiOutputClassifier(forest)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3, 2], [3, 2], [2, 3]]
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf = MultiOutputClassifier(forest)
    clf.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))


def test_multi_output_classification_partial_fit_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    w = np.asarray([2.0, 1.0, 1.0])
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20, tol=None)
    clf_w = MultiOutputClassifier(sgd_linear_clf)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20, tol=None)
    clf = MultiOutputClassifier(sgd_linear_clf)
    clf.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))


def test_multi_output_exceptions():
    # NotFittedError when fit is not done but score, predict and
    # predict_proba are called
    moc = MultiOutputClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        moc.score(X, y)

    # ValueError when number of outputs is different
    # for fit and score
    y_new = np.column_stack((y1, y2))
    moc.fit(X, y)
    with pytest.raises(ValueError):
        moc.score(X, y_new)

    # ValueError when y is continuous
    msg = "Unknown label type"
    with pytest.raises(ValueError, match=msg):
        moc.fit(X, X[:, 1])


@pytest.mark.parametrize("response_method", ["predict_proba", "predict"])
|
||||
def test_multi_output_not_fitted_error(response_method):
|
||||
"""Check that we raise the proper error when the estimator is not fitted"""
|
||||
moc = MultiOutputClassifier(LogisticRegression())
|
||||
with pytest.raises(NotFittedError):
|
||||
getattr(moc, response_method)(X)
|
||||
|
||||
|
||||
def test_multi_output_delegate_predict_proba():
    """Check the behavior for the delegation of predict_proba to the underlying
    estimator"""

    # A base estimator with `predict_proba` should expose the method even before fit
    moc = MultiOutputClassifier(LogisticRegression())
    assert hasattr(moc, "predict_proba")
    moc.fit(X, y)
    assert hasattr(moc, "predict_proba")

    # A base estimator without `predict_proba` should raise an AttributeError
    moc = MultiOutputClassifier(LinearSVC())
    assert not hasattr(moc, "predict_proba")

    outer_msg = "'MultiOutputClassifier' has no attribute 'predict_proba'"
    inner_msg = "'LinearSVC' object has no attribute 'predict_proba'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        moc.predict_proba(X)
    assert isinstance(exec_info.value.__cause__, AttributeError)
    assert inner_msg == str(exec_info.value.__cause__)

    moc.fit(X, y)
    assert not hasattr(moc, "predict_proba")
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        moc.predict_proba(X)
    assert isinstance(exec_info.value.__cause__, AttributeError)
    assert inner_msg == str(exec_info.value.__cause__)


def generate_multilabel_dataset_with_correlations():
    # Generate a multilabel data set from a multiclass dataset by representing
    # the integer number of the original class using a binary encoding.
    X, y = make_classification(
        n_samples=1000, n_features=100, n_classes=16, n_informative=10, random_state=0
    )

    Y_multi = np.array([[int(yyy) for yyy in format(yy, "#06b")[2:]] for yy in y])
    return X, Y_multi


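# Reviewer note: format(yy, "#06b") zero-pads the binary representation to a
# fixed width of 6 including the "0b" prefix, so format(13, "#06b")[2:] ==
# "1101". The 16 original classes therefore map to 4 binary labels that are
# correlated by construction, which the chain tests below exploit.

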
@pytest.mark.parametrize("chain_method", ["predict", "decision_function"])
|
||||
def test_classifier_chain_fit_and_predict_with_linear_svc(chain_method):
|
||||
# Fit classifier chain and verify predict performance using LinearSVC
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
classifier_chain = ClassifierChain(
|
||||
LinearSVC(),
|
||||
chain_method=chain_method,
|
||||
).fit(X, Y)
|
||||
|
||||
Y_pred = classifier_chain.predict(X)
|
||||
assert Y_pred.shape == Y.shape
|
||||
|
||||
Y_decision = classifier_chain.decision_function(X)
|
||||
|
||||
Y_binary = Y_decision >= 0
|
||||
assert_array_equal(Y_binary, Y_pred)
|
||||
assert not hasattr(classifier_chain, "predict_proba")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_classifier_chain_fit_and_predict_with_sparse_data(csr_container):
|
||||
# Fit classifier chain with sparse data
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
X_sparse = csr_container(X)
|
||||
|
||||
classifier_chain = ClassifierChain(LogisticRegression()).fit(X_sparse, Y)
|
||||
Y_pred_sparse = classifier_chain.predict(X_sparse)
|
||||
|
||||
classifier_chain = ClassifierChain(LogisticRegression()).fit(X, Y)
|
||||
Y_pred_dense = classifier_chain.predict(X)
|
||||
|
||||
assert_array_equal(Y_pred_sparse, Y_pred_dense)
|
||||
|
||||
|
||||
def test_classifier_chain_vs_independent_models():
    # Verify that a chain of N binary classifiers can achieve a higher Jaccard
    # similarity score than N independent models
    X, Y = generate_multilabel_dataset_with_correlations()
    X_train = X[:600, :]
    X_test = X[600:, :]
    Y_train = Y[:600, :]
    Y_test = Y[600:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert jaccard_score(Y_test, Y_pred_chain, average="samples") > jaccard_score(
        Y_test, Y_pred_ovr, average="samples"
    )


@pytest.mark.parametrize(
    "chain_method",
    ["predict", "predict_proba", "predict_log_proba", "decision_function"],
)
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
def test_classifier_chain_fit_and_predict(chain_method, response_method):
    # Fit classifier chain and verify predict performance
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LogisticRegression(), chain_method=chain_method)
    chain.fit(X, Y)
    Y_pred = chain.predict(X)
    assert Y_pred.shape == Y.shape
    assert [c.coef_.size for c in chain.estimators_] == list(
        range(X.shape[1], X.shape[1] + Y.shape[1])
    )

    Y_prob = getattr(chain, response_method)(X)
    if response_method == "predict_log_proba":
        Y_prob = np.exp(Y_prob)
    Y_binary = Y_prob >= 0.5
    assert_array_equal(Y_binary, Y_pred)

    assert isinstance(chain, ClassifierMixin)


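# The coef_.size assertion above reflects how a chain grows its input: link k
# is fit on the original features plus the k previous outputs. A hedged sketch
# of that augmentation (`_chain_inputs` is our illustrative helper, not part
# of the sklearn API):
def _chain_inputs(X, previous_outputs):
    import numpy as np

    # Stack prior links' outputs as extra feature columns, which is why the
    # k-th estimator ends up with X.shape[1] + k coefficients.
    cols = [np.asarray(out).reshape(-1, 1) for out in previous_outputs]
    return np.hstack([X] + cols)

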
def test_regressor_chain_fit_and_predict():
    # Fit regressor chain and verify Y and estimator coefficients shape
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = RegressorChain(Ridge())
    chain.fit(X, Y)
    Y_pred = chain.predict(X)
    assert Y_pred.shape == Y.shape
    assert [c.coef_.size for c in chain.estimators_] == list(
        range(X.shape[1], X.shape[1] + Y.shape[1])
    )


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_base_chain_fit_and_predict_with_sparse_data_and_cv(csr_container):
|
||||
# Fit base chain with sparse data cross_val_predict
|
||||
X, Y = generate_multilabel_dataset_with_correlations()
|
||||
X_sparse = csr_container(X)
|
||||
base_chains = [
|
||||
ClassifierChain(LogisticRegression(), cv=3),
|
||||
RegressorChain(Ridge(), cv=3),
|
||||
]
|
||||
for chain in base_chains:
|
||||
chain.fit(X_sparse, Y)
|
||||
Y_pred = chain.predict(X_sparse)
|
||||
assert Y_pred.shape == Y.shape
|
||||
|
||||
|
||||
def test_base_chain_random_order():
    # Fit base chain with random order
    X, Y = generate_multilabel_dataset_with_correlations()
    for chain in [ClassifierChain(LogisticRegression()), RegressorChain(Ridge())]:
        chain_random = clone(chain).set_params(order="random", random_state=42)
        chain_random.fit(X, Y)
        chain_fixed = clone(chain).set_params(order=chain_random.order_)
        chain_fixed.fit(X, Y)
        assert_array_equal(chain_fixed.order_, chain_random.order_)
        assert list(chain_random.order_) != list(range(4))
        assert len(chain_random.order_) == 4
        assert len(set(chain_random.order_)) == 4
        # Randomly ordered chain should behave identically to a fixed order
        # chain with the same order.
        for est1, est2 in zip(chain_random.estimators_, chain_fixed.estimators_):
            assert_array_almost_equal(est1.coef_, est2.coef_)


@pytest.mark.parametrize(
    "chain_type, chain_method",
    [
        ("classifier", "predict"),
        ("classifier", "predict_proba"),
        ("classifier", "predict_log_proba"),
        ("classifier", "decision_function"),
        ("regressor", ""),
    ],
)
def test_base_chain_crossval_fit_and_predict(chain_type, chain_method):
    # Fit chain with cross_val_predict and verify predict
    # performance
    X, Y = generate_multilabel_dataset_with_correlations()

    if chain_type == "classifier":
        chain = ClassifierChain(LogisticRegression(), chain_method=chain_method)
    else:
        chain = RegressorChain(Ridge())
    chain.fit(X, Y)
    chain_cv = clone(chain).set_params(cv=3)
    chain_cv.fit(X, Y)
    Y_pred_cv = chain_cv.predict(X)
    Y_pred = chain.predict(X)

    assert Y_pred_cv.shape == Y_pred.shape
    assert not np.all(Y_pred == Y_pred_cv)
    if isinstance(chain, ClassifierChain):
        assert jaccard_score(Y, Y_pred_cv, average="samples") > 0.4
    else:
        assert mean_squared_error(Y, Y_pred_cv) < 0.25


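# Reviewer note on the inequality asserted above: when cv is set, each link is
# trained on out-of-fold predictions of the earlier links (obtained via
# cross-validation) instead of their in-sample predictions, so chain_cv and
# chain generally disagree on some entries even on the training data.

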
@pytest.mark.parametrize(
    "estimator",
    [
        RandomForestClassifier(n_estimators=2),
        MultiOutputClassifier(RandomForestClassifier(n_estimators=2)),
        ClassifierChain(RandomForestClassifier(n_estimators=2)),
    ],
)
def test_multi_output_classes_(estimator):
    # Tests classes_ attribute of multioutput classifiers
    # RandomForestClassifier supports multioutput out-of-the-box
    estimator = clone(estimator).fit(X, y)
    assert isinstance(estimator.classes_, list)
    assert len(estimator.classes_) == n_outputs
    for estimator_classes, expected_classes in zip(classes, estimator.classes_):
        assert_array_equal(estimator_classes, expected_classes)


class DummyRegressorWithFitParams(DummyRegressor):
    def fit(self, X, y, sample_weight=None, **fit_params):
        self._fit_params = fit_params
        return super().fit(X, y, sample_weight)


class DummyClassifierWithFitParams(DummyClassifier):
    def fit(self, X, y, sample_weight=None, **fit_params):
        self._fit_params = fit_params
        return super().fit(X, y, sample_weight)


@pytest.mark.parametrize(
|
||||
"estimator, dataset",
|
||||
[
|
||||
(
|
||||
MultiOutputClassifier(DummyClassifierWithFitParams(strategy="prior")),
|
||||
datasets.make_multilabel_classification(),
|
||||
),
|
||||
(
|
||||
MultiOutputRegressor(DummyRegressorWithFitParams()),
|
||||
datasets.make_regression(n_targets=3, random_state=0),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_multioutput_estimator_with_fit_params(estimator, dataset):
|
||||
estimator = clone(estimator) # Avoid side effects from shared instances
|
||||
X, y = dataset
|
||||
some_param = np.zeros_like(X)
|
||||
estimator.fit(X, y, some_param=some_param)
|
||||
for dummy_estimator in estimator.estimators_:
|
||||
assert "some_param" in dummy_estimator._fit_params
|
||||
|
||||
|
||||
def test_regressor_chain_w_fit_params():
|
||||
# Make sure fit_params are properly propagated to the sub-estimators
|
||||
rng = np.random.RandomState(0)
|
||||
X, y = datasets.make_regression(n_targets=3, random_state=0)
|
||||
weight = rng.rand(y.shape[0])
|
||||
|
||||
class MySGD(SGDRegressor):
|
||||
def fit(self, X, y, **fit_params):
|
||||
self.sample_weight_ = fit_params["sample_weight"]
|
||||
super().fit(X, y, **fit_params)
|
||||
|
||||
model = RegressorChain(MySGD())
|
||||
|
||||
# Fitting with params
|
||||
fit_param = {"sample_weight": weight}
|
||||
model.fit(X, y, **fit_param)
|
||||
|
||||
for est in model.estimators_:
|
||||
assert est.sample_weight_ is weight
|


@pytest.mark.parametrize(
    "MultiOutputEstimator, Estimator",
    [(MultiOutputClassifier, LogisticRegression), (MultiOutputRegressor, Ridge)],
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiOutputEstimator, Estimator):
    # Smoke test to check that multioutput estimators delegate the validation
    # of missing values to the underlying pipeline, regressor or classifier.
    rng = np.random.RandomState(42)
    X, y = rng.randn(50, 2), rng.binomial(1, 0.5, (50, 3))
    mask = rng.choice([1, 0], X.shape, p=[0.01, 0.99]).astype(bool)
    X[mask] = np.nan

    pipe = make_pipeline(SimpleImputer(), Estimator())
    MultiOutputEstimator(pipe).fit(X, y).score(X, y)


@pytest.mark.parametrize("order_type", [list, np.array, tuple])
def test_classifier_chain_tuple_order(order_type):
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]
    order = order_type([1, 0])

    chain = ClassifierChain(
        RandomForestClassifier(n_estimators=2, random_state=0), order=order
    )

    chain.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    y_test = [[3, 2]]
    assert_array_almost_equal(chain.predict(X_test), y_test)


def test_classifier_chain_tuple_invalid_order():
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]
    order = tuple([1, 2])

    chain = ClassifierChain(RandomForestClassifier(), order=order)

    with pytest.raises(ValueError, match="invalid order"):
        chain.fit(X, y)


def test_classifier_chain_verbose(capsys):
    X, y = make_multilabel_classification(
        n_samples=100, n_features=5, n_classes=3, n_labels=3, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    pattern = (
        r"\[Chain\].*\(1 of 3\) Processing order 0, total=.*\n"
        r"\[Chain\].*\(2 of 3\) Processing order 1, total=.*\n"
        r"\[Chain\].*\(3 of 3\) Processing order 2, total=.*\n$"
    )

    classifier = ClassifierChain(
        DecisionTreeClassifier(),
        order=[0, 1, 2],
        random_state=0,
        verbose=True,
    )
    classifier.fit(X_train, y_train)
    assert re.match(pattern, capsys.readouterr()[0])


def test_regressor_chain_verbose(capsys):
    X, y = make_regression(n_samples=125, n_targets=3, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    pattern = (
        r"\[Chain\].*\(1 of 3\) Processing order 1, total=.*\n"
        r"\[Chain\].*\(2 of 3\) Processing order 0, total=.*\n"
        r"\[Chain\].*\(3 of 3\) Processing order 2, total=.*\n$"
    )
    regressor = RegressorChain(
        LinearRegression(),
        order=[1, 0, 2],
        random_state=0,
        verbose=True,
    )
    regressor.fit(X_train, y_train)
    assert re.match(pattern, capsys.readouterr()[0])


def test_multioutputregressor_ducktypes_fitted_estimator():
    """Test that MultiOutputRegressor checks the fitted estimator for
    predict. Non-regression test for #16549."""
    X, y = load_linnerud(return_X_y=True)
    stacker = StackingRegressor(
        estimators=[("sgd", SGDRegressor(random_state=1))],
        final_estimator=Ridge(),
        cv=2,
    )

    reg = MultiOutputRegressor(estimator=stacker).fit(X, y)

    # Does not raise
    reg.predict(X)


@pytest.mark.parametrize(
    "Cls, method", [(ClassifierChain, "fit"), (MultiOutputClassifier, "partial_fit")]
)
def test_fit_params_no_routing(Cls, method):
    """Check that we raise an error when passing metadata not requested by the
    underlying classifier.
    """
    X, y = make_classification(n_samples=50)
    clf = Cls(SGDClassifier())

    with pytest.raises(ValueError, match="is only supported if"):
        getattr(clf, method)(X, y, test=1)
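

# Hedged sketch of the requested-metadata path, not part of the test suite:
# with metadata routing enabled, a sub-estimator that explicitly requests
# `sample_weight` receives it through the meta-estimator (assumes a
# scikit-learn version where `set_fit_request` and the
# `enable_metadata_routing` config flag exist, >= 1.3).
def _sketch_routed_sample_weight():
    import sklearn

    X, y = make_classification(n_samples=50)
    Y = np.vstack([y, y]).T  # two identical targets, just for illustration
    with sklearn.config_context(enable_metadata_routing=True):
        inner = SGDClassifier().set_fit_request(sample_weight=True)
        clf = MultiOutputClassifier(inner)
        clf.fit(X, Y, sample_weight=np.ones(X.shape[0]))  # routed, no error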


def test_multioutput_regressor_has_partial_fit():
    # Test that an unfitted MultiOutputRegressor handles available_if for
    # partial_fit correctly
    est = MultiOutputRegressor(LinearRegression())
    msg = "This 'MultiOutputRegressor' has no attribute 'partial_fit'"
    with pytest.raises(AttributeError, match=msg):
        getattr(est, "partial_fit")


# TODO(1.9): remove when deprecated `base_estimator` is removed
@pytest.mark.parametrize("Estimator", [ClassifierChain, RegressorChain])
def test_base_estimator_deprecation(Estimator):
    """Check that we warn about the deprecation of `base_estimator`."""
    X = np.array([[1, 2], [3, 4]])
    y = np.array([[1, 0], [0, 1]])

    estimator = LogisticRegression()

    with pytest.warns(FutureWarning):
        Estimator(base_estimator=estimator).fit(X, y)

    with pytest.raises(ValueError):
        Estimator(base_estimator=estimator, estimator=estimator).fit(X, y)
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,404 @@
from importlib import import_module
from inspect import signature
from numbers import Integral, Real

import pytest

from sklearn.utils._param_validation import (
    Interval,
    InvalidParameterError,
    generate_invalid_param_val,
    generate_valid_param,
    make_constraint,
)


def _get_func_info(func_module):
    module_name, func_name = func_module.rsplit(".", 1)
    module = import_module(module_name)
    func = getattr(module, func_name)

    func_sig = signature(func)
    func_params = [
        p.name
        for p in func_sig.parameters.values()
        if p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD)
    ]

    # The parameters `*args` and `**kwargs` are ignored since we cannot
    # generate constraints for them.
    required_params = [
        p.name
        for p in func_sig.parameters.values()
        if p.default is p.empty and p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD)
    ]

    return func, func_name, func_params, required_params


def _check_function_param_validation(
    func, func_name, func_params, required_params, parameter_constraints
):
    """Check that an informative error is raised when the value of a parameter does not
    have an appropriate type or value.
    """
    # generate valid values for the required parameters
    valid_required_params = {}
    for param_name in required_params:
        if parameter_constraints[param_name] == "no_validation":
            valid_required_params[param_name] = 1
        else:
            valid_required_params[param_name] = generate_valid_param(
                make_constraint(parameter_constraints[param_name][0])
            )

    # check that there is a constraint for each parameter
    if func_params:
        validation_params = parameter_constraints.keys()
        unexpected_params = set(validation_params) - set(func_params)
        missing_params = set(func_params) - set(validation_params)
        err_msg = (
            "Mismatch between _parameter_constraints and the parameters of"
            f" {func_name}.\nConsider the unexpected parameters {unexpected_params} and"
            f" expected but missing parameters {missing_params}\n"
        )
        assert set(validation_params) == set(func_params), err_msg

    # this object is guaranteed not to have a valid type for any parameter
    param_with_bad_type = type("BadType", (), {})()

    for param_name in func_params:
        constraints = parameter_constraints[param_name]

        if constraints == "no_validation":
            # This parameter is not validated
            continue

        # Mixing an interval of reals and an interval of integers must be avoided.
        if any(
            isinstance(constraint, Interval) and constraint.type == Integral
            for constraint in constraints
        ) and any(
            isinstance(constraint, Interval) and constraint.type == Real
            for constraint in constraints
        ):
            raise ValueError(
                f"The constraint for parameter {param_name} of {func_name} can't have a"
                " mix of intervals of Integral and Real types. Use the type"
                " RealNotInt instead of Real."
            )

        match = (
            rf"The '{param_name}' parameter of {func_name} must be .* Got .* instead."
        )

        err_msg = (
            f"{func_name} does not raise an informative error message when the "
            f"parameter {param_name} does not have a valid type. If any Python type "
            "is valid, the constraint should be 'no_validation'."
        )

        # First, check that the error is raised if param doesn't match any valid type.
        with pytest.raises(InvalidParameterError, match=match):
            func(**{**valid_required_params, param_name: param_with_bad_type})
            pytest.fail(err_msg)

        # Then, for constraints that are more than a type constraint, check that the
        # error is raised if param does match a valid type but does not match any valid
        # value for this type.
        constraints = [make_constraint(constraint) for constraint in constraints]

        for constraint in constraints:
            try:
                bad_value = generate_invalid_param_val(constraint)
            except NotImplementedError:
                continue

            err_msg = (
                f"{func_name} does not raise an informative error message when the "
                f"parameter {param_name} does not have a valid value.\n"
                "Constraints should be disjoint. For instance "
                "[StrOptions({'a_string'}), str] is not an acceptable set of "
                "constraints because generating an invalid string for the first "
                "constraint will always produce a valid string for the second "
                "constraint."
            )

            with pytest.raises(InvalidParameterError, match=match):
                func(**{**valid_required_params, param_name: bad_value})
                pytest.fail(err_msg)
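

# Hedged sketch of the pattern under test, not part of the test suite: a
# function decorated with scikit-learn's `validate_params` (private API,
# subject to change) raises InvalidParameterError with the message matched
# above when a constraint is violated.
def _sketch_validated_function():
    from sklearn.utils._param_validation import validate_params

    @validate_params(
        {"x": [Interval(Integral, 0, None, closed="left")]},
        prefer_skip_nested_validation=True,
    )
    def f(x):
        return x + 1

    f(3)  # ok
    with pytest.raises(InvalidParameterError, match=r"The 'x' parameter of .*f"):
        f(-1)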


PARAM_VALIDATION_FUNCTION_LIST = [
    "sklearn.calibration.calibration_curve",
    "sklearn.cluster.cluster_optics_dbscan",
    "sklearn.cluster.compute_optics_graph",
    "sklearn.cluster.estimate_bandwidth",
    "sklearn.cluster.kmeans_plusplus",
    "sklearn.cluster.cluster_optics_xi",
    "sklearn.cluster.ward_tree",
    "sklearn.covariance.empirical_covariance",
    "sklearn.covariance.ledoit_wolf_shrinkage",
    "sklearn.covariance.log_likelihood",
    "sklearn.covariance.shrunk_covariance",
    "sklearn.datasets.clear_data_home",
    "sklearn.datasets.dump_svmlight_file",
    "sklearn.datasets.fetch_20newsgroups",
    "sklearn.datasets.fetch_20newsgroups_vectorized",
    "sklearn.datasets.fetch_california_housing",
    "sklearn.datasets.fetch_covtype",
    "sklearn.datasets.fetch_kddcup99",
    "sklearn.datasets.fetch_lfw_pairs",
    "sklearn.datasets.fetch_lfw_people",
    "sklearn.datasets.fetch_olivetti_faces",
    "sklearn.datasets.fetch_rcv1",
    "sklearn.datasets.fetch_openml",
    "sklearn.datasets.fetch_species_distributions",
    "sklearn.datasets.get_data_home",
    "sklearn.datasets.load_breast_cancer",
    "sklearn.datasets.load_diabetes",
    "sklearn.datasets.load_digits",
    "sklearn.datasets.load_files",
    "sklearn.datasets.load_iris",
    "sklearn.datasets.load_linnerud",
    "sklearn.datasets.load_sample_image",
    "sklearn.datasets.load_svmlight_file",
    "sklearn.datasets.load_svmlight_files",
    "sklearn.datasets.load_wine",
    "sklearn.datasets.make_biclusters",
    "sklearn.datasets.make_blobs",
    "sklearn.datasets.make_checkerboard",
    "sklearn.datasets.make_circles",
    "sklearn.datasets.make_classification",
    "sklearn.datasets.make_friedman1",
    "sklearn.datasets.make_friedman2",
    "sklearn.datasets.make_friedman3",
    "sklearn.datasets.make_gaussian_quantiles",
    "sklearn.datasets.make_hastie_10_2",
    "sklearn.datasets.make_low_rank_matrix",
    "sklearn.datasets.make_moons",
    "sklearn.datasets.make_multilabel_classification",
    "sklearn.datasets.make_regression",
    "sklearn.datasets.make_s_curve",
    "sklearn.datasets.make_sparse_coded_signal",
    "sklearn.datasets.make_sparse_spd_matrix",
    "sklearn.datasets.make_sparse_uncorrelated",
    "sklearn.datasets.make_spd_matrix",
    "sklearn.datasets.make_swiss_roll",
    "sklearn.decomposition.sparse_encode",
    "sklearn.feature_extraction.grid_to_graph",
    "sklearn.feature_extraction.img_to_graph",
    "sklearn.feature_extraction.image.extract_patches_2d",
    "sklearn.feature_extraction.image.reconstruct_from_patches_2d",
    "sklearn.feature_selection.chi2",
    "sklearn.feature_selection.f_classif",
    "sklearn.feature_selection.f_regression",
    "sklearn.feature_selection.mutual_info_classif",
    "sklearn.feature_selection.mutual_info_regression",
    "sklearn.feature_selection.r_regression",
    "sklearn.inspection.partial_dependence",
    "sklearn.inspection.permutation_importance",
    "sklearn.isotonic.check_increasing",
    "sklearn.isotonic.isotonic_regression",
    "sklearn.linear_model.enet_path",
    "sklearn.linear_model.lars_path",
    "sklearn.linear_model.lars_path_gram",
    "sklearn.linear_model.lasso_path",
    "sklearn.linear_model.orthogonal_mp",
    "sklearn.linear_model.orthogonal_mp_gram",
    "sklearn.linear_model.ridge_regression",
    "sklearn.manifold.locally_linear_embedding",
    "sklearn.manifold.smacof",
    "sklearn.manifold.spectral_embedding",
    "sklearn.manifold.trustworthiness",
    "sklearn.metrics.accuracy_score",
    "sklearn.metrics.auc",
    "sklearn.metrics.average_precision_score",
    "sklearn.metrics.balanced_accuracy_score",
    "sklearn.metrics.brier_score_loss",
    "sklearn.metrics.calinski_harabasz_score",
    "sklearn.metrics.check_scoring",
    "sklearn.metrics.completeness_score",
    "sklearn.metrics.class_likelihood_ratios",
    "sklearn.metrics.classification_report",
    "sklearn.metrics.cluster.adjusted_mutual_info_score",
    "sklearn.metrics.cluster.contingency_matrix",
    "sklearn.metrics.cluster.fowlkes_mallows_score",
    "sklearn.metrics.cluster.homogeneity_completeness_v_measure",
    "sklearn.metrics.cluster.normalized_mutual_info_score",
    "sklearn.metrics.cluster.silhouette_samples",
    "sklearn.metrics.cluster.silhouette_score",
    "sklearn.metrics.cohen_kappa_score",
    "sklearn.metrics.confusion_matrix",
    "sklearn.metrics.confusion_matrix_at_thresholds",
    "sklearn.metrics.consensus_score",
    "sklearn.metrics.coverage_error",
    "sklearn.metrics.d2_absolute_error_score",
    "sklearn.metrics.d2_brier_score",
    "sklearn.metrics.d2_log_loss_score",
    "sklearn.metrics.d2_pinball_score",
    "sklearn.metrics.d2_tweedie_score",
    "sklearn.metrics.davies_bouldin_score",
    "sklearn.metrics.dcg_score",
    "sklearn.metrics.det_curve",
    "sklearn.metrics.explained_variance_score",
    "sklearn.metrics.f1_score",
    "sklearn.metrics.fbeta_score",
    "sklearn.metrics.get_scorer",
    "sklearn.metrics.hamming_loss",
    "sklearn.metrics.hinge_loss",
    "sklearn.metrics.homogeneity_score",
    "sklearn.metrics.jaccard_score",
    "sklearn.metrics.label_ranking_average_precision_score",
    "sklearn.metrics.label_ranking_loss",
    "sklearn.metrics.log_loss",
    "sklearn.metrics.make_scorer",
    "sklearn.metrics.matthews_corrcoef",
    "sklearn.metrics.max_error",
    "sklearn.metrics.mean_absolute_error",
    "sklearn.metrics.mean_absolute_percentage_error",
    "sklearn.metrics.mean_gamma_deviance",
    "sklearn.metrics.mean_pinball_loss",
    "sklearn.metrics.mean_poisson_deviance",
    "sklearn.metrics.mean_squared_error",
    "sklearn.metrics.mean_squared_log_error",
    "sklearn.metrics.mean_tweedie_deviance",
    "sklearn.metrics.median_absolute_error",
    "sklearn.metrics.multilabel_confusion_matrix",
    "sklearn.metrics.mutual_info_score",
    "sklearn.metrics.ndcg_score",
    "sklearn.metrics.pair_confusion_matrix",
    "sklearn.metrics.adjusted_rand_score",
    "sklearn.metrics.pairwise.additive_chi2_kernel",
    "sklearn.metrics.pairwise.chi2_kernel",
    "sklearn.metrics.pairwise.cosine_distances",
    "sklearn.metrics.pairwise.cosine_similarity",
    "sklearn.metrics.pairwise.euclidean_distances",
    "sklearn.metrics.pairwise.haversine_distances",
    "sklearn.metrics.pairwise.laplacian_kernel",
    "sklearn.metrics.pairwise.linear_kernel",
    "sklearn.metrics.pairwise.manhattan_distances",
    "sklearn.metrics.pairwise.nan_euclidean_distances",
    "sklearn.metrics.pairwise.paired_cosine_distances",
    "sklearn.metrics.pairwise.paired_distances",
    "sklearn.metrics.pairwise.paired_euclidean_distances",
    "sklearn.metrics.pairwise.paired_manhattan_distances",
    "sklearn.metrics.pairwise.pairwise_distances_argmin_min",
    "sklearn.metrics.pairwise.pairwise_kernels",
    "sklearn.metrics.pairwise.polynomial_kernel",
    "sklearn.metrics.pairwise.rbf_kernel",
    "sklearn.metrics.pairwise.sigmoid_kernel",
    "sklearn.metrics.pairwise_distances",
    "sklearn.metrics.pairwise_distances_argmin",
    "sklearn.metrics.pairwise_distances_chunked",
    "sklearn.metrics.precision_recall_curve",
    "sklearn.metrics.precision_recall_fscore_support",
    "sklearn.metrics.precision_score",
    "sklearn.metrics.r2_score",
    "sklearn.metrics.rand_score",
    "sklearn.metrics.recall_score",
    "sklearn.metrics.roc_auc_score",
    "sklearn.metrics.roc_curve",
    "sklearn.metrics.root_mean_squared_error",
    "sklearn.metrics.root_mean_squared_log_error",
    "sklearn.metrics.top_k_accuracy_score",
    "sklearn.metrics.v_measure_score",
    "sklearn.metrics.zero_one_loss",
    "sklearn.model_selection.cross_val_predict",
    "sklearn.model_selection.cross_val_score",
    "sklearn.model_selection.cross_validate",
    "sklearn.model_selection.learning_curve",
    "sklearn.model_selection.permutation_test_score",
    "sklearn.model_selection.train_test_split",
    "sklearn.model_selection.validation_curve",
    "sklearn.neighbors.kneighbors_graph",
    "sklearn.neighbors.radius_neighbors_graph",
    "sklearn.neighbors.sort_graph_by_row_values",
    "sklearn.preprocessing.add_dummy_feature",
    "sklearn.preprocessing.binarize",
    "sklearn.preprocessing.label_binarize",
    "sklearn.preprocessing.normalize",
    "sklearn.preprocessing.scale",
    "sklearn.random_projection.johnson_lindenstrauss_min_dim",
    "sklearn.svm.l1_min_c",
    "sklearn.tree.export_graphviz",
    "sklearn.tree.export_text",
    "sklearn.tree.plot_tree",
    "sklearn.utils.gen_batches",
    "sklearn.utils.gen_even_slices",
    "sklearn.utils.resample",
    "sklearn.utils.safe_mask",
    "sklearn.utils.extmath.randomized_svd",
    "sklearn.utils.class_weight.compute_class_weight",
    "sklearn.utils.class_weight.compute_sample_weight",
    "sklearn.utils.graph.single_source_shortest_path_length",
]


@pytest.mark.parametrize("func_module", PARAM_VALIDATION_FUNCTION_LIST)
def test_function_param_validation(func_module):
    """Check param validation for public functions that are not wrappers around
    estimators.
    """
    func, func_name, func_params, required_params = _get_func_info(func_module)

    parameter_constraints = getattr(func, "_skl_parameter_constraints")

    _check_function_param_validation(
        func, func_name, func_params, required_params, parameter_constraints
    )


PARAM_VALIDATION_CLASS_WRAPPER_LIST = [
    ("sklearn.cluster.affinity_propagation", "sklearn.cluster.AffinityPropagation"),
    ("sklearn.cluster.dbscan", "sklearn.cluster.DBSCAN"),
    ("sklearn.cluster.k_means", "sklearn.cluster.KMeans"),
    ("sklearn.cluster.mean_shift", "sklearn.cluster.MeanShift"),
    ("sklearn.cluster.spectral_clustering", "sklearn.cluster.SpectralClustering"),
    ("sklearn.covariance.graphical_lasso", "sklearn.covariance.GraphicalLasso"),
    ("sklearn.covariance.ledoit_wolf", "sklearn.covariance.LedoitWolf"),
    ("sklearn.covariance.oas", "sklearn.covariance.OAS"),
    ("sklearn.decomposition.dict_learning", "sklearn.decomposition.DictionaryLearning"),
    (
        "sklearn.decomposition.dict_learning_online",
        "sklearn.decomposition.MiniBatchDictionaryLearning",
    ),
    ("sklearn.decomposition.fastica", "sklearn.decomposition.FastICA"),
    ("sklearn.decomposition.non_negative_factorization", "sklearn.decomposition.NMF"),
    ("sklearn.preprocessing.maxabs_scale", "sklearn.preprocessing.MaxAbsScaler"),
    ("sklearn.preprocessing.minmax_scale", "sklearn.preprocessing.MinMaxScaler"),
    ("sklearn.preprocessing.power_transform", "sklearn.preprocessing.PowerTransformer"),
    (
        "sklearn.preprocessing.quantile_transform",
        "sklearn.preprocessing.QuantileTransformer",
    ),
    ("sklearn.preprocessing.robust_scale", "sklearn.preprocessing.RobustScaler"),
]


@pytest.mark.parametrize(
    "func_module, class_module", PARAM_VALIDATION_CLASS_WRAPPER_LIST
)
def test_class_wrapper_param_validation(func_module, class_module):
    """Check param validation for public functions that are wrappers around
    estimators.
    """
    func, func_name, func_params, required_params = _get_func_info(func_module)

    module_name, class_name = class_module.rsplit(".", 1)
    module = import_module(module_name)
    klass = getattr(module, class_name)

    parameter_constraints_func = getattr(func, "_skl_parameter_constraints")
    parameter_constraints_class = getattr(klass, "_parameter_constraints")
    parameter_constraints = {
        **parameter_constraints_class,
        **parameter_constraints_func,
    }
    parameter_constraints = {
        k: v for k, v in parameter_constraints.items() if k in func_params
    }

    _check_function_param_validation(
        func, func_name, func_params, required_params, parameter_constraints
    )
@@ -0,0 +1,584 @@
import functools
import warnings
from typing import Any, List

import numpy as np
import pytest
import scipy.sparse as sp

from sklearn.exceptions import DataDimensionalityWarning, NotFittedError
from sklearn.metrics import euclidean_distances
from sklearn.random_projection import (
    GaussianRandomProjection,
    SparseRandomProjection,
    _gaussian_random_matrix,
    _sparse_random_matrix,
    johnson_lindenstrauss_min_dim,
)
from sklearn.utils._testing import (
    assert_allclose,
    assert_allclose_dense_sparse,
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import COO_CONTAINERS

all_sparse_random_matrix: List[Any] = [_sparse_random_matrix]
all_dense_random_matrix: List[Any] = [_gaussian_random_matrix]
all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix

all_SparseRandomProjection: List[Any] = [SparseRandomProjection]
all_DenseRandomProjection: List[Any] = [GaussianRandomProjection]
all_RandomProjection = all_SparseRandomProjection + all_DenseRandomProjection


def make_sparse_random_data(
    coo_container,
    n_samples,
    n_features,
    n_nonzeros,
    random_state=None,
    sparse_format="csr",
):
    """Make some random data with uniformly located non-zero entries with
    Gaussian distributed values; `sparse_format` can be `"csr"` (default) or
    `None` (in which case a dense array is returned).
    """
    rng = np.random.RandomState(random_state)
    data_coo = coo_container(
        (
            rng.randn(n_nonzeros),
            (
                rng.randint(n_samples, size=n_nonzeros),
                rng.randint(n_features, size=n_nonzeros),
            ),
        ),
        shape=(n_samples, n_features),
    )
    if sparse_format is not None:
        return data_coo.asformat(sparse_format)
    else:
        return data_coo.toarray()


def densify(matrix):
    if not sp.issparse(matrix):
        return matrix
    else:
        return matrix.toarray()


n_samples, n_features = (10, 1000)
n_nonzeros = int(n_samples * n_features / 100.0)


###############################################################################
# test on JL lemma
###############################################################################


@pytest.mark.parametrize(
    "n_samples, eps",
    [
        ([100, 110], [0.9, 1.1]),
        ([90, 100], [0.1, 0.0]),
        ([50, -40], [0.1, 0.2]),
    ],
)
def test_invalid_jl_domain(n_samples, eps):
    with pytest.raises(ValueError):
        johnson_lindenstrauss_min_dim(n_samples, eps=eps)


def test_input_size_jl_min_dim():
    with pytest.raises(ValueError):
        johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9])

    johnson_lindenstrauss_min_dim(
        np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)
    )


###############################################################################
# tests random matrix generation
###############################################################################
def check_input_size_random_matrix(random_matrix):
    inputs = [(0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0)]
    for n_components, n_features in inputs:
        with pytest.raises(ValueError):
            random_matrix(n_components, n_features)


def check_size_generated(random_matrix):
    inputs = [(1, 5), (5, 1), (5, 5), (1, 1)]
    for n_components, n_features in inputs:
        assert random_matrix(n_components, n_features).shape == (
            n_components,
            n_features,
        )


def check_zero_mean_and_unit_norm(random_matrix):
    # All random matrices should produce a transformation matrix
    # with zero mean and unit norm for each column.

    A = densify(random_matrix(10000, 1, random_state=0))

    assert_array_almost_equal(0, np.mean(A), 3)
    assert_array_almost_equal(1.0, np.linalg.norm(A), 1)


def check_input_with_sparse_random_matrix(random_matrix):
    n_components, n_features = 5, 10

    for density in [-1.0, 0.0, 1.1]:
        with pytest.raises(ValueError):
            random_matrix(n_components, n_features, density=density)


@pytest.mark.parametrize("random_matrix", all_random_matrix)
def test_basic_property_of_random_matrix(random_matrix):
    # Check basic properties of random matrix generation
    check_input_size_random_matrix(random_matrix)
    check_size_generated(random_matrix)
    check_zero_mean_and_unit_norm(random_matrix)


@pytest.mark.parametrize("random_matrix", all_sparse_random_matrix)
def test_basic_property_of_sparse_random_matrix(random_matrix):
    check_input_with_sparse_random_matrix(random_matrix)

    random_matrix_dense = functools.partial(random_matrix, density=1.0)

    check_zero_mean_and_unit_norm(random_matrix_dense)


def test_gaussian_random_matrix():
    # Check some statistical properties of the Gaussian random matrix:
    # each element a_ij of A should follow the proper distribution,
    # a_ij ~ N(0.0, 1 / n_components).
    n_components = 100
    n_features = 1000
    A = _gaussian_random_matrix(n_components, n_features, random_state=0)

    assert_array_almost_equal(0.0, np.mean(A), 2)
    assert_array_almost_equal(np.var(A, ddof=1), 1 / n_components, 1)


def test_sparse_random_matrix():
    # Check some statistical properties of the sparse random matrix
    n_components = 100
    n_features = 500

    for density in [0.3, 1.0]:
        s = 1 / density

        A = _sparse_random_matrix(
            n_components, n_features, density=density, random_state=0
        )
        A = densify(A)

        # Check possible values
        values = np.unique(A)
        assert np.sqrt(s) / np.sqrt(n_components) in values
        assert -np.sqrt(s) / np.sqrt(n_components) in values

        if density == 1.0:
            assert np.size(values) == 2
        else:
            assert 0.0 in values
            assert np.size(values) == 3

        # Check that the random matrix follows the proper distribution:
        # each element a_ij of A is drawn from
        #
        # - -sqrt(s) / sqrt(n_components) with probability 1 / 2s
        # -  0                            with probability 1 - 1 / s
        # - +sqrt(s) / sqrt(n_components) with probability 1 / 2s
        #
        assert_almost_equal(np.mean(A == 0.0), 1 - 1 / s, decimal=2)
        assert_almost_equal(
            np.mean(A == np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2
        )
        assert_almost_equal(
            np.mean(A == -np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2
        )

        assert_almost_equal(np.var(A == 0.0, ddof=1), (1 - 1 / s) * 1 / s, decimal=2)
        assert_almost_equal(
            np.var(A == np.sqrt(s) / np.sqrt(n_components), ddof=1),
            (1 - 1 / (2 * s)) * 1 / (2 * s),
            decimal=2,
        )
        assert_almost_equal(
            np.var(A == -np.sqrt(s) / np.sqrt(n_components), ddof=1),
            (1 - 1 / (2 * s)) * 1 / (2 * s),
            decimal=2,
        )
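

# Hedged numpy-only sketch, not part of the test suite: the three-point law
# checked above (Achlioptas / Ping Li style sparse projection) drawn directly
# with numpy, to make the expected frequencies concrete.
def _sketch_three_point_distribution():
    rng = np.random.RandomState(0)
    n_components, density = 100, 0.3
    s = 1 / density
    scale = np.sqrt(s) / np.sqrt(n_components)
    A = rng.choice(
        [-scale, 0.0, scale],
        size=(n_components, 500),
        p=[1 / (2 * s), 1 - 1 / s, 1 / (2 * s)],
    )
    # empirical frequency of zeros should be close to 1 - 1/s (0.7 here)
    assert abs(np.mean(A == 0.0) - (1 - 1 / s)) < 0.01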


###############################################################################
# tests on random projection transformer
###############################################################################


def test_random_projection_transformer_invalid_input():
    n_components = "auto"
    fit_data = [[0, 1, 2]]
    for RandomProjection in all_RandomProjection:
        with pytest.raises(ValueError):
            RandomProjection(n_components=n_components).fit(fit_data)


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_try_to_transform_before_fit(coo_container, global_random_seed):
    data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format=None,
    )
    for RandomProjection in all_RandomProjection:
        with pytest.raises(NotFittedError):
            RandomProjection(n_components="auto").transform(data)


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_too_many_samples_to_find_a_safe_embedding(coo_container, global_random_seed):
    data = make_sparse_random_data(
        coo_container,
        n_samples=1000,
        n_features=100,
        n_nonzeros=1000,
        random_state=global_random_seed,
        sparse_format=None,
    )

    for RandomProjection in all_RandomProjection:
        rp = RandomProjection(n_components="auto", eps=0.1)
        expected_msg = (
            "eps=0.100000 and n_samples=1000 lead to a target dimension"
            " of 5920 which is larger than the original space with"
            " n_features=100"
        )
        with pytest.raises(ValueError, match=expected_msg):
            rp.fit(data)


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_random_projection_embedding_quality(coo_container):
    data = make_sparse_random_data(
        coo_container,
        n_samples=8,
        n_features=5000,
        n_nonzeros=15000,
        random_state=0,
        sparse_format=None,
    )
    eps = 0.2

    original_distances = euclidean_distances(data, squared=True)
    original_distances = original_distances.ravel()
    non_identical = original_distances != 0.0

    # remove 0 distances to avoid division by 0
    original_distances = original_distances[non_identical]

    for RandomProjection in all_RandomProjection:
        rp = RandomProjection(n_components="auto", eps=eps, random_state=0)
        projected = rp.fit_transform(data)

        projected_distances = euclidean_distances(projected, squared=True)
        projected_distances = projected_distances.ravel()

        # remove 0 distances to avoid division by 0
        projected_distances = projected_distances[non_identical]

        distances_ratio = projected_distances / original_distances

        # check that the automatically tuned values for the density respect the
        # contract for eps: pairwise distances are preserved according to the
        # Johnson-Lindenstrauss lemma
        assert distances_ratio.max() < 1 + eps
        assert 1 - eps < distances_ratio.min()


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_SparseRandomProj_output_representation(coo_container):
    dense_data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=0,
        sparse_format=None,
    )
    sparse_data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=0,
        sparse_format="csr",
    )
    for SparseRandomProj in all_SparseRandomProjection:
        # when using sparse input, the projected data can be forced to be a
        # dense numpy array
        rp = SparseRandomProj(n_components=10, dense_output=True, random_state=0)
        rp.fit(dense_data)
        assert isinstance(rp.transform(dense_data), np.ndarray)
        assert isinstance(rp.transform(sparse_data), np.ndarray)

        # the output can be left as a sparse matrix instead
        rp = SparseRandomProj(n_components=10, dense_output=False, random_state=0)
        rp = rp.fit(dense_data)
        # output for dense input will stay dense:
        assert isinstance(rp.transform(dense_data), np.ndarray)

        # output for sparse input will be sparse:
        assert sp.issparse(rp.transform(sparse_data))


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_correct_RandomProjection_dimensions_embedding(
    coo_container, global_random_seed
):
    data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format=None,
    )
    for RandomProjection in all_RandomProjection:
        rp = RandomProjection(n_components="auto", random_state=0, eps=0.5).fit(data)

        # the number of components is adjusted from the shape of the training
        # set
        assert rp.n_components == "auto"
        assert rp.n_components_ == 110

        if RandomProjection in all_SparseRandomProjection:
            assert rp.density == "auto"
            assert_almost_equal(rp.density_, 0.03, 2)

        assert rp.components_.shape == (110, n_features)

        projected_1 = rp.transform(data)
        assert projected_1.shape == (n_samples, 110)

        # once the RP is 'fitted' the projection is always the same
        projected_2 = rp.transform(data)
        assert_array_equal(projected_1, projected_2)

        # fit transform with same random seed will lead to the same results
        rp2 = RandomProjection(random_state=0, eps=0.5)
        projected_3 = rp2.fit_transform(data)
        assert_array_equal(projected_1, projected_3)

        # Try to transform with an input X of size different from fitted.
        with pytest.raises(ValueError):
            rp.transform(data[:, 1:5])

        # it is also possible to fix the number of components and the density
        # level
        if RandomProjection in all_SparseRandomProjection:
            rp = RandomProjection(n_components=100, density=0.001, random_state=0)
            projected = rp.fit_transform(data)
            assert projected.shape == (n_samples, 100)
            assert rp.components_.shape == (100, n_features)
            assert rp.components_.nnz < 115  # close to 1% density
            assert 85 < rp.components_.nnz  # close to 1% density


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_warning_n_components_greater_than_n_features(
    coo_container, global_random_seed
):
    n_features = 20
    n_samples = 5
    n_nonzeros = int(n_features / 4)
    data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format=None,
    )

    for RandomProjection in all_RandomProjection:
        with pytest.warns(DataDimensionalityWarning):
            RandomProjection(n_components=n_features + 1).fit(data)


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_works_with_sparse_data(coo_container, global_random_seed):
    n_features = 20
    n_samples = 5
    n_nonzeros = int(n_features / 4)
    dense_data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format=None,
    )
    sparse_data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format="csr",
    )

    for RandomProjection in all_RandomProjection:
        rp_dense = RandomProjection(n_components=3, random_state=1).fit(dense_data)
        rp_sparse = RandomProjection(n_components=3, random_state=1).fit(sparse_data)
        assert_array_almost_equal(
            densify(rp_dense.components_), densify(rp_sparse.components_)
        )


def test_johnson_lindenstrauss_min_dim():
    """Test Johnson-Lindenstrauss for small eps.

    Regression test for #17111: before #19374, 32-bit systems would fail.
    """
    assert johnson_lindenstrauss_min_dim(100, eps=1e-5) == 368416070986
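

# Hedged sketch, not part of the test suite: the value asserted above is the
# standard Johnson-Lindenstrauss lower bound
#     n_components >= 4 * log(n_samples) / (eps**2 / 2 - eps**3 / 3)
# evaluated with 64-bit floats; the 32-bit failure fixed in #19374 came from
# truncating this large intermediate. This assumes the documented formula is
# the one implemented.
def _sketch_jl_bound():
    n_samples, eps = 100, 1e-5
    denominator = eps**2 / 2 - eps**3 / 3
    # should match the assertion above on a 64-bit float computation
    assert int(4 * np.log(n_samples) / denominator) == 368416070986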


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
def test_random_projection_feature_names_out(
    coo_container, random_projection_cls, global_random_seed
):
    data = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros,
        random_state=global_random_seed,
        sparse_format=None,
    )
    random_projection = random_projection_cls(n_components=2)
    random_projection.fit(data)
    names_out = random_projection.get_feature_names_out()
    class_name_lower = random_projection_cls.__name__.lower()
    expected_names_out = np.array(
        [f"{class_name_lower}{i}" for i in range(random_projection.n_components_)],
        dtype=object,
    )

    assert_array_equal(names_out, expected_names_out)


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("n_samples", (2, 9, 10, 11, 1000))
@pytest.mark.parametrize("n_features", (2, 9, 10, 11, 1000))
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
@pytest.mark.parametrize("compute_inverse_components", [True, False])
def test_inverse_transform(
    coo_container,
    n_samples,
    n_features,
    random_projection_cls,
    compute_inverse_components,
    global_random_seed,
):
    n_components = 10

    random_projection = random_projection_cls(
        n_components=n_components,
        compute_inverse_components=compute_inverse_components,
        random_state=global_random_seed,
    )

    X_dense = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros=n_samples * n_features // 100 + 1,
        random_state=global_random_seed,
        sparse_format=None,
    )
    X_csr = make_sparse_random_data(
        coo_container,
        n_samples,
        n_features,
        n_nonzeros=n_samples * n_features // 100 + 1,
        random_state=global_random_seed,
        sparse_format="csr",
    )

    for X in [X_dense, X_csr]:
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message=(
                    "The number of components is higher than the number of features"
                ),
                category=DataDimensionalityWarning,
            )
            projected = random_projection.fit_transform(X)

        if compute_inverse_components:
            assert hasattr(random_projection, "inverse_components_")
            inv_components = random_projection.inverse_components_
            assert inv_components.shape == (n_features, n_components)

        projected_back = random_projection.inverse_transform(projected)
        assert projected_back.shape == X.shape

        projected_again = random_projection.transform(projected_back)
        if hasattr(projected, "toarray"):
            projected = projected.toarray()
        assert_allclose(projected, projected_again, rtol=1e-7, atol=1e-10)
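

# Hedged sketch, not part of the test suite: the round trip above works
# because `inverse_components_` is documented as the pseudo-inverse of the
# fitted `components_`, so transform -> inverse_transform -> transform is a
# projection onto the row space of the components. The equality below should
# hold up to numerical tolerance, assuming the documented definition.
def _sketch_inverse_components_are_pinv():
    rng = np.random.RandomState(0)
    X = rng.randn(20, 100)
    rp = GaussianRandomProjection(
        n_components=10, compute_inverse_components=True, random_state=0
    ).fit(X)
    assert_allclose(
        rp.inverse_components_, np.linalg.pinv(rp.components_), atol=1e-10
    )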


@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
@pytest.mark.parametrize(
    "input_dtype, expected_dtype",
    (
        (np.float32, np.float32),
        (np.float64, np.float64),
        (np.int32, np.float64),
        (np.int64, np.float64),
    ),
)
def test_random_projection_dtype_match(
    random_projection_cls, input_dtype, expected_dtype
):
    # Verify output matrix dtype
    rng = np.random.RandomState(42)
    X = rng.rand(25, 3000)
    rp = random_projection_cls(random_state=0)
    transformed = rp.fit_transform(X.astype(input_dtype))

    assert rp.components_.dtype == expected_dtype
    assert transformed.dtype == expected_dtype


@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
def test_random_projection_numerical_consistency(random_projection_cls):
    # Verify numerical consistency between np.float32 and np.float64
    atol = 1e-5
    rng = np.random.RandomState(42)
    X = rng.rand(25, 3000)
    rp_32 = random_projection_cls(random_state=0)
    rp_64 = random_projection_cls(random_state=0)

    projection_32 = rp_32.fit_transform(X.astype(np.float32))
    projection_64 = rp_64.fit_transform(X.astype(np.float64))

    assert_allclose(projection_64, projection_32, atol=atol)

    assert_allclose_dense_sparse(rp_32.components_, rp_64.components_)