This commit is contained in:
2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,45 @@
"""Ensemble-based methods for classification, regression and anomaly detection."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from sklearn.ensemble._bagging import BaggingClassifier, BaggingRegressor
from sklearn.ensemble._base import BaseEnsemble
from sklearn.ensemble._forest import (
ExtraTreesClassifier,
ExtraTreesRegressor,
RandomForestClassifier,
RandomForestRegressor,
RandomTreesEmbedding,
)
from sklearn.ensemble._gb import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import (
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
)
from sklearn.ensemble._iforest import IsolationForest
from sklearn.ensemble._stacking import StackingClassifier, StackingRegressor
from sklearn.ensemble._voting import VotingClassifier, VotingRegressor
from sklearn.ensemble._weight_boosting import AdaBoostClassifier, AdaBoostRegressor
# Public names re-exported by ``from sklearn.ensemble import *``.
# ``sorted`` keeps the listing alphabetical regardless of edit order.
__all__ = sorted([
    "AdaBoostClassifier",
    "AdaBoostRegressor",
    "BaggingClassifier",
    "BaggingRegressor",
    "BaseEnsemble",
    "ExtraTreesClassifier",
    "ExtraTreesRegressor",
    "GradientBoostingClassifier",
    "GradientBoostingRegressor",
    "HistGradientBoostingClassifier",
    "HistGradientBoostingRegressor",
    "IsolationForest",
    "RandomForestClassifier",
    "RandomForestRegressor",
    "RandomTreesEmbedding",
    "StackingClassifier",
    "StackingRegressor",
    "VotingClassifier",
    "VotingRegressor",
])

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,313 @@
"""Base class for ensemble-based estimators."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from abc import ABCMeta, abstractmethod
import numpy as np
from joblib import effective_n_jobs
from sklearn.base import (
BaseEstimator,
MetaEstimatorMixin,
clone,
is_classifier,
is_regressor,
)
from sklearn.utils import Bunch, check_random_state
from sklearn.utils._tags import get_tags
from sklearn.utils._user_interface import _print_elapsed_time
from sklearn.utils.metadata_routing import _routing_enabled
from sklearn.utils.metaestimators import _BaseComposition
def _fit_single_estimator(
    estimator, X, y, fit_params, message_clsname=None, message=None
):
    """Fit one sub-estimator, optionally printing the elapsed time.

    Private helper used by ensemble estimators to fit a single estimator
    within a (possibly parallel) job.
    """
    # TODO(SLEP6): remove if-condition for unrouted sample_weight when metadata
    # routing can't be disabled.
    if _routing_enabled() or "sample_weight" not in fit_params:
        # Routed (or weight-free) path: forward all fit params untouched.
        with _print_elapsed_time(message_clsname, message):
            estimator.fit(X, y, **fit_params)
        return estimator

    # Legacy path: pass sample_weight explicitly, translating the resulting
    # TypeError into a clearer message when the estimator rejects weights.
    try:
        with _print_elapsed_time(message_clsname, message):
            estimator.fit(X, y, sample_weight=fit_params["sample_weight"])
    except TypeError as exc:
        if "unexpected keyword argument 'sample_weight'" in str(exc):
            raise TypeError(
                "Underlying estimator {} does not support sample weights.".format(
                    estimator.__class__.__name__
                )
            ) from exc
        raise
    return estimator
def _set_random_states(estimator, random_state=None):
    """Set fixed random_state parameters for an estimator.

    Every parameter named ``random_state`` (or ending in
    ``__random_state`` for nested estimators) is assigned an integer
    drawn from ``random_state``.

    Parameters
    ----------
    estimator : estimator supporting get/set_params
        Estimator with potential randomness managed by random_state
        parameters.

    random_state : int, RandomState instance or None, default=None
        Pseudo-random number generator to control the generation of the random
        integers. Pass an int for reproducible output across multiple function
        calls.
        See :term:`Glossary <random_state>`.

    Notes
    -----
    This does not necessarily set *all* ``random_state`` attributes that
    control an estimator's randomness, only those accessible through
    ``estimator.get_params()``.  ``random_state``s not controlled include
    those belonging to:

        * cross-validation splitters
        * ``scipy.stats`` rvs
    """
    rng = check_random_state(random_state)
    max_seed = np.iinfo(np.int32).max
    # Sorted iteration keeps the sequence of randint draws deterministic
    # for a given parameter set.
    seeds = {
        name: rng.randint(max_seed)
        for name in sorted(estimator.get_params(deep=True))
        if name == "random_state" or name.endswith("__random_state")
    }
    if seeds:
        estimator.set_params(**seeds)
class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
    """Base class for all ensemble classes.

    Warning: This class should not be used directly. Use derived classes
    instead.

    Parameters
    ----------
    estimator : object
        The base estimator from which the ensemble is built.

    n_estimators : int, default=10
        The number of estimators in the ensemble.

    estimator_params : list of str, default=tuple()
        The list of attributes to use as parameters when instantiating a
        new base estimator. If none are given, default parameters are used.

    Attributes
    ----------
    estimator_ : estimator
        The base estimator from which the ensemble is grown.

    estimators_ : list of estimators
        The collection of fitted base estimators.
    """

    @abstractmethod
    def __init__(
        self,
        estimator=None,
        *,
        n_estimators=10,
        estimator_params=tuple(),
    ):
        # Store constructor arguments verbatim.  Sub-estimators are NOT
        # instantiated here: their parameters may still change afterwards,
        # e.g. during grid-search with the nested object syntax.  Derived
        # classes are responsible for filling ``self.estimators_`` in fit.
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.estimator_params = estimator_params

    def _validate_estimator(self, default=None):
        """Resolve the base estimator and store it as ``estimator_``.

        Falls back to ``default`` when ``self.estimator`` is None.
        """
        self.estimator_ = self.estimator if self.estimator is not None else default

    def _make_estimator(self, append=True, random_state=None):
        """Make and configure a copy of the `estimator_` attribute.

        Warning: This method should be used to properly instantiate new
        sub-estimators.
        """
        new_estimator = clone(self.estimator_)
        # Propagate the ensemble-level attributes listed in estimator_params
        # onto the freshly cloned sub-estimator.
        params = {name: getattr(self, name) for name in self.estimator_params}
        new_estimator.set_params(**params)

        if random_state is not None:
            _set_random_states(new_estimator, random_state)

        if append:
            self.estimators_.append(new_estimator)

        return new_estimator

    def __len__(self):
        """Return the number of estimators in the ensemble."""
        return len(self.estimators_)

    def __getitem__(self, index):
        """Return the index'th estimator in the ensemble."""
        return self.estimators_[index]

    def __iter__(self):
        """Return iterator over estimators in the ensemble."""
        return iter(self.estimators_)
def _partition_estimators(n_estimators, n_jobs):
    """Private function used to partition estimators between jobs.

    Returns
    -------
    n_jobs : int
        Effective number of jobs, capped at ``n_estimators``.
    counts : list of int
        Number of estimators assigned to each job.
    starts : list of int
        Boundaries such that job ``j`` handles estimators
        ``starts[j]:starts[j + 1]``.
    """
    # Never spawn more jobs than there are estimators to fit.
    n_jobs = min(effective_n_jobs(n_jobs), n_estimators)

    # Spread estimators as evenly as possible: the first
    # ``n_estimators % n_jobs`` jobs each take one extra estimator.
    counts = np.full(n_jobs, n_estimators // n_jobs, dtype=int)
    counts[: n_estimators % n_jobs] += 1

    boundaries = [0] + np.cumsum(counts).tolist()
    return n_jobs, counts.tolist(), boundaries
class _BaseHeterogeneousEnsemble(
    MetaEstimatorMixin, _BaseComposition, metaclass=ABCMeta
):
    """Base class for heterogeneous ensemble of learners.

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        The ensemble of estimators to use in the ensemble. Each element of the
        list is defined as a tuple of string (i.e. name of the estimator) and
        an estimator instance. An estimator can be set to `'drop'` using
        `set_params`.

    Attributes
    ----------
    estimators_ : list of estimators
        The elements of the estimators parameter, having been fitted on the
        training data. If an estimator has been set to `'drop'`, it will not
        appear in `estimators_`.
    """

    @property
    def named_estimators(self):
        """Dictionary to access any fitted sub-estimators by name.

        Returns
        -------
        :class:`~sklearn.utils.Bunch`
        """
        return Bunch(**dict(self.estimators))

    @abstractmethod
    def __init__(self, estimators):
        self.estimators = estimators

    def _validate_estimators(self):
        # The list must be non-empty and each entry must be a
        # (name, estimator) pair whose name is a string.
        well_formed = all(
            isinstance(item, (tuple, list)) and isinstance(item[0], str)
            for item in self.estimators
        )
        if len(self.estimators) == 0 or not well_formed:
            raise ValueError(
                "Invalid 'estimators' attribute, 'estimators' should be a "
                "non-empty list of (string, estimator) tuples."
            )

        names, estimators = zip(*self.estimators)
        # defined by MetaEstimatorMixin
        self._validate_names(names)

        if all(est == "drop" for est in estimators):
            raise ValueError(
                "All estimators are dropped. At least one is required "
                "to be an estimator."
            )

        # Every remaining estimator must match the ensemble's own task type
        # (classifier vs regressor).
        is_estimator_type = is_classifier if is_classifier(self) else is_regressor
        for est in estimators:
            if est == "drop":
                continue
            if not is_estimator_type(est):
                raise ValueError(
                    "The estimator {} should be a {}.".format(
                        est.__class__.__name__, is_estimator_type.__name__[3:]
                    )
                )

        return names, estimators

    def set_params(self, **params):
        """
        Set the parameters of an estimator from the ensemble.

        Valid parameter keys can be listed with `get_params()`. Note that you
        can directly set the parameters of the estimators contained in
        `estimators`.

        Parameters
        ----------
        **params : keyword arguments
            Specific parameters using e.g.
            `set_params(parameter_name=new_value)`. In addition, to setting the
            parameters of the estimator, the individual estimator of the
            estimators can also be set, or can be removed by setting them to
            'drop'.

        Returns
        -------
        self : object
            Estimator instance.
        """
        super()._set_params("estimators", **params)
        return self

    def get_params(self, deep=True):
        """
        Get the parameters of an estimator from the ensemble.

        Returns the parameters given in the constructor as well as the
        estimators contained within the `estimators` parameter.

        Parameters
        ----------
        deep : bool, default=True
            Setting it to True gets the various estimators and the parameters
            of the estimators as well.

        Returns
        -------
        params : dict
            Parameter and estimator names mapped to their values or parameter
            names mapped to their values.
        """
        return super()._get_params("estimators", deep=deep)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        try:
            # Dropped entries impose no constraint; aggregate the tags of
            # the remaining sub-estimators.
            active = [est[1] for est in self.estimators if est[1] != "drop"]
            tags.input_tags.allow_nan = all(
                get_tags(est).input_tags.allow_nan for est in active
            )
            tags.input_tags.sparse = all(
                get_tags(est).input_tags.sparse for est in active
            )
        except Exception:
            # If `estimators` does not comply with our API (list of tuples) then it will
            # fail. In this case, we assume that `allow_nan` and `sparse` are False but
            # the parameter validation will raise an error during `fit`.
            pass  # pragma: no cover
        return tags

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,262 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from libc.stdlib cimport free
from libc.string cimport memset
import numpy as np
from scipy.sparse import issparse
from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t
# Note: _tree uses cimport numpy, cnp.import_array, so we need to include
# numpy headers in the build configuration of this extension
from sklearn.tree._tree cimport Node
from sklearn.tree._tree cimport Tree
from sklearn.tree._utils cimport safe_realloc
# no namespace lookup for numpy dtype and array creation
from numpy import zeros as np_zeros
# constant to mark tree leafs
cdef intp_t TREE_LEAF = -1
cdef void _predict_regression_tree_inplace_fast_dense(
    const float32_t[:, ::1] X,
    Node* root_node,
    double *value,
    double scale,
    Py_ssize_t k,
    float64_t[:, :] out
) noexcept nogil:
    """Predicts output for regression tree and stores it in ``out[i, k]``.

    This function operates directly on the data arrays of the tree
    data structures. This is 5x faster than the variant above because
    it allows us to avoid buffer validation.

    The function assumes that the ndarray that wraps ``X`` is
    c-continuous.

    Parameters
    ----------
    X : float32_t 2d memory view
        The memory view on the data ndarray of the input ``X``.
        Assumes that the array is c-continuous.
    root_node : tree Node pointer
        Pointer to the main node array of the :class:``sklearn.tree.Tree``.
    value : np.float64_t pointer
        The pointer to the data array of the ``value`` array attribute
        of the :class:``sklearn.tree.Tree``.
    scale : double
        A constant to scale the predictions.
    k : int
        The index of the tree output to be predicted. Must satisfy
        0 <= ``k`` < ``K``.
    out : memory view on array of type np.float64_t
        The data array where the predictions are stored.
        ``out`` is assumed to be a two-dimensional array of
        shape ``(n_samples, K)``.
    """
    cdef intp_t n_samples = X.shape[0]
    cdef Py_ssize_t i
    cdef Node *node
    for i in range(n_samples):
        # Walk from the root down to a leaf, choosing the left or right
        # child based on the split threshold of each internal node.
        node = root_node
        # While node not a leaf
        while node.left_child != TREE_LEAF:
            if X[i, node.feature] <= node.threshold:
                node = root_node + node.left_child
            else:
                node = root_node + node.right_child
        # ``node - root_node`` is the leaf's index into the flat value
        # array; accumulate (not overwrite) the scaled leaf value.
        out[i, k] += scale * value[node - root_node]
def _predict_regression_tree_stages_sparse(
    object[:, :] estimators,
    object X,
    double scale,
    float64_t[:, :] out
):
    """Predicts output for regression tree inplace and adds scaled value to ``out[i, k]``.

    The function assumes that the ndarray that wraps ``X`` is csr_matrix.
    """
    cdef const float32_t[::1] X_data = X.data
    cdef const int32_t[::1] X_indices = X.indices
    cdef const int32_t[::1] X_indptr = X.indptr

    cdef intp_t n_samples = X.shape[0]
    cdef intp_t n_features = X.shape[1]
    cdef intp_t n_stages = estimators.shape[0]
    cdef intp_t n_outputs = estimators.shape[1]

    # Indices and temporary variables
    cdef intp_t sample_i
    cdef intp_t feature_i
    cdef intp_t stage_i
    cdef intp_t output_i
    cdef Node *root_node = NULL
    cdef Node *node = NULL
    cdef double *value = NULL

    cdef Tree tree
    # Cache the raw node/value pointers of every tree up front so that the
    # hot per-sample loops below avoid Python attribute access.
    cdef Node** nodes = NULL
    cdef double** values = NULL
    safe_realloc(&nodes, n_stages * n_outputs)
    safe_realloc(&values, n_stages * n_outputs)
    for stage_i in range(n_stages):
        for output_i in range(n_outputs):
            tree = estimators[stage_i, output_i].tree_
            nodes[stage_i * n_outputs + output_i] = tree.nodes
            values[stage_i * n_outputs + output_i] = tree.value

    # Initialize auxiliary data-structure
    cdef float32_t feature_value = 0.
    cdef float32_t* X_sample = NULL

    # feature_to_sample as a data structure records the last seen sample
    # for each feature; functionally, it is an efficient way to identify
    # which features are nonzero in the present sample.
    cdef intp_t* feature_to_sample = NULL

    safe_realloc(&X_sample, n_features)
    safe_realloc(&feature_to_sample, n_features)
    # memset with -1 fills every byte with 0xFF, so each intp_t entry
    # becomes -1, which never matches a valid sample index.
    memset(feature_to_sample, -1, n_features * sizeof(intp_t))

    # Cycle through all samples
    for sample_i in range(n_samples):
        # Scatter this CSR row into X_sample; feature_to_sample marks which
        # entries of X_sample belong to the current row.
        for feature_i in range(X_indptr[sample_i], X_indptr[sample_i + 1]):
            feature_to_sample[X_indices[feature_i]] = sample_i
            X_sample[X_indices[feature_i]] = X_data[feature_i]

        # Cycle through all stages
        for stage_i in range(n_stages):
            # Cycle through all trees
            for output_i in range(n_outputs):
                root_node = nodes[stage_i * n_outputs + output_i]
                value = values[stage_i * n_outputs + output_i]
                node = root_node

                # While node not a leaf
                while node.left_child != TREE_LEAF:
                    # ... and node.right_child != TREE_LEAF:
                    if feature_to_sample[node.feature] == sample_i:
                        feature_value = X_sample[node.feature]
                    else:
                        # Feature absent from this CSR row => implicit zero.
                        feature_value = 0.

                    if feature_value <= node.threshold:
                        node = root_node + node.left_child
                    else:
                        node = root_node + node.right_child
                # Accumulate the scaled leaf value for this (sample, output).
                out[sample_i, output_i] += scale * value[node - root_node]

    # Free auxiliary arrays
    free(X_sample)
    free(feature_to_sample)
    free(nodes)
    free(values)
def predict_stages(
    object[:, :] estimators,
    object X,
    double scale,
    float64_t[:, :] out
):
    """Add predictions of ``estimators`` to ``out``.

    Each estimator is scaled by ``scale`` before its prediction
    is added to ``out``.

    ``estimators`` has shape (n_stages, K); dispatches to the sparse or
    dense fast path depending on the type of ``X``.
    """
    cdef Py_ssize_t i
    cdef Py_ssize_t k
    cdef Py_ssize_t n_estimators = estimators.shape[0]
    cdef Py_ssize_t K = estimators.shape[1]
    cdef Tree tree

    if issparse(X):
        if X.format != 'csr':
            raise ValueError("When X is a sparse matrix, a CSR format is"
                             " expected, got {!r}".format(type(X)))
        _predict_regression_tree_stages_sparse(
            estimators=estimators, X=X, scale=scale, out=out
        )
    else:
        # The dense fast path requires a C-contiguous float array.
        if not isinstance(X, np.ndarray) or np.isfortran(X):
            raise ValueError(f"X should be C-ordered np.ndarray, got {type(X)}")

        for i in range(n_estimators):
            for k in range(K):
                tree = estimators[i, k].tree_

                # avoid buffer validation by casting to ndarray
                # and get data pointer
                # need brackets because of casting operator priority
                _predict_regression_tree_inplace_fast_dense(
                    X=X,
                    root_node=tree.nodes,
                    value=tree.value,
                    scale=scale,
                    k=k,
                    out=out
                )
                # out[:, k] += scale * tree.predict(X).ravel()
def predict_stage(
    object[:, :] estimators,
    int stage,
    object X,
    double scale,
    float64_t[:, :] out
):
    """Add predictions of ``estimators[stage]`` to ``out``.

    Each estimator in the stage is scaled by ``scale`` before
    its prediction is added to ``out``.
    """
    # Slice keeps the 2d (1, K) shape expected by predict_stages.
    single_stage = estimators[stage:stage + 1]
    return predict_stages(estimators=single_stage, X=X, scale=scale, out=out)
def _random_sample_mask(
    intp_t n_total_samples,
    intp_t n_total_in_bag,
    random_state
):
    """Create a random sample mask where ``n_total_in_bag`` elements are set.

    Parameters
    ----------
    n_total_samples : int
        The length of the resulting mask.

    n_total_in_bag : int
        The number of elements in the sample mask which are set to 1.

    random_state : RandomState
        A numpy ``RandomState`` object.

    Returns
    -------
    sample_mask : np.ndarray, shape=[n_total_samples]
        An ndarray where ``n_total_in_bag`` elements are set to ``True``
        the others are ``False``.
    """
    cdef float64_t[::1] rand = random_state.uniform(size=n_total_samples)
    cdef uint8_t[::1] sample_mask = np_zeros((n_total_samples,), dtype=bool)

    cdef intp_t n_bagged = 0
    cdef intp_t i = 0

    # Single-pass selection sampling: at position i the element is kept with
    # probability (remaining draws) / (remaining candidates), rewritten here
    # to avoid a division.  NOTE(review): this looks like Knuth's
    # Algorithm S (selection sampling), which draws exactly
    # ``n_total_in_bag`` items -- confirm against the reference.
    for i in range(n_total_samples):
        if rand[i] * (n_total_samples - i) < (n_total_in_bag - n_bagged):
            sample_mask[i] = 1
            n_bagged += 1

    # ``.base`` recovers the ndarray object backing the typed memoryview.
    return sample_mask.base

View File

@@ -0,0 +1,8 @@
"""This module implements histogram-based gradient boosting estimators.
The implementation is a port from pygbm which is itself strongly inspired
from LightGBM.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

View File

@@ -0,0 +1,85 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from cython.parallel import prange
from libc.math cimport isnan
from sklearn.ensemble._hist_gradient_boosting.common cimport X_DTYPE_C, X_BINNED_DTYPE_C
from sklearn.utils._typedefs cimport uint8_t
def _map_to_bins(const X_DTYPE_C [:, :] data,
                 list binning_thresholds,
                 const uint8_t[::1] is_categorical,
                 const uint8_t missing_values_bin_idx,
                 int n_threads,
                 X_BINNED_DTYPE_C [::1, :] binned):
    """Bin continuous and categorical values to discrete integer-coded levels.

    A given value x is mapped into bin value i iff
    thresholds[i - 1] < x <= thresholds[i]

    Parameters
    ----------
    data : ndarray, shape (n_samples, n_features)
        The data to bin.
    binning_thresholds : list of arrays
        For each feature, stores the increasing numeric values that are
        used to separate the bins.
    is_categorical : ndarray of uint8_t of shape (n_features,)
        Indicates categorical features.
    missing_values_bin_idx : uint8_t
        The bin index assigned to missing values (NaN, and negative values
        for categorical features).
    n_threads : int
        Number of OpenMP threads to use.
    binned : ndarray, shape (n_samples, n_features)
        Output array, must be fortran aligned.
    """
    cdef:
        int feature_idx

    # Process one column at a time; the parallelism over samples happens
    # inside _map_col_to_bins.
    for feature_idx in range(data.shape[1]):
        _map_col_to_bins(
            data[:, feature_idx],
            binning_thresholds[feature_idx],
            is_categorical[feature_idx],
            missing_values_bin_idx,
            n_threads,
            binned[:, feature_idx]
        )
cdef void _map_col_to_bins(
    const X_DTYPE_C [:] data,
    const X_DTYPE_C [:] binning_thresholds,
    const uint8_t is_categorical,
    const uint8_t missing_values_bin_idx,
    int n_threads,
    X_BINNED_DTYPE_C [:] binned
):
    """Binary search to find the bin index for each value in the data.

    Writes the bin index of every entry of ``data`` into ``binned``;
    missing values get ``missing_values_bin_idx``.
    """
    cdef:
        int i
        int left
        int right
        int middle

    # Samples are independent, so they are binned in parallel.
    for i in prange(data.shape[0], schedule='static', nogil=True,
                    num_threads=n_threads):
        if (
            isnan(data[i]) or
            # To follow LightGBM's conventions, negative values for
            # categorical features are considered as missing values.
            (is_categorical and data[i] < 0)
        ):
            binned[i] = missing_values_bin_idx
        else:
            # for known values, use binary search
            left, right = 0, binning_thresholds.shape[0]
            while left < right:
                # equal to (right + left - 1) // 2 but avoids overflow
                middle = left + (right - left - 1) // 2
                if data[i] <= binning_thresholds[middle]:
                    right = middle
                else:
                    left = middle + 1
            # ``left`` is the count of thresholds strictly below data[i],
            # i.e. the bin index.
            binned[i] = left

View File

@@ -0,0 +1,20 @@
from sklearn.ensemble._hist_gradient_boosting.common cimport X_BINNED_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport BITSET_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport BITSET_INNER_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport X_DTYPE_C
from sklearn.utils._typedefs cimport uint8_t
# C-level bitset API; implementations live in the matching .pyx file.

# Zero out every word of a fixed-size bitset.
cdef void init_bitset(BITSET_DTYPE_C bitset) noexcept nogil

# Set the bit corresponding to ``val``.
cdef void set_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) noexcept nogil

# Return 1 if ``val``'s bit is set, 0 otherwise.
cdef uint8_t in_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) noexcept nogil

# Membership test on a 1d memoryview-backed bitset (cpdef: also callable
# from Python).
cpdef uint8_t in_bitset_memoryview(const BITSET_INNER_DTYPE_C[:] bitset,
                                   X_BINNED_DTYPE_C val) noexcept nogil

# Membership test against one row of a 2d bitset array.
cdef uint8_t in_bitset_2d_memoryview(
    const BITSET_INNER_DTYPE_C[:, :] bitset,
    X_BINNED_DTYPE_C val,
    unsigned int row) noexcept nogil

View File

@@ -0,0 +1,65 @@
from sklearn.ensemble._hist_gradient_boosting.common cimport BITSET_INNER_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport BITSET_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport X_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport X_BINNED_DTYPE_C
from sklearn.utils._typedefs cimport uint8_t
# A bitset is a data structure used to represent sets of integers in [0, n]. We
# use them to represent sets of features indices (e.g. features that go to the
# left child, or features that are categorical). For familiarity with bitsets
# and bitwise operations:
# https://en.wikipedia.org/wiki/Bit_array
# https://en.wikipedia.org/wiki/Bitwise_operation
cdef inline void init_bitset(BITSET_DTYPE_C bitset) noexcept nogil:  # OUT
    # Clear all 8 words, i.e. empty the set.
    cdef:
        unsigned int i

    for i in range(8):
        bitset[i] = 0
cdef inline void set_bitset(BITSET_DTYPE_C bitset,  # OUT
                            X_BINNED_DTYPE_C val) noexcept nogil:
    # Word ``val // 32`` holds the bit at offset ``val % 32``.
    bitset[val // 32] |= (1 << (val % 32))
cdef inline uint8_t in_bitset(BITSET_DTYPE_C bitset,
                              X_BINNED_DTYPE_C val) noexcept nogil:
    # Extract the single bit for ``val``: 1 if present, 0 otherwise.
    return (bitset[val // 32] >> (val % 32)) & 1
cpdef inline uint8_t in_bitset_memoryview(const BITSET_INNER_DTYPE_C[:] bitset,
                                          X_BINNED_DTYPE_C val) noexcept nogil:
    # Same membership test as ``in_bitset`` but on a memoryview-backed
    # bitset; cpdef so it is also callable from Python.
    return (bitset[val // 32] >> (val % 32)) & 1
cdef inline uint8_t in_bitset_2d_memoryview(const BITSET_INNER_DTYPE_C[:, :] bitset,
                                            X_BINNED_DTYPE_C val,
                                            unsigned int row) noexcept nogil:

    # Same as above but works on 2d memory views to avoid the creation of 1d
    # memory views. See https://github.com/scikit-learn/scikit-learn/issues/17299
    return (bitset[row, val // 32] >> (val % 32)) & 1
cpdef inline void set_bitset_memoryview(BITSET_INNER_DTYPE_C[:] bitset,  # OUT
                                        X_BINNED_DTYPE_C val):
    # Memoryview counterpart of ``set_bitset``; callable from Python.
    bitset[val // 32] |= (1 << (val % 32))
def set_raw_bitset_from_binned_bitset(BITSET_INNER_DTYPE_C[:] raw_bitset,  # OUT
                                      BITSET_INNER_DTYPE_C[:] binned_bitset,
                                      X_DTYPE_C[:] categories):
    """Set the raw_bitset from the values of the binned bitset

    categories is a mapping from binned category value to raw category value.
    """
    cdef:
        int binned_cat_value
        X_DTYPE_C raw_cat_value

    # For every binned category present in binned_bitset, set the bit of its
    # raw (original) category value in raw_bitset.
    for binned_cat_value, raw_cat_value in enumerate(categories):
        if in_bitset_memoryview(binned_bitset, binned_cat_value):
            set_bitset_memoryview(raw_bitset, <X_BINNED_DTYPE_C>raw_cat_value)

View File

@@ -0,0 +1,59 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from cython.parallel import prange
import numpy as np
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common cimport Y_DTYPE_C
def _update_raw_predictions(
    Y_DTYPE_C [::1] raw_predictions,  # OUT
    grower,
    n_threads,
):
    """Update raw_predictions with the predictions of the newest tree.

    This is equivalent to (and much faster than):
        raw_predictions += last_estimator.predict(X_train)

    It's only possible for data X_train that is used to train the trees (it
    isn't usable for e.g. X_val).
    """
    cdef:
        unsigned int [::1] starts  # start of each leaf in partition
        unsigned int [::1] stops  # end of each leaf in partition
        Y_DTYPE_C [::1] values  # value of each leaf
        const unsigned int [::1] partition = grower.splitter.partition
        list leaves

    leaves = grower.finalized_leaves
    # Flatten the per-leaf metadata into plain arrays so the nogil helper
    # below can run without touching Python objects.
    starts = np.array([leaf.partition_start for leaf in leaves],
                      dtype=np.uint32)
    stops = np.array([leaf.partition_stop for leaf in leaves],
                     dtype=np.uint32)
    values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE)

    _update_raw_predictions_helper(raw_predictions, starts, stops, partition,
                                   values, n_threads)
cdef inline void _update_raw_predictions_helper(
    Y_DTYPE_C [::1] raw_predictions,  # OUT
    const unsigned int [::1] starts,
    const unsigned int [::1] stops,
    const unsigned int [::1] partition,
    const Y_DTYPE_C [::1] values,
    int n_threads,
):
    # Add each leaf's value to the raw prediction of every training sample
    # that ended up in that leaf.  ``partition[starts[j]:stops[j]]`` holds
    # the sample indices of leaf ``j``; leaves are disjoint, so the prange
    # over leaves is race-free.

    cdef:
        unsigned int position
        int leaf_idx
        int n_leaves = starts.shape[0]

    for leaf_idx in prange(n_leaves, schedule='static', nogil=True,
                           num_threads=n_threads):
        for position in range(starts[leaf_idx], stops[leaf_idx]):
            raw_predictions[partition[position]] += values[leaf_idx]

View File

@@ -0,0 +1,256 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from cython.parallel import prange
from libc.math cimport isnan
import numpy as np
from sklearn.utils._typedefs cimport intp_t, uint8_t
from sklearn.ensemble._hist_gradient_boosting.common cimport X_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport Y_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common cimport X_BINNED_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport BITSET_INNER_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport node_struct
from sklearn.ensemble._hist_gradient_boosting._bitset cimport in_bitset_2d_memoryview
def _predict_from_raw_data(  # raw data = non-binned data
        const node_struct [:] nodes,
        const X_DTYPE_C [:, :] numeric_data,
        const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets,
        const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets,
        const unsigned int [::1] f_idx_map,
        int n_threads,
        Y_DTYPE_C [:] out):
    """Fill ``out[i]`` with the tree prediction for each row of raw data.

    Rows are independent, so they are predicted in parallel.
    """

    cdef:
        int i

    for i in prange(numeric_data.shape[0], schedule='static', nogil=True,
                    num_threads=n_threads):
        out[i] = _predict_one_from_raw_data(
            nodes, numeric_data, raw_left_cat_bitsets,
            known_cat_bitsets,
            f_idx_map, i)
cdef inline Y_DTYPE_C _predict_one_from_raw_data(
        const node_struct [:] nodes,
        const X_DTYPE_C [:, :] numeric_data,
        const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets,
        const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets,
        const unsigned int [::1] f_idx_map,
        const int row) noexcept nogil:
    # Traverse the tree from the root for a single sample and return the
    # value of the leaf reached.

    # Need to pass the whole array and the row index, else prange won't work.
    # See issue Cython #2798

    cdef:
        node_struct node = nodes[0]
        unsigned int node_idx = 0
        X_DTYPE_C data_val

    while True:
        if node.is_leaf:
            return node.value

        data_val = numeric_data[row, node.feature_idx]

        if isnan(data_val):
            # Missing value: follow the direction learned during training.
            if node.missing_go_to_left:
                node_idx = node.left
            else:
                node_idx = node.right
        elif node.is_categorical:
            if data_val < 0:
                # data_val is not in the accepted range, so it is treated as missing value
                node_idx = node.left if node.missing_go_to_left else node.right
            elif in_bitset_2d_memoryview(
                    raw_left_cat_bitsets,
                    <X_BINNED_DTYPE_C>data_val,
                    node.bitset_idx):
                # Category belongs to the set that goes left at this split.
                node_idx = node.left
            elif in_bitset_2d_memoryview(
                    known_cat_bitsets,
                    <X_BINNED_DTYPE_C>data_val,
                    f_idx_map[node.feature_idx]):
                # Known category not in the left set: go right.
                node_idx = node.right
            else:
                # Treat unknown categories as missing.
                node_idx = node.left if node.missing_go_to_left else node.right
        else:
            # Numerical split.
            if data_val <= node.num_threshold:
                node_idx = node.left
            else:
                node_idx = node.right

        node = nodes[node_idx]
def _predict_from_binned_data(
        node_struct [:] nodes,
        const X_BINNED_DTYPE_C [:, :] binned_data,
        BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets,
        const uint8_t missing_values_bin_idx,
        int n_threads,
        Y_DTYPE_C [:] out):
    """Fill ``out[i]`` with the tree prediction for each row of binned data.

    Rows are independent, so they are predicted in parallel.
    """

    cdef:
        int i

    for i in prange(binned_data.shape[0], schedule='static', nogil=True,
                    num_threads=n_threads):
        out[i] = _predict_one_from_binned_data(nodes,
                                               binned_data,
                                               binned_left_cat_bitsets, i,
                                               missing_values_bin_idx)
cdef inline Y_DTYPE_C _predict_one_from_binned_data(
        node_struct [:] nodes,
        const X_BINNED_DTYPE_C [:, :] binned_data,
        const BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets,
        const int row,
        const uint8_t missing_values_bin_idx) noexcept nogil:
    # Traverse the tree from the root for a single binned sample and return
    # the value of the leaf reached.

    # Need to pass the whole array and the row index, else prange won't work.
    # See issue Cython #2798

    cdef:
        node_struct node = nodes[0]
        unsigned int node_idx = 0
        X_BINNED_DTYPE_C data_val

    while True:
        if node.is_leaf:
            return node.value

        data_val = binned_data[row, node.feature_idx]

        if data_val == missing_values_bin_idx:
            # Missing value: follow the direction learned during training.
            if node.missing_go_to_left:
                node_idx = node.left
            else:
                node_idx = node.right
        elif node.is_categorical:
            # Categorical split: go left iff the bin is in the left set.
            if in_bitset_2d_memoryview(
                    binned_left_cat_bitsets,
                    data_val,
                    node.bitset_idx):
                node_idx = node.left
            else:
                node_idx = node.right
        else:
            # Numerical split on the bin index.
            if data_val <= node.bin_threshold:
                node_idx = node.left
            else:
                node_idx = node.right

        node = nodes[node_idx]
def _compute_partial_dependence(
    node_struct [:] nodes,
    const X_DTYPE_C [:, ::1] X,
    const intp_t [:] target_features,
    Y_DTYPE_C [:] out
):
    """Partial dependence of the response on the ``target_features`` set.

    For each sample in ``X`` a tree traversal is performed.
    Each traversal starts from the root with weight 1.0.

    At each non-leaf node that splits on a target feature, either
    the left child or the right child is visited based on the feature
    value of the current sample, and the weight is not modified.
    At each non-leaf node that splits on a complementary feature,
    both children are visited and the weight is multiplied by the fraction
    of training samples which went to each child.

    At each leaf, the value of the node is multiplied by the current
    weight (weights sum to 1 for all visited terminal nodes).

    Parameters
    ----------
    nodes : view on array of PREDICTOR_RECORD_DTYPE, shape (n_nodes)
        The array representing the predictor tree.
    X : view on 2d ndarray, shape (n_samples, n_target_features)
        The grid points on which the partial dependence should be
        evaluated.
    target_features : view on 1d ndarray of intp_t, shape (n_target_features)
        The set of target features for which the partial dependence
        should be evaluated.
    out : view on 1d ndarray, shape (n_samples)
        The value of the partial dependence function on each grid
        point.
    """
    cdef:
        unsigned int current_node_idx
        # A node can appear on the stack at most once, so n_nodes is a safe
        # upper bound for both stacks.
        unsigned int [:] node_idx_stack = np.zeros(shape=nodes.shape[0],
                                                   dtype=np.uint32)
        Y_DTYPE_C [::1] weight_stack = np.zeros(shape=nodes.shape[0],
                                                dtype=Y_DTYPE)
        node_struct * current_node  # pointer to avoid copying attributes

        unsigned int sample_idx
        intp_t feature_idx
        unsigned stack_size
        Y_DTYPE_C left_sample_frac
        Y_DTYPE_C current_weight
        Y_DTYPE_C total_weight  # used for sanity check only
        bint is_target_feature

    for sample_idx in range(X.shape[0]):
        # init stacks for current sample
        stack_size = 1
        node_idx_stack[0] = 0  # root node
        weight_stack[0] = 1  # all the samples are in the root node
        total_weight = 0

        while stack_size > 0:
            # pop the stack
            stack_size -= 1
            current_node_idx = node_idx_stack[stack_size]
            current_node = &nodes[current_node_idx]

            if current_node.is_leaf:
                out[sample_idx] += (weight_stack[stack_size] *
                                    current_node.value)
                total_weight += weight_stack[stack_size]
            else:
                # determine if the split feature is a target feature
                is_target_feature = False
                for feature_idx in range(target_features.shape[0]):
                    if target_features[feature_idx] == current_node.feature_idx:
                        is_target_feature = True
                        break

                if is_target_feature:
                    # In this case, we push left or right child on stack.
                    # ``feature_idx`` is the position of the matched target
                    # feature, which is also its column index in the grid X.
                    if X[sample_idx, feature_idx] <= current_node.num_threshold:
                        node_idx_stack[stack_size] = current_node.left
                    else:
                        node_idx_stack[stack_size] = current_node.right
                    stack_size += 1
                else:
                    # In this case, we push both children onto the stack,
                    # and give a weight proportional to the number of
                    # samples going through each branch.

                    # push left child
                    node_idx_stack[stack_size] = current_node.left
                    left_sample_frac = (
                        <Y_DTYPE_C> nodes[current_node.left].count /
                        current_node.count)
                    current_weight = weight_stack[stack_size]
                    weight_stack[stack_size] = current_weight * left_sample_frac
                    stack_size += 1

                    # push right child
                    node_idx_stack[stack_size] = current_node.right
                    weight_stack[stack_size] = (
                        current_weight * (1 - left_sample_frac))
                    stack_size += 1

    # Sanity check. Should never happen.
    if not (0.999 < total_weight < 1.001):
        raise ValueError("Total weight should be 1.0 but was %.9f" %total_weight)

View File

@@ -0,0 +1,338 @@
"""
This module contains the BinMapper class.
BinMapper is used for mapping a real-valued dataset into integer-valued bins.
Bin thresholds are computed with the quantiles so that each bin contains
approximately the same number of samples.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble._hist_gradient_boosting._binning import _map_to_bins
from sklearn.ensemble._hist_gradient_boosting._bitset import set_bitset_memoryview
from sklearn.ensemble._hist_gradient_boosting.common import (
ALMOST_INF,
X_BINNED_DTYPE,
X_BITSET_INNER_DTYPE,
X_DTYPE,
)
from sklearn.utils import check_array, check_random_state
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import check_is_fitted
def _find_binning_thresholds(col_data, max_bins):
    """Compute bin thresholds (quantile-based midpoints) for one continuous feature.

    Missing values are ignored when computing the thresholds.

    Parameters
    ----------
    col_data : array-like, shape (n_samples,)
        The continuous feature to bin.
    max_bins : int
        The maximum number of bins to use for non-missing values. If the
        feature has fewer unique values than ``max_bins``, the midpoints of
        consecutive unique values are used instead of the quantiles.

    Returns
    -------
    binning_thresholds : ndarray of shape (min(max_bins, n_unique_values) - 1,)
        The increasing numeric values that can be used to separate the bins.
        A given value x will be mapped into bin value i iff
        binning_thresholds[i - 1] < x <= binning_thresholds[i].
    """
    # Drop NaNs first: thresholds are computed on observed values only.
    nan_mask = np.isnan(col_data)
    if nan_mask.any():
        col_data = col_data[~nan_mask]
    # Both np.unique and np.percentile sort internally, so sort once up
    # front; sorting also returns a contiguous array.
    col_data = np.sort(col_data)
    distinct_values = np.unique(col_data).astype(X_DTYPE)
    if len(distinct_values) <= max_bins:
        # Few enough unique values: thresholds are the midpoints between
        # consecutive distinct values.
        midpoints = (distinct_values[:-1] + distinct_values[1:]) * 0.5
    else:
        # Evenly spaced interior quantiles. Approximating midpoint
        # percentiles from np.unique(col_data, return_counts) would be more
        # work for limited benefit, since we operate on a fixed-size
        # subsample of the full data anyway.
        quantile_levels = np.linspace(0, 100, num=max_bins + 1)[1:-1]
        midpoints = np.percentile(
            col_data, quantile_levels, method="midpoint"
        ).astype(X_DTYPE)
        assert midpoints.shape[0] == max_bins - 1
    # +inf thresholds are only allowed in a "split on nan" situation, so cap
    # any overly large threshold.
    np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
    return midpoints
class _BinMapper(TransformerMixin, BaseEstimator):
    """Transformer that maps a dataset into integer-valued bins.

    For continuous features, the bins are created in a feature-wise fashion,
    using quantiles so that each bins contains approximately the same number
    of samples. For large datasets, quantiles are computed on a subset of the
    data to speed-up the binning, but the quantiles should remain stable.

    For categorical features, the raw categorical values are expected to be
    in [0, 254] (this is not validated here though) and each category
    corresponds to a bin. All categorical values must be known at
    initialization: transform() doesn't know how to bin unknown categorical
    values. Note that transform() is only used on non-training data in the
    case of early stopping.

    Features with a small number of values may be binned into less than
    ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved
    for missing values.

    Parameters
    ----------
    n_bins : int, default=256
        The maximum number of bins to use (including the bin for missing
        values). Should be in [3, 256]. Non-missing values are binned on
        ``max_bins = n_bins - 1`` bins. The last bin is always reserved for
        missing values. If for a given feature the number of unique values is
        less than ``max_bins``, then those unique values will be used to
        compute the bin thresholds, instead of the quantiles. For categorical
        features indicated by ``is_categorical``, the docstring for
        ``is_categorical`` details on this procedure.
    subsample : int or None, default=2e5
        If ``n_samples > subsample``, then ``subsample`` samples will be
        randomly chosen to compute the quantiles. If ``None``, the whole data
        is used.
    is_categorical : ndarray of bool of shape (n_features,), default=None
        Indicates categorical features. By default, all features are
        considered continuous.
    known_categories : list of {ndarray, None} of shape (n_features,), \
            default=None
        For each categorical feature, the array indicates the set of unique
        categorical values. These should be the possible values over all the
        data, not just the training data. For continuous features, the
        corresponding entry should be None.
    random_state : int, RandomState instance or None, default=None
        Pseudo-random number generator to control the random sub-sampling.
        Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.
    n_threads : int, default=None
        Number of OpenMP threads to use. `_openmp_effective_n_threads` is called
        to determine the effective number of threads use, which takes cgroups CPU
        quotes into account. See the docstring of `_openmp_effective_n_threads`
        for details.

    Attributes
    ----------
    bin_thresholds_ : list of ndarray
        For each feature, each array indicates how to map a feature into a
        binned feature. The semantic and size depends on the nature of the
        feature:
        - for real-valued features, the array corresponds to the real-valued
          bin thresholds (the upper bound of each bin). There are ``max_bins
          - 1`` thresholds, where ``max_bins = n_bins - 1`` is the number of
          bins used for non-missing values.
        - for categorical features, the array is a map from a binned category
          value to the raw category value. The size of the array is equal to
          ``min(max_bins, category_cardinality)`` where we ignore missing
          values in the cardinality.
    n_bins_non_missing_ : ndarray, dtype=np.uint32
        For each feature, gives the number of bins actually used for
        non-missing values. For features with a lot of unique values, this is
        equal to ``n_bins - 1``.
    is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8
        Indicator for categorical features.
    missing_values_bin_idx_ : np.uint8
        The index of the bin where missing values are mapped. This is a
        constant across all features. This corresponds to the last bin, and
        it is always equal to ``n_bins - 1``. Note that if ``n_bins_non_missing_``
        is less than ``n_bins - 1`` for a given feature, then there are
        empty (and unused) bins.
    """

    def __init__(
        self,
        n_bins=256,
        subsample=int(2e5),
        is_categorical=None,
        known_categories=None,
        random_state=None,
        n_threads=None,
    ):
        self.n_bins = n_bins
        self.subsample = subsample
        self.is_categorical = is_categorical
        self.known_categories = known_categories
        self.random_state = random_state
        self.n_threads = n_threads

    def fit(self, X, y=None):
        """Fit data X by computing the binning thresholds.

        The last bin is reserved for missing values, whether missing values
        are present in the data or not.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to bin.
        y : None
            Ignored.

        Returns
        -------
        self : object
        """
        if not (3 <= self.n_bins <= 256):
            # min is 3: at least 2 distinct bins and a missing values bin
            raise ValueError(
                "n_bins={} should be no smaller than 3 and no larger than 256.".format(
                    self.n_bins
                )
            )
        X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False)
        # One bin (the last) is reserved for missing values.
        max_bins = self.n_bins - 1
        rng = check_random_state(self.random_state)
        if self.subsample is not None and X.shape[0] > self.subsample:
            # Quantiles are computed on a random subsample for speed; they
            # should remain stable for large n_samples.
            subset = rng.choice(X.shape[0], self.subsample, replace=False)
            X = X.take(subset, axis=0)
        if self.is_categorical is None:
            self.is_categorical_ = np.zeros(X.shape[1], dtype=np.uint8)
        else:
            self.is_categorical_ = np.asarray(self.is_categorical, dtype=np.uint8)
        n_features = X.shape[1]
        known_categories = self.known_categories
        if known_categories is None:
            known_categories = [None] * n_features
        # validate is_categorical and known_categories parameters
        for f_idx in range(n_features):
            is_categorical = self.is_categorical_[f_idx]
            known_cats = known_categories[f_idx]
            if is_categorical and known_cats is None:
                raise ValueError(
                    f"Known categories for feature {f_idx} must be provided."
                )
            if not is_categorical and known_cats is not None:
                raise ValueError(
                    f"Feature {f_idx} isn't marked as a categorical feature, "
                    "but categories were passed."
                )
        self.missing_values_bin_idx_ = self.n_bins - 1
        self.bin_thresholds_ = [None] * n_features
        n_bins_non_missing = [None] * n_features
        # Thresholds for continuous features are computed in parallel; the
        # result list preserves the order of the non-categorical features,
        # which is relied upon by the positional consumption below.
        non_cat_thresholds = Parallel(n_jobs=self.n_threads, backend="threading")(
            delayed(_find_binning_thresholds)(X[:, f_idx], max_bins)
            for f_idx in range(n_features)
            if not self.is_categorical_[f_idx]
        )
        non_cat_idx = 0
        for f_idx in range(n_features):
            if self.is_categorical_[f_idx]:
                # Since categories are assumed to be encoded in
                # [0, n_cats] and since n_cats <= max_bins,
                # the thresholds *are* the unique categorical values. This will
                # lead to the correct mapping in transform()
                thresholds = known_categories[f_idx]
                n_bins_non_missing[f_idx] = thresholds.shape[0]
                self.bin_thresholds_[f_idx] = thresholds
            else:
                self.bin_thresholds_[f_idx] = non_cat_thresholds[non_cat_idx]
                # k thresholds separate k + 1 bins.
                n_bins_non_missing[f_idx] = self.bin_thresholds_[f_idx].shape[0] + 1
                non_cat_idx += 1
        self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32)
        return self

    def transform(self, X):
        """Bin data X.

        Missing values will be mapped to the last bin.

        For categorical features, the mapping will be incorrect for unknown
        categories. Since the BinMapper is given known_categories of the
        entire training data (i.e. before the call to train_test_split() in
        case of early-stopping), this never happens.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to bin.

        Returns
        -------
        X_binned : array-like of shape (n_samples, n_features)
            The binned data (fortran-aligned).
        """
        X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False)
        check_is_fitted(self)
        if X.shape[1] != self.n_bins_non_missing_.shape[0]:
            raise ValueError(
                "This estimator was fitted with {} features but {} got passed "
                "to transform()".format(self.n_bins_non_missing_.shape[0], X.shape[1])
            )
        n_threads = _openmp_effective_n_threads(self.n_threads)
        # Fortran order so that each feature column is contiguous for the
        # per-feature binning loop.
        binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
        _map_to_bins(
            X,
            self.bin_thresholds_,
            self.is_categorical_,
            self.missing_values_bin_idx_,
            n_threads,
            binned,
        )
        return binned

    def make_known_categories_bitsets(self):
        """Create bitsets of known categories.

        Returns
        -------
        - known_cat_bitsets : ndarray of shape (n_categorical_features, 8)
            Array of bitsets of known categories, for each categorical feature.
        - f_idx_map : ndarray of shape (n_features,)
            Map from original feature index to the corresponding index in the
            known_cat_bitsets array.
        """
        categorical_features_indices = np.flatnonzero(self.is_categorical_)
        n_features = self.is_categorical_.size
        n_categorical_features = categorical_features_indices.size
        f_idx_map = np.zeros(n_features, dtype=np.uint32)
        f_idx_map[categorical_features_indices] = np.arange(
            n_categorical_features, dtype=np.uint32
        )
        # For categorical features, bin_thresholds_ stores the raw category
        # values (see fit), so it doubles as the known-categories list here.
        known_categories = self.bin_thresholds_
        known_cat_bitsets = np.zeros(
            (n_categorical_features, 8), dtype=X_BITSET_INNER_DTYPE
        )
        # TODO: complexity is O(n_categorical_features * 255). Maybe this is
        # worth cythonizing
        for mapped_f_idx, f_idx in enumerate(categorical_features_indices):
            for raw_cat_val in known_categories[f_idx]:
                set_bitset_memoryview(known_cat_bitsets[mapped_f_idx], raw_cat_val)
        return known_cat_bitsets, f_idx_map

View File

@@ -0,0 +1,43 @@
from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, uint32_t

# Central C-level dtype aliases shared by the hist_gradient_boosting Cython
# modules; they mirror the numpy dtypes declared in common.py.
ctypedef float64_t X_DTYPE_C
ctypedef uint8_t X_BINNED_DTYPE_C
ctypedef float64_t Y_DTYPE_C
ctypedef float32_t G_H_DTYPE_C
ctypedef uint32_t BITSET_INNER_DTYPE_C
# A bitset is 8 x 32 bits = 256 bits, one per possible bin value.
ctypedef BITSET_INNER_DTYPE_C[8] BITSET_DTYPE_C

cdef packed struct hist_struct:
    # Same as histogram dtype but we need a struct to declare views. It needs
    # to be packed since by default numpy dtypes aren't aligned
    Y_DTYPE_C sum_gradients
    Y_DTYPE_C sum_hessians
    unsigned int count

cdef packed struct node_struct:
    # Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It
    # needs to be packed since by default numpy dtypes aren't aligned
    Y_DTYPE_C value
    unsigned int count
    intp_t feature_idx
    X_DTYPE_C num_threshold
    uint8_t missing_go_to_left
    unsigned int left
    unsigned int right
    Y_DTYPE_C gain
    unsigned int depth
    uint8_t is_leaf
    X_BINNED_DTYPE_C bin_threshold
    uint8_t is_categorical
    # The index of the corresponding bitsets in the Predictor's bitset arrays.
    # Only used if is_categorical is True
    unsigned int bitset_idx

cpdef enum MonotonicConstraint:
    # Per-feature monotonicity constraint on the predictions.
    NO_CST = 0
    POS = 1
    NEG = -1

View File

@@ -0,0 +1,44 @@
import numpy as np

# Y_DYTPE is the dtype to which the targets y are converted to. This is also
# dtype for leaf values, gains, and sums of gradients / hessians. The gradients
# and hessians arrays are stored as floats to avoid using too much memory.
Y_DTYPE = np.float64
X_DTYPE = np.float64
X_BINNED_DTYPE = np.uint8  # hence max_bins == 256
# dtype for gradients and hessians arrays
G_H_DTYPE = np.float32
X_BITSET_INNER_DTYPE = np.uint32

# Note that we use Y_DTYPE=float64 to avoid issues with floating point precision when
# summing gradients and hessians (both float32). Those are difficult to protect via
# tools like (Kahan-) Neumaier summation as in CPython, see
# https://github.com/python/cpython/issues/100425, or pairwise summation as numpy, see
# https://github.com/numpy/numpy/pull/3685, due to the way histograms are summed
# (number of additions per bin is not known in advance). See also comment in
# _subtract_histograms.
HISTOGRAM_DTYPE = np.dtype([
    ('sum_gradients', Y_DTYPE),  # sum of sample gradients in bin
    ('sum_hessians', Y_DTYPE),  # sum of sample hessians in bin
    ('count', np.uint32),  # number of samples in bin
])

# One record per predictor-tree node; must stay in sync with node_struct in
# common.pxd (both are packed/unaligned).
PREDICTOR_RECORD_DTYPE = np.dtype([
    ('value', Y_DTYPE),
    ('count', np.uint32),
    ('feature_idx', np.intp),
    ('num_threshold', X_DTYPE),
    ('missing_go_to_left', np.uint8),
    ('left', np.uint32),
    ('right', np.uint32),
    ('gain', Y_DTYPE),
    ('depth', np.uint32),
    ('is_leaf', np.uint8),
    ('bin_threshold', X_BINNED_DTYPE),
    ('is_categorical', np.uint8),
    # The index of the corresponding bitsets in the Predictor's bitset arrays.
    # Only used if is_categorical is True
    ('bitset_idx', np.uint32)
])

ALMOST_INF = 1e300  # see LightGBM AvoidInf()

View File

@@ -0,0 +1,822 @@
"""
This module contains the TreeGrower class.
TreeGrower builds a regression tree fitting a Newton-Raphson step, based on
the gradients and hessians of the training data.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numbers
from heapq import heappop, heappush
from timeit import default_timer as time
import numpy as np
from sklearn.ensemble._hist_gradient_boosting._bitset import (
set_raw_bitset_from_binned_bitset,
)
from sklearn.ensemble._hist_gradient_boosting.common import (
PREDICTOR_RECORD_DTYPE,
X_BITSET_INNER_DTYPE,
MonotonicConstraint,
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor
from sklearn.ensemble._hist_gradient_boosting.splitting import Splitter
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
class TreeNode:
    """A training-time tree node used by TreeGrower.

    Prediction is handled by TreePredictor; this class only carries the
    bookkeeping needed while growing the tree.

    Parameters
    ----------
    depth : int
        The depth of the node, i.e. its distance from the root.
    sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint32
        The indices of the samples at the node.
    partition_start : int
        start position of the node's sample_indices in splitter.partition.
    partition_stop : int
        stop position of the node's sample_indices in splitter.partition.
    sum_gradients : float
        The sum of the gradients of the samples at the node.
    sum_hessians : float
        The sum of the hessians of the samples at the node.

    Attributes
    ----------
    split_info : SplitInfo or None
        The result of the split evaluation, None until computed.
    is_leaf : bool
        True if node is a leaf.
    left_child, right_child : TreeNode or None
        Children of the node; None for leaves.
    value : float or None
        The leaf value, as computed in finalize_leaf(); None for non-leaf
        nodes.
    allowed_features : None or ndarray, dtype=int
        Indices of features allowed to split for children.
    interaction_cst_indices : None or list of ints
        Indices of the interaction sets applying to splits of child nodes.
        The fewer sets, the stronger the constraint.
    children_lower_bound, children_upper_bound : float
        Value bounds for the children, used for monotonic constraints.
    """

    def __init__(
        self,
        *,
        depth,
        sample_indices,
        partition_start,
        partition_stop,
        sum_gradients,
        sum_hessians,
        value=None,
    ):
        # Statistics of the samples sitting at this node.
        self.sample_indices = sample_indices
        self.n_samples = sample_indices.shape[0]
        self.sum_gradients = sum_gradients
        self.sum_hessians = sum_hessians
        self.depth = depth
        self.value = value
        # Split-related state, filled in later by the grower.
        self.is_leaf = False
        self.split_info = None
        self.left_child = None
        self.right_child = None
        self.histograms = None
        # Constraint-related state (interaction + monotonic constraints).
        self.allowed_features = None
        self.interaction_cst_indices = None
        self.set_children_bounds(float("-inf"), float("+inf"))
        # start and stop indices of the node in the splitter.partition array:
        # self.sample_indices = view(self.splitter.partition[start:stop]).
        # See the comments about splitter.partition / splitter.split_indices
        # for the rationale. They are only needed in _update_raw_prediction,
        # where we iterate over the leaves (the sample_indices views all have
        # different sizes, so they cannot be stored more directly).
        self.partition_start = partition_start
        self.partition_stop = partition_stop

    def set_children_bounds(self, lower, upper):
        """Set children values bounds to respect monotonic constraints."""
        # These bound the *children* values, not this node's value; the
        # splitter consults them when evaluating potential left/right
        # children.
        self.children_lower_bound = lower
        self.children_upper_bound = upper

    def __lt__(self, other_node):
        """Heap ordering: higher gain means higher priority.

        heapq pops the smallest item and only requires '<', so a node
        compares as "less" when its gain is strictly larger.

        Parameters
        ----------
        other_node : TreeNode
            The node to compare with.
        """
        my_gain = self.split_info.gain
        their_gain = other_node.split_info.gain
        return my_gain > their_gain
class TreeGrower:
"""Tree grower class used to build a tree.
The tree is fitted to predict the values of a Newton-Raphson step. The
splits are considered in a best-first fashion, and the quality of a
split is defined in splitting._split_gain.
Parameters
----------
X_binned : ndarray of shape (n_samples, n_features), dtype=np.uint8
The binned input samples. Must be Fortran-aligned.
gradients : ndarray of shape (n_samples,)
The gradients of each training sample. Those are the gradients of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
hessians : ndarray of shape (n_samples,)
The hessians of each training sample. Those are the hessians of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
max_leaf_nodes : int, default=None
The maximum number of leaves for each tree. If None, there is no
maximum limit.
max_depth : int, default=None
The maximum depth of each tree. The depth of a tree is the number of
edges to go from the root to the deepest leaf.
Depth isn't constrained by default.
min_samples_leaf : int, default=20
The minimum number of samples per leaf.
min_gain_to_split : float, default=0.
The minimum gain needed to split a node. Splits with lower gain will
be ignored.
min_hessian_to_split : float, default=1e-3
The minimum sum of hessians needed in each node. Splits that result in
at least one child having a sum of hessians less than
``min_hessian_to_split`` are discarded.
n_bins : int, default=256
The total number of bins, including the bin for missing values. Used
to define the shape of the histograms.
n_bins_non_missing : ndarray, dtype=np.uint32, default=None
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this
is equal to ``n_bins - 1``. If it's an int, all features are
considered to have the same number of bins. If None, all features
are considered to have ``n_bins - 1`` bins.
has_missing_values : bool or ndarray, dtype=bool, default=False
Whether each feature contains missing values (in the training data).
If it's a bool, the same value is used for all features.
is_categorical : ndarray of bool of shape (n_features,), default=None
Indicates categorical features.
monotonic_cst : array-like of int of shape (n_features,), dtype=int, default=None
Indicates the monotonic constraint to enforce on each feature.
- 1: monotonic increase
- 0: no constraint
- -1: monotonic decrease
Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.
interaction_cst : list of sets of integers, default=None
List of interaction constraints.
l2_regularization : float, default=0.
The L2 regularization parameter penalizing leaves with small hessians.
Use ``0`` for no regularization (default).
feature_fraction_per_split : float, default=1
Proportion of randomly chosen features in each and every node split.
This is a form of regularization, smaller values make the trees weaker
learners and might prevent overfitting.
rng : Generator
Numpy random Generator used for feature subsampling.
shrinkage : float, default=1.
The shrinkage parameter to apply to the leaves values, also known as
learning rate.
n_threads : int, default=None
Number of OpenMP threads to use. `_openmp_effective_n_threads` is called
to determine the effective number of threads use, which takes cgroups CPU
quotes into account. See the docstring of `_openmp_effective_n_threads`
for details.
Attributes
----------
histogram_builder : HistogramBuilder
splitter : Splitter
root : TreeNode
finalized_leaves : list of TreeNode
splittable_nodes : list of TreeNode
missing_values_bin_idx : int
Equals n_bins - 1
n_categorical_splits : int
n_features : int
n_nodes : int
total_find_split_time : float
Time spent finding the best splits
total_compute_hist_time : float
Time spent computing histograms
total_apply_split_time : float
Time spent splitting nodes
with_monotonic_cst : bool
Whether there are monotonic constraints that apply. False iff monotonic_cst is
None.
"""
def __init__(
    self,
    X_binned,
    gradients,
    hessians,
    max_leaf_nodes=None,
    max_depth=None,
    min_samples_leaf=20,
    min_gain_to_split=0.0,
    min_hessian_to_split=1e-3,
    n_bins=256,
    n_bins_non_missing=None,
    has_missing_values=False,
    is_categorical=None,
    monotonic_cst=None,
    interaction_cst=None,
    l2_regularization=0.0,
    feature_fraction_per_split=1.0,
    # NOTE(review): this default Generator is created once at import time and
    # is shared by every call that omits `rng` -- confirm this is intended.
    rng=np.random.default_rng(),
    shrinkage=1.0,
    n_threads=None,
):
    self._validate_parameters(
        X_binned,
        min_gain_to_split,
        min_hessian_to_split,
    )
    n_threads = _openmp_effective_n_threads(n_threads)
    # Normalize n_bins_non_missing to a per-feature uint32 array.
    if n_bins_non_missing is None:
        n_bins_non_missing = n_bins - 1
    if isinstance(n_bins_non_missing, numbers.Integral):
        n_bins_non_missing = np.array(
            [n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32
        )
    else:
        n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint32)
    # Normalize has_missing_values to a per-feature uint8 array.
    if isinstance(has_missing_values, bool):
        has_missing_values = [has_missing_values] * X_binned.shape[1]
    has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)
    # `monotonic_cst` validation is done in _validate_monotonic_cst
    # at the estimator level and therefore the following should not be
    # needed when using the public API.
    if monotonic_cst is None:
        monotonic_cst = np.full(
            shape=X_binned.shape[1],
            fill_value=MonotonicConstraint.NO_CST,
            dtype=np.int8,
        )
    else:
        monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
    self.with_monotonic_cst = np.any(monotonic_cst != MonotonicConstraint.NO_CST)
    if is_categorical is None:
        is_categorical = np.zeros(shape=X_binned.shape[1], dtype=np.uint8)
    else:
        is_categorical = np.asarray(is_categorical, dtype=np.uint8)
    # Monotonic constraints are undefined for unordered (categorical) values.
    if np.any(
        np.logical_and(
            is_categorical == 1, monotonic_cst != MonotonicConstraint.NO_CST
        )
    ):
        raise ValueError("Categorical features cannot have monotonic constraints.")
    # A single-element hessians array encodes "constant hessian for all
    # samples" (e.g. least-squares loss).
    hessians_are_constant = hessians.shape[0] == 1
    self.histogram_builder = HistogramBuilder(
        X_binned, n_bins, gradients, hessians, hessians_are_constant, n_threads
    )
    # The last bin is reserved for missing values, for every feature.
    missing_values_bin_idx = n_bins - 1
    self.splitter = Splitter(
        X_binned=X_binned,
        n_bins_non_missing=n_bins_non_missing,
        missing_values_bin_idx=missing_values_bin_idx,
        has_missing_values=has_missing_values,
        is_categorical=is_categorical,
        monotonic_cst=monotonic_cst,
        l2_regularization=l2_regularization,
        min_hessian_to_split=min_hessian_to_split,
        min_samples_leaf=min_samples_leaf,
        min_gain_to_split=min_gain_to_split,
        hessians_are_constant=hessians_are_constant,
        feature_fraction_per_split=feature_fraction_per_split,
        rng=rng,
        n_threads=n_threads,
    )
    self.X_binned = X_binned
    self.max_leaf_nodes = max_leaf_nodes
    self.max_depth = max_depth
    self.min_samples_leaf = min_samples_leaf
    self.min_gain_to_split = min_gain_to_split
    self.n_bins_non_missing = n_bins_non_missing
    self.missing_values_bin_idx = missing_values_bin_idx
    self.has_missing_values = has_missing_values
    self.is_categorical = is_categorical
    self.monotonic_cst = monotonic_cst
    self.interaction_cst = interaction_cst
    self.l2_regularization = l2_regularization
    self.shrinkage = shrinkage
    self.n_features = X_binned.shape[1]
    self.n_threads = n_threads
    self.splittable_nodes = []
    self.finalized_leaves = []
    self.total_find_split_time = 0.0  # time spent finding the best splits
    self.total_compute_hist_time = 0.0  # time spent computing histograms
    self.total_apply_split_time = 0.0  # time spent splitting nodes
    self.n_categorical_splits = 0
    # Creates the root node (and may already finalize it as a leaf).
    self._initialize_root()
    self.n_nodes = 1
def _validate_parameters(
    self,
    X_binned,
    min_gain_to_split,
    min_hessian_to_split,
):
    """Validate parameters passed to __init__.

    Also validate parameters passed to splitter.
    """
    # The binned matrix must be uint8 (max 256 bins) and Fortran-ordered so
    # that per-feature columns are contiguous.
    if X_binned.dtype != np.uint8:
        raise NotImplementedError("X_binned must be of type uint8.")
    if not X_binned.flags.f_contiguous:
        raise ValueError(
            "X_binned should be passed as Fortran contiguous "
            "array for maximum efficiency."
        )
    # Both thresholds must be non-negative; the error messages are
    # identical in shape, so check them in one loop.
    for param_name, param_value in (
        ("min_gain_to_split", min_gain_to_split),
        ("min_hessian_to_split", min_hessian_to_split),
    ):
        if param_value < 0:
            raise ValueError(
                "{}={} must be positive.".format(param_name, param_value)
            )
def grow(self):
    """Grow the tree, from root to leaves."""
    # split_next() pops the best node from the heap and may push its
    # children back, so we simply loop until the heap is exhausted.
    while len(self.splittable_nodes) > 0:
        self.split_next()
    # Apply the learning rate only once the tree is fully grown.
    self._apply_shrinkage()
def _apply_shrinkage(self):
    """Multiply leaves values by shrinkage parameter.

    This must be done at the very end of the growing process: if it were
    done during growth, e.g. in finalize_leaf(), a leaf could be shrunk
    while its (non-leaf) sibling is not, corrupting the 'middle' value
    needed to enforce the monotonic constraints.
    """
    shrinkage = self.shrinkage
    for leaf in self.finalized_leaves:
        leaf.value = leaf.value * shrinkage
def _initialize_root(self):
    """Initialize root node and finalize it if needed."""
    tic = time()
    if self.interaction_cst is not None:
        # The root may only split on features that appear in at least one
        # interaction set.
        allowed_features = set().union(*self.interaction_cst)
        allowed_features = np.fromiter(
            allowed_features, dtype=np.uint32, count=len(allowed_features)
        )
        arbitrary_feature = allowed_features[0]
    else:
        allowed_features = None
        arbitrary_feature = 0
    # TreeNode init needs the total sum of gradients and hessians. Therefore, we
    # first compute the histograms and then compute the total grad/hess on an
    # arbitrary feature histogram. This way we replace a loop over n_samples by a
    # loop over n_bins.
    histograms = self.histogram_builder.compute_histograms_brute(
        self.splitter.partition,  # =self.root.sample_indices
        allowed_features,
    )
    self.total_compute_hist_time += time() - tic
    tic = time()
    n_samples = self.X_binned.shape[0]
    depth = 0
    # Any feature histogram sums to the full grad/hess totals, since every
    # sample lands in exactly one bin of each feature.
    histogram_array = np.asarray(histograms[arbitrary_feature])
    sum_gradients = histogram_array["sum_gradients"].sum()
    if self.histogram_builder.hessians_are_constant:
        sum_hessians = self.histogram_builder.hessians[0] * n_samples
    else:
        sum_hessians = histogram_array["sum_hessians"].sum()
    self.root = TreeNode(
        depth=depth,
        sample_indices=self.splitter.partition,
        partition_start=0,
        partition_stop=n_samples,
        sum_gradients=sum_gradients,
        sum_hessians=sum_hessians,
        value=0,
    )
    if self.root.n_samples < 2 * self.min_samples_leaf:
        # Do not even bother computing any splitting statistics.
        self._finalize_leaf(self.root)
        return
    if sum_hessians < self.splitter.min_hessian_to_split:
        # No split could satisfy the hessian constraint anyway.
        self._finalize_leaf(self.root)
        return
    if self.interaction_cst is not None:
        # At the root, every interaction set still applies.
        self.root.interaction_cst_indices = range(len(self.interaction_cst))
        self.root.allowed_features = allowed_features
    self.root.histograms = histograms
    self._compute_best_split_and_push(self.root)
    self.total_find_split_time += time() - tic
def _compute_best_split_and_push(self, node):
    """Compute the best possible split (SplitInfo) of a given node.

    Also push it in the heap of splittable nodes if gain isn't zero.
    The gain of a node is 0 if either all the leaves are pure
    (best gain = 0), or if no split would satisfy the constraints,
    (min_hessians_to_split, min_gain_to_split, min_samples_leaf)
    """
    split_info = self.splitter.find_node_split(
        n_samples=node.n_samples,
        histograms=node.histograms,
        sum_gradients=node.sum_gradients,
        sum_hessians=node.sum_hessians,
        value=node.value,
        lower_bound=node.children_lower_bound,
        upper_bound=node.children_upper_bound,
        allowed_features=node.allowed_features,
    )
    node.split_info = split_info
    if split_info.gain > 0:
        heappush(self.splittable_nodes, node)
    else:
        # No valid split: the node becomes a leaf.
        self._finalize_leaf(node)
def split_next(self):
    """Split the node with highest potential gain.

    Pops the best splittable node off the heap, partitions its samples
    into two children, sets up the children's histograms and split
    candidates (unless they are finalized as leaves), and returns the
    two children.

    Returns
    -------
    left : TreeNode
        The resulting left child.
    right : TreeNode
        The resulting right child.
    """
    # Consider the node with the highest loss reduction (a.k.a. gain)
    node = heappop(self.splittable_nodes)

    tic = time()
    (
        sample_indices_left,
        sample_indices_right,
        right_child_pos,
    ) = self.splitter.split_indices(node.split_info, node.sample_indices)
    self.total_apply_split_time += time() - tic

    depth = node.depth + 1
    # Leaf count after this split: the current leaves, the remaining
    # splittable nodes, plus the two children created below.
    n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)
    n_leaf_nodes += 2

    # The children share the parent's partition: the left child owns
    # [partition_start, partition_start + right_child_pos), the right
    # child owns the remainder up to the parent's partition_stop.
    left_child_node = TreeNode(
        depth=depth,
        sample_indices=sample_indices_left,
        partition_start=node.partition_start,
        partition_stop=node.partition_start + right_child_pos,
        sum_gradients=node.split_info.sum_gradient_left,
        sum_hessians=node.split_info.sum_hessian_left,
        value=node.split_info.value_left,
    )
    right_child_node = TreeNode(
        depth=depth,
        sample_indices=sample_indices_right,
        partition_start=left_child_node.partition_stop,
        partition_stop=node.partition_stop,
        sum_gradients=node.split_info.sum_gradient_right,
        sum_hessians=node.split_info.sum_hessian_right,
        value=node.split_info.value_right,
    )

    node.right_child = right_child_node
    node.left_child = left_child_node

    # set interaction constraints (the indices of the constraints sets)
    if self.interaction_cst is not None:
        # Calculate allowed_features and interaction_cst_indices only once. Child
        # nodes inherit them before they get split.
        (
            left_child_node.allowed_features,
            left_child_node.interaction_cst_indices,
        ) = self._compute_interactions(node)
        right_child_node.interaction_cst_indices = (
            left_child_node.interaction_cst_indices
        )
        right_child_node.allowed_features = left_child_node.allowed_features

    if not self.has_missing_values[node.split_info.feature_idx]:
        # If no missing values are encountered at fit time, then samples
        # with missing values during predict() will go to whichever child
        # has the most samples.
        node.split_info.missing_go_to_left = (
            left_child_node.n_samples > right_child_node.n_samples
        )

    self.n_nodes += 2
    self.n_categorical_splits += node.split_info.is_categorical

    # Stopping criteria: max_leaf_nodes finalizes everything still
    # splittable, max_depth only finalizes the two new children.
    if self.max_leaf_nodes is not None and n_leaf_nodes == self.max_leaf_nodes:
        self._finalize_leaf(left_child_node)
        self._finalize_leaf(right_child_node)
        self._finalize_splittable_nodes()
        return left_child_node, right_child_node

    if self.max_depth is not None and depth == self.max_depth:
        self._finalize_leaf(left_child_node)
        self._finalize_leaf(right_child_node)
        return left_child_node, right_child_node

    # A child with fewer than 2 * min_samples_leaf samples cannot produce
    # two valid leaves, so it cannot be split further.
    if left_child_node.n_samples < self.min_samples_leaf * 2:
        self._finalize_leaf(left_child_node)
    if right_child_node.n_samples < self.min_samples_leaf * 2:
        self._finalize_leaf(right_child_node)

    if self.with_monotonic_cst:
        # Set value bounds for respecting monotonic constraints
        # See test_nodes_values() for details
        if (
            self.monotonic_cst[node.split_info.feature_idx]
            == MonotonicConstraint.NO_CST
        ):
            lower_left = lower_right = node.children_lower_bound
            upper_left = upper_right = node.children_upper_bound
        else:
            mid = (left_child_node.value + right_child_node.value) / 2
            if (
                self.monotonic_cst[node.split_info.feature_idx]
                == MonotonicConstraint.POS
            ):
                lower_left, upper_left = node.children_lower_bound, mid
                lower_right, upper_right = mid, node.children_upper_bound
            else:  # NEG
                lower_left, upper_left = mid, node.children_upper_bound
                lower_right, upper_right = node.children_lower_bound, mid
        left_child_node.set_children_bounds(lower_left, upper_left)
        right_child_node.set_children_bounds(lower_right, upper_right)

    # Compute histograms of children, and compute their best possible split
    # (if needed)
    should_split_left = not left_child_node.is_leaf
    should_split_right = not right_child_node.is_leaf
    if should_split_left or should_split_right:
        # We will compute the histograms of both nodes even if one of them
        # is a leaf, since computing the second histogram is very cheap
        # (using histogram subtraction).
        n_samples_left = left_child_node.sample_indices.shape[0]
        n_samples_right = right_child_node.sample_indices.shape[0]
        if n_samples_left < n_samples_right:
            smallest_child = left_child_node
            largest_child = right_child_node
        else:
            smallest_child = right_child_node
            largest_child = left_child_node

        # We use the brute O(n_samples) method on the child that has the
        # smallest number of samples, and the subtraction trick O(n_bins)
        # on the other one.
        # Note that both left and right child have the same allowed_features.
        tic = time()
        smallest_child.histograms = self.histogram_builder.compute_histograms_brute(
            smallest_child.sample_indices, smallest_child.allowed_features
        )
        largest_child.histograms = (
            self.histogram_builder.compute_histograms_subtraction(
                node.histograms,
                smallest_child.histograms,
                smallest_child.allowed_features,
            )
        )
        # node.histograms is reused in largest_child.histograms. To break cyclic
        # memory references and help garbage collection, we set it to None.
        node.histograms = None
        self.total_compute_hist_time += time() - tic

        tic = time()
        if should_split_left:
            self._compute_best_split_and_push(left_child_node)
        if should_split_right:
            self._compute_best_split_and_push(right_child_node)
        self.total_find_split_time += time() - tic

        # Release memory used by histograms as they are no longer needed
        # for leaf nodes since they won't be split.
        for child in (left_child_node, right_child_node):
            if child.is_leaf:
                del child.histograms

    # Release memory used by histograms as they are no longer needed for
    # internal nodes once children histograms have been computed.
    del node.histograms

    return left_child_node, right_child_node
def _compute_interactions(self, node):
    r"""Compute features allowed by interactions to be inherited by child nodes.

    Example: Assume constraints [{0, 1}, {1, 2}].
       1      <- Both constraint groups could be applied from now on
      / \
     1   2    <- Left split still fulfills both constraint groups.
    / \ / \      Right split at feature 2 has only group {1, 2} from now on.

    LightGBM uses the same logic for overlapping groups. See
    https://github.com/microsoft/LightGBM/issues/4481 for details.

    Parameters:
    ----------
    node : TreeNode
        A node that might have children. Based on its feature_idx, the interaction
        constraints for possible child nodes are computed.

    Returns
    -------
    allowed_features : ndarray, dtype=uint32
        Indices of features allowed to split for children.

    interaction_cst_indices : list of ints
        Indices of the interaction sets that have to be applied on splits of
        child nodes. The fewer sets the stronger the constraint as fewer sets
        contain fewer features.
    """
    # Note:
    # - Case of no interactions is already captured before function call.
    # - This is for nodes that are already split and have a
    #   node.split_info.feature_idx.
    split_feature = node.split_info.feature_idx
    kept_indices = []
    allowed = set()
    # Keep only the constraint groups that contain the feature we just
    # split on; the children may use any feature in the kept groups.
    for cst_idx in node.interaction_cst_indices:
        feature_group = self.interaction_cst[cst_idx]
        if split_feature not in feature_group:
            continue
        kept_indices.append(cst_idx)
        allowed |= feature_group
    return (
        np.fromiter(allowed, dtype=np.uint32, count=len(allowed)),
        kept_indices,
    )
def _finalize_leaf(self, node):
    """Mark ``node`` as a finished leaf and record it.

    A finalized node will never be split again.
    """
    self.finalized_leaves.append(node)
    node.is_leaf = True
def _finalize_splittable_nodes(self):
    """Transform all splittable nodes into leaves.

    Used when some constraint is met e.g. maximum number of leaves or
    maximum depth.
    """
    # Drain the heap, turning every pending node into a leaf.
    while self.splittable_nodes:
        self._finalize_leaf(self.splittable_nodes.pop())
def make_predictor(self, binning_thresholds):
    """Make a TreePredictor object out of the current tree.

    Parameters
    ----------
    binning_thresholds : array-like of floats
        Corresponds to the bin_thresholds_ attribute of the BinMapper.
        For each feature, this stores:

        - the bin frontiers for continuous features
        - the unique raw category values for categorical features

    Returns
    -------
    A TreePredictor object.
    """
    nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)
    # One 8 x uint32 bitset (256 bits) per categorical split, in both
    # binned and raw-category space.
    bitsets_shape = (self.n_categorical_splits, 8)
    binned_left_cat_bitsets = np.zeros(bitsets_shape, dtype=X_BITSET_INNER_DTYPE)
    raw_left_cat_bitsets = np.zeros(bitsets_shape, dtype=X_BITSET_INNER_DTYPE)
    # Recursively flatten the grown tree into the predictor arrays.
    _fill_predictor_arrays(
        nodes,
        binned_left_cat_bitsets,
        raw_left_cat_bitsets,
        self.root,
        binning_thresholds,
        self.n_bins_non_missing,
    )
    return TreePredictor(nodes, binned_left_cat_bitsets, raw_left_cat_bitsets)
def _fill_predictor_arrays(
    predictor_nodes,
    binned_left_cat_bitsets,
    raw_left_cat_bitsets,
    grower_node,
    binning_thresholds,
    n_bins_non_missing,
    next_free_node_idx=0,
    next_free_bitset_idx=0,
):
    """Helper used in make_predictor to set the TreePredictor fields.

    Recursively writes the grower tree rooted at ``grower_node`` into
    ``predictor_nodes`` in pre-order (node, left subtree, right subtree),
    filling the categorical-split bitset arrays along the way.

    Returns the pair ``(next_free_node_idx, next_free_bitset_idx)`` after
    consuming this subtree, so that sibling calls can continue from there.
    """
    node = predictor_nodes[next_free_node_idx]
    node["count"] = grower_node.n_samples
    node["depth"] = grower_node.depth
    # A node finalized as a leaf before any split was computed has no
    # split_info; use -1 as a "no gain" marker.
    if grower_node.split_info is not None:
        node["gain"] = grower_node.split_info.gain
    else:
        node["gain"] = -1

    node["value"] = grower_node.value

    if grower_node.is_leaf:
        # Leaf node
        node["is_leaf"] = True
        return next_free_node_idx + 1, next_free_bitset_idx

    split_info = grower_node.split_info
    feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx
    node["feature_idx"] = feature_idx
    node["bin_threshold"] = bin_idx
    node["missing_go_to_left"] = split_info.missing_go_to_left
    node["is_categorical"] = split_info.is_categorical

    if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1:
        # Split is on the last non-missing bin: it's a "split on nans".
        # All nans go to the right, the rest go to the left.
        # Note: for categorical splits, bin_idx is 0 and we rely on the bitset
        node["num_threshold"] = np.inf
    elif split_info.is_categorical:
        categories = binning_thresholds[feature_idx]
        node["bitset_idx"] = next_free_bitset_idx
        binned_left_cat_bitsets[next_free_bitset_idx] = split_info.left_cat_bitset
        # Translate the binned-category bitset into raw-category space so
        # predict() can work on unbinned data.
        set_raw_bitset_from_binned_bitset(
            raw_left_cat_bitsets[next_free_bitset_idx],
            split_info.left_cat_bitset,
            categories,
        )
        next_free_bitset_idx += 1
    else:
        node["num_threshold"] = binning_thresholds[feature_idx][bin_idx]

    next_free_node_idx += 1

    # Pre-order layout: the left child immediately follows this node.
    node["left"] = next_free_node_idx
    next_free_node_idx, next_free_bitset_idx = _fill_predictor_arrays(
        predictor_nodes,
        binned_left_cat_bitsets,
        raw_left_cat_bitsets,
        grower_node.left_child,
        binning_thresholds=binning_thresholds,
        n_bins_non_missing=n_bins_non_missing,
        next_free_node_idx=next_free_node_idx,
        next_free_bitset_idx=next_free_bitset_idx,
    )

    # The right child starts right after the whole left subtree.
    node["right"] = next_free_node_idx
    return _fill_predictor_arrays(
        predictor_nodes,
        binned_left_cat_bitsets,
        raw_left_cat_bitsets,
        grower_node.right_child,
        binning_thresholds=binning_thresholds,
        n_bins_non_missing=n_bins_non_missing,
        next_free_node_idx=next_free_node_idx,
        next_free_bitset_idx=next_free_bitset_idx,
    )

View File

@@ -0,0 +1,520 @@
"""This module contains routines for building histograms."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
cimport cython
from cython.parallel import prange
from libc.string cimport memset
import numpy as np
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common cimport hist_struct
from sklearn.ensemble._hist_gradient_boosting.common cimport X_BINNED_DTYPE_C
from sklearn.ensemble._hist_gradient_boosting.common cimport G_H_DTYPE_C
from sklearn.utils._typedefs cimport uint8_t
# Notes:
# - IN views are read-only, OUT views are write-only
# - In a lot of functions here, we pass feature_idx and the whole 2d
# histograms arrays instead of just histograms[feature_idx]. This is because
# Cython generated C code will have strange Python interactions (likely
# related to the GIL release and the custom histogram dtype) when using 1d
# histogram arrays that come from 2d arrays.
# - The for loops are un-wrapped, for example:
#
# for i in range(n):
# array[i] = i
#
# will become
#
# for i in range(n // 4):
# array[i] = i
# array[i + 1] = i + 1
# array[i + 2] = i + 2
# array[i + 3] = i + 3
#
# This is to hint gcc that it can auto-vectorize these 4 operations and
# perform them all at once.
@cython.final
cdef class HistogramBuilder:
    """A Histogram builder... used to build histograms.

    A histogram is an array with n_bins entries of type HISTOGRAM_DTYPE. Each
    feature has its own histogram. A histogram contains the sum of gradients
    and hessians of all the samples belonging to each bin.

    There are different ways to build a histogram:
    - by subtraction: hist(child) = hist(parent) - hist(sibling)
    - from scratch. In this case we have routines that update the hessians
      or not (not useful when hessians are constant for some losses e.g.
      least squares). Also, there's a special case for the root which
      contains all the samples, leading to some possible optimizations.
      Overall all the implementations look the same, and are optimized for
      cache hit.

    Parameters
    ----------
    X_binned : ndarray of int, shape (n_samples, n_features)
        The binned input samples. Must be Fortran-aligned.
    n_bins : int
        The total number of bins, including the bin for missing values. Used
        to define the shape of the histograms.
    gradients : ndarray, shape (n_samples,)
        The gradients of each training sample. Those are the gradients of the
        loss w.r.t the predictions, evaluated at iteration i - 1.
    hessians : ndarray, shape (n_samples,)
        The hessians of each training sample. Those are the hessians of the
        loss w.r.t the predictions, evaluated at iteration i - 1.
    hessians_are_constant : bool
        Whether hessians are constant.
    """
    cdef public:
        const X_BINNED_DTYPE_C [::1, :] X_binned
        unsigned int n_features
        unsigned int n_bins
        G_H_DTYPE_C [::1] gradients
        G_H_DTYPE_C [::1] hessians
        # Scratch buffers holding gradients/hessians reordered by the node's
        # sample_indices, so histogram builds scan them sequentially.
        G_H_DTYPE_C [::1] ordered_gradients
        G_H_DTYPE_C [::1] ordered_hessians
        uint8_t hessians_are_constant
        int n_threads

    def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned,
                 unsigned int n_bins, G_H_DTYPE_C [::1] gradients,
                 G_H_DTYPE_C [::1] hessians,
                 uint8_t hessians_are_constant,
                 int n_threads):

        self.X_binned = X_binned
        self.n_features = X_binned.shape[1]
        # Note: all histograms will have <n_bins> bins, but some of the
        # bins may be unused if a feature has a small number of unique values.
        self.n_bins = n_bins
        self.gradients = gradients
        self.hessians = hessians
        # for root node, gradients and hessians are already ordered
        self.ordered_gradients = gradients.copy()
        self.ordered_hessians = hessians.copy()
        self.hessians_are_constant = hessians_are_constant
        self.n_threads = n_threads

    def compute_histograms_brute(
        HistogramBuilder self,
        const unsigned int [::1] sample_indices,       # IN
        const unsigned int [:] allowed_features=None,  # IN
    ):
        """Compute the histograms of the node by scanning through all the data.

        For a given feature, the complexity is O(n_samples)

        Parameters
        ----------
        sample_indices : array of int, shape (n_samples_at_node,)
            The indices of the samples at the node to split.

        allowed_features : None or ndarray, dtype=np.uint32
            Indices of the features that are allowed by interaction constraints to be
            split.

        Returns
        -------
        histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins)
            The computed histograms of the current node.
        """
        cdef:
            int n_samples
            int feature_idx
            int f_idx
            int i
            # need local views to avoid python interactions
            uint8_t hessians_are_constant = self.hessians_are_constant
            int n_allowed_features = self.n_features
            G_H_DTYPE_C [::1] ordered_gradients = self.ordered_gradients
            G_H_DTYPE_C [::1] gradients = self.gradients
            G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians
            G_H_DTYPE_C [::1] hessians = self.hessians
            # Histograms will be initialized to zero later within a prange
            hist_struct [:, ::1] histograms = np.empty(
                shape=(self.n_features, self.n_bins),
                dtype=HISTOGRAM_DTYPE
            )
            bint has_interaction_cst = allowed_features is not None
            int n_threads = self.n_threads

        if has_interaction_cst:
            n_allowed_features = allowed_features.shape[0]

        with nogil:
            n_samples = sample_indices.shape[0]

            # Populate ordered_gradients and ordered_hessians. (Already done
            # for root) Ordering the gradients and hessians helps to improve
            # cache hit.
            if sample_indices.shape[0] != gradients.shape[0]:
                if hessians_are_constant:
                    for i in prange(n_samples, schedule='static',
                                    num_threads=n_threads):
                        ordered_gradients[i] = gradients[sample_indices[i]]
                else:
                    for i in prange(n_samples, schedule='static',
                                    num_threads=n_threads):
                        ordered_gradients[i] = gradients[sample_indices[i]]
                        ordered_hessians[i] = hessians[sample_indices[i]]

            # Compute histogram of each feature
            for f_idx in prange(
                n_allowed_features, schedule='static', num_threads=n_threads
            ):
                if has_interaction_cst:
                    feature_idx = allowed_features[f_idx]
                else:
                    feature_idx = f_idx

                self._compute_histogram_brute_single_feature(
                    feature_idx, sample_indices, histograms
                )

        return histograms

    cdef void _compute_histogram_brute_single_feature(
            HistogramBuilder self,
            const int feature_idx,
            const unsigned int [::1] sample_indices,  # IN
            hist_struct [:, ::1] histograms) noexcept nogil:  # OUT
        """Compute the histogram for a given feature."""
        cdef:
            unsigned int n_samples = sample_indices.shape[0]
            const X_BINNED_DTYPE_C [::1] X_binned = \
                self.X_binned[:, feature_idx]
            # Root node detected by the node holding every sample; this
            # selects the faster root routines below.
            unsigned int root_node = X_binned.shape[0] == n_samples
            G_H_DTYPE_C [::1] ordered_gradients = \
                self.ordered_gradients[:n_samples]
            G_H_DTYPE_C [::1] ordered_hessians = \
                self.ordered_hessians[:n_samples]
            uint8_t hessians_are_constant = \
                self.hessians_are_constant

        # Set histograms to zero.
        memset(&histograms[feature_idx, 0], 0, self.n_bins * sizeof(hist_struct))

        if root_node:
            if hessians_are_constant:
                _build_histogram_root_no_hessian(feature_idx, X_binned,
                                                 ordered_gradients,
                                                 histograms)
            else:
                _build_histogram_root(feature_idx, X_binned,
                                      ordered_gradients, ordered_hessians,
                                      histograms)
        else:
            if hessians_are_constant:
                _build_histogram_no_hessian(feature_idx,
                                            sample_indices, X_binned,
                                            ordered_gradients, histograms)
            else:
                _build_histogram(feature_idx, sample_indices,
                                 X_binned, ordered_gradients,
                                 ordered_hessians, histograms)

    def compute_histograms_subtraction(
        HistogramBuilder self,
        hist_struct [:, ::1] parent_histograms,        # IN and OUT
        hist_struct [:, ::1] sibling_histograms,       # IN
        const unsigned int [:] allowed_features=None,  # IN
    ):
        """Compute the histograms of the node using the subtraction trick.

        hist(parent) = hist(left_child) + hist(right_child)

        For a given feature, the complexity is O(n_bins). This is much more
        efficient than compute_histograms_brute, but it's only possible for one
        of the siblings.

        Parameters
        ----------
        parent_histograms : ndarray of HISTOGRAM_DTYPE, \
                shape (n_features, n_bins)
            The histograms of the parent.
        sibling_histograms : ndarray of HISTOGRAM_DTYPE, \
                shape (n_features, n_bins)
            The histograms of the sibling.
        allowed_features : None or ndarray, dtype=np.uint32
            Indices of the features that are allowed by interaction constraints to be
            split.

        Returns
        -------
        histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, n_bins)
            The computed histograms of the current node.
            We repurpose parent_histograms for this and don't need to allocate new
            memory.
        """
        cdef:
            int feature_idx
            int f_idx
            int n_allowed_features = self.n_features
            bint has_interaction_cst = allowed_features is not None
            int n_threads = self.n_threads

        if has_interaction_cst:
            n_allowed_features = allowed_features.shape[0]

        # Compute histogram of each feature
        for f_idx in prange(n_allowed_features, schedule='static', nogil=True,
                            num_threads=n_threads):
            if has_interaction_cst:
                feature_idx = allowed_features[f_idx]
            else:
                feature_idx = f_idx

            # In-place: parent_histograms becomes the current node's
            # histograms (parent minus sibling).
            _subtract_histograms(
                feature_idx,
                self.n_bins,
                parent_histograms,
                sibling_histograms,
            )
        return parent_histograms
cpdef void _build_histogram_naive(
        const int feature_idx,
        unsigned int [:] sample_indices,  # IN
        X_BINNED_DTYPE_C [:] binned_feature,  # IN
        G_H_DTYPE_C [:] ordered_gradients,  # IN
        G_H_DTYPE_C [:] ordered_hessians,  # IN
        hist_struct [:, :] out) noexcept nogil:  # OUT
    """Build histogram in a naive way, without optimizing for cache hit.

    Used in tests to compare with the optimized version.

    For each sample at the node, accumulate its gradient, hessian and a
    unit count into the bin it falls into for this feature.
    """
    cdef:
        unsigned int i
        unsigned int n_samples = sample_indices.shape[0]
        unsigned int sample_idx
        unsigned int bin_idx

    for i in range(n_samples):
        sample_idx = sample_indices[i]
        bin_idx = binned_feature[sample_idx]
        out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i]
        out[feature_idx, bin_idx].sum_hessians += ordered_hessians[i]
        out[feature_idx, bin_idx].count += 1
cpdef void _subtract_histograms(
        const int feature_idx,
        unsigned int n_bins,
        hist_struct [:, ::1] hist_a,  # IN and OUT
        hist_struct [:, ::1] hist_b,  # IN
) noexcept nogil:  # OUT
    """compute hist_a = hist_a - hist_b, in place, for one feature."""
    # Note that subtraction of large sums of floating point numbers, as we have here,
    # can exhibit catastrophic cancellation. This is in particular true for gradients
    # as they can be positive and negative, while hessians are non-negative.
    # Remember that gradients and hessians are originally computed in
    # G_H_DTYPE_C = float32 precision. Therefore, if sum_gradients and sum_hessians are
    # float64, we don't lose precision. But if we also used float32 for summation, we
    # would need to take care of floating point errors.
    #
    # Note that we could protect for negative hessians by setting:
    #     sum_hessians = max(0, sum_hessians)
    # But as we use float64 for summing float32, that's very unlikely to be needed.
    cdef:
        unsigned int i = 0
    for i in range(n_bins):
        hist_a[feature_idx, i].sum_gradients -= hist_b[feature_idx, i].sum_gradients
        hist_a[feature_idx, i].sum_hessians -= hist_b[feature_idx, i].sum_hessians
        hist_a[feature_idx, i].count -= hist_b[feature_idx, i].count
cpdef void _build_histogram(
        const int feature_idx,
        const unsigned int [::1] sample_indices,  # IN
        const X_BINNED_DTYPE_C [::1] binned_feature,  # IN
        const G_H_DTYPE_C [::1] ordered_gradients,  # IN
        const G_H_DTYPE_C [::1] ordered_hessians,  # IN
        hist_struct [:, ::1] out) noexcept nogil:  # OUT
    """Return histogram for a given feature.

    The main loop is manually unrolled by 4 (see the module-level notes) to
    hint the C compiler towards auto-vectorization; the trailing samples
    are handled in the scalar loop at the end.
    """
    cdef:
        unsigned int i = 0
        unsigned int n_node_samples = sample_indices.shape[0]
        unsigned int unrolled_upper = (n_node_samples // 4) * 4
        unsigned int bin_0
        unsigned int bin_1
        unsigned int bin_2
        unsigned int bin_3
        unsigned int bin_idx

    for i in range(0, unrolled_upper, 4):
        bin_0 = binned_feature[sample_indices[i]]
        bin_1 = binned_feature[sample_indices[i + 1]]
        bin_2 = binned_feature[sample_indices[i + 2]]
        bin_3 = binned_feature[sample_indices[i + 3]]

        out[feature_idx, bin_0].sum_gradients += ordered_gradients[i]
        out[feature_idx, bin_1].sum_gradients += ordered_gradients[i + 1]
        out[feature_idx, bin_2].sum_gradients += ordered_gradients[i + 2]
        out[feature_idx, bin_3].sum_gradients += ordered_gradients[i + 3]

        out[feature_idx, bin_0].sum_hessians += ordered_hessians[i]
        out[feature_idx, bin_1].sum_hessians += ordered_hessians[i + 1]
        out[feature_idx, bin_2].sum_hessians += ordered_hessians[i + 2]
        out[feature_idx, bin_3].sum_hessians += ordered_hessians[i + 3]

        out[feature_idx, bin_0].count += 1
        out[feature_idx, bin_1].count += 1
        out[feature_idx, bin_2].count += 1
        out[feature_idx, bin_3].count += 1

    for i in range(unrolled_upper, n_node_samples):
        bin_idx = binned_feature[sample_indices[i]]
        out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i]
        out[feature_idx, bin_idx].sum_hessians += ordered_hessians[i]
        out[feature_idx, bin_idx].count += 1
cpdef void _build_histogram_no_hessian(
        const int feature_idx,
        const unsigned int [::1] sample_indices,  # IN
        const X_BINNED_DTYPE_C [::1] binned_feature,  # IN
        const G_H_DTYPE_C [::1] ordered_gradients,  # IN
        hist_struct [:, ::1] out) noexcept nogil:  # OUT
    """Return histogram for a given feature, not updating hessians.

    Used when the hessians of the loss are constant (typically LS loss).
    Same 4-way unrolled structure as _build_histogram, minus the hessian
    accumulation.
    """
    cdef:
        unsigned int i = 0
        unsigned int n_node_samples = sample_indices.shape[0]
        unsigned int unrolled_upper = (n_node_samples // 4) * 4
        unsigned int bin_0
        unsigned int bin_1
        unsigned int bin_2
        unsigned int bin_3
        unsigned int bin_idx

    for i in range(0, unrolled_upper, 4):
        bin_0 = binned_feature[sample_indices[i]]
        bin_1 = binned_feature[sample_indices[i + 1]]
        bin_2 = binned_feature[sample_indices[i + 2]]
        bin_3 = binned_feature[sample_indices[i + 3]]

        out[feature_idx, bin_0].sum_gradients += ordered_gradients[i]
        out[feature_idx, bin_1].sum_gradients += ordered_gradients[i + 1]
        out[feature_idx, bin_2].sum_gradients += ordered_gradients[i + 2]
        out[feature_idx, bin_3].sum_gradients += ordered_gradients[i + 3]

        out[feature_idx, bin_0].count += 1
        out[feature_idx, bin_1].count += 1
        out[feature_idx, bin_2].count += 1
        out[feature_idx, bin_3].count += 1

    for i in range(unrolled_upper, n_node_samples):
        bin_idx = binned_feature[sample_indices[i]]
        out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i]
        out[feature_idx, bin_idx].count += 1
cpdef void _build_histogram_root(
        const int feature_idx,
        const X_BINNED_DTYPE_C [::1] binned_feature,  # IN
        const G_H_DTYPE_C [::1] all_gradients,  # IN
        const G_H_DTYPE_C [::1] all_hessians,  # IN
        hist_struct [:, ::1] out) noexcept nogil:  # OUT
    """Compute histogram of the root node.

    Unlike other nodes, the root node has to find the split among *all* the
    samples from the training set. binned_feature and all_gradients /
    all_hessians already have a consistent ordering.

    This avoids the sample_indices indirection of _build_histogram: the
    arrays are scanned directly, with the same 4-way unrolling.
    """
    cdef:
        unsigned int i = 0
        unsigned int n_samples = binned_feature.shape[0]
        unsigned int unrolled_upper = (n_samples // 4) * 4
        unsigned int bin_0
        unsigned int bin_1
        unsigned int bin_2
        unsigned int bin_3
        unsigned int bin_idx

    for i in range(0, unrolled_upper, 4):
        bin_0 = binned_feature[i]
        bin_1 = binned_feature[i + 1]
        bin_2 = binned_feature[i + 2]
        bin_3 = binned_feature[i + 3]

        out[feature_idx, bin_0].sum_gradients += all_gradients[i]
        out[feature_idx, bin_1].sum_gradients += all_gradients[i + 1]
        out[feature_idx, bin_2].sum_gradients += all_gradients[i + 2]
        out[feature_idx, bin_3].sum_gradients += all_gradients[i + 3]

        out[feature_idx, bin_0].sum_hessians += all_hessians[i]
        out[feature_idx, bin_1].sum_hessians += all_hessians[i + 1]
        out[feature_idx, bin_2].sum_hessians += all_hessians[i + 2]
        out[feature_idx, bin_3].sum_hessians += all_hessians[i + 3]

        out[feature_idx, bin_0].count += 1
        out[feature_idx, bin_1].count += 1
        out[feature_idx, bin_2].count += 1
        out[feature_idx, bin_3].count += 1

    for i in range(unrolled_upper, n_samples):
        bin_idx = binned_feature[i]
        out[feature_idx, bin_idx].sum_gradients += all_gradients[i]
        out[feature_idx, bin_idx].sum_hessians += all_hessians[i]
        out[feature_idx, bin_idx].count += 1
cpdef void _build_histogram_root_no_hessian(
        const int feature_idx,
        const X_BINNED_DTYPE_C [::1] binned_feature,  # IN
        const G_H_DTYPE_C [::1] all_gradients,  # IN
        hist_struct [:, ::1] out) noexcept nogil:  # OUT
    """Compute histogram of the root node, not updating hessians.

    Used when the hessians of the loss are constant (typically LS loss).
    Same direct-scan, 4-way unrolled structure as _build_histogram_root,
    minus the hessian accumulation.
    """
    cdef:
        unsigned int i = 0
        unsigned int n_samples = binned_feature.shape[0]
        unsigned int unrolled_upper = (n_samples // 4) * 4
        unsigned int bin_0
        unsigned int bin_1
        unsigned int bin_2
        unsigned int bin_3
        unsigned int bin_idx

    for i in range(0, unrolled_upper, 4):
        bin_0 = binned_feature[i]
        bin_1 = binned_feature[i + 1]
        bin_2 = binned_feature[i + 2]
        bin_3 = binned_feature[i + 3]

        out[feature_idx, bin_0].sum_gradients += all_gradients[i]
        out[feature_idx, bin_1].sum_gradients += all_gradients[i + 1]
        out[feature_idx, bin_2].sum_gradients += all_gradients[i + 2]
        out[feature_idx, bin_3].sum_gradients += all_gradients[i + 3]

        out[feature_idx, bin_0].count += 1
        out[feature_idx, bin_1].count += 1
        out[feature_idx, bin_2].count += 1
        out[feature_idx, bin_3].count += 1

    for i in range(unrolled_upper, n_samples):
        bin_idx = binned_feature[i]
        out[feature_idx, bin_idx].sum_gradients += all_gradients[i]
        out[feature_idx, bin_idx].count += 1

View File

@@ -0,0 +1,20 @@
# Cython extension modules of the hist_gradient_boosting subpackage.
# Each entry maps an extension name to its generated Cython sources and
# extra dependencies. The parallelized modules depend on OpenMP; _bitset
# and common declare no extra dependencies.
hist_gradient_boosting_extension_metadata = {
  '_gradient_boosting': {'sources': [cython_gen.process('_gradient_boosting.pyx')],
                         'dependencies': [openmp_dep]},
  'histogram': {'sources': [cython_gen.process('histogram.pyx')], 'dependencies': [openmp_dep]},
  'splitting': {'sources': [cython_gen.process('splitting.pyx')], 'dependencies': [openmp_dep]},
  '_binning': {'sources': [cython_gen.process('_binning.pyx')], 'dependencies': [openmp_dep]},
  '_predictor': {'sources': [cython_gen.process('_predictor.pyx')], 'dependencies': [openmp_dep]},
  '_bitset': {'sources': [cython_gen.process('_bitset.pyx')]},
  'common': {'sources': [cython_gen.process('common.pyx')]},
}

# Build and install one extension module per metadata entry.
foreach ext_name, ext_dict : hist_gradient_boosting_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: ext_dict.get('dependencies', []),
    subdir: 'sklearn/ensemble/_hist_gradient_boosting',
    install: true
  )
endforeach

View File

@@ -0,0 +1,149 @@
"""
This module contains the TreePredictor class which is used for prediction.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
from sklearn.ensemble._hist_gradient_boosting._predictor import (
_compute_partial_dependence,
_predict_from_binned_data,
_predict_from_raw_data,
)
from sklearn.ensemble._hist_gradient_boosting.common import (
PREDICTOR_RECORD_DTYPE,
Y_DTYPE,
)
class TreePredictor:
"""Tree class used for predictions.
Parameters
----------
nodes : ndarray of PREDICTOR_RECORD_DTYPE
The nodes of the tree.
binned_left_cat_bitsets : ndarray of shape (n_categorical_splits, 8), dtype=uint32
Array of bitsets for binned categories used in predict_binned when a
split is categorical.
raw_left_cat_bitsets : ndarray of shape (n_categorical_splits, 8), dtype=uint32
Array of bitsets for raw categories used in predict when a split is
categorical.
"""
def __init__(self, nodes, binned_left_cat_bitsets, raw_left_cat_bitsets):
self.nodes = nodes
self.binned_left_cat_bitsets = binned_left_cat_bitsets
self.raw_left_cat_bitsets = raw_left_cat_bitsets
def get_n_leaf_nodes(self):
"""Return number of leaves."""
return int(self.nodes["is_leaf"].sum())
def get_max_depth(self):
"""Return maximum depth among all leaves."""
return int(self.nodes["depth"].max())
def predict(self, X, known_cat_bitsets, f_idx_map, n_threads):
"""Predict raw values for non-binned data.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
The input samples.
known_cat_bitsets : ndarray of shape (n_categorical_features, 8)
Array of bitsets of known categories, for each categorical feature.
f_idx_map : ndarray of shape (n_features,)
Map from original feature index to the corresponding index in the
known_cat_bitsets array.
n_threads : int
Number of OpenMP threads to use.
Returns
-------
y : ndarray, shape (n_samples,)
The raw predicted values.
"""
out = np.empty(X.shape[0], dtype=Y_DTYPE)
_predict_from_raw_data(
self.nodes,
X,
self.raw_left_cat_bitsets,
known_cat_bitsets,
f_idx_map,
n_threads,
out,
)
return out
def predict_binned(self, X, missing_values_bin_idx, n_threads):
"""Predict raw values for binned data.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
The input samples.
missing_values_bin_idx : uint8
Index of the bin that is used for missing values. This is the
index of the last bin and is always equal to max_bins (as passed
to the GBDT classes), or equivalently to n_bins - 1.
n_threads : int
Number of OpenMP threads to use.
Returns
-------
y : ndarray, shape (n_samples,)
The raw predicted values.
"""
out = np.empty(X.shape[0], dtype=Y_DTYPE)
_predict_from_binned_data(
self.nodes,
X,
self.binned_left_cat_bitsets,
missing_values_bin_idx,
n_threads,
out,
)
return out
def compute_partial_dependence(self, grid, target_features, out):
"""Fast partial dependence computation.
Parameters
----------
grid : ndarray, shape (n_samples, n_target_features)
The grid points on which the partial dependence should be
evaluated.
target_features : ndarray, shape (n_target_features)
The set of target features for which the partial dependence
should be evaluated.
out : ndarray, shape (n_samples)
The value of the partial dependence function on each grid
point.
"""
_compute_partial_dependence(self.nodes, grid, target_features, out)
def __setstate__(self, state):
    """Restore a pickled predictor, normalizing the ``nodes`` dtype."""
    try:
        super().__setstate__(state)
    except AttributeError:
        # No cooperative __setstate__ in the MRO: restore the instance
        # dict directly (default pickling behavior).
        self.__dict__.update(state)

    # The dtype of feature_idx is np.intp which is platform dependent. Here, we
    # make sure that saving and loading on different bitness systems works without
    # errors. For instance, on a 64 bit Python runtime, np.intp = np.int64,
    # while on 32 bit np.intp = np.int32.
    #
    # TODO: consider always using platform agnostic dtypes for fitted
    # estimator attributes. For this particular estimator, this would
    # mean replacing the intp field of PREDICTOR_RECORD_DTYPE by an int32
    # field. Ideally this should be done consistently throughout
    # scikit-learn along with a common test.
    if self.nodes.dtype != PREDICTOR_RECORD_DTYPE:
        self.nodes = self.nodes.astype(PREDICTOR_RECORD_DTYPE, casting="same_kind")

View File

@@ -0,0 +1,489 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.ensemble._hist_gradient_boosting.binning import (
_BinMapper,
_find_binning_thresholds,
_map_to_bins,
)
from sklearn.ensemble._hist_gradient_boosting.common import (
ALMOST_INF,
X_BINNED_DTYPE,
X_DTYPE,
)
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
n_threads = _openmp_effective_n_threads()

# Shared fixture: 1e6 samples, two Gaussian features with very different
# locations and scales (std 1 vs 0.01), cast to the binning input dtype.
DATA = (
    np.random.RandomState(42)
    .normal(loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2))
    .astype(X_DTYPE)
)
def test_find_binning_thresholds_regular_data():
    # Evenly spaced data: thresholds land exactly on the obvious grid points.
    values = np.linspace(0, 10, 1001)
    assert_allclose(
        _find_binning_thresholds(values, max_bins=10), [1, 2, 3, 4, 5, 6, 7, 8, 9]
    )
    assert_allclose(_find_binning_thresholds(values, max_bins=5), [2, 4, 6, 8])
def test_find_binning_thresholds_small_regular_data():
    # 11 distinct points: once max_bins exceeds the number of distinct values,
    # thresholds saturate at the 10 midpoints.
    values = np.linspace(0, 10, 11)
    expected_by_max_bins = {
        5: [2, 4, 6, 8],
        10: [1, 2, 3, 4, 5, 6, 7, 8, 9],
        11: np.arange(10) + 0.5,
        255: np.arange(10) + 0.5,
    }
    for max_bins, expected in expected_by_max_bins.items():
        assert_allclose(_find_binning_thresholds(values, max_bins=max_bins), expected)
def test_find_binning_thresholds_random_data():
    # Thresholds have max_bins - 1 entries and track the quantiles of the
    # (roughly normal) feature distributions.
    per_feature = [
        _find_binning_thresholds(DATA[:, feature], max_bins=255) for feature in range(2)
    ]
    for thresholds in per_feature:
        assert thresholds.shape == (254,)  # 255 - 1
        assert thresholds.dtype == DATA.dtype
    assert_allclose(per_feature[0][[64, 128, 192]], [-0.7, 0.0, 0.7], atol=1e-1)
    assert_allclose(per_feature[1][[64, 128, 192]], [9.99, 10.00, 10.01], atol=1e-2)
def test_find_binning_thresholds_low_n_bins():
    # Shape and dtype checks for a smaller bin budget.
    for feature in range(2):
        thresholds = _find_binning_thresholds(DATA[:, feature], max_bins=128)
        assert thresholds.shape == (127,)  # 128 - 1
        assert thresholds.dtype == DATA.dtype
@pytest.mark.parametrize("n_bins", (2, 257))
def test_invalid_n_bins(n_bins):
    # n_bins must lie in [3, 256]: one bin is reserved for missing values and
    # bin indices must fit in a uint8.
    expected_msg = f"n_bins={n_bins} should be no smaller than 3 and no larger than 256"
    with pytest.raises(ValueError, match=expected_msg):
        _BinMapper(n_bins=n_bins).fit(DATA)
def test_bin_mapper_n_features_transform():
    # transform() rejects inputs whose feature count differs from fit time.
    mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)
    with pytest.raises(
        ValueError, match="This estimator was fitted with 2 features but 4 got passed"
    ):
        mapper.transform(np.repeat(DATA, 2, axis=1))
@pytest.mark.parametrize("max_bins", [16, 128, 255])
def test_map_to_bins(max_bins):
    # _map_to_bins writes uint8 bin indices into an F-contiguous output, and
    # the per-feature extremes land in the first and last non-missing bins.
    bin_thresholds = [
        _find_binning_thresholds(DATA[:, f], max_bins=max_bins) for f in range(2)
    ]
    binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F")
    is_categorical = np.zeros(2, dtype=np.uint8)
    _map_to_bins(DATA, bin_thresholds, is_categorical, max_bins, n_threads, binned)

    assert binned.shape == DATA.shape
    assert binned.dtype == np.uint8
    assert binned.flags.f_contiguous

    for feature_idx, row in enumerate(DATA.argmin(axis=0)):
        assert binned[row, feature_idx] == 0
    for feature_idx, row in enumerate(DATA.argmax(axis=0)):
        assert binned[row, feature_idx] == max_bins - 1
@pytest.mark.parametrize("max_bins", [5, 10, 42])
def test_bin_mapper_random_data(max_bins):
    # Quantile binning on a large random sample should yield approximately
    # equal-sized bins spanning the full [0, max_bins - 1] index range.
    n_samples, n_features = DATA.shape
    expected_count_per_bin = n_samples // max_bins
    tol = int(0.05 * expected_count_per_bin)  # allow 5% imbalance per bin

    # max_bins is the number of bins for non-missing values
    n_bins = max_bins + 1
    mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA)
    binned = mapper.transform(DATA)

    assert binned.shape == (n_samples, n_features)
    assert binned.dtype == np.uint8
    assert_array_equal(binned.min(axis=0), np.array([0, 0]))
    assert_array_equal(binned.max(axis=0), np.array([max_bins - 1, max_bins - 1]))
    assert len(mapper.bin_thresholds_) == n_features
    for bin_thresholds_feature in mapper.bin_thresholds_:
        assert bin_thresholds_feature.shape == (max_bins - 1,)
        assert bin_thresholds_feature.dtype == DATA.dtype
    assert np.all(mapper.n_bins_non_missing_ == max_bins)

    # Check that the binned data is approximately balanced across bins.
    for feature_idx in range(n_features):
        for bin_idx in range(max_bins):
            count = (binned[:, feature_idx] == bin_idx).sum()
            assert abs(count - expected_count_per_bin) < tol
@pytest.mark.parametrize("n_samples, max_bins", [(5, 5), (5, 10), (5, 11), (42, 255)])
def test_bin_mapper_small_random_data(n_samples, max_bins):
    # When every value is distinct and fits into the available bins, binning
    # preserves the value ordering exactly.
    data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
    assert len(np.unique(data)) == n_samples

    # max_bins counts only non-missing bins, hence the +1.
    mapper = _BinMapper(n_bins=max_bins + 1, random_state=42)
    binned = mapper.fit_transform(data)

    assert binned.shape == data.shape
    assert binned.dtype == np.uint8
    order = np.argsort(data.ravel())
    assert_array_equal(binned.ravel()[order], np.arange(n_samples))
@pytest.mark.parametrize(
    "max_bins, n_distinct, multiplier",
    [
        (5, 5, 1),
        (5, 5, 3),
        (255, 12, 42),
    ],
)
def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
    # Integer data 0..n_distinct-1 (each value possibly repeated) is its own
    # binning whenever there are at least as many bins as distinct values.
    data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
    # max_bins counts only non-missing bins, hence the +1.
    binned = _BinMapper(n_bins=max_bins + 1).fit_transform(data)
    assert_array_equal(data, binned)
@pytest.mark.parametrize("n_distinct", [2, 7, 42])
def test_bin_mapper_repeated_values_invariance(n_distinct):
    # Thresholds should depend only on the set of distinct values present,
    # not on how often each value is repeated.
    rng = np.random.RandomState(42)
    distinct_values = rng.normal(size=n_distinct)
    assert len(np.unique(distinct_values)) == n_distinct

    repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
    data = distinct_values[repeated_indices]
    rng.shuffle(data)
    assert_array_equal(np.unique(data), np.sort(distinct_values))

    data = data.reshape(-1, 1)

    mapper_1 = _BinMapper(n_bins=n_distinct + 1)
    binned_1 = mapper_1.fit_transform(data)
    assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))

    # Adding more bins to the mapper yields the same results (same thresholds)
    mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1)
    binned_2 = mapper_2.fit_transform(data)

    assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
    assert_array_equal(binned_1, binned_2)
@pytest.mark.parametrize(
    "max_bins, scale, offset",
    [
        (3, 2, -1),
        (42, 1, 0),
        (255, 0.3, 42),
    ],
)
def test_bin_mapper_identity_small(max_bins, scale, offset):
    # An affine transform of 0..max_bins-1 bins back to 0..max_bins-1.
    indices = np.arange(max_bins).reshape(-1, 1)
    data = indices * scale + offset
    # max_bins counts only non-missing bins, hence the +1.
    binned = _BinMapper(n_bins=max_bins + 1).fit_transform(data)
    assert_array_equal(binned, indices)
@pytest.mark.parametrize(
    "max_bins_small, max_bins_large",
    [
        (2, 2),
        (3, 3),
        (4, 4),
        (42, 42),
        (255, 255),
        (5, 17),
        (42, 255),
    ],
)
def test_bin_mapper_idempotence(max_bins_small, max_bins_large):
    """Re-binning already-binned data with at least as many bins is a no-op.

    Bug fix: ``mapper_large`` was previously built with ``max_bins_small + 1``
    as well, so the (5, 17) and (42, 255) parametrizations never actually
    exercised a larger bin budget.
    """
    assert max_bins_large >= max_bins_small
    data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
    mapper_small = _BinMapper(n_bins=max_bins_small + 1)
    mapper_large = _BinMapper(n_bins=max_bins_large + 1)
    binned_small = mapper_small.fit_transform(data)
    # binned_small has at most max_bins_small distinct integer values, which
    # all fit in the larger mapper, so it must map to itself.
    binned_large = mapper_large.fit_transform(binned_small)
    assert_array_equal(binned_small, binned_large)
@pytest.mark.parametrize("n_bins", [10, 100, 256])
@pytest.mark.parametrize("diff", [-5, 0, 5])
def test_n_bins_non_missing(n_bins, diff):
    # n_bins_non_missing_ equals the number of unique values when it fits in
    # the non-missing bins, and is capped at n_bins - 1 otherwise.
    n_unique_values = n_bins + diff
    X = np.array(list(range(n_unique_values)) * 2).reshape(-1, 1)
    mapper = _BinMapper(n_bins=n_bins).fit(X)
    assert np.all(mapper.n_bins_non_missing_ == min(n_bins - 1, n_unique_values))
def test_subsample():
    # Subsampling changes which samples the thresholds are computed from, so
    # the resulting thresholds must differ from the full-data ones.
    full = _BinMapper(subsample=None, random_state=0).fit(DATA)
    sub = _BinMapper(subsample=256, random_state=0).fit(DATA)
    for feature in range(DATA.shape[1]):
        assert not np.allclose(
            full.bin_thresholds_[feature],
            sub.bin_thresholds_[feature],
            rtol=1e-4,
        )
@pytest.mark.parametrize(
    "n_bins, n_bins_non_missing, X_trans_expected",
    [
        (
            256,
            [4, 2, 2],
            [
                [0, 0, 0],  # 255 <=> missing value
                [255, 255, 0],
                [1, 0, 0],
                [255, 1, 1],
                [2, 1, 1],
                [3, 0, 0],
            ],
        ),
        (
            3,
            [2, 2, 2],
            [
                [0, 0, 0],  # 2 <=> missing value
                [2, 2, 0],
                [0, 0, 0],
                [2, 1, 1],
                [1, 1, 1],
                [1, 0, 0],
            ],
        ),
    ],
)
def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
    # check for missing values: make sure nans are mapped to the last bin
    # and that the _BinMapper attributes are correct

    X = [
        [1, 1, 0],
        [np.nan, np.nan, 0],
        [2, 1, 0],
        [np.nan, 2, 1],
        [3, 2, 1],
        [4, 1, 0],
    ]

    X = np.array(X)

    mapper = _BinMapper(n_bins=n_bins)
    mapper.fit(X)

    assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)

    # There is always one threshold fewer than non-missing bins per feature.
    for feature_idx in range(X.shape[1]):
        assert (
            len(mapper.bin_thresholds_[feature_idx])
            == n_bins_non_missing[feature_idx] - 1
        )

    # The last bin index is reserved for missing values.
    assert mapper.missing_values_bin_idx_ == n_bins - 1

    X_trans = mapper.transform(X)
    assert_array_equal(X_trans, X_trans_expected)
def test_infinite_values():
    # Infinite values get their own extreme bins; the +inf threshold is capped
    # at ALMOST_INF.
    bin_mapper = _BinMapper()
    X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
    bin_mapper.fit(X)

    assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, 0.5, ALMOST_INF])
    assert bin_mapper.n_bins_non_missing_ == [4]

    expected = np.array([0, 1, 2, 3]).reshape(-1, 1)
    assert_array_equal(bin_mapper.transform(X), expected)
@pytest.mark.parametrize("n_bins", [15, 256])
def test_categorical_feature(n_bins):
    # Basic test for categorical features
    # we make sure that categories are mapped into [0, n_categories - 1] and
    # that nans are mapped to the last bin
    X = np.array(
        [[4] * 500 + [1] * 3 + [10] * 4 + [0] * 4 + [13] + [7] * 5 + [np.nan] * 2],
        dtype=X_DTYPE,
    ).T
    known_categories = [np.unique(X[~np.isnan(X)])]

    bin_mapper = _BinMapper(
        n_bins=n_bins,
        is_categorical=np.array([True]),
        known_categories=known_categories,
    ).fit(X)
    assert bin_mapper.n_bins_non_missing_ == [6]
    # For categorical features the "thresholds" are the sorted categories.
    assert_array_equal(bin_mapper.bin_thresholds_[0], [0, 1, 4, 7, 10, 13])

    X = np.array([[0, 1, 4, np.nan, 7, 10, 13]], dtype=X_DTYPE).T
    expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T
    assert_array_equal(bin_mapper.transform(X), expected_trans)

    # Negative categories are mapped to the missing values' bin
    # (i.e. the bin of index `missing_values_bin_idx_ == n_bins - 1).
    # Unknown positive categories does not happen in practice and tested
    # for illustration purpose.
    X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T
    expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T
    assert_array_equal(bin_mapper.transform(X), expected_trans)
def test_categorical_feature_negative_missing():
    """Make sure bin mapper treats negative categories as missing values."""
    X = np.array(
        [[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE
    ).T
    bin_mapper = _BinMapper(
        n_bins=4,
        is_categorical=np.array([True]),
        known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)],
    ).fit(X)

    # Only the 3 known non-negative categories get their own bins.
    assert bin_mapper.n_bins_non_missing_ == [3]

    X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T

    # Negative values for categorical features are considered as missing values.
    # They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`,
    # which is 3 here.
    assert bin_mapper.missing_values_bin_idx_ == 3
    expected_trans = np.array([[3, 0, 1, 2, 3]]).T
    assert_array_equal(bin_mapper.transform(X), expected_trans)
@pytest.mark.parametrize("n_bins", (128, 256))
def test_categorical_with_numerical_features(n_bins):
    # basic check for binmapper with mixed data
    X1 = np.arange(10, 20).reshape(-1, 1)  # numerical
    X2 = np.arange(10, 15).reshape(-1, 1)  # categorical
    X2 = np.r_[X2, X2]
    X = np.c_[X1, X2]
    # known_categories must be None for the numerical feature.
    known_categories = [None, np.unique(X2).astype(X_DTYPE)]

    bin_mapper = _BinMapper(
        n_bins=n_bins,
        is_categorical=np.array([False, True]),
        known_categories=known_categories,
    ).fit(X)

    assert_array_equal(bin_mapper.n_bins_non_missing_, [10, 5])

    bin_thresholds = bin_mapper.bin_thresholds_
    assert len(bin_thresholds) == 2
    # For the categorical feature the "thresholds" are the sorted categories.
    assert_array_equal(bin_thresholds[1], np.arange(10, 15))

    expected_X_trans = [
        [0, 0],
        [1, 1],
        [2, 2],
        [3, 3],
        [4, 4],
        [5, 0],
        [6, 1],
        [7, 2],
        [8, 3],
        [9, 4],
    ]
    assert_array_equal(bin_mapper.transform(X), expected_X_trans)
def test_make_known_categories_bitsets():
    # Check the output of make_known_categories_bitsets
    X = np.array(
        [[14, 2, 30], [30, 4, 70], [40, 10, 180], [40, 240, 180]], dtype=X_DTYPE
    )

    bin_mapper = _BinMapper(
        n_bins=256,
        is_categorical=np.array([False, True, True]),
        known_categories=[None, X[:, 1], X[:, 2]],
    )
    bin_mapper.fit(X)

    known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets()

    # Note that for non-categorical features, values are left to 0
    expected_f_idx_map = np.array([0, 0, 1], dtype=np.uint8)
    assert_allclose(expected_f_idx_map, f_idx_map)

    # Each bitset row is 8 uint32 words; category c is stored at word c // 32,
    # bit c % 32.
    expected_cat_bitset = np.zeros((2, 8), dtype=np.uint32)

    # first categorical feature: [2, 4, 10, 240]
    f_idx = 1
    mapped_f_idx = f_idx_map[f_idx]
    expected_cat_bitset[mapped_f_idx, 0] = 2**2 + 2**4 + 2**10
    # 240 = 7 * 32 + 16, therefore bit 16 of word 7 is set.
    expected_cat_bitset[mapped_f_idx, 7] = 2**16

    # second categorical feature [30, 70, 180]
    f_idx = 2
    mapped_f_idx = f_idx_map[f_idx]
    expected_cat_bitset[mapped_f_idx, 0] = 2**30
    expected_cat_bitset[mapped_f_idx, 2] = 2**6  # 70 = 2 * 32 + 6
    expected_cat_bitset[mapped_f_idx, 5] = 2**20  # 180 = 5 * 32 + 20

    assert_allclose(expected_cat_bitset, known_cat_bitsets)
@pytest.mark.parametrize(
    "is_categorical, known_categories, match",
    [
        (np.array([True]), [None], "Known categories for feature 0 must be provided"),
        (
            np.array([False]),
            np.array([1, 2, 3]),
            "isn't marked as a categorical feature, but categories were passed",
        ),
    ],
)
def test_categorical_parameters(is_categorical, known_categories, match):
    # is_categorical and known_categories must be mutually consistent; fit()
    # validates this and raises otherwise.
    X = np.array([[1, 2, 3]], dtype=X_DTYPE)
    mapper = _BinMapper(
        is_categorical=is_categorical, known_categories=known_categories
    )
    with pytest.raises(ValueError, match=match):
        mapper.fit(X)

View File

@@ -0,0 +1,64 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.ensemble._hist_gradient_boosting._bitset import (
in_bitset_memoryview,
set_bitset_memoryview,
set_raw_bitset_from_binned_bitset,
)
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
@pytest.mark.parametrize(
    "values_to_insert, expected_bitset",
    [
        ([0, 4, 33], np.array([2**0 + 2**4, 2**1, 0], dtype=np.uint32)),
        (
            [31, 32, 33, 79],
            np.array([2**31, 2**0 + 2**1, 2**15], dtype=np.uint32),
        ),
    ],
)
def test_set_get_bitset(values_to_insert, expected_bitset):
    # A bitset is an array of 32-bit words: value v lives at word v // 32,
    # bit v % 32. Setting then probing every representable value must agree
    # with the inserted set.
    n_32bits_ints = 3
    bitset = np.zeros(n_32bits_ints, dtype=np.uint32)

    for value in values_to_insert:
        set_bitset_memoryview(bitset, value)
    assert_allclose(expected_bitset, bitset)

    inserted = set(values_to_insert)
    for value in range(32 * n_32bits_ints):
        assert bool(in_bitset_memoryview(bitset, value)) == (value in inserted)
@pytest.mark.parametrize(
    "raw_categories, binned_cat_to_insert, expected_raw_bitset",
    [
        (
            [3, 4, 5, 10, 31, 32, 43],
            [0, 2, 4, 5, 6],
            [2**3 + 2**5 + 2**31, 2**0 + 2**11],
        ),
        ([3, 33, 50, 52], [1, 3], [0, 2**1 + 2**20]),
    ],
)
def test_raw_bitset_from_binned_bitset(
    raw_categories, binned_cat_to_insert, expected_raw_bitset
):
    # A binned bitset stores *bin indices*; the raw bitset stores the original
    # category values. set_raw_bitset_from_binned_bitset translates the former
    # into the latter, using the sorted raw categories as the bin -> value map.
    binned_bitset = np.zeros(2, dtype=np.uint32)
    raw_bitset = np.zeros(2, dtype=np.uint32)
    raw_categories = np.asarray(raw_categories, dtype=X_DTYPE)

    for val in binned_cat_to_insert:
        set_bitset_memoryview(binned_bitset, val)

    set_raw_bitset_from_binned_bitset(raw_bitset, binned_bitset, raw_categories)

    assert_allclose(expected_raw_bitset, raw_bitset)

    # Membership in the raw bitset matches membership in the binned bitset.
    for binned_cat_val, raw_cat_val in enumerate(raw_categories):
        if binned_cat_val in binned_cat_to_insert:
            assert in_bitset_memoryview(raw_bitset, raw_cat_val)
        else:
            assert not in_bitset_memoryview(raw_bitset, raw_cat_val)

View File

@@ -0,0 +1,279 @@
import numpy as np
import pytest
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import (
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
)
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize(
    "loss",
    [
        "squared_error",
        "poisson",
        pytest.param(
            "gamma",
            marks=pytest.mark.skip("LightGBM with gamma loss has larger deviation."),
        ),
    ],
)
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (1000, 8),
    ],
)
def test_same_predictions_regression(
    seed, loss, min_samples_leaf, n_samples, max_leaf_nodes
):
    # Make sure sklearn has the same predictions as lightgbm for easy targets.
    #
    # In particular when the size of the trees are bound and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and sklearn should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.
    # - We don't check the absolute_error loss here. This is because
    #   LightGBM's computation of the median (used for the initial value of
    #   raw_prediction) is a bit off (they'll e.g. return midpoints when there
    #   is no need to.). Since these tests only run 1 iteration, the
    #   discrepancy between the initial values leads to biggish differences in
    #   the predictions. These differences are much smaller with more
    #   iterations.
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_regression(
        n_samples=n_samples, n_features=5, n_informative=5, random_state=0
    )

    if loss in ("gamma", "poisson"):
        # make the target positive
        y = np.abs(y) + np.mean(np.abs(y))

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        loss=loss,
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")
    est_lightgbm.set_params(min_sum_hessian_in_leaf=0)

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    if loss in ("gamma", "poisson"):
        # More than 65% of the predictions must be close up to the 2nd decimal.
        # TODO: We are not entirely satisfied with this lax comparison, but the root
        # cause is not clear, maybe algorithmic differences. One such example is the
        # poisson_max_delta_step parameter of LightGBM which does not exist in HGBT.
        assert (
            np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-2, atol=1e-2))
            > 0.65
        )
    else:
        # Less than 1% of the predictions may deviate more than 1e-3 in relative terms.
        assert np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-3)) > 1 - 0.01

    if max_leaf_nodes < 10 and n_samples >= 1000 and loss in ("squared_error",):
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        # Less than 1% of the predictions may deviate more than 1e-4 in relative terms.
        assert np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-4)) > 1 - 0.01
@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (1000, 8),
    ],
)
def test_same_predictions_classification(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    n_classes = 2
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="log_loss",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(
        est_sklearn, lib="lightgbm", n_classes=n_classes
    )

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    # At least 89% of the hard predictions must agree on the training set.
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (10000, 8),
    ],
)
def test_same_predictions_multiclass_classification(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    """Same as test_same_predictions_regression but for multiclass
    classification.

    Bug fix: the held-out-data branch previously compared ``predict_proba``
    on ``X_train`` instead of ``X_test`` (copy-paste error), so the test-set
    probability comparison was never actually performed.
    """
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    n_classes = 3
    max_iter = 1
    max_bins = 255
    lr = 1

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="log_loss",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(
        est_sklearn, lib="lightgbm", n_classes=n_classes
    )

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)

    np.testing.assert_allclose(acc_lightgbm, acc_sklearn, rtol=0, atol=5e-2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        # Bug fix: compare probabilities on the held-out set, not X_train.
        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

View File

@@ -0,0 +1,650 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from pytest import approx
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import (
G_H_DTYPE,
X_BINNED_DTYPE,
X_BITSET_INNER_DTYPE,
X_DTYPE,
Y_DTYPE,
)
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
n_threads = _openmp_effective_n_threads()
def _make_training_data(n_bins=256, constant_hessian=True):
    # Build an already-binned 2-feature dataset whose target follows a small,
    # asymmetric decision tree so grower tests can check exact recovery.
    rng = np.random.RandomState(42)
    n_samples = 10000

    # Generate some test data directly binned so as to test the grower code
    # independently of the binning logic.
    X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2), dtype=X_BINNED_DTYPE)
    X_binned = np.asfortranarray(X_binned)

    def true_decision_function(input_features):
        """Ground truth decision function

        This is a very simple yet asymmetric decision tree. Therefore the
        grower code should have no trouble recovering the decision function
        from 10000 training samples.
        """
        if input_features[0] <= n_bins // 2:
            return -1
        else:
            return -1 if input_features[1] <= n_bins // 3 else 1

    target = np.array([true_decision_function(x) for x in X_binned], dtype=Y_DTYPE)

    # Assume a square loss applied to an initial model that always predicts 0
    # (hardcoded for this test):
    all_gradients = target.astype(G_H_DTYPE)
    shape_hessians = 1 if constant_hessian else all_gradients.shape
    all_hessians = np.ones(shape=shape_hessians, dtype=G_H_DTYPE)

    return X_binned, all_gradients, all_hessians
def _check_children_consistency(parent, left, right):
    """Assert that a parent's samples are partitioned exactly between its two
    children: nothing lost, nothing duplicated, nothing sent to both sides."""
    assert parent.left_child is left
    assert parent.right_child is right

    left_set = set(left.sample_indices)
    right_set = set(right.sample_indices)
    parent_set = set(parent.sample_indices)

    # each sample from the parent ends up in exactly one child
    assert len(left.sample_indices) + len(right.sample_indices) == len(
        parent.sample_indices
    )
    assert left_set | right_set == parent_set
    assert not (left_set & right_set)
@pytest.mark.parametrize(
    "n_bins, constant_hessian, stopping_param, shrinkage",
    [
        (11, True, "min_gain_to_split", 0.5),
        (11, False, "min_gain_to_split", 1.0),
        (11, True, "max_leaf_nodes", 1.0),
        (11, False, "max_leaf_nodes", 0.1),
        (42, True, "max_leaf_nodes", 0.01),
        (42, False, "max_leaf_nodes", 1.0),
        (256, True, "min_gain_to_split", 1.0),
        (256, True, "max_leaf_nodes", 0.1),
    ],
)
def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
    # Grow the tree split by split and check, at each step, that the grower
    # recovers the known ground-truth structure of _make_training_data.
    X_binned, all_gradients, all_hessians = _make_training_data(
        n_bins=n_bins, constant_hessian=constant_hessian
    )
    n_samples = X_binned.shape[0]

    if stopping_param == "max_leaf_nodes":
        stopping_param = {"max_leaf_nodes": 3}
    else:
        stopping_param = {"min_gain_to_split": 0.01}

    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=shrinkage,
        min_samples_leaf=1,
        **stopping_param,
    )

    # The root node is not yet split, but the best possible split has
    # already been evaluated:
    assert grower.root.left_child is None
    assert grower.root.right_child is None

    root_split = grower.root.split_info
    assert root_split.feature_idx == 0
    assert root_split.bin_idx == n_bins // 2
    assert len(grower.splittable_nodes) == 1

    # Calling split next applies the next split and computes the best split
    # for each of the two newly introduced children nodes.
    left_node, right_node = grower.split_next()

    # All training samples have been split in the two nodes, approximately
    # 50%/50%
    _check_children_consistency(grower.root, left_node, right_node)
    assert len(left_node.sample_indices) > 0.4 * n_samples
    assert len(left_node.sample_indices) < 0.6 * n_samples

    if grower.min_gain_to_split > 0:
        # The left node is too pure: there is no gain to split it further.
        assert left_node.split_info.gain < grower.min_gain_to_split
        assert left_node in grower.finalized_leaves

    # The right node can still be split further, this time on feature #1
    split_info = right_node.split_info
    assert split_info.gain > 1.0
    assert split_info.feature_idx == 1
    assert split_info.bin_idx == n_bins // 3
    assert right_node.left_child is None
    assert right_node.right_child is None

    # The right split has not been applied yet. Let's do it now:
    assert len(grower.splittable_nodes) == 1
    right_left_node, right_right_node = grower.split_next()
    _check_children_consistency(right_node, right_left_node, right_right_node)
    assert len(right_left_node.sample_indices) > 0.1 * n_samples
    assert len(right_left_node.sample_indices) < 0.2 * n_samples

    assert len(right_right_node.sample_indices) > 0.2 * n_samples
    assert len(right_right_node.sample_indices) < 0.4 * n_samples

    # All the leafs are pure, it is not possible to split any further:
    assert not grower.splittable_nodes

    grower._apply_shrinkage()

    # Check the values of the leaves:
    assert grower.root.left_child.value == approx(shrinkage)
    assert grower.root.right_child.left_child.value == approx(shrinkage)
    assert grower.root.right_child.right_child.value == approx(-shrinkage, rel=1e-3)
def test_predictor_from_grower():
    # Build a tree on the toy 3-leaf dataset to extract the predictor.
    n_bins = 256
    X_binned, all_gradients, all_hessians = _make_training_data(n_bins=n_bins)
    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=1.0,
        max_leaf_nodes=3,
        min_samples_leaf=5,
    )
    grower.grow()
    assert grower.n_nodes == 5  # (2 decision nodes + 3 leaves)

    # Check that the node structure can be converted into a predictor
    # object to perform predictions at scale
    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], n_bins))
    )
    assert predictor.nodes.shape[0] == 5
    assert predictor.nodes["is_leaf"].sum() == 3

    # Probe some predictions for each leaf of the tree
    # each group of 3 samples corresponds to a condition in _make_training_data
    input_data = np.array(
        [
            [0, 0],
            [42, 99],
            [128, 254],
            [129, 0],
            [129, 85],
            [254, 85],
            [129, 86],
            [129, 254],
            [242, 100],
        ],
        dtype=np.uint8,
    )
    missing_values_bin_idx = n_bins - 1
    predictions = predictor.predict_binned(
        input_data, missing_values_bin_idx, n_threads
    )
    expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1]
    assert np.allclose(predictions, expected_targets)

    # Check that training set can be recovered exactly:
    predictions = predictor.predict_binned(X_binned, missing_values_bin_idx, n_threads)
    assert np.allclose(predictions, -all_gradients)
@pytest.mark.parametrize(
    "n_samples, min_samples_leaf, n_bins, constant_hessian, noise",
    [
        (11, 10, 7, True, 0),
        (13, 10, 42, False, 0),
        (56, 10, 255, True, 0.1),
        (101, 3, 7, True, 0),
        (200, 42, 42, False, 0),
        (300, 55, 255, True, 0.1),
        (300, 301, 255, True, 0.1),
    ],
)
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, constant_hessian, noise):
    """Check that every leaf of a grown tree holds at least min_samples_leaf
    samples; if min_samples_leaf exceeds n_samples the tree must reduce to a
    single root node containing all samples.
    """
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)
    all_gradients = y.astype(G_H_DTYPE)
    # A hessian array of shape 1 encodes the "constant hessian" case.
    shape_hessian = 1 if constant_hessian else all_gradients.shape
    all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
    grower = TreeGrower(
        X,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=1.0,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=n_samples,
    )
    grower.grow()
    predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_)
    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node["is_leaf"]:
                assert node["count"] >= min_samples_leaf
    else:
        # min_samples_leaf cannot be satisfied: the root is never split.
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]["is_leaf"]
        assert predictor.nodes[0]["count"] == n_samples
@pytest.mark.parametrize("n_samples, min_samples_leaf", [(99, 50), (100, 50)])
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    """Check that the root is only split when both children can satisfy
    min_samples_leaf, i.e. when n_samples >= 2 * min_samples_leaf.
    """
    # Make sure root node isn't split if n_samples is not at least twice
    # min_samples_leaf
    rng = np.random.RandomState(seed=0)
    n_bins = 256
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)
    all_gradients = y.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(
        X,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=1.0,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=n_samples,
    )
    grower.grow()
    if n_samples >= min_samples_leaf * 2:
        assert len(grower.finalized_leaves) >= 2
    else:
        assert len(grower.finalized_leaves) == 1
def assert_is_stump(grower):
    """Check that the grown tree is a stump: a root whose two children are
    both leaves. Used to verify that max_depth=1 produces no deeper splits.
    """
    root = grower.root
    for child in (root.left_child, root.right_child):
        assert child.left_child is None
        assert child.right_child is None
@pytest.mark.parametrize("max_depth", [1, 2, 3])
def test_max_depth(max_depth):
    """Check that the deepest finalized leaf sits exactly at max_depth."""
    # Make sure max_depth parameter works as expected
    rng = np.random.RandomState(seed=0)
    n_bins = 256
    n_samples = 1000
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)
    all_gradients = y.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth)
    grower.grow()
    depth = max(leaf.depth for leaf in grower.finalized_leaves)
    assert depth == max_depth
    if max_depth == 1:
        # depth 1 means a stump: root plus two leaves
        assert_is_stump(grower)
def test_input_validation():
    """Check that TreeGrower rejects non-uint8 and non-Fortran X_binned."""
    X_binned, all_gradients, all_hessians = _make_training_data()
    X_binned_float = X_binned.astype(np.float32)
    with pytest.raises(NotImplementedError, match="X_binned must be of type uint8"):
        TreeGrower(X_binned_float, all_gradients, all_hessians)
    X_binned_C_array = np.ascontiguousarray(X_binned)
    with pytest.raises(
        ValueError, match="X_binned should be passed as Fortran contiguous array"
    ):
        TreeGrower(X_binned_C_array, all_gradients, all_hessians)
def test_init_parameters_validation():
    """Check that negative split hyperparameters raise informative errors."""
    X_binned, all_gradients, all_hessians = _make_training_data()
    with pytest.raises(ValueError, match="min_gain_to_split=-1 must be positive"):
        TreeGrower(X_binned, all_gradients, all_hessians, min_gain_to_split=-1)
    with pytest.raises(ValueError, match="min_hessian_to_split=-1 must be positive"):
        TreeGrower(X_binned, all_gradients, all_hessians, min_hessian_to_split=-1)
def test_missing_value_predict_only():
    """Check missing-value support at predict time when no missing values were
    seen in training: nans must follow the child with the most samples.
    """
    # Make sure that missing values are supported at predict time even if they
    # were not encountered in the training data: the missing values are
    # assigned to whichever child has the most samples.
    rng = np.random.RandomState(0)
    n_samples = 100
    X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)
    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(
        X_binned, gradients, hessians, min_samples_leaf=5, has_missing_values=False
    )
    grower.grow()
    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1))
    )
    # go from root to a leaf, always following node with the most samples.
    # That's the path nans are supposed to take
    node = predictor.nodes[0]
    while not node["is_leaf"]:
        left = predictor.nodes[node["left"]]
        right = predictor.nodes[node["right"]]
        node = left if left["count"] > right["count"] else right
    prediction_main_path = node["value"]
    # now build X_test with only nans, and make sure all predictions are equal
    # to prediction_main_path
    all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
    # empty bitsets: there are no categorical features in this test
    known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    f_idx_map = np.zeros(0, dtype=np.uint32)
    y_pred = predictor.predict(all_nans, known_cat_bitsets, f_idx_map, n_threads)
    assert np.all(y_pred == prediction_main_path)
def test_split_on_nan_with_infinite_values():
    """Check "split on nan" behavior in the presence of +inf samples.

    A split on nan uses +inf as its numerical threshold; the +inf sample must
    still be mapped to the left child, in both predict and predict_binned.
    """
    # Make sure the split on nan situations are respected even when there are
    # samples with +inf values (we set the threshold to +inf when we have a
    # split on nan so this test makes sure this does not introduce edge-case
    # bugs). We need to use the private API so that we can also test
    # predict_binned().
    X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
    # the gradient values will force a split on nan situation
    gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)
    n_bins_non_missing = 3
    has_missing_values = True
    grower = TreeGrower(
        X_binned,
        gradients,
        hessians,
        n_bins_non_missing=n_bins_non_missing,
        has_missing_values=has_missing_values,
        min_samples_leaf=1,
        n_threads=n_threads,
    )
    grower.grow()
    predictor = grower.make_predictor(binning_thresholds=bin_mapper.bin_thresholds_)
    # sanity check: this was a split on nan
    assert predictor.nodes[0]["num_threshold"] == np.inf
    assert predictor.nodes[0]["bin_threshold"] == n_bins_non_missing - 1
    known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets()
    # Make sure in particular that the +inf sample is mapped to the left child
    # Note that lightgbm "fails" here and will assign the inf sample to the
    # right child, even though it's a "split on nan" situation.
    predictions = predictor.predict(X, known_cat_bitsets, f_idx_map, n_threads)
    predictions_binned = predictor.predict_binned(
        X_binned,
        missing_values_bin_idx=bin_mapper.missing_values_bin_idx_,
        n_threads=n_threads,
    )
    np.testing.assert_allclose(predictions, -gradients)
    np.testing.assert_allclose(predictions_binned, -gradients)
def test_grow_tree_categories():
    """Check the predictor tree produced by a single categorical split,
    including left-child bitsets and missing-value routing.
    """
    # Check that the grower produces the right predictor tree when a split is
    # categorical
    X_binned = np.array([[0, 1] * 11 + [1]], dtype=X_BINNED_DTYPE).T
    X_binned = np.asfortranarray(X_binned)
    all_gradients = np.array([10, 1] * 11 + [1], dtype=G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    is_categorical = np.ones(1, dtype=np.uint8)
    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=4,
        shrinkage=1.0,
        min_samples_leaf=1,
        is_categorical=is_categorical,
        n_threads=n_threads,
    )
    grower.grow()
    assert grower.n_nodes == 3
    # raw category values: bin 0 maps to category 4, bin 1 to category 9
    categories = [np.array([4, 9], dtype=X_DTYPE)]
    predictor = grower.make_predictor(binning_thresholds=categories)
    root = predictor.nodes[0]
    assert root["count"] == 23
    assert root["depth"] == 0
    assert root["is_categorical"]
    left, right = predictor.nodes[root["left"]], predictor.nodes[root["right"]]
    # arbitrary validation, but this means ones go to the left.
    assert left["count"] >= right["count"]
    # check binned category value (1)
    expected_binned_cat_bitset = [2**1] + [0] * 7
    binned_cat_bitset = predictor.binned_left_cat_bitsets
    assert_array_equal(binned_cat_bitset[0], expected_binned_cat_bitset)
    # check raw category value (9)
    expected_raw_cat_bitsets = [2**9] + [0] * 7
    raw_cat_bitsets = predictor.raw_left_cat_bitsets
    assert_array_equal(raw_cat_bitsets[0], expected_raw_cat_bitsets)
    # Note that since there was no missing values during training, the missing
    # values aren't part of the bitsets. However, we expect the missing values
    # to go to the biggest child (i.e. the left one).
    # The left child has a value of -1 = negative gradient.
    assert root["missing_go_to_left"]
    # make sure binned missing values are mapped to the left child during
    # prediction
    prediction_binned = predictor.predict_binned(
        np.asarray([[6]]).astype(X_BINNED_DTYPE),
        missing_values_bin_idx=6,
        n_threads=n_threads,
    )
    assert_allclose(prediction_binned, [-1]) # negative gradient
    # make sure raw missing values are mapped to the left child during
    # prediction
    known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32) # ignored anyway
    f_idx_map = np.array([0], dtype=np.uint32)
    prediction = predictor.predict(
        np.array([[np.nan]]), known_cat_bitsets, f_idx_map, n_threads
    )
    assert_allclose(prediction, [-1])
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize("n_unique_categories", (2, 10, 100))
@pytest.mark.parametrize("target", ("binary", "random", "equal"))
def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target):
    """Check that native categorical splits predict the same values as a fit
    on one-hot encoded data, while never requiring a deeper tree.
    """
    # Make sure that native categorical splits are equivalent to using a OHE,
    # when given enough depth
    rng = np.random.RandomState(0)
    n_samples = 10_000
    X_binned = rng.randint(0, n_unique_categories, size=(n_samples, 1), dtype=np.uint8)
    X_ohe = OneHotEncoder(sparse_output=False).fit_transform(X_binned)
    X_ohe = np.asfortranarray(X_ohe).astype(np.uint8)
    if target == "equal":
        gradients = X_binned.reshape(-1)
    elif target == "binary":
        gradients = (X_binned % 2).reshape(-1)
    else:
        gradients = rng.randn(n_samples)
    gradients = gradients.astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    # unlimited depth/leaves: only min_samples_leaf constrains both growers
    grower_params = {
        "min_samples_leaf": min_samples_leaf,
        "max_depth": None,
        "max_leaf_nodes": None,
    }
    grower = TreeGrower(
        X_binned, gradients, hessians, is_categorical=[True], **grower_params
    )
    grower.grow()
    # we pass undefined bin_thresholds because we won't use predict()
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((1, n_unique_categories))
    )
    preds = predictor.predict_binned(
        X_binned, missing_values_bin_idx=255, n_threads=n_threads
    )
    grower_ohe = TreeGrower(X_ohe, gradients, hessians, **grower_params)
    grower_ohe.grow()
    predictor_ohe = grower_ohe.make_predictor(
        binning_thresholds=np.zeros((X_ohe.shape[1], n_unique_categories))
    )
    preds_ohe = predictor_ohe.predict_binned(
        X_ohe, missing_values_bin_idx=255, n_threads=n_threads
    )
    assert predictor.get_max_depth() <= predictor_ohe.get_max_depth()
    if target == "binary" and n_unique_categories > 2:
        # OHE needs more splits to achieve the same predictions
        assert predictor.get_max_depth() < predictor_ohe.get_max_depth()
    np.testing.assert_allclose(preds, preds_ohe)
def test_grower_interaction_constraints():
    """Check that grower respects interaction constraints."""
    n_features = 6
    interaction_cst = [{0, 1}, {1, 2}, {3, 4, 5}]
    n_samples = 10
    n_bins = 6
    # record which feature each seeded run splits the root on
    root_feature_splits = []
    def get_all_children(node):
        """Return all descendants of *node*, excluding *node* itself."""
        res = []
        if node.is_leaf:
            return res
        for n in [node.left_child, node.right_child]:
            res.append(n)
            res.extend(get_all_children(n))
        return res
    for seed in range(20):
        rng = np.random.RandomState(seed)
        X_binned = rng.randint(
            0, n_bins - 1, size=(n_samples, n_features), dtype=X_BINNED_DTYPE
        )
        X_binned = np.asfortranarray(X_binned)
        gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
        hessians = np.ones(shape=1, dtype=G_H_DTYPE)
        grower = TreeGrower(
            X_binned,
            gradients,
            hessians,
            n_bins=n_bins,
            min_samples_leaf=1,
            interaction_cst=interaction_cst,
            n_threads=n_threads,
        )
        grower.grow()
        root_feature_idx = grower.root.split_info.feature_idx
        root_feature_splits.append(root_feature_idx)
        # union of the interaction_cst sets that contain each feature index
        feature_idx_to_constraint_set = {
            0: {0, 1},
            1: {0, 1, 2},
            2: {1, 2},
            3: {3, 4, 5},
            4: {3, 4, 5},
            5: {3, 4, 5},
        }
        root_constraint_set = feature_idx_to_constraint_set[root_feature_idx]
        for node in (grower.root.left_child, grower.root.right_child):
            # Root's children's allowed_features must be the root's constraints set.
            assert_array_equal(node.allowed_features, list(root_constraint_set))
        for node in get_all_children(grower.root):
            if node.is_leaf:
                continue
            # Ensure that each node uses a subset of features of its parent node.
            parent_interaction_cst_indices = set(node.interaction_cst_indices)
            right_interactions_cst_indices = set(
                node.right_child.interaction_cst_indices
            )
            left_interactions_cst_indices = set(node.left_child.interaction_cst_indices)
            assert right_interactions_cst_indices.issubset(
                parent_interaction_cst_indices
            )
            assert left_interactions_cst_indices.issubset(
                parent_interaction_cst_indices
            )
            # The features used for split must have been present in the root's
            # constraint set.
            assert node.split_info.feature_idx in root_constraint_set
    # Make sure that every feature is used at least once as split for the root node.
    assert (
        len(set(root_feature_splits))
        == len(set().union(*interaction_cst))
        == n_features
    )

View File

@@ -0,0 +1,239 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.ensemble._hist_gradient_boosting.common import (
G_H_DTYPE,
HISTOGRAM_DTYPE,
X_BINNED_DTYPE,
)
from sklearn.ensemble._hist_gradient_boosting.histogram import (
_build_histogram,
_build_histogram_naive,
_build_histogram_no_hessian,
_build_histogram_root,
_build_histogram_root_no_hessian,
_subtract_histograms,
)
@pytest.mark.parametrize("build_func", [_build_histogram_naive, _build_histogram])
def test_build_histogram(build_func):
    """Check per-bin counts and gradient/hessian sums, for sample_indices
    sizes below and above the loop-unrolling threshold.
    """
    binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE)
    # Small sample_indices (below unrolling threshold)
    ordered_gradients = np.array([0, 1, 3], dtype=G_H_DTYPE)
    ordered_hessians = np.array([1, 1, 2], dtype=G_H_DTYPE)
    sample_indices = np.array([0, 2, 3], dtype=np.uint32)
    hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
    build_func(
        0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist
    )
    hist = hist[0]
    assert_array_equal(hist["count"], [2, 1, 0])
    assert_allclose(hist["sum_gradients"], [1, 3, 0])
    assert_allclose(hist["sum_hessians"], [2, 2, 0])
    # Larger sample_indices (above unrolling threshold)
    sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32)
    ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=G_H_DTYPE)
    ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=G_H_DTYPE)
    hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
    build_func(
        0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist
    )
    hist = hist[0]
    assert_array_equal(hist["count"], [2, 2, 1])
    assert_allclose(hist["sum_gradients"], [1, 4, 0])
    assert_allclose(hist["sum_hessians"], [2, 2, 1])
def test_histogram_sample_order_independence():
    """Check that permuting the samples does not change the histograms."""
    # Make sure the order of the samples has no impact on the histogram
    # computations
    rng = np.random.RandomState(42)
    n_sub_samples = 100
    n_samples = 1000
    n_bins = 256
    binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=X_BINNED_DTYPE)
    sample_indices = rng.choice(
        np.arange(n_samples, dtype=np.uint32), n_sub_samples, replace=False
    )
    ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE)
    # gradient-only (constant hessian) histogram
    hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    _build_histogram_no_hessian(
        0, sample_indices, binned_feature, ordered_gradients, hist_gc
    )
    # gradient + hessian histogram
    ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE)
    hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    _build_histogram(
        0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc
    )
    # rebuild both histograms with the samples in a permuted order
    permutation = rng.permutation(n_sub_samples)
    hist_gc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    _build_histogram_no_hessian(
        0,
        sample_indices[permutation],
        binned_feature,
        ordered_gradients[permutation],
        hist_gc_perm,
    )
    hist_ghc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    _build_histogram(
        0,
        sample_indices[permutation],
        binned_feature,
        ordered_gradients[permutation],
        ordered_hessians[permutation],
        hist_ghc_perm,
    )
    hist_gc = hist_gc[0]
    hist_ghc = hist_ghc[0]
    hist_gc_perm = hist_gc_perm[0]
    hist_ghc_perm = hist_ghc_perm[0]
    assert_allclose(hist_gc["sum_gradients"], hist_gc_perm["sum_gradients"])
    assert_array_equal(hist_gc["count"], hist_gc_perm["count"])
    assert_allclose(hist_ghc["sum_gradients"], hist_ghc_perm["sum_gradients"])
    assert_allclose(hist_ghc["sum_hessians"], hist_ghc_perm["sum_hessians"])
    assert_array_equal(hist_ghc["count"], hist_ghc_perm["count"])
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_unrolled_equivalent_to_naive(constant_hessian):
    """Check that every unrolled histogram builder matches the naive one."""
    # Make sure the different unrolled histogram computations give the same
    # results as the naive one.
    rng = np.random.RandomState(42)
    n_samples = 10
    n_bins = 5
    sample_indices = np.arange(n_samples).astype(np.uint32)
    binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
    ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    if constant_hessian:
        ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
    else:
        ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
    hist_gc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    hist_ghc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    # root variants take all samples, so no sample_indices argument
    _build_histogram_root_no_hessian(0, binned_feature, ordered_gradients, hist_gc_root)
    _build_histogram_root(
        0, binned_feature, ordered_gradients, ordered_hessians, hist_ghc_root
    )
    _build_histogram_no_hessian(
        0, sample_indices, binned_feature, ordered_gradients, hist_gc
    )
    _build_histogram(
        0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc
    )
    _build_histogram_naive(
        0,
        sample_indices,
        binned_feature,
        ordered_gradients,
        ordered_hessians,
        hist_naive,
    )
    hist_naive = hist_naive[0]
    hist_gc_root = hist_gc_root[0]
    hist_ghc_root = hist_ghc_root[0]
    hist_gc = hist_gc[0]
    hist_ghc = hist_ghc[0]
    for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_ghc):
        assert_array_equal(hist["count"], hist_naive["count"])
        assert_allclose(hist["sum_gradients"], hist_naive["sum_gradients"])
    for hist in (hist_ghc_root, hist_ghc):
        assert_allclose(hist["sum_hessians"], hist_naive["sum_hessians"])
    # no-hessian variants leave the hessian column untouched (all zeros)
    for hist in (hist_gc_root, hist_gc):
        assert_array_equal(hist["sum_hessians"], np.zeros(n_bins))
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_hist_subtraction(constant_hessian):
    """Check that the subtraction trick (sibling = parent - child) matches
    histograms built from scratch.
    """
    # Make sure the histogram subtraction trick gives the same result as the
    # classical method.
    rng = np.random.RandomState(42)
    n_samples = 10
    n_bins = 5
    sample_indices = np.arange(n_samples).astype(np.uint32)
    binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
    ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    if constant_hessian:
        ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
    else:
        ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
    hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    if constant_hessian:
        _build_histogram_no_hessian(
            0, sample_indices, binned_feature, ordered_gradients, hist_parent
        )
    else:
        _build_histogram(
            0,
            sample_indices,
            binned_feature,
            ordered_gradients,
            ordered_hessians,
            hist_parent,
        )
    # random partition of the samples into a left and a right child
    mask = rng.randint(0, 2, n_samples).astype(bool)
    sample_indices_left = sample_indices[mask]
    ordered_gradients_left = ordered_gradients[mask]
    ordered_hessians_left = ordered_hessians[mask]
    hist_left = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    if constant_hessian:
        _build_histogram_no_hessian(
            0, sample_indices_left, binned_feature, ordered_gradients_left, hist_left
        )
    else:
        _build_histogram(
            0,
            sample_indices_left,
            binned_feature,
            ordered_gradients_left,
            ordered_hessians_left,
            hist_left,
        )
    sample_indices_right = sample_indices[~mask]
    ordered_gradients_right = ordered_gradients[~mask]
    ordered_hessians_right = ordered_hessians[~mask]
    hist_right = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    if constant_hessian:
        _build_histogram_no_hessian(
            0, sample_indices_right, binned_feature, ordered_gradients_right, hist_right
        )
    else:
        _build_histogram(
            0,
            sample_indices_right,
            binned_feature,
            ordered_gradients_right,
            ordered_hessians_right,
            hist_right,
        )
    # subtract each child's histogram from a copy of the parent's and check
    # it reproduces the sibling's histogram
    hist_left_sub = np.copy(hist_parent)
    hist_right_sub = np.copy(hist_parent)
    _subtract_histograms(0, n_bins, hist_left_sub, hist_right)
    _subtract_histograms(0, n_bins, hist_right_sub, hist_left)
    for key in ("count", "sum_hessians", "sum_gradients"):
        assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6)
        assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6)

View File

@@ -0,0 +1,446 @@
import re
import numpy as np
import pytest
from sklearn.ensemble import (
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
)
from sklearn.ensemble._hist_gradient_boosting.common import (
G_H_DTYPE,
X_BINNED_DTYPE,
MonotonicConstraint,
)
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.ensemble._hist_gradient_boosting.splitting import (
Splitter,
compute_node_value,
)
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils._testing import _convert_container
n_threads = _openmp_effective_n_threads()
def is_increasing(a):
    """Return True if *a* is non-decreasing (all consecutive diffs >= 0)."""
    deltas = np.diff(a)
    return (deltas >= 0.0).all()
def is_decreasing(a):
    """Return True if *a* is non-increasing (all consecutive diffs <= 0)."""
    deltas = np.diff(a)
    return (deltas <= 0.0).all()
def assert_leaves_values_monotonic(predictor, monotonic_cst):
    """Check that leaf values, read from left to right, are consistent with
    the monotonic constraint: all increasing for POS, all decreasing for NEG,
    and neither globally sorted direction for NO_CST.
    """
    nodes = predictor.nodes
    # Collect leaf values from left to right with an explicit DFS stack:
    # pushing the right child first ensures the left subtree is visited first.
    leaf_values = []
    stack = [0]  # node index 0 is the root
    while stack:
        node = nodes[stack.pop()]
        if node["is_leaf"]:
            leaf_values.append(node["value"])
        else:
            stack.append(node["right"])
            stack.append(node["left"])
    if monotonic_cst == MonotonicConstraint.NO_CST:
        # unconstrained: values must not be globally sorted either way
        assert not (is_increasing(leaf_values) or is_decreasing(leaf_values))
    elif monotonic_cst == MonotonicConstraint.POS:
        # all increasing
        assert is_increasing(leaf_values)
    else:  # NEG
        # all decreasing
        assert is_decreasing(leaf_values)
def assert_children_values_monotonic(predictor, monotonic_cst):
    """Check that sibling values respect the monotonic constraint: the left
    child must be lower (resp. greater) than the right child when the
    constraint is POS (resp. NEG).

    Note that this property alone isn't enough to ensure full monotonicity,
    since we also need to guarantee that all the descendants of the left
    child won't be greater (resp. lower) than the right child, or its
    descendants. That's why we need to bound the predicted values (this is
    tested in assert_children_values_bounded).
    """
    nodes = predictor.nodes
    n_left_lower = 0
    n_left_greater = 0
    for node in nodes:
        if node["is_leaf"]:
            continue
        left_value = nodes[node["left"]]["value"]
        right_value = nodes[node["right"]]["value"]
        if left_value < right_value:
            n_left_lower += 1
        elif left_value > right_value:
            n_left_greater += 1
    if monotonic_cst == MonotonicConstraint.NO_CST:
        # an unconstrained tree should exhibit both orderings somewhere
        assert n_left_lower and n_left_greater
    elif monotonic_cst == MonotonicConstraint.POS:
        assert n_left_lower and not n_left_greater
    else:  # NEG
        assert not n_left_lower and n_left_greater
def assert_children_values_bounded(grower, monotonic_cst):
    """Check that children values are bounded by the midpoint between their
    parent and the parent's right sibling (grower-side check).
    """
    # Make sure that the values of the children of a node are bounded by the
    # middle value between that node and its sibling (if there is a monotonic
    # constraint).
    # As a bonus, we also check that the siblings values are properly ordered
    # which is slightly redundant with assert_children_values_monotonic (but
    # this check is done on the grower nodes whereas
    # assert_children_values_monotonic is done on the predictor nodes)
    if monotonic_cst == MonotonicConstraint.NO_CST:
        return
    def recursively_check_children_node_values(node, right_sibling=None):
        """Check *node*'s children against the midpoint bound, then recurse."""
        if node.is_leaf:
            return
        if right_sibling is not None:
            # children values must not cross the midpoint between the node
            # and its right sibling
            middle = (node.value + right_sibling.value) / 2
            if monotonic_cst == MonotonicConstraint.POS:
                assert node.left_child.value <= node.right_child.value <= middle
                if not right_sibling.is_leaf:
                    assert (
                        middle
                        <= right_sibling.left_child.value
                        <= right_sibling.right_child.value
                    )
            else:  # NEG
                assert node.left_child.value >= node.right_child.value >= middle
                if not right_sibling.is_leaf:
                    assert (
                        middle
                        >= right_sibling.left_child.value
                        >= right_sibling.right_child.value
                    )
        recursively_check_children_node_values(
            node.left_child, right_sibling=node.right_child
        )
        recursively_check_children_node_values(node.right_child)
    recursively_check_children_node_values(grower.root)
@pytest.mark.parametrize("seed", range(3))
@pytest.mark.parametrize(
    "monotonic_cst",
    (
        MonotonicConstraint.NO_CST,
        MonotonicConstraint.POS,
        MonotonicConstraint.NEG,
    ),
)
def test_nodes_values(monotonic_cst, seed):
    """Grow a single-feature tree and check that node values respect the
    requested monotonic constraint.
    """
    # Build a single tree with only one feature, and make sure the nodes
    # values respect the monotonic constraints.
    # Considering the following tree with a monotonic POS constraint, we
    # should have:
    #
    #            root
    #           /    \
    #          5      10    # middle = 7.5
    #         / \    /  \
    #        a   b  c    d
    #
    # a <= b and c <= d (assert_children_values_monotonic)
    # a, b <= middle <= c, d (assert_children_values_bounded)
    # a <= b <= c <= d (assert_leaves_values_monotonic)
    #
    # The last one is a consequence of the others, but can't hurt to check
    rng = np.random.RandomState(seed)
    n_samples = 1000
    n_features = 1
    X_binned = rng.randint(0, 255, size=(n_samples, n_features), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)
    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(
        X_binned, gradients, hessians, monotonic_cst=[monotonic_cst], shrinkage=0.1
    )
    grower.grow()
    # grow() will shrink the leaves values at the very end. For our comparison
    # tests, we need to revert the shrinkage of the leaves, else we would
    # compare the value of a leaf (shrunk) with a node (not shrunk) and the
    # test would not be correct.
    for leave in grower.finalized_leaves:
        leave.value /= grower.shrinkage
    # We pass undefined binning_thresholds because we won't use predict anyway
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1))
    )
    # The consistency of the bounds can only be checked on the tree grower
    # as the node bounds are not copied into the predictor tree. The
    # consistency checks on the values of node children and leaves can be
    # done either on the grower tree or on the predictor tree. We only
    # do those checks on the predictor tree as the latter is derived from
    # the former.
    assert_children_values_monotonic(predictor, monotonic_cst)
    assert_children_values_bounded(grower, monotonic_cst)
    assert_leaves_values_monotonic(predictor, monotonic_cst)
@pytest.mark.parametrize("use_feature_names", (True, False))
def test_predictions(global_random_seed, use_feature_names):
    """Fit with a POS constraint on f_0 and a NEG constraint on f_1 and check
    that predictions respect those constraints, with both positional and
    feature-name constraint specifications.
    """
    # Train a model with a POS constraint on the first non-categorical feature
    # and a NEG constraint on the second non-categorical feature, and make sure
    # the constraints are respected by checking the predictions.
    # test adapted from lightgbm's test_monotone_constraint(), itself inspired
    # by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html
    rng = np.random.RandomState(global_random_seed)
    n_samples = 1000
    f_0 = rng.rand(n_samples) # positive correlation with y
    f_1 = rng.rand(n_samples) # negative correlation with y
    # extra categorical features, no correlation with y,
    # to check the correctness of monotonicity constraint remapping, see issue #28898
    f_a = rng.randint(low=0, high=9, size=n_samples)
    f_b = rng.randint(low=0, high=9, size=n_samples)
    f_c = rng.randint(low=0, high=9, size=n_samples)
    X = np.c_[f_a, f_0, f_b, f_1, f_c]
    columns_name = ["f_a", "f_0", "f_b", "f_1", "f_c"]
    constructor_name = "dataframe" if use_feature_names else "array"
    X = _convert_container(X, constructor_name, columns_name=columns_name)
    noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
    y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise
    if use_feature_names:
        monotonic_cst = {"f_0": +1, "f_1": -1}
        categorical_features = ["f_a", "f_b", "f_c"]
    else:
        monotonic_cst = [0, +1, 0, -1, 0]
        categorical_features = [0, 2, 4]
    gbdt = HistGradientBoostingRegressor(
        monotonic_cst=monotonic_cst, categorical_features=categorical_features
    )
    gbdt.fit(X, y)
    linspace = np.linspace(0, 1, 100)
    sin = np.sin(linspace)
    constant = np.full_like(linspace, fill_value=0.5)
    # We now assert the predictions properly respect the constraints, on each
    # feature. When testing for a feature we need to set the other one to a
    # constant, because the monotonic constraints are only a "all else being
    # equal" type of constraints:
    # a constraint on the first feature only means that
    # x0 < x0' => f(x0, x1) < f(x0', x1)
    # while x1 stays constant.
    # The constraint does not guarantee that
    # x0 < x0' => f(x0, x1) < f(x0', x1')
    # First non-categorical feature (POS)
    # assert pred is all increasing when f_0 is all increasing
    X = np.c_[constant, linspace, constant, constant, constant]
    X = _convert_container(X, constructor_name, columns_name=columns_name)
    pred = gbdt.predict(X)
    assert is_increasing(pred)
    # assert pred actually follows the variations of f_0
    X = np.c_[constant, sin, constant, constant, constant]
    X = _convert_container(X, constructor_name, columns_name=columns_name)
    pred = gbdt.predict(X)
    assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))
    # Second non-categorical feature (NEG)
    # assert pred is all decreasing when f_1 is all increasing
    X = np.c_[constant, constant, constant, linspace, constant]
    X = _convert_container(X, constructor_name, columns_name=columns_name)
    pred = gbdt.predict(X)
    assert is_decreasing(pred)
    # assert pred actually follows the inverse variations of f_1
    X = np.c_[constant, constant, constant, sin, constant]
    X = _convert_container(X, constructor_name, columns_name=columns_name)
    pred = gbdt.predict(X)
    assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
def test_input_error():
    """Check the errors raised for invalid monotonic_cst arrays and for the
    unsupported multiclass classification case.
    """
    X = [[1, 2], [2, 3], [3, 4]]
    y = [0, 1, 2]
    # wrong shape: 3 constraints for 2 features
    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
    with pytest.raises(
        ValueError, match=re.escape("monotonic_cst has shape (3,) but the input data")
    ):
        gbdt.fit(X, y)
    # values outside {-1, 0, 1} are rejected
    for monotonic_cst in ([1, 3], [1, -3], [0.3, -0.7]):
        gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
        expected_msg = re.escape(
            "must be an array-like of -1, 0 or 1. Observed values:"
        )
        with pytest.raises(ValueError, match=expected_msg):
            gbdt.fit(X, y)
    # y has 3 classes: monotonic constraints are not supported for multiclass
    gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
    with pytest.raises(
        ValueError,
        match="monotonic constraints are not supported for multiclass classification",
    ):
        gbdt.fit(X, y)
def test_input_error_related_to_feature_names():
    """Check the errors raised when a dict-typed monotonic_cst disagrees with
    the fitted feature names or contains invalid constraint values.
    """
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]})
    y = np.array([0, 1, 0])
    # a couple of unexpected feature names
    monotonic_cst = {"d": 1, "a": 1, "c": -1}
    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
    expected_msg = re.escape(
        "monotonic_cst contains 2 unexpected feature names: ['c', 'd']."
    )
    with pytest.raises(ValueError, match=expected_msg):
        gbdt.fit(X, y)
    # many unexpected feature names: the message is truncated with '...'
    monotonic_cst = {k: 1 for k in "abcdefghijklmnopqrstuvwxyz"}
    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
    expected_msg = re.escape(
        "monotonic_cst contains 24 unexpected feature names: "
        "['c', 'd', 'e', 'f', 'g', '...']."
    )
    with pytest.raises(ValueError, match=expected_msg):
        gbdt.fit(X, y)
    # dict monotonic_cst requires fitting on data with feature names
    monotonic_cst = {"a": 1}
    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
    expected_msg = re.escape(
        "HistGradientBoostingRegressor was not fitted on data with feature "
        "names. Pass monotonic_cst as an integer array instead."
    )
    with pytest.raises(ValueError, match=expected_msg):
        gbdt.fit(X.values, y)
    # constraint values must be -1, 0 or 1
    monotonic_cst = {"b": -1, "a": "+"}
    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
    expected_msg = re.escape("monotonic_cst['a'] must be either -1, 0 or 1. Got '+'.")
    with pytest.raises(ValueError, match=expected_msg):
        gbdt.fit(X, y)
def test_bounded_value_min_gain_to_split():
    """Check the interaction between node-value bounding and min_gain_to_split."""
    # The purpose of this test is to show that when computing the gain at a
    # given split, the value of the current node should be properly bounded to
    # respect the monotonic constraints, because it strongly interacts with
    # min_gain_to_split. We build a simple example where gradients are [1, 1,
    # 100, 1, 1] (hessians are all ones). The best split happens on the 3rd
    # bin, and depending on whether the value of the node is bounded or not,
    # the min_gain_to_split constraint is or isn't satisfied.
    l2_regularization = 0
    min_hessian_to_split = 0
    min_samples_leaf = 1
    n_bins = n_samples = 5
    # One sample per bin: X_binned is simply [0, 1, 2, 3, 4] as a column.
    X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
    all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = all_hessians.sum()
    hessians_are_constant = False
    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )
    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    # No monotonic constraint on the (single) feature; the bounding is applied
    # directly through the lower/upper bound arguments below.
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    children_lower_bound, children_upper_bound = -np.inf, np.inf
    min_gain_to_split = 2000
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )
    histograms = builder.compute_histograms_brute(sample_indices)
    # Since the gradient array is [1, 1, 100, 1, 1]
    # the max possible gain happens on the 3rd bin (or equivalently in the 2nd)
    # and is equal to about 1307, which less than min_gain_to_split = 2000, so
    # the node is considered unsplittable (gain = -1)
    current_lower_bound, current_upper_bound = -np.inf, np.inf
    value = compute_node_value(
        sum_gradients,
        sum_hessians,
        current_lower_bound,
        current_upper_bound,
        l2_regularization,
    )
    # the unbounded value is equal to -sum_gradients / sum_hessians
    assert value == pytest.approx(-104 / 5)
    split_info = splitter.find_node_split(
        n_samples,
        histograms,
        sum_gradients,
        sum_hessians,
        value,
        lower_bound=children_lower_bound,
        upper_bound=children_upper_bound,
    )
    assert split_info.gain == -1  # min_gain_to_split not respected
    # here again the max possible gain is on the 3rd bin but we now cap the
    # value of the node into [-10, inf].
    # This means the gain is now about 2430 which is more than the
    # min_gain_to_split constraint.
    current_lower_bound, current_upper_bound = -10, np.inf
    value = compute_node_value(
        sum_gradients,
        sum_hessians,
        current_lower_bound,
        current_upper_bound,
        l2_regularization,
    )
    assert value == -10
    split_info = splitter.find_node_split(
        n_samples,
        histograms,
        sum_gradients,
        sum_hessians,
        value,
        lower_bound=children_lower_bound,
        upper_bound=children_upper_bound,
    )
    assert split_info.gain > min_gain_to_split

View File

@@ -0,0 +1,187 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.ensemble._hist_gradient_boosting._bitset import (
set_bitset_memoryview,
set_raw_bitset_from_binned_bitset,
)
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import (
ALMOST_INF,
G_H_DTYPE,
PREDICTOR_RECORD_DTYPE,
X_BINNED_DTYPE,
X_BITSET_INNER_DTYPE,
X_DTYPE,
)
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
# Number of OpenMP threads available; forwarded to every predictor call below.
n_threads = _openmp_effective_n_threads()
@pytest.mark.parametrize("n_bins", [200, 256])
def test_regression_dataset(n_bins):
    """Grow a single tree on a synthetic regression task and check its R^2."""
    X, y = make_regression(
        n_samples=500, n_features=10, n_informative=5, random_state=42
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    bin_mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = bin_mapper.fit_transform(X_train)
    # Gradients and hessians of the least squares loss at the zero prediction.
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)
    grower = TreeGrower(
        X_train_binned,
        gradients,
        hessians,
        min_samples_leaf=10,
        max_leaf_nodes=30,
        n_bins=n_bins,
        n_bins_non_missing=bin_mapper.n_bins_non_missing_,
    )
    grower.grow()
    predictor = grower.make_predictor(binning_thresholds=bin_mapper.bin_thresholds_)
    # No categorical features: empty bitsets and empty feature index map.
    no_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    empty_f_idx_map = np.zeros(0, dtype=np.uint32)
    pred_train = predictor.predict(X_train, no_cat_bitsets, empty_f_idx_map, n_threads)
    assert r2_score(y_train, pred_train) > 0.82
    pred_test = predictor.predict(X_test, no_cat_bitsets, empty_f_idx_map, n_threads)
    assert r2_score(y_test, pred_test) > 0.67
@pytest.mark.parametrize(
    "num_threshold, expected_predictions",
    [
        (-np.inf, [0, 1, 1, 1]),
        (10, [0, 0, 1, 1]),
        (20, [0, 0, 0, 1]),
        (ALMOST_INF, [0, 0, 0, 1]),
        (np.inf, [0, 0, 0, 0]),
    ],
)
def test_infinite_values_and_thresholds(num_threshold, expected_predictions):
    """Check that infinite values and infinite thresholds are handled properly.

    In particular, if a value is +inf and the threshold is ALMOST_INF the
    sample should go to the right child. If the threshold is inf (split on
    nan), the +inf sample will go to the left child.
    """
    X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1)
    # Hand-build a stump: node 0 is the split, nodes 1 and 2 are the leaves.
    nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
    nodes[0]["left"], nodes[0]["right"] = 1, 2
    nodes[0]["feature_idx"] = 0
    nodes[0]["num_threshold"] = num_threshold
    for leaf_idx, leaf_value in ((1, 0), (2, 1)):
        nodes[leaf_idx]["is_leaf"] = True
        nodes[leaf_idx]["value"] = leaf_value
    # No categorical features involved, hence all-empty bitsets.
    binned_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    raw_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    known_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    f_idx_map = np.zeros(0, dtype=np.uint32)
    predictor = TreePredictor(nodes, binned_bitsets, raw_bitsets)
    predictions = predictor.predict(X, known_bitsets, f_idx_map, n_threads)
    assert np.all(predictions == expected_predictions)
@pytest.mark.parametrize(
    "bins_go_left, expected_predictions",
    [
        ([0, 3, 4, 6], [1, 0, 0, 1, 1, 0]),
        ([0, 1, 2, 6], [1, 1, 1, 0, 0, 0]),
        ([3, 5, 6], [0, 0, 0, 1, 0, 1]),
    ],
)
def test_categorical_predictor(bins_go_left, expected_predictions):
    """Check predictor outputs with categorical features.

    A single-split tree is built by hand: the bins listed in ``bins_go_left``
    are sent to the left child (value 1), all others to the right (value 0).
    """
    # Test predictor outputs are correct with categorical features
    X_binned = np.array([[0, 1, 2, 3, 4, 5]], dtype=X_BINNED_DTYPE).T
    # Raw category values corresponding to bins 0..5.
    categories = np.array([2, 5, 6, 8, 10, 15], dtype=X_DTYPE)
    bins_go_left = np.array(bins_go_left, dtype=X_BINNED_DTYPE)
    # We just construct a simple tree with 1 root and 2 children
    # parent node
    nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
    nodes[0]["left"] = 1
    nodes[0]["right"] = 2
    nodes[0]["feature_idx"] = 0
    nodes[0]["is_categorical"] = True
    nodes[0]["missing_go_to_left"] = True
    # left child
    nodes[1]["is_leaf"] = True
    nodes[1]["value"] = 1
    # right child
    nodes[2]["is_leaf"] = True
    nodes[2]["value"] = 0
    # Encode the left-going bins in a binned bitset, then translate it to a
    # bitset over the raw category values.
    binned_cat_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE)
    raw_categorical_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE)
    for go_left in bins_go_left:
        set_bitset_memoryview(binned_cat_bitsets[0], go_left)
    set_raw_bitset_from_binned_bitset(
        raw_categorical_bitsets[0], binned_cat_bitsets[0], categories
    )
    predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets)
    # Check binned data gives correct predictions
    prediction_binned = predictor.predict_binned(
        X_binned, missing_values_bin_idx=6, n_threads=n_threads
    )
    assert_allclose(prediction_binned, expected_predictions)
    # manually construct bitset of the categories known at fit time
    known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32)
    known_cat_bitsets[0, 0] = np.sum(2**categories, dtype=np.uint32)
    f_idx_map = np.array([0], dtype=np.uint32)
    # Check with un-binned data
    predictions = predictor.predict(
        categories.reshape(-1, 1), known_cat_bitsets, f_idx_map, n_threads
    )
    assert_allclose(predictions, expected_predictions)
    # Check missing goes left (missing_go_to_left is set on the root and
    # missing_values_bin_idx=6 marks bin 6 as the missing-values bin)
    X_binned_missing = np.array([[6]], dtype=X_BINNED_DTYPE).T
    predictions = predictor.predict_binned(
        X_binned_missing, missing_values_bin_idx=6, n_threads=n_threads
    )
    assert_allclose(predictions, [1])
    # missing and unknown go left
    predictions = predictor.predict(
        np.array([[np.nan, 17]], dtype=X_DTYPE).T,
        known_cat_bitsets,
        f_idx_map,
        n_threads,
    )
    assert_allclose(predictions, [1, 1])

View File

@@ -0,0 +1,231 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import (
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
)
from sklearn.metrics import check_scoring
# Shared toy datasets reused by every warm-start test in this module.
X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)
def _assert_predictor_equal(gb_1, gb_2, X):
    """Assert that two HistGBM instances are identical."""
    # The per-iteration predictors must have identical node arrays.
    for iteration_1, iteration_2 in zip(gb_1._predictors, gb_2._predictors):
        for tree_1, tree_2 in zip(iteration_1, iteration_2):
            assert_array_equal(tree_1.nodes, tree_2.nodes)
    # And both ensembles must produce the same predictions on X.
    assert_allclose(gb_1.predict(X), gb_2.predict(X))
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_max_iter_with_warm_start_validation(GradientBoosting, X, y):
    """A warm-started refit must not shrink ``max_iter`` below ``n_iter_``."""
    model = GradientBoosting(max_iter=10, early_stopping=False, warm_start=True)
    model.fit(X, y)
    # Requesting fewer total iterations than already fitted is invalid.
    model.set_params(max_iter=5)
    err_msg = (
        "max_iter=5 must be larger than or equal to n_iter_=10 when warm_start==True"
    )
    with pytest.raises(ValueError, match=err_msg):
        model.fit(X, y)
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_warm_start_yields_identical_results(GradientBoosting, X, y):
    """50 + 25 warm-started iterations must equal 75 iterations in one go."""
    seed = 42
    warm = GradientBoosting(
        n_iter_no_change=100, max_iter=50, random_state=seed, warm_start=True
    )
    warm.fit(X, y).set_params(max_iter=75).fit(X, y)
    cold = GradientBoosting(
        n_iter_no_change=100, max_iter=75, random_state=seed, warm_start=False
    )
    cold.fit(X, y)
    # Same trees and same predictions in both cases.
    _assert_predictor_equal(warm, cold, X)
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_warm_start_max_depth(GradientBoosting, X, y):
    """Trees of different depths can coexist in a warm-started ensemble."""
    model = GradientBoosting(
        max_iter=20,
        min_samples_leaf=1,
        warm_start=True,
        max_depth=2,
        early_stopping=False,
    )
    model.fit(X, y)
    model.set_params(max_iter=30, max_depth=3, n_iter_no_change=110)
    model.fit(X, y)
    # The first 20 trees were grown with max_depth=2 ...
    for predictors in model._predictors[:20]:
        assert predictors[0].get_max_depth() == 2
    # ... and the 10 trees added by the second fit with max_depth=3.
    for predictors in model._predictors[20:]:
        assert predictors[0].get_max_depth() == 3
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
@pytest.mark.parametrize("scoring", (None, "loss"))
def test_warm_start_early_stopping(GradientBoosting, X, y, scoring):
    """A warm-started refit with early stopping should add few iterations."""
    n_iter_no_change = 5
    model = GradientBoosting(
        n_iter_no_change=n_iter_no_change,
        max_iter=10000,
        early_stopping=True,
        random_state=42,
        warm_start=True,
        tol=1e-3,
        scoring=scoring,
    )
    n_iter_first_fit = model.fit(X, y).n_iter_
    n_iter_second_fit = model.fit(X, y).n_iter_
    # The second fit must add at least one but fewer than n_iter_no_change
    # iterations before stopping again.
    assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_warm_start_equal_n_estimators(GradientBoosting, X, y):
    """Refitting with warm start and an unchanged ``max_iter`` is a no-op."""
    reference = GradientBoosting(max_depth=2, early_stopping=False)
    reference.fit(X, y)
    warm = clone(reference)
    warm.set_params(max_iter=reference.max_iter, warm_start=True, n_iter_no_change=5)
    warm.fit(X, y)
    # Check that both predictors are equal
    _assert_predictor_equal(reference, warm, X)
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_warm_start_clear(GradientBoosting, X, y):
    """Disabling warm start must clear any previously fitted state."""
    reference = GradientBoosting(n_iter_no_change=5, random_state=42)
    reference.fit(X, y)
    model = GradientBoosting(n_iter_no_change=5, random_state=42, warm_start=True)
    model.fit(X, y)  # inits state
    model.set_params(warm_start=False)
    model.fit(X, y)  # clears old state and equals reference
    # Same training / validation score histories ...
    assert_allclose(reference.train_score_, model.train_score_)
    assert_allclose(reference.validation_score_, model.validation_score_)
    # ... and same trees / predictions.
    _assert_predictor_equal(reference, model, X)
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
@pytest.mark.parametrize("rng_type", ("none", "int", "instance"))
def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
    """Check the seeds used for the train/val split and small-trainset
    subsampling in a warm start context."""

    def _get_rng(rng_type):
        # Helper to avoid consuming rngs
        if rng_type == "none":
            return None
        elif rng_type == "int":
            return 42
        else:
            return np.random.RandomState(0)

    random_state = _get_rng(rng_type)
    gb_1 = GradientBoosting(early_stopping=True, max_iter=2, random_state=random_state)
    gb_1.set_params(scoring=check_scoring(gb_1))
    gb_1.fit(X, y)
    random_seed_1_1 = gb_1._random_seed
    gb_1.fit(X, y)
    random_seed_1_2 = gb_1._random_seed  # clear the old state, different seed
    random_state = _get_rng(rng_type)
    gb_2 = GradientBoosting(
        early_stopping=True, max_iter=2, random_state=random_state, warm_start=True
    )
    gb_2.set_params(scoring=check_scoring(gb_2))
    gb_2.fit(X, y)  # inits state
    random_seed_2_1 = gb_2._random_seed
    gb_2.fit(X, y)  # clears old state and equals est
    random_seed_2_2 = gb_2._random_seed
    # Without warm starting, the seeds should be
    # * all different if random state is None
    # * all equal if random state is an integer
    # * different when refitting and equal with a new estimator (because
    #   the random state is mutated)
    if rng_type == "none":
        # Use a set: the previous chained comparison `a != b != c` only
        # checked consecutive pairs and did not guarantee a != c.
        assert len({random_seed_1_1, random_seed_1_2, random_seed_2_1}) == 3
    elif rng_type == "int":
        assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
    else:
        assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2
    # With warm starting, the seeds must be equal
    assert random_seed_2_1 == random_seed_2_2

View File

@@ -0,0 +1,149 @@
"""This module contains utility routines."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from sklearn.base import is_classifier
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
def get_equivalent_estimator(estimator, lib="lightgbm", n_classes=None):
    """Return an unfitted estimator from another lib with matching hyperparams.

    This utility function takes care of renaming the sklearn parameters into
    their LightGBM, XGBoost or CatBoost equivalent parameters.

    Parameters
    ----------
    estimator : estimator instance
        A scikit-learn histogram gradient boosting estimator; its
        ``get_params()`` output is translated (``loss``, ``max_iter``,
        ``max_leaf_nodes``, ...).
    lib : {"lightgbm", "xgboost", "catboost"}, default="lightgbm"
        The target library.
    n_classes : int, default=None
        Number of classes, used to select the binary vs multiclass objective
        for classification losses.

    # unmapped XGB parameters:
    # - min_samples_leaf
    # - min_data_in_bin
    # - min_split_gain (there is min_split_loss though?)

    # unmapped Catboost parameters:
    # max_leaves
    # min_*
    """
    if lib not in ("lightgbm", "xgboost", "catboost"):
        raise ValueError(
            "accepted libs are lightgbm, xgboost, and catboost. got {}".format(lib)
        )
    sklearn_params = estimator.get_params()
    if sklearn_params["loss"] == "auto":
        raise ValueError(
            "auto loss is not accepted. We need to know if "
            "the problem is binary or multiclass classification."
        )
    if sklearn_params["early_stopping"]:
        raise NotImplementedError("Early stopping should be deactivated.")
    # LightGBM
    lightgbm_loss_mapping = {
        "squared_error": "regression_l2",
        "absolute_error": "regression_l1",
        "log_loss": "binary" if n_classes == 2 else "multiclass",
        "gamma": "gamma",
        "poisson": "poisson",
    }
    lightgbm_params = {
        "objective": lightgbm_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "n_estimators": sklearn_params["max_iter"],
        "num_leaves": sklearn_params["max_leaf_nodes"],
        "max_depth": sklearn_params["max_depth"],
        "min_data_in_leaf": sklearn_params["min_samples_leaf"],
        "reg_lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "min_data_in_bin": 1,
        "min_sum_hessian_in_leaf": 1e-3,
        "min_split_gain": 0,
        "verbosity": 10 if sklearn_params["verbose"] else -10,
        "boost_from_average": True,
        "enable_bundle": False,  # also makes feature order consistent
        "subsample_for_bin": _BinMapper().subsample,
        "poisson_max_delta_step": 1e-12,
        "feature_fraction_bynode": sklearn_params["max_features"],
    }
    if sklearn_params["loss"] == "log_loss" and n_classes > 2:
        # LightGBM multiplies hessians by 2 in multiclass loss.
        lightgbm_params["min_sum_hessian_in_leaf"] *= 2
        # LightGBM 3.0 introduced a different scaling of the hessian for the multiclass
        # case.
        # It is equivalent of scaling the learning rate.
        # See https://github.com/microsoft/LightGBM/pull/3256.
        if n_classes is not None:
            lightgbm_params["learning_rate"] *= n_classes / (n_classes - 1)
    # XGB
    xgboost_loss_mapping = {
        "squared_error": "reg:linear",
        "absolute_error": "LEAST_ABSOLUTE_DEV_NOT_SUPPORTED",
        "log_loss": "reg:logistic" if n_classes == 2 else "multi:softmax",
        "gamma": "reg:gamma",
        "poisson": "count:poisson",
    }
    xgboost_params = {
        "tree_method": "hist",
        "grow_policy": "lossguide",  # so that we can set max_leaves
        "objective": xgboost_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "n_estimators": sklearn_params["max_iter"],
        "max_leaves": sklearn_params["max_leaf_nodes"],
        "max_depth": sklearn_params["max_depth"] or 0,
        "lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "min_child_weight": 1e-3,
        "verbosity": 2 if sklearn_params["verbose"] else 0,
        "silent": sklearn_params["verbose"] == 0,
        "n_jobs": -1,
        "colsample_bynode": sklearn_params["max_features"],
    }
    # Catboost
    catboost_loss_mapping = {
        "squared_error": "RMSE",
        # catboost does not support MAE when leaf_estimation_method is Newton
        "absolute_error": "LEAST_ASBOLUTE_DEV_NOT_SUPPORTED",
        "log_loss": "Logloss" if n_classes == 2 else "MultiClass",
        "gamma": None,
        "poisson": "Poisson",
    }
    catboost_params = {
        "loss_function": catboost_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "iterations": sklearn_params["max_iter"],
        "depth": sklearn_params["max_depth"],
        "reg_lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "feature_border_type": "Median",
        "leaf_estimation_method": "Newton",
        "verbose": bool(sklearn_params["verbose"]),
    }
    # The third-party libraries are imported lazily so that they are only
    # required when actually requested.
    if lib == "lightgbm":
        from lightgbm import LGBMClassifier, LGBMRegressor
        if is_classifier(estimator):
            return LGBMClassifier(**lightgbm_params)
        else:
            return LGBMRegressor(**lightgbm_params)
    elif lib == "xgboost":
        from xgboost import XGBClassifier, XGBRegressor
        if is_classifier(estimator):
            return XGBClassifier(**xgboost_params)
        else:
            return XGBRegressor(**xgboost_params)
    else:
        from catboost import CatBoostClassifier, CatBoostRegressor
        if is_classifier(estimator):
            return CatBoostClassifier(**catboost_params)
        else:
            return CatBoostRegressor(**catboost_params)

View File

@@ -0,0 +1,681 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numbers
import threading
from numbers import Integral, Real
from warnings import warn
import numpy as np
from scipy.sparse import issparse
from sklearn.base import OutlierMixin, _fit_context
from sklearn.ensemble._bagging import BaseBagging
from sklearn.tree import ExtraTreeRegressor
from sklearn.tree._tree import DTYPE as tree_dtype
from sklearn.utils import check_array, check_random_state, gen_batches
from sklearn.utils._chunking import get_chunk_n_rows
from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import (
_check_sample_weight,
_num_samples,
check_is_fitted,
validate_data,
)
__all__ = ["IsolationForest"]
def _parallel_compute_tree_depths(
    tree,
    X,
    features,
    tree_decision_path_lengths,
    tree_avg_path_lengths,
    depths,
    lock,
):
    """Accumulate the path-length contribution of one isolation tree.

    The depth contribution of ``tree`` for every sample in ``X`` is added in
    place to the shared ``depths`` array under ``lock``.
    """
    # Restrict X to the features this tree was trained on, if any.
    X_subset = X if features is None else X[:, features]
    leaves_index = tree.apply(X_subset, check_input=False)
    tree_depths = (
        tree_decision_path_lengths[leaves_index]
        + tree_avg_path_lengths[leaves_index]
        - 1.0
    )
    with lock:
        depths += tree_depths
class IsolationForest(OutlierMixin, BaseBagging):
"""
Isolation Forest Algorithm.
Return the anomaly score of each sample using the IsolationForest algorithm
The IsolationForest 'isolates' observations by randomly selecting a feature
and then randomly selecting a split value between the maximum and minimum
values of the selected feature.
Since recursive partitioning can be represented by a tree structure, the
number of splittings required to isolate a sample is equivalent to the path
length from the root node to the terminating node.
This path length, averaged over a forest of such random trees, is a
measure of normality and our decision function.
Random partitioning produces noticeably shorter paths for anomalies.
Hence, when a forest of random trees collectively produce shorter path
lengths for particular samples, they are highly likely to be anomalies.
Read more in the :ref:`User Guide <isolation_forest>`.
.. versionadded:: 0.18
Parameters
----------
n_estimators : int, default=100
The number of base estimators in the ensemble.
max_samples : "auto", int or float, default="auto"
The number of samples to draw from X to train each base estimator.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.
- If "auto", then `max_samples=min(256, n_samples)`.
If max_samples is larger than the number of samples provided,
all samples will be used for all trees (no sampling).
contamination : 'auto' or float, default='auto'
The amount of contamination of the data set, i.e. the proportion
of outliers in the data set. Used when fitting to define the threshold
on the scores of the samples.
- If 'auto', the threshold is determined as in the
original paper.
- If float, the contamination should be in the range (0, 0.5].
.. versionchanged:: 0.22
The default value of ``contamination`` changed from 0.1
to ``'auto'``.
max_features : int or float, default=1.0
The number of features to draw from X to train each base estimator.
- If int, then draw `max_features` features.
- If float, then draw `max(1, int(max_features * n_features_in_))` features.
Note: using a float number less than 1.0 or integer less than number of
features will enable feature subsampling and leads to a longer runtime.
bootstrap : bool, default=False
If True, individual trees are fit on random subsets of the training
data sampled with replacement. If False, sampling without replacement
is performed.
n_jobs : int, default=None
The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1
unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using
all processors. See :term:`Glossary <n_jobs>` for more details.
random_state : int, RandomState instance or None, default=None
Controls the pseudo-randomness of the selection of the feature
and split values for each branching step and each tree in the forest.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
verbose : int, default=0
Controls the verbosity of the tree building process.
warm_start : bool, default=False
When set to ``True``, reuse the solution of the previous call to fit
and add more estimators to the ensemble, otherwise, just fit a whole
new forest. See :term:`the Glossary <warm_start>`.
.. versionadded:: 0.21
Attributes
----------
estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance
The child estimator template used to create the collection of
fitted sub-estimators.
.. versionadded:: 1.2
`base_estimator_` was renamed to `estimator_`.
estimators_ : list of ExtraTreeRegressor instances
The collection of fitted sub-estimators.
estimators_features_ : list of ndarray
The subset of drawn features for each base estimator.
estimators_samples_ : list of ndarray
The subset of drawn samples (i.e., the in-bag samples) for each base
estimator.
max_samples_ : int
The actual number of samples.
offset_ : float
Offset used to define the decision function from the raw scores. We
have the relation: ``decision_function = score_samples - offset_``.
``offset_`` is defined as follows. When the contamination parameter is
set to "auto", the offset is equal to -0.5 as the scores of inliers are
close to 0 and the scores of outliers are close to -1. When a
contamination parameter different than "auto" is provided, the offset
is defined in such a way we obtain the expected number of outliers
(samples with decision function < 0) in training.
.. versionadded:: 0.20
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a
Gaussian distributed dataset.
sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.
Estimate the support of a high-dimensional distribution.
The implementation is based on libsvm.
sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection
using Local Outlier Factor (LOF).
Notes
-----
The implementation is based on an ensemble of ExtraTreeRegressor. The
maximum depth of each tree is set to ``ceil(log_2(n))`` where
:math:`n` is the number of samples used to build the tree
(see [1]_ for more details).
References
----------
.. [1] F. T. Liu, K. M. Ting and Z. -H. Zhou.
:doi:`"Isolation forest." <10.1109/ICDM.2008.17>`
2008 Eighth IEEE International Conference on Data Mining (ICDM),
2008, pp. 413-422.
.. [2] F. T. Liu, K. M. Ting and Z. -H. Zhou.
:doi:`"Isolation-based anomaly detection."
<10.1145/2133360.2133363>` ACM Transactions on
Knowledge Discovery from Data (TKDD) 6.1 (2012): 1-39.
Examples
--------
>>> from sklearn.ensemble import IsolationForest
>>> X = [[-1.1], [0.3], [0.5], [100]]
>>> clf = IsolationForest(random_state=0).fit(X)
>>> clf.predict([[0.1], [0], [90]])
array([ 1, 1, -1])
For an example of using isolation forest for anomaly detection see
:ref:`sphx_glr_auto_examples_ensemble_plot_isolation_forest.py`.
"""
_parameter_constraints: dict = {
"n_estimators": [Interval(Integral, 1, None, closed="left")],
"max_samples": [
StrOptions({"auto"}),
Interval(Integral, 1, None, closed="left"),
Interval(RealNotInt, 0, 1, closed="right"),
],
"contamination": [
StrOptions({"auto"}),
Interval(Real, 0, 0.5, closed="right"),
],
"max_features": [
Integral,
Interval(Real, 0, 1, closed="right"),
],
"bootstrap": ["boolean"],
"n_jobs": [Integral, None],
"random_state": ["random_state"],
"verbose": ["verbose"],
"warm_start": ["boolean"],
}
    def __init__(
        self,
        *,
        n_estimators=100,
        max_samples="auto",
        contamination="auto",
        max_features=1.0,
        bootstrap=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        warm_start=False,
    ):
        # Delegate the ensemble bookkeeping to BaseBagging. estimator=None:
        # the per-tree template is provided by _get_estimator instead.
        super().__init__(
            estimator=None,
            # here above max_features has no links with self.max_features
            bootstrap=bootstrap,
            bootstrap_features=False,
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
        )
        # contamination is not a BaseBagging parameter; stored directly here.
        self.contamination = contamination
def _get_estimator(self):
return ExtraTreeRegressor(
# here max_features has no links with self.max_features
max_features=1,
splitter="random",
random_state=self.random_state,
)
    def _set_oob_score(self, X, y):
        # Explicitly disable the BaseBagging out-of-bag scoring hook: it is
        # not supported for isolation forests.
        raise NotImplementedError("OOB score not supported by iforest")
    def _parallel_args(self):
        """Return the extra joblib arguments used for parallel fitting."""
        # ExtraTreeRegressor releases the GIL, so it's more efficient to use
        # a thread-based backend rather than a process-based backend so as
        # to avoid suffering from communication overhead and extra memory
        # copies. This is only used in the fit method.
        return {"prefer": "threads"}
    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit estimator.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Use ``dtype=np.float32`` for maximum
            efficiency. Sparse matrices are also supported, use sparse
            ``csc_matrix`` for maximum efficiency.
        y : Ignored
            Not used, present for API consistency by convention.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
        Returns
        -------
        self : object
            Fitted estimator.
        """
        X = validate_data(
            self, X, accept_sparse=["csc"], dtype=tree_dtype, ensure_all_finite=False
        )
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
        if issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()
        rnd = check_random_state(self.random_state)
        # y is ignored by the algorithm: a random uniform target is generated
        # to feed the underlying BaseBagging/ExtraTreeRegressor machinery.
        y = rnd.uniform(size=X.shape[0])
        # ensure that max_sample is in [1, n_samples]:
        n_samples = X.shape[0]
        if isinstance(self.max_samples, str) and self.max_samples == "auto":
            max_samples = min(256, n_samples)
        elif isinstance(self.max_samples, numbers.Integral):
            if self.max_samples > n_samples:
                warn(
                    "max_samples (%s) is greater than the "
                    "total number of samples (%s). max_samples "
                    "will be set to n_samples for estimation."
                    % (self.max_samples, n_samples)
                )
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # max_samples is float
            max_samples = int(self.max_samples * X.shape[0])
        self.max_samples_ = max_samples
        # Cap tree depth at ceil(log2(max_samples)) (see the class Notes and
        # reference [1]).
        max_depth = int(np.ceil(np.log2(max(max_samples, 2))))
        super()._fit(
            X,
            y,
            max_samples=max_samples,
            max_depth=max_depth,
            sample_weight=sample_weight,
            check_input=False,
        )
        # Cache per-tree path-length statistics, reused when scoring samples.
        self._average_path_length_per_tree, self._decision_path_lengths = zip(
            *[
                (
                    _average_path_length(tree.tree_.n_node_samples),
                    tree.tree_.compute_node_depths(),
                )
                for tree in self.estimators_
            ]
        )
        if self.contamination == "auto":
            # 0.5 plays a special role as described in the original paper.
            # we take the opposite as we consider the opposite of their score.
            self.offset_ = -0.5
            return self
        # Else, define offset_ wrt contamination parameter
        # To avoid performing input validation a second time we call
        # _score_samples rather than score_samples.
        # _score_samples expects a CSR matrix, so we convert if necessary.
        if issparse(X):
            X = X.tocsr()
        self.offset_ = np.percentile(self._score_samples(X), 100.0 * self.contamination)
        return self
def predict(self, X):
    """Predict whether each sample in X is an outlier (-1) or an inlier (+1).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally converted to ``dtype=np.float32``
        and, for sparse input, to a CSR matrix.

    Returns
    -------
    is_inlier : ndarray of shape (n_samples,)
        +1 for samples considered inliers by the fitted model, -1 for
        outliers.

    Notes
    -----
    Prediction can be parallelized through a joblib context manager; the
    ``n_jobs`` constructor parameter only applies to ``fit``. Prediction
    may actually be faster without parallelization for a small number of
    samples (roughly 1000 or fewer).

    .. code-block:: python

        from joblib import parallel_backend

        # threading is appropriate since predict is not CPU bound
        with parallel_backend("threading", n_jobs=4):
            model.predict(X)
    """
    check_is_fitted(self)
    scores = self.decision_function(X)
    # A negative decision score marks a sample as an outlier.
    labels = np.ones_like(scores, dtype=int)
    labels[scores < 0] = -1
    return labels
def decision_function(self, X):
    """Average anomaly score of X of the base classifiers.

    The anomaly score of an input sample is the mean anomaly score of the
    trees in the forest. The measure of normality of an observation given
    a tree is the depth of the leaf containing it, i.e. the number of
    splittings required to isolate it; when a leaf holds n_left training
    observations, the average path length of an n_left-sample isolation
    tree is added.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally converted to ``dtype=np.float32``
        and, for sparse input, to a CSR matrix.

    Returns
    -------
    scores : ndarray of shape (n_samples,)
        The anomaly score of the input samples. The lower, the more
        abnormal; negative scores represent outliers, positive scores
        represent inliers.

    Notes
    -----
    This method can be parallelized through a joblib context manager; the
    ``n_jobs`` constructor parameter only applies to ``fit``. Scoring may
    actually be faster without parallelization for a small number of
    samples (roughly 1000 or fewer).

    .. code-block:: python

        from joblib import parallel_backend

        # threading is appropriate since decision_function is not CPU bound
        with parallel_backend("threading", n_jobs=4):
            model.decision_function(X)
    """
    # Shift by self.offset_ so that 0 becomes the inlier/outlier threshold.
    raw_scores = self.score_samples(X)
    return raw_scores - self.offset_
def score_samples(self, X):
    """Opposite of the anomaly score defined in the original paper.

    The anomaly score of an input sample is the mean anomaly score of the
    trees in the forest. The measure of normality of an observation given
    a tree is the depth of the leaf containing it, i.e. the number of
    splittings required to isolate it; when a leaf holds n_left training
    observations, the average path length of an n_left-sample isolation
    tree is added.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    scores : ndarray of shape (n_samples,)
        The anomaly score of the input samples. The lower, the more
        abnormal.

    Notes
    -----
    This method can be parallelized through a joblib context manager; the
    ``n_jobs`` constructor parameter only applies to ``fit``. Scoring may
    actually be faster without parallelization for a small number of
    samples (roughly 1000 or fewer).

    .. code-block:: python

        from joblib import parallel_backend

        # threading is appropriate since score_samples is not CPU bound
        with parallel_backend("threading", n_jobs=4):
            model.score(X)
    """
    # Validate without resetting n_features_in_ (model is already fitted);
    # NaNs are allowed since isolation trees handle missing values.
    checked_X = validate_data(
        self,
        X,
        accept_sparse="csr",
        dtype=tree_dtype,
        reset=False,
        ensure_all_finite=False,
    )
    return self._score_samples(checked_X)
def _score_samples(self, X):
    """Score samples without re-validating the input.

    Public ``score_samples`` validates X first; this private variant skips
    validation so that feature names attached to X are preserved.
    """
    # Mirrors the structure of ForestClassifier/predict_proba.
    check_is_fitted(self)
    # Negate so that larger values mean "less abnormal" in the public API.
    return -self._compute_chunked_score_samples(X)
def _compute_chunked_score_samples(self, X):
    """Compute anomaly scores over row batches bounded by working memory."""
    n_samples = _num_samples(X)

    # Feature subsampling is only needed when the trees were trained on a
    # strict subset of the columns.
    subsample_features = self._max_features != X.shape[1]

    # Pick as many rows per batch as fit in the working_memory budget
    # (sklearn.get_config()['working_memory']), assuming self._max_features
    # values per row are materialized during scoring.
    #
    # Notes:
    # - at least one row is processed per batch, even if a single row
    #   exceeds the budget;
    # - only temporary memory while computing scores is accounted for --
    #   the returned scores themselves are 1D.
    batch_rows = get_chunk_n_rows(
        row_bytes=16 * self._max_features, max_n_rows=n_samples
    )

    scores = np.zeros(n_samples, order="f")
    for batch in gen_batches(n_samples, batch_rows):
        # Score one slice of test samples at a time.
        scores[batch] = self._compute_score_samples(X[batch], subsample_features)

    return scores
def _compute_score_samples(self, X, subsample_features):
    """
    Compute the score of each samples in X going through the extra trees.

    Parameters
    ----------
    X : array-like or sparse matrix
        Data matrix.

    subsample_features : bool
        Whether features should be subsampled.

    Returns
    -------
    scores : ndarray of shape (n_samples,)
        The score of each sample in X.
    """
    n_samples = X.shape[0]

    # Accumulator for the per-sample depth summed over all trees; filled
    # in-place by the parallel workers below.
    depths = np.zeros(n_samples, order="f")

    # Normalization constant: average path length of an unsuccessful BST
    # search in a tree built on self._max_samples points.
    average_path_length_max_samples = _average_path_length([self._max_samples])

    # Note: we use default n_jobs value, i.e. sequential computation, which
    # we expect to be more performant that parallelizing for small number
    # of samples, e.g. < 1k samples. Default n_jobs value can be overridden
    # by using joblib.parallel_backend context manager around
    # ._compute_score_samples. Using a higher n_jobs may speed up the
    # computation of the scores, e.g. for > 1k samples. See
    # https://github.com/scikit-learn/scikit-learn/pull/28622 for more
    # details.
    lock = threading.Lock()
    # require="sharedmem": every worker adds into the same `depths` array,
    # serialized through `lock`.
    Parallel(
        verbose=self.verbose,
        require="sharedmem",
    )(
        delayed(_parallel_compute_tree_depths)(
            tree,
            X,
            features if subsample_features else None,
            self._decision_path_lengths[tree_idx],
            self._average_path_length_per_tree[tree_idx],
            depths,
            lock,
        )
        for tree_idx, (tree, features) in enumerate(
            zip(self.estimators_, self.estimators_features_)
        )
    )

    denominator = len(self.estimators_) * average_path_length_max_samples
    scores = 2 ** (
        # For a single training sample, denominator and depth are 0.
        # Therefore, we set the score manually to 1.
        -np.divide(
            depths, denominator, out=np.ones_like(depths), where=denominator != 0
        )
    )
    return scores
def __sklearn_tags__(self):
    """Declare estimator capabilities for scikit-learn's tag system."""
    tags = super().__sklearn_tags__()
    # Isolation trees handle missing values natively, so NaN input is allowed.
    tags.input_tags.allow_nan = True
    return tags
def _average_path_length(n_samples_leaf):
    """
    The average path length in an n_samples iTree, which is equal to
    the average path length of an unsuccessful BST search since the
    latter has the same structure as an isolation tree.

    Parameters
    ----------
    n_samples_leaf : array-like of shape (n_samples,)
        The number of training samples in each test sample leaf, for
        each estimators.

    Returns
    -------
    average_path_length : ndarray of shape (n_samples,)
    """
    n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False)
    original_shape = n_samples_leaf.shape

    # Work on a flat 2D view; results are reshaped back at the end.
    counts = n_samples_leaf.reshape((1, -1))
    lengths = np.zeros(counts.shape)

    # Closed-form cases: <=1 sample isolates immediately (length 0),
    # exactly 2 samples need a single split (length 1).
    trivial = counts <= 1
    pair = counts == 2
    general = ~(trivial | pair)

    lengths[trivial] = 0.0
    lengths[pair] = 1.0
    c = counts[general]
    # Harmonic-number approximation: 2*H(n-1) - 2*(n-1)/n,
    # with H(i) ~ ln(i) + Euler's constant.
    lengths[general] = (
        2.0 * (np.log(c - 1.0) + np.euler_gamma) - 2.0 * (c - 1.0) / c
    )

    return lengths.reshape(original_shape)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,734 @@
"""
Soft Voting/Majority Rule classifier and Voting regressor.
This module contains:
- A Soft Voting/Majority Rule classifier for classification estimators.
- A Voting regressor for regression estimators.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from abc import abstractmethod
from numbers import Integral
import numpy as np
from sklearn.base import (
ClassifierMixin,
RegressorMixin,
TransformerMixin,
_fit_context,
clone,
)
from sklearn.ensemble._base import _BaseHeterogeneousEnsemble, _fit_single_estimator
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import Bunch
from sklearn.utils._param_validation import StrOptions
from sklearn.utils._repr_html.estimator import _VisualBlock
from sklearn.utils.metadata_routing import (
MetadataRouter,
MethodMapping,
_raise_for_params,
_routing_enabled,
process_routing,
)
from sklearn.utils.metaestimators import available_if
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import (
_check_feature_names_in,
check_is_fitted,
column_or_1d,
)
class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble):
    """Base class for voting.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    # Constraints validated by scikit-learn's parameter validation machinery.
    _parameter_constraints: dict = {
        "estimators": [list],
        "weights": ["array-like", None],
        "n_jobs": [None, Integral],
        "verbose": ["verbose"],
    }

    def _log_message(self, name, idx, total):
        # Progress message forwarded to joblib when ``verbose`` is enabled.
        if not self.verbose:
            return None
        return f"({idx} of {total}) Processing {name}"

    @property
    def _weights_not_none(self):
        """Get the weights of not `None` estimators."""
        if self.weights is None:
            return None
        # Drop the weights whose estimator has been set to 'drop'.
        return [w for est, w in zip(self.estimators, self.weights) if est[1] != "drop"]

    def _predict(self, X):
        """Collect results from clf.predict calls."""
        # Shape (n_samples, n_estimators): one column of predictions per
        # fitted estimator.
        return np.asarray([est.predict(X) for est in self.estimators_]).T

    @abstractmethod
    def fit(self, X, y, **fit_params):
        """Get common fit operations."""
        names, clfs = self._validate_estimators()

        if self.weights is not None and len(self.weights) != len(self.estimators):
            raise ValueError(
                "Number of `estimators` and weights must be equal; got"
                f" {len(self.weights)} weights, {len(self.estimators)} estimators"
            )

        if _routing_enabled():
            routed_params = process_routing(self, "fit", **fit_params)
        else:
            # Legacy path: only sample_weight is forwarded, identically to
            # every underlying estimator.
            routed_params = Bunch()
            for name in names:
                routed_params[name] = Bunch(fit={})
                if "sample_weight" in fit_params:
                    routed_params[name].fit["sample_weight"] = fit_params[
                        "sample_weight"
                    ]

        # Fit clones of all non-dropped estimators, possibly in parallel.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_single_estimator)(
                clone(clf),
                X,
                y,
                fit_params=routed_params[name]["fit"],
                message_clsname="Voting",
                message=self._log_message(name, idx + 1, len(clfs)),
            )
            for idx, (name, clf) in enumerate(zip(names, clfs))
            if clf != "drop"
        )

        self.named_estimators_ = Bunch()

        # Uses 'drop' as placeholder for dropped estimators
        est_iter = iter(self.estimators_)
        for name, est in self.estimators:
            current_est = est if est == "drop" else next(est_iter)
            self.named_estimators_[name] = current_est

            if hasattr(current_est, "feature_names_in_"):
                self.feature_names_in_ = current_est.feature_names_in_

        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Return class labels or probabilities for each estimator.

        Return predictions for X for each estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)
            Input samples.

        y : ndarray of shape (n_samples,), default=None
            Target values (None for unsupervised transformations).

        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        X_new : ndarray array of shape (n_samples, n_features_new)
            Transformed array.
        """
        return super().fit_transform(X, y, **fit_params)

    @property
    def n_features_in_(self):
        """Number of features seen during :term:`fit`."""
        # For consistency with other estimators we raise an AttributeError so
        # that hasattr() fails if the estimator isn't fitted.
        try:
            check_is_fitted(self)
        except NotFittedError as nfe:
            raise AttributeError(
                "{} object has no n_features_in_ attribute.".format(
                    self.__class__.__name__
                )
            ) from nfe

        # Delegate to the first fitted sub-estimator.
        return self.estimators_[0].n_features_in_

    def _sk_visual_block_(self):
        # HTML-repr helper: show the sub-estimators side by side.
        names, estimators = zip(*self.estimators)
        return _VisualBlock("parallel", estimators, names=names)

    def get_metadata_routing(self):
        """Get metadata routing of this object.

        Please check :ref:`User Guide <metadata_routing>` on how the routing
        mechanism works.

        .. versionadded:: 1.5

        Returns
        -------
        routing : MetadataRouter
            A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
            routing information.
        """
        router = MetadataRouter(owner=self)

        # `self.estimators` is a list of (name, est) tuples
        for name, estimator in self.estimators:
            router.add(
                **{name: estimator},
                method_mapping=MethodMapping().add(callee="fit", caller="fit"),
            )
        return router
class VotingClassifier(ClassifierMixin, _BaseVoting):
    """Soft Voting/Majority Rule classifier for unfitted estimators.

    Read more in the :ref:`User Guide <voting_classifier>`.

    .. versionadded:: 0.17

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
        of those original estimators that will be stored in the class attribute
        ``self.estimators_``. An estimator can be set to ``'drop'`` using
        :meth:`set_params`.

        .. versionchanged:: 0.21
            ``'drop'`` is accepted. Using None was deprecated in 0.22 and
            support was removed in 0.24.

    voting : {'hard', 'soft'}, default='hard'
        If 'hard', uses predicted class labels for majority rule voting.
        Else if 'soft', predicts the class label based on the argmax of
        the sums of the predicted probabilities, which is recommended for
        an ensemble of well-calibrated classifiers.

    weights : array-like of shape (n_classifiers,), default=None
        Sequence of weights (`float` or `int`) to weight the occurrences of
        predicted class labels (`hard` voting) or class probabilities
        before averaging (`soft` voting). Uses uniform weights if `None`.

    n_jobs : int, default=None
        The number of jobs to run in parallel for ``fit``.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionadded:: 0.18

    flatten_transform : bool, default=True
        Affects shape of transform output only when voting='soft'
        If voting='soft' and flatten_transform=True, transform method returns
        matrix with shape (n_samples, n_classifiers * n_classes). If
        flatten_transform=False, it returns
        (n_classifiers, n_samples, n_classes).

    verbose : bool, default=False
        If True, the time elapsed while fitting will be printed as it
        is completed.

        .. versionadded:: 0.23

    Attributes
    ----------
    estimators_ : list of classifiers
        The collection of fitted sub-estimators as defined in ``estimators``
        that are not 'drop'.

    named_estimators_ : :class:`~sklearn.utils.Bunch`
        Attribute to access any fitted sub-estimators by name.

        .. versionadded:: 0.20

    le_ : :class:`~sklearn.preprocessing.LabelEncoder`
        Transformer used to encode the labels during fit and decode during
        prediction.

    classes_ : ndarray of shape (n_classes,)
        The classes labels.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying classifier exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Only defined if the
        underlying estimators expose such an attribute when fit.

        .. versionadded:: 1.0

    See Also
    --------
    VotingRegressor : Prediction voting regressor.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    >>> clf1 = LogisticRegression(random_state=1)
    >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    >>> clf3 = GaussianNB()
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> eclf1 = VotingClassifier(estimators=[
    ...     ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
    >>> eclf1 = eclf1.fit(X, y)
    >>> print(eclf1.predict(X))
    [1 1 1 2 2 2]
    >>> np.array_equal(eclf1.named_estimators_.lr.predict(X),
    ...                eclf1.named_estimators_['lr'].predict(X))
    True
    >>> eclf2 = VotingClassifier(estimators=[
    ...     ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    ...     voting='soft')
    >>> eclf2 = eclf2.fit(X, y)
    >>> print(eclf2.predict(X))
    [1 1 1 2 2 2]

    To drop an estimator, :meth:`set_params` can be used to remove it. Here we
    dropped one of the estimators, resulting in 2 fitted estimators:

    >>> eclf2 = eclf2.set_params(lr='drop')
    >>> eclf2 = eclf2.fit(X, y)
    >>> len(eclf2.estimators_)
    2

    Setting `flatten_transform=True` with `voting='soft'` flattens output shape of
    `transform`:

    >>> eclf3 = VotingClassifier(estimators=[
    ...    ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    ...    voting='soft', weights=[2,1,1],
    ...    flatten_transform=True)
    >>> eclf3 = eclf3.fit(X, y)
    >>> print(eclf3.predict(X))
    [1 1 1 2 2 2]
    >>> print(eclf3.transform(X).shape)
    (6, 6)
    """

    # Extend the base constraints with the classifier-specific parameters.
    _parameter_constraints: dict = {
        **_BaseVoting._parameter_constraints,
        "voting": [StrOptions({"hard", "soft"})],
        "flatten_transform": ["boolean"],
    }

    def __init__(
        self,
        estimators,
        *,
        voting="hard",
        weights=None,
        n_jobs=None,
        flatten_transform=True,
        verbose=False,
    ):
        super().__init__(estimators=estimators)
        self.voting = voting
        self.weights = weights
        self.n_jobs = n_jobs
        self.flatten_transform = flatten_transform
        self.verbose = verbose

    @_fit_context(
        # estimators in VotingClassifier.estimators are not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, **fit_params):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        **fit_params : dict
            Parameters to pass to the underlying estimators.

            .. versionadded:: 1.5

                Only available if `enable_metadata_routing=True`,
                which can be set by using
                ``sklearn.set_config(enable_metadata_routing=True)``.
                See :ref:`Metadata Routing User Guide <metadata_routing>` for
                more details.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        _raise_for_params(fit_params, self, "fit", allow=["sample_weight"])
        y_type = type_of_target(y, input_name="y")
        if y_type in ("unknown", "continuous"):
            # raise a specific ValueError for non-classification tasks
            raise ValueError(
                f"Unknown label type: {y_type}. Maybe you are trying to fit a "
                "classifier, which expects discrete classes on a "
                "regression target with continuous values."
            )
        elif y_type not in ("binary", "multiclass"):
            # raise a NotImplementedError for backward compatibility for non-supported
            # classification tasks
            raise NotImplementedError(
                f"{self.__class__.__name__} only supports binary or multiclass "
                "classification. Multilabel and multi-output classification are not "
                "supported."
            )

        # Encode labels as integers; predictions are decoded back in predict.
        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        transformed_y = self.le_.transform(y)

        return super().fit(X, transformed_y, **fit_params)

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        maj : array-like of shape (n_samples,)
            Predicted class labels.
        """
        check_is_fitted(self)
        if self.voting == "soft":
            # argmax over the weighted average of predicted probabilities.
            maj = np.argmax(self.predict_proba(X), axis=1)

        else:  # 'hard' voting
            predictions = self._predict(X)
            # Per sample: weighted count of votes per class, keep the winner.
            maj = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self._weights_not_none)),
                axis=1,
                arr=predictions,
            )

        # Map encoded labels back to the original class labels.
        maj = self.le_.inverse_transform(maj)

        return maj

    def _collect_probas(self, X):
        """Collect results from clf.predict calls."""
        # Shape (n_classifiers, n_samples, n_classes).
        return np.asarray([clf.predict_proba(X) for clf in self.estimators_])

    def _check_voting(self):
        # Gate for `available_if`: predict_proba only exists for soft voting.
        if self.voting == "hard":
            raise AttributeError(
                f"predict_proba is not available when voting={self.voting!r}"
            )
        return True

    @available_if(_check_voting)
    def predict_proba(self, X):
        """Compute probabilities of possible outcomes for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        avg : array-like of shape (n_samples, n_classes)
            Weighted average probability for each class per sample.
        """
        check_is_fitted(self)
        avg = np.average(
            self._collect_probas(X), axis=0, weights=self._weights_not_none
        )
        return avg

    def transform(self, X):
        """Return class labels or probabilities for X for each estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        probabilities_or_labels
            If `voting='soft'` and `flatten_transform=True`:
                returns ndarray of shape (n_samples, n_classifiers * n_classes),
                being class probabilities calculated by each classifier.
            If `voting='soft' and `flatten_transform=False`:
                ndarray of shape (n_classifiers, n_samples, n_classes)
            If `voting='hard'`:
                ndarray of shape (n_samples, n_classifiers), being
                class labels predicted by each classifier.
        """
        check_is_fitted(self)

        if self.voting == "soft":
            probas = self._collect_probas(X)
            if not self.flatten_transform:
                return probas
            # Flatten (n_classifiers, n_samples, n_classes) column-wise.
            return np.hstack(probas)

        else:
            return self._predict(X)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        if self.voting == "soft" and not self.flatten_transform:
            raise ValueError(
                "get_feature_names_out is not supported when `voting='soft'` and "
                "`flatten_transform=False`"
            )

        _check_feature_names_in(self, input_features, generate_names=False)
        class_name = self.__class__.__name__.lower()

        # Only estimators that have not been dropped produce output columns.
        active_names = [name for name, est in self.estimators if est != "drop"]

        if self.voting == "hard":
            return np.asarray(
                [f"{class_name}_{name}" for name in active_names], dtype=object
            )

        # voting == "soft"
        n_classes = len(self.classes_)

        names_out = [
            f"{class_name}_{name}{i}" for name in active_names for i in range(n_classes)
        ]
        return np.asarray(names_out, dtype=object)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        # The transform output dtype depends on the sub-estimators, so no
        # dtype preservation is guaranteed.
        tags.transformer_tags.preserves_dtype = []
        return tags
class VotingRegressor(RegressorMixin, _BaseVoting):
    """Prediction voting regressor for unfitted estimators.

    A voting regressor is an ensemble meta-estimator that fits several base
    regressors, each on the whole dataset. Then it averages the individual
    predictions to form a final prediction.

    For a detailed example, refer to
    :ref:`sphx_glr_auto_examples_ensemble_plot_voting_regressor.py`.

    Read more in the :ref:`User Guide <voting_regressor>`.

    .. versionadded:: 0.21

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
        of those original estimators that will be stored in the class attribute
        ``self.estimators_``. An estimator can be set to ``'drop'`` using
        :meth:`set_params`.

        .. versionchanged:: 0.21
            ``'drop'`` is accepted. Using None was deprecated in 0.22 and
            support was removed in 0.24.

    weights : array-like of shape (n_regressors,), default=None
        Sequence of weights (`float` or `int`) to weight the occurrences of
        predicted values before averaging. Uses uniform weights if `None`.

    n_jobs : int, default=None
        The number of jobs to run in parallel for ``fit``.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        If True, the time elapsed while fitting will be printed as it
        is completed.

        .. versionadded:: 0.23

    Attributes
    ----------
    estimators_ : list of regressors
        The collection of fitted sub-estimators as defined in ``estimators``
        that are not 'drop'.

    named_estimators_ : :class:`~sklearn.utils.Bunch`
        Attribute to access any fitted sub-estimators by name.

        .. versionadded:: 0.20

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying regressor exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Only defined if the
        underlying estimators expose such an attribute when fit.

        .. versionadded:: 1.0

    See Also
    --------
    VotingClassifier : Soft Voting/Majority Rule classifier.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from sklearn.ensemble import VotingRegressor
    >>> from sklearn.neighbors import KNeighborsRegressor
    >>> r1 = LinearRegression()
    >>> r2 = RandomForestRegressor(n_estimators=10, random_state=1)
    >>> r3 = KNeighborsRegressor()
    >>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])
    >>> y = np.array([2, 6, 12, 20, 30, 42])
    >>> er = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)])
    >>> print(er.fit(X, y).predict(X))
    [ 6.8  8.4 12.5 17.8 26   34  ]

    In the following example, we drop the `'lr'` estimator with
    :meth:`~VotingRegressor.set_params` and fit the remaining two estimators:

    >>> er = er.set_params(lr='drop')
    >>> er = er.fit(X, y)
    >>> len(er.estimators_)
    2
    """

    def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False):
        super().__init__(estimators=estimators)
        self.weights = weights
        self.n_jobs = n_jobs
        self.verbose = verbose

    @_fit_context(
        # estimators in VotingRegressor.estimators are not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, **fit_params):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        **fit_params : dict
            Parameters to pass to the underlying estimators.

            .. versionadded:: 1.5

                Only available if `enable_metadata_routing=True`,
                which can be set by using
                ``sklearn.set_config(enable_metadata_routing=True)``.
                See :ref:`Metadata Routing User Guide <metadata_routing>` for
                more details.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        _raise_for_params(fit_params, self, "fit", allow=["sample_weight"])
        # Flatten column vectors to 1D targets (with a warning), as other
        # regressors do.
        y = column_or_1d(y, warn=True)
        return super().fit(X, y, **fit_params)

    def predict(self, X):
        """Predict regression target for X.

        The predicted regression target of an input sample is computed as the
        mean predicted regression targets of the estimators in the ensemble.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted values.
        """
        check_is_fitted(self)
        # Weighted mean across the per-estimator prediction columns.
        return np.average(self._predict(X), axis=1, weights=self._weights_not_none)

    def transform(self, X):
        """Return predictions for X for each estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        predictions : ndarray of shape (n_samples, n_classifiers)
            Values predicted by each regressor.
        """
        check_is_fitted(self)
        return self._predict(X)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        _check_feature_names_in(self, input_features, generate_names=False)
        class_name = self.__class__.__name__.lower()
        # One output column per estimator that has not been dropped.
        return np.asarray(
            [f"{class_name}_{name}" for name, est in self.estimators if est != "drop"],
            dtype=object,
        )

View File

@@ -0,0 +1,9 @@
# Build the Cython extension backing sklearn.ensemble's gradient boosting,
# generating C from the .pyx source and linking the shared tree utilities.
py.extension_module(
  '_gradient_boosting',
  [cython_gen.process('_gradient_boosting.pyx')] + utils_cython_tree,
  dependencies: [np_dep],
  subdir: 'sklearn/ensemble',
  install: true
)

# Descend into the histogram-based gradient boosting sub-package.
subdir('_hist_gradient_boosting')

View File

@@ -0,0 +1,109 @@
"""
Testing for the base module (sklearn.ensemble.base).
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from collections import OrderedDict
import numpy as np
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
def test_base():
    # Check BaseEnsemble methods.
    ensemble = BaggingClassifier(
        estimator=Perceptron(random_state=None), n_estimators=3
    )

    iris = load_iris()
    ensemble.fit(iris.data, iris.target)
    ensemble.estimators_ = []  # empty the list and create estimators manually

    # Three appended estimators plus one built with append=False.
    ensemble._make_estimator()
    random_state = np.random.RandomState(3)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(random_state=random_state)
    ensemble._make_estimator(append=False)

    assert 3 == len(ensemble)
    assert 3 == len(ensemble.estimators_)

    assert isinstance(ensemble[0], Perceptron)
    assert ensemble[0].random_state is None
    # _make_estimator draws integer seeds from the provided RandomState, so
    # consecutive estimators must get distinct int seeds.
    assert isinstance(ensemble[1].random_state, int)
    assert isinstance(ensemble[2].random_state, int)
    assert ensemble[1].random_state != ensemble[2].random_state

    # n_estimators given as a NumPy integer must also be accepted.
    np_int_ensemble = BaggingClassifier(
        estimator=Perceptron(), n_estimators=np.int32(3)
    )
    np_int_ensemble.fit(iris.data, iris.target)
def test_set_random_states():
    # Linear Discriminant Analysis doesn't have random state: smoke test
    _set_random_states(LinearDiscriminantAnalysis(), random_state=17)

    clf1 = Perceptron(random_state=None)
    assert clf1.random_state is None
    # check random_state is None still sets
    _set_random_states(clf1, None)
    assert isinstance(clf1.random_state, int)

    # check random_state fixes results in consistent initialisation
    _set_random_states(clf1, 3)
    assert isinstance(clf1.random_state, int)
    clf2 = Perceptron(random_state=None)
    _set_random_states(clf2, 3)
    assert clf1.random_state == clf2.random_state

    # nested random_state: seeds must reach estimators inside meta-estimators
    def make_steps():
        return [
            ("sel", SelectFromModel(Perceptron(random_state=None))),
            ("clf", Perceptron(random_state=None)),
        ]

    est1 = Pipeline(make_steps())
    _set_random_states(est1, 3)
    assert isinstance(est1.steps[0][1].estimator.random_state, int)
    assert isinstance(est1.steps[1][1].random_state, int)
    # distinct parameters should receive distinct seeds
    assert (
        est1.get_params()["sel__estimator__random_state"]
        != est1.get_params()["clf__random_state"]
    )

    # ensure multiple random_state parameters are invariant to get_params()
    # iteration order

    class AlphaParamPipeline(Pipeline):
        def get_params(self, *args, **kwargs):
            params = Pipeline.get_params(self, *args, **kwargs).items()
            return OrderedDict(sorted(params))

    class RevParamPipeline(Pipeline):
        def get_params(self, *args, **kwargs):
            params = Pipeline.get_params(self, *args, **kwargs).items()
            return OrderedDict(sorted(params, reverse=True))

    for cls in [AlphaParamPipeline, RevParamPipeline]:
        est2 = cls(make_steps())
        _set_random_states(est2, 3)
        # seeds must match est1's regardless of parameter iteration order
        assert (
            est1.get_params()["sel__estimator__random_state"]
            == est2.get_params()["sel__estimator__random_state"]
        )
        assert (
            est1.get_params()["clf__random_state"]
            == est2.get_params()["clf__random_state"]
        )

View File

@@ -0,0 +1,263 @@
import numpy as np
import pytest
from sklearn.base import ClassifierMixin, clone, is_classifier
from sklearn.datasets import (
load_diabetes,
load_iris,
make_classification,
make_regression,
)
from sklearn.ensemble import (
RandomForestClassifier,
RandomForestRegressor,
StackingClassifier,
StackingRegressor,
VotingClassifier,
VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, SVR, LinearSVC
X, y = load_iris(return_X_y=True)
X_r, y_r = load_diabetes(return_X_y=True)
@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC()),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC()),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", SVR(kernel="linear")),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", SVR(kernel="linear")),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
    """Check that `estimators`, `estimators_`, `named_estimators` and
    `named_estimators_` behave consistently across all heterogeneous
    ensemble classes, including after `set_params()`."""
    estimator = clone(estimator)  # Avoid side effects from shared instances

    # before fit
    assert "svm" in estimator.named_estimators
    assert estimator.named_estimators.svm is estimator.estimators[1][1]
    assert estimator.named_estimators.svm is estimator.named_estimators["svm"]

    # check fitted attributes
    estimator.fit(X, y)
    assert len(estimator.named_estimators) == 3
    assert len(estimator.named_estimators_) == 3
    expected_names = sorted(["lr", "svm", "rf"])
    assert sorted(list(estimator.named_estimators_.keys())) == expected_names

    # check that set_params() does not add a new attribute
    ens_reset = clone(estimator)
    replacement_svm = SVC() if is_classifier(estimator) else SVR()
    ens_reset.set_params(svm=replacement_svm).fit(X, y)
    assert not hasattr(ens_reset, "svm")
    for name in ("lr", "rf"):
        assert (
            ens_reset.named_estimators[name].get_params()
            == estimator.named_estimators[name].get_params()
        )

    # check the behavior when setting and dropping an estimator
    ens_drop = clone(estimator)
    ens_drop.set_params(svm="drop")
    ens_drop.fit(X, y)
    assert len(ens_drop.named_estimators) == 3
    assert ens_drop.named_estimators.svm == "drop"
    assert len(ens_drop.named_estimators_) == 3
    assert sorted(list(ens_drop.named_estimators_.keys())) == expected_names
    svm_type = type(estimator.named_estimators.svm)
    for sub_est in ens_drop.named_estimators_:
        # check that the correspondence is correct
        assert not isinstance(sub_est, svm_type)

    # check that we can set the parameters of the underlying classifier
    estimator.set_params(svm__C=10.0)
    estimator.set_params(rf__max_depth=5)
    params = estimator.get_params()
    assert params["svm__C"] == params["svm"].get_params()["C"]
    assert params["rf__max_depth"] == params["rf"].get_params()["max_depth"]
@pytest.mark.parametrize(
    "Ensemble",
    [VotingClassifier, StackingRegressor, VotingRegressor],
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
    """Fitting must fail when the underlying estimators are not of the
    expected kind (classifier vs. regressor).

    StackingClassifier can legitimately wrap a regressor, so it is not
    covered here.
    """
    if issubclass(Ensemble, ClassifierMixin):
        X, y = make_classification(n_samples=10)
        wrong_estimators = [("lr", LinearRegression())]
        expected_kind = "classifier"
    else:
        X, y = make_regression(n_samples=10)
        wrong_estimators = [("lr", LogisticRegression())]
        expected_kind = "regressor"
    err_msg = f"should be a {expected_kind}"
    with pytest.raises(ValueError, match=err_msg):
        Ensemble(estimators=wrong_estimators).fit(X, y)
@pytest.mark.parametrize(
    "X, y, Ensemble",
    [
        (*make_classification(n_samples=10), StackingClassifier),
        (*make_classification(n_samples=10), VotingClassifier),
        (*make_regression(n_samples=10), StackingRegressor),
        (*make_regression(n_samples=10), VotingRegressor),
    ],
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
    """Invalid estimator names (dunder, duplicated, or conflicting with
    constructor arguments) must raise an explicit error at fit time."""
    base_cls = (
        LogisticRegression
        if issubclass(Ensemble, ClassifierMixin)
        else LinearRegression
    )

    # raise an error when the name contains dunder
    ensemble = Ensemble(estimators=[("lr__", base_cls())])
    err_msg = r"Estimator names must not contain __: got \['lr__'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the name is not unique
    ensemble = Ensemble(estimators=[("lr", base_cls()), ("lr", base_cls())])
    err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the name conflicts with the parameters
    ensemble = Ensemble(estimators=[("estimators", base_cls())])
    err_msg = "Estimator names conflict with constructor arguments"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)
@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(estimators=[("lr", LinearRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(estimators=[("lr", LinearRegression())]),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
    """Dropping every underlying estimator must raise a consistent error."""
    estimator.set_params(lr="drop")
    with pytest.raises(ValueError, match="All estimators are dropped."):
        estimator.fit(X, y)
@pytest.mark.parametrize(
    "Ensemble, Estimator, X, y",
    [
        (StackingClassifier, LogisticRegression, X, y),
        (StackingRegressor, LinearRegression, X_r, y_r),
        (VotingClassifier, LogisticRegression, X, y),
        (VotingRegressor, LinearRegression, X_r, y_r),
    ],
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
    # check that Voting and Stacking predictor delegate the missing values
    # validation to the underlying estimator.
    X = X.copy()
    nan_mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
    X[nan_mask] = np.nan
    imputing_pipe = make_pipeline(SimpleImputer(), Estimator())
    ensemble = Ensemble(
        estimators=[("pipe1", imputing_pipe), ("pipe2", imputing_pipe)]
    )
    ensemble.fit(X, y).score(X, y)

View File

@@ -0,0 +1,395 @@
"""
Testing for Isolation Forest algorithm (sklearn.ensemble.iforest).
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from unittest.mock import Mock, patch
import numpy as np
import pytest
from joblib import parallel_backend
from sklearn.datasets import load_diabetes, load_iris, make_classification
from sklearn.ensemble import IsolationForest
from sklearn.ensemble._iforest import _average_path_length
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
ignore_warnings,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
# load iris & diabetes dataset
iris = load_iris()
diabetes = load_diabetes()
def test_iforest(global_random_seed):
    """Check Isolation Forest for various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])
    param_grid = ParameterGrid(
        {"n_estimators": [3], "max_samples": [0.5, 1.0, 3], "bootstrap": [True, False]}
    )
    with ignore_warnings():
        for config in param_grid:
            forest = IsolationForest(random_state=global_random_seed, **config)
            forest.fit(X_train).predict(X_test)
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_iforest_sparse(global_random_seed, sparse_container):
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(global_random_seed)
    X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng)
    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)

    for config in ParameterGrid(
        {"max_samples": [0.5, 1.0], "bootstrap": [True, False]}
    ):
        # Trained on sparse format
        sparse_results = (
            IsolationForest(
                n_estimators=10, random_state=global_random_seed, **config
            )
            .fit(X_train_sparse)
            .predict(X_test_sparse)
        )
        # Trained on dense format
        dense_results = (
            IsolationForest(
                n_estimators=10, random_state=global_random_seed, **config
            )
            .fit(X_train)
            .predict(X_test)
        )
        assert_array_equal(sparse_results, dense_results)
def test_iforest_error():
    """Test that it gives proper exception on deficient input."""
    X = iris.data

    # The dataset has less than 256 samples, explicitly setting
    # max_samples > n_samples should result in a warning. If not set
    # explicitly there should be no warning
    warn_msg = "max_samples will be set to n_samples for estimation"
    with pytest.warns(UserWarning, match=warn_msg):
        IsolationForest(max_samples=1000).fit(X)
    for max_samples in ("auto", np.int64(2)):
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            IsolationForest(max_samples=max_samples).fit(X)

    # test X_test n_features match X_train one:
    with pytest.raises(ValueError):
        IsolationForest().fit(X).predict(X[:, 1:])
def test_recalculate_max_depth():
    """Check max_depth recalculation when max_samples is reset to n_samples"""
    X = iris.data
    expected_depth = int(np.ceil(np.log2(X.shape[0])))
    forest = IsolationForest().fit(X)
    assert all(tree.max_depth == expected_depth for tree in forest.estimators_)
def test_max_samples_attribute():
    """`max_samples_` is capped at n_samples and scales with a float input."""
    X = iris.data
    n_samples = X.shape[0]

    assert IsolationForest().fit(X).max_samples_ == n_samples

    clipped = IsolationForest(max_samples=500)
    warn_msg = "max_samples will be set to n_samples for estimation"
    with pytest.warns(UserWarning, match=warn_msg):
        clipped.fit(X)
    assert clipped.max_samples_ == n_samples

    fractional = IsolationForest(max_samples=0.4).fit(X)
    assert fractional.max_samples_ == 0.4 * n_samples
def test_iforest_parallel_regression(global_random_seed):
    """Check parallel regression."""
    rng = check_random_state(global_random_seed)
    X_train, X_test = train_test_split(diabetes.data, random_state=rng)

    forest = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train)
    forest.set_params(n_jobs=1)
    pred_one_job = forest.predict(X_test)
    forest.set_params(n_jobs=2)
    pred_two_jobs = forest.predict(X_test)
    assert_array_almost_equal(pred_one_job, pred_two_jobs)

    forest_serial = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(
        X_train
    )
    assert_array_almost_equal(pred_one_job, forest_serial.predict(X_test))
def test_iforest_performance(global_random_seed):
    """Test Isolation Forest performs well"""
    # Generate train/test data
    rng = check_random_state(global_random_seed)
    X_blob = 0.3 * rng.randn(600, 2)
    X_all = rng.permutation(np.vstack((X_blob + 2, X_blob - 2)))
    X_train = X_all[:1000]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-1, high=1, size=(200, 2))
    X_test = np.vstack((X_all[1000:], X_outliers))
    y_test = np.array([0] * 200 + [1] * 200)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    anomaly_score = -clf.decision_function(X_test)

    # check that there is at most 6 errors (false positive or false negative)
    assert roc_auc_score(y_test, anomaly_score) > 0.98
@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_works(contamination, global_random_seed):
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]

    # Test IsolationForest
    clf = IsolationForest(
        random_state=global_random_seed, contamination=contamination
    )
    clf.fit(X)
    anomaly_score = -clf.decision_function(X)
    labels = clf.predict(X)

    # assert detect outliers:
    assert np.min(anomaly_score[-2:]) > np.max(anomaly_score[:-2])
    assert_array_equal(labels, [1] * 6 + [-1] * 2)
def test_max_samples_consistency():
    # Make sure validated max_samples in iforest and BaseBagging are identical
    forest = IsolationForest().fit(iris.data)
    assert forest.max_samples_ == forest._max_samples
def test_iforest_subsampled_features():
    # It tests non-regression for #5732 which failed at predict.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data[:50], diabetes.target[:50], random_state=rng
    )
    forest = IsolationForest(max_features=0.8)
    forest.fit(X_train, y_train)
    forest.predict(X_test)
def test_iforest_average_path_length():
    # It tests non-regression for #8549 which used the wrong formula
    # for average path length, strictly for the integer case
    # Updated to check average path length when input is <= 2 (issue #11839)
    expected_5 = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
    expected_999 = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0

    single_value_cases = [
        (0, 0.0),
        (1, 0.0),
        (2, 1.0),
        (5, expected_5),
        (999, expected_999),
    ]
    for n_samples, expected in single_value_cases:
        assert_allclose(_average_path_length([n_samples]), [expected])

    assert_allclose(
        _average_path_length(np.array([1, 2, 5, 999])),
        [0.0, 1.0, expected_5, expected_999],
    )

    # _average_path_length is increasing
    avg_path_length = _average_path_length(np.arange(5))
    assert_array_equal(avg_path_length, np.sort(avg_path_length))
def test_score_samples():
    """`score_samples` equals `decision_function` shifted by `offset_`."""
    X_train = [[1, 1], [1, 2], [2, 1]]
    X_query = [[2.0, 2.0]]
    clf_contaminated = IsolationForest(contamination=0.1).fit(X_train)
    clf_default = IsolationForest().fit(X_train)

    for clf in (clf_contaminated, clf_default):
        assert_array_equal(
            clf.score_samples(X_query),
            clf.decision_function(X_query) + clf.offset_,
        )
    assert_array_equal(
        clf_contaminated.score_samples(X_query), clf_default.score_samples(X_query)
    )
def test_iforest_warm_start():
    """Test iterative addition of iTrees to an iForest"""
    rng = check_random_state(0)
    X = rng.randn(20, 2)

    # fit first 10 trees
    forest = IsolationForest(
        n_estimators=10, max_samples=20, random_state=rng, warm_start=True
    )
    forest.fit(X)
    first_tree = forest.estimators_[0]  # remember the 1st tree

    # fit another 10 trees
    forest.set_params(n_estimators=20)
    forest.fit(X)

    # expecting 20 fitted trees and no overwritten trees
    assert len(forest.estimators_) == 20
    assert forest.estimators_[0] is first_tree
# mock get_chunk_n_rows to actually test more than one chunk (here one
# chunk has 3 rows):
@patch(
    "sklearn.ensemble._iforest.get_chunk_n_rows",
    side_effect=Mock(return_value=3),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
@pytest.mark.thread_unsafe  # monkeypatched code
def test_iforest_chunks_works1(
    mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
    # re-run the reference scenario and count the chunked scoring passes
    test_iforest_works(contamination, global_random_seed)
    assert mocked_get_chunk.call_count == n_predict_calls
# idem with chunk_size = 10 rows
@patch(
    "sklearn.ensemble._iforest.get_chunk_n_rows",
    side_effect=Mock(return_value=10),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
@pytest.mark.thread_unsafe  # monkeypatched code
def test_iforest_chunks_works2(
    mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
    # re-run the reference scenario and count the chunked scoring passes
    test_iforest_works(contamination, global_random_seed)
    assert mocked_get_chunk.call_count == n_predict_calls
def test_iforest_with_uniform_data():
    """Test whether iforest predicts inliers when using uniform data"""
    # 2-d array of all 1s
    X_const = np.ones((100, 10))
    model = IsolationForest()
    model.fit(X_const)

    rng = np.random.RandomState(0)

    assert (model.predict(X_const) == 1).all()
    assert (model.predict(rng.randn(100, 10)) == 1).all()
    assert (model.predict(X_const + 1) == 1).all()
    assert (model.predict(X_const - 1) == 1).all()

    # 2-d array where columns contain the same value across rows
    X_repeated = np.repeat(rng.randn(1, 10), 100, 0)
    model = IsolationForest()
    model.fit(X_repeated)
    assert (model.predict(X_repeated) == 1).all()
    assert (model.predict(rng.randn(100, 10)) == 1).all()
    assert (model.predict(np.ones((100, 10))) == 1).all()

    # Single row
    X_single = rng.randn(1, 10)
    model = IsolationForest()
    model.fit(X_single)
    assert (model.predict(X_single) == 1).all()
    assert (model.predict(rng.randn(100, 10)) == 1).all()
    assert (model.predict(np.ones((100, 10))) == 1).all()
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_iforest_with_n_jobs_does_not_segfault(csc_container):
    """Check that Isolation Forest does not segfault with n_jobs=2

    Non-regression test for #23252
    """
    X_dense, _ = make_classification(n_samples=85_000, n_features=100, random_state=0)
    IsolationForest(n_estimators=10, max_samples=256, n_jobs=2).fit(
        csc_container(X_dense)
    )
def test_iforest_preserve_feature_names():
    """Check that feature names are preserved when contamination is not "auto".

    Feature names are required for consistency checks during scoring.

    Non-regression test for Issue #25844
    """
    pd = pytest.importorskip("pandas")
    rng = np.random.RandomState(0)
    X_df = pd.DataFrame(data=rng.randn(4), columns=["a"])
    model = IsolationForest(random_state=0, contamination=0.05)

    # fitting must not emit a feature-name mismatch warning
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        model.fit(X_df)
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_iforest_sparse_input_float_contamination(sparse_container):
    """Check that `IsolationForest` accepts sparse matrix input and float value for
    contamination.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27626
    """
    contamination = 0.1
    X_dense, _ = make_classification(n_samples=50, n_features=4, random_state=0)
    X_sparse = sparse_container(X_dense)
    X_sparse.sort_indices()

    forest = IsolationForest(
        n_estimators=5, contamination=contamination, random_state=0
    ).fit(X_sparse)
    scores = forest.decision_function(X_sparse)
    outlier_fraction = (scores < 0).sum() / X_sparse.shape[0]
    assert outlier_fraction == pytest.approx(contamination)
@pytest.mark.parametrize("n_jobs", [1, 2])
@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_predict_parallel(global_random_seed, contamination, n_jobs):
    """Check that `IsolationForest.predict` is parallelized."""
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]

    # Test IsolationForest
    clf = IsolationForest(
        random_state=global_random_seed, contamination=contamination, n_jobs=None
    )
    clf.fit(X)
    anomaly_score = -clf.decision_function(X)
    pred_sequential = clf.predict(X)

    # assert detect outliers:
    assert np.min(anomaly_score[-2:]) > np.max(anomaly_score[:-2])
    assert_array_equal(pred_sequential, [1] * 6 + [-1] * 2)

    clf_parallel = IsolationForest(
        random_state=global_random_seed, contamination=contamination, n_jobs=-1
    )
    clf_parallel.fit(X)
    with parallel_backend("threading", n_jobs=n_jobs):
        pred_parallel = clf_parallel.predict(X)

    # assert the same results as non-parallel
    assert_array_equal(pred_sequential, pred_parallel)

View File

@@ -0,0 +1,795 @@
"""Testing for the VotingClassifier and VotingRegressor"""
import re
import numpy as np
import pytest
from sklearn import config_context, datasets
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_multilabel_classification
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
RandomForestClassifier,
RandomForestRegressor,
VotingClassifier,
VotingRegressor,
)
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tests.metadata_routing_common import (
ConsumingClassifier,
ConsumingRegressor,
_Registry,
check_recorded_metadata,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
# Scaled to solve ConvergenceWarning throw by Logistic Regression
X_scaled = StandardScaler().fit_transform(X)
X_r, y_r = datasets.load_diabetes(return_X_y=True)
@pytest.mark.parametrize(
    "params, err_msg",
    [
        (
            {"estimators": []},
            "Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
        ),
        (
            {"estimators": [LogisticRegression()]},
            "Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
        ),
        (
            {"estimators": [(213, LogisticRegression())]},
            "Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
        ),
        (
            {"estimators": [("lr", LogisticRegression())], "weights": [1, 2]},
            "Number of `estimators` and weights must be equal",
        ),
    ],
)
def test_voting_classifier_estimator_init(params, err_msg):
    """Invalid `estimators`/`weights` combinations must raise at fit time."""
    with pytest.raises(ValueError, match=err_msg):
        VotingClassifier(**params).fit(X, y)
def test_predictproba_hardvoting():
    """`predict_proba` must stay unavailable with hard voting."""
    eclf = VotingClassifier(
        estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
        voting="hard",
    )

    inner_msg = "predict_proba is not available when voting='hard'"
    outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        eclf.predict_proba
    cause = exec_info.value.__cause__
    assert isinstance(cause, AttributeError)
    assert inner_msg in str(cause)

    # the attribute is absent both before and after fitting
    assert not hasattr(eclf, "predict_proba")
    eclf.fit(X_scaled, y)
    assert not hasattr(eclf, "predict_proba")
def test_notfitted():
    """Calling prediction methods before `fit` must raise NotFittedError."""
    eclf = VotingClassifier(
        estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
        voting="soft",
    )
    ereg = VotingRegressor([("dr", DummyRegressor())])
    msg = (
        "This %s instance is not fitted yet. Call 'fit'"
        " with appropriate arguments before using this estimator."
    )
    clf_msg = msg % "VotingClassifier"
    for method in (eclf.predict, eclf.predict_proba, eclf.transform):
        with pytest.raises(NotFittedError, match=clf_msg):
            method(X)
    reg_msg = msg % "VotingRegressor"
    for method in (ereg.predict, ereg.transform):
        with pytest.raises(NotFittedError, match=reg_msg):
            method(X_r)
def test_majority_label_iris(global_random_seed):
    """Check classification by majority label on dataset iris."""
    estimators = [
        ("lr", LogisticRegression(random_state=global_random_seed)),
        (
            "rf",
            RandomForestClassifier(n_estimators=10, random_state=global_random_seed),
        ),
        ("gnb", GaussianNB()),
    ]
    eclf = VotingClassifier(estimators=estimators, voting="hard")
    assert cross_val_score(eclf, X, y, scoring="accuracy").mean() >= 0.9
def test_tie_situation():
    """Check voting classifier selects smaller class label in tie situation."""
    lr = LogisticRegression(random_state=123)
    rf = RandomForestClassifier(random_state=123)
    eclf = VotingClassifier(estimators=[("lr", lr), ("rf", rf)], voting="hard")
    # the two members disagree on sample 52 ...
    assert lr.fit(X, y).predict(X)[52] == 2
    assert rf.fit(X, y).predict(X)[52] == 1
    # ... and the ensemble resolves the tie towards the smaller label
    assert eclf.fit(X, y).predict(X)[52] == 1
def test_weights_iris(global_random_seed):
    """Check classification by average probabilities on dataset iris."""
    eclf = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=global_random_seed)),
            (
                "rf",
                RandomForestClassifier(
                    n_estimators=10, random_state=global_random_seed
                ),
            ),
            ("gnb", GaussianNB()),
        ],
        voting="soft",
        weights=[1, 2, 10],
    )
    assert cross_val_score(eclf, X_scaled, y, scoring="accuracy").mean() >= 0.9
def test_weights_regressor():
    """Check weighted average regression prediction on diabetes dataset."""
    reg_mean = DummyRegressor(strategy="mean")
    reg_median = DummyRegressor(strategy="median")
    reg_quantile = DummyRegressor(strategy="quantile", quantile=0.2)
    named_regs = [
        ("mean", reg_mean),
        ("median", reg_median),
        ("quantile", reg_quantile),
    ]
    weights = [1, 2, 10]

    ereg = VotingRegressor(named_regs, weights=weights)
    X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
        X_r, y_r, test_size=0.25
    )
    individual_preds = [
        reg.fit(X_r_train, y_r_train).predict(X_r_test) for _, reg in named_regs
    ]
    ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)
    expected = np.average(np.asarray(individual_preds), axis=0, weights=weights)
    assert_almost_equal(ereg_pred, expected, decimal=2)

    # None weights must be equivalent to all-equal weights
    ereg_weights_none = VotingRegressor(named_regs, weights=None)
    ereg_weights_equal = VotingRegressor(named_regs, weights=[1, 1, 1])
    ereg_weights_none.fit(X_r_train, y_r_train)
    ereg_weights_equal.fit(X_r_train, y_r_train)
    assert_almost_equal(
        ereg_weights_none.predict(X_r_test),
        ereg_weights_equal.predict(X_r_test),
        decimal=2,
    )
def test_predict_on_toy_problem(global_random_seed):
    """Manually check predicted class labels for toy dataset."""
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    clf3 = GaussianNB()

    X = np.array(
        [[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]]
    )
    y = np.array([1, 1, 1, 2, 2, 2])
    expected = [1, 1, 1, 2, 2, 2]

    for clf in (clf1, clf2, clf3):
        assert_array_equal(clf.fit(X, y).predict(X), expected)

    for voting in ("hard", "soft"):
        eclf = VotingClassifier(
            estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
            voting=voting,
            weights=[1, 1, 1],
        )
        assert_array_equal(eclf.fit(X, y).predict(X), expected)
def test_predict_proba_on_toy_problem():
    """Calculate predicted probabilities on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    # reference per-estimator probabilities
    clf1_res = np.array(
        [
            [0.59790391, 0.40209609],
            [0.57622162, 0.42377838],
            [0.50728456, 0.49271544],
            [0.40241774, 0.59758226],
        ]
    )
    clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]])
    clf3_res = np.array(
        [[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0.0, 1.0], [0.0, 1.0]]
    )
    # weighted (2, 1, 1) average of the individual probabilities
    expected = (2 * clf1_res + clf2_res + clf3_res) / 4

    eclf = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="soft",
        weights=[2, 1, 1],
    )
    eclf_res = eclf.fit(X, y).predict_proba(X)
    assert_almost_equal(expected[0][0], eclf_res[0][0], decimal=1)
    assert_almost_equal(expected[1][1], eclf_res[1][1], decimal=1)
    assert_almost_equal(expected[2][1], eclf_res[2][1], decimal=1)
    assert_almost_equal(expected[3][1], eclf_res[3][1], decimal=1)

    inner_msg = "predict_proba is not available when voting='hard'"
    outer_msg = "'VotingClassifier' has no attribute 'predict_proba'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        eclf = VotingClassifier(
            estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
        )
        eclf.fit(X, y).predict_proba(X)
    assert isinstance(exec_info.value.__cause__, AttributeError)
    assert inner_msg in str(exec_info.value.__cause__)
def test_multilabel():
    """Check if error is raised for multilabel classification."""
    X, y = make_multilabel_classification(
        n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123
    )
    ovr_clf = OneVsRestClassifier(SVC(kernel="linear"))
    eclf = VotingClassifier(estimators=[("ovr", ovr_clf)], voting="hard")
    try:
        eclf.fit(X, y)
    except NotImplementedError:
        # multilabel input is allowed to be rejected
        return
def test_gridsearch():
    """Check GridSearch support."""
    eclf = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=1)),
            ("rf", RandomForestClassifier(random_state=1, n_estimators=3)),
            ("gnb", GaussianNB()),
        ],
        voting="soft",
    )
    param_grid = {
        "lr__C": [1.0, 100.0],
        "voting": ["soft", "hard"],
        "weights": [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]],
    }
    GridSearchCV(estimator=eclf, param_grid=param_grid, cv=2).fit(X_scaled, y)
def test_parallel_fit(global_random_seed):
    """Check parallel backend of VotingClassifier on toy dataset."""
    estimators = [
        ("lr", LogisticRegression(random_state=global_random_seed)),
        (
            "rf",
            RandomForestClassifier(n_estimators=10, random_state=global_random_seed),
        ),
        ("gnb", GaussianNB()),
    ]
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf_serial = VotingClassifier(estimators=estimators, voting="soft", n_jobs=1).fit(
        X, y
    )
    eclf_parallel = VotingClassifier(
        estimators=estimators, voting="soft", n_jobs=2
    ).fit(X, y)
    assert_array_equal(eclf_serial.predict(X), eclf_parallel.predict(X))
    assert_array_almost_equal(
        eclf_serial.predict_proba(X), eclf_parallel.predict_proba(X)
    )
def test_sample_weight(global_random_seed):
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    clf3 = CalibratedClassifierCV(SVC(random_state=global_random_seed), ensemble=False)

    # unit weights must be equivalent to no weights at all
    eclf_unit_weights = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
    ).fit(X_scaled, y, sample_weight=np.ones((len(y),)))
    eclf_no_weights = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
    ).fit(X_scaled, y)
    assert_array_equal(
        eclf_unit_weights.predict(X_scaled), eclf_no_weights.predict(X_scaled)
    )
    assert_array_almost_equal(
        eclf_unit_weights.predict_proba(X_scaled),
        eclf_no_weights.predict_proba(X_scaled),
    )

    # weights must be forwarded to the underlying estimator
    sample_weight = np.random.RandomState(global_random_seed).uniform(size=(len(y),))
    eclf_single = VotingClassifier(estimators=[("lr", clf1)], voting="soft")
    eclf_single.fit(X_scaled, y, sample_weight=sample_weight)
    clf1.fit(X_scaled, y, sample_weight)
    assert_array_equal(eclf_single.predict(X_scaled), clf1.predict(X_scaled))
    assert_array_almost_equal(
        eclf_single.predict_proba(X_scaled), clf1.predict_proba(X_scaled)
    )

    # check that an error is raised and indicative if sample_weight is not
    # supported.
    clf4 = KNeighborsClassifier()
    eclf_unsupported = VotingClassifier(
        estimators=[("lr", clf1), ("svc", clf3), ("knn", clf4)], voting="soft"
    )
    msg = "Underlying estimator KNeighborsClassifier does not support sample weights."
    with pytest.raises(TypeError, match=msg):
        eclf_unsupported.fit(X_scaled, y, sample_weight=sample_weight)

    # check that _fit_single_estimator will raise the right error
    # it should raise the original error if this is not linked to sample_weight
    class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
        def fit(self, X_scaled, y, sample_weight):
            raise TypeError("Error unrelated to sample_weight.")

    with pytest.raises(TypeError, match="Error unrelated to sample_weight"):
        ClassifierErrorFit().fit(X_scaled, y, sample_weight=sample_weight)
def test_sample_weight_kwargs():
    """Check that VotingClassifier passes sample_weight as kwargs"""
    class _SpyClassifier(ClassifierMixin, BaseEstimator):
        """Asserts that ``sample_weight`` arrives via keyword arguments."""
        def fit(self, X, y, *args, **kwargs):
            # The ensemble must forward sample_weight by keyword, never
            # positionally.
            assert "sample_weight" in kwargs
    ensemble = VotingClassifier(estimators=[("mock", _SpyClassifier())], voting="soft")
    # Fitting must complete without raising.
    ensemble.fit(X, y, sample_weight=np.ones(len(y)))
def test_voting_classifier_set_params(global_random_seed):
    # check equivalence in the output when setting underlying estimators
    # via the constructor vs. swapping one in afterwards with set_params.
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(
        n_estimators=10, random_state=global_random_seed, max_depth=None
    )
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(
        [("lr", clf1), ("rf", clf2)], voting="soft", weights=[1, 2]
    ).fit(X_scaled, y)
    # eclf2 starts with GaussianNB under the name "nb"; set_params replaces
    # it with the random forest, so predictions must match eclf1.
    eclf2 = VotingClassifier(
        [("lr", clf1), ("nb", clf3)], voting="soft", weights=[1, 2]
    )
    eclf2.set_params(nb=clf2).fit(X_scaled, y)
    assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
    assert_array_almost_equal(
        eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
    )
    # The `estimators` attribute must reflect the swapped-in estimator.
    assert eclf2.estimators[0][1].get_params() == clf1.get_params()
    assert eclf2.estimators[1][1].get_params() == clf2.get_params()
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    # eclf1 neutralises the forest with a zero weight; eclf2 drops it
    # entirely via set_params -- hard-voting predictions must then match.
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 0, 0.5],
    ).fit(X, y)
    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 1, 0.5],
    )
    eclf2.set_params(rf="drop").fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    # The dropped estimator stays listed in `estimators` but is excluded
    # from the fitted `estimators_`.
    assert dict(eclf2.estimators)["rf"] == "drop"
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_
    )
    assert eclf2.get_params()["rf"] == "drop"
    # The zero-weight/drop equivalence must also hold for soft voting.
    eclf1.set_params(voting="soft").fit(X, y)
    eclf2.set_params(voting="soft").fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    # Dropping every estimator is invalid.
    msg = "All estimators are dropped. At least one is required"
    with pytest.raises(ValueError, match=msg):
        eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)
    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[0, 0.5],
        flatten_transform=False,
    ).fit(X1, y1)
    eclf2 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[1, 0.5],
        flatten_transform=False,
    )
    eclf2.set_params(rf="drop").fit(X1, y1)
    # With flatten_transform=False the output has shape
    # (n_estimators, n_samples, n_classes); dropping "rf" removes its slice.
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
    )
    assert_array_almost_equal(eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]]))
    # Hard-voting transform yields one predicted-label column per estimator.
    eclf1.set_params(voting="hard")
    eclf2.set_params(voting="hard")
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_estimator_weights_format(global_random_seed):
    # Estimator weights given as a Python list and as a NumPy array must
    # produce identical soft-voting probabilities.
    named_estimators = [
        ("lr", LogisticRegression(random_state=global_random_seed)),
        ("rf", RandomForestClassifier(n_estimators=10, random_state=global_random_seed)),
    ]
    eclf_list_weights = VotingClassifier(
        estimators=named_estimators, weights=[1, 2], voting="soft"
    )
    eclf_array_weights = VotingClassifier(
        estimators=named_estimators, weights=np.array((1, 2)), voting="soft"
    )
    eclf_list_weights.fit(X_scaled, y)
    eclf_array_weights.fit(X_scaled, y)
    assert_array_almost_equal(
        eclf_list_weights.predict_proba(X_scaled),
        eclf_array_weights.predict_proba(X_scaled),
    )
def test_transform(global_random_seed):
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])
    # flatten_transform defaults to True, so eclf1 and eclf2 are equivalent.
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
    ).fit(X, y)
    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="soft",
        flatten_transform=True,
    ).fit(X, y)
    eclf3 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="soft",
        flatten_transform=False,
    ).fit(X, y)
    # Flattened output: (n_samples, n_estimators * n_classes) = (4, 3 * 2).
    assert_array_equal(eclf1.transform(X).shape, (4, 6))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    # Unflattened output: (n_estimators, n_samples, n_classes).
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
    # Reshaping the 3D output must recover the flattened layout exactly.
    assert_array_almost_equal(
        eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X)
    )
@pytest.mark.parametrize(
    "X, y, voter",
    [
        (
            X,
            y,
            VotingClassifier(
                [
                    ("lr", LogisticRegression()),
                    ("rf", RandomForestClassifier(n_estimators=5)),
                ]
            ),
        ),
        (
            X_r,
            y_r,
            VotingRegressor(
                [
                    ("lr", LinearRegression()),
                    ("rf", RandomForestRegressor(n_estimators=5)),
                ]
            ),
        ),
    ],
)
def test_none_estimator_with_weights(X, y, voter):
    # check that an estimator can be set to 'drop' and passing some weight
    # regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/13777
    voter = clone(voter)
    # Scaled to solve ConvergenceWarning throw by Logistic Regression
    X_scaled = StandardScaler().fit_transform(X)
    voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
    # Dropping one estimator must not break weighted fitting or prediction.
    voter.set_params(lr="drop")
    voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
    y_pred = voter.predict(X_scaled)
    assert y_pred.shape == y.shape
@pytest.mark.parametrize(
    "est",
    [
        VotingRegressor(
            estimators=[
                ("lr", LinearRegression()),
                ("tree", DecisionTreeRegressor(random_state=0)),
            ]
        ),
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=0)),
                ("tree", DecisionTreeClassifier(random_state=0)),
            ]
        ),
    ],
    ids=["VotingRegressor", "VotingClassifier"],
)
def test_n_features_in(est):
    """n_features_in_ must appear only after fit and report the data width."""
    voter = clone(est)
    X_toy = [[1, 2], [3, 4], [5, 6]]
    y_toy = [0, 1, 2]
    # The attribute must not exist on an unfitted estimator.
    assert not hasattr(voter, "n_features_in_")
    voter.fit(X_toy, y_toy)
    assert voter.n_features_in_ == 2
@pytest.mark.parametrize(
    "estimator",
    [
        VotingRegressor(
            estimators=[
                ("lr", LinearRegression()),
                ("rf", RandomForestRegressor(random_state=123)),
            ],
            verbose=True,
        ),
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=123)),
                ("rf", RandomForestClassifier(random_state=123)),
            ],
            verbose=True,
        ),
    ],
)
def test_voting_verbose(estimator, capsys):
    # With verbose=True, fitting must print one
    # "[Voting] ... (i of n) Processing <name>, total=..." line per
    # sub-estimator, in declaration order, to stdout.
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])
    pattern = (
        r"\[Voting\].*\(1 of 2\) Processing lr, total=.*\n"
        r"\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$"
    )
    clone(estimator).fit(X, y)
    assert re.match(pattern, capsys.readouterr()[0])
def test_get_features_names_out_regressor():
    """Check get_feature_names_out output for regressor."""
    X_toy = [[1, 2], [3, 4], [5, 6]]
    y_toy = [0, 1, 2]
    ensemble = VotingRegressor(
        estimators=[
            ("lr", LinearRegression()),
            ("tree", DecisionTreeRegressor(random_state=0)),
            ("ignore", "drop"),
        ]
    )
    ensemble.fit(X_toy, y_toy)
    # Dropped estimators must not contribute an output feature name.
    assert_array_equal(
        ensemble.get_feature_names_out(),
        ["votingregressor_lr", "votingregressor_tree"],
    )
@pytest.mark.parametrize(
    "kwargs, expected_names",
    [
        (
            # Soft voting + flattening: one column per estimator and class.
            {"voting": "soft", "flatten_transform": True},
            [
                "votingclassifier_lr0",
                "votingclassifier_lr1",
                "votingclassifier_lr2",
                "votingclassifier_tree0",
                "votingclassifier_tree1",
                "votingclassifier_tree2",
            ],
        ),
        # Hard voting: one predicted-label column per estimator.
        ({"voting": "hard"}, ["votingclassifier_lr", "votingclassifier_tree"]),
    ],
)
def test_get_features_names_out_classifier(kwargs, expected_names):
    """Check get_feature_names_out for classifier for different settings."""
    X = [[1, 2], [3, 4], [5, 6], [1, 1.2]]
    y = [0, 1, 2, 0]
    voting = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=0)),
            ("tree", DecisionTreeClassifier(random_state=0)),
        ],
        **kwargs,
    )
    voting.fit(X, y)
    X_trans = voting.transform(X)
    names_out = voting.get_feature_names_out()
    # Feature names must line up one-to-one with transform's columns.
    assert X_trans.shape[1] == len(expected_names)
    assert_array_equal(names_out, expected_names)
def test_get_features_names_out_classifier_error():
    """Check that error is raised when voting="soft" and flatten_transform=False."""
    X_toy = [[1, 2], [3, 4], [5, 6]]
    y_toy = [0, 1, 2]
    ensemble = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=0)),
            ("tree", DecisionTreeClassifier(random_state=0)),
        ],
        voting="soft",
        flatten_transform=False,
    )
    ensemble.fit(X_toy, y_toy)
    # The 3D transform output cannot be described by flat feature names.
    expected_msg = (
        "get_feature_names_out is not supported when `voting='soft'` and "
        "`flatten_transform=False`"
    )
    with pytest.raises(ValueError, match=expected_msg):
        ensemble.get_feature_names_out()
# Metadata Routing Tests
# ======================
@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
def test_routing_passed_metadata_not_supported(Estimator, Child):
    """Test that the right error message is raised when metadata is passed while
    not supported when `enable_metadata_routing=False`."""
    X = np.array([[0, 1], [2, 2], [4, 6]])
    y = [1, 2, 3]
    # NOTE(review): the estimators argument below is not the usual list of
    # (name, estimator) tuples; presumably the routing error fires before
    # the estimators list is validated -- confirm if this test is edited.
    with pytest.raises(
        ValueError, match="is only supported if enable_metadata_routing=True"
    ):
        Estimator(["clf", Child()]).fit(X, y, sample_weight=[1, 1, 1], metadata="a")
@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@config_context(enable_metadata_routing=True)
def test_get_metadata_routing_without_fit(Estimator, Child):
    """get_metadata_routing must be callable on an unfitted ensemble."""
    # Must not raise even though fit was never called.
    Estimator([("sub_est", Child())]).get_metadata_routing()
@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@pytest.mark.parametrize("prop", ["sample_weight", "metadata"])
@config_context(enable_metadata_routing=True)
def test_metadata_routing_for_voting_estimators(Estimator, Child, prop):
    """Test that metadata is routed correctly for Voting*."""
    X = np.array([[0, 1], [2, 2], [4, 6]])
    y = [1, 2, 3]
    sample_weight, metadata = [1, 1, 1], "a"
    # Both children explicitly request the routed property via
    # set_fit_request, so the ensemble must forward it to each of them.
    est = Estimator(
        [
            (
                "sub_est1",
                Child(registry=_Registry()).set_fit_request(**{prop: True}),
            ),
            (
                "sub_est2",
                Child(registry=_Registry()).set_fit_request(**{prop: True}),
            ),
        ]
    )
    est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata})
    for estimator in est.estimators:
        if prop == "sample_weight":
            kwargs = {prop: sample_weight}
        else:
            kwargs = {prop: metadata}
        # access sub-estimator in (name, est) with estimator[1]
        registry = estimator[1].registry
        assert len(registry)
        # Every recorded sub-estimator fit must have received the metadata.
        for sub_est in registry:
            check_recorded_metadata(obj=sub_est, method="fit", parent="fit", **kwargs)
@pytest.mark.parametrize(
    "Estimator, Child",
    [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)],
)
@config_context(enable_metadata_routing=True)
def test_metadata_routing_error_for_voting_estimators(Estimator, Child):
    """Test that the right error is raised when metadata is not requested."""
    X = np.array([[0, 1], [2, 2], [4, 6]])
    y = [1, 2, 3]
    sample_weight, metadata = [1, 1, 1], "a"
    # The child never calls set_fit_request, so routing must fail loudly
    # rather than silently dropping the metadata.
    est = Estimator([("sub_est", Child())])
    error_message = (
        "[sample_weight, metadata] are passed but are not explicitly set as requested"
        f" or not requested for {Child.__name__}.fit"
    )
    with pytest.raises(ValueError, match=re.escape(error_message)):
        est.fit(X, y, sample_weight=sample_weight, metadata=metadata)
# End of Metadata Routing Tests
# =============================

View File

@@ -0,0 +1,602 @@
"""Testing for the boost module (sklearn.ensemble.boost)."""
import re
import numpy as np
import pytest
from sklearn import datasets
from sklearn.base import BaseEstimator, clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.utils._mocking import NoSampleWeightWrapper
from sklearn.utils._testing import (
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
# Common random state
rng = np.random.RandomState(0)
# Toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels
y_regr = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]
# Load the iris dataset and randomly permute it
iris = datasets.load_iris()
# NOTE(review): `perm` appears unused (shuffle below does the permutation),
# but this call advances `rng`'s state and thus the shuffles' outcomes --
# confirm before removing.
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)
# Load the diabetes dataset and randomly permute it
diabetes = datasets.load_diabetes()
diabetes.data, diabetes.target = shuffle(
    diabetes.data, diabetes.target, random_state=rng
)
def test_oneclass_adaboost_proba():
    """predict_proba must stay well defined when training data has one class.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/7501
    """
    y_single_class = np.ones(len(X))
    model = AdaBoostClassifier().fit(X, y_single_class)
    # With a single class, every sample gets probability 1 for that class.
    assert_array_almost_equal(model.predict_proba(X), np.ones((len(X), 1)))
def test_classification_toy():
    # Sanity-check classification on the toy sample, including the mix of
    # string and integer class labels.
    model = AdaBoostClassifier(random_state=0)
    model.fit(X, y_class)
    assert_array_equal(model.predict(T), y_t_class)
    assert_array_equal(np.unique(np.asarray(y_t_class)), model.classes_)
    n_test = len(T)
    # Two classes -> (n_samples, 2) probabilities, 1-D decision values.
    assert model.predict_proba(T).shape == (n_test, 2)
    assert model.decision_function(T).shape == (n_test,)
def test_regression_toy():
    # Check regression on a toy dataset.
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)
def test_iris():
    # Check consistency on dataset iris.
    expected_classes = np.unique(iris.target)
    model = AdaBoostClassifier()
    model.fit(iris.data, iris.target)
    assert_array_equal(expected_classes, model.classes_)
    n_classes = len(expected_classes)
    assert model.predict_proba(iris.data).shape[1] == n_classes
    assert model.decision_function(iris.data).shape[1] == n_classes
    score = model.score(iris.data, iris.target)
    assert score > 0.9, f"Failed with {score = }"
    # Check we used multiple estimators
    assert len(model.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    distinct_states = {est.random_state for est in model.estimators_}
    assert len(distinct_states) == len(model.estimators_)
@pytest.mark.parametrize("loss", ["linear", "square", "exponential"])
def test_diabetes(loss):
    # Check consistency on dataset diabetes for every supported loss.
    model = AdaBoostRegressor(loss=loss, random_state=0)
    model.fit(diabetes.data, diabetes.target)
    assert model.score(diabetes.data, diabetes.target) > 0.55
    # Check we used multiple estimators
    assert len(model.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert len({est.random_state for est in model.estimators_}) == len(
        model.estimators_
    )
def test_staged_predict():
    # Check staged predictions: there must be exactly one stage per
    # estimator and the last stage must equal the final predict /
    # predict_proba / score output.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    diabetes_weights = rng.randint(10, size=diabetes.target.shape)
    # AdaBoost classification
    clf = AdaBoostClassifier(n_estimators=10)
    clf.fit(iris.data, iris.target, sample_weight=iris_weights)
    predictions = clf.predict(iris.data)
    staged_predictions = [p for p in clf.staged_predict(iris.data)]
    proba = clf.predict_proba(iris.data)
    staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
    score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
    staged_scores = [
        s for s in clf.staged_score(iris.data, iris.target, sample_weight=iris_weights)
    ]
    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_probas) == 10
    assert_array_almost_equal(proba, staged_probas[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])
    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
    predictions = clf.predict(diabetes.data)
    staged_predictions = [p for p in clf.staged_predict(diabetes.data)]
    score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
    staged_scores = [
        s
        for s in clf.staged_score(
            diabetes.data, diabetes.target, sample_weight=diabetes_weights
        )
    ]
    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])
def test_gridsearch():
    """Grid-search over the base tree's depth and the ensemble size.

    Smoke test: the nested ``estimator__`` parameter syntax must work for
    both AdaBoost variants.
    """
    param_grid = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)}
    # AdaBoost classification
    search = GridSearchCV(
        AdaBoostClassifier(estimator=DecisionTreeClassifier()), param_grid
    )
    search.fit(iris.data, iris.target)
    # AdaBoost regression
    search = GridSearchCV(
        AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0),
        param_grid,
    )
    search.fit(diabetes.data, diabetes.target)
def test_pickle():
    """Round-trip both AdaBoost estimators through pickle."""
    import pickle
    cases = [
        (AdaBoostClassifier(), iris.data, iris.target),
        (AdaBoostRegressor(random_state=0), diabetes.data, diabetes.target),
    ]
    for estimator, data, target in cases:
        estimator.fit(data, target)
        original_score = estimator.score(data, target)
        restored = pickle.loads(pickle.dumps(estimator))
        # The unpickled object must be of the same class and score
        # identically on the training data.
        assert type(restored) == estimator.__class__
        assert restored.score(data, target) == original_score
def test_importances():
    # The three informative features (placed first because shuffle=False)
    # must dominate the feature importances.
    X_imp, y_imp = datasets.make_classification(
        n_samples=2000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=1,
    )
    model = AdaBoostClassifier()
    model.fit(X_imp, y_imp)
    importances = model.feature_importances_
    assert importances.shape[0] == 10
    # Each of the first three features must outrank every later feature.
    assert (importances[:3, np.newaxis] >= importances[3:]).all()
def test_adaboost_classifier_sample_weight_error():
    # A sample_weight vector whose length does not match X must be rejected
    # with an informative shape message.
    model = AdaBoostClassifier()
    expected_msg = re.escape("sample_weight.shape == (1,), expected (6,)")
    with pytest.raises(ValueError, match=expected_msg):
        model.fit(X, y_class, sample_weight=np.asarray([-1]))
def test_estimator():
    # Test different estimators.
    from sklearn.ensemble import RandomForestClassifier
    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    clf = AdaBoostClassifier(RandomForestClassifier())
    clf.fit(X, y_regr)
    clf = AdaBoostClassifier(SVC())
    clf.fit(X, y_class)
    from sklearn.ensemble import RandomForestRegressor
    clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
    clf.fit(X, y_regr)
    clf = AdaBoostRegressor(SVR(), random_state=0)
    clf.fit(X, y_regr)
    # Check that an empty discrete ensemble fails in fit, not predict.
    # All samples are identical, so no learner can beat chance and boosting
    # must abort with a "worse than random" ValueError.
    X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
    y_fail = ["foo", "bar", 1, 2]
    clf = AdaBoostClassifier(SVC())
    with pytest.raises(ValueError, match="worse than random"):
        clf.fit(X_fail, y_fail)
def test_sample_weights_infinite():
    # An absurdly large learning rate makes the boosted sample weights blow
    # up; the fit must emit a warning instead of failing silently.
    model = AdaBoostClassifier(n_estimators=30, learning_rate=23.0)
    with pytest.warns(UserWarning, match="Sample weights have reached infinite values"):
        model.fit(iris.data, iris.target)
@pytest.mark.parametrize(
    "sparse_container, expected_internal_type",
    zip(
        [
            *CSC_CONTAINERS,
            *CSR_CONTAINERS,
            *LIL_CONTAINERS,
            *COO_CONTAINERS,
            *DOK_CONTAINERS,
        ],
        # CSC input stays CSC internally; every other sparse format is seen
        # as CSR by the base estimators.
        CSC_CONTAINERS + 4 * CSR_CONTAINERS,
    ),
)
def test_sparse_classification(sparse_container, expected_internal_type):
    # Check classification with sparse input.
    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""
        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self
    X, y = datasets.make_multilabel_classification(
        n_classes=1, n_samples=15, n_features=5, random_state=42
    )
    # Flatten y to a 1d array
    y = np.ravel(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)
    # Trained on sparse format
    sparse_classifier = AdaBoostClassifier(
        estimator=CustomSVC(probability=True),
        random_state=1,
    ).fit(X_train_sparse, y_train)
    # Trained on dense format
    dense_classifier = AdaBoostClassifier(
        estimator=CustomSVC(probability=True),
        random_state=1,
    ).fit(X_train, y_train)
    # Every prediction-style method must agree between the sparse-trained
    # and dense-trained ensembles.
    # predict
    sparse_clf_results = sparse_classifier.predict(X_test_sparse)
    dense_clf_results = dense_classifier.predict(X_test)
    assert_array_equal(sparse_clf_results, dense_clf_results)
    # decision_function
    sparse_clf_results = sparse_classifier.decision_function(X_test_sparse)
    dense_clf_results = dense_classifier.decision_function(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)
    # predict_log_proba
    sparse_clf_results = sparse_classifier.predict_log_proba(X_test_sparse)
    dense_clf_results = dense_classifier.predict_log_proba(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)
    # predict_proba
    sparse_clf_results = sparse_classifier.predict_proba(X_test_sparse)
    dense_clf_results = dense_classifier.predict_proba(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)
    # score
    sparse_clf_results = sparse_classifier.score(X_test_sparse, y_test)
    dense_clf_results = dense_classifier.score(X_test, y_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)
    # staged_decision_function
    sparse_clf_results = sparse_classifier.staged_decision_function(X_test_sparse)
    dense_clf_results = dense_classifier.staged_decision_function(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_almost_equal(sparse_clf_res, dense_clf_res)
    # staged_predict
    sparse_clf_results = sparse_classifier.staged_predict(X_test_sparse)
    dense_clf_results = dense_classifier.staged_predict(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_equal(sparse_clf_res, dense_clf_res)
    # staged_predict_proba
    sparse_clf_results = sparse_classifier.staged_predict_proba(X_test_sparse)
    dense_clf_results = dense_classifier.staged_predict_proba(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_almost_equal(sparse_clf_res, dense_clf_res)
    # staged_score
    sparse_clf_results = sparse_classifier.staged_score(X_test_sparse, y_test)
    dense_clf_results = dense_classifier.staged_score(X_test, y_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_equal(sparse_clf_res, dense_clf_res)
    # Verify sparsity of data is maintained during training
    types = [i.data_type_ for i in sparse_classifier.estimators_]
    assert all([t == expected_internal_type for t in types])
@pytest.mark.parametrize(
    "sparse_container, expected_internal_type",
    zip(
        [
            *CSC_CONTAINERS,
            *CSR_CONTAINERS,
            *LIL_CONTAINERS,
            *COO_CONTAINERS,
            *DOK_CONTAINERS,
        ],
        # CSC input stays CSC internally; other sparse formats become CSR.
        CSC_CONTAINERS + 4 * CSR_CONTAINERS,
    ),
)
def test_sparse_regression(sparse_container, expected_internal_type):
    # Check regression with sparse input.
    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""
        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self
    X, y = datasets.make_regression(
        n_samples=15, n_features=50, n_targets=1, random_state=42
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)
    # Trained on sparse format
    sparse_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
        X_train_sparse, y_train
    )
    # Trained on dense format
    dense_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
        X_train, y_train
    )
    # predict: sparse- and dense-trained models must agree.
    sparse_regr_results = sparse_regressor.predict(X_test_sparse)
    dense_regr_results = dense_regressor.predict(X_test)
    assert_array_almost_equal(sparse_regr_results, dense_regr_results)
    # staged_predict
    sparse_regr_results = sparse_regressor.staged_predict(X_test_sparse)
    dense_regr_results = dense_regressor.staged_predict(X_test)
    for sparse_regr_res, dense_regr_res in zip(sparse_regr_results, dense_regr_results):
        assert_array_almost_equal(sparse_regr_res, dense_regr_res)
    # Verify the sparse format seen by each fitted base estimator.
    types = [i.data_type_ for i in sparse_regressor.estimators_]
    assert all([t == expected_internal_type for t in types])
def test_sample_weight_adaboost_regressor():
    """AdaBoostRegressor must accept base estimators without sample_weight.

    The random weighted sampling happens internally in AdaBoostRegressor's
    _boost method, so the underlying estimator's ``fit`` never needs a
    ``sample_weight`` argument.
    """
    class _PlainEstimator(BaseEstimator):
        # Deliberately omits sample_weight from the fit signature.
        def fit(self, X, y):
            pass
        def predict(self, X):
            return np.zeros(X.shape[0])
    booster = AdaBoostRegressor(_PlainEstimator(), n_estimators=3)
    booster.fit(X, y_regr)
    assert len(booster.estimator_weights_) == len(booster.estimator_errors_)
def test_multidimensional_X():
    """AdaBoost must accept an n-dimensional data matrix.

    Works here because the dummy base estimators ignore the feature layout.
    """
    local_rng = np.random.RandomState(0)
    X_3d = local_rng.randn(51, 3, 3)
    y_clf = local_rng.choice([0, 1], 51)
    y_reg = local_rng.randn(51)
    clf_boost = AdaBoostClassifier(DummyClassifier(strategy="most_frequent"))
    clf_boost.fit(X_3d, y_clf)
    clf_boost.predict(X_3d)
    clf_boost.predict_proba(X_3d)
    reg_boost = AdaBoostRegressor(DummyRegressor())
    reg_boost.fit(X_3d, y_reg)
    reg_boost.predict(X_3d)
def test_adaboostclassifier_without_sample_weight():
    # A base estimator that rejects sample_weight must trigger a clear error.
    data, target = iris.data, iris.target
    base = NoSampleWeightWrapper(DummyClassifier())
    model = AdaBoostClassifier(estimator=base)
    expected_msg = f"{base.__class__.__name__} doesn't support sample_weight"
    with pytest.raises(ValueError, match=expected_msg):
        model.fit(data, target)
def test_adaboostregressor_sample_weight():
    # check that giving weight will have an influence on the error computed
    # for a weak learner
    rng = np.random.RandomState(42)
    X = np.linspace(0, 100, num=1000)
    y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
    X = X.reshape(-1, 1)
    # add an arbitrary outlier
    X[-1] *= 10
    y[-1] = 10000
    # random_state=0 ensure that the underlying bootstrap will use the outlier
    regr_no_outlier = AdaBoostRegressor(
        estimator=LinearRegression(), n_estimators=1, random_state=0
    )
    regr_with_weight = clone(regr_no_outlier)
    regr_with_outlier = clone(regr_no_outlier)
    # fit 3 models:
    # - a model containing the outlier
    # - a model without the outlier
    # - a model containing the outlier but with a null sample-weight
    regr_with_outlier.fit(X, y)
    regr_no_outlier.fit(X[:-1], y[:-1])
    sample_weight = np.ones_like(y)
    sample_weight[-1] = 0
    regr_with_weight.fit(X, y, sample_weight=sample_weight)
    # All scores are computed on the inlier points only.
    score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
    score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
    score_with_weight = regr_with_weight.score(X[:-1], y[:-1])
    # The outlier must hurt the fit, and a zero weight must neutralise it
    # as effectively as removing the sample altogether.
    assert score_with_outlier < score_no_outlier
    assert score_with_outlier < score_with_weight
    assert score_no_outlier == pytest.approx(score_with_weight)
def test_adaboost_consistent_predict():
    # check that predict_proba and predict give consistent results
    # regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/14084
    X_train, X_test, y_train, _ = train_test_split(
        *datasets.load_digits(return_X_y=True), random_state=42
    )
    model = AdaBoostClassifier(random_state=42).fit(X_train, y_train)
    # argmax over probabilities must pick the same class as predict.
    proba_argmax = np.argmax(model.predict_proba(X_test), axis=1)
    assert_array_equal(proba_argmax, model.predict(X_test))
@pytest.mark.parametrize(
    "model, X, y",
    [
        (AdaBoostClassifier(), iris.data, iris.target),
        (AdaBoostRegressor(), diabetes.data, diabetes.target),
    ],
)
def test_adaboost_negative_weight_error(model, X, y):
    """Negative sample weights must be rejected with a ValueError."""
    weights = np.ones_like(y)
    weights[-1] = -10
    with pytest.raises(
        ValueError, match="Negative values in data passed to `sample_weight`"
    ):
        model.fit(X, y, sample_weight=weights)
def test_adaboost_numerically_stable_feature_importance_with_small_weights():
    """Check that we don't create NaN feature importance with numerically
    instable inputs.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20320
    """
    local_rng = np.random.RandomState(42)
    features = local_rng.normal(size=(1000, 10))
    labels = local_rng.choice([0, 1], size=1000)
    # Tiny weights used to underflow intermediate computations.
    tiny_weights = np.ones_like(labels) * 1e-263
    base_tree = DecisionTreeClassifier(max_depth=10, random_state=12)
    model = AdaBoostClassifier(estimator=base_tree, n_estimators=20, random_state=12)
    model.fit(features, labels, sample_weight=tiny_weights)
    assert not np.isnan(model.feature_importances_).any()
def test_adaboost_decision_function(global_random_seed):
"""Check that the decision function respects the symmetric constraint for weak
learners.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/26520
"""
n_classes = 3
X, y = datasets.make_classification(
n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed
)
clf = AdaBoostClassifier(n_estimators=1, random_state=global_random_seed).fit(X, y)
y_score = clf.decision_function(X)
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
# With a single learner, we expect to have a decision function in
# {1, - 1 / (n_classes - 1)}.
assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}
# We can assert the same for staged_decision_function since we have a single learner
for y_score in clf.staged_decision_function(X):
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
# With a single learner, we expect to have a decision function in
# {1, - 1 / (n_classes - 1)}.
assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}
clf.set_params(n_estimators=5).fit(X, y)
y_score = clf.decision_function(X)
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
for y_score in clf.staged_decision_function(X):
assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)