Commit e5a4711004 (parent 3031b7153b), 2026-04-10 15:06:59 +02:00
7806 changed files with 1918528 additions and 335 deletions


@@ -0,0 +1,24 @@
"""Decision tree based models for classification and regression."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from sklearn.tree._classes import (
BaseDecisionTree,
DecisionTreeClassifier,
DecisionTreeRegressor,
ExtraTreeClassifier,
ExtraTreeRegressor,
)
from sklearn.tree._export import export_graphviz, export_text, plot_tree
__all__ = [
"BaseDecisionTree",
"DecisionTreeClassifier",
"DecisionTreeRegressor",
"ExtraTreeClassifier",
"ExtraTreeRegressor",
"export_graphviz",
"export_text",
"plot_tree",
]
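For orientation, a minimal usage sketch of the public API exported above (a hedged example; the dataset and hyperparameters are arbitrary):

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

# Fit a shallow tree and render it as text.
X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
print(clf.predict(X[:2]))  # predicted classes for the first two samples
print(export_text(clf))    # plain-text rendering of the learned tree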

File diff suppressed because it is too large.


@@ -0,0 +1,109 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# See _criterion.pyx for implementation details.
from sklearn.utils._typedefs cimport float64_t, int8_t, intp_t
cdef class Criterion:
# The criterion computes the impurity of a node and the reduction of
# impurity of a split on that node. It also computes the output statistics
# such as the mean in regression and class probabilities in classification.
# Internal structures
cdef const float64_t[:, ::1] y # Values of y
cdef const float64_t[:] sample_weight # Sample weights
cdef const intp_t[:] sample_indices # Sample indices in X, y
cdef intp_t start # samples[start:pos] are the samples in the left node
cdef intp_t pos # samples[pos:end] are the samples in the right node
cdef intp_t end
cdef intp_t n_missing # Number of missing values for the feature being evaluated
cdef bint missing_go_to_left # Whether missing values go to the left node
cdef intp_t n_outputs # Number of outputs
cdef intp_t n_samples # Number of samples
cdef intp_t n_node_samples # Number of samples in the node (end-start)
cdef float64_t weighted_n_samples # Weighted number of samples (in total)
cdef float64_t weighted_n_node_samples # Weighted number of samples in the node
cdef float64_t weighted_n_left # Weighted number of samples in the left node
cdef float64_t weighted_n_right # Weighted number of samples in the right node
cdef float64_t weighted_n_missing # Weighted number of samples that are missing
# The criterion object is maintained such that left and right collected
# statistics correspond to samples[start:pos] and samples[pos:end].
# Methods
cdef int init(
self,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight,
float64_t weighted_n_samples,
const intp_t[:] sample_indices,
intp_t start,
intp_t end
) except -1 nogil
cdef void init_sum_missing(self)
cdef void init_missing(self, intp_t n_missing) noexcept nogil
cdef int reset(self) except -1 nogil
cdef int reverse_reset(self) except -1 nogil
cdef int update(self, intp_t new_pos) except -1 nogil
cdef float64_t node_impurity(self) noexcept nogil
cdef void children_impurity(
self,
float64_t* impurity_left,
float64_t* impurity_right
) noexcept nogil
cdef void node_value(
self,
float64_t* dest
) noexcept nogil
cdef void clip_node_value(
self,
float64_t* dest,
float64_t lower_bound,
float64_t upper_bound
) noexcept nogil
cdef float64_t middle_value(self) noexcept nogil
cdef float64_t impurity_improvement(
self,
float64_t impurity_parent,
float64_t impurity_left,
float64_t impurity_right
) noexcept nogil
cdef float64_t proxy_impurity_improvement(self) noexcept nogil
cdef bint check_monotonicity(
self,
int8_t monotonic_cst,
float64_t lower_bound,
float64_t upper_bound,
) noexcept nogil
cdef inline bint _check_monotonicity(
self,
int8_t monotonic_cst,
float64_t lower_bound,
float64_t upper_bound,
float64_t sum_left,
float64_t sum_right,
) noexcept nogil
cdef class ClassificationCriterion(Criterion):
"""Abstract criterion for classification."""
cdef intp_t[::1] n_classes
cdef intp_t max_n_classes
cdef float64_t[:, ::1] sum_total # The sum of the weighted count of each label.
cdef float64_t[:, ::1] sum_left # Same as above, but for the left side of the split
cdef float64_t[:, ::1] sum_right # Same as above, but for the right side of the split
cdef float64_t[:, ::1] sum_missing # Same as above, but for missing values in X
cdef class RegressionCriterion(Criterion):
"""Abstract regression criterion."""
cdef float64_t sq_sum_total
cdef float64_t[::1] sum_total # The sum of w*y.
cdef float64_t[::1] sum_left # Same as above, but for the left side of the split
cdef float64_t[::1] sum_right # Same as above, but for the right side of the split
cdef float64_t[::1] sum_missing # Same as above, but for missing values in X
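For intuition, here is a hedged pure-Python sketch of what these declarations compute for a Gini classification criterion. The function names are illustrative stand-ins for `node_impurity` and `impurity_improvement` above, using the weighted improvement formula documented in scikit-learn:

import numpy as np

def gini_node_impurity(class_counts):
    # Gini impurity: 1 - sum_k p_k**2, from (weighted) class counts.
    p = np.asarray(class_counts, dtype=float)
    p = p / p.sum()
    return 1.0 - np.sum(p ** 2)

def impurity_improvement(weighted_n_samples, weighted_n_node,
                         weighted_n_left, weighted_n_right,
                         impurity_parent, impurity_left, impurity_right):
    # Weighted impurity decrease of a split, used to rank candidate splits:
    # N_t / N * (impurity - N_t_R / N_t * impurity_right
    #                     - N_t_L / N_t * impurity_left)
    return (weighted_n_node / weighted_n_samples) * (
        impurity_parent
        - (weighted_n_right / weighted_n_node) * impurity_right
        - (weighted_n_left / weighted_n_node) * impurity_left
    )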

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,183 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# See _partitioner.pyx for details.
from cython cimport floating
from sklearn.utils._typedefs cimport (
float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t
)
from sklearn.tree._splitter cimport SplitRecord
# Mitigate precision differences between 32 bit and 64 bit
cdef const float32_t FEATURE_THRESHOLD = 1e-7
# We provide here the abstract interface for a Partitioner that would be
# theoretically shared between the Dense and Sparse partitioners. However,
# we leave it commented out for now as it is not used in the current
# implementation due to the performance hit from vtable lookups when using
inheritance-based polymorphism. It is left here for future reference.
#
# Note: Instead, in `_splitter.pyx`, we define a fused type that can be used
# to represent both the dense and sparse partitioners.
#
# cdef class BasePartitioner:
# cdef intp_t[::1] samples
# cdef float32_t[::1] feature_values
# cdef intp_t start
# cdef intp_t end
# cdef intp_t n_missing
# cdef const uint8_t[::1] missing_values_in_feature_mask
# cdef void sort_samples_and_feature_values(
# self, intp_t current_feature
# ) noexcept nogil
# cdef void init_node_split(
# self,
# intp_t start,
# intp_t end
# ) noexcept nogil
# cdef void find_min_max(
# self,
# intp_t current_feature,
# float32_t* min_feature_value_out,
# float32_t* max_feature_value_out,
# ) noexcept nogil
# cdef void next_p(
# self,
# intp_t* p_prev,
# intp_t* p
# ) noexcept nogil
# cdef intp_t partition_samples(
# self,
# float64_t current_threshold
# ) noexcept nogil
# cdef void partition_samples_final(
# self,
# intp_t best_pos,
# float64_t best_threshold,
# intp_t best_feature,
# intp_t n_missing,
# ) noexcept nogil
cdef class DensePartitioner:
"""Partitioner specialized for dense data.
Note that this partitioner is agnostic to the splitting strategy (best vs. random).
"""
cdef const float32_t[:, :] X
cdef intp_t[::1] samples
cdef float32_t[::1] feature_values
cdef intp_t start
cdef intp_t end
cdef intp_t n_missing
cdef const uint8_t[::1] missing_values_in_feature_mask
cdef void sort_samples_and_feature_values(
self, intp_t current_feature
) noexcept nogil
cdef void init_node_split(
self,
intp_t start,
intp_t end
) noexcept nogil
cdef void find_min_max(
self,
intp_t current_feature,
float32_t* min_feature_value_out,
float32_t* max_feature_value_out,
) noexcept nogil
cdef void next_p(
self,
intp_t* p_prev,
intp_t* p
) noexcept nogil
cdef intp_t partition_samples(
self,
float64_t current_threshold
) noexcept nogil
cdef void partition_samples_final(
self,
intp_t best_pos,
float64_t best_threshold,
intp_t best_feature,
intp_t n_missing,
) noexcept nogil
cdef class SparsePartitioner:
"""Partitioner specialized for sparse CSC data.
Note that this partitioner is agnostic to the splitting strategy (best vs. random).
"""
cdef const float32_t[::1] X_data
cdef const int32_t[::1] X_indices
cdef const int32_t[::1] X_indptr
cdef intp_t n_total_samples
cdef intp_t[::1] index_to_samples
cdef intp_t[::1] sorted_samples
cdef intp_t start_positive
cdef intp_t end_negative
cdef bint is_samples_sorted
cdef intp_t[::1] samples
cdef float32_t[::1] feature_values
cdef intp_t start
cdef intp_t end
cdef intp_t n_missing
cdef const uint8_t[::1] missing_values_in_feature_mask
cdef void sort_samples_and_feature_values(
self, intp_t current_feature
) noexcept nogil
cdef void init_node_split(
self,
intp_t start,
intp_t end
) noexcept nogil
cdef void find_min_max(
self,
intp_t current_feature,
float32_t* min_feature_value_out,
float32_t* max_feature_value_out,
) noexcept nogil
cdef void next_p(
self,
intp_t* p_prev,
intp_t* p
) noexcept nogil
cdef intp_t partition_samples(
self,
float64_t current_threshold
) noexcept nogil
cdef void partition_samples_final(
self,
intp_t best_pos,
float64_t best_threshold,
intp_t best_feature,
intp_t n_missing,
) noexcept nogil
cdef void extract_nnz(
self,
intp_t feature
) noexcept nogil
cdef intp_t _partition(
self,
float64_t threshold,
intp_t zero_pos
) noexcept nogil
cdef void shift_missing_values_to_left_if_required(
SplitRecord* best,
intp_t[::1] samples,
intp_t end,
) noexcept nogil
cdef void sort(floating* feature_values, intp_t* samples, intp_t n) noexcept nogil
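FEATURE_THRESHOLD above makes the split search treat nearly equal feature values as ties. A hedged pure-Python sketch of the `next_p`-style scan over sorted values (illustrative only, not the Cython implementation):

FEATURE_THRESHOLD = 1e-7

def next_split_position(values, p, end):
    # Skip over a run of values that are within FEATURE_THRESHOLD of each
    # other, so no threshold is ever placed between effectively equal values.
    while p + 1 < end and values[p + 1] <= values[p] + FEATURE_THRESHOLD:
        p += 1
    # p is the last index of the run; p + 1 is the next candidate position.
    return p, p + 1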


@@ -0,0 +1,817 @@
"""Partition samples in the construction of a tree.
This module contains the algorithms for moving sample indices to
the left and right child node given a split determined by the
splitting algorithm in `_splitter.pyx`.
Partitioning is done in a way that is efficient for both dense data and
sparse data stored in a Compressed Sparse Column (CSC) format.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from cython cimport final
from libc.math cimport isnan, log2
from libc.stdlib cimport qsort
from libc.string cimport memcpy
import numpy as np
from scipy.sparse import issparse
# Constant to switch between the two non-zero value extraction algorithms
# in SparsePartitioner
cdef float32_t EXTRACT_NNZ_SWITCH = 0.1
# Allow for 32 bit float comparisons
cdef float32_t INFINITY_32t = np.inf
@final
cdef class DensePartitioner:
"""Partitioner specialized for dense data.
Note that this partitioner is agnostic to the splitting strategy (best vs. random).
"""
def __init__(
self,
const float32_t[:, :] X,
intp_t[::1] samples,
float32_t[::1] feature_values,
const uint8_t[::1] missing_values_in_feature_mask,
):
self.X = X
self.samples = samples
self.feature_values = feature_values
self.missing_values_in_feature_mask = missing_values_in_feature_mask
cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
"""Initialize splitter at the beginning of node_split."""
self.start = start
self.end = end
self.n_missing = 0
cdef inline void sort_samples_and_feature_values(
self, intp_t current_feature
) noexcept nogil:
"""Simultaneously sort based on the feature_values.
Missing values are stored at the end of feature_values.
The number of missing values observed in feature_values is stored
in self.n_missing.
"""
cdef:
intp_t i, current_end
float32_t[::1] feature_values = self.feature_values
const float32_t[:, :] X = self.X
intp_t[::1] samples = self.samples
intp_t n_missing = 0
const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask
# Sort samples along that feature by copying the values into an array and
# sorting the array in a manner which utilizes the cache more effectively.
if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]:
i, current_end = self.start, self.end - 1
# Missing values are placed at the end and do not participate in the sorting.
while i <= current_end:
# Finds the right-most value that is not missing so that
# it can be swapped with missing values at its left.
if isnan(X[samples[current_end], current_feature]):
n_missing += 1
current_end -= 1
continue
# X[samples[current_end], current_feature] is a non-missing value
if isnan(X[samples[i], current_feature]):
samples[i], samples[current_end] = samples[current_end], samples[i]
n_missing += 1
current_end -= 1
feature_values[i] = X[samples[i], current_feature]
i += 1
else:
# When there are no missing values, we only need to copy the data into
# feature_values
for i in range(self.start, self.end):
feature_values[i] = X[samples[i], current_feature]
sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing)
self.n_missing = n_missing
cdef inline void find_min_max(
self,
intp_t current_feature,
float32_t* min_feature_value_out,
float32_t* max_feature_value_out,
) noexcept nogil:
"""Find the minimum and maximum value for current_feature.
Missing values are stored at the end of feature_values. The number of missing
values observed in feature_values is stored in self.n_missing.
"""
cdef:
intp_t p, current_end
float32_t current_feature_value
const float32_t[:, :] X = self.X
intp_t[::1] samples = self.samples
float32_t min_feature_value = INFINITY_32t
float32_t max_feature_value = -INFINITY_32t
float32_t[::1] feature_values = self.feature_values
intp_t n_missing = 0
const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask
# We are copying the values into an array and finding min/max of the array in
# a manner which utilizes the cache more effectively. We also need to
# count the number of missing values.
if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]:
p, current_end = self.start, self.end - 1
# Missing values are placed at the end and do not participate in the
# min/max calculation.
while p <= current_end:
# Finds the right-most value that is not missing so that
# it can be swapped with missing values towards its left.
if isnan(X[samples[current_end], current_feature]):
n_missing += 1
current_end -= 1
continue
# X[samples[current_end], current_feature] is a non-missing value
if isnan(X[samples[p], current_feature]):
samples[p], samples[current_end] = samples[current_end], samples[p]
n_missing += 1
current_end -= 1
current_feature_value = X[samples[p], current_feature]
feature_values[p] = current_feature_value
if current_feature_value < min_feature_value:
min_feature_value = current_feature_value
elif current_feature_value > max_feature_value:
max_feature_value = current_feature_value
p += 1
else:
min_feature_value = X[samples[self.start], current_feature]
max_feature_value = min_feature_value
feature_values[self.start] = min_feature_value
for p in range(self.start + 1, self.end):
current_feature_value = X[samples[p], current_feature]
feature_values[p] = current_feature_value
if current_feature_value < min_feature_value:
min_feature_value = current_feature_value
elif current_feature_value > max_feature_value:
max_feature_value = current_feature_value
min_feature_value_out[0] = min_feature_value
max_feature_value_out[0] = max_feature_value
self.n_missing = n_missing
cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil:
"""Compute the next p_prev and p for iterating over feature values.
The missing values are not included when iterating through the feature values.
"""
cdef intp_t end_non_missing = self.end - self.n_missing
while (
p[0] + 1 < end_non_missing and
self.feature_values[p[0] + 1] <= self.feature_values[p[0]] + FEATURE_THRESHOLD
):
p[0] += 1
p_prev[0] = p[0]
# By adding 1, we have
# (p >= end_non_missing) or (feature_values[p] > feature_values[p - 1])
p[0] += 1
cdef inline intp_t partition_samples(
self,
float64_t current_threshold
) noexcept nogil:
"""Partition samples for feature_values at the current_threshold."""
cdef:
intp_t p = self.start
intp_t partition_end = self.end - self.n_missing
intp_t[::1] samples = self.samples
float32_t[::1] feature_values = self.feature_values
while p < partition_end:
if feature_values[p] <= current_threshold:
p += 1
else:
partition_end -= 1
feature_values[p], feature_values[partition_end] = (
feature_values[partition_end], feature_values[p]
)
samples[p], samples[partition_end] = samples[partition_end], samples[p]
return partition_end
cdef inline void partition_samples_final(
self,
intp_t best_pos,
float64_t best_threshold,
intp_t best_feature,
intp_t best_n_missing,
) noexcept nogil:
"""Partition samples for X at the best_threshold and best_feature.
If missing values are present, this method partitions `samples`
so that the `best_n_missing` missing values' indices are in the
right-most end of `samples`, that is `samples[end_non_missing:end]`.
"""
cdef:
# Local invariant: start <= p <= partition_end <= end
intp_t start = self.start
intp_t p = start
intp_t end = self.end - 1
intp_t partition_end = end - best_n_missing
intp_t[::1] samples = self.samples
const float32_t[:, :] X = self.X
float32_t current_value
if best_n_missing != 0:
# Move samples with missing values to the end while partitioning the
# non-missing samples
while p <= partition_end:
# Keep samples with missing values at the end
if isnan(X[samples[end], best_feature]):
end -= 1
continue
# If the sample at p has a missing value, swap it with the non-missing sample at the end
current_value = X[samples[p], best_feature]
if isnan(current_value):
samples[p], samples[end] = samples[end], samples[p]
end -= 1
# The swapped sample at the end is always a non-missing value, so
# we can continue the algorithm without checking for missingness.
current_value = X[samples[p], best_feature]
# Partition the non-missing samples
if current_value <= best_threshold:
p += 1
else:
samples[p], samples[partition_end] = samples[partition_end], samples[p]
partition_end -= 1
else:
# Partitioning routine when there are no missing values
while p < partition_end:
if X[samples[p], best_feature] <= best_threshold:
p += 1
else:
samples[p], samples[partition_end] = samples[partition_end], samples[p]
partition_end -= 1
@final
cdef class SparsePartitioner:
"""Partitioner specialized for sparse CSC data.
Note that this partitioner is agnostic to the splitting strategy (best vs. random).
"""
def __init__(
self,
object X,
intp_t[::1] samples,
intp_t n_samples,
float32_t[::1] feature_values,
const uint8_t[::1] missing_values_in_feature_mask,
):
if not (issparse(X) and X.format == "csc"):
raise ValueError("X should be in csc format")
self.samples = samples
self.feature_values = feature_values
# Initialize X
cdef intp_t n_total_samples = X.shape[0]
self.X_data = X.data
self.X_indices = X.indices
self.X_indptr = X.indptr
self.n_total_samples = n_total_samples
# Initialize auxiliary array used to perform split
self.index_to_samples = np.full(n_total_samples, fill_value=-1, dtype=np.intp)
self.sorted_samples = np.empty(n_samples, dtype=np.intp)
cdef intp_t p
for p in range(n_samples):
self.index_to_samples[samples[p]] = p
self.missing_values_in_feature_mask = missing_values_in_feature_mask
cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
"""Initialize splitter at the beginning of node_split."""
self.start = start
self.end = end
self.is_samples_sorted = 0
self.n_missing = 0
cdef inline void sort_samples_and_feature_values(
self,
intp_t current_feature
) noexcept nogil:
"""Simultaneously sort based on the feature_values."""
cdef:
float32_t[::1] feature_values = self.feature_values
intp_t[::1] index_to_samples = self.index_to_samples
intp_t[::1] samples = self.samples
self.extract_nnz(current_feature)
# Sort the positive and negative parts of `feature_values`
sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start)
if self.start_positive < self.end:
sort(
&feature_values[self.start_positive],
&samples[self.start_positive],
self.end - self.start_positive
)
# Update index_to_samples to take into account the sort
for p in range(self.start, self.end_negative):
index_to_samples[samples[p]] = p
for p in range(self.start_positive, self.end):
index_to_samples[samples[p]] = p
# Add one or two zeros in feature_values, if there are any
if self.end_negative < self.start_positive:
self.start_positive -= 1
feature_values[self.start_positive] = 0.
if self.end_negative != self.start_positive:
feature_values[self.end_negative] = 0.
self.end_negative += 1
# XXX: When sparse supports missing values, this should be set to the
# number of missing values for current_feature
self.n_missing = 0
cdef inline void find_min_max(
self,
intp_t current_feature,
float32_t* min_feature_value_out,
float32_t* max_feature_value_out,
) noexcept nogil:
"""Find the minimum and maximum value for current_feature."""
cdef:
intp_t p
float32_t current_feature_value, min_feature_value, max_feature_value
float32_t[::1] feature_values = self.feature_values
self.extract_nnz(current_feature)
if self.end_negative != self.start_positive:
# There is a zero
min_feature_value = 0
max_feature_value = 0
else:
min_feature_value = feature_values[self.start]
max_feature_value = min_feature_value
# Find min, max in feature_values[start:end_negative]
for p in range(self.start, self.end_negative):
current_feature_value = feature_values[p]
if current_feature_value < min_feature_value:
min_feature_value = current_feature_value
elif current_feature_value > max_feature_value:
max_feature_value = current_feature_value
# Update min, max given feature_values[start_positive:end]
for p in range(self.start_positive, self.end):
current_feature_value = feature_values[p]
if current_feature_value < min_feature_value:
min_feature_value = current_feature_value
elif current_feature_value > max_feature_value:
max_feature_value = current_feature_value
min_feature_value_out[0] = min_feature_value
max_feature_value_out[0] = max_feature_value
cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil:
"""Compute the next p_prev and p for iterating over feature values."""
cdef intp_t p_next
if p[0] + 1 != self.end_negative:
p_next = p[0] + 1
else:
p_next = self.start_positive
while (p_next < self.end and
self.feature_values[p_next] <= self.feature_values[p[0]] + FEATURE_THRESHOLD):
p[0] = p_next
if p[0] + 1 != self.end_negative:
p_next = p[0] + 1
else:
p_next = self.start_positive
p_prev[0] = p[0]
p[0] = p_next
cdef inline intp_t partition_samples(
self,
float64_t current_threshold
) noexcept nogil:
"""Partition samples for feature_values at the current_threshold."""
return self._partition(current_threshold, self.start_positive)
cdef inline void partition_samples_final(
self,
intp_t best_pos,
float64_t best_threshold,
intp_t best_feature,
intp_t n_missing,
) noexcept nogil:
"""Partition samples for X at the best_threshold and best_feature."""
self.extract_nnz(best_feature)
self._partition(best_threshold, best_pos)
cdef inline intp_t _partition(self, float64_t threshold, intp_t zero_pos) noexcept nogil:
"""Partition samples[start:end] based on threshold."""
cdef:
intp_t p, partition_end
intp_t[::1] index_to_samples = self.index_to_samples
float32_t[::1] feature_values = self.feature_values
intp_t[::1] samples = self.samples
if threshold < 0.:
p = self.start
partition_end = self.end_negative
elif threshold > 0.:
p = self.start_positive
partition_end = self.end
else:
# Data are already split
return zero_pos
while p < partition_end:
if feature_values[p] <= threshold:
p += 1
else:
partition_end -= 1
feature_values[p], feature_values[partition_end] = (
feature_values[partition_end], feature_values[p]
)
sparse_swap(index_to_samples, samples, p, partition_end)
return partition_end
cdef inline void extract_nnz(self, intp_t feature) noexcept nogil:
"""Extract and partition values for a given feature.
The extracted values are partitioned between negative values
feature_values[start:end_negative[0]] and positive values
feature_values[start_positive[0]:end].
The samples and index_to_samples are modified according to this
partition.
The extraction corresponds to the intersection between the arrays
X_indices[indptr_start:indptr_end] and samples[start:end].
This is done efficiently using either an index_to_samples based approach
or binary search based approach.
Parameters
----------
feature : intp_t
Index of the feature for which to extract the non-zero values.
"""
cdef intp_t[::1] samples = self.samples
cdef float32_t[::1] feature_values = self.feature_values
cdef intp_t indptr_start = self.X_indptr[feature]
cdef intp_t indptr_end = self.X_indptr[feature + 1]
cdef intp_t n_indices = <intp_t>(indptr_end - indptr_start)
cdef intp_t n_samples = self.end - self.start
cdef intp_t[::1] index_to_samples = self.index_to_samples
cdef intp_t[::1] sorted_samples = self.sorted_samples
cdef const int32_t[::1] X_indices = self.X_indices
cdef const float32_t[::1] X_data = self.X_data
# Use the binary search approach if
#     (1 - is_samples_sorted) * n_samples * log(n_samples)
#         + n_samples * log(n_indices) < EXTRACT_NNZ_SWITCH * n_indices
# and the index_to_samples approach otherwise: O(n_samples * log(n_indices))
# is the running time of binary search and O(n_indices) is the running
# time of the index_to_samples approach.
if ((1 - self.is_samples_sorted) * n_samples * log2(n_samples) +
n_samples * log2(n_indices) < EXTRACT_NNZ_SWITCH * n_indices):
extract_nnz_binary_search(X_indices, X_data,
indptr_start, indptr_end,
samples, self.start, self.end,
index_to_samples,
feature_values,
&self.end_negative, &self.start_positive,
sorted_samples, &self.is_samples_sorted)
# Use the index_to_samples technique to extract the non-zero values;
# index_to_samples is a mapping from X_indices to positions in samples
else:
extract_nnz_index_to_samples(X_indices, X_data,
indptr_start, indptr_end,
samples, self.start, self.end,
index_to_samples,
feature_values,
&self.end_negative, &self.start_positive)
cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil:
"""Comparison function for sort.
This must return an `int` as it is used by stdlib's qsort, which expects
an `int` return value.
"""
return <int>((<intp_t*>a)[0] - (<intp_t*>b)[0])
cdef inline void binary_search(const int32_t[::1] sorted_array,
int32_t start, int32_t end,
intp_t value, intp_t* index,
int32_t* new_start) noexcept nogil:
"""Return the index of value in the sorted array.
If not found, return -1. new_start is the last pivot + 1
"""
cdef int32_t pivot
index[0] = -1
while start < end:
pivot = start + (end - start) / 2
if sorted_array[pivot] == value:
index[0] = pivot
start = pivot + 1
break
if sorted_array[pivot] < value:
start = pivot + 1
else:
end = pivot
new_start[0] = start
cdef inline void extract_nnz_index_to_samples(const int32_t[::1] X_indices,
const float32_t[::1] X_data,
int32_t indptr_start,
int32_t indptr_end,
intp_t[::1] samples,
intp_t start,
intp_t end,
intp_t[::1] index_to_samples,
float32_t[::1] feature_values,
intp_t* end_negative,
intp_t* start_positive) noexcept nogil:
"""Extract and partition values for a feature using index_to_samples.
Complexity is O(indptr_end - indptr_start).
"""
cdef int32_t k
cdef intp_t index
cdef intp_t end_negative_ = start
cdef intp_t start_positive_ = end
for k in range(indptr_start, indptr_end):
if start <= index_to_samples[X_indices[k]] < end:
if X_data[k] > 0:
start_positive_ -= 1
feature_values[start_positive_] = X_data[k]
index = index_to_samples[X_indices[k]]
sparse_swap(index_to_samples, samples, index, start_positive_)
elif X_data[k] < 0:
feature_values[end_negative_] = X_data[k]
index = index_to_samples[X_indices[k]]
sparse_swap(index_to_samples, samples, index, end_negative_)
end_negative_ += 1
# Returned values
end_negative[0] = end_negative_
start_positive[0] = start_positive_
cdef inline void extract_nnz_binary_search(const int32_t[::1] X_indices,
const float32_t[::1] X_data,
int32_t indptr_start,
int32_t indptr_end,
intp_t[::1] samples,
intp_t start,
intp_t end,
intp_t[::1] index_to_samples,
float32_t[::1] feature_values,
intp_t* end_negative,
intp_t* start_positive,
intp_t[::1] sorted_samples,
bint* is_samples_sorted) noexcept nogil:
"""Extract and partition values for a given feature using binary search.
If n_samples = end - start and n_indices = indptr_end - indptr_start,
the complexity is
O((1 - is_samples_sorted[0]) * n_samples * log(n_samples) +
n_samples * log(n_indices)).
"""
cdef intp_t n_samples
if not is_samples_sorted[0]:
n_samples = end - start
memcpy(&sorted_samples[start], &samples[start],
n_samples * sizeof(intp_t))
qsort(&sorted_samples[start], n_samples, sizeof(intp_t),
compare_SIZE_t)
is_samples_sorted[0] = 1
while (indptr_start < indptr_end and
sorted_samples[start] > X_indices[indptr_start]):
indptr_start += 1
while (indptr_start < indptr_end and
sorted_samples[end - 1] < X_indices[indptr_end - 1]):
indptr_end -= 1
cdef intp_t p = start
cdef intp_t index
cdef intp_t k
cdef intp_t end_negative_ = start
cdef intp_t start_positive_ = end
while (p < end and indptr_start < indptr_end):
# Find index of sorted_samples[p] in X_indices
binary_search(X_indices, indptr_start, indptr_end,
sorted_samples[p], &k, &indptr_start)
if k != -1:
# If k != -1, we have found a non zero value
if X_data[k] > 0:
start_positive_ -= 1
feature_values[start_positive_] = X_data[k]
index = index_to_samples[X_indices[k]]
sparse_swap(index_to_samples, samples, index, start_positive_)
elif X_data[k] < 0:
feature_values[end_negative_] = X_data[k]
index = index_to_samples[X_indices[k]]
sparse_swap(index_to_samples, samples, index, end_negative_)
end_negative_ += 1
p += 1
# Returned values
end_negative[0] = end_negative_
start_positive[0] = start_positive_
cdef inline void sparse_swap(intp_t[::1] index_to_samples, intp_t[::1] samples,
intp_t pos_1, intp_t pos_2) noexcept nogil:
"""Swap sample pos_1 and pos_2 preserving sparse invariant."""
samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1]
index_to_samples[samples[pos_1]] = pos_1
index_to_samples[samples[pos_2]] = pos_2
cdef inline void shift_missing_values_to_left_if_required(
SplitRecord* best,
intp_t[::1] samples,
intp_t end,
) noexcept nogil:
"""Shift missing value sample indices to the left of the split if required.
Note: this should always be called at the very end because it will
move samples around, thereby affecting the criterion.
This affects the computation of the children impurity, which affects
the computation of the next node.
"""
cdef intp_t i, p, current_end
# The partitioner partitions the data such that the missing values are in
# samples[-n_missing:] for the criterion to consume. If the missing values
# are going to the right node, then the missing values are already in the
# correct position. If the missing values go left, then we move the missing
# values to samples[best.pos:best.pos+n_missing] and update `best.pos`.
if best.n_missing > 0 and best.missing_go_to_left:
for p in range(best.n_missing):
i = best.pos + p
current_end = end - 1 - p
samples[i], samples[current_end] = samples[current_end], samples[i]
best.pos += best.n_missing
def _py_sort(float32_t[::1] feature_values, intp_t[::1] samples, intp_t n):
"""Used for testing sort."""
sort(&feature_values[0], &samples[0], n)
# Sort n-element arrays pointed to by feature_values and samples, simultaneously,
# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997).
cdef void sort(floating* feature_values, intp_t* samples, intp_t n) noexcept nogil:
if n == 0:
return
cdef intp_t maxd = 2 * <intp_t>log2(n)
introsort(feature_values, samples, n, maxd)
cdef inline void swap(floating* feature_values, intp_t* samples,
intp_t i, intp_t j) noexcept nogil:
# Helper for sort
feature_values[i], feature_values[j] = feature_values[j], feature_values[i]
samples[i], samples[j] = samples[j], samples[i]
cdef inline floating median3(floating* feature_values, intp_t n) noexcept nogil:
# Median-of-three pivot selection, after Bentley and McIlroy (1993),
# "Engineering a sort function", SP&E. Requires 8/3 comparisons on average.
cdef floating a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1]
if a < b:
if b < c:
return b
elif a < c:
return c
else:
return a
elif b < c:
if a < c:
return a
else:
return c
else:
return b
# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(floating* feature_values, intp_t *samples,
intp_t n, intp_t maxd) noexcept nogil:
cdef floating pivot
cdef intp_t i, l, r
while n > 1:
if maxd <= 0: # max depth limit exceeded ("gone quadratic")
heapsort(feature_values, samples, n)
return
maxd -= 1
pivot = median3(feature_values, n)
# Three-way partition.
i = l = 0
r = n
while i < r:
if feature_values[i] < pivot:
swap(feature_values, samples, i, l)
i += 1
l += 1
elif feature_values[i] > pivot:
r -= 1
swap(feature_values, samples, i, r)
else:
i += 1
introsort(feature_values, samples, l, maxd)
feature_values += r
samples += r
n -= r
cdef inline void sift_down(floating* feature_values, intp_t* samples,
intp_t start, intp_t end) noexcept nogil:
# Restore heap order in feature_values[start:end] by moving the max element to start.
cdef intp_t child, maxind, root
root = start
while True:
child = root * 2 + 1
# find max of root, left child, right child
maxind = root
if child < end and feature_values[maxind] < feature_values[child]:
maxind = child
if child + 1 < end and feature_values[maxind] < feature_values[child + 1]:
maxind = child + 1
if maxind == root:
break
else:
swap(feature_values, samples, root, maxind)
root = maxind
cdef void heapsort(floating* feature_values, intp_t* samples, intp_t n) noexcept nogil:
cdef intp_t start, end
# heapify
start = (n - 2) / 2
end = n
while True:
sift_down(feature_values, samples, start, end)
if start == 0:
break
start -= 1
# sort by shrinking the heap, putting the max element immediately after it
end = n - 1
while end > 0:
swap(feature_values, samples, 0, end)
sift_down(feature_values, samples, 0, end)
end = end - 1
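To summarize the missing-value handling in DensePartitioner above: NaNs are swapped to the tail of the active sample range before sorting or splitting. A hedged NumPy sketch of that two-pointer pass (illustrative only):

import numpy as np

def move_missing_to_end(column, samples, start, end):
    # Swap sample indices so rows with NaN in `column` end up in
    # samples[end - n_missing:end]; returns the number of missing values.
    i, current_end, n_missing = start, end - 1, 0
    while i <= current_end:
        if np.isnan(column[samples[current_end]]):
            # Right pointer already sits on a missing value: shrink the tail.
            n_missing += 1
            current_end -= 1
            continue
        if np.isnan(column[samples[i]]):
            # Left pointer is missing, right is not: swap them.
            samples[i], samples[current_end] = samples[current_end], samples[i]
            n_missing += 1
            current_end -= 1
        i += 1
    return n_missing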


@@ -0,0 +1,188 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
class DrawTree:
def __init__(self, tree, parent=None, depth=0, number=1):
self.x = -1.0
self.y = depth
self.tree = tree
self.children = [
DrawTree(c, self, depth + 1, i + 1) for i, c in enumerate(tree.children)
]
self.parent = parent
self.thread = None
self.mod = 0
self.ancestor = self
self.change = self.shift = 0
self._lmost_sibling = None
# this is the number of the node in its group of siblings 1..n
self.number = number
def left(self):
return self.thread or (len(self.children) and self.children[0])
def right(self):
return self.thread or (len(self.children) and self.children[-1])
def lbrother(self):
n = None
if self.parent:
for node in self.parent.children:
if node == self:
return n
else:
n = node
return n
def get_lmost_sibling(self):
if not self._lmost_sibling and self.parent and self != self.parent.children[0]:
self._lmost_sibling = self.parent.children[0]
return self._lmost_sibling
lmost_sibling = property(get_lmost_sibling)
def __str__(self):
return "%s: x=%s mod=%s" % (self.tree, self.x, self.mod)
def __repr__(self):
return self.__str__()
def max_extents(self):
extents = [c.max_extents() for c in self.children]
extents.append((self.x, self.y))
return np.max(extents, axis=0)
def buchheim(tree):
dt = first_walk(DrawTree(tree))
min = second_walk(dt)
if min < 0:
third_walk(dt, -min)
return dt
def third_walk(tree, n):
tree.x += n
for c in tree.children:
third_walk(c, n)
def first_walk(v, distance=1.0):
if len(v.children) == 0:
if v.lmost_sibling:
v.x = v.lbrother().x + distance
else:
v.x = 0.0
else:
default_ancestor = v.children[0]
for w in v.children:
first_walk(w)
default_ancestor = apportion(w, default_ancestor, distance)
# print("finished v =", v.tree, "children")
execute_shifts(v)
midpoint = (v.children[0].x + v.children[-1].x) / 2
w = v.lbrother()
if w:
v.x = w.x + distance
v.mod = v.x - midpoint
else:
v.x = midpoint
return v
def apportion(v, default_ancestor, distance):
w = v.lbrother()
if w is not None:
# in buchheim notation:
# i == inner; o == outer; r == right; l == left; r = +; l = -
vir = vor = v
vil = w
vol = v.lmost_sibling
sir = sor = v.mod
sil = vil.mod
sol = vol.mod
while vil.right() and vir.left():
vil = vil.right()
vir = vir.left()
vol = vol.left()
vor = vor.right()
vor.ancestor = v
shift = (vil.x + sil) - (vir.x + sir) + distance
if shift > 0:
move_subtree(ancestor(vil, v, default_ancestor), v, shift)
sir = sir + shift
sor = sor + shift
sil += vil.mod
sir += vir.mod
sol += vol.mod
sor += vor.mod
if vil.right() and not vor.right():
vor.thread = vil.right()
vor.mod += sil - sor
else:
if vir.left() and not vol.left():
vol.thread = vir.left()
vol.mod += sir - sol
default_ancestor = v
return default_ancestor
def move_subtree(wl, wr, shift):
subtrees = wr.number - wl.number
# print(wl.tree, "is conflicted with", wr.tree, 'moving', subtrees,
# 'shift', shift)
# print wl, wr, wr.number, wl.number, shift, subtrees, shift/subtrees
wr.change -= shift / subtrees
wr.shift += shift
wl.change += shift / subtrees
wr.x += shift
wr.mod += shift
def execute_shifts(v):
shift = change = 0
for w in v.children[::-1]:
# print("shift:", w, shift, w.change)
w.x += shift
w.mod += shift
change += w.change
shift += w.shift + change
def ancestor(vil, v, default_ancestor):
# the relevant text is at the bottom of page 7 of
# "Improving Walker's Algorithm to Run in Linear Time" by Buchheim et al,
# (2002)
# https://citeseerx.ist.psu.edu/doc_view/pid/1f41c3c2a4880dc49238e46d555f16d28da2940d
if vil.ancestor in v.parent.children:
return vil.ancestor
else:
return default_ancestor
def second_walk(v, m=0, depth=0, min=None):
v.x += m
v.y = depth
if min is None or v.x < min:
min = v.x
for w in v.children:
min = second_walk(w, m + v.mod, depth + 1, min)
return min
class Tree:
def __init__(self, label="", node_id=-1, *children):
self.label = label
self.node_id = node_id
if children:
self.children = children
else:
self.children = []
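A minimal usage sketch of this layout module, using the `Tree` and `buchheim` defined above (labels and ids are arbitrary):

# Build a tiny tree and compute its Reingold-Tilford layout.
root = Tree("root", 0, Tree("left", 1), Tree("right", 2))
layout = buchheim(root)
for node in [layout] + list(layout.children):
    # x is the horizontal position assigned by the walks; y equals the depth.
    print(node.tree.label, node.x, node.y)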


@@ -0,0 +1,106 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# See _splitter.pyx for details.
from sklearn.utils._typedefs cimport (
float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t
)
from sklearn.tree._criterion cimport Criterion
from sklearn.tree._tree cimport ParentInfo
cdef struct SplitRecord:
# Data to track sample split
intp_t feature # Which feature to split on.
intp_t pos # Split samples array at the given position,
# i.e. count of samples below threshold for feature.
# pos is >= end if the node is a leaf.
float64_t threshold # Threshold to split at.
float64_t improvement # Impurity improvement given parent node.
float64_t impurity_left # Impurity of the left split.
float64_t impurity_right # Impurity of the right split.
float64_t lower_bound # Lower bound on value of both children for monotonicity
float64_t upper_bound # Upper bound on value of both children for monotonicity
uint8_t missing_go_to_left # Controls if missing values go to the left node.
intp_t n_missing # Number of missing values for the feature being split on
cdef class Splitter:
# The splitter searches in the input space for a feature and a threshold
# to split the node samples samples[start:end].
#
# The impurity computations are delegated to a criterion object.
# Internal structures
cdef public Criterion criterion # Impurity criterion
cdef public intp_t max_features # Number of features to test
cdef public intp_t min_samples_leaf # Min samples in a leaf
cdef public float64_t min_weight_leaf # Minimum weight in a leaf
cdef object random_state # Random state
cdef uint32_t rand_r_state # sklearn_rand_r random number state
cdef intp_t[::1] samples # Sample indices in X, y
cdef intp_t n_samples # X.shape[0]
cdef float64_t weighted_n_samples # Weighted number of samples
cdef intp_t[::1] features # Feature indices in X
cdef intp_t[::1] constant_features # Constant features indices
cdef intp_t n_features # X.shape[1]
cdef float32_t[::1] feature_values # temp. array holding feature values
cdef intp_t start # Start position for the current node
cdef intp_t end # End position for the current node
cdef const float64_t[:, ::1] y
# Monotonicity constraints for each feature.
# The encoding is as follows:
# -1: monotonic decrease
# 0: no constraint
# +1: monotonic increase
cdef const int8_t[:] monotonic_cst
cdef bint with_monotonic_cst
cdef const float64_t[:] sample_weight
# The samples vector `samples` is maintained by the Splitter object such
# that the samples contained in a node are contiguous. With this setting,
# `node_split` reorganizes the node samples `samples[start:end]` into two
# subsets `samples[start:pos]` and `samples[pos:end]`.
# The 1-d `features` array of size n_features contains the feature
# indices and allows fast sampling without replacement of features.
# The 1-d `constant_features` array of size n_features holds in
# `constant_features[:n_constant_features]` the feature ids with
# constant values for all the samples that reached a specific node.
# The value `n_constant_features` is given by the parent node to its
# child nodes. The content of the range `[n_constant_features:]` is left
# undefined, but preallocated for performance reasons.
# This allows optimization with depth-based tree building.
# Methods
cdef int init(
self,
object X,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight,
const uint8_t[::1] missing_values_in_feature_mask,
) except -1
cdef int node_reset(
self,
intp_t start,
intp_t end,
float64_t* weighted_n_node_samples
) except -1 nogil
cdef int node_split(
self,
ParentInfo* parent,
SplitRecord* split,
) except -1 nogil
cdef void node_value(self, float64_t* dest) noexcept nogil
cdef void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil
cdef float64_t node_impurity(self) noexcept nogil
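The -1/0/+1 encoding of `monotonic_cst` above is what the estimator-level `monotonic_cst` parameter is converted into. A hedged sketch for a three-feature regression problem (the estimator parameter exists in recent scikit-learn releases; treat exact version availability as an assumption):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Feature 0: monotonically increasing effect; feature 1: unconstrained;
# feature 2: monotonically decreasing, matching the encoding documented above.
monotonic_cst = np.array([1, 0, -1], dtype=np.int8)

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = X[:, 0] - X[:, 2] + 0.1 * rng.randn(100)
reg = DecisionTreeRegressor(monotonic_cst=monotonic_cst, random_state=0).fit(X, y)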


@@ -0,0 +1,904 @@
"""Splitting algorithms in the construction of a tree.
This module contains the main splitting algorithms for constructing a tree.
Splitting is concerned with finding the optimal partition of the data into
two groups. The impurity of the groups is minimized, and the impurity is measured
by some criterion, typically the Gini impurity or the entropy. Criteria
are implemented in the ``_criterion`` module.
Splitting evaluates a subset of features (defined by `max_features`, also
known as mtry in the literature). The module supports two primary types
of splitting strategies:
- Best Split: A greedy approach to find the optimal split. This method
ensures that the best possible split is chosen by examining various
thresholds for each candidate feature.
- Random Split: A stochastic approach that selects a split randomly
from a subset of the best splits. This method is faster but does
not guarantee the optimal split.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from libc.string cimport memcpy
from sklearn.utils._typedefs cimport int8_t
from sklearn.tree._criterion cimport Criterion
from sklearn.tree._partitioner cimport (
FEATURE_THRESHOLD, DensePartitioner, SparsePartitioner,
shift_missing_values_to_left_if_required
)
from sklearn.tree._utils cimport RAND_R_MAX, rand_int, rand_uniform
import numpy as np
# Introduce a fused-class to make it possible to share the split implementation
# between the dense and sparse cases in the node_split_best and node_split_random
# functions. The alternative would have been to use inheritance-based polymorphism
# but it would have resulted in a ~10% overall tree fitting performance
# degradation caused by the overhead of frequent virtual method lookups.
ctypedef fused Partitioner:
DensePartitioner
SparsePartitioner
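# With this fused type, each function taking a `Partitioner` argument (e.g.
# node_split_best below) is compiled into one specialization per concrete
# partitioner class, so partitioner method calls are resolved statically at
# compile time rather than through a vtable at runtime.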
cdef float64_t INFINITY = np.inf
cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil:
self.impurity_left = INFINITY
self.impurity_right = INFINITY
self.pos = start_pos
self.feature = 0
self.threshold = 0.
self.improvement = -INFINITY
self.missing_go_to_left = False
self.n_missing = 0
cdef class Splitter:
"""Abstract splitter class.
Splitters are called by tree builders to find the best splits on both
sparse and dense data, one split at a time.
"""
def __cinit__(
self,
Criterion criterion,
intp_t max_features,
intp_t min_samples_leaf,
float64_t min_weight_leaf,
object random_state,
const int8_t[:] monotonic_cst,
):
"""
Parameters
----------
criterion : Criterion
The criterion to measure the quality of a split.
max_features : intp_t
The maximal number of randomly selected features which can be
considered for a split.
min_samples_leaf : intp_t
The minimal number of samples each leaf can have, where splits
which would result in having fewer samples in a leaf are not
considered.
min_weight_leaf : float64_t
The minimal weight each leaf can have, where the weight is the sum
of the weights of each sample in it.
random_state : object
The user-provided random state to be used for pseudo-randomness
monotonic_cst : const int8_t[:]
Monotonicity constraints
"""
self.criterion = criterion
self.n_samples = 0
self.n_features = 0
self.max_features = max_features
self.min_samples_leaf = min_samples_leaf
self.min_weight_leaf = min_weight_leaf
self.random_state = random_state
self.monotonic_cst = monotonic_cst
self.with_monotonic_cst = monotonic_cst is not None
def __getstate__(self):
return {}
def __setstate__(self, d):
pass
def __reduce__(self):
return (type(self), (self.criterion,
self.max_features,
self.min_samples_leaf,
self.min_weight_leaf,
self.random_state,
self.monotonic_cst), self.__getstate__())
cdef int init(
self,
object X,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight,
const uint8_t[::1] missing_values_in_feature_mask,
) except -1:
"""Initialize the splitter.
Take in the input data X, the target y, and optional sample weights.
Returns -1 in case of failure to allocate memory (and raise MemoryError)
or 0 otherwise.
Parameters
----------
X : object
This contains the inputs. Usually it is a 2d numpy array.
y : ndarray, dtype=float64_t
This is the vector of targets, or true labels, for the samples represented
as a Cython memoryview.
sample_weight : ndarray, dtype=float64_t
The weights of the samples, where higher-weighted samples are fit
more closely than lower-weighted samples. If not provided, all samples
are assumed to have uniform weight. This is represented
as a Cython memoryview.
missing_values_in_feature_mask : ndarray, dtype=uint8_t
Mask indicating, for each feature, whether X contains at least one
missing value for it.
"""
self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
cdef intp_t n_samples = X.shape[0]
# Create a new array which will be used to store the indices of
# the samples with non-zero weight
self.samples = np.empty(n_samples, dtype=np.intp)
cdef intp_t[::1] samples = self.samples
cdef intp_t i, j
cdef float64_t weighted_n_samples = 0.0
j = 0
for i in range(n_samples):
# Only work with positively weighted samples
if sample_weight is None or sample_weight[i] != 0.0:
samples[j] = i
j += 1
if sample_weight is not None:
weighted_n_samples += sample_weight[i]
else:
weighted_n_samples += 1.0
# Number of samples is number of positively weighted samples
self.n_samples = j
self.weighted_n_samples = weighted_n_samples
cdef intp_t n_features = X.shape[1]
self.features = np.arange(n_features, dtype=np.intp)
self.n_features = n_features
self.feature_values = np.empty(n_samples, dtype=np.float32)
self.constant_features = np.empty(n_features, dtype=np.intp)
self.y = y
self.sample_weight = sample_weight
if missing_values_in_feature_mask is not None:
self.criterion.init_sum_missing()
return 0
cdef int node_reset(
self,
intp_t start,
intp_t end,
float64_t* weighted_n_node_samples
) except -1 nogil:
"""Reset splitter on node samples[start:end].
Returns -1 in case of failure to allocate memory (and raise MemoryError)
or 0 otherwise.
Parameters
----------
start : intp_t
The index of the first sample to consider
end : intp_t
The (exclusive) end index of the samples to consider
weighted_n_node_samples : float64_t pointer
Output parameter: the total weight of the samples in the node
"""
self.start = start
self.end = end
self.criterion.init(
self.y,
self.sample_weight,
self.weighted_n_samples,
self.samples,
start,
end
)
weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples
return 0
cdef int node_split(
self,
ParentInfo* parent_record,
SplitRecord* split,
) except -1 nogil:
"""Find the best split on node samples[start:end].
This is a placeholder method. The majority of computation will be done
here.
It should return -1 upon errors.
"""
pass
cdef void node_value(self, float64_t* dest) noexcept nogil:
"""Copy the value of node samples[start:end] into dest."""
self.criterion.node_value(dest)
cdef inline void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil:
"""Clip the value in dest between lower_bound and upper_bound for monotonic constraints."""
self.criterion.clip_node_value(dest, lower_bound, upper_bound)
cdef float64_t node_impurity(self) noexcept nogil:
"""Return the impurity of the current node."""
return self.criterion.node_impurity()
cdef inline int node_split_best(
Splitter splitter,
Partitioner partitioner,
Criterion criterion,
SplitRecord* split,
ParentInfo* parent_record,
) except -1 nogil:
"""Find the best split on node samples[start:end]
Returns -1 in case of failure to allocate memory (and raise MemoryError)
or 0 otherwise.
"""
cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst
cdef bint with_monotonic_cst = splitter.with_monotonic_cst
# Find the best split
cdef intp_t start = splitter.start
cdef intp_t end = splitter.end
cdef intp_t end_non_missing
cdef intp_t n_missing = 0
cdef bint has_missing = 0
cdef intp_t n_searches
cdef intp_t n_left, n_right
cdef bint missing_go_to_left
cdef intp_t[::1] samples = splitter.samples
cdef intp_t[::1] features = splitter.features
cdef intp_t[::1] constant_features = splitter.constant_features
cdef intp_t n_features = splitter.n_features
cdef float32_t[::1] feature_values = splitter.feature_values
cdef intp_t max_features = splitter.max_features
cdef intp_t min_samples_leaf = splitter.min_samples_leaf
cdef float64_t min_weight_leaf = splitter.min_weight_leaf
cdef uint32_t* random_state = &splitter.rand_r_state
cdef SplitRecord best_split, current_split
cdef float64_t current_proxy_improvement = -INFINITY
cdef float64_t best_proxy_improvement = -INFINITY
cdef float64_t impurity = parent_record.impurity
cdef float64_t lower_bound = parent_record.lower_bound
cdef float64_t upper_bound = parent_record.upper_bound
cdef intp_t f_i = n_features
cdef intp_t f_j
cdef intp_t p
cdef intp_t p_prev
cdef intp_t n_visited_features = 0
# Number of features discovered to be constant during the split search
cdef intp_t n_found_constants = 0
# Number of features known to be constant and drawn without replacement
cdef intp_t n_drawn_constants = 0
cdef intp_t n_known_constants = parent_record.n_constant_features
# n_total_constants = n_known_constants + n_found_constants
cdef intp_t n_total_constants = n_known_constants
_init_split(&best_split, end)
partitioner.init_node_split(start, end)
# Sample up to max_features without replacement using a
# Fisher-Yates-based algorithm (using the local variables `f_i` and
# `f_j` to compute a permutation of the `features` array).
#
# Skip the CPU intensive evaluation of the impurity criterion for
# features that were already detected as constant (hence not suitable
# for good splitting) by ancestor nodes and save the information on
# newly discovered constant features to spare computation on descendant
# nodes.
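# For intuition, a hedged pure-Python rendering of a plain Fisher-Yates
# draw without replacement (the loop below additionally re-partitions the
# constant features while drawing):
#
#     import random
#     def draw_without_replacement(items, k, rng=random):
#         items = list(items)
#         f_i = len(items)
#         for _ in range(k):
#             f_j = rng.randrange(f_i)  # pick among the not-yet-drawn prefix
#             f_i -= 1
#             # move the pick into the drawn suffix
#             items[f_i], items[f_j] = items[f_j], items[f_i]
#         return items[f_i:]  # k distinct items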
while (f_i > n_total_constants and # Stop early if remaining features
# are constant
(n_visited_features < max_features or
# At least one drawn feature must be non-constant
n_visited_features <= n_found_constants + n_drawn_constants)):
n_visited_features += 1
# Loop invariant: elements of features in
# - [:n_drawn_constant[ holds drawn and known constant features;
# - [n_drawn_constant:n_known_constant[ holds known constant
# features that haven't been drawn yet;
# - [n_known_constant:n_total_constant[ holds newly found constant
# features;
# - [n_total_constant:f_i[ holds features that haven't been drawn
# yet and aren't constant a priori.
# - [f_i:n_features[ holds features that have been drawn
# and aren't constant.
# Draw a feature at random
f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
random_state)
if f_j < n_known_constants:
# f_j in the interval [n_drawn_constants, n_known_constants[
features[n_drawn_constants], features[f_j] = features[f_j], features[n_drawn_constants]
n_drawn_constants += 1
continue
# f_j in the interval [n_known_constants, f_i - n_found_constants[
f_j += n_found_constants
# f_j in the interval [n_total_constants, f_i[
current_split.feature = features[f_j]
partitioner.sort_samples_and_feature_values(current_split.feature)
n_missing = partitioner.n_missing
end_non_missing = end - n_missing
if (
# All values for this feature are missing, or
end_non_missing == start or
# This feature is considered constant (max - min <= FEATURE_THRESHOLD)
((
feature_values[end_non_missing - 1]
<= feature_values[start] + FEATURE_THRESHOLD
) and n_missing == 0)
):
# We consider this feature constant in this case.
# Since finding a split on a constant feature is not valuable,
# we do not consider this feature for splitting.
features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j]
n_found_constants += 1
n_total_constants += 1
continue
f_i -= 1
features[f_i], features[f_j] = features[f_j], features[f_i]
has_missing = n_missing != 0
criterion.init_missing(n_missing) # initialize even when n_missing == 0
# Evaluate all splits
# If there are missing values, then we search twice for the best split.
# The first search will have all the missing values going to the right node.
# The second search will have all the missing values going to the left node.
# If there are no missing values, then we search only once for the best
# split.
n_searches = 2 if has_missing else 1
for i in range(n_searches):
missing_go_to_left = i == 1
criterion.missing_go_to_left = missing_go_to_left
criterion.reset()
p = start
while p < end_non_missing:
partitioner.next_p(&p_prev, &p)
if p >= end_non_missing:
continue
if missing_go_to_left:
n_left = p - start + n_missing
n_right = end_non_missing - p
else:
n_left = p - start
n_right = end_non_missing - p + n_missing
# Reject if min_samples_leaf is not guaranteed
if n_left < min_samples_leaf or n_right < min_samples_leaf:
continue
current_split.pos = p
criterion.update(current_split.pos)
# Reject if monotonicity constraints are not satisfied
if (
with_monotonic_cst and
monotonic_cst[current_split.feature] != 0 and
not criterion.check_monotonicity(
monotonic_cst[current_split.feature],
lower_bound,
upper_bound,
)
):
continue
# Reject if min_weight_leaf is not satisfied
if ((criterion.weighted_n_left < min_weight_leaf) or
(criterion.weighted_n_right < min_weight_leaf)):
continue
current_proxy_improvement = criterion.proxy_impurity_improvement()
if current_proxy_improvement > best_proxy_improvement:
best_proxy_improvement = current_proxy_improvement
# The sum of halves is used to avoid overflowing to an infinite value
current_split.threshold = (
feature_values[p_prev] / 2.0 + feature_values[p] / 2.0
)
if (
current_split.threshold == feature_values[p] or
current_split.threshold == INFINITY or
current_split.threshold == -INFINITY
):
current_split.threshold = feature_values[p_prev]
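# The fallback above handles midpoint rounding: if (a/2 + b/2) collapses
# onto feature_values[p] or overflows to +/- inf, using feature_values[p_prev]
# keeps the threshold strictly below feature_values[p], so the split still
# separates the two adjacent samples.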
current_split.n_missing = n_missing
# If there are no missing values in the training data, then at test
# time missing values are sent to the branch that contained the most
# samples during training.
if n_missing == 0:
current_split.missing_go_to_left = n_left > n_right
else:
current_split.missing_go_to_left = missing_go_to_left
best_split = current_split # copy
# Evaluate the split where all missing values go to the right node
# and all non-missing values go to the left node.
if has_missing:
n_left, n_right = end - start - n_missing, n_missing
p = end - n_missing
missing_go_to_left = 0
if not (n_left < min_samples_leaf or n_right < min_samples_leaf):
criterion.missing_go_to_left = missing_go_to_left
criterion.update(p)
if not ((criterion.weighted_n_left < min_weight_leaf) or
(criterion.weighted_n_right < min_weight_leaf)):
current_proxy_improvement = criterion.proxy_impurity_improvement()
if current_proxy_improvement > best_proxy_improvement:
best_proxy_improvement = current_proxy_improvement
current_split.threshold = INFINITY
current_split.missing_go_to_left = missing_go_to_left
current_split.n_missing = n_missing
current_split.pos = p
best_split = current_split
# Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end]
if best_split.pos < end:
partitioner.partition_samples_final(
best_split.pos,
best_split.threshold,
best_split.feature,
best_split.n_missing
)
criterion.init_missing(best_split.n_missing)
criterion.missing_go_to_left = best_split.missing_go_to_left
criterion.reset()
criterion.update(best_split.pos)
criterion.children_impurity(
&best_split.impurity_left, &best_split.impurity_right
)
best_split.improvement = criterion.impurity_improvement(
impurity,
best_split.impurity_left,
best_split.impurity_right
)
shift_missing_values_to_left_if_required(&best_split, samples, end)
# Respect invariant for constant features: the original order of
# elements in features[:n_known_constants] must be preserved for sibling
# and child nodes
memcpy(&features[0], &constant_features[0], sizeof(intp_t) * n_known_constants)
# Copy newly found constant features
memcpy(&constant_features[n_known_constants],
&features[n_known_constants],
sizeof(intp_t) * n_found_constants)
# Return values
parent_record.n_constant_features = n_total_constants
split[0] = best_split
return 0
cdef inline int node_split_random(
Splitter splitter,
Partitioner partitioner,
Criterion criterion,
SplitRecord* split,
ParentInfo* parent_record,
) except -1 nogil:
"""Find the best random split on node samples[start:end]
Returns -1 in case of failure to allocate memory (and raise MemoryError)
or 0 otherwise.
"""
cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst
cdef bint with_monotonic_cst = splitter.with_monotonic_cst
# Draw random splits and pick the best
cdef intp_t start = splitter.start
cdef intp_t end = splitter.end
cdef intp_t end_non_missing
cdef intp_t n_missing = 0
cdef bint has_missing = 0
cdef intp_t n_left, n_right
cdef bint missing_go_to_left
cdef intp_t[::1] samples = splitter.samples
cdef intp_t[::1] features = splitter.features
cdef intp_t[::1] constant_features = splitter.constant_features
cdef intp_t n_features = splitter.n_features
cdef intp_t max_features = splitter.max_features
cdef intp_t min_samples_leaf = splitter.min_samples_leaf
cdef float64_t min_weight_leaf = splitter.min_weight_leaf
cdef uint32_t* random_state = &splitter.rand_r_state
cdef SplitRecord best_split, current_split
cdef float64_t current_proxy_improvement = - INFINITY
cdef float64_t best_proxy_improvement = - INFINITY
cdef float64_t impurity = parent_record.impurity
cdef float64_t lower_bound = parent_record.lower_bound
cdef float64_t upper_bound = parent_record.upper_bound
cdef intp_t f_i = n_features
cdef intp_t f_j
# Number of features discovered to be constant during the split search
cdef intp_t n_found_constants = 0
# Number of features known to be constant and drawn without replacement
cdef intp_t n_drawn_constants = 0
cdef intp_t n_known_constants = parent_record.n_constant_features
# n_total_constants = n_known_constants + n_found_constants
cdef intp_t n_total_constants = n_known_constants
cdef intp_t n_visited_features = 0
cdef float32_t min_feature_value
cdef float32_t max_feature_value
_init_split(&best_split, end)
partitioner.init_node_split(start, end)
# Sample up to max_features without replacement using a
# Fisher-Yates-based algorithm (using the local variables `f_i` and
# `f_j` to compute a permutation of the `features` array).
#
# Skip the CPU intensive evaluation of the impurity criterion for
# features that were already detected as constant (hence not suitable
# for good splitting) by ancestor nodes and save the information on
# newly discovered constant features to spare computation on descendant
# nodes.
while (f_i > n_total_constants and # Stop early if remaining features
# are constant
(n_visited_features < max_features or
             # At least one drawn feature must be non-constant
n_visited_features <= n_found_constants + n_drawn_constants)):
n_visited_features += 1
# Loop invariant: elements of features in
# - [:n_drawn_constant[ holds drawn and known constant features;
# - [n_drawn_constant:n_known_constant[ holds known constant
# features that haven't been drawn yet;
# - [n_known_constant:n_total_constant[ holds newly found constant
# features;
# - [n_total_constant:f_i[ holds features that haven't been drawn
        #   yet and aren't constant a priori.
# - [f_i:n_features[ holds features that have been drawn
# and aren't constant.
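        #
        # Schematically, the `features` array is laid out as:
        #
        #   [ drawn    | known const, | newly found | not drawn,   | drawn,       ]
        #   [ constant | not drawn    | constant    | non-constant | non-constant ]
        #   0    n_drawn_constants  n_known_constants  n_total_constants  f_i  n_features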
# Draw a feature at random
f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
random_state)
if f_j < n_known_constants:
# f_j in the interval [n_drawn_constants, n_known_constants[
features[n_drawn_constants], features[f_j] = features[f_j], features[n_drawn_constants]
n_drawn_constants += 1
continue
# f_j in the interval [n_known_constants, f_i - n_found_constants[
f_j += n_found_constants
# f_j in the interval [n_total_constants, f_i[
current_split.feature = features[f_j]
# Find min, max as we will randomly select a threshold between them
partitioner.find_min_max(
current_split.feature, &min_feature_value, &max_feature_value
)
n_missing = partitioner.n_missing
end_non_missing = end - n_missing
if (
# All values for this feature are missing, or
end_non_missing == start or
# This feature is considered constant (max - min <= FEATURE_THRESHOLD)
(max_feature_value <= min_feature_value + FEATURE_THRESHOLD and n_missing == 0)
):
# We consider this feature constant in this case.
# Since finding a split with a constant feature is not valuable,
# we do not consider this feature for splitting.
features[f_j], features[n_total_constants] = features[n_total_constants], current_split.feature
n_found_constants += 1
n_total_constants += 1
continue
f_i -= 1
features[f_i], features[f_j] = features[f_j], features[f_i]
has_missing = n_missing != 0
criterion.init_missing(n_missing)
# Draw a random threshold
current_split.threshold = rand_uniform(
min_feature_value,
max_feature_value,
random_state,
)
if has_missing:
# If there are missing values, then we randomly make all missing
# values go to the right or left.
#
# Note: compared to the BestSplitter, we do not evaluate the
# edge case where all the missing values go to the right node
# and the non-missing values go to the left node. This is because
# this would indicate a threshold outside of the observed range
# of the feature. However, it is not clear how much probability weight should
# be given to this edge case.
missing_go_to_left = rand_int(0, 2, random_state)
else:
missing_go_to_left = 0
criterion.missing_go_to_left = missing_go_to_left
if current_split.threshold == max_feature_value:
current_split.threshold = min_feature_value
# Partition
current_split.pos = partitioner.partition_samples(
current_split.threshold
)
if missing_go_to_left:
n_left = current_split.pos - start + n_missing
n_right = end_non_missing - current_split.pos
else:
n_left = current_split.pos - start
n_right = end_non_missing - current_split.pos + n_missing
# Reject if min_samples_leaf is not guaranteed
if n_left < min_samples_leaf or n_right < min_samples_leaf:
continue
# Evaluate split
        # At this point, the criterion has a view into the samples that were
        # partitioned by the partitioner. The criterion will use that
        # partition to evaluate the split.
criterion.reset()
criterion.update(current_split.pos)
# Reject if min_weight_leaf is not satisfied
if ((criterion.weighted_n_left < min_weight_leaf) or
(criterion.weighted_n_right < min_weight_leaf)):
continue
# Reject if monotonicity constraints are not satisfied
if (
with_monotonic_cst and
monotonic_cst[current_split.feature] != 0 and
not criterion.check_monotonicity(
monotonic_cst[current_split.feature],
lower_bound,
upper_bound,
)
):
continue
current_proxy_improvement = criterion.proxy_impurity_improvement()
if current_proxy_improvement > best_proxy_improvement:
current_split.n_missing = n_missing
# if there are no missing values in the training data, during
# test time, we send missing values to the branch that contains
# the most samples during training time.
if has_missing:
current_split.missing_go_to_left = missing_go_to_left
else:
current_split.missing_go_to_left = n_left > n_right
best_proxy_improvement = current_proxy_improvement
best_split = current_split # copy
# Reorganize into samples[start:best.pos] + samples[best.pos:end]
if best_split.pos < end:
if current_split.feature != best_split.feature:
partitioner.partition_samples_final(
best_split.pos,
best_split.threshold,
best_split.feature,
best_split.n_missing
)
criterion.init_missing(best_split.n_missing)
criterion.missing_go_to_left = best_split.missing_go_to_left
criterion.reset()
criterion.update(best_split.pos)
criterion.children_impurity(
&best_split.impurity_left, &best_split.impurity_right
)
best_split.improvement = criterion.impurity_improvement(
impurity,
best_split.impurity_left,
best_split.impurity_right
)
shift_missing_values_to_left_if_required(&best_split, samples, end)
# Respect invariant for constant features: the original order of
# element in features[:n_known_constants] must be preserved for sibling
# and child nodes
memcpy(&features[0], &constant_features[0], sizeof(intp_t) * n_known_constants)
# Copy newly found constant features
memcpy(&constant_features[n_known_constants],
&features[n_known_constants],
sizeof(intp_t) * n_found_constants)
# Return values
parent_record.n_constant_features = n_total_constants
split[0] = best_split
return 0
cdef class BestSplitter(Splitter):
"""Splitter for finding the best split on dense data."""
cdef DensePartitioner partitioner
cdef int init(
self,
object X,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight,
const uint8_t[::1] missing_values_in_feature_mask,
) except -1:
Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
self.partitioner = DensePartitioner(
X, self.samples, self.feature_values, missing_values_in_feature_mask
)
cdef int node_split(
self,
ParentInfo* parent_record,
SplitRecord* split,
) except -1 nogil:
return node_split_best(
self,
self.partitioner,
self.criterion,
split,
parent_record,
)
cdef class BestSparseSplitter(Splitter):
"""Splitter for finding the best split, using the sparse data."""
cdef SparsePartitioner partitioner
cdef int init(
self,
object X,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight,
const uint8_t[::1] missing_values_in_feature_mask,
) except -1:
Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
self.partitioner = SparsePartitioner(
X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask
)
cdef int node_split(
self,
ParentInfo* parent_record,
SplitRecord* split,
) except -1 nogil:
return node_split_best(
self,
self.partitioner,
self.criterion,
split,
parent_record,
)
cdef class RandomSplitter(Splitter):
"""Splitter for finding the best random split on dense data."""
cdef DensePartitioner partitioner
cdef int init(
self,
object X,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight,
const uint8_t[::1] missing_values_in_feature_mask,
) except -1:
Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
self.partitioner = DensePartitioner(
X, self.samples, self.feature_values, missing_values_in_feature_mask
)
cdef int node_split(
self,
ParentInfo* parent_record,
SplitRecord* split,
) except -1 nogil:
return node_split_random(
self,
self.partitioner,
self.criterion,
split,
parent_record,
)
cdef class RandomSparseSplitter(Splitter):
"""Splitter for finding the best random split, using the sparse data."""
cdef SparsePartitioner partitioner
cdef int init(
self,
object X,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight,
const uint8_t[::1] missing_values_in_feature_mask,
) except -1:
Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
self.partitioner = SparsePartitioner(
X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask
)
cdef int node_split(
self,
ParentInfo* parent_record,
SplitRecord* split,
) except -1 nogil:
return node_split_random(
self,
self.partitioner,
self.criterion,
split,
parent_record,
)
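# ---------------------------------------------------------------------------
# How the concrete splitters above are reached from the public API -- a
# minimal sketch using only documented estimator parameters:
#
#   from sklearn.tree import DecisionTreeClassifier
#
#   DecisionTreeClassifier(splitter="best")    # dense X -> BestSplitter
#   DecisionTreeClassifier(splitter="random")  # dense X -> RandomSplitter
#
# Fitting either estimator on a scipy.sparse CSC matrix selects the
# corresponding sparse variant (BestSparseSplitter / RandomSparseSplitter).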

View File

@@ -0,0 +1,133 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# See _tree.pyx for details.
import numpy as np
cimport numpy as cnp
from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t, uint32_t
from sklearn.tree._splitter cimport Splitter
from sklearn.tree._splitter cimport SplitRecord
cdef struct Node:
# Base storage structure for the nodes in a Tree object
intp_t left_child # id of the left child of the node
intp_t right_child # id of the right child of the node
intp_t feature # Feature used for splitting the node
float64_t threshold # Threshold value at the node
float64_t impurity # Impurity of the node (i.e., the value of the criterion)
intp_t n_node_samples # Number of samples at the node
float64_t weighted_n_node_samples # Weighted number of samples at the node
    uint8_t missing_go_to_left           # Whether missing values are sent to the left child
cdef struct ParentInfo:
# Structure to store information about the parent of a node
# This is passed to the splitter, to provide information about the previous split
    float64_t lower_bound           # the lower bound of the node values (used by monotonicity constraints)
    float64_t upper_bound           # the upper bound of the node values (used by monotonicity constraints)
float64_t impurity # the impurity of the parent
intp_t n_constant_features # the number of constant features found in parent
cdef class Tree:
# The Tree object is a binary tree structure constructed by the
# TreeBuilder. The tree structure is used for predictions and
# feature importances.
# Input/Output layout
cdef public intp_t n_features # Number of features in X
cdef intp_t* n_classes # Number of classes in y[:, k]
cdef public intp_t n_outputs # Number of outputs in y
cdef public intp_t max_n_classes # max(n_classes)
# Inner structures: values are stored separately from node structure,
# since size is determined at runtime.
cdef public intp_t max_depth # Max depth of the tree
cdef public intp_t node_count # Counter for node IDs
cdef public intp_t capacity # Capacity of tree, in terms of nodes
cdef Node* nodes # Array of nodes
cdef float64_t* value # (capacity, n_outputs, max_n_classes) array of values
cdef intp_t value_stride # = n_outputs * max_n_classes
# Methods
cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf,
intp_t feature, float64_t threshold, float64_t impurity,
intp_t n_node_samples,
float64_t weighted_n_node_samples,
uint8_t missing_go_to_left) except -1 nogil
cdef int _resize(self, intp_t capacity) except -1 nogil
cdef int _resize_c(self, intp_t capacity=*) except -1 nogil
cdef cnp.ndarray _get_value_ndarray(self)
cdef cnp.ndarray _get_node_ndarray(self)
cpdef cnp.ndarray predict(self, object X)
cpdef cnp.ndarray apply(self, object X)
cdef cnp.ndarray _apply_dense(self, object X)
cdef cnp.ndarray _apply_sparse_csr(self, object X)
cpdef object decision_path(self, object X)
cdef object _decision_path_dense(self, object X)
cdef object _decision_path_sparse_csr(self, object X)
cpdef compute_node_depths(self)
cpdef compute_feature_importances(self, normalize=*)
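    # Note: on a fitted estimator these structures surface in Python through
    # the read-only `tree_` attribute, e.g. `est.tree_.children_left`,
    # `est.tree_.feature`, `est.tree_.threshold` and `est.tree_.value`
    # (the monotonicity tests further below rely on exactly this access
    # pattern).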
# =============================================================================
# Tree builder
# =============================================================================
cdef class TreeBuilder:
# The TreeBuilder recursively builds a Tree object from training samples,
# using a Splitter object for splitting internal nodes and assigning
# values to leaves.
#
# This class controls the various stopping criteria and the node splitting
# evaluation order, e.g. depth-first or best-first.
cdef Splitter splitter # Splitting algorithm
cdef intp_t min_samples_split # Minimum number of samples in an internal node
cdef intp_t min_samples_leaf # Minimum number of samples in a leaf
cdef float64_t min_weight_leaf # Minimum weight in a leaf
cdef intp_t max_depth # Maximal tree depth
cdef float64_t min_impurity_decrease # Impurity threshold for early stopping
cpdef build(
self,
Tree tree,
object X,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight=*,
const uint8_t[::1] missing_values_in_feature_mask=*,
)
cdef _check_input(
self,
object X,
const float64_t[:, ::1] y,
const float64_t[:] sample_weight,
)
# =============================================================================
# Tree pruning
# =============================================================================
# This private function allows an external caller to prune the tree; the
# pruned result is returned as a new Tree object.
#
# .. warning:: this function is not backwards compatible and may change without
# notice.
cdef void _build_pruned_tree(
Tree tree, # OUT
Tree orig_tree,
const uint8_t[:] leaves_in_subtree,
intp_t capacity
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,70 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# See _utils.pyx for details.
cimport numpy as cnp
from sklearn.tree._tree cimport Node
from sklearn.neighbors._quad_tree cimport Cell
from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t
cdef enum:
# Max value for our rand_r replacement (near the bottom).
# We don't use RAND_MAX because it's different across platforms and
# particularly tiny on Windows/MSVC.
# It corresponds to the maximum representable value for
# 32-bit signed integers (i.e. 2^31 - 1).
RAND_R_MAX = 2147483647
# safe_realloc(&p, n) resizes the allocation of p to n * sizeof(*p) bytes or
# raises a MemoryError. It never calls free, since that's __dealloc__'s job.
# cdef float32_t *p = NULL
# safe_realloc(&p, n)
# is equivalent to p = malloc(n * sizeof(*p)) with error checking.
ctypedef fused realloc_ptr:
# Add pointer types here as needed.
(float32_t*)
(intp_t*)
(uint8_t*)
(float64_t*)
(float64_t**)
(Node*)
(Cell*)
(Node**)
cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil
cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size)
cdef intp_t rand_int(intp_t low, intp_t high,
uint32_t* random_state) noexcept nogil
cdef float64_t rand_uniform(float64_t low, float64_t high,
uint32_t* random_state) noexcept nogil
cdef float64_t log(float64_t x) noexcept nogil
cdef class WeightedFenwickTree:
cdef intp_t size # number of leaves (ranks)
cdef float64_t* tree_w # BIT for weights
cdef float64_t* tree_wy # BIT for weighted targets
cdef intp_t max_pow2 # highest power of two <= n
cdef float64_t total_w # running total weight
cdef float64_t total_wy # running total weighted target
cdef void reset(self, intp_t size) noexcept nogil
cdef void add(self, intp_t idx, float64_t y, float64_t w) noexcept nogil
cdef intp_t search(
self,
float64_t t,
float64_t* cw_out,
float64_t* cwy_out,
intp_t* prev_idx_out,
) noexcept nogil

View File

@@ -0,0 +1,291 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from libc.stdlib cimport free
from libc.stdlib cimport realloc
from libc.math cimport log as ln
from libc.math cimport isnan
from libc.string cimport memset
import numpy as np
cimport numpy as cnp
cnp.import_array()
from sklearn.utils._random cimport our_rand_r
# =============================================================================
# Helper functions
# =============================================================================
cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil:
# sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython
# 0.20.1 to crash.
cdef size_t nbytes = nelems * sizeof(p[0][0])
if nbytes / sizeof(p[0][0]) != nelems:
# Overflow in the multiplication
raise MemoryError(f"could not allocate ({nelems} * {sizeof(p[0][0])}) bytes")
cdef realloc_ptr tmp = <realloc_ptr>realloc(p[0], nbytes)
if tmp == NULL:
raise MemoryError(f"could not allocate {nbytes} bytes")
p[0] = tmp
return 0
def _realloc_test():
# Helper for tests. Tries to allocate <size_t>(-1) / 2 * sizeof(size_t)
# bytes, which will always overflow.
cdef intp_t* p = NULL
safe_realloc(&p, <size_t>(-1) / 2)
if p != NULL:
free(p)
assert False
cdef inline cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size):
"""Return copied data as 1D numpy array of intp's."""
cdef cnp.npy_intp shape[1]
shape[0] = <cnp.npy_intp> size
return cnp.PyArray_SimpleNewFromData(1, shape, cnp.NPY_INTP, data).copy()
cdef inline intp_t rand_int(intp_t low, intp_t high,
uint32_t* random_state) noexcept nogil:
"""Generate a random integer in [low; end)."""
return low + our_rand_r(random_state) % (high - low)
cdef inline float64_t rand_uniform(float64_t low, float64_t high,
uint32_t* random_state) noexcept nogil:
"""Generate a random float64_t in [low; high)."""
return ((high - low) * <float64_t> our_rand_r(random_state) /
<float64_t> RAND_R_MAX) + low
cdef inline float64_t log(float64_t x) noexcept nogil:
return ln(x) / ln(2.0)
def _any_isnan_axis0(const float32_t[:, :] X):
"""Same as np.any(np.isnan(X), axis=0)"""
cdef:
intp_t i, j
intp_t n_samples = X.shape[0]
intp_t n_features = X.shape[1]
uint8_t[::1] isnan_out = np.zeros(X.shape[1], dtype=np.bool_)
with nogil:
for i in range(n_samples):
for j in range(n_features):
if isnan_out[j]:
continue
if isnan(X[i, j]):
isnan_out[j] = True
break
return np.asarray(isnan_out)
cdef class WeightedFenwickTree:
"""
Fenwick tree (Binary Indexed Tree) specialized for maintaining:
- prefix sums of weights
- prefix sums of weight * target (y)
Notes:
- Implementation uses 1-based indexing internally for the Fenwick tree
arrays, hence the +1 sized buffers. 1-based indexing is customary for this
      data structure and makes some of the index handling slightly more
      efficient and natural.
- Memory ownership: this class allocates and frees the underlying C buffers.
- Typical operations:
add(rank, y, w) -> O(log n)
search(t) -> O(log n), finds the smallest rank with
cumulative weight > t (see search for details).
"""
def __cinit__(self, intp_t capacity):
self.tree_w = NULL
self.tree_wy = NULL
# Allocate arrays of length (capacity + 1) because indices are 1-based.
safe_realloc(&self.tree_w, capacity + 1)
safe_realloc(&self.tree_wy, capacity + 1)
cdef void reset(self, intp_t size) noexcept nogil:
"""
Reset the tree to hold 'size' elements and clear all aggregates.
"""
cdef intp_t p
cdef intp_t n_bytes = (size + 1) * sizeof(float64_t) # +1 for 1-based storage
# Public size and zeroed aggregates.
self.size = size
memset(self.tree_w, 0, n_bytes)
memset(self.tree_wy, 0, n_bytes)
self.total_w = 0.0
self.total_wy = 0.0
# highest power of two <= size
p = 1
while p <= size:
p <<= 1
self.max_pow2 = p >> 1
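        # (e.g. size = 100: p stops at 128, so max_pow2 = 64, the largest
        # power of two that does not exceed size)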
def __dealloc__(self):
if self.tree_w != NULL:
free(self.tree_w)
if self.tree_wy != NULL:
free(self.tree_wy)
cdef void add(self, intp_t idx, float64_t y_value, float64_t weight) noexcept nogil:
"""
Add a weighted observation to the Fenwick tree.
Parameters
----------
idx : intp_t
The 0-based index where to add the observation
y_value : float64_t
The target value (y) of the observation
weight : float64_t
The sample weight
Notes
-----
Updates both weight sums and weighted target sums in O(log n) time.
"""
cdef float64_t weighted_y = weight * y_value
cdef intp_t fenwick_idx = idx + 1 # Convert to 1-based indexing
# Update Fenwick tree nodes by traversing up the tree
while fenwick_idx <= self.size:
self.tree_w[fenwick_idx] += weight
self.tree_wy[fenwick_idx] += weighted_y
# Move to next node using bit manipulation: add lowest set bit
fenwick_idx += fenwick_idx & -fenwick_idx
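            # (e.g. fenwick_idx = 6 = 0b110: 6 & -6 = 2, so the next node
            # updated is index 8 = 0b1000)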
# Update global totals
self.total_w += weight
self.total_wy += weighted_y
cdef intp_t search(
self,
float64_t target_weight,
float64_t* cumul_weight_out,
float64_t* cumul_weighted_y_out,
intp_t* prev_idx_out,
) noexcept nogil:
"""
Binary search to find the position where cumulative weight reaches target.
This method performs a binary search on the Fenwick tree to find indices
such that the cumulative weight at 'prev_idx' is < target_weight and
the cumulative weight at the returned index is >= target_weight.
Parameters
----------
target_weight : float64_t
The target cumulative weight to search for
cumul_weight_out : float64_t*
Output pointer for cumulative weight up to returned index (exclusive)
cumul_weighted_y_out : float64_t*
Output pointer for cumulative weighted y-sum up to returned index (exclusive)
prev_idx_out : intp_t*
Output pointer for the previous index (largest index with cumul_weight < target)
Returns
-------
intp_t
The index where cumulative weight first reaches or exceeds target_weight
Notes
-----
- O(log n) complexity
- Ignores nodes with zero weights (corresponding to uninserted y-values)
- Assumes at least one active (positive-weight) item exists
- Assumes 0 <= target_weight <= total_weight
"""
cdef:
intp_t current_idx = 0
intp_t next_idx, prev_idx, equal_bit
float64_t cumul_weight = 0.0
float64_t cumul_weighted_y = 0.0
intp_t search_bit = self.max_pow2 # Start from highest power of 2
float64_t node_weight, equal_target
# Phase 1: Standard Fenwick binary search with prefix accumulation
# Traverse down the tree, moving right when we can consume more weight
while search_bit != 0:
next_idx = current_idx + search_bit
if next_idx <= self.size:
node_weight = self.tree_w[next_idx]
if target_weight == node_weight:
# Exact match found - store state for later processing
equal_target = target_weight
equal_bit = search_bit
break
elif target_weight > node_weight:
# We can consume this node's weight - move right and accumulate
target_weight -= node_weight
current_idx = next_idx
cumul_weight += node_weight
cumul_weighted_y += self.tree_wy[next_idx]
search_bit >>= 1
# If no exact match, we're done with standard search
if search_bit == 0:
cumul_weight_out[0] = cumul_weight
cumul_weighted_y_out[0] = cumul_weighted_y
prev_idx_out[0] = current_idx
return current_idx
# Phase 2: Handle exact match case - find prev_idx
# Search for the largest index with cumulative weight < original target
prev_idx = current_idx
while search_bit != 0:
next_idx = prev_idx + search_bit
if next_idx <= self.size:
node_weight = self.tree_w[next_idx]
if target_weight > node_weight:
target_weight -= node_weight
prev_idx = next_idx
search_bit >>= 1
# Phase 3: Complete the exact match search
# Restore state and search for the largest index with
        # cumulative weight <= original target (and in this case, we know we have ==)
search_bit = equal_bit
target_weight = equal_target
while search_bit != 0:
next_idx = current_idx + search_bit
if next_idx <= self.size:
node_weight = self.tree_w[next_idx]
if target_weight >= node_weight:
target_weight -= node_weight
current_idx = next_idx
cumul_weight += node_weight
cumul_weighted_y += self.tree_wy[next_idx]
search_bit >>= 1
# Output results
cumul_weight_out[0] = cumul_weight
cumul_weighted_y_out[0] = cumul_weighted_y
prev_idx_out[0] = prev_idx
return current_idx
cdef class PytestWeightedFenwickTree(WeightedFenwickTree):
"""Used for testing only"""
def py_reset(self, intp_t n):
self.reset(n)
def py_add(self, intp_t idx, float64_t y, float64_t w):
self.add(idx, y, w)
def py_search(self, float64_t t):
cdef float64_t w, wy
cdef intp_t prev_idx
idx = self.search(t, &w, &wy, &prev_idx)
return prev_idx, idx, w, wy
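# A minimal usage sketch of the testing wrapper above (values chosen purely
# for illustration):
#
#   tree = PytestWeightedFenwickTree(4)
#   tree.py_reset(4)
#   tree.py_add(0, 2.0, 1.0)   # rank 0: y = 2.0, weight = 1.0
#   tree.py_add(2, -1.0, 3.0)  # rank 2: y = -1.0, weight = 3.0
#   prev_idx, idx, cw, cwy = tree.py_search(1.5)
#   # idx == 2: the cumulative weight first reaches 1.5 at rank 2, while
#   # cw == 1.0 and cwy == 2.0 aggregate the ranks strictly before idx.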

View File

@@ -0,0 +1,28 @@
tree_extension_metadata = {
'_tree':
{'sources': [cython_gen_cpp.process('_tree.pyx')],
'override_options': ['optimization=3']},
'_splitter':
{'sources': [cython_gen.process('_splitter.pyx')],
'override_options': ['optimization=3']},
'_partitioner':
{'sources': [cython_gen.process('_partitioner.pyx')],
'override_options': ['optimization=3']},
'_criterion':
{'sources': [cython_gen.process('_criterion.pyx')],
'override_options': ['optimization=3']},
'_utils':
{'sources': [cython_gen.process('_utils.pyx')],
'override_options': ['optimization=3']},
}
foreach ext_name, ext_dict : tree_extension_metadata
py.extension_module(
ext_name,
[ext_dict.get('sources'), utils_cython_tree],
dependencies: [np_dep],
override_options : ext_dict.get('override_options', []),
subdir: 'sklearn/tree',
install: true
)
endforeach
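# These extension modules are compiled as part of scikit-learn's main Meson
# build; with a meson-python development setup this is typically triggered by
# e.g. `pip install --editable . --no-build-isolation`.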

View File

@@ -0,0 +1,635 @@
"""
Testing for export functions of decision trees (sklearn.tree.export).
"""
from io import StringIO
from re import finditer, search
from textwrap import dedent
import numpy as np
import pytest
from numpy.random import RandomState
from sklearn.base import is_classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.tree import (
DecisionTreeClassifier,
DecisionTreeRegressor,
export_graphviz,
export_text,
plot_tree,
)
# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
y2 = [[-1, 1], [-1, 1], [-1, 1], [1, 2], [1, 2], [1, 3]]
w = [1, 1, 1, 0.5, 0.5, 0.5]
y_degraded = [1, 1, 1, 1, 1, 1]
def test_graphviz_toy():
# Check correctness of export_graphviz
clf = DecisionTreeClassifier(
max_depth=3, min_samples_split=2, criterion="gini", random_state=2
)
clf.fit(X, y)
# Test export code
contents1 = export_graphviz(clf, out_file=None)
contents2 = (
"digraph Tree {\n"
'node [shape=box, fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
'value = [3, 3]"] ;\n'
'1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=45, "
'headlabel="True"] ;\n'
'2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
'headlabel="False"] ;\n'
"}"
)
assert contents1 == contents2
# Test with feature_names
contents1 = export_graphviz(
clf, feature_names=["feature0", "feature1"], out_file=None
)
contents2 = (
"digraph Tree {\n"
'node [shape=box, fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
'value = [3, 3]"] ;\n'
'1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=45, "
'headlabel="True"] ;\n'
'2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
'headlabel="False"] ;\n'
"}"
)
assert contents1 == contents2
# Test with feature_names (escaped)
contents1 = export_graphviz(
clf, feature_names=['feature"0"', 'feature"1"'], out_file=None
)
contents2 = (
"digraph Tree {\n"
'node [shape=box, fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="feature\\"0\\" <= 0.0\\n'
"gini = 0.5\\nsamples = 6\\n"
'value = [3, 3]"] ;\n'
'1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=45, "
'headlabel="True"] ;\n'
'2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
'headlabel="False"] ;\n'
"}"
)
assert contents1 == contents2
# Test with class_names
contents1 = export_graphviz(clf, class_names=["yes", "no"], out_file=None)
contents2 = (
"digraph Tree {\n"
'node [shape=box, fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
'value = [3, 3]\\nclass = yes"] ;\n'
'1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n'
'class = yes"] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=45, "
'headlabel="True"] ;\n'
'2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n'
'class = no"] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
'headlabel="False"] ;\n'
"}"
)
assert contents1 == contents2
# Test with class_names (escaped)
contents1 = export_graphviz(clf, class_names=['"yes"', '"no"'], out_file=None)
contents2 = (
"digraph Tree {\n"
'node [shape=box, fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
'value = [3, 3]\\nclass = \\"yes\\""] ;\n'
'1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n'
'class = \\"yes\\""] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=45, "
'headlabel="True"] ;\n'
'2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n'
'class = \\"no\\""] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
'headlabel="False"] ;\n'
"}"
)
assert contents1 == contents2
# Test plot_options
contents1 = export_graphviz(
clf,
filled=True,
impurity=False,
proportion=True,
special_characters=True,
rounded=True,
out_file=None,
fontname="sans",
)
contents2 = (
"digraph Tree {\n"
'node [shape=box, style="filled, rounded", color="black", '
'fontname="sans"] ;\n'
'edge [fontname="sans"] ;\n'
"0 [label=<x<SUB>0</SUB> &le; 0.0<br/>samples = 100.0%<br/>"
'value = [0.5, 0.5]>, fillcolor="#ffffff"] ;\n'
"1 [label=<samples = 50.0%<br/>value = [1.0, 0.0]>, "
'fillcolor="#e58139"] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=45, "
'headlabel="True"] ;\n'
"2 [label=<samples = 50.0%<br/>value = [0.0, 1.0]>, "
'fillcolor="#399de5"] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
'headlabel="False"] ;\n'
"}"
)
assert contents1 == contents2
# Test max_depth
contents1 = export_graphviz(clf, max_depth=0, class_names=True, out_file=None)
contents2 = (
"digraph Tree {\n"
'node [shape=box, fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
'value = [3, 3]\\nclass = y[0]"] ;\n'
'1 [label="(...)"] ;\n'
"0 -> 1 ;\n"
'2 [label="(...)"] ;\n'
"0 -> 2 ;\n"
"}"
)
assert contents1 == contents2
# Test max_depth with plot_options
contents1 = export_graphviz(
clf, max_depth=0, filled=True, out_file=None, node_ids=True
)
contents2 = (
"digraph Tree {\n"
'node [shape=box, style="filled", color="black", '
'fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="node #0\\nx[0] <= 0.0\\ngini = 0.5\\n'
'samples = 6\\nvalue = [3, 3]", fillcolor="#ffffff"] ;\n'
'1 [label="(...)", fillcolor="#C0C0C0"] ;\n'
"0 -> 1 ;\n"
'2 [label="(...)", fillcolor="#C0C0C0"] ;\n'
"0 -> 2 ;\n"
"}"
)
assert contents1 == contents2
# Test multi-output with weighted samples
clf = DecisionTreeClassifier(
max_depth=2, min_samples_split=2, criterion="gini", random_state=2
)
clf = clf.fit(X, y2, sample_weight=w)
contents1 = export_graphviz(clf, filled=True, impurity=False, out_file=None)
contents2 = (
"digraph Tree {\n"
'node [shape=box, style="filled", color="black", '
'fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="x[0] <= 0.0\\nsamples = 6\\n'
"value = [[3.0, 1.5, 0.0]\\n"
'[3.0, 1.0, 0.5]]", fillcolor="#ffffff"] ;\n'
'1 [label="samples = 3\\nvalue = [[3, 0, 0]\\n'
'[3, 0, 0]]", fillcolor="#e58139"] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=45, "
'headlabel="True"] ;\n'
'2 [label="x[0] <= 1.5\\nsamples = 3\\n'
"value = [[0.0, 1.5, 0.0]\\n"
'[0.0, 1.0, 0.5]]", fillcolor="#f1bd97"] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
'headlabel="False"] ;\n'
'3 [label="samples = 2\\nvalue = [[0, 1, 0]\\n'
'[0, 1, 0]]", fillcolor="#e58139"] ;\n'
"2 -> 3 ;\n"
'4 [label="samples = 1\\nvalue = [[0.0, 0.5, 0.0]\\n'
'[0.0, 0.0, 0.5]]", fillcolor="#e58139"] ;\n'
"2 -> 4 ;\n"
"}"
)
assert contents1 == contents2
# Test regression output with plot_options
clf = DecisionTreeRegressor(
max_depth=3, min_samples_split=2, criterion="squared_error", random_state=2
)
clf.fit(X, y)
contents1 = export_graphviz(
clf,
filled=True,
leaves_parallel=True,
out_file=None,
rotate=True,
rounded=True,
fontname="sans",
)
contents2 = (
"digraph Tree {\n"
'node [shape=box, style="filled, rounded", color="black", '
'fontname="sans"] ;\n'
"graph [ranksep=equally, splines=polyline] ;\n"
'edge [fontname="sans"] ;\n'
"rankdir=LR ;\n"
'0 [label="x[0] <= 0.0\\nsquared_error = 1.0\\nsamples = 6\\n'
'value = 0.0", fillcolor="#f2c09c"] ;\n'
'1 [label="squared_error = 0.0\\nsamples = 3\\'
'nvalue = -1.0", '
'fillcolor="#ffffff"] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=-45, "
'headlabel="True"] ;\n'
'2 [label="squared_error = 0.0\\nsamples = 3\\nvalue = 1.0", '
'fillcolor="#e58139"] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=45, "
'headlabel="False"] ;\n'
"{rank=same ; 0} ;\n"
"{rank=same ; 1; 2} ;\n"
"}"
)
assert contents1 == contents2
# Test classifier with degraded learning set
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X, y_degraded)
contents1 = export_graphviz(clf, filled=True, out_file=None)
contents2 = (
"digraph Tree {\n"
'node [shape=box, style="filled", color="black", '
'fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="gini = 0.0\\nsamples = 6\\nvalue = 6.0", '
'fillcolor="#ffffff"] ;\n'
"}"
)
@pytest.mark.parametrize("constructor", [list, np.array])
def test_graphviz_feature_class_names_array_support(constructor):
# Check that export_graphviz treats feature names
# and class names correctly and supports arrays
clf = DecisionTreeClassifier(
max_depth=3, min_samples_split=2, criterion="gini", random_state=2
)
clf.fit(X, y)
# Test with feature_names
contents1 = export_graphviz(
clf, feature_names=constructor(["feature0", "feature1"]), out_file=None
)
contents2 = (
"digraph Tree {\n"
'node [shape=box, fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
'value = [3, 3]"] ;\n'
'1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=45, "
'headlabel="True"] ;\n'
'2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
'headlabel="False"] ;\n'
"}"
)
assert contents1 == contents2
# Test with class_names
contents1 = export_graphviz(
clf, class_names=constructor(["yes", "no"]), out_file=None
)
contents2 = (
"digraph Tree {\n"
'node [shape=box, fontname="helvetica"] ;\n'
'edge [fontname="helvetica"] ;\n'
'0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
'value = [3, 3]\\nclass = yes"] ;\n'
'1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n'
'class = yes"] ;\n'
"0 -> 1 [labeldistance=2.5, labelangle=45, "
'headlabel="True"] ;\n'
'2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n'
'class = no"] ;\n'
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
'headlabel="False"] ;\n'
"}"
)
assert contents1 == contents2
def test_graphviz_errors():
# Check for errors of export_graphviz
clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2)
# Check not-fitted decision tree error
out = StringIO()
with pytest.raises(NotFittedError):
export_graphviz(clf, out)
clf.fit(X, y)
# Check if it errors when length of feature_names
# mismatches with number of features
message = "Length of feature_names, 1 does not match number of features, 2"
with pytest.raises(ValueError, match=message):
export_graphviz(clf, None, feature_names=["a"])
message = "Length of feature_names, 3 does not match number of features, 2"
with pytest.raises(ValueError, match=message):
export_graphviz(clf, None, feature_names=["a", "b", "c"])
# Check error when feature_names contains non-string elements
message = "All feature names must be strings."
with pytest.raises(ValueError, match=message):
export_graphviz(clf, None, feature_names=["a", 1])
# Check error when argument is not an estimator
message = "is not an estimator instance"
with pytest.raises(TypeError, match=message):
export_graphviz(clf.fit(X, y).tree_)
# Check class_names error
out = StringIO()
with pytest.raises(IndexError):
export_graphviz(clf, out, class_names=[])
def test_friedman_mse_in_graphviz():
clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
clf.fit(X, y)
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data)
clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
clf.fit(X, y)
for estimator in clf.estimators_:
export_graphviz(estimator[0], out_file=dot_data)
for finding in finditer(r"\[.*?samples.*?\]", dot_data.getvalue()):
assert "friedman_mse" in finding.group()
def test_precision():
rng_reg = RandomState(2)
rng_clf = RandomState(8)
for X, y, clf in zip(
(rng_reg.random_sample((5, 2)), rng_clf.random_sample((1000, 4))),
(rng_reg.random_sample((5,)), rng_clf.randint(2, size=(1000,))),
(
DecisionTreeRegressor(
criterion="friedman_mse", random_state=0, max_depth=1
),
DecisionTreeClassifier(max_depth=1, random_state=0),
),
):
clf.fit(X, y)
for precision in (4, 3):
dot_data = export_graphviz(
clf, out_file=None, precision=precision, proportion=True
)
            # With the current random state, the impurity and the threshold
            # are reported with exactly the number of decimals set by the
            # `precision` parameter of export_graphviz, so they are checked
            # with strict equality. The reported values may have fewer
            # decimals, so they are only checked with a less-or-equal
            # comparison.
# check value
for finding in finditer(r"value = \d+\.\d+", dot_data):
assert len(search(r"\.\d+", finding.group()).group()) <= precision + 1
# check impurity
if is_classifier(clf):
pattern = r"gini = \d+\.\d+"
else:
pattern = r"friedman_mse = \d+\.\d+"
# check impurity
for finding in finditer(pattern, dot_data):
assert len(search(r"\.\d+", finding.group()).group()) == precision + 1
# check threshold
for finding in finditer(r"<= \d+\.\d+", dot_data):
assert len(search(r"\.\d+", finding.group()).group()) == precision + 1
def test_export_text_errors():
clf = DecisionTreeClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
err_msg = "feature_names must contain 2 elements, got 1"
with pytest.raises(ValueError, match=err_msg):
export_text(clf, feature_names=["a"])
err_msg = (
"When `class_names` is an array, it should contain as"
" many items as `decision_tree.classes_`. Got 1 while"
" the tree was fitted with 2 classes."
)
with pytest.raises(ValueError, match=err_msg):
export_text(clf, class_names=["a"])
def test_export_text():
clf = DecisionTreeClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
expected_report = dedent(
"""
|--- feature_1 <= 0.00
| |--- class: -1
|--- feature_1 > 0.00
| |--- class: 1
"""
).lstrip()
assert export_text(clf) == expected_report
# testing that leaves at level 1 are not truncated
assert export_text(clf, max_depth=0) == expected_report
    # testing that nothing is truncated when max_depth exceeds the tree depth
assert export_text(clf, max_depth=10) == expected_report
expected_report = dedent(
"""
|--- feature_1 <= 0.00
| |--- weights: [3.00, 0.00] class: -1
|--- feature_1 > 0.00
| |--- weights: [0.00, 3.00] class: 1
"""
).lstrip()
assert export_text(clf, show_weights=True) == expected_report
expected_report = dedent(
"""
|- feature_1 <= 0.00
| |- class: -1
|- feature_1 > 0.00
| |- class: 1
"""
).lstrip()
assert export_text(clf, spacing=1) == expected_report
X_l = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, 1]]
y_l = [-1, -1, -1, 1, 1, 1, 2]
clf = DecisionTreeClassifier(max_depth=4, random_state=0)
clf.fit(X_l, y_l)
expected_report = dedent(
"""
|--- feature_1 <= 0.00
| |--- class: -1
|--- feature_1 > 0.00
| |--- truncated branch of depth 2
"""
).lstrip()
assert export_text(clf, max_depth=0) == expected_report
X_mo = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_mo = [[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1], [1, 1]]
reg = DecisionTreeRegressor(max_depth=2, random_state=0)
reg.fit(X_mo, y_mo)
expected_report = dedent(
"""
|--- feature_1 <= 0.0
| |--- value: [-1.0, -1.0]
|--- feature_1 > 0.0
| |--- value: [1.0, 1.0]
"""
).lstrip()
assert export_text(reg, decimals=1) == expected_report
assert export_text(reg, decimals=1, show_weights=True) == expected_report
X_single = [[-2], [-1], [-1], [1], [1], [2]]
reg = DecisionTreeRegressor(max_depth=2, random_state=0)
reg.fit(X_single, y_mo)
expected_report = dedent(
"""
|--- first <= 0.0
| |--- value: [-1.0, -1.0]
|--- first > 0.0
| |--- value: [1.0, 1.0]
"""
).lstrip()
assert export_text(reg, decimals=1, feature_names=["first"]) == expected_report
assert (
export_text(reg, decimals=1, show_weights=True, feature_names=["first"])
== expected_report
)
@pytest.mark.parametrize("constructor", [list, np.array])
def test_export_text_feature_class_names_array_support(constructor):
    # Check that export_text treats feature names
    # and class names correctly and supports arrays
clf = DecisionTreeClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
expected_report = dedent(
"""
|--- b <= 0.00
| |--- class: -1
|--- b > 0.00
| |--- class: 1
"""
).lstrip()
assert export_text(clf, feature_names=constructor(["a", "b"])) == expected_report
expected_report = dedent(
"""
|--- feature_1 <= 0.00
| |--- class: cat
|--- feature_1 > 0.00
| |--- class: dog
"""
).lstrip()
assert export_text(clf, class_names=constructor(["cat", "dog"])) == expected_report
def test_plot_tree_entropy(pyplot):
# mostly smoke tests
    # Check correctness of plot_tree for criterion = entropy
clf = DecisionTreeClassifier(
max_depth=3, min_samples_split=2, criterion="entropy", random_state=2
)
clf.fit(X, y)
# Test export code
feature_names = ["first feat", "sepal_width"]
nodes = plot_tree(clf, feature_names=feature_names)
assert len(nodes) == 5
assert (
nodes[0].get_text()
== "first feat <= 0.0\nentropy = 1.0\nsamples = 6\nvalue = [3, 3]"
)
assert nodes[1].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [3, 0]"
assert nodes[2].get_text() == "True "
assert nodes[3].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [0, 3]"
assert nodes[4].get_text() == " False"
@pytest.mark.parametrize("fontsize", [None, 10, 20])
def test_plot_tree_gini(pyplot, fontsize):
# mostly smoke tests
    # Check correctness of plot_tree for criterion = gini
clf = DecisionTreeClassifier(
max_depth=3,
min_samples_split=2,
criterion="gini",
random_state=2,
)
clf.fit(X, y)
# Test export code
feature_names = ["first feat", "sepal_width"]
nodes = plot_tree(clf, feature_names=feature_names, fontsize=fontsize)
assert len(nodes) == 5
if fontsize is not None:
assert all(node.get_fontsize() == fontsize for node in nodes)
assert (
nodes[0].get_text()
== "first feat <= 0.0\ngini = 0.5\nsamples = 6\nvalue = [3, 3]"
)
assert nodes[1].get_text() == "gini = 0.0\nsamples = 3\nvalue = [3, 0]"
assert nodes[2].get_text() == "True "
assert nodes[3].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]"
assert nodes[4].get_text() == " False"
def test_not_fitted_tree(pyplot):
# Testing if not fitted tree throws the correct error
clf = DecisionTreeRegressor()
with pytest.raises(NotFittedError):
plot_tree(clf)

View File

@@ -0,0 +1,51 @@
import numpy as np
from sklearn.tree._utils import PytestWeightedFenwickTree
def test_cython_weighted_fenwick_tree(global_random_seed):
"""
Test Cython's weighted Fenwick tree implementation
"""
rng = np.random.default_rng(global_random_seed)
n = 100
indices = rng.permutation(n)
y = rng.normal(size=n)
w = rng.integers(0, 4, size=n)
y_included_so_far = np.zeros_like(y)
w_included_so_far = np.zeros_like(w)
tree = PytestWeightedFenwickTree(n)
tree.py_reset(n)
for i in range(n):
idx = indices[i]
tree.py_add(idx, y[idx], w[idx])
y_included_so_far[idx] = y[idx]
w_included_so_far[idx] = w[idx]
target = rng.uniform(0, w_included_so_far.sum())
t_idx_low, t_idx, cw, cwy = tree.py_search(target)
# check the aggregates are consistent with the returned idx
assert np.isclose(cw, np.sum(w_included_so_far[:t_idx]))
assert np.isclose(
cwy, np.sum(w_included_so_far[:t_idx] * y_included_so_far[:t_idx])
)
# check if the cumulative weight is less than or equal to the target
# depending on t_idx_low and t_idx
if t_idx_low == t_idx:
assert cw < target
else:
assert cw == target
# check that if we add the next non-null weight, we are above the target:
next_weights = w_included_so_far[t_idx:][w_included_so_far[t_idx:] > 0]
if next_weights.size > 0:
assert cw + next_weights[0] > target
# and not below the target for `t_idx_low`:
next_weights = w_included_so_far[t_idx_low:][w_included_so_far[t_idx_low:] > 0]
if next_weights.size > 0:
assert cw + next_weights[0] >= target

View File

@@ -0,0 +1,512 @@
import numpy as np
import pytest
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import (
ExtraTreesClassifier,
ExtraTreesRegressor,
RandomForestClassifier,
RandomForestRegressor,
)
from sklearn.tree import (
DecisionTreeClassifier,
DecisionTreeRegressor,
ExtraTreeClassifier,
ExtraTreeRegressor,
)
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSC_CONTAINERS
TREE_CLASSIFIER_CLASSES = [DecisionTreeClassifier, ExtraTreeClassifier]
TREE_REGRESSOR_CLASSES = [DecisionTreeRegressor, ExtraTreeRegressor]
TREE_BASED_CLASSIFIER_CLASSES = TREE_CLASSIFIER_CLASSES + [
RandomForestClassifier,
ExtraTreesClassifier,
]
TREE_BASED_REGRESSOR_CLASSES = TREE_REGRESSOR_CLASSES + [
RandomForestRegressor,
ExtraTreesRegressor,
]
@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
@pytest.mark.parametrize("depth_first_builder", (True, False))
@pytest.mark.parametrize("sparse_splitter", (True, False))
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_monotonic_constraints_classifications(
TreeClassifier,
depth_first_builder,
sparse_splitter,
global_random_seed,
csc_container,
):
n_samples = 1000
n_samples_train = 900
X, y = make_classification(
n_samples=n_samples,
n_classes=2,
n_features=5,
n_informative=5,
n_redundant=0,
random_state=global_random_seed,
)
X_train, y_train = X[:n_samples_train], y[:n_samples_train]
X_test, _ = X[n_samples_train:], y[n_samples_train:]
X_test_0incr, X_test_0decr = np.copy(X_test), np.copy(X_test)
X_test_1incr, X_test_1decr = np.copy(X_test), np.copy(X_test)
X_test_0incr[:, 0] += 10
X_test_0decr[:, 0] -= 10
X_test_1incr[:, 1] += 10
X_test_1decr[:, 1] -= 10
monotonic_cst = np.zeros(X.shape[1])
monotonic_cst[0] = 1
monotonic_cst[1] = -1
if depth_first_builder:
est = TreeClassifier(max_depth=None, monotonic_cst=monotonic_cst)
else:
est = TreeClassifier(
max_depth=None,
monotonic_cst=monotonic_cst,
max_leaf_nodes=n_samples_train,
)
if hasattr(est, "random_state"):
est.set_params(**{"random_state": global_random_seed})
if hasattr(est, "n_estimators"):
est.set_params(**{"n_estimators": 5})
if sparse_splitter:
X_train = csc_container(X_train)
est.fit(X_train, y_train)
proba_test = est.predict_proba(X_test)
assert np.logical_and(proba_test >= 0.0, proba_test <= 1.0).all(), (
"Probability should always be in [0, 1] range."
)
assert_allclose(proba_test.sum(axis=1), 1.0)
# Monotonic increase constraint, it applies to the positive class
assert np.all(est.predict_proba(X_test_0incr)[:, 1] >= proba_test[:, 1])
assert np.all(est.predict_proba(X_test_0decr)[:, 1] <= proba_test[:, 1])
# Monotonic decrease constraint, it applies to the positive class
assert np.all(est.predict_proba(X_test_1incr)[:, 1] <= proba_test[:, 1])
assert np.all(est.predict_proba(X_test_1decr)[:, 1] >= proba_test[:, 1])
@pytest.mark.parametrize("TreeRegressor", TREE_BASED_REGRESSOR_CLASSES)
@pytest.mark.parametrize("depth_first_builder", (True, False))
@pytest.mark.parametrize("sparse_splitter", (True, False))
@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error"))
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_monotonic_constraints_regressions(
TreeRegressor,
depth_first_builder,
sparse_splitter,
criterion,
global_random_seed,
csc_container,
):
n_samples = 1000
n_samples_train = 900
# Build a regression task using 5 informative features
X, y = make_regression(
n_samples=n_samples,
n_features=5,
n_informative=5,
random_state=global_random_seed,
)
train = np.arange(n_samples_train)
test = np.arange(n_samples_train, n_samples)
X_train = X[train]
y_train = y[train]
X_test = np.copy(X[test])
X_test_incr = np.copy(X_test)
X_test_decr = np.copy(X_test)
X_test_incr[:, 0] += 10
X_test_decr[:, 1] += 10
monotonic_cst = np.zeros(X.shape[1])
monotonic_cst[0] = 1
monotonic_cst[1] = -1
if depth_first_builder:
est = TreeRegressor(
max_depth=None,
monotonic_cst=monotonic_cst,
criterion=criterion,
)
else:
est = TreeRegressor(
max_depth=8,
monotonic_cst=monotonic_cst,
criterion=criterion,
max_leaf_nodes=n_samples_train,
)
if hasattr(est, "random_state"):
est.set_params(random_state=global_random_seed)
if hasattr(est, "n_estimators"):
est.set_params(**{"n_estimators": 5})
if sparse_splitter:
X_train = csc_container(X_train)
est.fit(X_train, y_train)
y = est.predict(X_test)
# Monotonic increase constraint
y_incr = est.predict(X_test_incr)
# y_incr should always be greater than y
assert np.all(y_incr >= y)
# Monotonic decrease constraint
y_decr = est.predict(X_test_decr)
# y_decr should always be lower than y
assert np.all(y_decr <= y)
@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
def test_multiclass_raises(TreeClassifier):
X, y = make_classification(
n_samples=100, n_features=5, n_classes=3, n_informative=3, random_state=0
)
y[0] = 0
monotonic_cst = np.zeros(X.shape[1])
monotonic_cst[0] = -1
monotonic_cst[1] = 1
est = TreeClassifier(max_depth=None, monotonic_cst=monotonic_cst, random_state=0)
msg = "Monotonicity constraints are not supported with multiclass classification"
with pytest.raises(ValueError, match=msg):
est.fit(X, y)
@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
def test_multiple_output_raises(TreeClassifier):
X = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
y = [[1, 0, 1, 0, 1], [1, 0, 1, 0, 1]]
est = TreeClassifier(
max_depth=None, monotonic_cst=np.array([-1, 1]), random_state=0
)
msg = "Monotonicity constraints are not supported with multiple output"
with pytest.raises(ValueError, match=msg):
est.fit(X, y)
@pytest.mark.parametrize(
"Tree",
[
DecisionTreeClassifier,
DecisionTreeRegressor,
ExtraTreeClassifier,
ExtraTreeRegressor,
],
)
def test_missing_values_raises(Tree):
X, y = make_classification(
n_samples=100, n_features=5, n_classes=2, n_informative=3, random_state=0
)
X[0, 0] = np.nan
monotonic_cst = np.zeros(X.shape[1])
monotonic_cst[0] = 1
est = Tree(max_depth=None, monotonic_cst=monotonic_cst, random_state=0)
msg = "Input X contains NaN"
with pytest.raises(ValueError, match=msg):
est.fit(X, y)
@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
def test_bad_monotonic_cst_raises(TreeClassifier):
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = [1, 0, 1, 0, 1]
msg = "monotonic_cst has shape 3 but the input data X has 2 features."
est = TreeClassifier(
max_depth=None, monotonic_cst=np.array([-1, 1, 0]), random_state=0
)
with pytest.raises(ValueError, match=msg):
est.fit(X, y)
msg = "monotonic_cst must be None or an array-like of -1, 0 or 1."
est = TreeClassifier(
max_depth=None, monotonic_cst=np.array([-2, 2]), random_state=0
)
with pytest.raises(ValueError, match=msg):
est.fit(X, y)
est = TreeClassifier(
max_depth=None, monotonic_cst=np.array([-1, 0.8]), random_state=0
)
with pytest.raises(ValueError, match=msg + "(.*)0.8]"):
est.fit(X, y)
def assert_1d_reg_tree_children_monotonic_bounded(tree_, monotonic_sign):
values = tree_.value
for i in range(tree_.node_count):
if tree_.children_left[i] > i and tree_.children_right[i] > i:
# Check monotonicity on children
i_left = tree_.children_left[i]
i_right = tree_.children_right[i]
if monotonic_sign == 1:
assert values[i_left] <= values[i_right]
elif monotonic_sign == -1:
assert values[i_left] >= values[i_right]
val_middle = (values[i_left] + values[i_right]) / 2
# Check bounds on grand-children, filtering out leaf nodes
if tree_.feature[i_left] >= 0:
i_left_right = tree_.children_right[i_left]
if monotonic_sign == 1:
assert values[i_left_right] <= val_middle
elif monotonic_sign == -1:
assert values[i_left_right] >= val_middle
if tree_.feature[i_right] >= 0:
i_right_left = tree_.children_left[i_right]
if monotonic_sign == 1:
assert val_middle <= values[i_right_left]
elif monotonic_sign == -1:
assert val_middle >= values[i_right_left]
def test_assert_1d_reg_tree_children_monotonic_bounded():
X = np.linspace(-1, 1, 7).reshape(-1, 1)
y = np.sin(2 * np.pi * X.ravel())
reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y)
with pytest.raises(AssertionError):
assert_1d_reg_tree_children_monotonic_bounded(reg.tree_, 1)
with pytest.raises(AssertionError):
assert_1d_reg_tree_children_monotonic_bounded(reg.tree_, -1)
def assert_1d_reg_monotonic(clf, monotonic_sign, min_x, max_x, n_steps):
X_grid = np.linspace(min_x, max_x, n_steps).reshape(-1, 1)
y_pred_grid = clf.predict(X_grid)
if monotonic_sign == 1:
assert (np.diff(y_pred_grid) >= 0.0).all()
elif monotonic_sign == -1:
assert (np.diff(y_pred_grid) <= 0.0).all()
@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES)
def test_1d_opposite_monotonicity_cst_data(TreeRegressor):
# Check that positive monotonic data with negative monotonic constraint
# yield constant predictions, equal to the average of target values
X = np.linspace(-2, 2, 10).reshape(-1, 1)
y = X.ravel()
clf = TreeRegressor(monotonic_cst=[-1])
clf.fit(X, y)
assert clf.tree_.node_count == 1
assert clf.tree_.value[0] == 0.0
# Swap monotonicity
clf = TreeRegressor(monotonic_cst=[1])
clf.fit(X, -y)
assert clf.tree_.node_count == 1
assert clf.tree_.value[0] == 0.0
@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES)
@pytest.mark.parametrize("monotonic_sign", (-1, 1))
@pytest.mark.parametrize("depth_first_builder", (True, False))
@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error"))
def test_1d_tree_nodes_values(
TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed
):
# Adaptation from test_nodes_values in test_monotonic_constraints.py
# in sklearn.ensemble._hist_gradient_boosting
# Build a single tree with only one feature, and make sure the node
# values respect the monotonicity constraints.
# Considering the following tree with a monotonic +1 constraint, we
# should have:
#
# root
# / \
# a b
# / \ / \
# c d e f
#
# a <= root <= b
# c <= d <= (a + b) / 2 <= e <= f
rng = np.random.RandomState(global_random_seed)
n_samples = 1000
n_features = 1
X = rng.rand(n_samples, n_features)
y = rng.rand(n_samples)
if depth_first_builder:
# No max_leaf_nodes, default depth first tree builder
clf = TreeRegressor(
monotonic_cst=[monotonic_sign],
criterion=criterion,
random_state=global_random_seed,
)
else:
# max_leaf_nodes triggers best first tree builder
clf = TreeRegressor(
monotonic_cst=[monotonic_sign],
max_leaf_nodes=n_samples,
criterion=criterion,
random_state=global_random_seed,
)
clf.fit(X, y)
assert_1d_reg_tree_children_monotonic_bounded(clf.tree_, monotonic_sign)
assert_1d_reg_monotonic(clf, monotonic_sign, np.min(X), np.max(X), 100)
def assert_nd_reg_tree_children_monotonic_bounded(tree_, monotonic_cst):
upper_bound = np.full(tree_.node_count, np.inf)
lower_bound = np.full(tree_.node_count, -np.inf)
for i in range(tree_.node_count):
feature = tree_.feature[i]
node_value = tree_.value[i][0][0] # unpack value from nx1x1 array
        # While building the tree, the computed middle value can differ
        # slightly from the average of the sibling values, because
        # sum_right / weighted_n_right
        # is not exactly equal to the value of the right sibling.
        # Clipping can therefore introduce a discrepancy on the order of
        # numerical noise, which we absorb by comparing at float32 precision.
assert np.float32(node_value) <= np.float32(upper_bound[i])
assert np.float32(node_value) >= np.float32(lower_bound[i])
if feature < 0:
# Leaf: nothing to do
continue
# Split node: check and update bounds for the children.
i_left = tree_.children_left[i]
i_right = tree_.children_right[i]
# unpack value from nx1x1 array
middle_value = (tree_.value[i_left][0][0] + tree_.value[i_right][0][0]) / 2
if monotonic_cst[feature] == 0:
# Feature without monotonicity constraint: propagate bounds
# down the tree to both children.
# Otherwise, with 2 features and a monotonic increase constraint
# (encoded by +1) on feature 0, the following tree can be accepted,
# although it does not respect the monotonic increase constraint:
#
# X[0] <= 0
# value = 100
# / \
# X[0] <= -1 X[1] <= 0
# value = 50 value = 150
# / \ / \
# leaf leaf leaf leaf
# value = 25 value = 75 value = 50 value = 250
lower_bound[i_left] = lower_bound[i]
upper_bound[i_left] = upper_bound[i]
lower_bound[i_right] = lower_bound[i]
upper_bound[i_right] = upper_bound[i]
elif monotonic_cst[feature] == 1:
# Feature with constraint: check monotonicity
assert tree_.value[i_left] <= tree_.value[i_right]
# Propagate bounds down the tree to both children.
lower_bound[i_left] = lower_bound[i]
upper_bound[i_left] = middle_value
lower_bound[i_right] = middle_value
upper_bound[i_right] = upper_bound[i]
elif monotonic_cst[feature] == -1:
# Feature with constraint: check monotonicity
assert tree_.value[i_left] >= tree_.value[i_right]
# Update and propagate bounds down the tree to both children.
lower_bound[i_left] = middle_value
upper_bound[i_left] = upper_bound[i]
lower_bound[i_right] = lower_bound[i]
upper_bound[i_right] = middle_value
else: # pragma: no cover
raise ValueError(f"monotonic_cst[{feature}]={monotonic_cst[feature]}")


def test_assert_nd_reg_tree_children_monotonic_bounded():
# Check that assert_nd_reg_tree_children_monotonic_bounded can detect
# non-monotonic tree predictions.
X = np.linspace(0, 2 * np.pi, 30).reshape(-1, 1)
y = np.sin(X).ravel()
reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y)
with pytest.raises(AssertionError):
assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [1])
with pytest.raises(AssertionError):
assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [-1])
assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [0])

    # Check that assert_nd_reg_tree_children_monotonic_bounded raises
    # when the data (and therefore the fitted model) is naturally monotonic
    # in the opposite direction.
X = np.linspace(-5, 5, 5).reshape(-1, 1)
y = X.ravel() ** 3
reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y)
with pytest.raises(AssertionError):
assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [-1])

    # For completeness, check that the converse holds when swapping the sign.
reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, -y)
with pytest.raises(AssertionError):
assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [1])
@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES)
@pytest.mark.parametrize("monotonic_sign", (-1, 1))
@pytest.mark.parametrize("depth_first_builder", (True, False))
@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error"))
def test_nd_tree_nodes_values(
TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed
):
    # Build a tree with several features and make sure the node
    # values respect the monotonicity constraints.
# Considering the following tree with a monotonic increase constraint on X[0],
# we should have:
#
# root
# X[0]<=t
# / \
# a b
# X[0]<=u X[1]<=v
# / \ / \
# c d e f
#
    # i)   a <= root <= b
    # ii)  c <= a <= d <= (a + b) / 2
    # iii) (a + b) / 2 <= min(e, f)
# For iii) we check that each node value is within the proper lower and
# upper bounds.
rng = np.random.RandomState(global_random_seed)
n_samples = 1000
n_features = 2
monotonic_cst = [monotonic_sign, 0]
X = rng.rand(n_samples, n_features)
y = rng.rand(n_samples)
if depth_first_builder:
        # No max_leaf_nodes: use the default depth-first tree builder
clf = TreeRegressor(
monotonic_cst=monotonic_cst,
criterion=criterion,
random_state=global_random_seed,
)
else:
        # max_leaf_nodes triggers the best-first tree builder
clf = TreeRegressor(
monotonic_cst=monotonic_cst,
max_leaf_nodes=n_samples,
criterion=criterion,
random_state=global_random_seed,
)
clf.fit(X, y)
assert_nd_reg_tree_children_monotonic_bounded(clf.tree_, monotonic_cst)
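

# Follow-up sketch (an illustration of the intended semantics, not part of
# the original suite): with a +1 constraint on X[0] only, predictions should
# be non-decreasing along X[0] for any fixed value of X[1].
def test_nd_monotonic_in_constrained_feature_only():
    rng = np.random.RandomState(0)
    X = rng.rand(300, 2)
    y = rng.rand(300)
    reg = DecisionTreeRegressor(monotonic_cst=[1, 0], random_state=0).fit(X, y)
    x0_grid = np.linspace(0.0, 1.0, 50)
    for x1 in (0.25, 0.75):
        X_grid = np.column_stack([x0_grid, np.full_like(x0_grid, x1)])
        assert (np.diff(reg.predict(X_grid)) >= 0.0).all()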

View File

@@ -0,0 +1,49 @@
import numpy as np
import pytest

from sklearn.tree._reingold_tilford import Tree, buchheim

simple_tree = Tree("", 0, Tree("", 1), Tree("", 2))
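
# Shape of bigger_tree below (the integer passed to Tree is the node id):
#
#           0
#         /   \
#        1     2
#       / \   / \
#      3   4 5   6
#         / \
#        7   8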
bigger_tree = Tree(
"",
0,
Tree(
"",
1,
Tree("", 3),
Tree("", 4, Tree("", 7), Tree("", 8)),
),
Tree("", 2, Tree("", 5), Tree("", 6)),
)
@pytest.mark.parametrize("tree, n_nodes", [(simple_tree, 3), (bigger_tree, 9)])
def test_buchheim(tree, n_nodes):
def walk_tree(draw_tree):
res = [(draw_tree.x, draw_tree.y)]
for child in draw_tree.children:
            # children are exactly one level below their parent:
assert child.y == draw_tree.y + 1
res.extend(walk_tree(child))
        if draw_tree.children:
            # these trees are always binary
            # parents are centered above their children
assert (
draw_tree.x == (draw_tree.children[0].x + draw_tree.children[1].x) / 2
)
return res

    layout = buchheim(tree)
coordinates = walk_tree(layout)
assert len(coordinates) == n_nodes
    # Test that x values are unique per depth / level.
    # (This could also be done more efficiently with a defaultdict.)
depth = 0
while True:
x_at_this_depth = [node[0] for node in coordinates if node[1] == depth]
if not x_at_this_depth:
            # we have gone past the deepest leaves
break
assert len(np.unique(x_at_this_depth)) == len(x_at_this_depth)
depth += 1
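

# Minimal usage sketch (assuming, as walk_tree above does, that buchheim
# returns a draw tree exposing x, y and children, with the root at depth 0):
# the root is centered over its two children.
def test_buchheim_root_position_smoke():
    layout = buchheim(simple_tree)
    assert layout.y == 0
    assert layout.x == (layout.children[0].x + layout.children[1].x) / 2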

File diff suppressed because it is too large Load Diff