@@ -0,0 +1,24 @@
"""Decision tree based models for classification and regression."""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from sklearn.tree._classes import (
    BaseDecisionTree,
    DecisionTreeClassifier,
    DecisionTreeRegressor,
    ExtraTreeClassifier,
    ExtraTreeRegressor,
)
from sklearn.tree._export import export_graphviz, export_text, plot_tree

__all__ = [
    "BaseDecisionTree",
    "DecisionTreeClassifier",
    "DecisionTreeRegressor",
    "ExtraTreeClassifier",
    "ExtraTreeRegressor",
    "export_graphviz",
    "export_text",
    "plot_tree",
]
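
# Illustrative usage sketch (not part of this file): the names re-exported
# above form the public API of the module. Assuming scikit-learn is installed,
# a tree can be fit and inspected as follows.
from sklearn.tree import DecisionTreeClassifier, export_text

X = [[0, 0], [0, 1], [1, 0], [1, 1]]   # tiny toy dataset; label equals f0
y = [0, 0, 1, 1]
clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
print(clf.predict([[1, 1]]))                          # -> [1]
print(export_text(clf, feature_names=["f0", "f1"]))   # textual tree dump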
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
@@ -0,0 +1,109 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# See _criterion.pyx for implementation details.
from sklearn.utils._typedefs cimport float64_t, int8_t, intp_t


cdef class Criterion:
    # The criterion computes the impurity of a node and the reduction of
    # impurity of a split on that node. It also computes the output statistics
    # such as the mean in regression and class probabilities in classification.

    # Internal structures
    cdef const float64_t[:, ::1] y          # Values of y
    cdef const float64_t[:] sample_weight   # Sample weights

    cdef const intp_t[:] sample_indices     # Sample indices in X, y
    cdef intp_t start                       # samples[start:pos] are the samples in the left node
    cdef intp_t pos                         # samples[pos:end] are the samples in the right node
    cdef intp_t end
    cdef intp_t n_missing                   # Number of missing values for the feature being evaluated
    cdef bint missing_go_to_left            # Whether missing values go to the left node

    cdef intp_t n_outputs                   # Number of outputs
    cdef intp_t n_samples                   # Number of samples
    cdef intp_t n_node_samples              # Number of samples in the node (end-start)
    cdef float64_t weighted_n_samples       # Weighted number of samples (in total)
    cdef float64_t weighted_n_node_samples  # Weighted number of samples in the node
    cdef float64_t weighted_n_left          # Weighted number of samples in the left node
    cdef float64_t weighted_n_right         # Weighted number of samples in the right node
    cdef float64_t weighted_n_missing       # Weighted number of samples that are missing

    # The criterion object is maintained such that left and right collected
    # statistics correspond to samples[start:pos] and samples[pos:end].

    # Methods
    cdef int init(
        self,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
        float64_t weighted_n_samples,
        const intp_t[:] sample_indices,
        intp_t start,
        intp_t end
    ) except -1 nogil
    cdef void init_sum_missing(self)
    cdef void init_missing(self, intp_t n_missing) noexcept nogil
    cdef int reset(self) except -1 nogil
    cdef int reverse_reset(self) except -1 nogil
    cdef int update(self, intp_t new_pos) except -1 nogil
    cdef float64_t node_impurity(self) noexcept nogil
    cdef void children_impurity(
        self,
        float64_t* impurity_left,
        float64_t* impurity_right
    ) noexcept nogil
    cdef void node_value(
        self,
        float64_t* dest
    ) noexcept nogil
    cdef void clip_node_value(
        self,
        float64_t* dest,
        float64_t lower_bound,
        float64_t upper_bound
    ) noexcept nogil
    cdef float64_t middle_value(self) noexcept nogil
    cdef float64_t impurity_improvement(
        self,
        float64_t impurity_parent,
        float64_t impurity_left,
        float64_t impurity_right
    ) noexcept nogil
    cdef float64_t proxy_impurity_improvement(self) noexcept nogil
    cdef bint check_monotonicity(
        self,
        int8_t monotonic_cst,
        float64_t lower_bound,
        float64_t upper_bound,
    ) noexcept nogil
    cdef inline bint _check_monotonicity(
        self,
        int8_t monotonic_cst,
        float64_t lower_bound,
        float64_t upper_bound,
        float64_t sum_left,
        float64_t sum_right,
    ) noexcept nogil

cdef class ClassificationCriterion(Criterion):
    """Abstract criterion for classification."""

    cdef intp_t[::1] n_classes
    cdef intp_t max_n_classes

    cdef float64_t[:, ::1] sum_total    # The sum of the weighted count of each label.
    cdef float64_t[:, ::1] sum_left     # Same as above, but for the left side of the split
    cdef float64_t[:, ::1] sum_right    # Same as above, but for the right side of the split
    cdef float64_t[:, ::1] sum_missing  # Same as above, but for missing values in X

cdef class RegressionCriterion(Criterion):
    """Abstract regression criterion."""

    cdef float64_t sq_sum_total

    cdef float64_t[::1] sum_total    # The sum of w*y.
    cdef float64_t[::1] sum_left     # Same as above, but for the left side of the split
    cdef float64_t[::1] sum_right    # Same as above, but for the right side of the split
    cdef float64_t[::1] sum_missing  # Same as above, but for missing values in X
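
# Illustrative sketch (not part of the file above): the concrete criteria
# compute `impurity_improvement` as the weighted decrease in impurity,
#
#     N_t / N * (impurity_parent
#                - N_t_R / N_t * impurity_right
#                - N_t_L / N_t * impurity_left)
#
# where N is weighted_n_samples, N_t is weighted_n_node_samples, and N_t_L,
# N_t_R are the weighted sample counts of the children. A plain-Python
# rendering of that formula, as a hedged sketch rather than the compiled
# implementation:
def impurity_improvement_sketch(n, n_t, n_t_l, n_t_r,
                                impurity_parent, impurity_left, impurity_right):
    # Weighted impurity decrease of a split; larger is better.
    return (n_t / n) * (impurity_parent
                        - (n_t_r / n_t) * impurity_right
                        - (n_t_l / n_t) * impurity_left)

# For example, splitting a 50/50 binary node (Gini 0.5) into two pure children
# yields an improvement of (n_t / n) * 0.5.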
File diff suppressed because it is too large
File diff suppressed because it is too large
Binary file not shown.
@@ -0,0 +1,183 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# See _partitioner.pyx for details.

from cython cimport floating

from sklearn.utils._typedefs cimport (
    float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t
)
from sklearn.tree._splitter cimport SplitRecord


# Mitigate precision differences between 32 bit and 64 bit
cdef const float32_t FEATURE_THRESHOLD = 1e-7


# We provide here the abstract interface for a Partitioner that would be
# theoretically shared between the Dense and Sparse partitioners. However,
# we leave it commented out for now as it is not used in the current
# implementation due to the performance hit from vtable lookups when using
# inheritance based polymorphism. It is left here for future reference.
#
# Note: Instead, in `_splitter.pyx`, we define a fused type that can be used
# to represent both the dense and sparse partitioners.
#
# cdef class BasePartitioner:
#     cdef intp_t[::1] samples
#     cdef float32_t[::1] feature_values
#     cdef intp_t start
#     cdef intp_t end
#     cdef intp_t n_missing
#     cdef const uint8_t[::1] missing_values_in_feature_mask

#     cdef void sort_samples_and_feature_values(
#         self, intp_t current_feature
#     ) noexcept nogil
#     cdef void init_node_split(
#         self,
#         intp_t start,
#         intp_t end
#     ) noexcept nogil
#     cdef void find_min_max(
#         self,
#         intp_t current_feature,
#         float32_t* min_feature_value_out,
#         float32_t* max_feature_value_out,
#     ) noexcept nogil
#     cdef void next_p(
#         self,
#         intp_t* p_prev,
#         intp_t* p
#     ) noexcept nogil
#     cdef intp_t partition_samples(
#         self,
#         float64_t current_threshold
#     ) noexcept nogil
#     cdef void partition_samples_final(
#         self,
#         intp_t best_pos,
#         float64_t best_threshold,
#         intp_t best_feature,
#         intp_t n_missing,
#     ) noexcept nogil


cdef class DensePartitioner:
    """Partitioner specialized for dense data.

    Note that this partitioner is agnostic to the splitting strategy (best vs. random).
    """
    cdef const float32_t[:, :] X
    cdef intp_t[::1] samples
    cdef float32_t[::1] feature_values
    cdef intp_t start
    cdef intp_t end
    cdef intp_t n_missing
    cdef const uint8_t[::1] missing_values_in_feature_mask

    cdef void sort_samples_and_feature_values(
        self, intp_t current_feature
    ) noexcept nogil
    cdef void init_node_split(
        self,
        intp_t start,
        intp_t end
    ) noexcept nogil
    cdef void find_min_max(
        self,
        intp_t current_feature,
        float32_t* min_feature_value_out,
        float32_t* max_feature_value_out,
    ) noexcept nogil
    cdef void next_p(
        self,
        intp_t* p_prev,
        intp_t* p
    ) noexcept nogil
    cdef intp_t partition_samples(
        self,
        float64_t current_threshold
    ) noexcept nogil
    cdef void partition_samples_final(
        self,
        intp_t best_pos,
        float64_t best_threshold,
        intp_t best_feature,
        intp_t n_missing,
    ) noexcept nogil


cdef class SparsePartitioner:
    """Partitioner specialized for sparse CSC data.

    Note that this partitioner is agnostic to the splitting strategy (best vs. random).
    """
    cdef const float32_t[::1] X_data
    cdef const int32_t[::1] X_indices
    cdef const int32_t[::1] X_indptr
    cdef intp_t n_total_samples
    cdef intp_t[::1] index_to_samples
    cdef intp_t[::1] sorted_samples
    cdef intp_t start_positive
    cdef intp_t end_negative
    cdef bint is_samples_sorted

    cdef intp_t[::1] samples
    cdef float32_t[::1] feature_values
    cdef intp_t start
    cdef intp_t end
    cdef intp_t n_missing
    cdef const uint8_t[::1] missing_values_in_feature_mask

    cdef void sort_samples_and_feature_values(
        self, intp_t current_feature
    ) noexcept nogil
    cdef void init_node_split(
        self,
        intp_t start,
        intp_t end
    ) noexcept nogil
    cdef void find_min_max(
        self,
        intp_t current_feature,
        float32_t* min_feature_value_out,
        float32_t* max_feature_value_out,
    ) noexcept nogil
    cdef void next_p(
        self,
        intp_t* p_prev,
        intp_t* p
    ) noexcept nogil
    cdef intp_t partition_samples(
        self,
        float64_t current_threshold
    ) noexcept nogil
    cdef void partition_samples_final(
        self,
        intp_t best_pos,
        float64_t best_threshold,
        intp_t best_feature,
        intp_t n_missing,
    ) noexcept nogil

    cdef void extract_nnz(
        self,
        intp_t feature
    ) noexcept nogil
    cdef intp_t _partition(
        self,
        float64_t threshold,
        intp_t zero_pos
    ) noexcept nogil


cdef void shift_missing_values_to_left_if_required(
    SplitRecord* best,
    intp_t[::1] samples,
    intp_t end,
) noexcept nogil


cdef void sort(floating* feature_values, intp_t* samples, intp_t n) noexcept nogil
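
# Illustrative sketch (not part of the file above): FEATURE_THRESHOLD is used
# by the partitioners to treat nearly-equal feature values as ties, which
# guards against 32 bit / 64 bit precision differences when comparing feature
# values to split thresholds. A plain-Python rendering of the tie-skipping
# scan over sorted feature values (a sketch of the idea, not the compiled
# implementation):
def next_distinct_position_sketch(feature_values, p, end, threshold=1e-7):
    # Advance p past any run of values within `threshold` of each other and
    # return the first position whose value is meaningfully larger.
    while p + 1 < end and feature_values[p + 1] <= feature_values[p] + threshold:
        p += 1
    return p + 1

# For example, with [1.0, 1.0 + 1e-9, 2.0] the scan skips the near-duplicate
# and reports position 2 as the next candidate split boundary.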
@@ -0,0 +1,817 @@
"""Partition samples in the construction of a tree.

This module contains the algorithms for moving sample indices to
the left and right child node given a split determined by the
splitting algorithm in `_splitter.pyx`.

Partitioning is done in a way that is efficient for both dense data,
and sparse data stored in a Compressed Sparse Column (CSC) format.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from cython cimport final
from libc.math cimport isnan, log2
from libc.stdlib cimport qsort
from libc.string cimport memcpy

import numpy as np
from scipy.sparse import issparse


# Constant used to switch between the two non-zero-value extraction
# algorithms in SparsePartitioner
cdef float32_t EXTRACT_NNZ_SWITCH = 0.1

# Allow for 32 bit float comparisons
cdef float32_t INFINITY_32t = np.inf


@final
cdef class DensePartitioner:
    """Partitioner specialized for dense data.

    Note that this partitioner is agnostic to the splitting strategy (best vs. random).
    """
    def __init__(
        self,
        const float32_t[:, :] X,
        intp_t[::1] samples,
        float32_t[::1] feature_values,
        const uint8_t[::1] missing_values_in_feature_mask,
    ):
        self.X = X
        self.samples = samples
        self.feature_values = feature_values
        self.missing_values_in_feature_mask = missing_values_in_feature_mask

    cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
        """Initialize splitter at the beginning of node_split."""
        self.start = start
        self.end = end
        self.n_missing = 0

    cdef inline void sort_samples_and_feature_values(
        self, intp_t current_feature
    ) noexcept nogil:
        """Simultaneously sort based on the feature_values.

        Missing values are stored at the end of feature_values.
        The number of missing values observed in feature_values is stored
        in self.n_missing.
        """
        cdef:
            intp_t i, current_end
            float32_t[::1] feature_values = self.feature_values
            const float32_t[:, :] X = self.X
            intp_t[::1] samples = self.samples
            intp_t n_missing = 0
            const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask

        # Sort samples along that feature; by copying the values into an array and
        # sorting the array in a manner which utilizes the cache more effectively.
        if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]:
            i, current_end = self.start, self.end - 1
            # Missing values are placed at the end and do not participate in the sorting.
            while i <= current_end:
                # Finds the right-most value that is not missing so that
                # it can be swapped with missing values at its left.
                if isnan(X[samples[current_end], current_feature]):
                    n_missing += 1
                    current_end -= 1
                    continue

                # X[samples[current_end], current_feature] is a non-missing value
                if isnan(X[samples[i], current_feature]):
                    samples[i], samples[current_end] = samples[current_end], samples[i]
                    n_missing += 1
                    current_end -= 1

                feature_values[i] = X[samples[i], current_feature]
                i += 1
        else:
            # When there are no missing values, we only need to copy the data into
            # feature_values
            for i in range(self.start, self.end):
                feature_values[i] = X[samples[i], current_feature]

        sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing)
        self.n_missing = n_missing

    cdef inline void find_min_max(
        self,
        intp_t current_feature,
        float32_t* min_feature_value_out,
        float32_t* max_feature_value_out,
    ) noexcept nogil:
        """Find the minimum and maximum value for current_feature.

        Missing values are stored at the end of feature_values. The number of missing
        values observed in feature_values is stored in self.n_missing.
        """
        cdef:
            intp_t p, current_end
            float32_t current_feature_value
            const float32_t[:, :] X = self.X
            intp_t[::1] samples = self.samples
            float32_t min_feature_value = INFINITY_32t
            float32_t max_feature_value = -INFINITY_32t
            float32_t[::1] feature_values = self.feature_values
            intp_t n_missing = 0
            const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask

        # We are copying the values into an array and finding min/max of the array in
        # a manner which utilizes the cache more effectively. We need to also count
        # the number of missing-values there are.
        if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]:
            p, current_end = self.start, self.end - 1
            # Missing values are placed at the end and do not participate in the
            # min/max calculation.
            while p <= current_end:
                # Finds the right-most value that is not missing so that
                # it can be swapped with missing values towards its left.
                if isnan(X[samples[current_end], current_feature]):
                    n_missing += 1
                    current_end -= 1
                    continue

                # X[samples[current_end], current_feature] is a non-missing value
                if isnan(X[samples[p], current_feature]):
                    samples[p], samples[current_end] = samples[current_end], samples[p]
                    n_missing += 1
                    current_end -= 1

                current_feature_value = X[samples[p], current_feature]
                feature_values[p] = current_feature_value
                if current_feature_value < min_feature_value:
                    min_feature_value = current_feature_value
                elif current_feature_value > max_feature_value:
                    max_feature_value = current_feature_value
                p += 1
        else:
            min_feature_value = X[samples[self.start], current_feature]
            max_feature_value = min_feature_value

            feature_values[self.start] = min_feature_value
            for p in range(self.start + 1, self.end):
                current_feature_value = X[samples[p], current_feature]
                feature_values[p] = current_feature_value

                if current_feature_value < min_feature_value:
                    min_feature_value = current_feature_value
                elif current_feature_value > max_feature_value:
                    max_feature_value = current_feature_value

        min_feature_value_out[0] = min_feature_value
        max_feature_value_out[0] = max_feature_value
        self.n_missing = n_missing

    cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil:
        """Compute the next p_prev and p for iterating over feature values.

        The missing values are not included when iterating through the feature values.
        """
        cdef intp_t end_non_missing = self.end - self.n_missing

        while (
            p[0] + 1 < end_non_missing and
            self.feature_values[p[0] + 1] <= self.feature_values[p[0]] + FEATURE_THRESHOLD
        ):
            p[0] += 1

        p_prev[0] = p[0]

        # By adding 1, we have
        # (p >= end_non_missing) or (feature_values[p] > feature_values[p - 1] + FEATURE_THRESHOLD)
        p[0] += 1

    cdef inline intp_t partition_samples(
        self,
        float64_t current_threshold
    ) noexcept nogil:
        """Partition samples for feature_values at the current_threshold."""
        cdef:
            intp_t p = self.start
            intp_t partition_end = self.end - self.n_missing
            intp_t[::1] samples = self.samples
            float32_t[::1] feature_values = self.feature_values

        while p < partition_end:
            if feature_values[p] <= current_threshold:
                p += 1
            else:
                partition_end -= 1

                feature_values[p], feature_values[partition_end] = (
                    feature_values[partition_end], feature_values[p]
                )
                samples[p], samples[partition_end] = samples[partition_end], samples[p]

        return partition_end

    cdef inline void partition_samples_final(
        self,
        intp_t best_pos,
        float64_t best_threshold,
        intp_t best_feature,
        intp_t best_n_missing,
    ) noexcept nogil:
        """Partition samples for X at the best_threshold and best_feature.

        If missing values are present, this method partitions `samples`
        so that the `best_n_missing` missing values' indices are in the
        right-most end of `samples`, that is `samples[end_non_missing:end]`.
        """
        cdef:
            # Local invariance: start <= p <= partition_end <= end
            intp_t start = self.start
            intp_t p = start
            intp_t end = self.end - 1
            intp_t partition_end = end - best_n_missing
            intp_t[::1] samples = self.samples
            const float32_t[:, :] X = self.X
            float32_t current_value

        if best_n_missing != 0:
            # Move samples with missing values to the end while partitioning the
            # non-missing samples
            while p <= partition_end:
                # Keep samples with missing values at the end
                if isnan(X[samples[end], best_feature]):
                    end -= 1
                    continue

                # Swap a sample with a missing value at p with the non-missing
                # sample at the end
                current_value = X[samples[p], best_feature]
                if isnan(current_value):
                    samples[p], samples[end] = samples[end], samples[p]
                    end -= 1

                    # The swapped sample at the end is always a non-missing value, so
                    # we can continue the algorithm without checking for missingness.
                    current_value = X[samples[p], best_feature]

                # Partition the non-missing samples
                if current_value <= best_threshold:
                    p += 1
                else:
                    samples[p], samples[partition_end] = samples[partition_end], samples[p]
                    partition_end -= 1
        else:
            # Partitioning routine when there are no missing values
            while p < partition_end:
                if X[samples[p], best_feature] <= best_threshold:
                    p += 1
                else:
                    samples[p], samples[partition_end] = samples[partition_end], samples[p]
                    partition_end -= 1
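
# Illustrative sketch (not part of the compiled class above): the two core
# moves of the dense partitioner in plain Python. Missing values (NaN) are
# first swapped to the tail of the node's index range; the remaining
# positions are then partitioned around a threshold. This is a hedged sketch
# of the idea, not the implementation above.
import math


def _move_missing_to_end_sketch(values, samples, start, end):
    # Swap the indices of NaN values into samples[end - n_missing:end] and
    # return n_missing, mirroring sort_samples_and_feature_values.
    i, current_end, n_missing = start, end - 1, 0
    while i <= current_end:
        if math.isnan(values[samples[current_end]]):
            n_missing += 1
            current_end -= 1
            continue
        if math.isnan(values[samples[i]]):
            samples[i], samples[current_end] = samples[current_end], samples[i]
            n_missing += 1
            current_end -= 1
        i += 1
    return n_missing


def _partition_sketch(values, samples, start, end, threshold):
    # Hoare-style partition: indices whose value is <= threshold end up in
    # samples[start:partition_end], the rest in samples[partition_end:end].
    p, partition_end = start, end
    while p < partition_end:
        if values[samples[p]] <= threshold:
            p += 1
        else:
            partition_end -= 1
            samples[p], samples[partition_end] = samples[partition_end], samples[p]
    return partition_end

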
@final
cdef class SparsePartitioner:
    """Partitioner specialized for sparse CSC data.

    Note that this partitioner is agnostic to the splitting strategy (best vs. random).
    """
    def __init__(
        self,
        object X,
        intp_t[::1] samples,
        intp_t n_samples,
        float32_t[::1] feature_values,
        const uint8_t[::1] missing_values_in_feature_mask,
    ):
        if not (issparse(X) and X.format == "csc"):
            raise ValueError("X should be in csc format")

        self.samples = samples
        self.feature_values = feature_values

        # Initialize X
        cdef intp_t n_total_samples = X.shape[0]

        self.X_data = X.data
        self.X_indices = X.indices
        self.X_indptr = X.indptr
        self.n_total_samples = n_total_samples

        # Initialize auxiliary array used to perform split
        self.index_to_samples = np.full(n_total_samples, fill_value=-1, dtype=np.intp)
        self.sorted_samples = np.empty(n_samples, dtype=np.intp)

        cdef intp_t p
        for p in range(n_samples):
            self.index_to_samples[samples[p]] = p

        self.missing_values_in_feature_mask = missing_values_in_feature_mask

    cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
        """Initialize splitter at the beginning of node_split."""
        self.start = start
        self.end = end
        self.is_samples_sorted = 0
        self.n_missing = 0

    cdef inline void sort_samples_and_feature_values(
        self,
        intp_t current_feature
    ) noexcept nogil:
        """Simultaneously sort based on the feature_values."""
        cdef:
            float32_t[::1] feature_values = self.feature_values
            intp_t[::1] index_to_samples = self.index_to_samples
            intp_t[::1] samples = self.samples

        self.extract_nnz(current_feature)
        # Sort the positive and negative parts of `feature_values`
        sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start)
        if self.start_positive < self.end:
            sort(
                &feature_values[self.start_positive],
                &samples[self.start_positive],
                self.end - self.start_positive
            )

        # Update index_to_samples to take into account the sort
        for p in range(self.start, self.end_negative):
            index_to_samples[samples[p]] = p
        for p in range(self.start_positive, self.end):
            index_to_samples[samples[p]] = p

        # Add one or two zeros in feature_values, if there is any
        if self.end_negative < self.start_positive:
            self.start_positive -= 1
            feature_values[self.start_positive] = 0.

            if self.end_negative != self.start_positive:
                feature_values[self.end_negative] = 0.
                self.end_negative += 1

        # XXX: When sparse supports missing values, this should be set to the
        # number of missing values for current_feature
        self.n_missing = 0

    cdef inline void find_min_max(
        self,
        intp_t current_feature,
        float32_t* min_feature_value_out,
        float32_t* max_feature_value_out,
    ) noexcept nogil:
        """Find the minimum and maximum value for current_feature."""
        cdef:
            intp_t p
            float32_t current_feature_value, min_feature_value, max_feature_value
            float32_t[::1] feature_values = self.feature_values

        self.extract_nnz(current_feature)

        if self.end_negative != self.start_positive:
            # There is a zero
            min_feature_value = 0
            max_feature_value = 0
        else:
            min_feature_value = feature_values[self.start]
            max_feature_value = min_feature_value

        # Find min, max in feature_values[start:end_negative]
        for p in range(self.start, self.end_negative):
            current_feature_value = feature_values[p]

            if current_feature_value < min_feature_value:
                min_feature_value = current_feature_value
            elif current_feature_value > max_feature_value:
                max_feature_value = current_feature_value

        # Update min, max given feature_values[start_positive:end]
        for p in range(self.start_positive, self.end):
            current_feature_value = feature_values[p]

            if current_feature_value < min_feature_value:
                min_feature_value = current_feature_value
            elif current_feature_value > max_feature_value:
                max_feature_value = current_feature_value

        min_feature_value_out[0] = min_feature_value
        max_feature_value_out[0] = max_feature_value

    cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil:
        """Compute the next p_prev and p for iterating over feature values."""
        cdef intp_t p_next

        if p[0] + 1 != self.end_negative:
            p_next = p[0] + 1
        else:
            p_next = self.start_positive

        while (p_next < self.end and
               self.feature_values[p_next] <= self.feature_values[p[0]] + FEATURE_THRESHOLD):
            p[0] = p_next
            if p[0] + 1 != self.end_negative:
                p_next = p[0] + 1
            else:
                p_next = self.start_positive

        p_prev[0] = p[0]
        p[0] = p_next

    cdef inline intp_t partition_samples(
        self,
        float64_t current_threshold
    ) noexcept nogil:
        """Partition samples for feature_values at the current_threshold."""
        return self._partition(current_threshold, self.start_positive)

    cdef inline void partition_samples_final(
        self,
        intp_t best_pos,
        float64_t best_threshold,
        intp_t best_feature,
        intp_t n_missing,
    ) noexcept nogil:
        """Partition samples for X at the best_threshold and best_feature."""
        self.extract_nnz(best_feature)
        self._partition(best_threshold, best_pos)

    cdef inline intp_t _partition(self, float64_t threshold, intp_t zero_pos) noexcept nogil:
        """Partition samples[start:end] based on threshold."""
        cdef:
            intp_t p, partition_end
            intp_t[::1] index_to_samples = self.index_to_samples
            float32_t[::1] feature_values = self.feature_values
            intp_t[::1] samples = self.samples

        if threshold < 0.:
            p = self.start
            partition_end = self.end_negative
        elif threshold > 0.:
            p = self.start_positive
            partition_end = self.end
        else:
            # Data are already split
            return zero_pos

        while p < partition_end:
            if feature_values[p] <= threshold:
                p += 1

            else:
                partition_end -= 1

                feature_values[p], feature_values[partition_end] = (
                    feature_values[partition_end], feature_values[p]
                )
                sparse_swap(index_to_samples, samples, p, partition_end)

        return partition_end

    cdef inline void extract_nnz(self, intp_t feature) noexcept nogil:
        """Extract and partition values for a given feature.

        The extracted values are partitioned between negative values
        feature_values[start:end_negative[0]] and positive values
        feature_values[start_positive[0]:end].
        The samples and index_to_samples are modified according to this
        partition.

        The extraction corresponds to the intersection between the arrays
        X_indices[indptr_start:indptr_end] and samples[start:end].
        This is done efficiently using either an index_to_samples based
        approach or a binary search based approach.

        Parameters
        ----------
        feature : intp_t
            Index of the feature for which to extract the non-zero values.
        """
        cdef intp_t[::1] samples = self.samples
        cdef float32_t[::1] feature_values = self.feature_values
        cdef intp_t indptr_start = self.X_indptr[feature]
        cdef intp_t indptr_end = self.X_indptr[feature + 1]
        cdef intp_t n_indices = <intp_t>(indptr_end - indptr_start)
        cdef intp_t n_samples = self.end - self.start
        cdef intp_t[::1] index_to_samples = self.index_to_samples
        cdef intp_t[::1] sorted_samples = self.sorted_samples
        cdef const int32_t[::1] X_indices = self.X_indices
        cdef const float32_t[::1] X_data = self.X_data

        # Use the binary search approach if its estimated cost is lower than
        # that of the index_to_samples approach: binary search runs in
        # O(n_samples * log(n_indices)), plus an O(n_samples * log(n_samples))
        # sort of the samples the first time, while the index_to_samples
        # approach runs in O(n_indices). EXTRACT_NNZ_SWITCH scales the
        # comparison.
        if ((1 - self.is_samples_sorted) * n_samples * log2(n_samples) +
                n_samples * log2(n_indices) < EXTRACT_NNZ_SWITCH * n_indices):
            extract_nnz_binary_search(X_indices, X_data,
                                      indptr_start, indptr_end,
                                      samples, self.start, self.end,
                                      index_to_samples,
                                      feature_values,
                                      &self.end_negative, &self.start_positive,
                                      sorted_samples, &self.is_samples_sorted)

        # Using an index_to_samples technique to extract non zero values:
        # index_to_samples is a mapping from X_indices to samples.
        else:
            extract_nnz_index_to_samples(X_indices, X_data,
                                         indptr_start, indptr_end,
                                         samples, self.start, self.end,
                                         index_to_samples,
                                         feature_values,
                                         &self.end_negative, &self.start_positive)
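
# Illustrative sketch (not part of the compiled class above): what
# extract_nnz computes, in plain Python. Given one CSC column and the node's
# active samples, it gathers the non-zero values belonging to the node and
# groups them as negatives / implicit zeros / positives. A hedged sketch of
# the index_to_samples variant; the binary-search variant computes the same
# partition.
def _extract_nnz_sketch(col_indices, col_data, active_samples):
    active = set(active_samples)
    negatives = sorted(v for i, v in zip(col_indices, col_data)
                       if i in active and v < 0)
    positives = sorted(v for i, v in zip(col_indices, col_data)
                       if i in active and v > 0)
    n_zeros = len(active) - len(negatives) - len(positives)
    return negatives, n_zeros, positives

# For example, a column with entries {0: -2.0, 3: 1.5, 7: 0.5} restricted to
# active samples {0, 3, 4} yields ([-2.0], 1, [1.5]): one implicit zero for
# sample 4.

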
cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil:
    """Comparison function for sort.

    This must return an `int` as it is used by stdlib's qsort, which expects
    an `int` return value.
    """
    return <int>((<intp_t*>a)[0] - (<intp_t*>b)[0])


cdef inline void binary_search(const int32_t[::1] sorted_array,
                               int32_t start, int32_t end,
                               intp_t value, intp_t* index,
                               int32_t* new_start) noexcept nogil:
    """Return the index of value in the sorted array.

    If not found, index is set to -1. new_start is set to the last pivot + 1,
    from which the next search can resume.
    """
    cdef int32_t pivot
    index[0] = -1
    while start < end:
        pivot = start + (end - start) / 2

        if sorted_array[pivot] == value:
            index[0] = pivot
            start = pivot + 1
            break

        if sorted_array[pivot] < value:
            start = pivot + 1
        else:
            end = pivot
    new_start[0] = start
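
# Illustrative sketch (not part of the compiled helper above): besides the
# position of `value` (or -1), the search reports a `new_start` from which the
# next probe may resume. Callers probe increasing values against the sorted
# X_indices, so each probe can discard the prefix already passed. In plain
# Python:
def _binary_search_sketch(sorted_array, start, end, value):
    index = -1
    while start < end:
        pivot = start + (end - start) // 2
        if sorted_array[pivot] == value:
            index = pivot
            start = pivot + 1
            break
        if sorted_array[pivot] < value:
            start = pivot + 1
        else:
            end = pivot
    return index, start  # `start` doubles as new_start for the next probe

# For example, probing 3 then 8 in [1, 3, 5, 8, 9] finds 3 at position 1 and
# lets the second probe start at position 2 instead of 0.

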
cdef inline void extract_nnz_index_to_samples(const int32_t[::1] X_indices,
                                              const float32_t[::1] X_data,
                                              int32_t indptr_start,
                                              int32_t indptr_end,
                                              intp_t[::1] samples,
                                              intp_t start,
                                              intp_t end,
                                              intp_t[::1] index_to_samples,
                                              float32_t[::1] feature_values,
                                              intp_t* end_negative,
                                              intp_t* start_positive) noexcept nogil:
    """Extract and partition values for a feature using index_to_samples.

    Complexity is O(indptr_end - indptr_start).
    """
    cdef int32_t k
    cdef intp_t index
    cdef intp_t end_negative_ = start
    cdef intp_t start_positive_ = end

    for k in range(indptr_start, indptr_end):
        if start <= index_to_samples[X_indices[k]] < end:
            if X_data[k] > 0:
                start_positive_ -= 1
                feature_values[start_positive_] = X_data[k]
                index = index_to_samples[X_indices[k]]
                sparse_swap(index_to_samples, samples, index, start_positive_)

            elif X_data[k] < 0:
                feature_values[end_negative_] = X_data[k]
                index = index_to_samples[X_indices[k]]
                sparse_swap(index_to_samples, samples, index, end_negative_)
                end_negative_ += 1

    # Returned values
    end_negative[0] = end_negative_
    start_positive[0] = start_positive_


cdef inline void extract_nnz_binary_search(const int32_t[::1] X_indices,
                                           const float32_t[::1] X_data,
                                           int32_t indptr_start,
                                           int32_t indptr_end,
                                           intp_t[::1] samples,
                                           intp_t start,
                                           intp_t end,
                                           intp_t[::1] index_to_samples,
                                           float32_t[::1] feature_values,
                                           intp_t* end_negative,
                                           intp_t* start_positive,
                                           intp_t[::1] sorted_samples,
                                           bint* is_samples_sorted) noexcept nogil:
    """Extract and partition values for a given feature using binary search.

    If n_samples = end - start and n_indices = indptr_end - indptr_start,
    the complexity is

        O((1 - is_samples_sorted[0]) * n_samples * log(n_samples) +
          n_samples * log(n_indices)).
    """
    cdef intp_t n_samples

    if not is_samples_sorted[0]:
        n_samples = end - start
        memcpy(&sorted_samples[start], &samples[start],
               n_samples * sizeof(intp_t))
        qsort(&sorted_samples[start], n_samples, sizeof(intp_t),
              compare_SIZE_t)
        is_samples_sorted[0] = 1

    while (indptr_start < indptr_end and
           sorted_samples[start] > X_indices[indptr_start]):
        indptr_start += 1

    while (indptr_start < indptr_end and
           sorted_samples[end - 1] < X_indices[indptr_end - 1]):
        indptr_end -= 1

    cdef intp_t p = start
    cdef intp_t index
    cdef intp_t k
    cdef intp_t end_negative_ = start
    cdef intp_t start_positive_ = end

    while (p < end and indptr_start < indptr_end):
        # Find index of sorted_samples[p] in X_indices
        binary_search(X_indices, indptr_start, indptr_end,
                      sorted_samples[p], &k, &indptr_start)

        if k != -1:
            # If k != -1, we have found a non zero value

            if X_data[k] > 0:
                start_positive_ -= 1
                feature_values[start_positive_] = X_data[k]
                index = index_to_samples[X_indices[k]]
                sparse_swap(index_to_samples, samples, index, start_positive_)

            elif X_data[k] < 0:
                feature_values[end_negative_] = X_data[k]
                index = index_to_samples[X_indices[k]]
                sparse_swap(index_to_samples, samples, index, end_negative_)
                end_negative_ += 1
        p += 1

    # Returned values
    end_negative[0] = end_negative_
    start_positive[0] = start_positive_


cdef inline void sparse_swap(intp_t[::1] index_to_samples, intp_t[::1] samples,
                             intp_t pos_1, intp_t pos_2) noexcept nogil:
    """Swap sample pos_1 and pos_2 preserving sparse invariant."""
    samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1]
    index_to_samples[samples[pos_1]] = pos_1
    index_to_samples[samples[pos_2]] = pos_2
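
# Illustrative sketch (not part of the compiled helper above): the invariant
# maintained by sparse_swap is that index_to_samples stays the inverse
# permutation of samples, i.e. index_to_samples[samples[p]] == p for every
# active position p. In plain Python:
def _sparse_swap_sketch(index_to_samples, samples, pos_1, pos_2):
    samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1]
    index_to_samples[samples[pos_1]] = pos_1
    index_to_samples[samples[pos_2]] = pos_2

_samples_demo = [2, 0, 1]
_inverse_demo = [1, 2, 0]  # _inverse_demo[_samples_demo[p]] == p for all p
_sparse_swap_sketch(_inverse_demo, _samples_demo, 0, 2)
assert all(_inverse_demo[_samples_demo[p]] == p for p in range(3))

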
cdef inline void shift_missing_values_to_left_if_required(
    SplitRecord* best,
    intp_t[::1] samples,
    intp_t end,
) noexcept nogil:
    """Shift missing value sample indices to the left of the split if required.

    Note: this should always be called at the very end because it will
    move samples around, thereby affecting the criterion.
    This affects the computation of the children impurity, which affects
    the computation of the next node.
    """
    cdef intp_t i, p, current_end
    # The partitioner partitions the data such that the missing values are in
    # samples[-n_missing:] for the criterion to consume. If the missing values
    # are going to the right node, then the missing values are already in the
    # correct position. If the missing values go left, then we move the missing
    # values to samples[best.pos:best.pos+n_missing] and update `best.pos`.
    if best.n_missing > 0 and best.missing_go_to_left:
        for p in range(best.n_missing):
            i = best.pos + p
            current_end = end - 1 - p
            samples[i], samples[current_end] = samples[current_end], samples[i]
        best.pos += best.n_missing
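
# Illustrative sketch (not part of the compiled helper above): the effect of
# the shift on the `samples` layout when missing values must go to the left
# child. Before the call, the n_missing missing-value indices sit in
# samples[end - n_missing:end]; afterwards they sit in
# samples[pos:pos + n_missing] and pos has grown accordingly. In plain Python:
def _shift_missing_left_sketch(samples, pos, end, n_missing):
    for p in range(n_missing):
        i = pos + p
        current_end = end - 1 - p
        samples[i], samples[current_end] = samples[current_end], samples[i]
    return pos + n_missing  # the updated split position

# For example, samples = [a, b, c, m1, m2] with pos=1 and n_missing=2 becomes
# [a, m2, m1, c, b], and the returned split position is 3; the order within
# each side is irrelevant to the criterion, which only consumes index ranges.

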
def _py_sort(float32_t[::1] feature_values, intp_t[::1] samples, intp_t n):
    """Used for testing sort."""
    sort(&feature_values[0], &samples[0], n)


# Sort n-element arrays pointed to by feature_values and samples, simultaneously,
# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997).
cdef void sort(floating* feature_values, intp_t* samples, intp_t n) noexcept nogil:
    if n == 0:
        return
    cdef intp_t maxd = 2 * <intp_t>log2(n)
    introsort(feature_values, samples, n, maxd)


cdef inline void swap(floating* feature_values, intp_t* samples,
                      intp_t i, intp_t j) noexcept nogil:
    # Helper for sort
    feature_values[i], feature_values[j] = feature_values[j], feature_values[i]
    samples[i], samples[j] = samples[j], samples[i]


cdef inline floating median3(floating* feature_values, intp_t n) noexcept nogil:
    # Median of three pivot selection, after Bentley and McIlroy (1993).
    # Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
    cdef floating a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1]
    if a < b:
        if b < c:
            return b
        elif a < c:
            return c
        else:
            return a
    elif b < c:
        if a < c:
            return a
        else:
            return c
    else:
        return b


# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(floating* feature_values, intp_t *samples,
                    intp_t n, intp_t maxd) noexcept nogil:
    cdef floating pivot
    cdef intp_t i, l, r

    while n > 1:
        if maxd <= 0:   # max depth limit exceeded ("gone quadratic")
            heapsort(feature_values, samples, n)
            return
        maxd -= 1

        pivot = median3(feature_values, n)

        # Three-way partition.
        i = l = 0
        r = n
        while i < r:
            if feature_values[i] < pivot:
                swap(feature_values, samples, i, l)
                i += 1
                l += 1
            elif feature_values[i] > pivot:
                r -= 1
                swap(feature_values, samples, i, r)
            else:
                i += 1

        introsort(feature_values, samples, l, maxd)
        feature_values += r
        samples += r
        n -= r
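
# Illustrative sketch (not part of the compiled routine above): the shape of
# introsort in plain Python - median-of-3 pivot, 3-way partition (cheap on
# runs of equal values such as zero features), and a fall-back once the depth
# budget of 2*log2(n) is exhausted. A hedged sketch, not the implementation.
def _introsort_sketch(a, maxd=None):
    if maxd is None:
        maxd = 2 * max(len(a), 2).bit_length()   # rough stand-in for 2*log2(n)
    if len(a) <= 1:
        return a
    if maxd <= 0:
        return sorted(a)                         # stand-in for the heapsort fall-back
    pivot = sorted([a[0], a[len(a) // 2], a[-1]])[1]
    less = [x for x in a if x < pivot]
    equal = [x for x in a if x == pivot]
    greater = [x for x in a if x > pivot]
    return _introsort_sketch(less, maxd - 1) + equal + _introsort_sketch(greater, maxd - 1)

assert _introsort_sketch([3, 0, 0, 2, 0, 1]) == [0, 0, 0, 1, 2, 3]

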
cdef inline void sift_down(floating* feature_values, intp_t* samples,
                           intp_t start, intp_t end) noexcept nogil:
    # Restore heap order in feature_values[start:end] by moving the max element to start.
    cdef intp_t child, maxind, root

    root = start
    while True:
        child = root * 2 + 1

        # find max of root, left child, right child
        maxind = root
        if child < end and feature_values[maxind] < feature_values[child]:
            maxind = child
        if child + 1 < end and feature_values[maxind] < feature_values[child + 1]:
            maxind = child + 1

        if maxind == root:
            break
        else:
            swap(feature_values, samples, root, maxind)
            root = maxind


cdef void heapsort(floating* feature_values, intp_t* samples, intp_t n) noexcept nogil:
    cdef intp_t start, end

    # heapify
    start = (n - 2) / 2
    end = n
    while True:
        sift_down(feature_values, samples, start, end)
        if start == 0:
            break
        start -= 1

    # sort by shrinking the heap, putting the max element immediately after it
    end = n - 1
    while end > 0:
        swap(feature_values, samples, 0, end)
        sift_down(feature_values, samples, 0, end)
        end = end - 1
@@ -0,0 +1,188 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np


class DrawTree:
    def __init__(self, tree, parent=None, depth=0, number=1):
        self.x = -1.0
        self.y = depth
        self.tree = tree
        self.children = [
            DrawTree(c, self, depth + 1, i + 1) for i, c in enumerate(tree.children)
        ]
        self.parent = parent
        self.thread = None
        self.mod = 0
        self.ancestor = self
        self.change = self.shift = 0
        self._lmost_sibling = None
        # this is the number of the node in its group of siblings 1..n
        self.number = number

    def left(self):
        return self.thread or (len(self.children) and self.children[0])

    def right(self):
        return self.thread or (len(self.children) and self.children[-1])

    def lbrother(self):
        n = None
        if self.parent:
            for node in self.parent.children:
                if node == self:
                    return n
                else:
                    n = node
        return n

    def get_lmost_sibling(self):
        if not self._lmost_sibling and self.parent and self != self.parent.children[0]:
            self._lmost_sibling = self.parent.children[0]
        return self._lmost_sibling

    lmost_sibling = property(get_lmost_sibling)

    def __str__(self):
        return "%s: x=%s mod=%s" % (self.tree, self.x, self.mod)

    def __repr__(self):
        return self.__str__()

    def max_extents(self):
        extents = [c.max_extents() for c in self.children]
        extents.append((self.x, self.y))
        return np.max(extents, axis=0)


def buchheim(tree):
    dt = first_walk(DrawTree(tree))
    min = second_walk(dt)
    if min < 0:
        third_walk(dt, -min)
    return dt


def third_walk(tree, n):
    tree.x += n
    for c in tree.children:
        third_walk(c, n)


def first_walk(v, distance=1.0):
    if len(v.children) == 0:
        if v.lmost_sibling:
            v.x = v.lbrother().x + distance
        else:
            v.x = 0.0
    else:
        default_ancestor = v.children[0]
        for w in v.children:
            first_walk(w)
            default_ancestor = apportion(w, default_ancestor, distance)
        # print("finished v =", v.tree, "children")
        execute_shifts(v)

        midpoint = (v.children[0].x + v.children[-1].x) / 2

        w = v.lbrother()
        if w:
            v.x = w.x + distance
            v.mod = v.x - midpoint
        else:
            v.x = midpoint
    return v


def apportion(v, default_ancestor, distance):
    w = v.lbrother()
    if w is not None:
        # in buchheim notation:
        # i == inner; o == outer; r == right; l == left; r = +; l = -
        vir = vor = v
        vil = w
        vol = v.lmost_sibling
        sir = sor = v.mod
        sil = vil.mod
        sol = vol.mod
        while vil.right() and vir.left():
            vil = vil.right()
            vir = vir.left()
            vol = vol.left()
            vor = vor.right()
            vor.ancestor = v
            shift = (vil.x + sil) - (vir.x + sir) + distance
            if shift > 0:
                move_subtree(ancestor(vil, v, default_ancestor), v, shift)
                sir = sir + shift
                sor = sor + shift
            sil += vil.mod
            sir += vir.mod
            sol += vol.mod
            sor += vor.mod
        if vil.right() and not vor.right():
            vor.thread = vil.right()
            vor.mod += sil - sor
        else:
            if vir.left() and not vol.left():
                vol.thread = vir.left()
                vol.mod += sir - sol
            default_ancestor = v
    return default_ancestor


def move_subtree(wl, wr, shift):
    subtrees = wr.number - wl.number
    # print(wl.tree, "is conflicted with", wr.tree, 'moving', subtrees,
    # 'shift', shift)
    # print wl, wr, wr.number, wl.number, shift, subtrees, shift/subtrees
    wr.change -= shift / subtrees
    wr.shift += shift
    wl.change += shift / subtrees
    wr.x += shift
    wr.mod += shift


def execute_shifts(v):
    shift = change = 0
    for w in v.children[::-1]:
        # print("shift:", w, shift, w.change)
        w.x += shift
        w.mod += shift
        change += w.change
        shift += w.shift + change


def ancestor(vil, v, default_ancestor):
    # the relevant text is at the bottom of page 7 of
    # "Improving Walker's Algorithm to Run in Linear Time" by Buchheim et al.
    # (2002)
    # https://citeseerx.ist.psu.edu/doc_view/pid/1f41c3c2a4880dc49238e46d555f16d28da2940d
    if vil.ancestor in v.parent.children:
        return vil.ancestor
    else:
        return default_ancestor


def second_walk(v, m=0, depth=0, min=None):
    v.x += m
    v.y = depth

    if min is None or v.x < min:
        min = v.x

    for w in v.children:
        min = second_walk(w, m + v.mod, depth + 1, min)

    return min


class Tree:
    def __init__(self, label="", node_id=-1, *children):
        self.label = label
        self.node_id = node_id
        if children:
            self.children = children
        else:
            self.children = []
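
# Illustrative usage sketch (not part of this file): building a small Tree and
# running buchheim() on it assigns each DrawTree node an (x, y) position with
# y equal to its depth; sklearn.tree._export.plot_tree consumes this layout to
# place the boxes of a fitted tree.
def _print_layout(draw_tree, indent=0):
    # DrawTree exposes .x, .y and .children after the layout walks.
    print(" " * indent + str(draw_tree.tree.label), (draw_tree.x, draw_tree.y))
    for child in draw_tree.children:
        _print_layout(child, indent + 2)


if __name__ == "__main__":
    demo = Tree("root", 0, Tree("l", 1), Tree("r", 2, Tree("rl", 3), Tree("rr", 4)))
    _print_layout(buchheim(demo))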
Binary file not shown.
@@ -0,0 +1,106 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# See _splitter.pyx for details.

from sklearn.utils._typedefs cimport (
    float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t
)
from sklearn.tree._criterion cimport Criterion
from sklearn.tree._tree cimport ParentInfo


cdef struct SplitRecord:
    # Data to track sample split
    intp_t feature              # Which feature to split on.
    intp_t pos                  # Split samples array at the given position,
                                # i.e. count of samples below threshold for feature.
                                # pos is >= end if the node is a leaf.
    float64_t threshold         # Threshold to split at.
    float64_t improvement       # Impurity improvement given parent node.
    float64_t impurity_left     # Impurity of the left split.
    float64_t impurity_right    # Impurity of the right split.
    float64_t lower_bound       # Lower bound on value of both children for monotonicity
    float64_t upper_bound       # Upper bound on value of both children for monotonicity
    uint8_t missing_go_to_left  # Controls if missing values go to the left node.
    intp_t n_missing            # Number of missing values for the feature being split on

cdef class Splitter:
    # The splitter searches in the input space for a feature and a threshold
    # to split the samples in samples[start:end].
    #
    # The impurity computations are delegated to a criterion object.

    # Internal structures
    cdef public Criterion criterion        # Impurity criterion
    cdef public intp_t max_features        # Number of features to test
    cdef public intp_t min_samples_leaf    # Min samples in a leaf
    cdef public float64_t min_weight_leaf  # Minimum weight in a leaf

    cdef object random_state               # Random state
    cdef uint32_t rand_r_state             # sklearn_rand_r random number state

    cdef intp_t[::1] samples               # Sample indices in X, y
    cdef intp_t n_samples                  # X.shape[0]
    cdef float64_t weighted_n_samples      # Weighted number of samples
    cdef intp_t[::1] features              # Feature indices in X
    cdef intp_t[::1] constant_features     # Constant features indices
    cdef intp_t n_features                 # X.shape[1]
    cdef float32_t[::1] feature_values     # temp. array holding feature values

    cdef intp_t start                      # Start position for the current node
    cdef intp_t end                        # End position for the current node

    cdef const float64_t[:, ::1] y
    # Monotonicity constraints for each feature.
    # The encoding is as follows:
    #   -1: monotonic decrease
    #    0: no constraint
    #   +1: monotonic increase
    cdef const int8_t[:] monotonic_cst
    cdef bint with_monotonic_cst
    cdef const float64_t[:] sample_weight

    # The samples vector `samples` is maintained by the Splitter object such
    # that the samples contained in a node are contiguous. With this setting,
    # `node_split` reorganizes the node samples `samples[start:end]` in two
    # subsets `samples[start:pos]` and `samples[pos:end]`.

    # The 1-d `features` array of size n_features contains the feature
    # indices and allows fast sampling without replacement of features.

    # The 1-d `constant_features` array of size n_features holds in
    # `constant_features[:n_constant_features]` the feature ids with
    # constant values for all the samples that reached a specific node.
    # The value `n_constant_features` is given by the parent node to its
    # child nodes. The content of the range `[n_constant_features:]` is left
    # undefined, but preallocated for performance reasons.
    # This allows optimization with depth-based tree building.

    # Methods
    cdef int init(
        self,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
        const uint8_t[::1] missing_values_in_feature_mask,
    ) except -1

    cdef int node_reset(
        self,
        intp_t start,
        intp_t end,
        float64_t* weighted_n_node_samples
    ) except -1 nogil

    cdef int node_split(
        self,
        ParentInfo* parent,
        SplitRecord* split,
    ) except -1 nogil

    cdef void node_value(self, float64_t* dest) noexcept nogil

    cdef void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil

    cdef float64_t node_impurity(self) noexcept nogil
@@ -0,0 +1,904 @@
"""Splitting algorithms in the construction of a tree.

This module contains the main splitting algorithms for constructing a tree.
Splitting is concerned with finding the optimal partition of the data into
two groups. The impurity of the groups is minimized, and the impurity is measured
by some criterion, which is typically the Gini impurity or the entropy. Criteria
are implemented in the ``_criterion`` module.

Splitting evaluates a subset of features (defined by `max_features`, also
known as mtry in the literature). The module supports two primary types
of splitting strategies:

- Best Split: A greedy approach to find the optimal split. This method
  ensures that the best possible split is chosen by examining various
  thresholds for each candidate feature.
- Random Split: A stochastic approach that selects a split randomly
  from a subset of the best splits. This method is faster but does
  not guarantee the optimal split.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from libc.string cimport memcpy

from sklearn.utils._typedefs cimport int8_t
from sklearn.tree._criterion cimport Criterion
from sklearn.tree._partitioner cimport (
    FEATURE_THRESHOLD, DensePartitioner, SparsePartitioner,
    shift_missing_values_to_left_if_required
)
from sklearn.tree._utils cimport RAND_R_MAX, rand_int, rand_uniform

import numpy as np

# Introduce a fused class to make it possible to share the split implementation
# between the dense and sparse cases in the node_split_best and node_split_random
# functions. The alternative would have been to use inheritance-based polymorphism
# but it would have resulted in a ~10% overall tree fitting performance
# degradation caused by the overhead of frequent virtual method lookups.
ctypedef fused Partitioner:
    DensePartitioner
    SparsePartitioner


cdef float64_t INFINITY = np.inf


cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil:
    self.impurity_left = INFINITY
    self.impurity_right = INFINITY
    self.pos = start_pos
    self.feature = 0
    self.threshold = 0.
    self.improvement = -INFINITY
    self.missing_go_to_left = False
    self.n_missing = 0
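
# Illustrative sketch (not part of the compiled module above): the difference
# between the two strategies described in the module docstring, for a single
# feature in plain Python. The best splitter scans every boundary between
# distinct sorted values; the random splitter draws one threshold uniformly
# between the feature's min and max. A hedged sketch, not the implementation.
import random


def _gini(labels):
    n = len(labels)
    return 1.0 - sum((labels.count(c) / n) ** 2 for c in set(labels))


def _best_threshold_sketch(values, labels):
    order = sorted(range(len(values)), key=lambda i: values[i])
    parent = _gini(labels)
    best_gain, best_threshold = float("-inf"), None
    for k in range(1, len(order)):
        lo, hi = values[order[k - 1]], values[order[k]]
        if lo == hi:
            continue  # not a boundary between distinct values
        left = [labels[i] for i in order[:k]]
        right = [labels[i] for i in order[k:]]
        gain = parent - (len(left) * _gini(left) + len(right) * _gini(right)) / len(labels)
        if gain > best_gain:
            best_gain, best_threshold = gain, (lo + hi) / 2.0
    return best_threshold


def _random_threshold_sketch(values, rng=random):
    return rng.uniform(min(values), max(values))

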
cdef class Splitter:
|
||||
"""Abstract splitter class.
|
||||
|
||||
Splitters are called by tree builders to find the best splits on both
|
||||
sparse and dense data, one split at a time.
|
||||
"""
|
||||
|
||||
def __cinit__(
|
||||
self,
|
||||
Criterion criterion,
|
||||
intp_t max_features,
|
||||
intp_t min_samples_leaf,
|
||||
float64_t min_weight_leaf,
|
||||
object random_state,
|
||||
const int8_t[:] monotonic_cst,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
criterion : Criterion
|
||||
The criterion to measure the quality of a split.
|
||||
|
||||
max_features : intp_t
|
||||
The maximal number of randomly selected features which can be
|
||||
considered for a split.
|
||||
|
||||
min_samples_leaf : intp_t
|
||||
The minimal number of samples each leaf can have, where splits
|
||||
which would result in having less samples in a leaf are not
|
||||
considered.
|
||||
|
||||
min_weight_leaf : float64_t
|
||||
The minimal weight each leaf can have, where the weight is the sum
|
||||
of the weights of each sample in it.
|
||||
|
||||
random_state : object
|
||||
The user inputted random state to be used for pseudo-randomness
|
||||
|
||||
monotonic_cst : const int8_t[:]
|
||||
Monotonicity constraints
|
||||
|
||||
"""
|
||||
|
||||
self.criterion = criterion
|
||||
|
||||
self.n_samples = 0
|
||||
self.n_features = 0
|
||||
|
||||
self.max_features = max_features
|
||||
self.min_samples_leaf = min_samples_leaf
|
||||
self.min_weight_leaf = min_weight_leaf
|
||||
self.random_state = random_state
|
||||
self.monotonic_cst = monotonic_cst
|
||||
self.with_monotonic_cst = monotonic_cst is not None
|
||||
|
||||
def __getstate__(self):
|
||||
return {}
|
||||
|
||||
def __setstate__(self, d):
|
||||
pass
|
||||
|
||||
def __reduce__(self):
|
||||
return (type(self), (self.criterion,
|
||||
self.max_features,
|
||||
self.min_samples_leaf,
|
||||
self.min_weight_leaf,
|
||||
self.random_state,
|
||||
self.monotonic_cst), self.__getstate__())
|
||||
|
||||
cdef int init(
|
||||
self,
|
||||
object X,
|
||||
const float64_t[:, ::1] y,
|
||||
const float64_t[:] sample_weight,
|
||||
const uint8_t[::1] missing_values_in_feature_mask,
|
||||
) except -1:
|
||||
"""Initialize the splitter.
|
||||
|
||||
Take in the input data X, the target Y, and optional sample weights.
|
||||
|
||||
Returns -1 in case of failure to allocate memory (and raise MemoryError)
|
||||
or 0 otherwise.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : object
|
||||
This contains the inputs. Usually it is a 2d numpy array.
|
||||
|
||||
y : ndarray, dtype=float64_t
|
||||
This is the vector of targets, or true labels, for the samples represented
|
||||
as a Cython memoryview.
|
||||
|
||||
sample_weight : ndarray, dtype=float64_t
|
||||
The weights of the samples, where higher weighted samples are fit
|
||||
closer than lower weight samples. If not provided, all samples
|
||||
are assumed to have uniform weight. This is represented
|
||||
as a Cython memoryview.
|
||||
|
||||
has_missing : bool
|
||||
At least one missing values is in X.
|
||||
"""
|
||||
|
||||
self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
|
||||
cdef intp_t n_samples = X.shape[0]
|
||||
|
||||
# Create a new array which will be used to store nonzero
|
||||
# samples from the feature of interest
|
||||
self.samples = np.empty(n_samples, dtype=np.intp)
|
||||
cdef intp_t[::1] samples = self.samples
|
||||
|
||||
cdef intp_t i, j
|
||||
cdef float64_t weighted_n_samples = 0.0
|
||||
j = 0
|
||||
|
||||
for i in range(n_samples):
|
||||
# Only work with positively weighted samples
|
||||
if sample_weight is None or sample_weight[i] != 0.0:
|
||||
samples[j] = i
|
||||
j += 1
|
||||
|
||||
if sample_weight is not None:
|
||||
weighted_n_samples += sample_weight[i]
|
||||
else:
|
||||
weighted_n_samples += 1.0
|
||||
|
||||
# Number of samples is number of positively weighted samples
|
||||
self.n_samples = j
|
||||
self.weighted_n_samples = weighted_n_samples
|
||||
|
||||
cdef intp_t n_features = X.shape[1]
|
||||
self.features = np.arange(n_features, dtype=np.intp)
|
||||
self.n_features = n_features
|
||||
|
||||
self.feature_values = np.empty(n_samples, dtype=np.float32)
|
||||
self.constant_features = np.empty(n_features, dtype=np.intp)
|
||||
|
||||
self.y = y
|
||||
|
||||
self.sample_weight = sample_weight
|
||||
if missing_values_in_feature_mask is not None:
|
||||
self.criterion.init_sum_missing()
|
||||
return 0
|
||||
|
||||
cdef int node_reset(
|
||||
self,
|
||||
intp_t start,
|
||||
intp_t end,
|
||||
float64_t* weighted_n_node_samples
|
||||
) except -1 nogil:
|
||||
"""Reset splitter on node samples[start:end].
|
||||
|
||||
Returns -1 in case of failure to allocate memory (and raise MemoryError)
|
||||
or 0 otherwise.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
start : intp_t
|
||||
The index of the first sample to consider
|
||||
end : intp_t
|
||||
The index of the last sample to consider
|
||||
weighted_n_node_samples : ndarray, dtype=float64_t pointer
|
||||
The total weight of those samples
|
||||
"""
|
||||
|
||||
self.start = start
|
||||
self.end = end
|
||||
|
||||
self.criterion.init(
|
||||
self.y,
|
||||
self.sample_weight,
|
||||
self.weighted_n_samples,
|
||||
self.samples,
|
||||
start,
|
||||
end
|
||||
)
|
||||
|
||||
weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples
|
||||
return 0
|
||||
|
||||
cdef int node_split(
|
||||
self,
|
||||
ParentInfo* parent_record,
|
||||
SplitRecord* split,
|
||||
) except -1 nogil:
|
||||
|
||||
"""Find the best split on node samples[start:end].
|
||||
|
||||
This is a placeholder method. The majority of computation will be done
|
||||
here.
|
||||
|
||||
It should return -1 upon errors.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
cdef void node_value(self, float64_t* dest) noexcept nogil:
|
||||
"""Copy the value of node samples[start:end] into dest."""
|
||||
|
||||
self.criterion.node_value(dest)
|
||||
|
||||
cdef inline void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil:
|
||||
"""Clip the value in dest between lower_bound and upper_bound for monotonic constraints."""
|
||||
|
||||
self.criterion.clip_node_value(dest, lower_bound, upper_bound)
|
||||
|
||||
cdef float64_t node_impurity(self) noexcept nogil:
|
||||
"""Return the impurity of the current node."""
|
||||
|
||||
return self.criterion.node_impurity()
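

# A minimal NumPy sketch (illustration only; the helper name is hypothetical)
# of the bookkeeping in Splitter.init above: only samples with a nonzero
# weight are kept as split candidates, while the weighted sample count
# accumulates the weights of all samples.
import numpy as np

def _filter_weighted_samples_sketch(sample_weight):
    # Indices of nonzero-weighted samples, in their original order.
    samples = np.flatnonzero(sample_weight != 0.0)
    # Total weight over *all* samples (zero weights contribute nothing).
    weighted_n_samples = float(np.sum(sample_weight))
    return samples, weighted_n_samples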


cdef inline int node_split_best(
    Splitter splitter,
    Partitioner partitioner,
    Criterion criterion,
    SplitRecord* split,
    ParentInfo* parent_record,
) except -1 nogil:
    """Find the best split on node samples[start:end]

    Returns -1 in case of failure to allocate memory (and raise MemoryError)
    or 0 otherwise.
    """
    cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst
    cdef bint with_monotonic_cst = splitter.with_monotonic_cst

    # Find the best split
    cdef intp_t start = splitter.start
    cdef intp_t end = splitter.end
    cdef intp_t end_non_missing
    cdef intp_t n_missing = 0
    cdef bint has_missing = 0
    cdef intp_t n_searches
    cdef intp_t n_left, n_right
    cdef bint missing_go_to_left

    cdef intp_t[::1] samples = splitter.samples
    cdef intp_t[::1] features = splitter.features
    cdef intp_t[::1] constant_features = splitter.constant_features
    cdef intp_t n_features = splitter.n_features

    cdef float32_t[::1] feature_values = splitter.feature_values
    cdef intp_t max_features = splitter.max_features
    cdef intp_t min_samples_leaf = splitter.min_samples_leaf
    cdef float64_t min_weight_leaf = splitter.min_weight_leaf
    cdef uint32_t* random_state = &splitter.rand_r_state

    cdef SplitRecord best_split, current_split
    cdef float64_t current_proxy_improvement = -INFINITY
    cdef float64_t best_proxy_improvement = -INFINITY

    cdef float64_t impurity = parent_record.impurity
    cdef float64_t lower_bound = parent_record.lower_bound
    cdef float64_t upper_bound = parent_record.upper_bound

    cdef intp_t f_i = n_features
    cdef intp_t f_j
    cdef intp_t p
    cdef intp_t p_prev

    cdef intp_t n_visited_features = 0
    # Number of features discovered to be constant during the split search
    cdef intp_t n_found_constants = 0
    # Number of features known to be constant and drawn without replacement
    cdef intp_t n_drawn_constants = 0
    cdef intp_t n_known_constants = parent_record.n_constant_features
    # n_total_constants = n_known_constants + n_found_constants
    cdef intp_t n_total_constants = n_known_constants

    _init_split(&best_split, end)

    partitioner.init_node_split(start, end)

    # Sample up to max_features without replacement using a
    # Fisher-Yates-based algorithm (using the local variables `f_i` and
    # `f_j` to compute a permutation of the `features` array).
    #
    # Skip the CPU intensive evaluation of the impurity criterion for
    # features that were already detected as constant (hence not suitable
    # for good splitting) by ancestor nodes and save the information on
    # newly discovered constant features to spare computation on descendant
    # nodes.
    while (f_i > n_total_constants and  # Stop early if remaining features
                                        # are constant
            (n_visited_features < max_features or
             # At least one drawn feature must be non-constant
             n_visited_features <= n_found_constants + n_drawn_constants)):

        n_visited_features += 1

        # Loop invariant: elements of features in
        # - [:n_drawn_constant[ holds drawn and known constant features;
        # - [n_drawn_constant:n_known_constant[ holds known constant
        #   features that haven't been drawn yet;
        # - [n_known_constant:n_total_constant[ holds newly found constant
        #   features;
        # - [n_total_constant:f_i[ holds features that haven't been drawn
        #   yet and aren't constant apriori.
        # - [f_i:n_features[ holds features that have been drawn
        #   and aren't constant.

        # Draw a feature at random
        f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
                       random_state)

        if f_j < n_known_constants:
            # f_j in the interval [n_drawn_constants, n_known_constants[
            features[n_drawn_constants], features[f_j] = features[f_j], features[n_drawn_constants]

            n_drawn_constants += 1
            continue

        # f_j in the interval [n_known_constants, f_i - n_found_constants[
        f_j += n_found_constants
        # f_j in the interval [n_total_constants, f_i[
        current_split.feature = features[f_j]
        partitioner.sort_samples_and_feature_values(current_split.feature)
        n_missing = partitioner.n_missing
        end_non_missing = end - n_missing

        if (
            # All values for this feature are missing, or
            end_non_missing == start or
            # This feature is considered constant (max - min <= FEATURE_THRESHOLD)
            ((
                feature_values[end_non_missing - 1]
                <= feature_values[start] + FEATURE_THRESHOLD
            ) and n_missing == 0)
        ):
            # We consider this feature constant in this case.
            # Since finding a split on a constant feature is not valuable,
            # we do not consider this feature for splitting.
            features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j]

            n_found_constants += 1
            n_total_constants += 1
            continue

        f_i -= 1
        features[f_i], features[f_j] = features[f_j], features[f_i]
        has_missing = n_missing != 0
        criterion.init_missing(n_missing)  # initialize even when n_missing == 0

        # Evaluate all splits

        # If there are missing values, then we search twice for the most optimal split.
        # The first search will have all the missing values going to the right node.
        # The second search will have all the missing values going to the left node.
        # If there are no missing values, then we search only once for the most
        # optimal split.
        n_searches = 2 if has_missing else 1

        for i in range(n_searches):
            missing_go_to_left = i == 1
            criterion.missing_go_to_left = missing_go_to_left
            criterion.reset()

            p = start

            while p < end_non_missing:
                partitioner.next_p(&p_prev, &p)

                if p >= end_non_missing:
                    continue

                if missing_go_to_left:
                    n_left = p - start + n_missing
                    n_right = end_non_missing - p
                else:
                    n_left = p - start
                    n_right = end_non_missing - p + n_missing

                # Reject if min_samples_leaf is not guaranteed
                if n_left < min_samples_leaf or n_right < min_samples_leaf:
                    continue

                current_split.pos = p
                criterion.update(current_split.pos)

                # Reject if monotonicity constraints are not satisfied
                if (
                    with_monotonic_cst and
                    monotonic_cst[current_split.feature] != 0 and
                    not criterion.check_monotonicity(
                        monotonic_cst[current_split.feature],
                        lower_bound,
                        upper_bound,
                    )
                ):
                    continue

                # Reject if min_weight_leaf is not satisfied
                if ((criterion.weighted_n_left < min_weight_leaf) or
                        (criterion.weighted_n_right < min_weight_leaf)):
                    continue

                current_proxy_improvement = criterion.proxy_impurity_improvement()

                if current_proxy_improvement > best_proxy_improvement:
                    best_proxy_improvement = current_proxy_improvement
                    # sum of halves is used to avoid infinite value
                    current_split.threshold = (
                        feature_values[p_prev] / 2.0 + feature_values[p] / 2.0
                    )

                    if (
                        current_split.threshold == feature_values[p] or
                        current_split.threshold == INFINITY or
                        current_split.threshold == -INFINITY
                    ):
                        current_split.threshold = feature_values[p_prev]

                    current_split.n_missing = n_missing

                    # if there are no missing values in the training data, during
                    # test time, we send missing values to the branch that contains
                    # the most samples during training time.
                    if n_missing == 0:
                        current_split.missing_go_to_left = n_left > n_right
                    else:
                        current_split.missing_go_to_left = missing_go_to_left

                    best_split = current_split  # copy

        # Evaluate when there are missing values and all missing values go
        # to the right node and non-missing values go to the left node.
        if has_missing:
            n_left, n_right = end - start - n_missing, n_missing
            p = end - n_missing
            missing_go_to_left = 0

            if not (n_left < min_samples_leaf or n_right < min_samples_leaf):
                criterion.missing_go_to_left = missing_go_to_left
                criterion.update(p)

                if not ((criterion.weighted_n_left < min_weight_leaf) or
                        (criterion.weighted_n_right < min_weight_leaf)):
                    current_proxy_improvement = criterion.proxy_impurity_improvement()

                    if current_proxy_improvement > best_proxy_improvement:
                        best_proxy_improvement = current_proxy_improvement
                        current_split.threshold = INFINITY
                        current_split.missing_go_to_left = missing_go_to_left
                        current_split.n_missing = n_missing
                        current_split.pos = p
                        best_split = current_split

    # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end]
    if best_split.pos < end:
        partitioner.partition_samples_final(
            best_split.pos,
            best_split.threshold,
            best_split.feature,
            best_split.n_missing
        )
        criterion.init_missing(best_split.n_missing)
        criterion.missing_go_to_left = best_split.missing_go_to_left

        criterion.reset()
        criterion.update(best_split.pos)
        criterion.children_impurity(
            &best_split.impurity_left, &best_split.impurity_right
        )
        best_split.improvement = criterion.impurity_improvement(
            impurity,
            best_split.impurity_left,
            best_split.impurity_right
        )

        shift_missing_values_to_left_if_required(&best_split, samples, end)

    # Respect invariant for constant features: the original order of
    # element in features[:n_known_constants] must be preserved for sibling
    # and child nodes
    memcpy(&features[0], &constant_features[0], sizeof(intp_t) * n_known_constants)

    # Copy newly found constant features
    memcpy(&constant_features[n_known_constants],
           &features[n_known_constants],
           sizeof(intp_t) * n_found_constants)

    # Return values
    parent_record.n_constant_features = n_total_constants
    split[0] = best_split
    return 0
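

# A pure-Python sketch (illustration only; names are hypothetical, and the
# constant-feature bookkeeping is omitted) of the Fisher-Yates-based draw
# used above: features are drawn without replacement by swapping each drawn
# entry to the tail of the array, so at most max_features candidates are
# visited per node.
import random

def _draw_features_sketch(features, max_features, seed=0):
    rng = random.Random(seed)
    features = list(features)
    f_i = len(features)
    drawn = []
    while f_i > 0 and len(drawn) < max_features:
        f_j = rng.randrange(0, f_i)  # draw from the not-yet-drawn prefix
        f_i -= 1
        # Swap the drawn feature to the tail, shrinking the candidate prefix.
        features[f_i], features[f_j] = features[f_j], features[f_i]
        drawn.append(features[f_i])
    return drawn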


cdef inline int node_split_random(
    Splitter splitter,
    Partitioner partitioner,
    Criterion criterion,
    SplitRecord* split,
    ParentInfo* parent_record,
) except -1 nogil:
    """Find the best random split on node samples[start:end]

    Returns -1 in case of failure to allocate memory (and raise MemoryError)
    or 0 otherwise.
    """
    cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst
    cdef bint with_monotonic_cst = splitter.with_monotonic_cst

    # Draw random splits and pick the best
    cdef intp_t start = splitter.start
    cdef intp_t end = splitter.end
    cdef intp_t end_non_missing
    cdef intp_t n_missing = 0
    cdef bint has_missing = 0
    cdef intp_t n_left, n_right
    cdef bint missing_go_to_left

    cdef intp_t[::1] samples = splitter.samples
    cdef intp_t[::1] features = splitter.features
    cdef intp_t[::1] constant_features = splitter.constant_features
    cdef intp_t n_features = splitter.n_features

    cdef intp_t max_features = splitter.max_features
    cdef intp_t min_samples_leaf = splitter.min_samples_leaf
    cdef float64_t min_weight_leaf = splitter.min_weight_leaf
    cdef uint32_t* random_state = &splitter.rand_r_state

    cdef SplitRecord best_split, current_split
    cdef float64_t current_proxy_improvement = -INFINITY
    cdef float64_t best_proxy_improvement = -INFINITY

    cdef float64_t impurity = parent_record.impurity
    cdef float64_t lower_bound = parent_record.lower_bound
    cdef float64_t upper_bound = parent_record.upper_bound

    cdef intp_t f_i = n_features
    cdef intp_t f_j
    # Number of features discovered to be constant during the split search
    cdef intp_t n_found_constants = 0
    # Number of features known to be constant and drawn without replacement
    cdef intp_t n_drawn_constants = 0
    cdef intp_t n_known_constants = parent_record.n_constant_features
    # n_total_constants = n_known_constants + n_found_constants
    cdef intp_t n_total_constants = n_known_constants
    cdef intp_t n_visited_features = 0
    cdef float32_t min_feature_value
    cdef float32_t max_feature_value

    _init_split(&best_split, end)

    partitioner.init_node_split(start, end)

    # Sample up to max_features without replacement using a
    # Fisher-Yates-based algorithm (using the local variables `f_i` and
    # `f_j` to compute a permutation of the `features` array).
    #
    # Skip the CPU intensive evaluation of the impurity criterion for
    # features that were already detected as constant (hence not suitable
    # for good splitting) by ancestor nodes and save the information on
    # newly discovered constant features to spare computation on descendant
    # nodes.
    while (f_i > n_total_constants and  # Stop early if remaining features
                                        # are constant
            (n_visited_features < max_features or
             # At least one drawn feature must be non-constant
             n_visited_features <= n_found_constants + n_drawn_constants)):
        n_visited_features += 1

        # Loop invariant: elements of features in
        # - [:n_drawn_constant[ holds drawn and known constant features;
        # - [n_drawn_constant:n_known_constant[ holds known constant
        #   features that haven't been drawn yet;
        # - [n_known_constant:n_total_constant[ holds newly found constant
        #   features;
        # - [n_total_constant:f_i[ holds features that haven't been drawn
        #   yet and aren't constant apriori.
        # - [f_i:n_features[ holds features that have been drawn
        #   and aren't constant.

        # Draw a feature at random
        f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
                       random_state)

        if f_j < n_known_constants:
            # f_j in the interval [n_drawn_constants, n_known_constants[
            features[n_drawn_constants], features[f_j] = features[f_j], features[n_drawn_constants]
            n_drawn_constants += 1
            continue

        # f_j in the interval [n_known_constants, f_i - n_found_constants[
        f_j += n_found_constants
        # f_j in the interval [n_total_constants, f_i[

        current_split.feature = features[f_j]

        # Find min, max as we will randomly select a threshold between them
        partitioner.find_min_max(
            current_split.feature, &min_feature_value, &max_feature_value
        )
        n_missing = partitioner.n_missing
        end_non_missing = end - n_missing

        if (
            # All values for this feature are missing, or
            end_non_missing == start or
            # This feature is considered constant (max - min <= FEATURE_THRESHOLD)
            (max_feature_value <= min_feature_value + FEATURE_THRESHOLD and n_missing == 0)
        ):
            # We consider this feature constant in this case.
            # Since finding a split with a constant feature is not valuable,
            # we do not consider this feature for splitting.
            features[f_j], features[n_total_constants] = features[n_total_constants], current_split.feature

            n_found_constants += 1
            n_total_constants += 1
            continue

        f_i -= 1
        features[f_i], features[f_j] = features[f_j], features[f_i]
        has_missing = n_missing != 0
        criterion.init_missing(n_missing)

        # Draw a random threshold
        current_split.threshold = rand_uniform(
            min_feature_value,
            max_feature_value,
            random_state,
        )

        if has_missing:
            # If there are missing values, then we randomly make all missing
            # values go to the right or left.
            #
            # Note: compared to the BestSplitter, we do not evaluate the
            # edge case where all the missing values go to the right node
            # and the non-missing values go to the left node. This is because
            # this would indicate a threshold outside of the observed range
            # of the feature. However, it is not clear how much probability weight should
            # be given to this edge case.
            missing_go_to_left = rand_int(0, 2, random_state)
        else:
            missing_go_to_left = 0
        criterion.missing_go_to_left = missing_go_to_left

        if current_split.threshold == max_feature_value:
            current_split.threshold = min_feature_value

        # Partition
        current_split.pos = partitioner.partition_samples(
            current_split.threshold
        )

        if missing_go_to_left:
            n_left = current_split.pos - start + n_missing
            n_right = end_non_missing - current_split.pos
        else:
            n_left = current_split.pos - start
            n_right = end_non_missing - current_split.pos + n_missing

        # Reject if min_samples_leaf is not guaranteed
        if n_left < min_samples_leaf or n_right < min_samples_leaf:
            continue

        # Evaluate split
        # At this point, the criterion has a view into the samples that were
        # partitioned by the partitioner. The criterion will use that
        # partition to evaluate the split.
        criterion.reset()
        criterion.update(current_split.pos)

        # Reject if min_weight_leaf is not satisfied
        if ((criterion.weighted_n_left < min_weight_leaf) or
                (criterion.weighted_n_right < min_weight_leaf)):
            continue

        # Reject if monotonicity constraints are not satisfied
        if (
            with_monotonic_cst and
            monotonic_cst[current_split.feature] != 0 and
            not criterion.check_monotonicity(
                monotonic_cst[current_split.feature],
                lower_bound,
                upper_bound,
            )
        ):
            continue

        current_proxy_improvement = criterion.proxy_impurity_improvement()

        if current_proxy_improvement > best_proxy_improvement:
            current_split.n_missing = n_missing

            # if there are no missing values in the training data, during
            # test time, we send missing values to the branch that contains
            # the most samples during training time.
            if has_missing:
                current_split.missing_go_to_left = missing_go_to_left
            else:
                current_split.missing_go_to_left = n_left > n_right

            best_proxy_improvement = current_proxy_improvement
            best_split = current_split  # copy

    # Reorganize into samples[start:best.pos] + samples[best.pos:end]
    if best_split.pos < end:
        if current_split.feature != best_split.feature:
            partitioner.partition_samples_final(
                best_split.pos,
                best_split.threshold,
                best_split.feature,
                best_split.n_missing
            )
        criterion.init_missing(best_split.n_missing)
        criterion.missing_go_to_left = best_split.missing_go_to_left

        criterion.reset()
        criterion.update(best_split.pos)
        criterion.children_impurity(
            &best_split.impurity_left, &best_split.impurity_right
        )
        best_split.improvement = criterion.impurity_improvement(
            impurity,
            best_split.impurity_left,
            best_split.impurity_right
        )

        shift_missing_values_to_left_if_required(&best_split, samples, end)

    # Respect invariant for constant features: the original order of
    # element in features[:n_known_constants] must be preserved for sibling
    # and child nodes
    memcpy(&features[0], &constant_features[0], sizeof(intp_t) * n_known_constants)

    # Copy newly found constant features
    memcpy(&constant_features[n_known_constants],
           &features[n_known_constants],
           sizeof(intp_t) * n_found_constants)

    # Return values
    parent_record.n_constant_features = n_total_constants
    split[0] = best_split
    return 0
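

# A short Python sketch (illustration only; the helper name is hypothetical)
# of the random-threshold rule implemented above: a threshold is drawn
# uniformly from [min, max) of the observed feature values and snapped back
# to the minimum if floating-point rounding ever lands it exactly on the
# maximum, so the right partition can never be empty.
import random

def _random_threshold_sketch(feature_values, seed=0):
    rng = random.Random(seed)
    lo, hi = min(feature_values), max(feature_values)
    threshold = lo + (hi - lo) * rng.random()  # uniform in [lo, hi)
    if threshold == hi:                        # guard against rounding
        threshold = lo
    return threshold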


cdef class BestSplitter(Splitter):
    """Splitter for finding the best split on dense data."""
    cdef DensePartitioner partitioner

    cdef int init(
        self,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
        const uint8_t[::1] missing_values_in_feature_mask,
    ) except -1:
        Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
        self.partitioner = DensePartitioner(
            X, self.samples, self.feature_values, missing_values_in_feature_mask
        )

    cdef int node_split(
        self,
        ParentInfo* parent_record,
        SplitRecord* split,
    ) except -1 nogil:
        return node_split_best(
            self,
            self.partitioner,
            self.criterion,
            split,
            parent_record,
        )


cdef class BestSparseSplitter(Splitter):
    """Splitter for finding the best split, using the sparse data."""
    cdef SparsePartitioner partitioner

    cdef int init(
        self,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
        const uint8_t[::1] missing_values_in_feature_mask,
    ) except -1:
        Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
        self.partitioner = SparsePartitioner(
            X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask
        )

    cdef int node_split(
        self,
        ParentInfo* parent_record,
        SplitRecord* split,
    ) except -1 nogil:
        return node_split_best(
            self,
            self.partitioner,
            self.criterion,
            split,
            parent_record,
        )


cdef class RandomSplitter(Splitter):
    """Splitter for finding the best random split on dense data."""
    cdef DensePartitioner partitioner

    cdef int init(
        self,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
        const uint8_t[::1] missing_values_in_feature_mask,
    ) except -1:
        Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
        self.partitioner = DensePartitioner(
            X, self.samples, self.feature_values, missing_values_in_feature_mask
        )

    cdef int node_split(
        self,
        ParentInfo* parent_record,
        SplitRecord* split,
    ) except -1 nogil:
        return node_split_random(
            self,
            self.partitioner,
            self.criterion,
            split,
            parent_record,
        )


cdef class RandomSparseSplitter(Splitter):
    """Splitter for finding the best random split, using the sparse data."""
    cdef SparsePartitioner partitioner

    cdef int init(
        self,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
        const uint8_t[::1] missing_values_in_feature_mask,
    ) except -1:
        Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
        self.partitioner = SparsePartitioner(
            X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask
        )

    cdef int node_split(
        self,
        ParentInfo* parent_record,
        SplitRecord* split,
    ) except -1 nogil:
        return node_split_random(
            self,
            self.partitioner,
            self.criterion,
            split,
            parent_record,
        )
Binary file not shown.
@@ -0,0 +1,133 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# See _tree.pyx for details.

import numpy as np
cimport numpy as cnp

from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t, uint32_t

from sklearn.tree._splitter cimport Splitter
from sklearn.tree._splitter cimport SplitRecord


cdef struct Node:
    # Base storage structure for the nodes in a Tree object

    intp_t left_child                    # id of the left child of the node
    intp_t right_child                   # id of the right child of the node
    intp_t feature                       # Feature used for splitting the node
    float64_t threshold                  # Threshold value at the node
    float64_t impurity                   # Impurity of the node (i.e., the value of the criterion)
    intp_t n_node_samples                # Number of samples at the node
    float64_t weighted_n_node_samples    # Weighted number of samples at the node
    uint8_t missing_go_to_left           # Whether missing values go to the left child at prediction time


cdef struct ParentInfo:
    # Structure to store information about the parent of a node
    # This is passed to the splitter, to provide information about the previous split

    float64_t lower_bound           # the lower bound of the parent's impurity
    float64_t upper_bound           # the upper bound of the parent's impurity
    float64_t impurity              # the impurity of the parent
    intp_t n_constant_features      # the number of constant features found in parent


cdef class Tree:
    # The Tree object is a binary tree structure constructed by the
    # TreeBuilder. The tree structure is used for predictions and
    # feature importances.

    # Input/Output layout
    cdef public intp_t n_features        # Number of features in X
    cdef intp_t* n_classes               # Number of classes in y[:, k]
    cdef public intp_t n_outputs         # Number of outputs in y
    cdef public intp_t max_n_classes     # max(n_classes)

    # Inner structures: values are stored separately from node structure,
    # since size is determined at runtime.
    cdef public intp_t max_depth         # Max depth of the tree
    cdef public intp_t node_count        # Counter for node IDs
    cdef public intp_t capacity          # Capacity of tree, in terms of nodes
    cdef Node* nodes                     # Array of nodes
    cdef float64_t* value                # (capacity, n_outputs, max_n_classes) array of values
    cdef intp_t value_stride             # = n_outputs * max_n_classes

    # Methods
    cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf,
                          intp_t feature, float64_t threshold, float64_t impurity,
                          intp_t n_node_samples,
                          float64_t weighted_n_node_samples,
                          uint8_t missing_go_to_left) except -1 nogil
    cdef int _resize(self, intp_t capacity) except -1 nogil
    cdef int _resize_c(self, intp_t capacity=*) except -1 nogil

    cdef cnp.ndarray _get_value_ndarray(self)
    cdef cnp.ndarray _get_node_ndarray(self)

    cpdef cnp.ndarray predict(self, object X)

    cpdef cnp.ndarray apply(self, object X)
    cdef cnp.ndarray _apply_dense(self, object X)
    cdef cnp.ndarray _apply_sparse_csr(self, object X)

    cpdef object decision_path(self, object X)
    cdef object _decision_path_dense(self, object X)
    cdef object _decision_path_sparse_csr(self, object X)

    cpdef compute_node_depths(self)
    cpdef compute_feature_importances(self, normalize=*)


# =============================================================================
# Tree builder
# =============================================================================

cdef class TreeBuilder:
    # The TreeBuilder recursively builds a Tree object from training samples,
    # using a Splitter object for splitting internal nodes and assigning
    # values to leaves.
    #
    # This class controls the various stopping criteria and the node splitting
    # evaluation order, e.g. depth-first or best-first.

    cdef Splitter splitter                # Splitting algorithm

    cdef intp_t min_samples_split         # Minimum number of samples in an internal node
    cdef intp_t min_samples_leaf          # Minimum number of samples in a leaf
    cdef float64_t min_weight_leaf        # Minimum weight in a leaf
    cdef intp_t max_depth                 # Maximal tree depth
    cdef float64_t min_impurity_decrease  # Impurity threshold for early stopping

    cpdef build(
        self,
        Tree tree,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight=*,
        const uint8_t[::1] missing_values_in_feature_mask=*,
    )

    cdef _check_input(
        self,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
    )


# =============================================================================
# Tree pruning
# =============================================================================

# The private function allows any external caller to prune the tree and return
# a new tree with the pruned nodes. The pruned tree is a new tree object.
#
# .. warning:: this function is not backwards compatible and may change without
#              notice.
cdef void _build_pruned_tree(
    Tree tree,  # OUT
    Tree orig_tree,
    const uint8_t[:] leaves_in_subtree,
    intp_t capacity
)
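

# Illustrative NumPy sketch (not part of the sources; the dtype name is
# hypothetical): the flat Node array declared above maps naturally onto a
# structured dtype, which is how a fitted tree is typically inspected from
# Python (clf.tree_.children_left, clf.tree_.threshold, ...). Field order
# follows the struct above.
import numpy as np

NODE_DTYPE_SKETCH = np.dtype({
    "names": ["left_child", "right_child", "feature", "threshold",
              "impurity", "n_node_samples", "weighted_n_node_samples",
              "missing_go_to_left"],
    "formats": [np.intp, np.intp, np.intp, np.float64,
                np.float64, np.intp, np.float64, np.uint8],
})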
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,70 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# See _utils.pyx for details.

cimport numpy as cnp
from sklearn.tree._tree cimport Node
from sklearn.neighbors._quad_tree cimport Cell
from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t


cdef enum:
    # Max value for our rand_r replacement (near the bottom).
    # We don't use RAND_MAX because it's different across platforms and
    # particularly tiny on Windows/MSVC.
    # It corresponds to the maximum representable value for
    # 32-bit signed integers (i.e. 2^31 - 1).
    RAND_R_MAX = 2147483647


# safe_realloc(&p, n) resizes the allocation of p to n * sizeof(*p) bytes or
# raises a MemoryError. It never calls free, since that's __dealloc__'s job.
#   cdef float32_t *p = NULL
#   safe_realloc(&p, n)
# is equivalent to p = malloc(n * sizeof(*p)) with error checking.
ctypedef fused realloc_ptr:
    # Add pointer types here as needed.
    (float32_t*)
    (intp_t*)
    (uint8_t*)
    (float64_t*)
    (float64_t**)
    (Node*)
    (Cell*)
    (Node**)

cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil


cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size)


cdef intp_t rand_int(intp_t low, intp_t high,
                     uint32_t* random_state) noexcept nogil


cdef float64_t rand_uniform(float64_t low, float64_t high,
                            uint32_t* random_state) noexcept nogil


cdef float64_t log(float64_t x) noexcept nogil


cdef class WeightedFenwickTree:
    cdef intp_t size            # number of leaves (ranks)
    cdef float64_t* tree_w      # BIT for weights
    cdef float64_t* tree_wy     # BIT for weighted targets
    cdef intp_t max_pow2        # highest power of two <= n
    cdef float64_t total_w      # running total weight
    cdef float64_t total_wy     # running total weighted target

    cdef void reset(self, intp_t size) noexcept nogil
    cdef void add(self, intp_t idx, float64_t y, float64_t w) noexcept nogil
    cdef intp_t search(
        self,
        float64_t t,
        float64_t* cw_out,
        float64_t* cwy_out,
        intp_t* prev_idx_out,
    ) noexcept nogil
@@ -0,0 +1,291 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from libc.stdlib cimport free
from libc.stdlib cimport realloc
from libc.math cimport log as ln
from libc.math cimport isnan
from libc.string cimport memset

import numpy as np
cimport numpy as cnp
cnp.import_array()

from sklearn.utils._random cimport our_rand_r

# =============================================================================
# Helper functions
# =============================================================================

cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil:
    # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython
    # 0.20.1 to crash.
    cdef size_t nbytes = nelems * sizeof(p[0][0])
    if nbytes / sizeof(p[0][0]) != nelems:
        # Overflow in the multiplication
        raise MemoryError(f"could not allocate ({nelems} * {sizeof(p[0][0])}) bytes")

    cdef realloc_ptr tmp = <realloc_ptr>realloc(p[0], nbytes)
    if tmp == NULL:
        raise MemoryError(f"could not allocate {nbytes} bytes")

    p[0] = tmp
    return 0
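

# Worked example (illustration only) of the overflow guard above: with a
# 64-bit size_t, nelems = 2**63 and an 8-byte element give
# nbytes = (2**63 * 8) mod 2**64 = 0, and 0 / 8 != 2**63, so the wrap-around
# is caught and a MemoryError is raised before realloc ever sees the bogus
# size.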


def _realloc_test():
    # Helper for tests. Tries to allocate <size_t>(-1) / 2 * sizeof(size_t)
    # bytes, which will always overflow.
    cdef intp_t* p = NULL
    safe_realloc(&p, <size_t>(-1) / 2)
    if p != NULL:
        free(p)
        assert False


cdef inline cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size):
    """Return copied data as 1D numpy array of intp's."""
    cdef cnp.npy_intp shape[1]
    shape[0] = <cnp.npy_intp> size
    return cnp.PyArray_SimpleNewFromData(1, shape, cnp.NPY_INTP, data).copy()


cdef inline intp_t rand_int(intp_t low, intp_t high,
                            uint32_t* random_state) noexcept nogil:
    """Generate a random integer in [low; high)."""
    return low + our_rand_r(random_state) % (high - low)


cdef inline float64_t rand_uniform(float64_t low, float64_t high,
                                   uint32_t* random_state) noexcept nogil:
    """Generate a random float64_t in [low; high)."""
    return ((high - low) * <float64_t> our_rand_r(random_state) /
            <float64_t> RAND_R_MAX) + low


cdef inline float64_t log(float64_t x) noexcept nogil:
    return ln(x) / ln(2.0)


def _any_isnan_axis0(const float32_t[:, :] X):
    """Same as np.any(np.isnan(X), axis=0)"""
    cdef:
        intp_t i, j
        intp_t n_samples = X.shape[0]
        intp_t n_features = X.shape[1]
        uint8_t[::1] isnan_out = np.zeros(X.shape[1], dtype=np.bool_)

    with nogil:
        for i in range(n_samples):
            for j in range(n_features):
                if isnan_out[j]:
                    continue
                if isnan(X[i, j]):
                    isnan_out[j] = True
    return np.asarray(isnan_out)
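

# Equivalence sketch (illustration only; the demo array is hypothetical):
# the helper above mirrors the NumPy one-liner in its docstring, with the
# explicit memoryview loop existing so the scan can run without the GIL and
# skip columns already known to contain NaN.
import numpy as np

X_demo = np.array([[0.0, np.nan], [1.0, 2.0]], dtype=np.float32)
assert np.array_equal(_any_isnan_axis0(X_demo), np.any(np.isnan(X_demo), axis=0))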


cdef class WeightedFenwickTree:
    """
    Fenwick tree (Binary Indexed Tree) specialized for maintaining:
      - prefix sums of weights
      - prefix sums of weight * target (y)

    Notes:
    - Implementation uses 1-based indexing internally for the Fenwick tree
      arrays, hence the +1 sized buffers. 1-based indexing is customary for
      this data structure and makes some of the index handling slightly more
      efficient and natural.
    - Memory ownership: this class allocates and frees the underlying C buffers.
    - Typical operations:
        add(rank, y, w)  -> O(log n)
        search(t)        -> O(log n), finds the smallest rank with
                            cumulative weight > t (see search for details).
    """

    def __cinit__(self, intp_t capacity):
        self.tree_w = NULL
        self.tree_wy = NULL

        # Allocate arrays of length (capacity + 1) because indices are 1-based.
        safe_realloc(&self.tree_w, capacity + 1)
        safe_realloc(&self.tree_wy, capacity + 1)

    cdef void reset(self, intp_t size) noexcept nogil:
        """
        Reset the tree to hold 'size' elements and clear all aggregates.
        """
        cdef intp_t p
        cdef intp_t n_bytes = (size + 1) * sizeof(float64_t)  # +1 for 1-based storage

        # Public size and zeroed aggregates.
        self.size = size
        memset(self.tree_w, 0, n_bytes)
        memset(self.tree_wy, 0, n_bytes)
        self.total_w = 0.0
        self.total_wy = 0.0

        # highest power of two <= size
        p = 1
        while p <= size:
            p <<= 1
        self.max_pow2 = p >> 1

    def __dealloc__(self):
        if self.tree_w != NULL:
            free(self.tree_w)
        if self.tree_wy != NULL:
            free(self.tree_wy)

    cdef void add(self, intp_t idx, float64_t y_value, float64_t weight) noexcept nogil:
        """
        Add a weighted observation to the Fenwick tree.

        Parameters
        ----------
        idx : intp_t
            The 0-based index where to add the observation
        y_value : float64_t
            The target value (y) of the observation
        weight : float64_t
            The sample weight

        Notes
        -----
        Updates both weight sums and weighted target sums in O(log n) time.
        """
        cdef float64_t weighted_y = weight * y_value
        cdef intp_t fenwick_idx = idx + 1  # Convert to 1-based indexing

        # Update Fenwick tree nodes by traversing up the tree
        while fenwick_idx <= self.size:
            self.tree_w[fenwick_idx] += weight
            self.tree_wy[fenwick_idx] += weighted_y
            # Move to next node using bit manipulation: add lowest set bit
            fenwick_idx += fenwick_idx & -fenwick_idx

        # Update global totals
        self.total_w += weight
        self.total_wy += weighted_y

    cdef intp_t search(
        self,
        float64_t target_weight,
        float64_t* cumul_weight_out,
        float64_t* cumul_weighted_y_out,
        intp_t* prev_idx_out,
    ) noexcept nogil:
        """
        Binary search to find the position where cumulative weight reaches target.

        This method performs a binary search on the Fenwick tree to find indices
        such that the cumulative weight at 'prev_idx' is < target_weight and
        the cumulative weight at the returned index is >= target_weight.

        Parameters
        ----------
        target_weight : float64_t
            The target cumulative weight to search for
        cumul_weight_out : float64_t*
            Output pointer for cumulative weight up to returned index (exclusive)
        cumul_weighted_y_out : float64_t*
            Output pointer for cumulative weighted y-sum up to returned index (exclusive)
        prev_idx_out : intp_t*
            Output pointer for the previous index (largest index with cumul_weight < target)

        Returns
        -------
        intp_t
            The index where cumulative weight first reaches or exceeds target_weight

        Notes
        -----
        - O(log n) complexity
        - Ignores nodes with zero weights (corresponding to uninserted y-values)
        - Assumes at least one active (positive-weight) item exists
        - Assumes 0 <= target_weight <= total_weight
        """
        cdef:
            intp_t current_idx = 0
            intp_t next_idx, prev_idx, equal_bit
            float64_t cumul_weight = 0.0
            float64_t cumul_weighted_y = 0.0
            intp_t search_bit = self.max_pow2  # Start from highest power of 2
            float64_t node_weight, equal_target

        # Phase 1: Standard Fenwick binary search with prefix accumulation
        # Traverse down the tree, moving right when we can consume more weight
        while search_bit != 0:
            next_idx = current_idx + search_bit
            if next_idx <= self.size:
                node_weight = self.tree_w[next_idx]
                if target_weight == node_weight:
                    # Exact match found - store state for later processing
                    equal_target = target_weight
                    equal_bit = search_bit
                    break
                elif target_weight > node_weight:
                    # We can consume this node's weight - move right and accumulate
                    target_weight -= node_weight
                    current_idx = next_idx
                    cumul_weight += node_weight
                    cumul_weighted_y += self.tree_wy[next_idx]
            search_bit >>= 1

        # If no exact match, we're done with standard search
        if search_bit == 0:
            cumul_weight_out[0] = cumul_weight
            cumul_weighted_y_out[0] = cumul_weighted_y
            prev_idx_out[0] = current_idx
            return current_idx

        # Phase 2: Handle exact match case - find prev_idx
        # Search for the largest index with cumulative weight < original target
        prev_idx = current_idx
        while search_bit != 0:
            next_idx = prev_idx + search_bit
            if next_idx <= self.size:
                node_weight = self.tree_w[next_idx]
                if target_weight > node_weight:
                    target_weight -= node_weight
                    prev_idx = next_idx
            search_bit >>= 1

        # Phase 3: Complete the exact match search
        # Restore state and search for the largest index with
        # cumulative weight <= original target (and in this case, we know we have ==)
        search_bit = equal_bit
        target_weight = equal_target
        while search_bit != 0:
            next_idx = current_idx + search_bit
            if next_idx <= self.size:
                node_weight = self.tree_w[next_idx]
                if target_weight >= node_weight:
                    target_weight -= node_weight
                    current_idx = next_idx
                    cumul_weight += node_weight
                    cumul_weighted_y += self.tree_wy[next_idx]
            search_bit >>= 1

        # Output results
        cumul_weight_out[0] = cumul_weight
        cumul_weighted_y_out[0] = cumul_weighted_y
        prev_idx_out[0] = prev_idx
        return current_idx
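

# A compact pure-Python reference (illustration only; the class is
# hypothetical) for the two Fenwick operations above: point update in
# O(log n) and a prefix "descend" that finds the first rank whose cumulative
# weight reaches a target. Useful for checking the Cython implementation
# against straightforward code.
class _FenwickSketch:
    def __init__(self, n):
        self.n = n
        self.tree = [0.0] * (n + 1)  # 1-based storage

    def add(self, idx, w):
        i = idx + 1
        while i <= self.n:
            self.tree[i] += w
            i += i & (-i)  # climb to the next covering node

    def search(self, target):
        # Return the first 0-based rank where the prefix weight >= target.
        idx, bit = 0, 1
        while bit * 2 <= self.n:
            bit *= 2  # highest power of two <= n, as in max_pow2 above
        while bit:
            nxt = idx + bit
            if nxt <= self.n and target > self.tree[nxt]:
                target -= self.tree[nxt]  # consume this subtree's weight
                idx = nxt
            bit >>= 1
        return idx  # 0-based rank of the element that crosses the target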


cdef class PytestWeightedFenwickTree(WeightedFenwickTree):
    """Used for testing only"""

    def py_reset(self, intp_t n):
        self.reset(n)

    def py_add(self, intp_t idx, float64_t y, float64_t w):
        self.add(idx, y, w)

    def py_search(self, float64_t t):
        cdef float64_t w, wy
        cdef intp_t prev_idx
        idx = self.search(t, &w, &wy, &prev_idx)
        return prev_idx, idx, w, wy
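

# Usage sketch for the test helper above (illustration only; the inserted
# values are hypothetical): insert three weighted targets and locate the
# position where the cumulative weight crosses a target. `py_search` returns
# the previous rank, the crossing rank and the cumulative (weight, weight*y)
# sums strictly before the crossing rank.
#
#     tree = PytestWeightedFenwickTree(8)
#     tree.py_reset(8)
#     for rank, (y, w) in enumerate([(0.5, 1.0), (1.5, 2.0), (9.0, 1.0)]):
#         tree.py_add(rank, y, w)
#     prev_idx, idx, w_before, wy_before = tree.py_search(2.0)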
@@ -0,0 +1,28 @@
tree_extension_metadata = {
  '_tree':
    {'sources': [cython_gen_cpp.process('_tree.pyx')],
     'override_options': ['optimization=3']},
  '_splitter':
    {'sources': [cython_gen.process('_splitter.pyx')],
     'override_options': ['optimization=3']},
  '_partitioner':
    {'sources': [cython_gen.process('_partitioner.pyx')],
     'override_options': ['optimization=3']},
  '_criterion':
    {'sources': [cython_gen.process('_criterion.pyx')],
     'override_options': ['optimization=3']},
  '_utils':
    {'sources': [cython_gen.process('_utils.pyx')],
     'override_options': ['optimization=3']},
}

foreach ext_name, ext_dict : tree_extension_metadata
  py.extension_module(
    ext_name,
    [ext_dict.get('sources'), utils_cython_tree],
    dependencies: [np_dep],
    override_options : ext_dict.get('override_options', []),
    subdir: 'sklearn/tree',
    install: true
  )
endforeach
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,635 @@
"""
Testing for export functions of decision trees (sklearn.tree.export).
"""

from io import StringIO
from re import finditer, search
from textwrap import dedent

import numpy as np
import pytest
from numpy.random import RandomState

from sklearn.base import is_classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor,
    export_graphviz,
    export_text,
    plot_tree,
)

# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
y2 = [[-1, 1], [-1, 1], [-1, 1], [1, 2], [1, 2], [1, 3]]
w = [1, 1, 1, 0.5, 0.5, 0.5]
y_degraded = [1, 1, 1, 1, 1, 1]


def test_graphviz_toy():
    # Check correctness of export_graphviz
    clf = DecisionTreeClassifier(
        max_depth=3, min_samples_split=2, criterion="gini", random_state=2
    )
    clf.fit(X, y)

    # Test export code
    contents1 = export_graphviz(clf, out_file=None)
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
        'value = [3, 3]"] ;\n'
        '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )
    assert contents1 == contents2

    # Test with feature_names
    contents1 = export_graphviz(
        clf, feature_names=["feature0", "feature1"], out_file=None
    )
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
        'value = [3, 3]"] ;\n'
        '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )

    assert contents1 == contents2

    # Test with feature_names (escaped)
    contents1 = export_graphviz(
        clf, feature_names=['feature"0"', 'feature"1"'], out_file=None
    )
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="feature\\"0\\" <= 0.0\\n'
        "gini = 0.5\\nsamples = 6\\n"
        'value = [3, 3]"] ;\n'
        '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )

    assert contents1 == contents2

    # Test with class_names
    contents1 = export_graphviz(clf, class_names=["yes", "no"], out_file=None)
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
        'value = [3, 3]\\nclass = yes"] ;\n'
        '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n'
        'class = yes"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n'
        'class = no"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )

    assert contents1 == contents2

    # Test with class_names (escaped)
    contents1 = export_graphviz(clf, class_names=['"yes"', '"no"'], out_file=None)
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
        'value = [3, 3]\\nclass = \\"yes\\""] ;\n'
        '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n'
        'class = \\"yes\\""] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n'
        'class = \\"no\\""] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )

    assert contents1 == contents2

    # Test plot_options
    contents1 = export_graphviz(
        clf,
        filled=True,
        impurity=False,
        proportion=True,
        special_characters=True,
        rounded=True,
        out_file=None,
        fontname="sans",
    )
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, style="filled, rounded", color="black", '
        'fontname="sans"] ;\n'
        'edge [fontname="sans"] ;\n'
        "0 [label=<x<SUB>0</SUB> &le; 0.0<br/>samples = 100.0%<br/>"
        'value = [0.5, 0.5]>, fillcolor="#ffffff"] ;\n'
        "1 [label=<samples = 50.0%<br/>value = [1.0, 0.0]>, "
        'fillcolor="#e58139"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        "2 [label=<samples = 50.0%<br/>value = [0.0, 1.0]>, "
        'fillcolor="#399de5"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )

    assert contents1 == contents2

    # Test max_depth
    contents1 = export_graphviz(clf, max_depth=0, class_names=True, out_file=None)
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
        'value = [3, 3]\\nclass = y[0]"] ;\n'
        '1 [label="(...)"] ;\n'
        "0 -> 1 ;\n"
        '2 [label="(...)"] ;\n'
        "0 -> 2 ;\n"
        "}"
    )

    assert contents1 == contents2

    # Test max_depth with plot_options
    contents1 = export_graphviz(
        clf, max_depth=0, filled=True, out_file=None, node_ids=True
    )
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, style="filled", color="black", '
        'fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="node #0\\nx[0] <= 0.0\\ngini = 0.5\\n'
        'samples = 6\\nvalue = [3, 3]", fillcolor="#ffffff"] ;\n'
        '1 [label="(...)", fillcolor="#C0C0C0"] ;\n'
        "0 -> 1 ;\n"
        '2 [label="(...)", fillcolor="#C0C0C0"] ;\n'
        "0 -> 2 ;\n"
        "}"
    )

    assert contents1 == contents2

    # Test multi-output with weighted samples
    clf = DecisionTreeClassifier(
        max_depth=2, min_samples_split=2, criterion="gini", random_state=2
    )
    clf = clf.fit(X, y2, sample_weight=w)

    contents1 = export_graphviz(clf, filled=True, impurity=False, out_file=None)
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, style="filled", color="black", '
        'fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="x[0] <= 0.0\\nsamples = 6\\n'
        "value = [[3.0, 1.5, 0.0]\\n"
        '[3.0, 1.0, 0.5]]", fillcolor="#ffffff"] ;\n'
        '1 [label="samples = 3\\nvalue = [[3, 0, 0]\\n'
        '[3, 0, 0]]", fillcolor="#e58139"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="x[0] <= 1.5\\nsamples = 3\\n'
        "value = [[0.0, 1.5, 0.0]\\n"
        '[0.0, 1.0, 0.5]]", fillcolor="#f1bd97"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        '3 [label="samples = 2\\nvalue = [[0, 1, 0]\\n'
        '[0, 1, 0]]", fillcolor="#e58139"] ;\n'
        "2 -> 3 ;\n"
        '4 [label="samples = 1\\nvalue = [[0.0, 0.5, 0.0]\\n'
        '[0.0, 0.0, 0.5]]", fillcolor="#e58139"] ;\n'
        "2 -> 4 ;\n"
        "}"
    )

    assert contents1 == contents2

    # Test regression output with plot_options
    clf = DecisionTreeRegressor(
        max_depth=3, min_samples_split=2, criterion="squared_error", random_state=2
    )
    clf.fit(X, y)

    contents1 = export_graphviz(
        clf,
        filled=True,
        leaves_parallel=True,
        out_file=None,
        rotate=True,
        rounded=True,
        fontname="sans",
    )
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, style="filled, rounded", color="black", '
        'fontname="sans"] ;\n'
        "graph [ranksep=equally, splines=polyline] ;\n"
        'edge [fontname="sans"] ;\n'
        "rankdir=LR ;\n"
        '0 [label="x[0] <= 0.0\\nsquared_error = 1.0\\nsamples = 6\\n'
        'value = 0.0", fillcolor="#f2c09c"] ;\n'
        '1 [label="squared_error = 0.0\\nsamples = 3\\nvalue = -1.0", '
        'fillcolor="#ffffff"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=-45, "
        'headlabel="True"] ;\n'
        '2 [label="squared_error = 0.0\\nsamples = 3\\nvalue = 1.0", '
        'fillcolor="#e58139"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=45, "
        'headlabel="False"] ;\n'
        "{rank=same ; 0} ;\n"
        "{rank=same ; 1; 2} ;\n"
        "}"
    )

    assert contents1 == contents2

    # Test classifier with degraded learning set
    clf = DecisionTreeClassifier(max_depth=3)
    clf.fit(X, y_degraded)

    contents1 = export_graphviz(clf, filled=True, out_file=None)
    contents2 = (
        "digraph Tree {\n"
        'node [shape=box, style="filled", color="black", '
        'fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="gini = 0.0\\nsamples = 6\\nvalue = 6.0", '
        'fillcolor="#ffffff"] ;\n'
        "}"
    )

    assert contents1 == contents2


@pytest.mark.parametrize("constructor", [list, np.array])
|
||||
def test_graphviz_feature_class_names_array_support(constructor):
|
||||
# Check that export_graphviz treats feature names
|
||||
# and class names correctly and supports arrays
|
||||
clf = DecisionTreeClassifier(
|
||||
max_depth=3, min_samples_split=2, criterion="gini", random_state=2
|
||||
)
|
||||
clf.fit(X, y)
|
||||
|
||||
# Test with feature_names
|
||||
contents1 = export_graphviz(
|
||||
clf, feature_names=constructor(["feature0", "feature1"]), out_file=None
|
||||
)
|
||||
contents2 = (
|
||||
"digraph Tree {\n"
|
||||
'node [shape=box, fontname="helvetica"] ;\n'
|
||||
'edge [fontname="helvetica"] ;\n'
|
||||
'0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
|
||||
'value = [3, 3]"] ;\n'
|
||||
'1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
|
||||
"0 -> 1 [labeldistance=2.5, labelangle=45, "
|
||||
'headlabel="True"] ;\n'
|
||||
'2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
|
||||
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
|
||||
'headlabel="False"] ;\n'
|
||||
"}"
|
||||
)
|
||||
|
||||
assert contents1 == contents2
|
||||
|
||||
# Test with class_names
|
||||
contents1 = export_graphviz(
|
||||
clf, class_names=constructor(["yes", "no"]), out_file=None
|
||||
)
|
||||
contents2 = (
|
||||
"digraph Tree {\n"
|
||||
'node [shape=box, fontname="helvetica"] ;\n'
|
||||
'edge [fontname="helvetica"] ;\n'
|
||||
'0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
|
||||
'value = [3, 3]\\nclass = yes"] ;\n'
|
||||
'1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n'
|
||||
'class = yes"] ;\n'
|
||||
"0 -> 1 [labeldistance=2.5, labelangle=45, "
|
||||
'headlabel="True"] ;\n'
|
||||
'2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n'
|
||||
'class = no"] ;\n'
|
||||
"0 -> 2 [labeldistance=2.5, labelangle=-45, "
|
||||
'headlabel="False"] ;\n'
|
||||
"}"
|
||||
)
|
||||
|
||||
assert contents1 == contents2
|
||||
|
||||
|
||||
def test_graphviz_errors():
    # Check for errors of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2)

    # Check not-fitted decision tree error
    out = StringIO()
    with pytest.raises(NotFittedError):
        export_graphviz(clf, out)

    clf.fit(X, y)

    # Check that it errors when the length of feature_names
    # does not match the number of features
    message = "Length of feature_names, 1 does not match number of features, 2"
    with pytest.raises(ValueError, match=message):
        export_graphviz(clf, None, feature_names=["a"])

    message = "Length of feature_names, 3 does not match number of features, 2"
    with pytest.raises(ValueError, match=message):
        export_graphviz(clf, None, feature_names=["a", "b", "c"])

    # Check error when feature_names contains non-string elements
    message = "All feature names must be strings."
    with pytest.raises(ValueError, match=message):
        export_graphviz(clf, None, feature_names=["a", 1])

    # Check error when argument is not an estimator
    message = "is not an estimator instance"
    with pytest.raises(TypeError, match=message):
        export_graphviz(clf.fit(X, y).tree_)

    # Check class_names error
    out = StringIO()
    with pytest.raises(IndexError):
        export_graphviz(clf, out, class_names=[])

def test_friedman_mse_in_graphviz():
    clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    clf.fit(X, y)
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)

    clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
    clf.fit(X, y)
    for estimator in clf.estimators_:
        export_graphviz(estimator[0], out_file=dot_data)

    for finding in finditer(r"\[.*?samples.*?\]", dot_data.getvalue()):
        assert "friedman_mse" in finding.group()

||||
def test_precision():
    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    for X, y, clf in zip(
        (rng_reg.random_sample((5, 2)), rng_clf.random_sample((1000, 4))),
        (rng_reg.random_sample((5,)), rng_clf.randint(2, size=(1000,))),
        (
            DecisionTreeRegressor(
                criterion="friedman_mse", random_state=0, max_depth=1
            ),
            DecisionTreeClassifier(max_depth=1, random_state=0),
        ),
    ):
        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(
                clf, out_file=None, precision=precision, proportion=True
            )

            # With the current random state, the impurity and the threshold
            # are rendered with exactly the number of decimal digits set by
            # the precision parameter of export_graphviz, so they are checked
            # with strict equality. The reported value has only 2 decimal
            # digits, so it can only be checked with a less-than-or-equal
            # comparison.

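            # Illustrative example of the precision + 1 arithmetic below:
            # with precision=3, a threshold such as 0.12345 is rendered as
            # "0.123", so the fractional part matched by r"\.\d+" is ".123",
            # whose length is precision + 1 (the dot plus precision digits).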
            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                assert len(search(r"\.\d+", finding.group()).group()) <= precision + 1
            # check impurity, using the criterion-specific pattern
            if is_classifier(clf):
                pattern = r"gini = \d+\.\d+"
            else:
                pattern = r"friedman_mse = \d+\.\d+"

            for finding in finditer(pattern, dot_data):
                assert len(search(r"\.\d+", finding.group()).group()) == precision + 1
            # check threshold
            for finding in finditer(r"<= \d+\.\d+", dot_data):
                assert len(search(r"\.\d+", finding.group()).group()) == precision + 1

def test_export_text_errors():
    clf = DecisionTreeClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)
    err_msg = "feature_names must contain 2 elements, got 1"
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, feature_names=["a"])
    err_msg = (
        "When `class_names` is an array, it should contain as"
        " many items as `decision_tree.classes_`. Got 1 while"
        " the tree was fitted with 2 classes."
    )
    with pytest.raises(ValueError, match=err_msg):
        export_text(clf, class_names=["a"])

def test_export_text():
    clf = DecisionTreeClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)

    expected_report = dedent(
        """
        |--- feature_1 <= 0.00
        |   |--- class: -1
        |--- feature_1 > 0.00
        |   |--- class: 1
        """
    ).lstrip()

    assert export_text(clf) == expected_report
    # testing that leaves at level 1 are not truncated
    assert export_text(clf, max_depth=0) == expected_report
    # testing that the rest of the tree is truncated
    assert export_text(clf, max_depth=10) == expected_report

    expected_report = dedent(
        """
        |--- feature_1 <= 0.00
        |   |--- weights: [3.00, 0.00] class: -1
        |--- feature_1 > 0.00
        |   |--- weights: [0.00, 3.00] class: 1
        """
    ).lstrip()
    assert export_text(clf, show_weights=True) == expected_report

    expected_report = dedent(
        """
        |- feature_1 <= 0.00
        | |- class: -1
        |- feature_1 > 0.00
        | |- class: 1
        """
    ).lstrip()
    assert export_text(clf, spacing=1) == expected_report

    X_l = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, 1]]
    y_l = [-1, -1, -1, 1, 1, 1, 2]
    clf = DecisionTreeClassifier(max_depth=4, random_state=0)
    clf.fit(X_l, y_l)
    expected_report = dedent(
        """
        |--- feature_1 <= 0.00
        |   |--- class: -1
        |--- feature_1 > 0.00
        |   |--- truncated branch of depth 2
        """
    ).lstrip()
    assert export_text(clf, max_depth=0) == expected_report

    X_mo = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    y_mo = [[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1], [1, 1]]

    reg = DecisionTreeRegressor(max_depth=2, random_state=0)
    reg.fit(X_mo, y_mo)

    expected_report = dedent(
        """
        |--- feature_1 <= 0.0
        |   |--- value: [-1.0, -1.0]
        |--- feature_1 > 0.0
        |   |--- value: [1.0, 1.0]
        """
    ).lstrip()
    assert export_text(reg, decimals=1) == expected_report
    assert export_text(reg, decimals=1, show_weights=True) == expected_report

    X_single = [[-2], [-1], [-1], [1], [1], [2]]
    reg = DecisionTreeRegressor(max_depth=2, random_state=0)
    reg.fit(X_single, y_mo)

    expected_report = dedent(
        """
        |--- first <= 0.0
        |   |--- value: [-1.0, -1.0]
        |--- first > 0.0
        |   |--- value: [1.0, 1.0]
        """
    ).lstrip()
    assert export_text(reg, decimals=1, feature_names=["first"]) == expected_report
    assert (
        export_text(reg, decimals=1, show_weights=True, feature_names=["first"])
        == expected_report
    )

@pytest.mark.parametrize("constructor", [list, np.array])
|
||||
def test_export_text_feature_class_names_array_support(constructor):
|
||||
# Check that export_graphviz treats feature names
|
||||
# and class names correctly and supports arrays
|
||||
clf = DecisionTreeClassifier(max_depth=2, random_state=0)
|
||||
clf.fit(X, y)
|
||||
|
||||
expected_report = dedent(
|
||||
"""
|
||||
|--- b <= 0.00
|
||||
| |--- class: -1
|
||||
|--- b > 0.00
|
||||
| |--- class: 1
|
||||
"""
|
||||
).lstrip()
|
||||
assert export_text(clf, feature_names=constructor(["a", "b"])) == expected_report
|
||||
|
||||
expected_report = dedent(
|
||||
"""
|
||||
|--- feature_1 <= 0.00
|
||||
| |--- class: cat
|
||||
|--- feature_1 > 0.00
|
||||
| |--- class: dog
|
||||
"""
|
||||
).lstrip()
|
||||
assert export_text(clf, class_names=constructor(["cat", "dog"])) == expected_report
|
||||
|
||||
|
||||
def test_plot_tree_entropy(pyplot):
    # mostly smoke tests
    # Check correctness of plot_tree for criterion = entropy
    clf = DecisionTreeClassifier(
        max_depth=3, min_samples_split=2, criterion="entropy", random_state=2
    )
    clf.fit(X, y)

    # Test export code
    feature_names = ["first feat", "sepal_width"]
    nodes = plot_tree(clf, feature_names=feature_names)
    assert len(nodes) == 5
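    # The five returned annotations are the three node labels plus the two
    # "True"/"False" edge labels checked below.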
    assert (
        nodes[0].get_text()
        == "first feat <= 0.0\nentropy = 1.0\nsamples = 6\nvalue = [3, 3]"
    )
    assert nodes[1].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [3, 0]"
    assert nodes[2].get_text() == "True "
    assert nodes[3].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [0, 3]"
    assert nodes[4].get_text() == " False"

@pytest.mark.parametrize("fontsize", [None, 10, 20])
|
||||
def test_plot_tree_gini(pyplot, fontsize):
|
||||
# mostly smoke tests
|
||||
# Check correctness of export_graphviz for criterion = gini
|
||||
clf = DecisionTreeClassifier(
|
||||
max_depth=3,
|
||||
min_samples_split=2,
|
||||
criterion="gini",
|
||||
random_state=2,
|
||||
)
|
||||
clf.fit(X, y)
|
||||
|
||||
# Test export code
|
||||
feature_names = ["first feat", "sepal_width"]
|
||||
nodes = plot_tree(clf, feature_names=feature_names, fontsize=fontsize)
|
||||
assert len(nodes) == 5
|
||||
if fontsize is not None:
|
||||
assert all(node.get_fontsize() == fontsize for node in nodes)
|
||||
assert (
|
||||
nodes[0].get_text()
|
||||
== "first feat <= 0.0\ngini = 0.5\nsamples = 6\nvalue = [3, 3]"
|
||||
)
|
||||
assert nodes[1].get_text() == "gini = 0.0\nsamples = 3\nvalue = [3, 0]"
|
||||
assert nodes[2].get_text() == "True "
|
||||
assert nodes[3].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]"
|
||||
assert nodes[4].get_text() == " False"
|
||||
|
||||
|
||||
def test_not_fitted_tree(pyplot):
    # Check that plotting a tree that has not been fitted raises the
    # correct error
    clf = DecisionTreeRegressor()
    with pytest.raises(NotFittedError):
        plot_tree(clf)
@@ -0,0 +1,51 @@
import numpy as np

from sklearn.tree._utils import PytestWeightedFenwickTree


def test_cython_weighted_fenwick_tree(global_random_seed):
    """
    Test Cython's weighted Fenwick tree implementation.
    """
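    # Background note: a Fenwick (binary indexed) tree maintains prefix sums
    # under point updates in O(log n). The weighted variant here appears to
    # track both the prefix weight sum and the prefix weighted-target sum,
    # and py_search(target) locates the position where the cumulative weight
    # reaches `target`, as exercised by the assertions below.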
    rng = np.random.default_rng(global_random_seed)

    n = 100
    indices = rng.permutation(n)
    y = rng.normal(size=n)
    w = rng.integers(0, 4, size=n)
    y_included_so_far = np.zeros_like(y)
    w_included_so_far = np.zeros_like(w)

    tree = PytestWeightedFenwickTree(n)
    tree.py_reset(n)

    for i in range(n):
        idx = indices[i]
        tree.py_add(idx, y[idx], w[idx])
        y_included_so_far[idx] = y[idx]
        w_included_so_far[idx] = w[idx]

        target = rng.uniform(0, w_included_so_far.sum())
        t_idx_low, t_idx, cw, cwy = tree.py_search(target)

        # check the aggregates are consistent with the returned idx
        assert np.isclose(cw, np.sum(w_included_so_far[:t_idx]))
        assert np.isclose(
            cwy, np.sum(w_included_so_far[:t_idx] * y_included_so_far[:t_idx])
        )

        # check that the cumulative weight is below or exactly at the target,
        # depending on whether t_idx_low == t_idx
        if t_idx_low == t_idx:
            assert cw < target
        else:
            assert cw == target

        # check that if we add the next non-null weight, we are above the target:
        next_weights = w_included_so_far[t_idx:][w_included_so_far[t_idx:] > 0]
        if next_weights.size > 0:
            assert cw + next_weights[0] > target
        # and not below the target for `t_idx_low`:
        next_weights = w_included_so_far[t_idx_low:][w_included_so_far[t_idx_low:] > 0]
        if next_weights.size > 0:
            assert cw + next_weights[0] >= target
@@ -0,0 +1,512 @@
import numpy as np
import pytest

from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import (
    ExtraTreesClassifier,
    ExtraTreesRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor,
    ExtraTreeClassifier,
    ExtraTreeRegressor,
)
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSC_CONTAINERS

TREE_CLASSIFIER_CLASSES = [DecisionTreeClassifier, ExtraTreeClassifier]
TREE_REGRESSOR_CLASSES = [DecisionTreeRegressor, ExtraTreeRegressor]
TREE_BASED_CLASSIFIER_CLASSES = TREE_CLASSIFIER_CLASSES + [
    RandomForestClassifier,
    ExtraTreesClassifier,
]
TREE_BASED_REGRESSOR_CLASSES = TREE_REGRESSOR_CLASSES + [
    RandomForestRegressor,
    ExtraTreesRegressor,
]

@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
|
||||
@pytest.mark.parametrize("depth_first_builder", (True, False))
|
||||
@pytest.mark.parametrize("sparse_splitter", (True, False))
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_monotonic_constraints_classifications(
|
||||
TreeClassifier,
|
||||
depth_first_builder,
|
||||
sparse_splitter,
|
||||
global_random_seed,
|
||||
csc_container,
|
||||
):
|
||||
n_samples = 1000
|
||||
n_samples_train = 900
|
||||
X, y = make_classification(
|
||||
n_samples=n_samples,
|
||||
n_classes=2,
|
||||
n_features=5,
|
||||
n_informative=5,
|
||||
n_redundant=0,
|
||||
random_state=global_random_seed,
|
||||
)
|
||||
X_train, y_train = X[:n_samples_train], y[:n_samples_train]
|
||||
X_test, _ = X[n_samples_train:], y[n_samples_train:]
|
||||
|
||||
X_test_0incr, X_test_0decr = np.copy(X_test), np.copy(X_test)
|
||||
X_test_1incr, X_test_1decr = np.copy(X_test), np.copy(X_test)
|
||||
X_test_0incr[:, 0] += 10
|
||||
X_test_0decr[:, 0] -= 10
|
||||
X_test_1incr[:, 1] += 10
|
||||
X_test_1decr[:, 1] -= 10
|
||||
monotonic_cst = np.zeros(X.shape[1])
|
||||
monotonic_cst[0] = 1
|
||||
monotonic_cst[1] = -1
|
||||
|
||||
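    # Feature 0 is constrained to a monotonic increase (+1) and feature 1 to
    # a monotonic decrease (-1); the shifted copies of X_test above let the
    # assertions below verify both directions.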
    if depth_first_builder:
        est = TreeClassifier(max_depth=None, monotonic_cst=monotonic_cst)
    else:
        est = TreeClassifier(
            max_depth=None,
            monotonic_cst=monotonic_cst,
            max_leaf_nodes=n_samples_train,
        )
    if hasattr(est, "random_state"):
        est.set_params(**{"random_state": global_random_seed})
    if hasattr(est, "n_estimators"):
        est.set_params(**{"n_estimators": 5})
    if sparse_splitter:
        X_train = csc_container(X_train)
    est.fit(X_train, y_train)
    proba_test = est.predict_proba(X_test)

    assert np.logical_and(proba_test >= 0.0, proba_test <= 1.0).all(), (
        "Probability should always be in [0, 1] range."
    )
    assert_allclose(proba_test.sum(axis=1), 1.0)

    # The monotonic increase constraint applies to the positive class
    assert np.all(est.predict_proba(X_test_0incr)[:, 1] >= proba_test[:, 1])
    assert np.all(est.predict_proba(X_test_0decr)[:, 1] <= proba_test[:, 1])

    # The monotonic decrease constraint applies to the positive class
    assert np.all(est.predict_proba(X_test_1incr)[:, 1] <= proba_test[:, 1])
    assert np.all(est.predict_proba(X_test_1decr)[:, 1] >= proba_test[:, 1])

@pytest.mark.parametrize("TreeRegressor", TREE_BASED_REGRESSOR_CLASSES)
|
||||
@pytest.mark.parametrize("depth_first_builder", (True, False))
|
||||
@pytest.mark.parametrize("sparse_splitter", (True, False))
|
||||
@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error"))
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_monotonic_constraints_regressions(
|
||||
TreeRegressor,
|
||||
depth_first_builder,
|
||||
sparse_splitter,
|
||||
criterion,
|
||||
global_random_seed,
|
||||
csc_container,
|
||||
):
|
||||
n_samples = 1000
|
||||
n_samples_train = 900
|
||||
# Build a regression task using 5 informative features
|
||||
X, y = make_regression(
|
||||
n_samples=n_samples,
|
||||
n_features=5,
|
||||
n_informative=5,
|
||||
random_state=global_random_seed,
|
||||
)
|
||||
train = np.arange(n_samples_train)
|
||||
test = np.arange(n_samples_train, n_samples)
|
||||
X_train = X[train]
|
||||
y_train = y[train]
|
||||
X_test = np.copy(X[test])
|
||||
X_test_incr = np.copy(X_test)
|
||||
X_test_decr = np.copy(X_test)
|
||||
X_test_incr[:, 0] += 10
|
||||
X_test_decr[:, 1] += 10
|
||||
monotonic_cst = np.zeros(X.shape[1])
|
||||
monotonic_cst[0] = 1
|
||||
monotonic_cst[1] = -1
|
||||
|
||||
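    # X_test_incr raises the feature carrying the +1 (increase) constraint,
    # while X_test_decr raises the feature carrying the -1 (decrease)
    # constraint, so predictions must move up and down respectively.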
    if depth_first_builder:
        est = TreeRegressor(
            max_depth=None,
            monotonic_cst=monotonic_cst,
            criterion=criterion,
        )
    else:
        est = TreeRegressor(
            max_depth=8,
            monotonic_cst=monotonic_cst,
            criterion=criterion,
            max_leaf_nodes=n_samples_train,
        )
    if hasattr(est, "random_state"):
        est.set_params(random_state=global_random_seed)
    if hasattr(est, "n_estimators"):
        est.set_params(**{"n_estimators": 5})
    if sparse_splitter:
        X_train = csc_container(X_train)
    est.fit(X_train, y_train)
    y = est.predict(X_test)
    # Monotonic increase constraint
    y_incr = est.predict(X_test_incr)
    # y_incr should never be smaller than y
    assert np.all(y_incr >= y)

    # Monotonic decrease constraint
    y_decr = est.predict(X_test_decr)
    # y_decr should never be greater than y
    assert np.all(y_decr <= y)

@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
|
||||
def test_multiclass_raises(TreeClassifier):
|
||||
X, y = make_classification(
|
||||
n_samples=100, n_features=5, n_classes=3, n_informative=3, random_state=0
|
||||
)
|
||||
y[0] = 0
|
||||
monotonic_cst = np.zeros(X.shape[1])
|
||||
monotonic_cst[0] = -1
|
||||
monotonic_cst[1] = 1
|
||||
est = TreeClassifier(max_depth=None, monotonic_cst=monotonic_cst, random_state=0)
|
||||
|
||||
msg = "Monotonicity constraints are not supported with multiclass classification"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
est.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
|
||||
def test_multiple_output_raises(TreeClassifier):
|
||||
X = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
|
||||
y = [[1, 0, 1, 0, 1], [1, 0, 1, 0, 1]]
|
||||
|
||||
est = TreeClassifier(
|
||||
max_depth=None, monotonic_cst=np.array([-1, 1]), random_state=0
|
||||
)
|
||||
msg = "Monotonicity constraints are not supported with multiple output"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
est.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Tree",
    [
        DecisionTreeClassifier,
        DecisionTreeRegressor,
        ExtraTreeClassifier,
        ExtraTreeRegressor,
    ],
)
def test_missing_values_raises(Tree):
    X, y = make_classification(
        n_samples=100, n_features=5, n_classes=2, n_informative=3, random_state=0
    )
    X[0, 0] = np.nan
    monotonic_cst = np.zeros(X.shape[1])
    monotonic_cst[0] = 1
    est = Tree(max_depth=None, monotonic_cst=monotonic_cst, random_state=0)

    msg = "Input X contains NaN"
    with pytest.raises(ValueError, match=msg):
        est.fit(X, y)

@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
|
||||
def test_bad_monotonic_cst_raises(TreeClassifier):
|
||||
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
|
||||
y = [1, 0, 1, 0, 1]
|
||||
|
||||
msg = "monotonic_cst has shape 3 but the input data X has 2 features."
|
||||
est = TreeClassifier(
|
||||
max_depth=None, monotonic_cst=np.array([-1, 1, 0]), random_state=0
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
est.fit(X, y)
|
||||
|
||||
msg = "monotonic_cst must be None or an array-like of -1, 0 or 1."
|
||||
est = TreeClassifier(
|
||||
max_depth=None, monotonic_cst=np.array([-2, 2]), random_state=0
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
est.fit(X, y)
|
||||
|
||||
est = TreeClassifier(
|
||||
max_depth=None, monotonic_cst=np.array([-1, 0.8]), random_state=0
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg + "(.*)0.8]"):
|
||||
est.fit(X, y)
|
||||
|
||||
|
||||
def assert_1d_reg_tree_children_monotonic_bounded(tree_, monotonic_sign):
    values = tree_.value
    for i in range(tree_.node_count):
        if tree_.children_left[i] > i and tree_.children_right[i] > i:
            # Check monotonicity on children
            i_left = tree_.children_left[i]
            i_right = tree_.children_right[i]
            if monotonic_sign == 1:
                assert values[i_left] <= values[i_right]
            elif monotonic_sign == -1:
                assert values[i_left] >= values[i_right]
            val_middle = (values[i_left] + values[i_right]) / 2
            # Check bounds on grand-children, filtering out leaf nodes
            if tree_.feature[i_left] >= 0:
                i_left_right = tree_.children_right[i_left]
                if monotonic_sign == 1:
                    assert values[i_left_right] <= val_middle
                elif monotonic_sign == -1:
                    assert values[i_left_right] >= val_middle
            if tree_.feature[i_right] >= 0:
                i_right_left = tree_.children_left[i_right]
                if monotonic_sign == 1:
                    assert val_middle <= values[i_right_left]
                elif monotonic_sign == -1:
                    assert val_middle >= values[i_right_left]

def test_assert_1d_reg_tree_children_monotonic_bounded():
    X = np.linspace(-1, 1, 7).reshape(-1, 1)
    y = np.sin(2 * np.pi * X.ravel())

    reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y)

    with pytest.raises(AssertionError):
        assert_1d_reg_tree_children_monotonic_bounded(reg.tree_, 1)

    with pytest.raises(AssertionError):
        assert_1d_reg_tree_children_monotonic_bounded(reg.tree_, -1)

def assert_1d_reg_monotonic(clf, monotonic_sign, min_x, max_x, n_steps):
    X_grid = np.linspace(min_x, max_x, n_steps).reshape(-1, 1)
    y_pred_grid = clf.predict(X_grid)
    if monotonic_sign == 1:
        assert (np.diff(y_pred_grid) >= 0.0).all()
    elif monotonic_sign == -1:
        assert (np.diff(y_pred_grid) <= 0.0).all()

@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES)
|
||||
def test_1d_opposite_monotonicity_cst_data(TreeRegressor):
|
||||
# Check that positive monotonic data with negative monotonic constraint
|
||||
# yield constant predictions, equal to the average of target values
|
||||
X = np.linspace(-2, 2, 10).reshape(-1, 1)
|
||||
y = X.ravel()
|
||||
clf = TreeRegressor(monotonic_cst=[-1])
|
||||
clf.fit(X, y)
|
||||
assert clf.tree_.node_count == 1
|
||||
assert clf.tree_.value[0] == 0.0
|
||||
|
||||
# Swap monotonicity
|
||||
clf = TreeRegressor(monotonic_cst=[1])
|
||||
clf.fit(X, -y)
|
||||
assert clf.tree_.node_count == 1
|
||||
assert clf.tree_.value[0] == 0.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES)
|
||||
@pytest.mark.parametrize("monotonic_sign", (-1, 1))
|
||||
@pytest.mark.parametrize("depth_first_builder", (True, False))
|
||||
@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error"))
|
||||
def test_1d_tree_nodes_values(
|
||||
TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed
|
||||
):
|
||||
# Adaptation from test_nodes_values in test_monotonic_constraints.py
|
||||
# in sklearn.ensemble._hist_gradient_boosting
|
||||
# Build a single tree with only one feature, and make sure the node
|
||||
# values respect the monotonicity constraints.
|
||||
|
||||
# Considering the following tree with a monotonic +1 constraint, we
|
||||
# should have:
|
||||
#
|
||||
# root
|
||||
# / \
|
||||
# a b
|
||||
# / \ / \
|
||||
# c d e f
|
||||
#
|
||||
# a <= root <= b
|
||||
# c <= d <= (a + b) / 2 <= e <= f
|
||||
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
n_samples = 1000
|
||||
n_features = 1
|
||||
X = rng.rand(n_samples, n_features)
|
||||
y = rng.rand(n_samples)
|
||||
|
||||
if depth_first_builder:
|
||||
# No max_leaf_nodes, default depth first tree builder
|
||||
clf = TreeRegressor(
|
||||
monotonic_cst=[monotonic_sign],
|
||||
criterion=criterion,
|
||||
random_state=global_random_seed,
|
||||
)
|
||||
else:
|
||||
# max_leaf_nodes triggers best first tree builder
|
||||
clf = TreeRegressor(
|
||||
monotonic_cst=[monotonic_sign],
|
||||
max_leaf_nodes=n_samples,
|
||||
criterion=criterion,
|
||||
random_state=global_random_seed,
|
||||
)
|
||||
clf.fit(X, y)
|
||||
|
||||
assert_1d_reg_tree_children_monotonic_bounded(clf.tree_, monotonic_sign)
|
||||
assert_1d_reg_monotonic(clf, monotonic_sign, np.min(X), np.max(X), 100)
|
||||
|
||||
|
||||
def assert_nd_reg_tree_children_monotonic_bounded(tree_, monotonic_cst):
    upper_bound = np.full(tree_.node_count, np.inf)
    lower_bound = np.full(tree_.node_count, -np.inf)
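    # Walk the tree once; for every node check that its value lies within the
    # [lower_bound, upper_bound] interval inherited from the constrained
    # splits above it, then propagate (possibly tightened) bounds to its
    # children.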
    for i in range(tree_.node_count):
        feature = tree_.feature[i]
        node_value = tree_.value[i][0][0]  # unpack value from nx1x1 array
        # While building the tree, the computed middle value is slightly
        # different from the average of the siblings' values, because
        # sum_right / weighted_n_right
        # is slightly different from the value of the right sibling.
        # This can cause a discrepancy up to numerical noise when clipping,
        # which is resolved by comparing with some loss of precision.
        assert np.float32(node_value) <= np.float32(upper_bound[i])
        assert np.float32(node_value) >= np.float32(lower_bound[i])

        if feature < 0:
            # Leaf: nothing to do
            continue

        # Split node: check and update bounds for the children.
        i_left = tree_.children_left[i]
        i_right = tree_.children_right[i]
        # unpack value from nx1x1 array
        middle_value = (tree_.value[i_left][0][0] + tree_.value[i_right][0][0]) / 2

        if monotonic_cst[feature] == 0:
            # Feature without monotonicity constraint: propagate bounds
            # down the tree to both children.
            # Otherwise, with 2 features and a monotonic increase constraint
            # (encoded by +1) on feature 0, the following tree can be accepted,
            # although it does not respect the monotonic increase constraint:
            #
            #                  X[0] <= 0
            #                 value = 100
            #                /           \
            #        X[0] <= -1        X[1] <= 0
            #        value = 50        value = 150
            #        /        \        /        \
            #     leaf       leaf    leaf       leaf
            #  value = 25 value = 75 value = 50 value = 250

            lower_bound[i_left] = lower_bound[i]
            upper_bound[i_left] = upper_bound[i]
            lower_bound[i_right] = lower_bound[i]
            upper_bound[i_right] = upper_bound[i]

        elif monotonic_cst[feature] == 1:
            # Feature with constraint: check monotonicity
            assert tree_.value[i_left] <= tree_.value[i_right]

            # Propagate bounds down the tree to both children.
            lower_bound[i_left] = lower_bound[i]
            upper_bound[i_left] = middle_value
            lower_bound[i_right] = middle_value
            upper_bound[i_right] = upper_bound[i]

        elif monotonic_cst[feature] == -1:
            # Feature with constraint: check monotonicity
            assert tree_.value[i_left] >= tree_.value[i_right]

            # Update and propagate bounds down the tree to both children.
            lower_bound[i_left] = middle_value
            upper_bound[i_left] = upper_bound[i]
            lower_bound[i_right] = lower_bound[i]
            upper_bound[i_right] = middle_value

        else:  # pragma: no cover
            raise ValueError(f"monotonic_cst[{feature}]={monotonic_cst[feature]}")

def test_assert_nd_reg_tree_children_monotonic_bounded():
    # Check that assert_nd_reg_tree_children_monotonic_bounded can detect
    # non-monotonic tree predictions.
    X = np.linspace(0, 2 * np.pi, 30).reshape(-1, 1)
    y = np.sin(X).ravel()
    reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y)

    with pytest.raises(AssertionError):
        assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [1])

    with pytest.raises(AssertionError):
        assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [-1])

    assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [0])

    # Check that assert_nd_reg_tree_children_monotonic_bounded raises
    # when the data (and therefore the model) is naturally monotonic in the
    # opposite direction.
    X = np.linspace(-5, 5, 5).reshape(-1, 1)
    y = X.ravel() ** 3
    reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y)

    with pytest.raises(AssertionError):
        assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [-1])

    # For completeness, check that the converse holds when swapping the sign.
    reg = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, -y)

    with pytest.raises(AssertionError):
        assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, [1])

@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES)
|
||||
@pytest.mark.parametrize("monotonic_sign", (-1, 1))
|
||||
@pytest.mark.parametrize("depth_first_builder", (True, False))
|
||||
@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error"))
|
||||
def test_nd_tree_nodes_values(
|
||||
TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed
|
||||
):
|
||||
# Build tree with several features, and make sure the nodes
|
||||
# values respect the monotonicity constraints.
|
||||
|
||||
# Considering the following tree with a monotonic increase constraint on X[0],
|
||||
# we should have:
|
||||
#
|
||||
# root
|
||||
# X[0]<=t
|
||||
# / \
|
||||
# a b
|
||||
# X[0]<=u X[1]<=v
|
||||
# / \ / \
|
||||
# c d e f
|
||||
#
|
||||
# i) a <= root <= b
|
||||
# ii) c <= a <= d <= (a+b)/2
|
||||
# iii) (a+b)/2 <= min(e,f)
|
||||
# For iii) we check that each node value is within the proper lower and
|
||||
# upper bounds.
|
||||
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
n_samples = 1000
|
||||
n_features = 2
|
||||
monotonic_cst = [monotonic_sign, 0]
|
||||
X = rng.rand(n_samples, n_features)
|
||||
y = rng.rand(n_samples)
|
||||
|
||||
if depth_first_builder:
|
||||
# No max_leaf_nodes, default depth first tree builder
|
||||
clf = TreeRegressor(
|
||||
monotonic_cst=monotonic_cst,
|
||||
criterion=criterion,
|
||||
random_state=global_random_seed,
|
||||
)
|
||||
else:
|
||||
# max_leaf_nodes triggers best first tree builder
|
||||
clf = TreeRegressor(
|
||||
monotonic_cst=monotonic_cst,
|
||||
max_leaf_nodes=n_samples,
|
||||
criterion=criterion,
|
||||
random_state=global_random_seed,
|
||||
)
|
||||
clf.fit(X, y)
|
||||
assert_nd_reg_tree_children_monotonic_bounded(clf.tree_, monotonic_cst)
|
||||
@@ -0,0 +1,49 @@
import numpy as np
import pytest

from sklearn.tree._reingold_tilford import Tree, buchheim

simple_tree = Tree("", 0, Tree("", 1), Tree("", 2))

bigger_tree = Tree(
    "",
    0,
    Tree(
        "",
        1,
        Tree("", 3),
        Tree("", 4, Tree("", 7), Tree("", 8)),
    ),
    Tree("", 2, Tree("", 5), Tree("", 6)),
)

@pytest.mark.parametrize("tree, n_nodes", [(simple_tree, 3), (bigger_tree, 9)])
|
||||
def test_buchheim(tree, n_nodes):
|
||||
def walk_tree(draw_tree):
|
||||
res = [(draw_tree.x, draw_tree.y)]
|
||||
for child in draw_tree.children:
|
||||
# parents higher than children:
|
||||
assert child.y == draw_tree.y + 1
|
||||
res.extend(walk_tree(child))
|
||||
if len(draw_tree.children):
|
||||
# these trees are always binary
|
||||
# parents are centered above children
|
||||
assert (
|
||||
draw_tree.x == (draw_tree.children[0].x + draw_tree.children[1].x) / 2
|
||||
)
|
||||
return res
|
||||
|
||||
layout = buchheim(tree)
|
||||
coordinates = walk_tree(layout)
|
||||
assert len(coordinates) == n_nodes
|
||||
# test that x values are unique per depth / level
|
||||
# we could also do it quicker using defaultdicts..
|
||||
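    # A sketch of that defaultdict variant (illustrative only, not used here):
    #
    #     from collections import defaultdict
    #     xs_by_depth = defaultdict(list)
    #     for x, depth in coordinates:
    #         xs_by_depth[depth].append(x)
    #     for xs in xs_by_depth.values():
    #         assert len(set(xs)) == len(xs)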
    depth = 0
    while True:
        x_at_this_depth = [node[0] for node in coordinates if node[1] == depth]
        if not x_at_this_depth:
            # reached all leaves
            break
        assert len(np.unique(x_at_this_depth)) == len(x_at_this_depth)
        depth += 1
File diff suppressed because it is too large
Load Diff