Videre
This commit is contained in:
@@ -0,0 +1,817 @@
|
||||
"""Partition samples in the construction of a tree.
|
||||
|
||||
This module contains the algorithms for moving sample indices to
|
||||
the left and right child node given a split determined by the
|
||||
splitting algorithm in `_splitter.pyx`.
|
||||
|
||||
Partitioning is done in a way that is efficient for both dense data,
|
||||
and sparse data stored in a Compressed Sparse Column (CSC) format.
|
||||
"""
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from cython cimport final
|
||||
from libc.math cimport isnan, log2
|
||||
from libc.stdlib cimport qsort
|
||||
from libc.string cimport memcpy
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
|
||||
|
||||
# Constant to switch between algorithm non zero value extract algorithm
|
||||
# in SparsePartitioner
|
||||
cdef float32_t EXTRACT_NNZ_SWITCH = 0.1
|
||||
|
||||
# Allow for 32 bit float comparisons
|
||||
cdef float32_t INFINITY_32t = np.inf
|
||||
|
||||
|
||||
@final
|
||||
cdef class DensePartitioner:
|
||||
"""Partitioner specialized for dense data.
|
||||
|
||||
Note that this partitioner is agnostic to the splitting strategy (best vs. random).
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
const float32_t[:, :] X,
|
||||
intp_t[::1] samples,
|
||||
float32_t[::1] feature_values,
|
||||
const uint8_t[::1] missing_values_in_feature_mask,
|
||||
):
|
||||
self.X = X
|
||||
self.samples = samples
|
||||
self.feature_values = feature_values
|
||||
self.missing_values_in_feature_mask = missing_values_in_feature_mask
|
||||
|
||||
cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
|
||||
"""Initialize splitter at the beginning of node_split."""
|
||||
self.start = start
|
||||
self.end = end
|
||||
self.n_missing = 0
|
||||
|
||||
cdef inline void sort_samples_and_feature_values(
|
||||
self, intp_t current_feature
|
||||
) noexcept nogil:
|
||||
"""Simultaneously sort based on the feature_values.
|
||||
|
||||
Missing values are stored at the end of feature_values.
|
||||
The number of missing values observed in feature_values is stored
|
||||
in self.n_missing.
|
||||
"""
|
||||
cdef:
|
||||
intp_t i, current_end
|
||||
float32_t[::1] feature_values = self.feature_values
|
||||
const float32_t[:, :] X = self.X
|
||||
intp_t[::1] samples = self.samples
|
||||
intp_t n_missing = 0
|
||||
const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask
|
||||
|
||||
# Sort samples along that feature; by copying the values into an array and
|
||||
# sorting the array in a manner which utilizes the cache more effectively.
|
||||
if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]:
|
||||
i, current_end = self.start, self.end - 1
|
||||
# Missing values are placed at the end and do not participate in the sorting.
|
||||
while i <= current_end:
|
||||
# Finds the right-most value that is not missing so that
|
||||
# it can be swapped with missing values at its left.
|
||||
if isnan(X[samples[current_end], current_feature]):
|
||||
n_missing += 1
|
||||
current_end -= 1
|
||||
continue
|
||||
|
||||
# X[samples[current_end], current_feature] is a non-missing value
|
||||
if isnan(X[samples[i], current_feature]):
|
||||
samples[i], samples[current_end] = samples[current_end], samples[i]
|
||||
n_missing += 1
|
||||
current_end -= 1
|
||||
|
||||
feature_values[i] = X[samples[i], current_feature]
|
||||
i += 1
|
||||
else:
|
||||
# When there are no missing values, we only need to copy the data into
|
||||
# feature_values
|
||||
for i in range(self.start, self.end):
|
||||
feature_values[i] = X[samples[i], current_feature]
|
||||
|
||||
sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing)
|
||||
self.n_missing = n_missing
|
||||
|
||||
cdef inline void find_min_max(
|
||||
self,
|
||||
intp_t current_feature,
|
||||
float32_t* min_feature_value_out,
|
||||
float32_t* max_feature_value_out,
|
||||
) noexcept nogil:
|
||||
"""Find the minimum and maximum value for current_feature.
|
||||
|
||||
Missing values are stored at the end of feature_values. The number of missing
|
||||
values observed in feature_values is stored in self.n_missing.
|
||||
"""
|
||||
cdef:
|
||||
intp_t p, current_end
|
||||
float32_t current_feature_value
|
||||
const float32_t[:, :] X = self.X
|
||||
intp_t[::1] samples = self.samples
|
||||
float32_t min_feature_value = INFINITY_32t
|
||||
float32_t max_feature_value = -INFINITY_32t
|
||||
float32_t[::1] feature_values = self.feature_values
|
||||
intp_t n_missing = 0
|
||||
const uint8_t[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask
|
||||
|
||||
# We are copying the values into an array and finding min/max of the array in
|
||||
# a manner which utilizes the cache more effectively. We need to also count
|
||||
# the number of missing-values there are.
|
||||
if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]:
|
||||
p, current_end = self.start, self.end - 1
|
||||
# Missing values are placed at the end and do not participate in the
|
||||
# min/max calculation.
|
||||
while p <= current_end:
|
||||
# Finds the right-most value that is not missing so that
|
||||
# it can be swapped with missing values towards its left.
|
||||
if isnan(X[samples[current_end], current_feature]):
|
||||
n_missing += 1
|
||||
current_end -= 1
|
||||
continue
|
||||
|
||||
# X[samples[current_end], current_feature] is a non-missing value
|
||||
if isnan(X[samples[p], current_feature]):
|
||||
samples[p], samples[current_end] = samples[current_end], samples[p]
|
||||
n_missing += 1
|
||||
current_end -= 1
|
||||
|
||||
current_feature_value = X[samples[p], current_feature]
|
||||
feature_values[p] = current_feature_value
|
||||
if current_feature_value < min_feature_value:
|
||||
min_feature_value = current_feature_value
|
||||
elif current_feature_value > max_feature_value:
|
||||
max_feature_value = current_feature_value
|
||||
p += 1
|
||||
else:
|
||||
min_feature_value = X[samples[self.start], current_feature]
|
||||
max_feature_value = min_feature_value
|
||||
|
||||
feature_values[self.start] = min_feature_value
|
||||
for p in range(self.start + 1, self.end):
|
||||
current_feature_value = X[samples[p], current_feature]
|
||||
feature_values[p] = current_feature_value
|
||||
|
||||
if current_feature_value < min_feature_value:
|
||||
min_feature_value = current_feature_value
|
||||
elif current_feature_value > max_feature_value:
|
||||
max_feature_value = current_feature_value
|
||||
|
||||
min_feature_value_out[0] = min_feature_value
|
||||
max_feature_value_out[0] = max_feature_value
|
||||
self.n_missing = n_missing
|
||||
|
||||
cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil:
|
||||
"""Compute the next p_prev and p for iterating over feature values.
|
||||
|
||||
The missing values are not included when iterating through the feature values.
|
||||
"""
|
||||
cdef intp_t end_non_missing = self.end - self.n_missing
|
||||
|
||||
while (
|
||||
p[0] + 1 < end_non_missing and
|
||||
self.feature_values[p[0] + 1] <= self.feature_values[p[0]] + FEATURE_THRESHOLD
|
||||
):
|
||||
p[0] += 1
|
||||
|
||||
p_prev[0] = p[0]
|
||||
|
||||
# By adding 1, we have
|
||||
# (feature_values[p] >= end) or (feature_values[p] > feature_values[p - 1])
|
||||
p[0] += 1
|
||||
|
||||
cdef inline intp_t partition_samples(
|
||||
self,
|
||||
float64_t current_threshold
|
||||
) noexcept nogil:
|
||||
"""Partition samples for feature_values at the current_threshold."""
|
||||
cdef:
|
||||
intp_t p = self.start
|
||||
intp_t partition_end = self.end - self.n_missing
|
||||
intp_t[::1] samples = self.samples
|
||||
float32_t[::1] feature_values = self.feature_values
|
||||
|
||||
while p < partition_end:
|
||||
if feature_values[p] <= current_threshold:
|
||||
p += 1
|
||||
else:
|
||||
partition_end -= 1
|
||||
|
||||
feature_values[p], feature_values[partition_end] = (
|
||||
feature_values[partition_end], feature_values[p]
|
||||
)
|
||||
samples[p], samples[partition_end] = samples[partition_end], samples[p]
|
||||
|
||||
return partition_end
|
||||
|
||||
cdef inline void partition_samples_final(
|
||||
self,
|
||||
intp_t best_pos,
|
||||
float64_t best_threshold,
|
||||
intp_t best_feature,
|
||||
intp_t best_n_missing,
|
||||
) noexcept nogil:
|
||||
"""Partition samples for X at the best_threshold and best_feature.
|
||||
|
||||
If missing values are present, this method partitions `samples`
|
||||
so that the `best_n_missing` missing values' indices are in the
|
||||
right-most end of `samples`, that is `samples[end_non_missing:end]`.
|
||||
"""
|
||||
cdef:
|
||||
# Local invariance: start <= p <= partition_end <= end
|
||||
intp_t start = self.start
|
||||
intp_t p = start
|
||||
intp_t end = self.end - 1
|
||||
intp_t partition_end = end - best_n_missing
|
||||
intp_t[::1] samples = self.samples
|
||||
const float32_t[:, :] X = self.X
|
||||
float32_t current_value
|
||||
|
||||
if best_n_missing != 0:
|
||||
# Move samples with missing values to the end while partitioning the
|
||||
# non-missing samples
|
||||
while p <= partition_end:
|
||||
# Keep samples with missing values at the end
|
||||
if isnan(X[samples[end], best_feature]):
|
||||
end -= 1
|
||||
continue
|
||||
|
||||
# Swap sample with missing values with the sample at the end
|
||||
current_value = X[samples[p], best_feature]
|
||||
if isnan(current_value):
|
||||
samples[p], samples[end] = samples[end], samples[p]
|
||||
end -= 1
|
||||
|
||||
# The swapped sample at the end is always a non-missing value, so
|
||||
# we can continue the algorithm without checking for missingness.
|
||||
current_value = X[samples[p], best_feature]
|
||||
|
||||
# Partition the non-missing samples
|
||||
if current_value <= best_threshold:
|
||||
p += 1
|
||||
else:
|
||||
samples[p], samples[partition_end] = samples[partition_end], samples[p]
|
||||
partition_end -= 1
|
||||
else:
|
||||
# Partitioning routine when there are no missing values
|
||||
while p < partition_end:
|
||||
if X[samples[p], best_feature] <= best_threshold:
|
||||
p += 1
|
||||
else:
|
||||
samples[p], samples[partition_end] = samples[partition_end], samples[p]
|
||||
partition_end -= 1
|
||||
|
||||
|
||||
@final
|
||||
cdef class SparsePartitioner:
|
||||
"""Partitioner specialized for sparse CSC data.
|
||||
|
||||
Note that this partitioner is agnostic to the splitting strategy (best vs. random).
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
object X,
|
||||
intp_t[::1] samples,
|
||||
intp_t n_samples,
|
||||
float32_t[::1] feature_values,
|
||||
const uint8_t[::1] missing_values_in_feature_mask,
|
||||
):
|
||||
if not (issparse(X) and X.format == "csc"):
|
||||
raise ValueError("X should be in csc format")
|
||||
|
||||
self.samples = samples
|
||||
self.feature_values = feature_values
|
||||
|
||||
# Initialize X
|
||||
cdef intp_t n_total_samples = X.shape[0]
|
||||
|
||||
self.X_data = X.data
|
||||
self.X_indices = X.indices
|
||||
self.X_indptr = X.indptr
|
||||
self.n_total_samples = n_total_samples
|
||||
|
||||
# Initialize auxiliary array used to perform split
|
||||
self.index_to_samples = np.full(n_total_samples, fill_value=-1, dtype=np.intp)
|
||||
self.sorted_samples = np.empty(n_samples, dtype=np.intp)
|
||||
|
||||
cdef intp_t p
|
||||
for p in range(n_samples):
|
||||
self.index_to_samples[samples[p]] = p
|
||||
|
||||
self.missing_values_in_feature_mask = missing_values_in_feature_mask
|
||||
|
||||
cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
|
||||
"""Initialize splitter at the beginning of node_split."""
|
||||
self.start = start
|
||||
self.end = end
|
||||
self.is_samples_sorted = 0
|
||||
self.n_missing = 0
|
||||
|
||||
cdef inline void sort_samples_and_feature_values(
|
||||
self,
|
||||
intp_t current_feature
|
||||
) noexcept nogil:
|
||||
"""Simultaneously sort based on the feature_values."""
|
||||
cdef:
|
||||
float32_t[::1] feature_values = self.feature_values
|
||||
intp_t[::1] index_to_samples = self.index_to_samples
|
||||
intp_t[::1] samples = self.samples
|
||||
|
||||
self.extract_nnz(current_feature)
|
||||
# Sort the positive and negative parts of `feature_values`
|
||||
sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start)
|
||||
if self.start_positive < self.end:
|
||||
sort(
|
||||
&feature_values[self.start_positive],
|
||||
&samples[self.start_positive],
|
||||
self.end - self.start_positive
|
||||
)
|
||||
|
||||
# Update index_to_samples to take into account the sort
|
||||
for p in range(self.start, self.end_negative):
|
||||
index_to_samples[samples[p]] = p
|
||||
for p in range(self.start_positive, self.end):
|
||||
index_to_samples[samples[p]] = p
|
||||
|
||||
# Add one or two zeros in feature_values, if there is any
|
||||
if self.end_negative < self.start_positive:
|
||||
self.start_positive -= 1
|
||||
feature_values[self.start_positive] = 0.
|
||||
|
||||
if self.end_negative != self.start_positive:
|
||||
feature_values[self.end_negative] = 0.
|
||||
self.end_negative += 1
|
||||
|
||||
# XXX: When sparse supports missing values, this should be set to the
|
||||
# number of missing values for current_feature
|
||||
self.n_missing = 0
|
||||
|
||||
cdef inline void find_min_max(
|
||||
self,
|
||||
intp_t current_feature,
|
||||
float32_t* min_feature_value_out,
|
||||
float32_t* max_feature_value_out,
|
||||
) noexcept nogil:
|
||||
"""Find the minimum and maximum value for current_feature."""
|
||||
cdef:
|
||||
intp_t p
|
||||
float32_t current_feature_value, min_feature_value, max_feature_value
|
||||
float32_t[::1] feature_values = self.feature_values
|
||||
|
||||
self.extract_nnz(current_feature)
|
||||
|
||||
if self.end_negative != self.start_positive:
|
||||
# There is a zero
|
||||
min_feature_value = 0
|
||||
max_feature_value = 0
|
||||
else:
|
||||
min_feature_value = feature_values[self.start]
|
||||
max_feature_value = min_feature_value
|
||||
|
||||
# Find min, max in feature_values[start:end_negative]
|
||||
for p in range(self.start, self.end_negative):
|
||||
current_feature_value = feature_values[p]
|
||||
|
||||
if current_feature_value < min_feature_value:
|
||||
min_feature_value = current_feature_value
|
||||
elif current_feature_value > max_feature_value:
|
||||
max_feature_value = current_feature_value
|
||||
|
||||
# Update min, max given feature_values[start_positive:end]
|
||||
for p in range(self.start_positive, self.end):
|
||||
current_feature_value = feature_values[p]
|
||||
|
||||
if current_feature_value < min_feature_value:
|
||||
min_feature_value = current_feature_value
|
||||
elif current_feature_value > max_feature_value:
|
||||
max_feature_value = current_feature_value
|
||||
|
||||
min_feature_value_out[0] = min_feature_value
|
||||
max_feature_value_out[0] = max_feature_value
|
||||
|
||||
cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil:
|
||||
"""Compute the next p_prev and p for iterating over feature values."""
|
||||
cdef intp_t p_next
|
||||
|
||||
if p[0] + 1 != self.end_negative:
|
||||
p_next = p[0] + 1
|
||||
else:
|
||||
p_next = self.start_positive
|
||||
|
||||
while (p_next < self.end and
|
||||
self.feature_values[p_next] <= self.feature_values[p[0]] + FEATURE_THRESHOLD):
|
||||
p[0] = p_next
|
||||
if p[0] + 1 != self.end_negative:
|
||||
p_next = p[0] + 1
|
||||
else:
|
||||
p_next = self.start_positive
|
||||
|
||||
p_prev[0] = p[0]
|
||||
p[0] = p_next
|
||||
|
||||
cdef inline intp_t partition_samples(
|
||||
self,
|
||||
float64_t current_threshold
|
||||
) noexcept nogil:
|
||||
"""Partition samples for feature_values at the current_threshold."""
|
||||
return self._partition(current_threshold, self.start_positive)
|
||||
|
||||
cdef inline void partition_samples_final(
|
||||
self,
|
||||
intp_t best_pos,
|
||||
float64_t best_threshold,
|
||||
intp_t best_feature,
|
||||
intp_t n_missing,
|
||||
) noexcept nogil:
|
||||
"""Partition samples for X at the best_threshold and best_feature."""
|
||||
self.extract_nnz(best_feature)
|
||||
self._partition(best_threshold, best_pos)
|
||||
|
||||
cdef inline intp_t _partition(self, float64_t threshold, intp_t zero_pos) noexcept nogil:
|
||||
"""Partition samples[start:end] based on threshold."""
|
||||
cdef:
|
||||
intp_t p, partition_end
|
||||
intp_t[::1] index_to_samples = self.index_to_samples
|
||||
float32_t[::1] feature_values = self.feature_values
|
||||
intp_t[::1] samples = self.samples
|
||||
|
||||
if threshold < 0.:
|
||||
p = self.start
|
||||
partition_end = self.end_negative
|
||||
elif threshold > 0.:
|
||||
p = self.start_positive
|
||||
partition_end = self.end
|
||||
else:
|
||||
# Data are already split
|
||||
return zero_pos
|
||||
|
||||
while p < partition_end:
|
||||
if feature_values[p] <= threshold:
|
||||
p += 1
|
||||
|
||||
else:
|
||||
partition_end -= 1
|
||||
|
||||
feature_values[p], feature_values[partition_end] = (
|
||||
feature_values[partition_end], feature_values[p]
|
||||
)
|
||||
sparse_swap(index_to_samples, samples, p, partition_end)
|
||||
|
||||
return partition_end
|
||||
|
||||
cdef inline void extract_nnz(self, intp_t feature) noexcept nogil:
|
||||
"""Extract and partition values for a given feature.
|
||||
|
||||
The extracted values are partitioned between negative values
|
||||
feature_values[start:end_negative[0]] and positive values
|
||||
feature_values[start_positive[0]:end].
|
||||
The samples and index_to_samples are modified according to this
|
||||
partition.
|
||||
|
||||
The extraction corresponds to the intersection between the arrays
|
||||
X_indices[indptr_start:indptr_end] and samples[start:end].
|
||||
This is done efficiently using either an index_to_samples based approach
|
||||
or binary search based approach.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
feature : intp_t,
|
||||
Index of the feature we want to extract non zero value.
|
||||
"""
|
||||
cdef intp_t[::1] samples = self.samples
|
||||
cdef float32_t[::1] feature_values = self.feature_values
|
||||
cdef intp_t indptr_start = self.X_indptr[feature]
|
||||
cdef intp_t indptr_end = self.X_indptr[feature + 1]
|
||||
cdef intp_t n_indices = <intp_t>(indptr_end - indptr_start)
|
||||
cdef intp_t n_samples = self.end - self.start
|
||||
cdef intp_t[::1] index_to_samples = self.index_to_samples
|
||||
cdef intp_t[::1] sorted_samples = self.sorted_samples
|
||||
cdef const int32_t[::1] X_indices = self.X_indices
|
||||
cdef const float32_t[::1] X_data = self.X_data
|
||||
|
||||
# Use binary search if n_samples * log(n_indices) <
|
||||
# n_indices and index_to_samples approach otherwise.
|
||||
# O(n_samples * log(n_indices)) is the running time of binary
|
||||
# search and O(n_indices) is the running time of index_to_samples
|
||||
# approach.
|
||||
if ((1 - self.is_samples_sorted) * n_samples * log2(n_samples) +
|
||||
n_samples * log2(n_indices) < EXTRACT_NNZ_SWITCH * n_indices):
|
||||
extract_nnz_binary_search(X_indices, X_data,
|
||||
indptr_start, indptr_end,
|
||||
samples, self.start, self.end,
|
||||
index_to_samples,
|
||||
feature_values,
|
||||
&self.end_negative, &self.start_positive,
|
||||
sorted_samples, &self.is_samples_sorted)
|
||||
|
||||
# Using an index to samples technique to extract non zero values
|
||||
# index_to_samples is a mapping from X_indices to samples
|
||||
else:
|
||||
extract_nnz_index_to_samples(X_indices, X_data,
|
||||
indptr_start, indptr_end,
|
||||
samples, self.start, self.end,
|
||||
index_to_samples,
|
||||
feature_values,
|
||||
&self.end_negative, &self.start_positive)
|
||||
|
||||
|
||||
cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil:
|
||||
"""Comparison function for sort.
|
||||
|
||||
This must return an `int` as it is used by stdlib's qsort, which expects
|
||||
an `int` return value.
|
||||
"""
|
||||
return <int>((<intp_t*>a)[0] - (<intp_t*>b)[0])
|
||||
|
||||
|
||||
cdef inline void binary_search(const int32_t[::1] sorted_array,
|
||||
int32_t start, int32_t end,
|
||||
intp_t value, intp_t* index,
|
||||
int32_t* new_start) noexcept nogil:
|
||||
"""Return the index of value in the sorted array.
|
||||
|
||||
If not found, return -1. new_start is the last pivot + 1
|
||||
"""
|
||||
cdef int32_t pivot
|
||||
index[0] = -1
|
||||
while start < end:
|
||||
pivot = start + (end - start) / 2
|
||||
|
||||
if sorted_array[pivot] == value:
|
||||
index[0] = pivot
|
||||
start = pivot + 1
|
||||
break
|
||||
|
||||
if sorted_array[pivot] < value:
|
||||
start = pivot + 1
|
||||
else:
|
||||
end = pivot
|
||||
new_start[0] = start
|
||||
|
||||
|
||||
cdef inline void extract_nnz_index_to_samples(const int32_t[::1] X_indices,
|
||||
const float32_t[::1] X_data,
|
||||
int32_t indptr_start,
|
||||
int32_t indptr_end,
|
||||
intp_t[::1] samples,
|
||||
intp_t start,
|
||||
intp_t end,
|
||||
intp_t[::1] index_to_samples,
|
||||
float32_t[::1] feature_values,
|
||||
intp_t* end_negative,
|
||||
intp_t* start_positive) noexcept nogil:
|
||||
"""Extract and partition values for a feature using index_to_samples.
|
||||
|
||||
Complexity is O(indptr_end - indptr_start).
|
||||
"""
|
||||
cdef int32_t k
|
||||
cdef intp_t index
|
||||
cdef intp_t end_negative_ = start
|
||||
cdef intp_t start_positive_ = end
|
||||
|
||||
for k in range(indptr_start, indptr_end):
|
||||
if start <= index_to_samples[X_indices[k]] < end:
|
||||
if X_data[k] > 0:
|
||||
start_positive_ -= 1
|
||||
feature_values[start_positive_] = X_data[k]
|
||||
index = index_to_samples[X_indices[k]]
|
||||
sparse_swap(index_to_samples, samples, index, start_positive_)
|
||||
|
||||
elif X_data[k] < 0:
|
||||
feature_values[end_negative_] = X_data[k]
|
||||
index = index_to_samples[X_indices[k]]
|
||||
sparse_swap(index_to_samples, samples, index, end_negative_)
|
||||
end_negative_ += 1
|
||||
|
||||
# Returned values
|
||||
end_negative[0] = end_negative_
|
||||
start_positive[0] = start_positive_
|
||||
|
||||
|
||||
cdef inline void extract_nnz_binary_search(const int32_t[::1] X_indices,
|
||||
const float32_t[::1] X_data,
|
||||
int32_t indptr_start,
|
||||
int32_t indptr_end,
|
||||
intp_t[::1] samples,
|
||||
intp_t start,
|
||||
intp_t end,
|
||||
intp_t[::1] index_to_samples,
|
||||
float32_t[::1] feature_values,
|
||||
intp_t* end_negative,
|
||||
intp_t* start_positive,
|
||||
intp_t[::1] sorted_samples,
|
||||
bint* is_samples_sorted) noexcept nogil:
|
||||
"""Extract and partition values for a given feature using binary search.
|
||||
|
||||
If n_samples = end - start and n_indices = indptr_end - indptr_start,
|
||||
the complexity is
|
||||
|
||||
O((1 - is_samples_sorted[0]) * n_samples * log(n_samples) +
|
||||
n_samples * log(n_indices)).
|
||||
"""
|
||||
cdef intp_t n_samples
|
||||
|
||||
if not is_samples_sorted[0]:
|
||||
n_samples = end - start
|
||||
memcpy(&sorted_samples[start], &samples[start],
|
||||
n_samples * sizeof(intp_t))
|
||||
qsort(&sorted_samples[start], n_samples, sizeof(intp_t),
|
||||
compare_SIZE_t)
|
||||
is_samples_sorted[0] = 1
|
||||
|
||||
while (indptr_start < indptr_end and
|
||||
sorted_samples[start] > X_indices[indptr_start]):
|
||||
indptr_start += 1
|
||||
|
||||
while (indptr_start < indptr_end and
|
||||
sorted_samples[end - 1] < X_indices[indptr_end - 1]):
|
||||
indptr_end -= 1
|
||||
|
||||
cdef intp_t p = start
|
||||
cdef intp_t index
|
||||
cdef intp_t k
|
||||
cdef intp_t end_negative_ = start
|
||||
cdef intp_t start_positive_ = end
|
||||
|
||||
while (p < end and indptr_start < indptr_end):
|
||||
# Find index of sorted_samples[p] in X_indices
|
||||
binary_search(X_indices, indptr_start, indptr_end,
|
||||
sorted_samples[p], &k, &indptr_start)
|
||||
|
||||
if k != -1:
|
||||
# If k != -1, we have found a non zero value
|
||||
|
||||
if X_data[k] > 0:
|
||||
start_positive_ -= 1
|
||||
feature_values[start_positive_] = X_data[k]
|
||||
index = index_to_samples[X_indices[k]]
|
||||
sparse_swap(index_to_samples, samples, index, start_positive_)
|
||||
|
||||
elif X_data[k] < 0:
|
||||
feature_values[end_negative_] = X_data[k]
|
||||
index = index_to_samples[X_indices[k]]
|
||||
sparse_swap(index_to_samples, samples, index, end_negative_)
|
||||
end_negative_ += 1
|
||||
p += 1
|
||||
|
||||
# Returned values
|
||||
end_negative[0] = end_negative_
|
||||
start_positive[0] = start_positive_
|
||||
|
||||
|
||||
cdef inline void sparse_swap(intp_t[::1] index_to_samples, intp_t[::1] samples,
|
||||
intp_t pos_1, intp_t pos_2) noexcept nogil:
|
||||
"""Swap sample pos_1 and pos_2 preserving sparse invariant."""
|
||||
samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1]
|
||||
index_to_samples[samples[pos_1]] = pos_1
|
||||
index_to_samples[samples[pos_2]] = pos_2
|
||||
|
||||
|
||||
cdef inline void shift_missing_values_to_left_if_required(
|
||||
SplitRecord* best,
|
||||
intp_t[::1] samples,
|
||||
intp_t end,
|
||||
) noexcept nogil:
|
||||
"""Shift missing value sample indices to the left of the split if required.
|
||||
|
||||
Note: this should always be called at the very end because it will
|
||||
move samples around, thereby affecting the criterion.
|
||||
This affects the computation of the children impurity, which affects
|
||||
the computation of the next node.
|
||||
"""
|
||||
cdef intp_t i, p, current_end
|
||||
# The partitioner partitions the data such that the missing values are in
|
||||
# samples[-n_missing:] for the criterion to consume. If the missing values
|
||||
# are going to the right node, then the missing values are already in the
|
||||
# correct position. If the missing values go left, then we move the missing
|
||||
# values to samples[best.pos:best.pos+n_missing] and update `best.pos`.
|
||||
if best.n_missing > 0 and best.missing_go_to_left:
|
||||
for p in range(best.n_missing):
|
||||
i = best.pos + p
|
||||
current_end = end - 1 - p
|
||||
samples[i], samples[current_end] = samples[current_end], samples[i]
|
||||
best.pos += best.n_missing
|
||||
|
||||
|
||||
def _py_sort(float32_t[::1] feature_values, intp_t[::1] samples, intp_t n):
|
||||
"""Used for testing sort."""
|
||||
sort(&feature_values[0], &samples[0], n)
|
||||
|
||||
|
||||
# Sort n-element arrays pointed to by feature_values and samples, simultaneously,
|
||||
# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997).
|
||||
cdef void sort(floating* feature_values, intp_t* samples, intp_t n) noexcept nogil:
|
||||
if n == 0:
|
||||
return
|
||||
cdef intp_t maxd = 2 * <intp_t>log2(n)
|
||||
introsort(feature_values, samples, n, maxd)
|
||||
|
||||
|
||||
cdef inline void swap(floating* feature_values, intp_t* samples,
|
||||
intp_t i, intp_t j) noexcept nogil:
|
||||
# Helper for sort
|
||||
feature_values[i], feature_values[j] = feature_values[j], feature_values[i]
|
||||
samples[i], samples[j] = samples[j], samples[i]
|
||||
|
||||
|
||||
cdef inline floating median3(floating* feature_values, intp_t n) noexcept nogil:
|
||||
# Median of three pivot selection, after Bentley and McIlroy (1993).
|
||||
# Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
|
||||
cdef floating a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1]
|
||||
if a < b:
|
||||
if b < c:
|
||||
return b
|
||||
elif a < c:
|
||||
return c
|
||||
else:
|
||||
return a
|
||||
elif b < c:
|
||||
if a < c:
|
||||
return a
|
||||
else:
|
||||
return c
|
||||
else:
|
||||
return b
|
||||
|
||||
|
||||
# Introsort with median of 3 pivot selection and 3-way partition function
|
||||
# (robust to repeated elements, e.g. lots of zero features).
|
||||
cdef void introsort(floating* feature_values, intp_t *samples,
|
||||
intp_t n, intp_t maxd) noexcept nogil:
|
||||
cdef floating pivot
|
||||
cdef intp_t i, l, r
|
||||
|
||||
while n > 1:
|
||||
if maxd <= 0: # max depth limit exceeded ("gone quadratic")
|
||||
heapsort(feature_values, samples, n)
|
||||
return
|
||||
maxd -= 1
|
||||
|
||||
pivot = median3(feature_values, n)
|
||||
|
||||
# Three-way partition.
|
||||
i = l = 0
|
||||
r = n
|
||||
while i < r:
|
||||
if feature_values[i] < pivot:
|
||||
swap(feature_values, samples, i, l)
|
||||
i += 1
|
||||
l += 1
|
||||
elif feature_values[i] > pivot:
|
||||
r -= 1
|
||||
swap(feature_values, samples, i, r)
|
||||
else:
|
||||
i += 1
|
||||
|
||||
introsort(feature_values, samples, l, maxd)
|
||||
feature_values += r
|
||||
samples += r
|
||||
n -= r
|
||||
|
||||
|
||||
cdef inline void sift_down(floating* feature_values, intp_t* samples,
|
||||
intp_t start, intp_t end) noexcept nogil:
|
||||
# Restore heap order in feature_values[start:end] by moving the max element to start.
|
||||
cdef intp_t child, maxind, root
|
||||
|
||||
root = start
|
||||
while True:
|
||||
child = root * 2 + 1
|
||||
|
||||
# find max of root, left child, right child
|
||||
maxind = root
|
||||
if child < end and feature_values[maxind] < feature_values[child]:
|
||||
maxind = child
|
||||
if child + 1 < end and feature_values[maxind] < feature_values[child + 1]:
|
||||
maxind = child + 1
|
||||
|
||||
if maxind == root:
|
||||
break
|
||||
else:
|
||||
swap(feature_values, samples, root, maxind)
|
||||
root = maxind
|
||||
|
||||
|
||||
cdef void heapsort(floating* feature_values, intp_t* samples, intp_t n) noexcept nogil:
|
||||
cdef intp_t start, end
|
||||
|
||||
# heapify
|
||||
start = (n - 2) / 2
|
||||
end = n
|
||||
while True:
|
||||
sift_down(feature_values, samples, start, end)
|
||||
if start == 0:
|
||||
break
|
||||
start -= 1
|
||||
|
||||
# sort by shrinking the heap, putting the max element immediately after it
|
||||
end = n - 1
|
||||
while end > 0:
|
||||
swap(feature_values, samples, 0, end)
|
||||
sift_down(feature_values, samples, 0, end)
|
||||
end = end - 1
|
||||
Reference in New Issue
Block a user