Videre
This commit is contained in:
@@ -0,0 +1,213 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from sklearn.utils._array_api import (
|
||||
_find_matching_floating_dtype,
|
||||
get_namespace_and_device,
|
||||
)
|
||||
|
||||
|
||||
def _weighted_percentile(
|
||||
array, sample_weight, percentile_rank=50, average=False, xp=None
|
||||
):
|
||||
"""Compute the weighted percentile.
|
||||
|
||||
Implement an array API compatible (weighted version) of NumPy's 'inverted_cdf'
|
||||
method when `average=False` (default) and 'averaged_inverted_cdf' when
|
||||
`average=True`.
|
||||
|
||||
For an array ordered by increasing values, when the percentile lies exactly on a
|
||||
data point:
|
||||
|
||||
* 'inverted_cdf' takes the exact data point.
|
||||
* 'averaged_inverted_cdf' takes the average of the exact data point and the one
|
||||
above it (this means it gives the same result as `median` for unit weights).
|
||||
|
||||
E.g., for the array [1, 2, 3, 4] the percentile rank at each data point would
|
||||
be [25, 50, 75, 100]. Percentile rank 50 lies on '2'. 'average_inverted_cdf'
|
||||
computes the average of '2' and '3', making it 'symmetrical' because if you
|
||||
reverse the array, rank 50 would fall on '3'. It also matches 'median'.
|
||||
On the other hand, 'inverted_cdf', which does not satisfy the symmetry property,
|
||||
would give '2'.
|
||||
|
||||
When the requested percentile lies between two data points, both methods return
|
||||
the higher data point.
|
||||
E.g., for the array [1, 2, 3, 4, 5] the percentile rank at each data point would
|
||||
be [20, 40, 60, 80, 100]. Percentile rank 50, lies between '2' and '3'. Taking the
|
||||
higher data point is symmetrical because if you reverse the array, 50 would lie
|
||||
between '4' and '3'. Both methods match median in this case.
|
||||
|
||||
If `array` is a 2D array, the `values` are selected along axis 0.
|
||||
|
||||
`NaN` values are ignored by setting their weights to 0. If `array` is 2D, this
|
||||
is done in a column-isolated manner: a `NaN` in the second column, does not impact
|
||||
the percentile computed for the first column even if `sample_weight` is 1D.
|
||||
|
||||
.. versionchanged:: 0.24
|
||||
Accepts 2D `array`.
|
||||
|
||||
.. versionchanged:: 1.7
|
||||
Supports handling of `NaN` values.
|
||||
|
||||
.. versionchanged:: 1.8
|
||||
Supports `average`, which calculates percentile using the
|
||||
"averaged_inverted_cdf" method.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
array : 1D or 2D array
|
||||
Values to take the weighted percentile of.
|
||||
|
||||
sample_weight: 1D or 2D array
|
||||
Weights for each value in `array`. Must be same shape as `array` or of shape
|
||||
`(array.shape[0],)`.
|
||||
|
||||
percentile_rank: scalar or 1D array, default=50
|
||||
The probability level(s) of the percentile(s) to compute, in percent. Must be
|
||||
between 0 and 100. If a 1D array, computes all percentiles (along each
|
||||
axis 0 if `array` is 2D).
|
||||
|
||||
average : bool, default=False
|
||||
If `True`, uses the "averaged_inverted_cdf" quantile method, otherwise
|
||||
defaults to "inverted_cdf". "averaged_inverted_cdf" is symmetrical with
|
||||
unit `sample_weight`, such that the total of `sample_weight` below or equal to
|
||||
`_weighted_percentile(percentile_rank)` is the same as the total of
|
||||
`sample_weight` above or equal to `_weighted_percentile(100-percentile_rank)`.
|
||||
This symmetry is not guaranteed with non-unit weights.
|
||||
|
||||
xp : array_namespace, default=None
|
||||
The standard-compatible namespace for `array`. Default: infer.
|
||||
|
||||
Returns
|
||||
-------
|
||||
percentile : scalar, 1D array, or 2D array
|
||||
Weighted percentile at the requested probability level(s).
|
||||
If `array` is 1D and `percentile_rank` is scalar, returns a scalar.
|
||||
If `array` is 2D and `percentile_rank` is scalar, returns a 1D array
|
||||
of shape `(array.shape[1],)`
|
||||
If `array` is 1D and `percentile_rank` is 1D, returns a 1D array
|
||||
of shape `(percentile_rank.shape[0],)`
|
||||
If `array` is 2D and `percentile_rank` is 1D, returns a 2D array
|
||||
of shape `(array.shape[1], percentile_rank.shape[0])`
|
||||
"""
|
||||
xp, _, device = get_namespace_and_device(array)
|
||||
# `sample_weight` should follow `array` for dtypes
|
||||
floating_dtype = _find_matching_floating_dtype(array, xp=xp)
|
||||
array = xp.asarray(array, dtype=floating_dtype, device=device)
|
||||
sample_weight = xp.asarray(sample_weight, dtype=floating_dtype, device=device)
|
||||
percentile_rank = xp.asarray(percentile_rank, dtype=floating_dtype, device=device)
|
||||
|
||||
n_dim = array.ndim
|
||||
if n_dim == 0:
|
||||
return array
|
||||
if array.ndim == 1:
|
||||
array = xp.reshape(array, (-1, 1))
|
||||
# When sample_weight 1D, repeat for each array.shape[1]
|
||||
if array.shape != sample_weight.shape and array.shape[0] == sample_weight.shape[0]:
|
||||
sample_weight = xp.tile(sample_weight, (array.shape[1], 1)).T
|
||||
|
||||
n_dim_percentile = percentile_rank.ndim
|
||||
if n_dim_percentile == 0:
|
||||
percentile_rank = xp.reshape(percentile_rank, (1,))
|
||||
|
||||
# Sort `array` and `sample_weight` along axis=0:
|
||||
sorted_idx = xp.argsort(array, axis=0, stable=False)
|
||||
sorted_weights = xp.take_along_axis(sample_weight, sorted_idx, axis=0)
|
||||
|
||||
# Set NaN values in `sample_weight` to 0. Only perform this operation if NaN
|
||||
# values present to avoid temporary allocations of size `(n_samples, n_features)`.
|
||||
n_features = array.shape[1]
|
||||
largest_value_per_column = array[
|
||||
sorted_idx[-1, ...], xp.arange(n_features, device=device)
|
||||
]
|
||||
# NaN values get sorted to end (largest value)
|
||||
if xp.any(xp.isnan(largest_value_per_column)):
|
||||
sorted_nan_mask = xp.take_along_axis(xp.isnan(array), sorted_idx, axis=0)
|
||||
sorted_weights[sorted_nan_mask] = 0
|
||||
|
||||
# Compute the weighted cumulative distribution function (CDF) based on
|
||||
# `sample_weight` and scale `percentile_rank` along it.
|
||||
#
|
||||
# Note: we call `xp.cumulative_sum` on the transposed `sorted_weights` to
|
||||
# ensure that the result is of shape `(n_features, n_samples)` so
|
||||
# `xp.searchsorted` calls take contiguous inputs as a result (for
|
||||
# performance reasons).
|
||||
weight_cdf = xp.cumulative_sum(sorted_weights.T, axis=1)
|
||||
|
||||
n_percentiles = percentile_rank.shape[0]
|
||||
result = xp.empty((n_features, n_percentiles), dtype=floating_dtype, device=device)
|
||||
|
||||
for p_idx, p_rank in enumerate(percentile_rank):
|
||||
adjusted_percentile_rank = p_rank / 100 * weight_cdf[..., -1]
|
||||
|
||||
# Ignore leading `sample_weight=0` observations
|
||||
# when `percentile_rank=0` (#20528)
|
||||
mask = adjusted_percentile_rank == 0
|
||||
adjusted_percentile_rank[mask] = xp.nextafter(
|
||||
adjusted_percentile_rank[mask], adjusted_percentile_rank[mask] + 1
|
||||
)
|
||||
# For each feature with index j, find sample index i of the scalar value
|
||||
# `adjusted_percentile_rank[j]` in 1D array `weight_cdf[j]`, such that:
|
||||
# weight_cdf[j, i-1] < adjusted_percentile_rank[j] <= weight_cdf[j, i].
|
||||
# Note `searchsorted` defaults to equality on the right, whereas Hyndman and Fan
|
||||
# reference equation has equality on the left.
|
||||
percentile_indices = xp.stack(
|
||||
[
|
||||
xp.searchsorted(
|
||||
weight_cdf[feature_idx, ...], adjusted_percentile_rank[feature_idx]
|
||||
)
|
||||
for feature_idx in range(weight_cdf.shape[0])
|
||||
],
|
||||
)
|
||||
# `percentile_indices` may be equal to `sorted_idx.shape[0]` due to floating
|
||||
# point error (see #11813)
|
||||
max_idx = sorted_idx.shape[0] - 1
|
||||
percentile_indices = xp.clip(percentile_indices, 0, max_idx)
|
||||
|
||||
col_indices = xp.arange(array.shape[1], device=device)
|
||||
percentile_in_sorted = sorted_idx[percentile_indices, col_indices]
|
||||
|
||||
if average:
|
||||
# From Hyndman and Fan (1996), `fraction_above` is `g`
|
||||
fraction_above = (
|
||||
weight_cdf[col_indices, percentile_indices] - adjusted_percentile_rank
|
||||
)
|
||||
is_fraction_above = fraction_above > xp.finfo(floating_dtype).eps
|
||||
percentile_plus_one_indices = xp.clip(percentile_indices + 1, 0, max_idx)
|
||||
percentile_plus_one_in_sorted = sorted_idx[
|
||||
percentile_plus_one_indices, col_indices
|
||||
]
|
||||
# Handle case when next index ('plus one') has sample weight of 0
|
||||
zero_weight_cols = col_indices[
|
||||
sample_weight[percentile_plus_one_in_sorted, col_indices] == 0
|
||||
]
|
||||
for col_idx in zero_weight_cols:
|
||||
cdf_val = weight_cdf[col_idx, percentile_indices[col_idx]]
|
||||
# Search for next index where `weighted_cdf` is greater
|
||||
next_index = xp.searchsorted(
|
||||
weight_cdf[col_idx, ...], cdf_val, side="right"
|
||||
)
|
||||
# Handle case where there are trailing 0 sample weight samples
|
||||
# and `percentile_indices` is already max index
|
||||
if next_index >= max_idx:
|
||||
# use original `percentile_indices` again
|
||||
next_index = percentile_indices[col_idx]
|
||||
|
||||
percentile_plus_one_in_sorted[col_idx] = sorted_idx[next_index, col_idx]
|
||||
|
||||
result[..., p_idx] = xp.where(
|
||||
is_fraction_above,
|
||||
array[percentile_in_sorted, col_indices],
|
||||
(
|
||||
array[percentile_in_sorted, col_indices]
|
||||
+ array[percentile_plus_one_in_sorted, col_indices]
|
||||
)
|
||||
/ 2,
|
||||
)
|
||||
else:
|
||||
result[..., p_idx] = array[percentile_in_sorted, col_indices]
|
||||
|
||||
if n_dim_percentile == 0:
|
||||
result = result[..., 0]
|
||||
|
||||
return result[0, ...] if n_dim == 1 else result
|
||||
Reference in New Issue
Block a user