Videre
This commit is contained in:
@@ -0,0 +1,28 @@
|
||||
"""Transformers for missing value imputation."""
|
||||
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import typing
|
||||
|
||||
from sklearn.impute._base import MissingIndicator, SimpleImputer
|
||||
from sklearn.impute._knn import KNNImputer
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
# Avoid errors in type checkers (e.g. mypy) for experimental estimators.
|
||||
# TODO: remove this check once the estimator is no longer experimental.
|
||||
from sklearn.impute._iterative import IterativeImputer # noqa: F401
|
||||
|
||||
__all__ = ["KNNImputer", "MissingIndicator", "SimpleImputer"]
|
||||
|
||||
|
||||
# TODO: remove this check once the estimator is no longer experimental.
|
||||
def __getattr__(name):
|
||||
if name == "IterativeImputer":
|
||||
raise ImportError(
|
||||
f"{name} is experimental and the API might change without any "
|
||||
"deprecation cycle. To use it, you need to explicitly import "
|
||||
"enable_iterative_imputer:\n"
|
||||
"from sklearn.experimental import enable_iterative_imputer"
|
||||
)
|
||||
raise AttributeError(f"module {__name__} has no attribute {name}")
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,411 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.base import _fit_context
|
||||
from sklearn.impute._base import _BaseImputer
|
||||
from sklearn.metrics import pairwise_distances_chunked
|
||||
from sklearn.metrics.pairwise import _NAN_METRICS
|
||||
from sklearn.neighbors._base import _get_weights
|
||||
from sklearn.utils._mask import _get_mask
|
||||
from sklearn.utils._missing import is_scalar_nan
|
||||
from sklearn.utils._param_validation import Hidden, Interval, StrOptions
|
||||
from sklearn.utils.validation import (
|
||||
FLOAT_DTYPES,
|
||||
_check_feature_names_in,
|
||||
check_is_fitted,
|
||||
validate_data,
|
||||
)
|
||||
|
||||
|
||||
class KNNImputer(_BaseImputer):
    """Imputation for completing missing values using k-Nearest Neighbors.

    Each sample's missing values are imputed using the mean value from
    `n_neighbors` nearest neighbors found in the training set. Two samples are
    close if the features that neither is missing are close.

    Read more in the :ref:`User Guide <knnimpute>`.

    .. versionadded:: 0.22

    Parameters
    ----------
    missing_values : int, float, str, np.nan or None, default=np.nan
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For pandas' dataframes with
        nullable integer dtypes with missing values, `missing_values`
        should be set to np.nan, since `pd.NA` will be converted to np.nan.

    n_neighbors : int, default=5
        Number of neighboring samples to use for imputation.

    weights : {'uniform', 'distance'} or callable, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood are
          weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - callable : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    metric : {'nan_euclidean'} or callable, default='nan_euclidean'
        Distance metric for searching neighbors. Possible values:

        - 'nan_euclidean'
        - callable : a user-defined function which conforms to the definition
          of ``func_metric(x, y, *, missing_values=np.nan)``. `x` and `y`
          correspond to a row (i.e. 1-D arrays) of `X` and `Y`, respectively.
          The callable should return a scalar distance value.

    copy : bool, default=True
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible.

    add_indicator : bool, default=False
        If True, a :class:`MissingIndicator` transform will stack onto the
        output of the imputer's transform. This allows a predictive estimator
        to account for missingness despite imputation. If a feature has no
        missing values at fit/train time, the feature won't appear on the
        missing indicator even if there are missing values at transform/test
        time.

    keep_empty_features : bool, default=False
        If True, features that consist exclusively of missing values when
        `fit` is called are returned in results when `transform` is called.
        The imputed value is always `0`.

        .. versionadded:: 1.2

    Attributes
    ----------
    indicator_ : :class:`~sklearn.impute.MissingIndicator`
        Indicator used to add binary indicators for missing values.
        ``None`` if add_indicator is False.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    SimpleImputer : Univariate imputer for completing missing values
        with simple strategies.
    IterativeImputer : Multivariate imputer that estimates values to impute for
        each feature with missing values from all the others.

    References
    ----------
    * `Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
      Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
      value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
      no. 6, 2001 Pages 520-525.
      <https://academic.oup.com/bioinformatics/article/17/6/520/272365>`_

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import KNNImputer
    >>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
    >>> imputer = KNNImputer(n_neighbors=2)
    >>> imputer.fit_transform(X)
    array([[1. , 2. , 4. ],
           [3. , 4. , 3. ],
           [5.5, 6. , 5. ],
           [8. , 8. , 7. ]])

    For a more detailed example see
    :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`.
    """

    _parameter_constraints: dict = {
        **_BaseImputer._parameter_constraints,
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "weights": [StrOptions({"uniform", "distance"}), callable, Hidden(None)],
        "metric": [StrOptions(set(_NAN_METRICS)), callable],
        "copy": ["boolean"],
    }

    def __init__(
        self,
        *,
        missing_values=np.nan,
        n_neighbors=5,
        weights="uniform",
        metric="nan_euclidean",
        copy=True,
        add_indicator=False,
        keep_empty_features=False,
    ):
        super().__init__(
            missing_values=missing_values,
            add_indicator=add_indicator,
            keep_empty_features=keep_empty_features,
        )
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.copy = copy

    def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):
        """Helper function to impute a single column.

        Parameters
        ----------
        dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)
            Distance matrix between the receivers and potential donors from
            training set. There must be at least one non-nan distance between
            a receiver and a potential donor.

        n_neighbors : int
            Number of neighbors to consider.

        fit_X_col : ndarray of shape (n_potential_donors,)
            Column of potential donors from training set.

        mask_fit_X_col : ndarray of shape (n_potential_donors,)
            Missing mask for fit_X_col.

        Returns
        -------
        imputed_values: ndarray of shape (n_receivers,)
            Imputed values for receiver.
        """
        # Get donors: argpartition gives the n_neighbors smallest distances
        # per row without a full sort.
        donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)[
            :, :n_neighbors
        ]

        # Get weight matrix from distance matrix
        donors_dist = dist_pot_donors[
            np.arange(donors_idx.shape[0])[:, None], donors_idx
        ]

        # None is returned for uniform weights; otherwise per-donor weights.
        weight_matrix = _get_weights(donors_dist, self.weights)

        # fill nans with zeros so nan-distance donors contribute nothing
        if weight_matrix is not None:
            weight_matrix[np.isnan(weight_matrix)] = 0.0
        else:
            weight_matrix = np.ones_like(donors_dist)
            weight_matrix[np.isnan(donors_dist)] = 0.0

        # Retrieve donor values and calculate kNN average; masked entries
        # (donors themselves missing in this column) are excluded.
        donors = fit_X_col.take(donors_idx)
        donors_mask = mask_fit_X_col.take(donors_idx)
        donors = np.ma.array(donors, mask=donors_mask)

        return np.ma.average(donors, axis=1, weights=weight_matrix).data

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : array-like shape of (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            The fitted `KNNImputer` class instance.
        """
        # Check data integrity and calling arguments
        if not is_scalar_nan(self.missing_values):
            ensure_all_finite = True
        else:
            # NaN is the missing marker, so NaNs must be allowed through.
            ensure_all_finite = "allow-nan"

        X = validate_data(
            self,
            X,
            accept_sparse=False,
            dtype=FLOAT_DTYPES,
            ensure_all_finite=ensure_all_finite,
            copy=self.copy,
        )

        self._fit_X = X
        self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
        # Columns that are not entirely missing during fit.
        self._valid_mask = ~np.all(self._mask_fit_X, axis=0)

        super()._fit_indicator(self._mask_fit_X)

        return self

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        X : array-like of shape (n_samples, n_output_features)
            The imputed dataset. `n_output_features` is the number of features
            that is not always missing during `fit`.
        """

        check_is_fitted(self)
        if not is_scalar_nan(self.missing_values):
            ensure_all_finite = True
        else:
            ensure_all_finite = "allow-nan"
        # force_writeable: missing entries of X are filled in place below.
        X = validate_data(
            self,
            X,
            accept_sparse=False,
            dtype=FLOAT_DTYPES,
            force_writeable=True,
            ensure_all_finite=ensure_all_finite,
            copy=self.copy,
            reset=False,
        )

        mask = _get_mask(X, self.missing_values)
        mask_fit_X = self._mask_fit_X
        valid_mask = self._valid_mask

        X_indicator = super()._transform_indicator(mask)

        # Removes columns where the training data is all nan
        if not np.any(mask[:, valid_mask]):
            # No missing values in X
            if self.keep_empty_features:
                Xc = X
                Xc[:, ~valid_mask] = 0
            else:
                Xc = X[:, valid_mask]

            # Even if there are no missing values in X, we still concatenate Xc
            # with the missing value indicator matrix, X_indicator.
            # This is to ensure that the output maintains consistency in terms
            # of columns, regardless of whether missing values exist in X or not.
            return super()._concatenate_indicator(Xc, X_indicator)

        # Rows that actually need imputation (missing in some valid column).
        row_missing_idx = np.flatnonzero(mask[:, valid_mask].any(axis=1))

        non_missing_fix_X = np.logical_not(mask_fit_X)

        # Maps from indices from X to indices in dist matrix
        dist_idx_map = np.zeros(X.shape[0], dtype=int)
        dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])

        def process_chunk(dist_chunk, start):
            # Impute, in place, the rows covered by this chunk of the
            # receiver/donor distance matrix.
            row_missing_chunk = row_missing_idx[start : start + len(dist_chunk)]

            # Find and impute missing by column
            for col in range(X.shape[1]):
                if not valid_mask[col]:
                    # column was all missing during training
                    continue

                col_mask = mask[row_missing_chunk, col]
                if not np.any(col_mask):
                    # column has no missing values
                    continue

                (potential_donors_idx,) = np.nonzero(non_missing_fix_X[:, col])

                # receivers_idx are indices in X
                receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]

                # distances for samples that needed imputation for column
                dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
                    :, potential_donors_idx
                ]

                # receivers with all nan distances impute with mean
                all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
                all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]

                if all_nan_receivers_idx.size:
                    # Fall back to the column mean of the training data.
                    col_mean = np.ma.array(
                        self._fit_X[:, col], mask=mask_fit_X[:, col]
                    ).mean()
                    X[all_nan_receivers_idx, col] = col_mean

                    if len(all_nan_receivers_idx) == len(receivers_idx):
                        # all receivers imputed with mean
                        continue

                    # receivers with at least one defined distance
                    receivers_idx = receivers_idx[~all_nan_dist_mask]
                    dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
                        :, potential_donors_idx
                    ]

                n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
                value = self._calc_impute(
                    dist_subset,
                    n_neighbors,
                    self._fit_X[potential_donors_idx, col],
                    mask_fit_X[potential_donors_idx, col],
                )
                X[receivers_idx, col] = value

        # process in fixed-memory chunks
        gen = pairwise_distances_chunked(
            X[row_missing_idx, :],
            self._fit_X,
            metric=self.metric,
            missing_values=self.missing_values,
            ensure_all_finite=ensure_all_finite,
            reduce_func=process_chunk,
        )
        for chunk in gen:
            # process_chunk modifies X in place. No return value.
            pass

        if self.keep_empty_features:
            Xc = X
            Xc[:, ~valid_mask] = 0
        else:
            Xc = X[:, valid_mask]

        return super()._concatenate_indicator(Xc, X_indicator)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        # Drop names of columns that were entirely missing during fit.
        names = input_features[self._valid_mask]
        return self._concatenate_indicator_feature_names_out(names, input_features)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,107 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.impute._base import _BaseImputer
|
||||
from sklearn.impute._iterative import _assign_where
|
||||
from sklearn.utils._mask import _get_mask
|
||||
from sklearn.utils._testing import _convert_container, assert_allclose
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """Random (10, 2) matrix where every even-indexed row is fully missing."""
    arr = np.random.randn(10, 2)
    arr[::2] = np.nan
    return arr
|
||||
|
||||
|
||||
class NoFitIndicatorImputer(_BaseImputer):
    # Deliberately broken imputer: ``fit`` never calls ``_fit_indicator``, so
    # the ``_transform_indicator`` call in ``transform`` is expected to raise.
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self._concatenate_indicator(X, self._transform_indicator(X))
|
||||
|
||||
|
||||
class NoTransformIndicatorImputer(_BaseImputer):
    # Fits the indicator correctly but then passes ``None`` instead of a
    # transformed indicator to ``_concatenate_indicator``; used to check the
    # resulting error message.
    def fit(self, X, y=None):
        mask = _get_mask(X, value_to_mask=np.nan)
        super()._fit_indicator(mask)
        return self

    def transform(self, X, y=None):
        return self._concatenate_indicator(X, None)
|
||||
|
||||
|
||||
class NoPrecomputedMaskFit(_BaseImputer):
    # Passes raw data (not a boolean mask) to ``_fit_indicator``; used to
    # check the "not a mask" error message at fit time.
    def fit(self, X, y=None):
        self._fit_indicator(X)
        return self

    def transform(self, X):
        return self._concatenate_indicator(X, self._transform_indicator(X))
|
||||
|
||||
|
||||
class NoPrecomputedMaskTransform(_BaseImputer):
    # Fits on a proper mask but passes raw data to ``_transform_indicator``
    # at transform time; used to check the "not a mask" error message there.
    def fit(self, X, y=None):
        mask = _get_mask(X, value_to_mask=np.nan)
        self._fit_indicator(mask)
        return self

    def transform(self, X):
        return self._concatenate_indicator(X, self._transform_indicator(X))
|
||||
|
||||
|
||||
def test_base_imputer_not_fit(data):
    """Skipping _fit_indicator must raise when the indicator is transformed."""
    expected_msg = "Make sure to call _fit_indicator before _transform_indicator"
    est = NoFitIndicatorImputer(add_indicator=True)
    with pytest.raises(ValueError, match=expected_msg):
        est.fit(data).transform(data)
    with pytest.raises(ValueError, match=expected_msg):
        est.fit_transform(data)
|
||||
|
||||
|
||||
def test_base_imputer_not_transform(data):
    """Concatenating without a transformed indicator must raise."""
    expected_msg = (
        "Call _fit_indicator and _transform_indicator in the imputer implementation"
    )
    est = NoTransformIndicatorImputer(add_indicator=True)
    with pytest.raises(ValueError, match=expected_msg):
        est.fit(data).transform(data)
    with pytest.raises(ValueError, match=expected_msg):
        est.fit_transform(data)
|
||||
|
||||
|
||||
def test_base_no_precomputed_mask_fit(data):
    """_fit_indicator must reject raw (non-mask) input when precomputed."""
    expected_msg = "precomputed is True but the input data is not a mask"
    est = NoPrecomputedMaskFit(add_indicator=True)
    with pytest.raises(ValueError, match=expected_msg):
        est.fit(data)
    with pytest.raises(ValueError, match=expected_msg):
        est.fit_transform(data)
|
||||
|
||||
|
||||
def test_base_no_precomputed_mask_transform(data):
    """_transform_indicator must reject raw (non-mask) input when precomputed."""
    expected_msg = "precomputed is True but the input data is not a mask"
    est = NoPrecomputedMaskTransform(add_indicator=True)
    est.fit(data)
    with pytest.raises(ValueError, match=expected_msg):
        est.transform(data)
    with pytest.raises(ValueError, match=expected_msg):
        est.fit_transform(data)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X1_type", ["array", "dataframe"])
def test_assign_where(X1_type):
    """Check the behaviour of the private helpers `_assign_where`."""
    rng = np.random.RandomState(0)
    n_rows, n_cols = 10, 5

    target = _convert_container(rng.randn(n_rows, n_cols), constructor_name=X1_type)
    source = rng.randn(n_rows, n_cols)
    where = rng.randint(0, 2, size=(n_rows, n_cols)).astype(bool)

    _assign_where(target, source, where)

    if X1_type == "dataframe":
        target = target.to_numpy()
    assert_allclose(target[where], source[where])
|
||||
@@ -0,0 +1,226 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.experimental import enable_iterative_imputer # noqa: F401
|
||||
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
|
||||
def imputers():
    # One instance of each dense-capable imputer exercised by the common
    # tests.  These instances are shared across parametrized tests, so tests
    # that call set_params should clone first.
    return [IterativeImputer(tol=0.1), KNNImputer(), SimpleImputer()]
|
||||
|
||||
|
||||
def sparse_imputers():
    # Imputers exercised with scipy sparse input (see the *_sparse tests).
    return [SimpleImputer()]
|
||||
|
||||
|
||||
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
def test_imputation_missing_value_in_test_array(imputer):
    """Missing values only present at transform time must not raise.

    Non-regression test for issue #13968.
    """
    X_train = [[1], [2]]
    X_test = [[3], [np.nan]]
    est = clone(imputer)
    est.set_params(add_indicator=True)
    est.fit(X_train).transform(X_test)
|
||||
|
||||
|
||||
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1, 0])
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
def test_imputers_add_indicator(marker, imputer):
    """`add_indicator=True` appends one indicator column per feature that has
    missing values at fit time; disabling it only drops those columns."""
    X = np.array(
        [
            [marker, 1, 5, marker, 1],
            [2, marker, 1, marker, 2],
            [6, 3, marker, marker, 3],
            [1, 2, 9, marker, 4],
        ]
    )
    expected_indicator = np.array(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )
    est = clone(imputer)
    est.set_params(missing_values=marker, add_indicator=True)

    X_trans = est.fit_transform(X)
    assert_allclose(X_trans[:, -4:], expected_indicator)
    assert_array_equal(est.indicator_.features_, np.array([0, 1, 2, 3]))

    # Without the indicator, the leading (imputed) columns are unchanged.
    est.set_params(add_indicator=False)
    X_trans_no_indicator = est.fit_transform(X)
    assert_allclose(X_trans[:, :-4], X_trans_no_indicator)
|
||||
|
||||
|
||||
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1])
@pytest.mark.parametrize(
    "imputer", sparse_imputers(), ids=lambda x: x.__class__.__name__
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_imputers_add_indicator_sparse(imputer, marker, csr_container):
    """Sparse-input counterpart of `test_imputers_add_indicator`."""
    est = clone(imputer)  # Avoid side effects from shared instances.
    X = csr_container(
        [
            [marker, 1, 5, marker, 1],
            [2, marker, 1, marker, 2],
            [6, 3, marker, marker, 3],
            [1, 2, 9, marker, 4],
        ]
    )
    expected_indicator = csr_container(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )
    est.set_params(missing_values=marker, add_indicator=True)

    X_trans = est.fit_transform(X)
    assert_allclose_dense_sparse(X_trans[:, -4:], expected_indicator)
    assert_array_equal(est.indicator_.features_, np.array([0, 1, 2, 3]))

    est.set_params(add_indicator=False)
    X_trans_no_indicator = est.fit_transform(X)
    assert_allclose_dense_sparse(X_trans[:, :-4], X_trans_no_indicator)
|
||||
|
||||
|
||||
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
    """Imputers accept pandas nullable-integer columns with pd.NA missing.

    Fitting on the dataframe must give the same result as fitting on the
    equivalent float ndarray with np.nan.
    """
    # Test pandas IntegerArray with pd.NA
    pd = pytest.importorskip("pandas")
    marker = np.nan
    # Clone to avoid mutating the instance shared across parametrized tests
    # (consistent with the other common tests in this file).
    imputer = clone(imputer)
    imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)

    X = np.array(
        [
            [marker, 1, 5, marker, 1],
            [2, marker, 1, marker, 2],
            [6, 3, marker, marker, 3],
            [1, 2, 9, marker, 4],
        ]
    )
    # fit on numpy array
    X_trans_expected = imputer.fit_transform(X)

    # Creates dataframe with IntegerArrays with pd.NA
    X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"])

    # fit on pandas dataframe with IntegerArrays
    X_trans = imputer.fit_transform(X_df)

    assert_allclose(X_trans_expected, X_trans)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_feature_names_out_pandas(imputer, add_indicator):
    """Check feature names out for imputers."""
    pd = pytest.importorskip("pandas")
    marker = np.nan
    # Clone to avoid mutating the instance shared across parametrized tests
    # (consistent with the other common tests in this file).
    imputer = clone(imputer)
    imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)

    X = np.array(
        [
            [marker, 1, 5, 3, marker, 1],
            [2, marker, 1, 4, marker, 2],
            [6, 3, 7, marker, marker, 3],
            [1, 2, 9, 8, marker, 4],
        ]
    )
    X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"])
    imputer.fit(X_df)

    names = imputer.get_feature_names_out()

    if add_indicator:
        # Column "e" is entirely missing: it is dropped from the imputed
        # output but still tracked by the missing indicator.
        expected_names = [
            "a",
            "b",
            "c",
            "d",
            "f",
            "missingindicator_a",
            "missingindicator_b",
            "missingindicator_d",
            "missingindicator_e",
        ]
        assert_array_equal(expected_names, names)
    else:
        expected_names = ["a", "b", "c", "d", "f"]
        assert_array_equal(expected_names, names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keep_empty_features", [True, False])
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
def test_keep_empty_features(imputer, keep_empty_features):
    """Check that the imputer keeps features with only missing values."""
    X = np.array([[np.nan, 1], [np.nan, 2], [np.nan, 3]])
    est = clone(imputer).set_params(
        add_indicator=False, keep_empty_features=keep_empty_features
    )

    for method in ["fit_transform", "transform"]:
        X_imputed = getattr(est, method)(X)
        if keep_empty_features:
            assert X_imputed.shape == X.shape
        else:
            assert X_imputed.shape == (X.shape[0], X.shape[1] - 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("missing_value_test", [np.nan, 1])
def test_imputation_adds_missing_indicator_if_add_indicator_is_true(
    imputer, missing_value_test
):
    """Check that missing indicator always exists when add_indicator=True.

    Non-regression test for gh-26590.
    """
    X_train = np.array([[0, np.nan], [1, 2]])

    # Test data where missing_value_test variable can be set to np.nan or 1.
    X_test = np.array([[0, missing_value_test], [1, 2]])

    est = clone(imputer)
    est.set_params(add_indicator=True)
    est.fit(X_train)

    X_test_imputed_with_indicator = est.transform(X_test)
    assert X_test_imputed_with_indicator.shape == (2, 3)

    est.set_params(add_indicator=False)
    est.fit(X_train)
    X_test_imputed_without_indicator = est.transform(X_test)
    assert X_test_imputed_without_indicator.shape == (2, 2)

    # The imputed columns are identical with and without the indicator.
    assert_allclose(
        X_test_imputed_with_indicator[:, :-1], X_test_imputed_without_indicator
    )
    if np.isnan(missing_value_test):
        expected_missing_indicator = [1, 0]
    else:
        expected_missing_indicator = [0, 0]

    assert_allclose(X_test_imputed_with_indicator[:, -1], expected_missing_indicator)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,570 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import config_context
|
||||
from sklearn.impute import KNNImputer
|
||||
from sklearn.metrics.pairwise import nan_euclidean_distances, pairwise_distances
|
||||
from sklearn.neighbors import KNeighborsRegressor
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
|
||||
@pytest.mark.parametrize("weights", ["uniform", "distance"])
@pytest.mark.parametrize("n_neighbors", range(1, 6))
def test_knn_imputer_shape(weights, n_neighbors):
    """Imputed output keeps the input shape for all weights / n_neighbors."""
    n_rows, n_cols = 10, 2
    X = np.random.rand(n_rows, n_cols)
    X[0, 0] = np.nan

    est = KNNImputer(n_neighbors=n_neighbors, weights=weights)
    X_imputed = est.fit_transform(X)
    assert X_imputed.shape == (n_rows, n_cols)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_default_with_invalid_input(na):
    """Invalid inputs (inf, or NaN when it is not the missing marker) raise."""
    # Test with inf present
    X = np.array(
        [
            [np.inf, 1, 1, 2, na],
            [2, 1, 2, 2, 3],
            [3, 2, 3, 3, 8],
            [na, 6, 0, 5, 13],
            [na, 7, 0, 7, 8],
            [6, 6, 2, 5, 7],
        ]
    )
    with pytest.raises(ValueError, match="Input X contains (infinity|NaN)"):
        KNNImputer(missing_values=na).fit(X)

    # Test with inf present in matrix passed in transform()
    X = np.array(
        [
            [np.inf, 1, 1, 2, na],
            [2, 1, 2, 2, 3],
            [3, 2, 3, 3, 8],
            [na, 6, 0, 5, 13],
            [na, 7, 0, 7, 8],
            [6, 6, 2, 5, 7],
        ]
    )

    X_fit = np.array(
        [
            [0, 1, 1, 2, na],
            [2, 1, 2, 2, 3],
            [3, 2, 3, 3, 8],
            [na, 6, 0, 5, 13],
            [na, 7, 0, 7, 8],
            [6, 6, 2, 5, 7],
        ]
    )
    imputer = KNNImputer(missing_values=na).fit(X_fit)
    with pytest.raises(ValueError, match="Input X contains (infinity|NaN)"):
        imputer.transform(X)

    # Test with missing_values=0 when NaN present
    imputer = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform")
    X = np.array(
        [
            [np.nan, 0, 0, 0, 5],
            [np.nan, 1, 0, np.nan, 3],
            [np.nan, 2, 0, 0, 0],
            [np.nan, 6, 0, 5, 13],
        ]
    )
    msg = "Input X contains NaN"
    with pytest.raises(ValueError, match=msg):
        imputer.fit(X)
    # NOTE: a trailing, unused ``X = np.array([[0, 0], [np.nan, 2]])``
    # assignment (dead code, no assertion followed it) was removed.
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_removes_all_na_features(na):
    """Columns entirely missing at fit time are dropped by transform."""
    X = np.array(
        [
            [1, 1, na, 1, 1, 1.0],
            [2, 3, na, 2, 2, 2],
            [3, 4, na, 3, 3, na],
            [6, 4, na, na, 6, 6],
        ]
    )
    est = KNNImputer(missing_values=na, n_neighbors=2).fit(X)

    X_imputed = est.transform(X)
    assert not np.isnan(X_imputed).any()
    assert X_imputed.shape == (4, 5)

    # A fully observed test matrix only loses the all-missing column.
    X_test = np.arange(0, 12).reshape(2, 6)
    assert_allclose(X_test[:, [0, 1, 3, 4, 5]], est.transform(X_test))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_zero_nan_imputes_the_same(na):
    """Different missing markers (0 vs na) over the same pattern impute alike."""
    X_zero = np.array(
        [
            [1, 0, 1, 1, 1.0],
            [2, 2, 2, 2, 2],
            [3, 3, 3, 3, 0],
            [6, 6, 0, 6, 6],
        ]
    )

    X_nan = np.array(
        [
            [1, na, 1, 1, 1.0],
            [2, 2, 2, 2, 2],
            [3, 3, 3, 3, na],
            [6, 6, na, 6, 6],
        ]
    )

    X_expected = np.array(
        [
            [1, 2.5, 1, 1, 1.0],
            [2, 2, 2, 2, 2],
            [3, 3, 3, 3, 1.5],
            [6, 6, 2.5, 6, 6],
        ]
    )

    imputer_zero = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform")
    imputer_nan = KNNImputer(missing_values=na, n_neighbors=2, weights="uniform")

    assert_allclose(imputer_zero.fit_transform(X_zero), X_expected)
    assert_allclose(
        imputer_zero.fit_transform(X_zero), imputer_nan.fit_transform(X_nan)
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_verify(na):
    """Verify KNNImputer output against hand-computed expectations.

    Three scenarios: a fully imputable matrix, a matrix where a column has
    too few complete donor rows, and fitting/transforming on different data.
    """
    # Test with an imputable matrix
    X = np.array(
        [
            [1, 0, 0, 1],
            [2, 1, 2, na],
            [3, 2, 3, na],
            [na, 4, 5, 5],
            [6, na, 6, 7],
            [8, 8, 8, 8],
            [16, 15, 18, 19],
        ]
    )

    # Expected values computed by hand for the imputer's default settings.
    X_imputed = np.array(
        [
            [1, 0, 0, 1],
            [2, 1, 2, 8],
            [3, 2, 3, 8],
            [4, 4, 5, 5],
            [6, 3, 6, 7],
            [8, 8, 8, 8],
            [16, 15, 18, 19],
        ]
    )

    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test when there is not enough neighbors
    X = np.array(
        [
            [1, 0, 0, na],
            [2, 1, 2, na],
            [3, 2, 3, na],
            [4, 4, 5, na],
            [6, 7, 6, na],
            [8, 8, 8, na],
            [20, 20, 20, 20],
            [22, 22, 22, 22],
        ]
    )

    # Not enough neighbors, use column mean from training
    X_impute_value = (20 + 22) / 2
    X_imputed = np.array(
        [
            [1, 0, 0, X_impute_value],
            [2, 1, 2, X_impute_value],
            [3, 2, 3, X_impute_value],
            [4, 4, 5, X_impute_value],
            [6, 7, 6, X_impute_value],
            [8, 8, 8, X_impute_value],
            [20, 20, 20, 20],
            [22, 22, 22, 22],
        ]
    )

    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test when data in fit() and transform() are different
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 16]])

    X1 = np.array([[1, 0], [3, 2], [4, na]])

    # The five nearest training rows to [4, na] donate their second-column
    # values, averaged uniformly.
    X_2_1 = (0 + 3 + 6 + 7 + 8) / 5
    X1_imputed = np.array([[1, 0], [3, 2], [4, X_2_1]])

    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit(X).transform(X1), X1_imputed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_one_n_neighbors(na):
    """With n_neighbors=1 every gap is copied from the single closest row."""
    X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]])
    expected = np.array(
        [[0, 0], [4, 2], [4, 3], [5, 3], [7, 7], [7, 8], [14, 13]]
    )

    imputer = KNNImputer(n_neighbors=1, missing_values=na)
    assert_allclose(imputer.fit_transform(X), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_all_samples_are_neighbors(na):
    """Results are stable once n_neighbors covers every available donor."""
    X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]])
    expected = np.array(
        [[0, 0], [6.25, 2], [4, 3], [5, 5.75], [7, 7], [6.25, 8], [14, 13]]
    )

    # n_neighbors equal to the number of potential donors (n_samples - 1).
    imputer = KNNImputer(n_neighbors=X.shape[0] - 1, missing_values=na)
    assert_allclose(imputer.fit_transform(X), expected)

    # Requesting one more neighbor than exists must not change the output.
    imputer_plus1 = KNNImputer(n_neighbors=X.shape[0], missing_values=na)
    assert_allclose(imputer_plus1.fit_transform(X), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_weight_uniform(na):
    """'uniform', a None-returning callable, and an all-ones callable agree."""
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])
    expected_uniform = np.array(
        [[0, 0], [5, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
    )

    # Built-in "uniform" weighting.
    imputer = KNNImputer(weights="uniform", missing_values=na)
    assert_allclose(imputer.fit_transform(X), expected_uniform)

    # A callable returning None is treated as "no weighting" as well.
    def no_weight(dist):
        return None

    imputer = KNNImputer(weights=no_weight, missing_values=na)
    assert_allclose(imputer.fit_transform(X), expected_uniform)

    # Constant weights from a callable are equivalent to uniform weights.
    def uniform_weight(dist):
        return np.ones_like(dist)

    imputer = KNNImputer(weights=uniform_weight, missing_values=na)
    assert_allclose(imputer.fit_transform(X), expected_uniform)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_weight_distance(na):
    """Check distance-weighted imputation against manual calculations.

    Each section recomputes the expected imputed values independently —
    via KNeighborsRegressor, via nan_euclidean_distances /
    pairwise_distances plus explicit weighted averages — and compares them
    to the imputer's output.
    """
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])

    # Test with "distance" weight: a distance-weighted KNN regressor fitted
    # on the complete rows predicts the missing value at row 1, column 0.
    nn = KNeighborsRegressor(metric="euclidean", weights="distance")
    X_rows_idx = [0, 2, 3, 4, 5, 6]
    nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0])
    knn_imputed_value = nn.predict(X[1:2, 1:])[0]

    # Manual calculation: inverse-distance weighted average of the donors.
    X_neighbors_idx = [0, 2, 3, 4, 5]
    dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    weights = 1 / dist[:, X_neighbors_idx].ravel()
    manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights)

    X_imputed_distance1 = np.array(
        [[0, 0], [manual_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
    )

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array(
        [[0, 0], [knn_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
    )

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance1)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance2)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array(
        [
            [na, 0, 0],
            [2, 1, 2],
            [3, 2, 3],
            [4, 5, 5],
        ]
    )

    # neighbors are rows 1, 2, the nan_euclidean_distances are:
    # (the 3/2 factor rescales for the one missing coordinate in row 0)
    dist_0_1 = np.sqrt((3 / 2) * ((1 - 0) ** 2 + (2 - 0) ** 2))
    dist_0_2 = np.sqrt((3 / 2) * ((2 - 0) ** 2 + (3 - 0) ** 2))
    imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])

    X_imputed = np.array(
        [
            [imputed_value, 0, 0],
            [2, 1, 2],
            [3, 2, 3],
            [4, 5, 5],
        ]
    )

    imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test with varying missingness patterns
    X = np.array(
        [
            [1, 0, 0, 1],
            [0, na, 1, na],
            [1, 1, 1, na],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
            [1, 0, 1, 1],
            [10, 10, 10, 10],
        ]
    )

    # Get weights of donor neighbors (rXcY = value at row X, column Y).
    dist = nan_euclidean_distances(X, missing_values=na)
    r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
    r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
    r1c1_nbor_wt = 1 / r1c1_nbor_dists
    r1c3_nbor_wt = 1 / r1c3_nbor_dists

    r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
    r2c3_nbor_wt = 1 / r2c3_nbor_dists

    # Collect donor values (masked so missing donor entries are skipped).
    col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array(
        [
            [1, 0, 0, 1],
            [0, r1c1_imp, 1, r1c3_imp],
            [1, 1, 1, r2c3_imp],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
            [1, 0, 1, 1],
            [10, 10, 10, 10],
        ]
    )

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    X = np.array(
        [
            [0, 0, 0, na],
            [1, 1, 1, na],
            [2, 2, na, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [na, 7, 7, 7],
        ]
    )

    dist = pairwise_distances(
        X, metric="nan_euclidean", squared=False, missing_values=na
    )

    # Calculate weights (inverse distance to each donor row).
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array(
        [
            [0, 0, 0, r0c3],
            [1, 1, 1, r1c3],
            [2, 2, r2c2, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [r7c0, 7, 7, 7],
        ]
    )

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)
|
||||
|
||||
|
||||
def test_knn_imputer_callable_metric():
    """KNNImputer should accept a user-defined callable as ``metric``."""

    # l1-style distance over the non-NaN coordinates; the ``missing_values``
    # and ``squared`` keywords mirror the signature the imputer passes in.
    def custom_callable(x, y, missing_values=np.nan, squared=False):
        x = np.ma.array(x, mask=np.isnan(x))
        y = np.ma.array(y, mask=np.isnan(y))
        return np.nansum(np.abs(x - y))

    X = np.array([[4, 3, 3, np.nan], [6, 9, 6, 9], [4, 8, 6, 9], [np.nan, 9, 11, 10.0]])

    # Hand-computed means of the two nearest neighbors under this metric.
    exp_0_3 = (9 + 9) / 2
    exp_3_0 = (6 + 4) / 2
    expected = np.array(
        [[4, 3, 3, exp_0_3], [6, 9, 6, 9], [4, 8, 6, 9], [exp_3_0, 9, 11, 10.0]]
    )

    imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_allclose(imputer.fit_transform(X), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("working_memory", [None, 0])
@pytest.mark.parametrize("na", [-1, np.nan])
# Note that we use working_memory=0 to ensure that chunking is tested, even
# for a small dataset. However, it should raise a UserWarning that we ignore.
@pytest.mark.filterwarnings("ignore:adhere to working_memory")
def test_knn_imputer_with_simple_example(na, working_memory):
    """Compare imputed entries with plain column means over the donor rows,
    under both the default and a minimal ``working_memory`` setting."""
    X = np.array(
        [
            [0, na, 0, na],
            [1, 1, 1, na],
            [2, 2, na, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [na, 7, 7, 7],
        ]
    )

    # Expected values: unweighted means of the donor rows for each missing
    # entry (rXcY = value imputed at row X, column Y).
    r0c1 = np.mean(X[1:6, 1])
    r0c3 = np.mean(X[2:-1, -1])
    r1c3 = np.mean(X[2:-1, -1])
    r2c2 = np.mean(X[[0, 1, 3, 4, 5], 2])
    r7c0 = np.mean(X[2:-1, 0])

    X_imputed = np.array(
        [
            [0, r0c1, 0, r0c3],
            [1, 1, 1, r1c3],
            [2, 2, r2c2, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [r7c0, 7, 7, 7],
        ]
    )

    # working_memory=0 exercises the chunked distance computation path.
    with config_context(working_memory=working_memory):
        imputer_comp = KNNImputer(missing_values=na)
        assert_allclose(imputer_comp.fit_transform(X), X_imputed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [-1, np.nan])
@pytest.mark.parametrize("weights", ["uniform", "distance"])
def test_knn_imputer_not_enough_valid_distances(na, weights):
    """Fall back to the training column mean when no donor can be reached."""
    # For each missing entry here, the candidate donors have a nan distance,
    # so the column mean of the training data is used instead.
    X1 = np.array([[na, 11], [na, 1], [3, na]])
    expected1 = np.array([[3, 11], [3, 1], [3, 6]])

    knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights)
    assert_allclose(knn.fit_transform(X1), expected1)

    # Same fallback at transform time on unseen data.
    X2 = np.array([[4, na]])
    expected2 = np.array([[4, 6]])
    assert_allclose(knn.transform(X2), expected2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [-1, np.nan])
@pytest.mark.parametrize("weights", ["uniform", "distance"])
def test_knn_imputer_nan_distance(na, weights):
    """Donors at a nan distance are excluded from the imputed mean."""
    # Only the fully observed training row [1, 1] can donate to [0, na].
    train1 = np.array([[1, 1], [na, 2]])
    test1 = np.array([[0, na]])
    expected1 = np.array([[0, 1]])

    imputer1 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights)
    imputer1.fit(train1)
    assert_allclose(imputer1.transform(test1), expected1)

    # Each test row shares an observed feature with a single training row.
    train2 = np.array([[na, 1, 1], [2, na, 2], [3, 3, na]])
    test2 = np.array([[na, 0, na], [0, na, na], [na, na, 0]])
    expected2 = np.array([[3, 0, 1], [0, 3, 2], [2, 1, 0]])

    imputer2 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights)
    imputer2.fit(train2)
    assert_allclose(imputer2.transform(test2), expected2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [-1, np.nan])
def test_knn_imputer_drops_all_nan_features(na):
    """A feature that is entirely missing at fit time is dropped everywhere."""
    # Column 0 is all-missing, so only column 1 survives fit_transform.
    train = np.array([[na, 1], [na, 2]])
    imputer = KNNImputer(missing_values=na, n_neighbors=1)
    assert_allclose(imputer.fit_transform(train), np.array([[1], [2]]))

    # At transform time the dropped column is discarded even when populated.
    test = np.array([[1, 2], [3, na]])
    assert_allclose(imputer.transform(test), np.array([[2], [1.5]]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("working_memory", [None, 0])
@pytest.mark.parametrize("na", [-1, np.nan])
def test_knn_imputer_distance_weighted_not_enough_neighbors(na, working_memory):
    """Distance weighting copes with fewer available donors than n_neighbors."""
    X = np.array([[3, na], [2, na], [na, 4], [5, 6], [6, 8], [na, 5]])

    dist = pairwise_distances(
        X, metric="nan_euclidean", squared=False, missing_values=na
    )

    # Rows 3 and 4 are the only fully observed rows; expected values are
    # their inverse-distance weighted averages (rXcY = row X, column Y).
    r0c1 = np.average(X[3:5, 1], weights=1 / dist[0, 3:5])
    r1c1 = np.average(X[3:5, 1], weights=1 / dist[1, 3:5])
    r2c0 = np.average(X[3:5, 0], weights=1 / dist[2, 3:5])
    r5c0 = np.average(X[3:5, 0], weights=1 / dist[5, 3:5])

    expected = np.array([[3, r0c1], [2, r1c1], [r2c0, 4], [5, 6], [6, 8], [r5c0, 5]])

    with config_context(working_memory=working_memory):
        # Asking for 3 or 4 neighbors makes no difference with only 2 donors.
        knn_3 = KNNImputer(missing_values=na, n_neighbors=3, weights="distance")
        assert_allclose(knn_3.fit_transform(X), expected)

        knn_4 = KNNImputer(missing_values=na, n_neighbors=4, weights="distance")
        assert_allclose(knn_4.fit_transform(X), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na, allow_nan", [(-1, False), (np.nan, True)])
def test_knn_tags(na, allow_nan):
    """The allow_nan input tag must track whether missing_values is NaN."""
    imputer = KNNImputer(missing_values=na)
    tags = imputer.__sklearn_tags__()
    assert tags.input_tags.allow_nan == allow_nan
|
||||
Reference in New Issue
Block a user