2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions


@@ -0,0 +1,16 @@
import pytest
from numpy.testing import assert_allclose
from sklearn.utils import check_random_state
from sklearn.utils._arpack import _init_arpack_v0
@pytest.mark.parametrize("seed", range(100))
def test_init_arpack_v0(seed):
# check that the initialization is sampled from a uniform distribution
# where we can fix the random state
size = 1000
v0 = _init_arpack_v0(size, seed)
rng = check_random_state(seed)
assert_allclose(v0, rng.uniform(-1, 1, size=size))
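The assertion above fully pins down the expected behaviour: the ARPACK start vector is drawn uniformly from [-1, 1) with a seedable RNG. A minimal sketch inferred from this test (an illustration, not necessarily the actual implementation of `_init_arpack_v0`):

import numpy as np
from sklearn.utils import check_random_state

def init_arpack_v0_sketch(size, random_state):
    # Hypothetical helper mirroring the test above: reproducible uniform(-1, 1) draw.
    rng = check_random_state(random_state)
    return rng.uniform(-1, 1, size=size)

assert np.allclose(init_arpack_v0_sketch(5, 0), check_random_state(0).uniform(-1, 1, size=5))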


@@ -0,0 +1,931 @@
import os
from functools import partial
import numpy
import pytest
import scipy
import scipy.sparse as sp
from numpy.testing import assert_allclose
from sklearn._config import config_context
from sklearn._loss import HalfMultinomialLoss
from sklearn.base import BaseEstimator
from sklearn.utils._array_api import (
_add_to_diagonal,
_asarray_with_order,
_atol_for_type,
_average,
_convert_to_numpy,
_count_nonzero,
_estimator_with_converted_arrays,
_fill_diagonal,
_get_namespace_device_dtype_ids,
_half_multinomial_loss,
_is_numpy_namespace,
_isin,
_logsumexp,
_max_precision_float_dtype,
_median,
_nanmax,
_nanmean,
_nanmin,
_ravel,
_validate_diagonal_args,
device,
get_namespace,
get_namespace_and_device,
indexing_dtype,
move_to,
np_compat,
supported_float_dtypes,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
SkipTest,
_array_api_for_tests,
_convert_container,
assert_array_equal,
skip_if_array_api_compat_not_configured,
)
from sklearn.utils.fixes import _IS_32BIT, CSR_CONTAINERS, np_version, parse_version
@pytest.mark.parametrize("X", [numpy.asarray([1, 2, 3]), [1, 2, 3]])
def test_get_namespace_ndarray_default(X):
"""Check that get_namespace returns NumPy wrapper"""
xp_out, is_array_api_compliant = get_namespace(X)
assert xp_out is np_compat
assert not is_array_api_compliant
def test_get_namespace_ndarray_creation_device():
"""Check expected behavior with device and creation functions."""
X = numpy.asarray([1, 2, 3])
xp_out, _ = get_namespace(X)
full_array = xp_out.full(10, fill_value=2.0, device="cpu")
assert_allclose(full_array, [2.0] * 10)
with pytest.raises(ValueError, match="Unsupported device"):
xp_out.zeros(10, device="cuda")
@skip_if_array_api_compat_not_configured
def test_get_namespace_ndarray_with_dispatch():
"""Test get_namespace on NumPy ndarrays."""
X_np = numpy.asarray([[1, 2, 3]])
with config_context(array_api_dispatch=True):
xp_out, is_array_api_compliant = get_namespace(X_np)
assert is_array_api_compliant
# In the future, NumPy should become an array API compliant library and we
# should then have `assert xp_out is numpy`
assert xp_out is np_compat
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
"constructor_name", ["pyarrow", "dataframe", "polars", "series"]
)
def test_get_namespace_df_with_dispatch(constructor_name):
"""Test get_namespace on dataframes and series."""
df = _convert_container([[1, 4, 2], [3, 3, 6]], constructor_name)
with config_context(array_api_dispatch=True):
xp_out, is_array_api_compliant = get_namespace(df)
assert not is_array_api_compliant
# When operating on dataframes or series the NumPy namespace is
# the right thing to use.
assert xp_out is np_compat
@skip_if_array_api_compat_not_configured
def test_get_namespace_sparse_with_dispatch():
"""Test get_namespace on sparse arrays."""
with config_context(array_api_dispatch=True):
xp_out, is_array_api_compliant = get_namespace(sp.csr_array([[1, 2, 3]]))
assert not is_array_api_compliant
# When operating on sparse arrays the NumPy namespace is
# the right thing to use.
assert xp_out is np_compat
@skip_if_array_api_compat_not_configured
def test_get_namespace_array_api(monkeypatch):
"""Test get_namespace for ArrayAPI arrays."""
xp = pytest.importorskip("array_api_strict")
X_np = numpy.asarray([[1, 2, 3]])
X_xp = xp.asarray(X_np)
with config_context(array_api_dispatch=True):
xp_out, is_array_api_compliant = get_namespace(X_xp)
assert is_array_api_compliant
with pytest.raises(TypeError):
xp_out, is_array_api_compliant = get_namespace(X_xp, X_np)
def mock_getenv(key):
if key == "SCIPY_ARRAY_API":
return "0"
monkeypatch.setattr("os.environ.get", mock_getenv)
assert os.environ.get("SCIPY_ARRAY_API") != "1"
with pytest.raises(
RuntimeError,
match="scipy's own support is not enabled.",
):
get_namespace(X_xp)
@pytest.mark.parametrize(
"array_input, reference",
[
pytest.param(("cupy", None), ("torch", "cuda"), id="cupy to torch cuda"),
pytest.param(("torch", "mps"), ("numpy", None), id="torch mps to numpy"),
pytest.param(("numpy", None), ("torch", "cuda"), id="numpy to torch cuda"),
pytest.param(("numpy", None), ("torch", "mps"), id="numpy to torch mps"),
pytest.param(
("array_api_strict", None),
("torch", "mps"),
id="array_api_strict to torch mps",
),
],
)
def test_move_to_array_api_conversions(array_input, reference):
"""Check conversion between various namespace and devices."""
if array_input[0] == "array_api_strict":
array_api_strict = pytest.importorskip(
"array_api_strict", reason="array-api-strict not available"
)
xp = _array_api_for_tests(reference[0], reference[1])
xp_array = _array_api_for_tests(array_input[0], array_input[1])
with config_context(array_api_dispatch=True):
device_ = device(xp.asarray([1], device=reference[1]))
if array_input[0] == "array_api_strict":
array_device = array_api_strict.Device("CPU_DEVICE")
else:
array_device = array_input[1]
array = xp_array.asarray([1, 2, 3], device=array_device)
array_out = move_to(array, xp=xp, device=device_)
assert get_namespace(array_out)[0] == xp
assert device(array_out) == device_
def test_move_to_sparse():
"""Check sparse inputs are handled correctly."""
xp_numpy = _array_api_for_tests("numpy", None)
xp_torch = _array_api_for_tests("torch", "cpu")
sparse1 = sp.csr_array([0, 1, 2, 3])
sparse2 = sp.csr_array([0, 1, 0, 1])
numpy_array = numpy.array([1, 2, 3])
with config_context(array_api_dispatch=True):
device_cpu = xp_torch.asarray([1]).device
# sparse and None to NumPy
result1, result2 = move_to(sparse1, None, xp=xp_numpy, device=None)
assert result1 is sparse1
assert result2 is None
# sparse to non-NumPy
msg = r"Sparse arrays are only accepted \(and passed through\)"
with pytest.raises(TypeError, match=msg):
move_to(sparse1, numpy_array, xp=xp_torch, device=device_cpu)
with pytest.raises(TypeError, match=msg):
move_to(sparse1, None, xp=xp_torch, device=device_cpu)
@pytest.mark.parametrize("array_api", ["numpy", "array_api_strict"])
def test_asarray_with_order(array_api):
"""Test _asarray_with_order passes along order for NumPy arrays."""
xp = pytest.importorskip(array_api)
X = xp.asarray([1.2, 3.4, 5.1])
X_new = _asarray_with_order(X, order="F", xp=xp)
X_new_np = numpy.asarray(X_new)
assert X_new_np.flags["F_CONTIGUOUS"]
@pytest.mark.parametrize(
"array_namespace, device_, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize(
"weights, axis, normalize, expected",
[
# normalize = True
(None, None, True, 3.5),
(None, 0, True, [2.5, 3.5, 4.5]),
(None, 1, True, [2, 5]),
([True, False], 0, True, [1, 2, 3]), # boolean weights
([True, True, False], 1, True, [1.5, 4.5]), # boolean weights
([0.4, 0.1], 0, True, [1.6, 2.6, 3.6]),
([0.4, 0.2, 0.2], 1, True, [1.75, 4.75]),
([1, 2], 0, True, [3, 4, 5]),
([1, 1, 2], 1, True, [2.25, 5.25]),
([[1, 2, 3], [1, 2, 3]], 0, True, [2.5, 3.5, 4.5]),
([[1, 2, 1], [2, 2, 2]], 1, True, [2, 5]),
# normalize = False
(None, None, False, 21),
(None, 0, False, [5, 7, 9]),
(None, 1, False, [6, 15]),
([True, False], 0, False, [1, 2, 3]), # boolean weights
([True, True, False], 1, False, [3, 9]), # boolean weights
([0.4, 0.1], 0, False, [0.8, 1.3, 1.8]),
([0.4, 0.2, 0.2], 1, False, [1.4, 3.8]),
([1, 2], 0, False, [9, 12, 15]),
([1, 1, 2], 1, False, [9, 21]),
([[1, 2, 3], [1, 2, 3]], 0, False, [5, 14, 27]),
([[1, 2, 1], [2, 2, 2]], 1, False, [8, 30]),
],
)
def test_average(
array_namespace, device_, dtype_name, weights, axis, normalize, expected
):
xp = _array_api_for_tests(array_namespace, device_)
array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
array_in = xp.asarray(array_in, device=device_)
if weights is not None:
weights = numpy.asarray(weights, dtype=dtype_name)
weights = xp.asarray(weights, device=device_)
with config_context(array_api_dispatch=True):
result = _average(array_in, axis=axis, weights=weights, normalize=normalize)
if np_version < parse_version("2.0.0") or np_version >= parse_version("2.1.0"):
# NumPy 2.0 has a problem with the device attribute of scalar arrays:
# https://github.com/numpy/numpy/issues/26850
assert device(array_in) == device(result)
result = _convert_to_numpy(result, xp)
assert_allclose(result, expected, atol=_atol_for_type(dtype_name))
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(include_numpy_namespaces=False),
ids=_get_namespace_device_dtype_ids,
)
def test_average_raises_with_wrong_dtype(array_namespace, device, dtype_name):
xp = _array_api_for_tests(array_namespace, device)
array_in = numpy.asarray([2, 0], dtype=dtype_name) + 1j * numpy.asarray(
[4, 3], dtype=dtype_name
)
complex_type_name = array_in.dtype.name
if not hasattr(xp, complex_type_name):
# This is the case for cupy as of March 2024 for instance.
pytest.skip(f"{array_namespace} does not support {complex_type_name}")
array_in = xp.asarray(array_in, device=device)
err_msg = "Complex floating point values are not supported by average."
with (
config_context(array_api_dispatch=True),
pytest.raises(NotImplementedError, match=err_msg),
):
_average(array_in)
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(include_numpy_namespaces=True),
ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize(
"axis, weights, error, error_msg",
(
(
None,
[1, 2],
TypeError,
"Axis must be specified",
),
(
0,
[[1, 2]],
# NumPy 2 raises ValueError, NumPy 1 raises TypeError
(ValueError, TypeError),
"weights", # the message is different for NumPy 1 and 2...
),
(
0,
[1, 2, 3, 4],
ValueError,
"weights",
),
(0, [-1, 1], ZeroDivisionError, "Weights sum to zero, can't be normalized"),
),
)
def test_average_raises_with_invalid_parameters(
array_namespace, device, dtype_name, axis, weights, error, error_msg
):
xp = _array_api_for_tests(array_namespace, device)
array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
array_in = xp.asarray(array_in, device=device)
weights = numpy.asarray(weights, dtype=dtype_name)
weights = xp.asarray(weights, device=device)
with config_context(array_api_dispatch=True), pytest.raises(error, match=error_msg):
_average(array_in, axis=axis, weights=weights)
def test_device_none_if_no_input():
assert device() is None
assert device(None, "name") is None
@skip_if_array_api_compat_not_configured
def test_device_inspection():
class Device:
def __init__(self, name):
self.name = name
def __eq__(self, device):
return self.name == device.name
def __hash__(self):
raise TypeError("Device object is not hashable")
def __str__(self):
return self.name
class Array:
def __init__(self, device_name):
self.device = Device(device_name)
# Sanity check: ensure our Device mock class is non-hashable, to
# accurately account for non-hashable device objects in some array
# libraries; because of this, the `device` inspection function shouldn't
# make use of hash lookup tables (in particular, it should not use `set`).
with pytest.raises(TypeError):
hash(Array("device").device)
# If array API dispatch is disabled the device should be ignored. Erroring
# early for different devices would prevent the np.asarray conversion to
# happen. For example, `r2_score(np.ones(5), torch.ones(5))` should work
# fine with array API disabled.
assert device(Array("cpu"), Array("mygpu")) is None
# Test that ValueError is raised if on different devices and array API dispatch is
# enabled.
err_msg = "Input arrays use different devices: cpu, mygpu"
with config_context(array_api_dispatch=True):
with pytest.raises(ValueError, match=err_msg):
device(Array("cpu"), Array("mygpu"))
# Test expected value is returned otherwise
array1 = Array("device")
array2 = Array("device")
assert array1.device == device(array1)
assert array1.device == device(array1, array2)
assert array1.device == device(array1, array1, array2)
# TODO: add cupy to the list of libraries once the following upstream issue
# has been fixed:
# https://github.com/cupy/cupy/issues/8180
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["numpy", "array_api_strict", "torch"])
@pytest.mark.parametrize(
"X,reduction,expected",
[
([1, 2, numpy.nan], _nanmin, 1),
([1, -2, -numpy.nan], _nanmin, -2),
([numpy.inf, numpy.inf], _nanmin, numpy.inf),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmin, axis=0),
[1.0, 2.0, 3.0],
),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmin, axis=1),
[1.0, numpy.nan, 4.0],
),
([1, 2, numpy.nan], _nanmax, 2),
([1, 2, numpy.nan], _nanmax, 2),
([-numpy.inf, -numpy.inf], _nanmax, -numpy.inf),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmax, axis=0),
[4.0, 5.0, 6.0],
),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmax, axis=1),
[3.0, numpy.nan, 6.0],
),
([1, 2, numpy.nan], _nanmean, 1.5),
([1, -2, -numpy.nan], _nanmean, -0.5),
([-numpy.inf, -numpy.inf], _nanmean, -numpy.inf),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmean, axis=0),
[2.5, 3.5, 4.5],
),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmean, axis=1),
[2.0, numpy.nan, 5.0],
),
],
)
def test_nan_reductions(library, X, reduction, expected):
"""Check NaN reductions like _nanmin and _nanmax"""
xp = pytest.importorskip(library)
with config_context(array_api_dispatch=True):
result = reduction(xp.asarray(X))
result = _convert_to_numpy(result, xp)
assert_allclose(result, expected)
@pytest.mark.parametrize(
"namespace, _device, _dtype",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
def test_ravel(namespace, _device, _dtype):
xp = _array_api_for_tests(namespace, _device)
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
array_xp = xp.asarray(array, device=_device)
with config_context(array_api_dispatch=True):
result = _ravel(array_xp)
result = _convert_to_numpy(result, xp)
expected = numpy.ravel(array, order="C")
assert_allclose(expected, result)
if _is_numpy_namespace(xp):
assert numpy.asarray(result).flags["C_CONTIGUOUS"]
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["cupy", "torch"])
def test_convert_to_numpy_gpu(library): # pragma: nocover
"""Check convert_to_numpy for GPU backed libraries."""
xp = pytest.importorskip(library)
if library == "torch":
if not xp.backends.cuda.is_built():
pytest.skip("test requires cuda")
X_gpu = xp.asarray([1.0, 2.0, 3.0], device="cuda")
else:
X_gpu = xp.asarray([1.0, 2.0, 3.0])
X_cpu = _convert_to_numpy(X_gpu, xp=xp)
expected_output = numpy.asarray([1.0, 2.0, 3.0])
assert_allclose(X_cpu, expected_output)
def test_convert_to_numpy_cpu():
"""Check convert_to_numpy for PyTorch CPU arrays."""
torch = pytest.importorskip("torch")
X_torch = torch.asarray([1.0, 2.0, 3.0], device="cpu")
X_cpu = _convert_to_numpy(X_torch, xp=torch)
expected_output = numpy.asarray([1.0, 2.0, 3.0])
assert_allclose(X_cpu, expected_output)
class SimpleEstimator(BaseEstimator):
def fit(self, X, y=None):
self.X_ = X
self.n_features_ = X.shape[0]
return self
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
"array_namespace, converter",
[
("torch", lambda array: array.cpu().numpy()),
("array_api_strict", lambda array: numpy.asarray(array)),
("cupy", lambda array: array.get()),
],
)
def test_convert_estimator_to_ndarray(array_namespace, converter):
"""Convert estimator attributes to ndarray."""
xp = pytest.importorskip(array_namespace)
X = xp.asarray([[1.3, 4.5]])
est = SimpleEstimator().fit(X)
new_est = _estimator_with_converted_arrays(est, converter)
assert isinstance(new_est.X_, numpy.ndarray)
@skip_if_array_api_compat_not_configured
def test_convert_estimator_to_array_api():
"""Convert estimator attributes to ArrayAPI arrays."""
xp = pytest.importorskip("array_api_strict")
X_np = numpy.asarray([[1.3, 4.5]])
est = SimpleEstimator().fit(X_np)
new_est = _estimator_with_converted_arrays(est, lambda array: xp.asarray(array))
assert hasattr(new_est.X_, "__array_namespace__")
@pytest.mark.parametrize(
"namespace, _device, _dtype",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
def test_indexing_dtype(namespace, _device, _dtype):
xp = _array_api_for_tests(namespace, _device)
if _IS_32BIT:
assert indexing_dtype(xp) == xp.int32
else:
assert indexing_dtype(xp) == xp.int64
@pytest.mark.parametrize(
"namespace, _device, _dtype",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
def test_max_precision_float_dtype(namespace, _device, _dtype):
xp = _array_api_for_tests(namespace, _device)
expected_dtype = xp.float32 if _device == "mps" else xp.float64
assert _max_precision_float_dtype(xp, _device) == expected_dtype
@pytest.mark.parametrize(
"array_namespace, device, _",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize("invert", [True, False])
@pytest.mark.parametrize("assume_unique", [True, False])
@pytest.mark.parametrize("element_size", [6, 10, 14])
@pytest.mark.parametrize("int_dtype", ["int16", "int32", "int64", "uint8"])
def test_isin(
array_namespace, device, _, invert, assume_unique, element_size, int_dtype
):
xp = _array_api_for_tests(array_namespace, device)
r = element_size // 2
element = 2 * numpy.arange(element_size).reshape((r, 2)).astype(int_dtype)
test_elements = numpy.array(numpy.arange(14), dtype=int_dtype)
element_xp = xp.asarray(element, device=device)
test_elements_xp = xp.asarray(test_elements, device=device)
expected = numpy.isin(
element=element,
test_elements=test_elements,
assume_unique=assume_unique,
invert=invert,
)
with config_context(array_api_dispatch=True):
result = _isin(
element=element_xp,
test_elements=test_elements_xp,
xp=xp,
assume_unique=assume_unique,
invert=invert,
)
assert_array_equal(_convert_to_numpy(result, xp=xp), expected)
@pytest.mark.skipif(
os.environ.get("SCIPY_ARRAY_API") != "1", reason="SCIPY_ARRAY_API not set to 1."
)
def test_get_namespace_and_device():
# Use torch as a library with custom Device objects:
torch = pytest.importorskip("torch")
from sklearn.externals.array_api_compat import torch as torch_compat
some_torch_tensor = torch.arange(3, device="cpu")
some_numpy_array = numpy.arange(3)
# When dispatch is disabled, get_namespace_and_device should return the
# default NumPy wrapper namespace and a None device. Our code will handle such
# inputs via the usual __array__ interface without attempting to dispatch
# via the array API.
namespace, is_array_api, device = get_namespace_and_device(some_torch_tensor)
assert namespace is get_namespace(some_numpy_array)[0]
assert not is_array_api
assert device is None
# Otherwise, expose the torch namespace and device via array API compat
# wrapper.
with config_context(array_api_dispatch=True):
namespace, is_array_api, device = get_namespace_and_device(some_torch_tensor)
assert namespace is torch_compat
assert is_array_api
assert device == some_torch_tensor.device
@pytest.mark.parametrize(
"array_namespace, device_, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("axis", [0, 1, None, -1, -2])
@pytest.mark.parametrize("sample_weight_type", [None, "int", "float"])
def test_count_nonzero(
array_namespace, device_, dtype_name, csr_container, axis, sample_weight_type
):
from sklearn.utils.sparsefuncs import count_nonzero as sparse_count_nonzero
xp = _array_api_for_tests(array_namespace, device_)
array = numpy.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]])
if sample_weight_type == "int":
sample_weight = numpy.asarray([1, 2, 2, 3, 1])
elif sample_weight_type == "float":
sample_weight = numpy.asarray([0.5, 1.5, 0.8, 3.2, 2.4], dtype=dtype_name)
else:
sample_weight = None
expected = sparse_count_nonzero(
csr_container(array), axis=axis, sample_weight=sample_weight
)
array_xp = xp.asarray(array, device=device_)
with config_context(array_api_dispatch=True):
result = _count_nonzero(
array_xp, axis=axis, sample_weight=sample_weight, xp=xp, device=device_
)
assert_allclose(_convert_to_numpy(result, xp=xp), expected)
if np_version < parse_version("2.0.0") or np_version >= parse_version("2.1.0"):
# NumPy 2.0 has a problem with the device attribute of scalar arrays:
# https://github.com/numpy/numpy/issues/26850
assert device(array_xp) == device(result)
@pytest.mark.parametrize(
"array, value, match",
[
(numpy.array([1, 2, 3]), 1, "`array` should be 2D"),
(numpy.array([[1, 2], [3, 4]]), numpy.array([1, 2, 3]), "`value` needs to be"),
(numpy.array([[1, 2], [3, 4]]), [1, 2, 3], "`value` needs to be"),
(
numpy.array([[1, 2], [3, 4]]),
numpy.array([[1, 2], [3, 4]]),
"`value` needs to be a",
),
],
)
def test_validate_diagonal_args(array, value, match):
"""Check `_validate_diagonal_args` raises the correct errors."""
xp = _array_api_for_tests("numpy", None)
with pytest.raises(ValueError, match=match):
_validate_diagonal_args(array, value, xp)
@pytest.mark.parametrize("function", ["fill", "add"])
@pytest.mark.parametrize("c_contiguity", [True, False])
def test_fill_and_add_to_diagonal(c_contiguity, function):
"""Check `_fill/add_to_diagonal` behaviour correct with numpy arrays."""
xp = _array_api_for_tests("numpy", None)
if c_contiguity:
array = numpy.zeros((3, 4))
else:
array = numpy.zeros((3, 4)).T
assert array.flags["C_CONTIGUOUS"] == c_contiguity
if function == "fill":
func = _fill_diagonal
else:
func = _add_to_diagonal
func(array, 1, xp)
assert_allclose(array.diagonal(), numpy.ones((3,)))
func(array, [0, 1, 2], xp)
if function == "fill":
expected_diag = numpy.arange(3)
else:
expected_diag = numpy.ones((3,)) + numpy.arange(3)
assert_allclose(array.diagonal(), expected_diag)
fill_array = numpy.array([11, 12, 13])
func(array, fill_array, xp)
if function == "fill":
expected_diag = fill_array
else:
expected_diag = fill_array + numpy.arange(3) + numpy.ones((3,))
assert_allclose(array.diagonal(), expected_diag)
@pytest.mark.parametrize("array", ["standard", "transposed", "non-contiguous"])
@pytest.mark.parametrize(
"array_namespace, device_, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
def test_fill_diagonal(array, array_namespace, device_, dtype_name):
"""Check array API `_fill_diagonal` consistent with `numpy._fill_diagonal`."""
xp = _array_api_for_tests(array_namespace, device_)
array_np = numpy.zeros((4, 5), dtype=dtype_name)
if array == "transposed":
array_xp = xp.asarray(array_np.copy(), device=device_).T
array_np = array_np.T
elif array == "non-contiguous":
array_xp = xp.asarray(array_np.copy(), device=device_)[::2, ::2]
array_np = array_np[::2, ::2]
else:
array_xp = xp.asarray(array_np.copy(), device=device_)
numpy.fill_diagonal(array_np, val=1)
with config_context(array_api_dispatch=True):
_fill_diagonal(array_xp, value=1, xp=xp)
assert_array_equal(_convert_to_numpy(array_xp, xp=xp), array_np)
@pytest.mark.parametrize(
"array_namespace, device_, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
def test_add_to_diagonal(array_namespace, device_, dtype_name):
"""Check `_add_to_diagonal` consistent between array API xp and numpy namespace."""
xp = _array_api_for_tests(array_namespace, device_)
np_xp = _array_api_for_tests("numpy", None)
array_np = numpy.zeros((3, 4), dtype=dtype_name)
array_xp = xp.asarray(array_np.copy(), device=device_)
add_val = [1, 2, 3]
_add_to_diagonal(array_np, value=add_val, xp=np_xp)
with config_context(array_api_dispatch=True):
_add_to_diagonal(array_xp, value=add_val, xp=xp)
assert_array_equal(_convert_to_numpy(array_xp, xp=xp), array_np)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("dispatch", [True, False])
def test_sparse_device(csr_container, dispatch):
np_arr = numpy.array([1])
# For numpy < 2, the device attribute is not available on numpy arrays
expected_numpy_array_device = getattr(np_arr, "device", None) if dispatch else None
a, b = csr_container(numpy.array([[1]])), csr_container(numpy.array([[2]]))
if dispatch and os.environ.get("SCIPY_ARRAY_API") is None:
raise SkipTest("SCIPY_ARRAY_API is not set: not checking array_api input")
with config_context(array_api_dispatch=dispatch):
assert device(a, b) is None
assert device(a, np_arr) == expected_numpy_array_device
assert get_namespace_and_device(a, b)[2] is None
assert get_namespace_and_device(a, np_arr)[2] == expected_numpy_array_device
@pytest.mark.parametrize(
"namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize("axis", [None, 0, 1])
def test_median(namespace, device, dtype_name, axis):
# Note: depending on the value of `axis`, this test will compare median
# computations on arrays of even (4) or odd (5) numbers of elements, hence
# will test for median computation with and without interpolation to check
# that array API namespaces yield consistent results even when the median is
# not mathematically uniquely defined.
xp = _array_api_for_tests(namespace, device)
rng = numpy.random.RandomState(0)
X_np = rng.uniform(low=0.0, high=1.0, size=(5, 4)).astype(dtype_name)
result_np = numpy.median(X_np, axis=axis)
X_xp = xp.asarray(X_np, device=device)
with config_context(array_api_dispatch=True):
result_xp = _median(X_xp, axis=axis)
if xp.__name__ != "array_api_strict":
# We convert array-api-strict arrays to numpy arrays as `median` is not
# part of the Array API spec
assert get_namespace(result_xp)[0] == xp
assert result_xp.device == X_xp.device
assert_allclose(result_np, _convert_to_numpy(result_xp, xp=xp))
@pytest.mark.parametrize(
"array_namespace, device_, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize("axis", [0, 1, None])
def test_logsumexp_like_scipy_logsumexp(array_namespace, device_, dtype_name, axis):
xp = _array_api_for_tests(array_namespace, device_)
array_np = numpy.asarray(
[
[0, 3, 1000],
[2, -1, 1000],
[-10, 0, 0],
[-50, 8, -numpy.inf],
[4, 0, 5],
],
dtype=dtype_name,
)
array_xp = xp.asarray(array_np, device=device_)
res_np = scipy.special.logsumexp(array_np, axis=axis)
rtol = 1e-6 if "float32" in str(dtype_name) else 1e-12
# If using torch on CPU or array-api-strict on the default device, also check
# that _logsumexp works when array API dispatch is disabled.
if (array_namespace == "torch" and device_ == "cpu") or (
array_namespace == "array_api_strict" and "CPU" in str(device_)
):
assert_allclose(_logsumexp(array_xp, axis=axis), res_np, rtol=rtol)
with config_context(array_api_dispatch=True):
res_xp = _logsumexp(array_xp, axis=axis)
res_xp = _convert_to_numpy(res_xp, xp)
assert_allclose(res_np, res_xp, rtol=rtol)
# Test with NaNs and +np.inf
array_np_2 = numpy.asarray(
[
[0, numpy.nan, 1000],
[2, -1, 1000],
[numpy.inf, 0, 0],
[-50, 8, -numpy.inf],
[4, 0, 5],
],
dtype=dtype_name,
)
array_xp_2 = xp.asarray(array_np_2, device=device_)
res_np_2 = scipy.special.logsumexp(array_np_2, axis=axis)
with config_context(array_api_dispatch=True):
res_xp_2 = _logsumexp(array_xp_2, axis=axis)
res_xp_2 = _convert_to_numpy(res_xp_2, xp)
assert_allclose(res_np_2, res_xp_2, rtol=rtol)
@pytest.mark.parametrize(
("namespace", "device_", "expected_types"),
[
("numpy", None, ("float64", "float32", "float16")),
("array_api_strict", None, ("float64", "float32")),
("torch", "cpu", ("float64", "float32", "float16")),
("torch", "cuda", ("float64", "float32", "float16")),
("torch", "mps", ("float32", "float16")),
],
)
def test_supported_float_types(namespace, device_, expected_types):
xp = _array_api_for_tests(namespace, device_)
float_types = supported_float_dtypes(xp, device=device_)
expected = tuple(getattr(xp, dtype_name) for dtype_name in expected_types)
assert float_types == expected
@pytest.mark.parametrize("use_sample_weight", [False, True])
@pytest.mark.parametrize(
"namespace, device_, dtype_name", yield_namespace_device_dtype_combinations()
)
def test_half_multinomial_loss(use_sample_weight, namespace, device_, dtype_name):
"""Check that the array API version of :func:`_half_multinomial_loss` works
correctly and matches the results produced by :class:`HalfMultinomialLoss`
of the private `_loss` module.
"""
n_samples = 5
n_classes = 3
rng = numpy.random.RandomState(42)
y = rng.randint(0, n_classes, n_samples).astype(dtype_name)
pred = rng.rand(n_samples, n_classes).astype(dtype_name)
xp = _array_api_for_tests(namespace, device_)
y_xp = xp.asarray(y, device=device_)
pred_xp = xp.asarray(pred, device=device_)
if use_sample_weight:
sample_weight = numpy.ones_like(y)
sample_weight[1::2] = 2
sample_weight_xp = xp.asarray(sample_weight, device=device_)
else:
sample_weight, sample_weight_xp = None, None
np_loss = HalfMultinomialLoss(n_classes=n_classes)(
y_true=y, raw_prediction=pred, sample_weight=sample_weight
)
with config_context(array_api_dispatch=True):
xp_loss = _half_multinomial_loss(
y=y_xp, pred=pred_xp, sample_weight=sample_weight_xp, xp=xp
)
assert numpy.isclose(np_loss, xp_loss)
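Most tests in this file follow the same pattern: build an array in some namespace/device, enable `array_api_dispatch`, call the private helper, and compare against a NumPy reference. A minimal usage sketch of that dispatch mechanism, assuming array-api-strict is installed and, per the tests above, scipy's array API support is enabled via the SCIPY_ARRAY_API environment variable:

import os
os.environ.setdefault("SCIPY_ARRAY_API", "1")  # the tests above suggest this must be set before scipy is imported

import array_api_strict as xp
from sklearn import config_context
from sklearn.utils._array_api import get_namespace

X = xp.asarray([[1.0, 2.0, 3.0]])
with config_context(array_api_dispatch=True):
    namespace, is_array_api_compliant = get_namespace(X)
    assert is_array_api_compliant  # namespace is the array API compatible wrapper for X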


@@ -0,0 +1,40 @@
import numpy as np
import pytest
from sklearn.utils._testing import assert_allclose
from sklearn.utils.arrayfuncs import _all_with_any_reduction_axis_1, min_pos
def test_min_pos():
# Check that min_pos returns a positive value and that it's consistent
# between float and double
X = np.random.RandomState(0).randn(100)
min_double = min_pos(X)
min_float = min_pos(X.astype(np.float32))
assert_allclose(min_double, min_float)
assert min_double >= 0
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_min_pos_no_positive(dtype):
# Check that the return value of min_pos is the maximum representable
# value of the input dtype when all input elements are <= 0 (#19328)
X = np.full(100, -1.0).astype(dtype, copy=False)
assert min_pos(X) == np.finfo(dtype).max
@pytest.mark.parametrize(
"dtype", [np.int16, np.int32, np.int64, np.float32, np.float64]
)
@pytest.mark.parametrize("value", [0, 1.5, -1])
def test_all_with_any_reduction_axis_1(dtype, value):
# Check that the return value is False when there is no row equal to `value`
X = np.arange(12, dtype=dtype).reshape(3, 4)
assert not _all_with_any_reduction_axis_1(X, value=value)
# Make a row equal to `value`
X[1, :] = value
assert _all_with_any_reduction_axis_1(X, value=value)
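The two tests above pin down the contract of `min_pos`: the smallest strictly positive entry, falling back to the dtype's maximum value when nothing is positive. A hypothetical pure-NumPy equivalent, for illustration only (the real helper is compiled code):

import numpy as np

def min_pos_sketch(X):
    # Smallest strictly positive value, or the dtype's max if no entry is positive,
    # matching test_min_pos and test_min_pos_no_positive above.
    X = np.asarray(X)
    positive = X[X > 0]
    return positive.min() if positive.size else np.finfo(X.dtype).max

assert min_pos_sketch(np.array([-1.0, 3.0, 0.5])) == 0.5
assert min_pos_sketch(np.full(4, -1.0)) == np.finfo(np.float64).max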


@@ -0,0 +1,32 @@
import warnings
import numpy as np
import pytest
from sklearn.utils import Bunch
def test_bunch_attribute_deprecation():
"""Check that bunch raises deprecation message with `__getattr__`."""
bunch = Bunch()
values = np.asarray([1, 2, 3])
msg = (
"Key: 'values', is deprecated in 1.3 and will be "
"removed in 1.5. Please use 'grid_values' instead"
)
bunch._set_deprecated(
values, new_key="grid_values", deprecated_key="values", warning_message=msg
)
with warnings.catch_warnings():
# Does not warn for "grid_values"
warnings.simplefilter("error")
v = bunch["grid_values"]
assert v is values
with pytest.warns(FutureWarning, match=msg):
# Warns for "values"
v = bunch["values"]
assert v is values
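For context, `Bunch` is a dict that also exposes its keys as attributes, which is why the deprecation above is wired through `__getattr__`. A small usage sketch:

from sklearn.utils import Bunch

b = Bunch(grid_values=[1, 2, 3])
# Keys are reachable both as items and as attributes; both paths return the same object.
assert b["grid_values"] is b.grid_values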


@@ -0,0 +1,73 @@
import warnings
from itertools import chain
import pytest
from sklearn import config_context
from sklearn.utils._chunking import gen_even_slices, get_chunk_n_rows
from sklearn.utils._testing import assert_array_equal
def test_gen_even_slices():
# check that the slices yielded by gen_even_slices cover all samples
some_range = range(10)
joined_range = list(chain(*[some_range[slice] for slice in gen_even_slices(10, 3)]))
assert_array_equal(some_range, joined_range)
@pytest.mark.parametrize(
("row_bytes", "max_n_rows", "working_memory", "expected"),
[
(1024, None, 1, 1024),
(1024, None, 0.99999999, 1023),
(1023, None, 1, 1025),
(1025, None, 1, 1023),
(1024, None, 2, 2048),
(1024, 7, 1, 7),
(1024 * 1024, None, 1, 1),
],
)
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected):
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
actual = get_chunk_n_rows(
row_bytes=row_bytes,
max_n_rows=max_n_rows,
working_memory=working_memory,
)
assert actual == expected
assert type(actual) is type(expected)
with config_context(working_memory=working_memory):
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
assert actual == expected
assert type(actual) is type(expected)
def test_get_chunk_n_rows_warns():
"""Check that warning is raised when working_memory is too low."""
row_bytes = 1024 * 1024 + 1
max_n_rows = None
working_memory = 1
expected = 1
warn_msg = (
"Could not adhere to working_memory config. Currently 1MiB, 2MiB required."
)
with pytest.warns(UserWarning, match=warn_msg):
actual = get_chunk_n_rows(
row_bytes=row_bytes,
max_n_rows=max_n_rows,
working_memory=working_memory,
)
assert actual == expected
assert type(actual) is type(expected)
with config_context(working_memory=working_memory):
with pytest.warns(UserWarning, match=warn_msg):
actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
assert actual == expected
assert type(actual) is type(expected)
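The parametrisation above encodes the chunking arithmetic: with a working_memory budget expressed in MiB, the chunk size is roughly working_memory * 2**20 / row_bytes, optionally capped by max_n_rows and never below 1 (a UserWarning is emitted when even a single row exceeds the budget, as the last test checks). A quick hand check of the first few cases, for illustration:

# 1 MiB budget with 1024-byte rows -> exactly 1024 rows per chunk.
assert int(1 * 2**20 / 1024) == 1024
# Slightly smaller rows fit one more row ...
assert int(1 * 2**20 / 1023) == 1025
# ... and slightly larger rows one fewer.
assert int(1 * 2**20 / 1025) == 1023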


@@ -0,0 +1,334 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.utils.fixes import CSC_CONTAINERS
def test_compute_class_weight():
# Test (and demo) compute_class_weight.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
cw = compute_class_weight("balanced", classes=classes, y=y)
# total effect of samples is preserved
class_counts = np.bincount(y)[2:]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert cw[0] < cw[1] < cw[2]
@pytest.mark.parametrize(
"y_type, class_weight, classes, err_msg",
[
(
"numeric",
"balanced",
np.arange(4),
"classes should have valid labels that are in y",
),
# Non-regression for https://github.com/scikit-learn/scikit-learn/issues/8312
(
"numeric",
{"label_not_present": 1.0},
np.arange(4),
r"The classes, \[0, 1, 2, 3\], are not in class_weight",
),
(
"numeric",
"balanced",
np.arange(2),
"classes should include all valid labels",
),
(
"numeric",
{0: 1.0, 1: 2.0},
np.arange(2),
"classes should include all valid labels",
),
(
"string",
{"dogs": 3, "cat": 2},
np.array(["dog", "cat"]),
r"The classes, \['dog'\], are not in class_weight",
),
],
)
def test_compute_class_weight_not_present(y_type, class_weight, classes, err_msg):
# Raise error when y does not contain all class labels
y = (
np.asarray([0, 0, 0, 1, 1, 2])
if y_type == "numeric"
else np.asarray(["dog", "cat", "dog"])
)
print(y)
with pytest.raises(ValueError, match=err_msg):
compute_class_weight(class_weight, classes=classes, y=y)
def test_compute_class_weight_dict():
classes = np.arange(3)
class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
y = np.asarray([0, 0, 1, 2])
cw = compute_class_weight(class_weights, classes=classes, y=y)
# When the user specifies class weights, compute_class_weight should just
# return them.
assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)
# When a class weight is specified that isn't in classes, the weight is ignored
class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
cw = compute_class_weight(class_weights, classes=classes, y=y)
assert_allclose([1.0, 2.0, 3.0], cw)
class_weights = {-1: 5.0, 0: 4.0, 1: 2.0, 2: 3.0}
cw = compute_class_weight(class_weights, classes=classes, y=y)
assert_allclose([4.0, 2.0, 3.0], cw)
def test_compute_class_weight_invariance():
# Test that results with class_weight="balanced" are invariant w.r.t.
# class imbalance if the number of samples is identical.
# The test uses a balanced two class dataset with 100 datapoints.
# It creates three versions, one where class 1 is duplicated
# resulting in 150 points of class 1 and 50 of class 0,
# one where there are 50 points in class 1 and 150 in class 0,
# and one where there are 100 points of each class (this one is balanced
# again).
# With balancing class weights, all three should give the same model.
X, y = make_blobs(centers=2, random_state=0)
# create dataset where class 1 is duplicated twice
X_1 = np.vstack([X] + [X[y == 1]] * 2)
y_1 = np.hstack([y] + [y[y == 1]] * 2)
# create dataset where class 0 is duplicated twice
X_0 = np.vstack([X] + [X[y == 0]] * 2)
y_0 = np.hstack([y] + [y[y == 0]] * 2)
# duplicate everything
X_ = np.vstack([X] * 2)
y_ = np.hstack([y] * 2)
# results should be identical
logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
assert_array_almost_equal(logreg.coef_, logreg0.coef_)
def test_compute_class_weight_balanced_negative():
# Test compute_class_weight when labels are negative
# Test with balanced class labels.
classes = np.array([-2, -1, 0])
y = np.asarray([-1, -1, 0, 0, -2, -2])
cw = compute_class_weight("balanced", classes=classes, y=y)
assert len(cw) == len(classes)
assert_array_almost_equal(cw, np.array([1.0, 1.0, 1.0]))
def test_compute_class_weight_balanced_sample_weight_equivalence():
# Test with unbalanced and negative class labels for
# equivalence between repeated and weighted samples
classes = np.array([-2, -1, 0])
y = np.asarray([-1, -1, 0, 0, -2, -2])
sw = np.asarray([1, 0, 1, 1, 1, 2])
y_rep = np.repeat(y, sw, axis=0)
class_weights_weighted = compute_class_weight(
"balanced", classes=classes, y=y, sample_weight=sw
)
class_weights_repeated = compute_class_weight("balanced", classes=classes, y=y_rep)
assert len(class_weights_weighted) == len(classes)
assert len(class_weights_repeated) == len(classes)
class_counts_weighted = np.bincount(y + 2, weights=sw)
class_counts_repeated = np.bincount(y_rep + 2)
assert np.dot(class_weights_weighted, class_counts_weighted) == pytest.approx(
np.dot(class_weights_repeated, class_counts_repeated)
)
assert_allclose(class_weights_weighted, class_weights_repeated)
def test_compute_class_weight_balanced_unordered():
# Test compute_class_weight when classes are unordered
classes = np.array([1, 0, 3])
y = np.asarray([1, 0, 0, 3, 3, 3])
cw = compute_class_weight("balanced", classes=classes, y=y)
class_counts = np.bincount(y)[classes]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert_array_almost_equal(cw, [2.0, 1.0, 2.0 / 3])
def test_compute_class_weight_default():
# Test for the case where no weight is given for a present class.
# Current behaviour is to assign the unweighted classes a weight of 1.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
classes_len = len(classes)
# Test for non-specified weights
cw = compute_class_weight(None, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, np.ones(3))
# Tests for partly specified weights
cw = compute_class_weight({2: 1.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1.0, 1.0])
cw = compute_class_weight({2: 1.5, 4: 0.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1.0, 0.5])
def test_compute_sample_weight():
# Test (and demo) compute_sample_weight.
# Test with balanced classes
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with user-defined weights
sample_weight = compute_sample_weight({1: 2, 2: 1}, y)
assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 1.0, 1.0, 1.0])
# Test with column vector of balanced classes
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with unbalanced classes
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y)
expected_balanced = np.array(
[0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 2.3333]
)
assert_array_almost_equal(sample_weight, expected_balanced, decimal=4)
# Test with `None` weights
sample_weight = compute_sample_weight(None, y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with multi-output of balanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with multi-output with user-defined weights
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 2.0, 2.0, 2.0])
# Test with multi-output of unbalanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, expected_balanced**2, decimal=3)
def test_compute_sample_weight_with_subsample():
# Test compute_sample_weight with subsamples specified.
# Test with balanced classes and all samples present
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with column vector of balanced classes and all samples present
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with a subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(4))
assert_array_almost_equal(sample_weight, [2.0 / 3, 2.0 / 3, 2.0 / 3, 2.0, 2.0, 2.0])
# Test with a bootstrap subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
expected_balanced = np.asarray([0.6, 0.6, 0.6, 3.0, 3.0, 3.0])
assert_array_almost_equal(sample_weight, expected_balanced)
# Test with a bootstrap subsample for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
assert_array_almost_equal(sample_weight, expected_balanced**2)
# Test with a missing class
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])
# Test with a missing class for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])
@pytest.mark.parametrize(
"y_type, class_weight, indices, err_msg",
[
(
"single-output",
{1: 2, 2: 1},
range(4),
"The only valid class_weight for subsampling is 'balanced'.",
),
(
"multi-output",
{1: 2, 2: 1},
None,
"For multi-output, class_weight should be a list of dicts, or the string",
),
(
"multi-output",
[{1: 2, 2: 1}],
None,
r"Got 1 element\(s\) while having 2 outputs",
),
],
)
def test_compute_sample_weight_errors(y_type, class_weight, indices, err_msg):
# Test that compute_sample_weight raises the expected errors.
# Invalid preset string
y_single_output = np.asarray([1, 1, 1, 2, 2, 2])
y_multi_output = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
y = y_single_output if y_type == "single-output" else y_multi_output
with pytest.raises(ValueError, match=err_msg):
compute_sample_weight(class_weight, y, indices=indices)
def test_compute_sample_weight_more_than_32():
# Non-regression smoke test for #12146
y = np.arange(50) # more than 32 distinct classes
indices = np.arange(50) # use subsampling
weight = compute_sample_weight("balanced", y, indices=indices)
assert_array_almost_equal(weight, np.ones(y.shape[0]))
def test_class_weight_does_not_contains_more_classes():
"""Check that class_weight can contain more labels than in y.
Non-regression test for #22413
"""
tree = DecisionTreeClassifier(class_weight={0: 1, 1: 10, 2: 20})
# Does not raise
tree.fit([[0, 0, 1], [1, 0, 1], [1, 2, 0]], [0, 0, 1])
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_compute_sample_weight_sparse(csc_container):
"""Check that we can compute weight for sparse `y`."""
y = csc_container(np.asarray([[0], [1], [1]]))
sample_weight = compute_sample_weight("balanced", y)
assert_allclose(sample_weight, [1.5, 0.75, 0.75])
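The "balanced" heuristic asserted throughout this file weighs each class by n_samples / (n_classes * bincount(y)). A quick hand check against test_compute_class_weight above, as an illustration:

import numpy as np

y = np.asarray([2, 2, 2, 3, 3, 4])
classes, counts = np.unique(y, return_counts=True)
balanced = len(y) / (len(classes) * counts)          # -> [0.667, 1.0, 2.0]
assert np.isclose(np.dot(balanced, counts), len(y))  # total effect of samples is preserved
assert balanced[0] < balanced[1] < balanced[2]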


@@ -0,0 +1,250 @@
import numpy as np
import pytest
from sklearn.utils._cython_blas import (
BLAS_Order,
BLAS_Trans,
_asum_memview,
_axpy_memview,
_copy_memview,
_dot_memview,
_gemm_memview,
_gemv_memview,
_ger_memview,
_nrm2_memview,
_rot_memview,
_rotg_memview,
_scal_memview,
)
from sklearn.utils._testing import assert_allclose
def _numpy_to_cython(dtype):
cython = pytest.importorskip("cython")
if dtype == np.float32:
return cython.float
elif dtype == np.float64:
return cython.double
RTOL = {np.float32: 1e-6, np.float64: 1e-12}
ORDER = {BLAS_Order.RowMajor: "C", BLAS_Order.ColMajor: "F"}
def _no_op(x):
return x
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_dot(dtype):
dot = _dot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
expected = x.dot(y)
actual = dot(x, y)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_asum(dtype):
asum = _asum_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.abs(x).sum()
actual = asum(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_axpy(dtype):
axpy = _axpy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x + y
axpy(alpha, x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_nrm2(dtype):
nrm2 = _nrm2_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.linalg.norm(x)
actual = nrm2(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_copy(dtype):
copy = _copy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = np.empty_like(x)
expected = x.copy()
copy(x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_scal(dtype):
scal = _scal_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x
scal(alpha, x)
assert_allclose(x, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rotg(dtype):
rotg = _rotg_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
a = dtype(rng.randn())
b = dtype(rng.randn())
c, s = 0.0, 0.0
def expected_rotg(a, b):
roe = a if abs(a) > abs(b) else b
if a == 0 and b == 0:
c, s, r, z = (1, 0, 0, 0)
else:
r = np.sqrt(a**2 + b**2) * (1 if roe >= 0 else -1)
c, s = a / r, b / r
z = s if roe == a else (1 if c == 0 else 1 / c)
return r, z, c, s
expected = expected_rotg(a, b)
actual = rotg(a, b, c, s)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rot(dtype):
rot = _rot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
c = dtype(rng.randn())
s = dtype(rng.randn())
expected_x = c * x + s * y
expected_y = c * y - s * x
rot(x, y, c, s)
assert_allclose(x, expected_x)
assert_allclose(y, expected_y)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
"opA, transA",
[(_no_op, BLAS_Trans.NoTrans), (np.transpose, BLAS_Trans.Trans)],
ids=["NoTrans", "Trans"],
)
@pytest.mark.parametrize(
"order",
[BLAS_Order.RowMajor, BLAS_Order.ColMajor],
ids=["RowMajor", "ColMajor"],
)
def test_gemv(dtype, opA, transA, order):
gemv = _gemv_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(
opA(rng.random_sample((20, 10)).astype(dtype, copy=False)), order=ORDER[order]
)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(x) + beta * y
gemv(transA, alpha, A, x, beta, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
"order",
[BLAS_Order.RowMajor, BLAS_Order.ColMajor],
ids=["BLAS_Order.RowMajor", "BLAS_Order.ColMajor"],
)
def test_ger(dtype, order):
ger = _ger_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
A = np.asarray(
rng.random_sample((10, 20)).astype(dtype, copy=False), order=ORDER[order]
)
alpha = 2.5
expected = alpha * np.outer(x, y) + A
ger(alpha, x, y, A)
assert_allclose(A, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
"opB, transB",
[(_no_op, BLAS_Trans.NoTrans), (np.transpose, BLAS_Trans.Trans)],
ids=["NoTrans", "Trans"],
)
@pytest.mark.parametrize(
"opA, transA",
[(_no_op, BLAS_Trans.NoTrans), (np.transpose, BLAS_Trans.Trans)],
ids=["NoTrans", "Trans"],
)
@pytest.mark.parametrize(
"order",
[BLAS_Order.RowMajor, BLAS_Order.ColMajor],
ids=["BLAS_Order.RowMajor", "BLAS_Order.ColMajor"],
)
def test_gemm(dtype, opA, transA, opB, transB, order):
gemm = _gemm_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(
opA(rng.random_sample((30, 10)).astype(dtype, copy=False)), order=ORDER[order]
)
B = np.asarray(
opB(rng.random_sample((10, 20)).astype(dtype, copy=False)), order=ORDER[order]
)
C = np.asarray(
rng.random_sample((30, 20)).astype(dtype, copy=False), order=ORDER[order]
)
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(opB(B)) + beta * C
gemm(transA, transB, alpha, A, B, beta, C)
assert_allclose(C, expected, rtol=RTOL[dtype])


@@ -0,0 +1,84 @@
"""Tests for dataframe detection functions."""
import numpy as np
import pytest
from sklearn._min_dependencies import dependent_packages
from sklearn.utils._dataframe import is_df_or_series, is_pandas_df, is_polars_df
from sklearn.utils._testing import _convert_container
@pytest.mark.parametrize("constructor_name", ["pyarrow", "dataframe", "polars"])
def test_is_df_or_series(constructor_name):
df = _convert_container([[1, 4, 2], [3, 3, 6]], constructor_name)
assert is_df_or_series(df)
assert not is_df_or_series(np.asarray([1, 2, 3]))
@pytest.mark.parametrize("constructor_name", ["pyarrow", "dataframe", "polars"])
def test_is_pandas_df_other_libraries(constructor_name):
df = _convert_container([[1, 4, 2], [3, 3, 6]], constructor_name)
if constructor_name in ("pyarrow", "polars"):
assert not is_pandas_df(df)
else:
assert is_pandas_df(df)
def test_is_pandas_df():
"""Check behavior of is_pandas_df when pandas is installed."""
pd = pytest.importorskip("pandas")
df = pd.DataFrame([[1, 2, 3]])
assert is_pandas_df(df)
assert not is_pandas_df(np.asarray([1, 2, 3]))
assert not is_pandas_df(1)
def test_is_pandas_df_pandas_not_installed(hide_available_pandas):
"""Check is_pandas_df when pandas is not installed."""
assert not is_pandas_df(np.asarray([1, 2, 3]))
assert not is_pandas_df(1)
@pytest.mark.parametrize(
"constructor_name, minversion",
[
("pyarrow", dependent_packages["pyarrow"][0]),
("dataframe", dependent_packages["pandas"][0]),
("polars", dependent_packages["polars"][0]),
],
)
def test_is_polars_df_other_libraries(constructor_name, minversion):
df = _convert_container(
[[1, 4, 2], [3, 3, 6]],
constructor_name,
minversion=minversion,
)
if constructor_name in ("pyarrow", "dataframe"):
assert not is_polars_df(df)
else:
assert is_polars_df(df)
def test_is_polars_df_for_duck_typed_polars_dataframe():
"""Check is_polars_df for object that looks like a polars dataframe"""
class NotAPolarsDataFrame:
def __init__(self):
self.columns = [1, 2, 3]
self.schema = "my_schema"
not_a_polars_df = NotAPolarsDataFrame()
assert not is_polars_df(not_a_polars_df)
def test_is_polars_df():
"""Check that is_polars_df return False for non-dataframe objects."""
class LooksLikePolars:
def __init__(self):
self.columns = ["a", "b"]
self.schema = ["a", "b"]
assert not is_polars_df(LooksLikePolars())
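A minimal usage sketch of the detection helpers exercised above, assuming pandas is installed (note that `sklearn.utils._dataframe` is a private module):

import numpy as np
import pandas as pd
from sklearn.utils._dataframe import is_df_or_series, is_pandas_df

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
assert is_pandas_df(df) and is_df_or_series(df)
assert not is_pandas_df(np.asarray([1, 2, 3]))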


@@ -0,0 +1,98 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import pickle
from inspect import signature
import pytest
from sklearn.utils.deprecation import _is_deprecated, deprecated
@deprecated("qwerty")
class MockClass1:
pass
class MockClass2:
@deprecated("mockclass2_method")
def method(self):
pass
@deprecated("n_features_ is deprecated") # type: ignore[prop-decorator]
@property
def n_features_(self):
"""Number of input features."""
return 10
class MockClass3:
@deprecated()
def __init__(self):
pass
class MockClass4:
pass
class MockClass5(MockClass1):
"""Inherit from deprecated class but does not call super().__init__."""
def __init__(self, a):
self.a = a
@deprecated("a message")
class MockClass6:
"""A deprecated class that overrides __new__."""
def __new__(cls, *args, **kwargs):
assert len(args) > 0
return super().__new__(cls)
@deprecated()
def mock_function():
return 10
def test_deprecated():
with pytest.warns(FutureWarning, match="qwerty"):
MockClass1()
with pytest.warns(FutureWarning, match="mockclass2_method"):
MockClass2().method()
with pytest.warns(FutureWarning, match="deprecated"):
MockClass3()
with pytest.warns(FutureWarning, match="qwerty"):
MockClass5(42)
with pytest.warns(FutureWarning, match="a message"):
MockClass6(42)
with pytest.warns(FutureWarning, match="deprecated"):
val = mock_function()
assert val == 10
def test_is_deprecated():
# Test if _is_deprecated helper identifies wrapping via deprecated
# NOTE it works only for class methods and functions
assert _is_deprecated(MockClass1.__new__)
assert _is_deprecated(MockClass2().method)
assert _is_deprecated(MockClass3.__init__)
assert not _is_deprecated(MockClass4.__init__)
assert _is_deprecated(MockClass5.__new__)
assert _is_deprecated(mock_function)
def test_pickle():
pickle.loads(pickle.dumps(mock_function))
def test_deprecated_class_signature():
@deprecated()
class MockClass:
def __init__(self, a, b=1, c=2):
pass
assert list(signature(MockClass).parameters.keys()) == ["a", "b", "c"]
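As the tests above show, `deprecated` can wrap classes, methods, properties, and plain functions, emitting a FutureWarning that includes the given message while leaving the return value untouched. A small usage sketch for the function case (the message here is hypothetical):

from sklearn.utils.deprecation import deprecated

@deprecated("use new_helper instead")
def old_helper():
    return 10

assert old_helper() == 10  # emits a FutureWarning mentioning "use new_helper instead"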


@@ -0,0 +1,274 @@
import pickle
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique
@pytest.mark.parametrize(
"values, expected",
[
(np.array([2, 1, 3, 1, 3], dtype="int64"), np.array([1, 2, 3], dtype="int64")),
(
np.array([2, 1, np.nan, 1, np.nan], dtype="float32"),
np.array([1, 2, np.nan], dtype="float32"),
),
(
np.array(["b", "a", "c", "a", "c"], dtype=object),
np.array(["a", "b", "c"], dtype=object),
),
(
np.array(["b", "a", None, "a", None], dtype=object),
np.array(["a", "b", None], dtype=object),
),
(np.array(["b", "a", "c", "a", "c"]), np.array(["a", "b", "c"])),
],
ids=["int64", "float32-nan", "object", "object-None", "str"],
)
def test_encode_util(values, expected):
uniques = _unique(values)
assert_array_equal(uniques, expected)
result, encoded = _unique(values, return_inverse=True)
assert_array_equal(result, expected)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
result, counts = _unique(values, return_counts=True)
assert_array_equal(result, expected)
assert_array_equal(counts, np.array([2, 1, 2]))
result, encoded, counts = _unique(values, return_inverse=True, return_counts=True)
assert_array_equal(result, expected)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
assert_array_equal(counts, np.array([2, 1, 2]))
def test_encode_with_check_unknown():
# test for the check_unknown parameter of _encode()
uniques = np.array([1, 2, 3])
values = np.array([1, 2, 3, 4])
# Default is True, raise error
with pytest.raises(ValueError, match="y contains previously unseen labels"):
_encode(values, uniques=uniques, check_unknown=True)
    # don't raise an error if check_unknown=False
_encode(values, uniques=uniques, check_unknown=False)
# parameter is ignored for object dtype
uniques = np.array(["a", "b", "c"], dtype=object)
values = np.array(["a", "b", "c", "d"], dtype=object)
with pytest.raises(ValueError, match="y contains previously unseen labels"):
_encode(values, uniques=uniques, check_unknown=False)
def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
diff = _check_unknown(values, uniques)
assert_array_equal(diff, expected_diff)
diff, valid_mask = _check_unknown(values, uniques, return_mask=True)
assert_array_equal(diff, expected_diff)
assert_array_equal(valid_mask, expected_mask)
@pytest.mark.parametrize(
"values, uniques, expected_diff, expected_mask",
[
(np.array([1, 2, 3, 4]), np.array([1, 2, 3]), [4], [True, True, True, False]),
(np.array([2, 1, 4, 5]), np.array([2, 5, 1]), [4], [True, True, False, True]),
(np.array([2, 1, np.nan]), np.array([2, 5, 1]), [np.nan], [True, True, False]),
(
np.array([2, 1, 4, np.nan]),
np.array([2, 5, 1, np.nan]),
[4],
[True, True, False, True],
),
(
np.array([2, 1, 4, np.nan]),
np.array([2, 5, 1]),
[4, np.nan],
[True, True, False, False],
),
(
np.array([2, 1, 4, 5]),
np.array([2, 5, 1, np.nan]),
[4],
[True, True, False, True],
),
(
np.array(["a", "b", "c", "d"], dtype=object),
np.array(["a", "b", "c"], dtype=object),
np.array(["d"], dtype=object),
[True, True, True, False],
),
(
np.array(["d", "c", "a", "b"], dtype=object),
np.array(["a", "c", "b"], dtype=object),
np.array(["d"], dtype=object),
[False, True, True, True],
),
(
np.array(["a", "b", "c", "d"]),
np.array(["a", "b", "c"]),
np.array(["d"]),
[True, True, True, False],
),
(
np.array(["d", "c", "a", "b"]),
np.array(["a", "c", "b"]),
np.array(["d"]),
[False, True, True, True],
),
],
)
def test_check_unknown(values, uniques, expected_diff, expected_mask):
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
@pytest.mark.parametrize("missing_value", [None, np.nan, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_check_unknown_missing_values(missing_value, pickle_uniques):
# check for check_unknown with missing values with object dtypes
values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
uniques = np.array(["c", "a", "b", missing_value], dtype=object)
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
expected_diff = ["d"]
expected_mask = [False, True, True, True, True]
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
uniques = np.array(["c", "a", "b"], dtype=object)
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
expected_diff = ["d", missing_value]
expected_mask = [False, True, True, True, False]
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
values = np.array(["a", missing_value], dtype=object)
uniques = np.array(["a", "b", "z"], dtype=object)
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
expected_diff = [missing_value]
expected_mask = [True, False]
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
# check for _unique and _encode with missing values with object dtypes
values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)
uniques = _unique(values)
if missing_value is None:
assert_array_equal(uniques, expected_uniques)
else: # missing_value == np.nan
assert_array_equal(uniques[:-1], expected_uniques[:-1])
assert np.isnan(uniques[-1])
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, np.array([0, 2, 2, 3, 1]))
def test_unique_util_missing_values_numeric():
# Check missing values in numerical values
values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
expected_inverse = np.array([1, 0, 3, 2, 1, 3])
uniques = _unique(values)
assert_array_equal(uniques, expected_uniques)
uniques, inverse = _unique(values, return_inverse=True)
assert_array_equal(uniques, expected_uniques)
assert_array_equal(inverse, expected_inverse)
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, expected_inverse)
def test_unique_util_with_all_missing_values():
# test for all types of missing values for object dtype
values = np.array([np.nan, "a", "c", "c", None, float("nan"), None], dtype=object)
uniques = _unique(values)
assert_array_equal(uniques[:-1], ["a", "c", None])
# last value is nan
assert np.isnan(uniques[-1])
expected_inverse = [3, 0, 1, 1, 2, 3, 2]
_, inverse = _unique(values, return_inverse=True)
assert_array_equal(inverse, expected_inverse)
def test_check_unknown_with_both_missing_values():
# test for both types of missing values for object dtype
values = np.array([np.nan, "a", "c", "c", None, np.nan, None], dtype=object)
diff = _check_unknown(values, known_values=np.array(["a", "c"], dtype=object))
assert diff[0] is None
assert np.isnan(diff[1])
diff, valid_mask = _check_unknown(
values, known_values=np.array(["a", "c"], dtype=object), return_mask=True
)
assert diff[0] is None
assert np.isnan(diff[1])
assert_array_equal(valid_mask, [False, True, True, True, False, False, False])
@pytest.mark.parametrize(
"values, uniques, expected_counts",
[
(np.array([1] * 10 + [2] * 4 + [3] * 15), np.array([1, 2, 3]), [10, 4, 15]),
(
np.array([1] * 10 + [2] * 4 + [3] * 15),
np.array([1, 2, 3, 5]),
[10, 4, 15, 0],
),
(
np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
np.array([2, 3, np.nan]),
[4, 15, 10],
),
(
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["a", "b", "c"],
[16, 4, 20],
),
(
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["c", "b", "a"],
[20, 4, 16],
),
(
np.array([np.nan] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["c", np.nan, "a"],
[20, 4, 16],
),
(
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["a", "b", "c", "e"],
[16, 4, 20, 0],
),
],
)
def test_get_counts(values, uniques, expected_counts):
counts = _get_counts(values, uniques)
assert_array_equal(counts, expected_counts)
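# --- Illustrative sketch (not part of the committed file above) ---
# `_unique` returns the sorted unique values, `_encode` maps each value to its integer
# position within those uniques, and `_get_counts` counts occurrences per unique value;
# this sketch only assumes the behaviour exercised by the tests above.
import numpy as np
from sklearn.utils._encode import _encode, _get_counts, _unique
values = np.array(["b", "a", "c", "a"], dtype=object)
uniques = _unique(values)                 # array(['a', 'b', 'c'], dtype=object)
codes = _encode(values, uniques=uniques)  # array([1, 0, 2, 0])
counts = _get_counts(values, uniques)     # array([2, 1, 1])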

View File

@@ -0,0 +1,47 @@
"""Test fast_dict."""
import numpy as np
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.utils._fast_dict import IntFloatDict, argmin
def test_int_float_dict():
rng = np.random.RandomState(0)
keys = np.unique(rng.randint(100, size=10).astype(np.intp))
values = rng.rand(len(keys))
d = IntFloatDict(keys, values)
for key, value in zip(keys, values):
assert d[key] == value
assert len(d) == len(keys)
d.append(120, 3.0)
assert d[120] == 3.0
assert len(d) == len(keys) + 1
for i in range(2000):
d.append(i + 1000, 4.0)
assert d[1100] == 4.0
def test_int_float_dict_argmin():
# Test the argmin implementation on the IntFloatDict
keys = np.arange(100, dtype=np.intp)
values = np.arange(100, dtype=np.float64)
d = IntFloatDict(keys, values)
assert argmin(d) == (0, 0)
def test_to_arrays():
# Test that an IntFloatDict is converted into arrays
# of keys and values correctly
keys_in = np.array([1, 2, 3], dtype=np.intp)
values_in = np.array([4, 5, 6], dtype=np.float64)
d = IntFloatDict(keys_in, values_in)
keys_out, values_out = d.to_arrays()
assert keys_out.dtype == keys_in.dtype
assert values_in.dtype == values_out.dtype
assert_array_equal(keys_out, keys_in)
assert_allclose(values_out, values_in)
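# --- Illustrative sketch (not part of the committed file above) ---
# IntFloatDict maps np.intp keys to float64 values and supports item access, append()
# and conversion back to arrays; argmin() returns the (key, value) pair with the
# smallest value, as the tests above assume.
import numpy as np
from sklearn.utils._fast_dict import IntFloatDict, argmin
d = IntFloatDict(np.array([1, 2], dtype=np.intp), np.array([0.5, 1.5], dtype=np.float64))
d.append(3, 2.5)
assert d[3] == 2.5
keys_out, values_out = d.to_arrays()
assert argmin(d) == (1, 0.5)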

View File

@@ -0,0 +1,160 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
import pytest
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import _object_dtype_isnan, _smallest_admissible_index_dtype
@pytest.mark.parametrize("dtype, val", ([object, 1], [object, "a"], [float, 1]))
def test_object_dtype_isnan(dtype, val):
X = np.array([[val, np.nan], [np.nan, val]], dtype=dtype)
expected_mask = np.array([[False, True], [True, False]])
mask = _object_dtype_isnan(X)
assert_array_equal(mask, expected_mask)
@pytest.mark.parametrize(
"params, expected_dtype",
[
({}, np.int32), # default behaviour
({"maxval": np.iinfo(np.int32).max}, np.int32),
({"maxval": np.iinfo(np.int32).max + 1}, np.int64),
],
)
def test_smallest_admissible_index_dtype_max_val(params, expected_dtype):
"""Check the behaviour of `smallest_admissible_index_dtype` depending only on the
`max_val` parameter.
"""
assert _smallest_admissible_index_dtype(**params) == expected_dtype
@pytest.mark.parametrize(
"params, expected_dtype",
[
        # The arrays' dtype is int64 and thus should not be downcast to int32 without
        # checking the contents or providing maxval.
({"arrays": np.array([1, 2], dtype=np.int64)}, np.int64),
# One of the array is int64 and should not be downcasted to int32
# for the same reasons.
(
{
"arrays": (
np.array([1, 2], dtype=np.int32),
np.array([1, 2], dtype=np.int64),
)
},
np.int64,
),
# Both arrays are already int32: we can just keep this dtype.
(
{
"arrays": (
np.array([1, 2], dtype=np.int32),
np.array([1, 2], dtype=np.int32),
)
},
np.int32,
),
# Arrays should be upcasted to at least int32 precision.
({"arrays": np.array([1, 2], dtype=np.int8)}, np.int32),
# Check that `maxval` takes precedence over the arrays and thus upcast to
# int64.
(
{
"arrays": np.array([1, 2], dtype=np.int32),
"maxval": np.iinfo(np.int32).max + 1,
},
np.int64,
),
],
)
def test_smallest_admissible_index_dtype_without_checking_contents(
params, expected_dtype
):
"""Check the behaviour of `smallest_admissible_index_dtype` using the passed
arrays but without checking the contents of the arrays.
"""
assert _smallest_admissible_index_dtype(**params) == expected_dtype
@pytest.mark.parametrize(
"params, expected_dtype",
[
# empty arrays should always be converted to int32 indices
(
{
"arrays": (np.array([], dtype=np.int64), np.array([], dtype=np.int64)),
"check_contents": True,
},
np.int32,
),
# arrays respecting np.iinfo(np.int32).min < x < np.iinfo(np.int32).max should
# be converted to int32,
(
{"arrays": np.array([1], dtype=np.int64), "check_contents": True},
np.int32,
),
# otherwise, it should be converted to int64. We need to create a uint32
        # array to accommodate a value > np.iinfo(np.int32).max
(
{
"arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
"check_contents": True,
},
np.int64,
),
# maxval should take precedence over the arrays contents and thus upcast to
# int64.
(
{
"arrays": np.array([1], dtype=np.int32),
"check_contents": True,
"maxval": np.iinfo(np.int32).max + 1,
},
np.int64,
),
# when maxval is small, but check_contents is True and the contents
# require np.int64, we still require np.int64 indexing in the end.
(
{
"arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
"check_contents": True,
"maxval": 1,
},
np.int64,
),
],
)
def test_smallest_admissible_index_dtype_by_checking_contents(params, expected_dtype):
"""Check the behaviour of `smallest_admissible_index_dtype` using the dtype of the
arrays but as well the contents.
"""
assert _smallest_admissible_index_dtype(**params) == expected_dtype
@pytest.mark.parametrize(
"params, err_type, err_msg",
[
(
{"maxval": np.iinfo(np.int64).max + 1},
ValueError,
"is to large to be represented as np.int64",
),
(
{"arrays": np.array([1, 2], dtype=np.float64)},
ValueError,
"Array dtype float64 is not supported",
),
({"arrays": [1, 2]}, TypeError, "Arrays should be of type np.ndarray"),
],
)
def test_smallest_admissible_index_dtype_error(params, err_type, err_msg):
"""Check that we raise the proper error message."""
with pytest.raises(err_type, match=err_msg):
_smallest_admissible_index_dtype(**params)
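# --- Illustrative sketch (not part of the committed file above) ---
# `_smallest_admissible_index_dtype` returns the smallest integer dtype (never below
# int32) that can index the given arrays and/or represent `maxval`; this mirrors the
# parametrizations above.
import numpy as np
from sklearn.utils.fixes import _smallest_admissible_index_dtype
assert _smallest_admissible_index_dtype(maxval=10) == np.int32
assert _smallest_admissible_index_dtype(maxval=np.iinfo(np.int32).max + 1) == np.int64
assert (
    _smallest_admissible_index_dtype(
        arrays=np.array([1, 2], dtype=np.int64), check_contents=True
    )
    == np.int32
)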

View File

@@ -0,0 +1,80 @@
import numpy as np
import pytest
from scipy.sparse.csgraph import connected_components
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.graph import _fix_connected_components
def test_fix_connected_components():
    # Test that _fix_connected_components reduces the number of components to 1.
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
n_connected_components, labels = connected_components(graph)
assert n_connected_components > 1
graph = _fix_connected_components(X, graph, n_connected_components, labels)
n_connected_components, labels = connected_components(graph)
assert n_connected_components == 1
def test_fix_connected_components_precomputed():
# Test that _fix_connected_components accepts precomputed distance matrix.
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
n_connected_components, labels = connected_components(graph)
assert n_connected_components > 1
distances = pairwise_distances(X)
graph = _fix_connected_components(
distances, graph, n_connected_components, labels, metric="precomputed"
)
n_connected_components, labels = connected_components(graph)
assert n_connected_components == 1
# but it does not work with precomputed neighbors graph
with pytest.raises(RuntimeError, match="does not work with a sparse"):
_fix_connected_components(
graph, graph, n_connected_components, labels, metric="precomputed"
)
def test_fix_connected_components_wrong_mode():
    # Test that an error is raised if the mode string is incorrect.
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
n_connected_components, labels = connected_components(graph)
with pytest.raises(ValueError, match="Unknown mode"):
graph = _fix_connected_components(
X, graph, n_connected_components, labels, mode="foo"
)
def test_fix_connected_components_connectivity_mode():
    # Test that the connectivity mode fills new connections with ones.
X = np.array([0, 1, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=1, mode="connectivity")
n_connected_components, labels = connected_components(graph)
graph = _fix_connected_components(
X, graph, n_connected_components, labels, mode="connectivity"
)
assert np.all(graph.data == 1)
def test_fix_connected_components_distance_mode():
# Test that the distance mode does not fill new connections with ones.
X = np.array([0, 1, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=1, mode="distance")
assert np.all(graph.data == 1)
n_connected_components, labels = connected_components(graph)
graph = _fix_connected_components(
X, graph, n_connected_components, labels, mode="distance"
)
assert not np.all(graph.data == 1)
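# --- Illustrative sketch (not part of the committed file above) ---
# `_fix_connected_components` adds edges between the components of a neighbors graph
# until a single connected component remains, which is the behaviour verified above.
import numpy as np
from scipy.sparse.csgraph import connected_components
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.graph import _fix_connected_components
X = np.array([0, 1, 10, 11])[:, None]
graph = kneighbors_graph(X, n_neighbors=1, mode="distance")
n_components, labels = connected_components(graph)
graph = _fix_connected_components(X, graph, n_components, labels)
assert connected_components(graph)[0] == 1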

View File

@@ -0,0 +1,703 @@
import warnings
from copy import copy
from unittest import SkipTest
import numpy as np
import pytest
from scipy.stats import kstest
import sklearn
from sklearn.externals._packaging.version import parse as parse_version
from sklearn.utils import _safe_indexing, resample, shuffle
from sklearn.utils._array_api import (
_convert_to_numpy,
_get_namespace_device_dtype_ids,
device,
move_to,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._indexing import (
_determine_key_type,
_get_column_indices,
_safe_assign,
)
from sklearn.utils._mocking import MockDataFrame
from sklearn.utils._testing import (
_array_api_for_tests,
_convert_container,
assert_allclose,
assert_allclose_dense_sparse,
assert_array_equal,
skip_if_array_api_compat_not_configured,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
# toy array
X_toy = np.arange(9).reshape((3, 3))
def test_polars_indexing():
"""Check _safe_indexing for polars as expected."""
pl = pytest.importorskip("polars", minversion="0.18.2")
df = pl.DataFrame(
{"a": [1, 2, 3, 4], "b": [4, 5, 6, 8], "c": [1, 4, 1, 10]}, orient="row"
)
from polars.testing import assert_frame_equal
str_keys = [["b"], ["a", "b"], ["b", "a", "c"], ["c"], ["a"]]
for key in str_keys:
out = _safe_indexing(df, key, axis=1)
assert_frame_equal(df[key], out)
bool_keys = [([True, False, True], ["a", "c"]), ([False, False, True], ["c"])]
for bool_key, str_key in bool_keys:
out = _safe_indexing(df, bool_key, axis=1)
assert_frame_equal(df[:, str_key], out)
int_keys = [([0, 1], ["a", "b"]), ([2], ["c"])]
for int_key, str_key in int_keys:
out = _safe_indexing(df, int_key, axis=1)
assert_frame_equal(df[:, str_key], out)
axis_0_keys = [[0, 1], [1, 3], [3, 2]]
for key in axis_0_keys:
out = _safe_indexing(df, key, axis=0)
assert_frame_equal(df[key], out)
@pytest.mark.parametrize(
"key, dtype",
[
(0, "int"),
("0", "str"),
(True, "bool"),
(np.bool_(True), "bool"),
([0, 1, 2], "int"),
(["0", "1", "2"], "str"),
((0, 1, 2), "int"),
(("0", "1", "2"), "str"),
(slice(None, None), None),
(slice(0, 2), "int"),
(np.array([0, 1, 2], dtype=np.int32), "int"),
(np.array([0, 1, 2], dtype=np.int64), "int"),
(np.array([0, 1, 2], dtype=np.uint8), "int"),
([True, False], "bool"),
((True, False), "bool"),
(np.array([True, False]), "bool"),
("col_0", "str"),
(["col_0", "col_1", "col_2"], "str"),
(("col_0", "col_1", "col_2"), "str"),
(slice("begin", "end"), "str"),
(np.array(["col_0", "col_1", "col_2"]), "str"),
(np.array(["col_0", "col_1", "col_2"], dtype=object), "str"),
],
)
def test_determine_key_type(key, dtype):
assert _determine_key_type(key) == dtype
def test_determine_key_type_error():
with pytest.raises(ValueError, match="No valid specification of the"):
_determine_key_type(1.0)
def test_determine_key_type_slice_error():
with pytest.raises(TypeError, match="Only array-like or scalar are"):
_determine_key_type(slice(0, 2, 1), accept_slice=False)
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
"array_namespace, device_, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
def test_determine_key_type_array_api(array_namespace, device_, dtype_name):
xp = _array_api_for_tests(array_namespace, device_)
with sklearn.config_context(array_api_dispatch=True):
int_array_key = xp.asarray([1, 2, 3], device=device_)
assert _determine_key_type(int_array_key) == "int"
bool_array_key = xp.asarray([True, False, True], device=device_)
assert _determine_key_type(bool_array_key) == "bool"
try:
complex_array_key = xp.asarray([1 + 1j, 2 + 2j, 3 + 3j], device=device_)
except TypeError:
# Complex numbers are not supported by all Array API libraries.
complex_array_key = None
if complex_array_key is not None:
with pytest.raises(ValueError, match="No valid specification of the"):
_determine_key_type(complex_array_key)
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
"array_namespace, device_, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize(
"indexing_key",
(
0,
-1,
[1, 3],
np.array([1, 3]),
slice(1, 2),
[True, False, True, True],
np.asarray([False, False, False, False]),
),
)
@pytest.mark.parametrize("axis", [0, 1])
def test_safe_indexing_array_api_support(
array_namespace, device_, dtype_name, indexing_key, axis
):
xp = _array_api_for_tests(array_namespace, device_)
array_to_index_np = np.arange(16).reshape(4, 4)
expected_result = _safe_indexing(array_to_index_np, indexing_key, axis=axis)
array_to_index_xp = move_to(array_to_index_np, xp=xp, device=device_)
with sklearn.config_context(array_api_dispatch=True):
indexed_array_xp = _safe_indexing(array_to_index_xp, indexing_key, axis=axis)
assert device(indexed_array_xp) == device(array_to_index_xp)
assert indexed_array_xp.dtype == array_to_index_xp.dtype
assert_allclose(_convert_to_numpy(indexed_array_xp, xp=xp), expected_result)
@pytest.mark.parametrize(
"array_type", ["list", "array", "sparse", "dataframe", "polars", "pyarrow"]
)
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
indices = [1, 2]
if indices_type == "slice" and isinstance(indices[1], int):
indices[1] += 1
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(
subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type)
)
@pytest.mark.parametrize(
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
)
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_1d_container(array_type, indices_type):
indices = [1, 2]
if indices_type == "slice" and isinstance(indices[1], int):
indices[1] += 1
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
@pytest.mark.parametrize(
"array_type", ["array", "sparse", "dataframe", "polars", "pyarrow"]
)
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
# validation of the indices
# we make a copy because indices is mutable and shared between tests
indices_converted = copy(indices)
if indices_type == "slice" and isinstance(indices[1], int):
indices_converted[1] += 1
columns_name = ["col_0", "col_1", "col_2"]
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
indices_converted = _convert_container(indices_converted, indices_type)
if isinstance(indices[0], str) and array_type in ("array", "sparse"):
err_msg = (
"Specifying the columns using strings is only supported for dataframes"
)
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices_converted, axis=1)
else:
subset = _safe_indexing(array, indices_converted, axis=1)
assert_allclose_dense_sparse(
subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
)
@pytest.mark.parametrize("array_read_only", [True, False])
@pytest.mark.parametrize("indices_read_only", [True, False])
@pytest.mark.parametrize(
"array_type", ["array", "sparse", "dataframe", "polars", "pyarrow"]
)
@pytest.mark.parametrize("indices_type", ["array", "series"])
@pytest.mark.parametrize(
"axis, expected_array", [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])]
)
def test_safe_indexing_2d_read_only_axis_1(
array_read_only, indices_read_only, array_type, indices_type, axis, expected_array
):
array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
if array_read_only:
array.setflags(write=False)
array = _convert_container(array, array_type)
indices = np.array([1, 2])
if indices_read_only:
indices.setflags(write=False)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=axis)
assert_allclose_dense_sparse(subset, _convert_container(expected_array, array_type))
@pytest.mark.parametrize(
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
)
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
def test_safe_indexing_1d_container_mask(array_type, indices_type):
indices = [False] + [True] * 2 + [False] * 6
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
@pytest.mark.parametrize(
"array_type", ["array", "sparse", "dataframe", "polars", "pyarrow"]
)
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
@pytest.mark.parametrize(
"axis, expected_subset",
[(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])],
)
def test_safe_indexing_2d_mask(array_type, indices_type, axis, expected_subset):
columns_name = ["col_0", "col_1", "col_2"]
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
indices = [False, True, True]
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=axis)
assert_allclose_dense_sparse(
subset, _convert_container(expected_subset, array_type)
)
@pytest.mark.parametrize(
"array_type, expected_output_type",
[
("list", "list"),
("array", "array"),
("sparse", "sparse"),
("dataframe", "series"),
("polars", "polars_series"),
("pyarrow", "pyarrow_array"),
],
)
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
indices = 2
subset = _safe_indexing(array, indices, axis=0)
expected_array = _convert_container([7, 8, 9], expected_output_type)
assert_allclose_dense_sparse(subset, expected_array)
@pytest.mark.parametrize(
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
)
def test_safe_indexing_1d_scalar(array_type):
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = 2
subset = _safe_indexing(array, indices, axis=0)
assert subset == 3
@pytest.mark.parametrize(
"array_type, expected_output_type",
[
("array", "array"),
("sparse", "sparse"),
("dataframe", "series"),
("polars", "polars_series"),
("pyarrow", "pyarrow_array"),
],
)
@pytest.mark.parametrize("indices", [2, "col_2"])
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, indices):
columns_name = ["col_0", "col_1", "col_2"]
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
if isinstance(indices, str) and array_type in ("array", "sparse"):
err_msg = (
"Specifying the columns using strings is only supported for dataframes"
)
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices, axis=1)
else:
subset = _safe_indexing(array, indices, axis=1)
expected_output = [3, 6, 9]
if expected_output_type == "sparse":
# sparse matrix are keeping the 2D shape
expected_output = [[3], [6], [9]]
expected_array = _convert_container(expected_output, expected_output_type)
assert_allclose_dense_sparse(subset, expected_array)
@pytest.mark.parametrize("array_type", ["list", "array", "sparse"])
def test_safe_indexing_None_axis_0(array_type):
X = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
X_subset = _safe_indexing(X, None, axis=0)
assert_allclose_dense_sparse(X_subset, X)
def test_safe_indexing_pandas_no_matching_cols_error():
pd = pytest.importorskip("pandas")
err_msg = "No valid specification of the columns."
X = pd.DataFrame(X_toy)
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(X, [1.0], axis=1)
@pytest.mark.parametrize("axis", [None, 3])
def test_safe_indexing_error_axis(axis):
with pytest.raises(ValueError, match="'axis' should be either 0"):
_safe_indexing(X_toy, [0, 1], axis=axis)
@pytest.mark.parametrize(
"X_constructor", ["array", "series", "polars_series", "pyarrow_array"]
)
def test_safe_indexing_1d_array_error(X_constructor):
# check that we are raising an error if the array-like passed is 1D and
# we try to index on the 2nd dimension
X = list(range(5))
if X_constructor == "array":
X_constructor = np.asarray(X)
elif X_constructor == "series":
pd = pytest.importorskip("pandas")
X_constructor = pd.Series(X)
elif X_constructor == "polars_series":
pl = pytest.importorskip("polars")
X_constructor = pl.Series(values=X)
elif X_constructor == "pyarrow_array":
pa = pytest.importorskip("pyarrow")
X_constructor = pa.array(X)
err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or dataframe"
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(X_constructor, [0, 1], axis=1)
def test_safe_indexing_container_axis_0_unsupported_type():
indices = ["col_1", "col_2"]
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
err_msg = r"String indexing.*is not supported with 'axis=0'"
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices, axis=0)
def test_safe_indexing_pandas_no_settingwithcopy_warning():
# Using safe_indexing with an array-like indexer gives a copy of the
# DataFrame -> ensure it doesn't raise a warning if modified
pd = pytest.importorskip("pandas")
pd_version = parse_version(pd.__version__)
pd_base_version = parse_version(pd_version.base_version)
if pd_base_version >= parse_version("3"):
raise SkipTest("SettingWithCopyWarning has been removed in pandas 3.0.0.dev")
X = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
subset = _safe_indexing(X, [0, 1], axis=0)
if hasattr(pd.errors, "SettingWithCopyWarning"):
SettingWithCopyWarning = pd.errors.SettingWithCopyWarning
else:
# backward compatibility for pandas < 1.5
SettingWithCopyWarning = pd.core.common.SettingWithCopyWarning
with warnings.catch_warnings():
warnings.simplefilter("error", SettingWithCopyWarning)
subset.iloc[0, 0] = 10
# The original dataframe is unaffected by the assignment on the subset:
assert X.iloc[0, 0] == 1
@pytest.mark.parametrize("indices", [0, [0, 1], slice(0, 2), np.array([0, 1])])
def test_safe_indexing_list_axis_1_unsupported(indices):
"""Check that we raise a ValueError when axis=1 with input as list."""
X = [[1, 2], [4, 5], [7, 8]]
err_msg = "axis=1 is not supported for lists"
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(X, indices, axis=1)
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
def test_safe_assign(array_type):
"""Check that `_safe_assign` works as expected."""
rng = np.random.RandomState(0)
X_array = rng.randn(10, 5)
row_indexer = [1, 2]
values = rng.randn(len(row_indexer), X_array.shape[1])
X = _convert_container(X_array, array_type)
_safe_assign(X, values, row_indexer=row_indexer)
assigned_portion = _safe_indexing(X, row_indexer, axis=0)
assert_allclose_dense_sparse(
assigned_portion, _convert_container(values, array_type)
)
column_indexer = [1, 2]
values = rng.randn(X_array.shape[0], len(column_indexer))
X = _convert_container(X_array, array_type)
_safe_assign(X, values, column_indexer=column_indexer)
assigned_portion = _safe_indexing(X, column_indexer, axis=1)
assert_allclose_dense_sparse(
assigned_portion, _convert_container(values, array_type)
)
row_indexer, column_indexer = None, None
values = rng.randn(*X.shape)
X = _convert_container(X_array, array_type)
_safe_assign(X, values, column_indexer=column_indexer)
assert_allclose_dense_sparse(X, _convert_container(values, array_type))
@pytest.mark.parametrize(
"key, err_msg",
[
(10, r"all features must be in \[0, 2\]"),
("whatever", "A given column is not a column of the dataframe"),
(object(), "No valid specification of the columns"),
],
)
def test_get_column_indices_error(key, err_msg):
pd = pytest.importorskip("pandas")
X_df = pd.DataFrame(X_toy, columns=["col_0", "col_1", "col_2"])
with pytest.raises(ValueError, match=err_msg):
_get_column_indices(X_df, key)
@pytest.mark.parametrize(
"key", [["col1"], ["col2"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]
)
def test_get_column_indices_pandas_nonunique_columns_error(key):
pd = pytest.importorskip("pandas")
toy = np.zeros((1, 5), dtype=int)
columns = ["col1", "col1", "col2", "col3", "col2"]
X = pd.DataFrame(toy, columns=columns)
err_msg = "Selected columns, {}, are not unique in dataframe".format(key)
with pytest.raises(ValueError) as exc_info:
_get_column_indices(X, key)
assert str(exc_info.value) == err_msg
def test_get_column_indices_interchange():
"""Check _get_column_indices for edge cases with the interchange"""
pl = pytest.importorskip("polars")
# Polars dataframes go down the interchange path.
df = pl.DataFrame([[1, 2, 3], [4, 5, 6]], schema=["a", "b", "c"])
key_results = [
(slice(1, None), [1, 2]),
(slice(None, 2), [0, 1]),
(slice(1, 2), [1]),
(["b", "c"], [1, 2]),
(slice("a", "b"), [0, 1]),
(slice("a", None), [0, 1, 2]),
(slice(None, "a"), [0]),
(["c", "a"], [2, 0]),
([], []),
]
for key, result in key_results:
assert _get_column_indices(df, key) == result
msg = "A given column is not a column of the dataframe"
with pytest.raises(ValueError, match=msg):
_get_column_indices(df, ["not_a_column"])
msg = "key.step must be 1 or None"
with pytest.raises(NotImplementedError, match=msg):
_get_column_indices(df, slice("a", None, 2))
def test_resample():
# Border case not worth mentioning in doctests
assert resample() is None
# Check that invalid arguments yield ValueError
with pytest.raises(ValueError):
resample([0], [0, 1])
with pytest.raises(ValueError):
resample([0, 1], [0, 1], replace=False, n_samples=3)
    # Issue #6581: n_samples can exceed the input size when replace is True (default).
assert len(resample([1, 2], n_samples=5)) == 5
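# --- Illustrative sketch (not part of the committed file above; relies on the
# module-level imports of this file) ---
# `resample` draws a bootstrap sample (replace=True by default) consistently across all
# arrays passed, which the weighted and stratified tests below build on.
X_demo = np.arange(10).reshape(5, 2)
y_demo = np.array([0, 0, 1, 1, 1])
X_boot, y_boot = resample(X_demo, y_demo, n_samples=8, random_state=0)
assert X_boot.shape == (8, 2) and y_boot.shape == (8,)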
def test_resample_weighted():
# Check that sampling with replacement with integer weights yields the
# samples from the same distribution as sampling uniformly with
# repeated data points.
data = np.array([-1, 0, 1])
sample_weight = np.asarray([0, 100, 1])
mean_repeated = []
mean_reweighted = []
for seed in range(100):
mean_repeated.append(
resample(
data.repeat(sample_weight),
replace=True,
random_state=seed,
n_samples=data.shape[0],
).mean()
)
mean_reweighted.append(
resample(
data,
sample_weight=sample_weight,
replace=True,
random_state=seed,
n_samples=data.shape[0],
).mean()
)
mean_repeated = np.asarray(mean_repeated)
mean_reweighted = np.asarray(mean_reweighted)
test_result = kstest(mean_repeated, mean_reweighted)
# Should never be negative because -1 has a 0 weight.
assert np.all(mean_reweighted >= 0)
# The null-hypothesis (the computed means are identically distributed)
# cannot be rejected.
assert test_result.pvalue > 0.05
def test_resample_stratified():
# Make sure resample can stratify
rng = np.random.RandomState(0)
n_samples = 100
p = 0.9
X = rng.normal(size=(n_samples, 1))
y = rng.binomial(1, p, size=n_samples)
_, y_not_stratified = resample(X, y, n_samples=10, random_state=0, stratify=None)
assert np.all(y_not_stratified == 1)
_, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9  # nine 1s and one 0
def test_resample_stratified_replace():
# Make sure stratified resampling supports the replace parameter
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))
y = rng.randint(0, 2, size=n_samples)
X_replace, _ = resample(
X, y, replace=True, n_samples=50, random_state=rng, stratify=y
)
X_no_replace, _ = resample(
X, y, replace=False, n_samples=50, random_state=rng, stratify=y
)
assert np.unique(X_replace).shape[0] < 50
assert np.unique(X_no_replace).shape[0] == 50
# make sure n_samples can be greater than X.shape[0] if we sample with
# replacement
X_replace, _ = resample(
X, y, replace=True, n_samples=1000, random_state=rng, stratify=y
)
assert X_replace.shape[0] == 1000
assert np.unique(X_replace).shape[0] == 100
def test_resample_stratify_2dy():
# Make sure y can be 2d when stratifying
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))
y = rng.randint(0, 2, size=(n_samples, 2))
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
assert y.ndim == 2
def test_notimplementederror():
with pytest.raises(
NotImplementedError,
match="Resampling with sample_weight is only implemented for replace=True.",
):
resample([0, 1], [0, 1], sample_weight=[1, 1], replace=False)
with pytest.raises(
NotImplementedError,
match="Resampling with sample_weight is only implemented for stratify=None",
):
resample([0, 1], [0, 1], sample_weight=[1, 1], stratify=[0, 1])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_resample_stratify_sparse_error(csr_container):
    # the stratify array passed to resample must be a dense ndarray, not sparse
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 2))
y = rng.randint(0, 2, size=n_samples)
stratify = csr_container(y.reshape(-1, 1))
with pytest.raises(TypeError, match="Sparse data was passed"):
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=stratify)
def test_shuffle_on_ndim_equals_three():
def to_tuple(A): # to make the inner arrays hashable
return tuple(tuple(tuple(C) for C in B) for B in A)
A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2)
S = set(to_tuple(A))
shuffle(A) # shouldn't raise a ValueError for dim = 3
assert set(to_tuple(A)) == S
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_shuffle_dont_convert_to_array(csc_container):
    # Check that shuffle does not try to convert to numpy arrays with float
    # dtypes and lets any indexable data structure pass through.
a = ["a", "b", "c"]
b = np.array(["a", "b", "c"], dtype=object)
c = [1, 2, 3]
d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object))
e = csc_container(np.arange(6).reshape(3, 2))
a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)
assert a_s == ["c", "b", "a"]
assert type(a_s) == list
assert_array_equal(b_s, ["c", "b", "a"])
assert b_s.dtype == object
assert c_s == [3, 2, 1]
assert type(c_s) == list
assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
assert type(d_s) == MockDataFrame
assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))
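# --- Illustrative sketch (not part of the committed file above) ---
# `_safe_indexing` gives one indexing API across lists, NumPy arrays, sparse matrices
# and dataframes: axis=0 selects rows, axis=1 selects columns, and integer arrays,
# boolean masks, scalars and slices are all accepted as keys.
import numpy as np
from sklearn.utils import _safe_indexing
X_demo = np.arange(9).reshape(3, 3)
rows = _safe_indexing(X_demo, [0, 2], axis=0)                 # rows 0 and 2
column = _safe_indexing(X_demo, 1, axis=1)                    # second column, 1d array
masked = _safe_indexing(X_demo, [True, False, True], axis=0)  # boolean row mask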

View File

@@ -0,0 +1,19 @@
import pytest
from sklearn.utils._mask import safe_mask
from sklearn.utils.fixes import CSR_CONTAINERS
from sklearn.utils.validation import check_random_state
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_safe_mask(csr_container):
random_state = check_random_state(0)
X = random_state.rand(5, 4)
X_csr = csr_container(X)
mask = [False, False, True, True, True]
mask = safe_mask(X, mask)
assert X[mask].shape[0] == 3
mask = safe_mask(X_csr, mask)
assert X_csr[mask].shape[0] == 3
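# --- Illustrative sketch (not part of the committed file above) ---
# `safe_mask` converts a boolean mask into a form that is also valid for indexing a
# sparse matrix (integer indices), so the same mask can be applied to dense and sparse
# inputs alike.
import numpy as np
from scipy import sparse
from sklearn.utils._mask import safe_mask
X_demo = np.arange(20).reshape(5, 4)
mask = np.array([True, False, True, False, True])
assert X_demo[safe_mask(X_demo, mask)].shape[0] == 3
X_csr = sparse.csr_matrix(X_demo)
assert X_csr[safe_mask(X_csr, mask)].shape[0] == 3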

View File

@@ -0,0 +1,63 @@
import pickle
import pytest
from sklearn.utils.metaestimators import available_if
class AvailableParameterEstimator:
"""This estimator's `available` parameter toggles the presence of a method"""
def __init__(self, available=True, return_value=1):
self.available = available
self.return_value = return_value
@available_if(lambda est: est.available)
def available_func(self):
"""This is a mock available_if function"""
return self.return_value
def test_available_if_docstring():
assert "This is a mock available_if function" in str(
AvailableParameterEstimator.__dict__["available_func"].__doc__
)
assert "This is a mock available_if function" in str(
AvailableParameterEstimator.available_func.__doc__
)
assert "This is a mock available_if function" in str(
AvailableParameterEstimator().available_func.__doc__
)
def test_available_if():
assert hasattr(AvailableParameterEstimator(), "available_func")
assert not hasattr(AvailableParameterEstimator(available=False), "available_func")
def test_available_if_unbound_method():
# This is a non regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/20614
# to make sure that decorated functions can be used as an unbound method,
# for instance when monkeypatching.
est = AvailableParameterEstimator()
AvailableParameterEstimator.available_func(est)
est = AvailableParameterEstimator(available=False)
with pytest.raises(
AttributeError,
match="This 'AvailableParameterEstimator' has no attribute 'available_func'",
):
AvailableParameterEstimator.available_func(est)
def test_available_if_methods_can_be_pickled():
"""Check that available_if methods can be pickled.
Non-regression test for #21344.
"""
return_value = 10
est = AvailableParameterEstimator(available=True, return_value=return_value)
pickled_bytes = pickle.dumps(est.available_func)
unpickled_func = pickle.loads(pickled_bytes)
assert unpickled_func() == return_value
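# --- Illustrative sketch (not part of the committed file above) ---
# `available_if` makes a method exist only when the predicate returns True for the
# instance, so hasattr() can probe for optional capabilities; the Delegator class below
# is a hypothetical example, not scikit-learn code.
from sklearn.utils.metaestimators import available_if
class Delegator:
    def __init__(self, wrapped):
        self.wrapped = wrapped
    @available_if(lambda self: hasattr(self.wrapped, "predict"))
    def predict(self, X):
        # only reachable when the wrapped object actually exposes `predict`
        return self.wrapped.predict(X)
assert not hasattr(Delegator(wrapped=object()), "predict")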

View File

@@ -0,0 +1,27 @@
import numpy as np
import pytest
from sklearn.utils._missing import is_scalar_nan
@pytest.mark.parametrize(
"value, result",
[
(float("nan"), True),
(np.nan, True),
(float(np.nan), True),
(np.float32(np.nan), True),
(np.float64(np.nan), True),
(0, False),
(0.0, False),
(None, False),
("", False),
("nan", False),
([np.nan], False),
(9867966753463435747313673, False), # Python int that overflows with C type
],
)
def test_is_scalar_nan(value, result):
assert is_scalar_nan(value) is result
# make sure that we are returning a Python bool
assert isinstance(is_scalar_nan(value), bool)

View File

@@ -0,0 +1,205 @@
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.utils import _safe_indexing, check_array
from sklearn.utils._mocking import (
CheckingClassifier,
_MockEstimatorOnOffPrediction,
)
from sklearn.utils._testing import _convert_container
from sklearn.utils.fixes import CSR_CONTAINERS
@pytest.fixture
def iris():
return load_iris(return_X_y=True)
def _success(x):
return True
def _fail(x):
return False
@pytest.mark.parametrize(
"kwargs",
[
{},
{"check_X": _success},
{"check_y": _success},
{"check_X": _success, "check_y": _success},
],
)
def test_check_on_fit_success(iris, kwargs):
X, y = iris
CheckingClassifier(**kwargs).fit(X, y)
@pytest.mark.parametrize(
"kwargs",
[
{"check_X": _fail},
{"check_y": _fail},
{"check_X": _success, "check_y": _fail},
{"check_X": _fail, "check_y": _success},
{"check_X": _fail, "check_y": _fail},
],
)
def test_check_on_fit_fail(iris, kwargs):
X, y = iris
clf = CheckingClassifier(**kwargs)
with pytest.raises(AssertionError):
clf.fit(X, y)
@pytest.mark.parametrize(
"pred_func", ["predict", "predict_proba", "decision_function", "score"]
)
def test_check_X_on_predict_success(iris, pred_func):
X, y = iris
clf = CheckingClassifier(check_X=_success).fit(X, y)
getattr(clf, pred_func)(X)
@pytest.mark.parametrize(
"pred_func", ["predict", "predict_proba", "decision_function", "score"]
)
def test_check_X_on_predict_fail(iris, pred_func):
X, y = iris
clf = CheckingClassifier(check_X=_success).fit(X, y)
clf.set_params(check_X=_fail)
with pytest.raises(AssertionError):
getattr(clf, pred_func)(X)
@pytest.mark.parametrize("input_type", ["list", "array", "sparse", "dataframe"])
def test_checking_classifier(iris, input_type):
# Check that the CheckingClassifier outputs what we expect
X, y = iris
X = _convert_container(X, input_type)
clf = CheckingClassifier()
clf.fit(X, y)
assert_array_equal(clf.classes_, np.unique(y))
assert len(clf.classes_) == 3
assert clf.n_features_in_ == 4
y_pred = clf.predict(X)
assert all(pred in clf.classes_ for pred in y_pred)
assert clf.score(X) == pytest.approx(0)
clf.set_params(foo_param=10)
assert clf.fit(X, y).score(X) == pytest.approx(1)
y_proba = clf.predict_proba(X)
assert y_proba.shape == (150, 3)
assert np.logical_and(y_proba >= 0, y_proba <= 1).all()
y_decision = clf.decision_function(X)
assert y_decision.shape == (150, 3)
# check the shape in case of binary classification
first_2_classes = np.logical_or(y == 0, y == 1)
X = _safe_indexing(X, first_2_classes)
y = _safe_indexing(y, first_2_classes)
clf.fit(X, y)
y_proba = clf.predict_proba(X)
assert y_proba.shape == (100, 2)
assert np.logical_and(y_proba >= 0, y_proba <= 1).all()
y_decision = clf.decision_function(X)
assert y_decision.shape == (100,)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_checking_classifier_with_params(iris, csr_container):
X, y = iris
X_sparse = csr_container(X)
clf = CheckingClassifier(check_X=sparse.issparse)
with pytest.raises(AssertionError):
clf.fit(X, y)
clf.fit(X_sparse, y)
clf = CheckingClassifier(
check_X=check_array, check_X_params={"accept_sparse": False}
)
clf.fit(X, y)
with pytest.raises(TypeError, match="Sparse data was passed"):
clf.fit(X_sparse, y)
def test_checking_classifier_fit_params(iris):
# check the error raised when the number of samples is not the one expected
X, y = iris
clf = CheckingClassifier(expected_sample_weight=True)
sample_weight = np.ones(len(X) // 2)
msg = f"sample_weight.shape == ({len(X) // 2},), expected ({len(X)},)!"
with pytest.raises(ValueError) as exc:
clf.fit(X, y, sample_weight=sample_weight)
assert exc.value.args[0] == msg
def test_checking_classifier_missing_fit_params(iris):
X, y = iris
clf = CheckingClassifier(expected_sample_weight=True)
err_msg = "Expected sample_weight to be passed"
with pytest.raises(AssertionError, match=err_msg):
clf.fit(X, y)
@pytest.mark.parametrize(
"methods_to_check",
[["predict"], ["predict", "predict_proba"]],
)
@pytest.mark.parametrize(
"predict_method", ["predict", "predict_proba", "decision_function", "score"]
)
def test_checking_classifier_methods_to_check(iris, methods_to_check, predict_method):
    # check that methods_to_check allows bypassing the checks
X, y = iris
clf = CheckingClassifier(
check_X=sparse.issparse,
methods_to_check=methods_to_check,
)
clf.fit(X, y)
if predict_method in methods_to_check:
with pytest.raises(AssertionError):
getattr(clf, predict_method)(X)
else:
getattr(clf, predict_method)(X)
@pytest.mark.parametrize(
"response_methods",
[
["predict"],
["predict", "predict_proba"],
["predict", "decision_function"],
["predict", "predict_proba", "decision_function"],
],
)
def test_mock_estimator_on_off_prediction(iris, response_methods):
X, y = iris
estimator = _MockEstimatorOnOffPrediction(response_methods=response_methods)
estimator.fit(X, y)
assert hasattr(estimator, "classes_")
assert_array_equal(estimator.classes_, np.unique(y))
possible_responses = ["predict", "predict_proba", "decision_function"]
for response in possible_responses:
if response in response_methods:
assert hasattr(estimator, response)
assert getattr(estimator, response)(X) == response
else:
assert not hasattr(estimator, response)
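# --- Illustrative sketch (not part of the committed file above) ---
# CheckingClassifier is a mock estimator: the callables passed as check_X / check_y are
# evaluated on the data reaching fit and the prediction methods and must return True,
# which makes it handy for asserting what data an estimator receives inside a pipeline.
import numpy as np
from sklearn.utils._mocking import CheckingClassifier
X_demo = np.array([[0.0], [1.0], [2.0], [3.0]])
y_demo = np.array([0, 1, 0, 1])
clf = CheckingClassifier(check_X=lambda X_in: X_in.shape[1] == 1)
clf.fit(X_demo, y_demo)
clf.predict(X_demo)  # the same check_X callable runs again here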

View File

@@ -0,0 +1,634 @@
import warnings
from itertools import product
import numpy as np
import pytest
from scipy.sparse import issparse
from sklearn import config_context, datasets
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
from sklearn.utils._array_api import (
_get_namespace_device_dtype_ids,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
_array_api_for_tests,
_convert_container,
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.estimator_checks import _NotAnArray
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
from sklearn.utils.metaestimators import _safe_split
from sklearn.utils.multiclass import (
_ovr_decision_function,
check_classification_targets,
class_distribution,
is_multilabel,
type_of_target,
unique_labels,
)
multilabel_explicit_zero = np.array([[0, 1], [1, 0]])
multilabel_explicit_zero[:, 0] = 0
def _generate_sparse(
data,
sparse_containers=tuple(
COO_CONTAINERS
+ CSC_CONTAINERS
+ CSR_CONTAINERS
+ DOK_CONTAINERS
+ LIL_CONTAINERS
),
dtypes=(bool, int, np.int8, np.uint8, float, np.float32),
):
return [
sparse_container(data, dtype=dtype)
for sparse_container in sparse_containers
for dtype in dtypes
]
EXAMPLES = {
"multilabel-indicator": [
        # valid whether the data is sparse or dense; sparse inputs are identified by
        # the CSR format when the testing takes place
*_generate_sparse(
np.random.RandomState(42).randint(2, size=(10, 10)),
sparse_containers=CSR_CONTAINERS,
dtypes=(int,),
),
[[0, 1], [1, 0]],
[[0, 1]],
*_generate_sparse(
multilabel_explicit_zero, sparse_containers=CSC_CONTAINERS, dtypes=(int,)
),
*_generate_sparse([[0, 1], [1, 0]]),
*_generate_sparse([[0, 0], [0, 0]]),
*_generate_sparse([[0, 1]]),
# Only valid when data is dense
[[-1, 1], [1, -1]],
np.array([[-1, 1], [1, -1]]),
np.array([[-3, 3], [3, -3]]),
_NotAnArray(np.array([[-3, 3], [3, -3]])),
],
"multiclass": [
[1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
np.array([1, 0, 2]),
np.array([1, 0, 2], dtype=np.int8),
np.array([1, 0, 2], dtype=np.uint8),
np.array([1, 0, 2], dtype=float),
np.array([1, 0, 2], dtype=np.float32),
np.array([[1], [0], [2]]),
_NotAnArray(np.array([1, 0, 2])),
[0, 1, 2],
["a", "b", "c"],
np.array(["a", "b", "c"]),
np.array(["a", "b", "c"], dtype=object),
np.array(["a", "b", "c"], dtype=object),
],
"multiclass-multioutput": [
[[1, 0, 2, 2], [1, 4, 2, 4]],
[["a", "b"], ["c", "d"]],
np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
*_generate_sparse(
[[1, 0, 2, 2], [1, 4, 2, 4]],
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
dtypes=(int, np.int8, np.uint8, float, np.float32),
),
np.array([["a", "b"], ["c", "d"]]),
np.array([["a", "b"], ["c", "d"]]),
np.array([["a", "b"], ["c", "d"]], dtype=object),
np.array([[1, 0, 2]]),
_NotAnArray(np.array([[1, 0, 2]])),
],
"binary": [
[0, 1],
[1, 1],
[],
[0],
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
np.array([[0], [1]]),
_NotAnArray(np.array([[0], [1]])),
[1, -1],
[3, 5],
["a"],
["a", "b"],
["abc", "def"],
np.array(["abc", "def"]),
["a", "b"],
np.array(["abc", "def"], dtype=object),
],
"continuous": [
[1e-5],
[0, 0.5],
np.array([[0], [0.5]]),
np.array([[0], [0.5]], dtype=np.float32),
],
"continuous-multioutput": [
np.array([[0, 0.5], [0.5, 0]]),
np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
np.array([[0, 0.5]]),
*_generate_sparse(
[[0, 0.5], [0.5, 0]],
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
dtypes=(float, np.float32),
),
*_generate_sparse(
[[0, 0.5]],
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
dtypes=(float, np.float32),
),
],
"unknown": [
[[]],
np.array([[]], dtype=object),
[()],
# sequence of sequences that weren't supported even before deprecation
np.array([np.array([]), np.array([1, 2, 3])], dtype=object),
[np.array([]), np.array([1, 2, 3])],
[{1, 2, 3}, {1, 2}],
[frozenset([1, 2, 3]), frozenset([1, 2])],
# and also confusable as sequences of sequences
[{0: "a", 1: "b"}, {0: "a"}],
# ndim 0
np.array(0),
# empty second dimension
np.array([[], []]),
# 3d
np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
],
}
ARRAY_API_EXAMPLES = {
"multilabel-indicator": [
np.random.RandomState(42).randint(2, size=(10, 10)),
[[0, 1], [1, 0]],
[[0, 1]],
multilabel_explicit_zero,
[[0, 0], [0, 0]],
[[-1, 1], [1, -1]],
np.array([[-1, 1], [1, -1]]),
np.array([[-3, 3], [3, -3]]),
_NotAnArray(np.array([[-3, 3], [3, -3]])),
],
"multiclass": [
[1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
np.array([1, 0, 2]),
np.array([1, 0, 2], dtype=np.int8),
np.array([1, 0, 2], dtype=np.uint8),
np.array([1, 0, 2], dtype=float),
np.array([1, 0, 2], dtype=np.float32),
np.array([[1], [0], [2]]),
_NotAnArray(np.array([1, 0, 2])),
[0, 1, 2],
],
"multiclass-multioutput": [
[[1, 0, 2, 2], [1, 4, 2, 4]],
np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
np.array([[1, 0, 2]]),
_NotAnArray(np.array([[1, 0, 2]])),
],
"binary": [
[0, 1],
[1, 1],
[],
[0],
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
np.array([[0], [1]]),
_NotAnArray(np.array([[0], [1]])),
[1, -1],
[3, 5],
],
"continuous": [
[1e-5],
[0, 0.5],
np.array([[0], [0.5]]),
np.array([[0], [0.5]], dtype=np.float32),
],
"continuous-multioutput": [
np.array([[0, 0.5], [0.5, 0]]),
np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
np.array([[0, 0.5]]),
],
"unknown": [
[[]],
[()],
np.array(0),
np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
],
}
NON_ARRAY_LIKE_EXAMPLES = [
{1, 2, 3},
{0: "a", 1: "b"},
{0: [5], 1: [5]},
"abc",
frozenset([1, 2, 3]),
None,
]
MULTILABEL_SEQUENCES = [
[[1], [2], [0, 1]],
[(), (2), (0, 1)],
np.array([[], [1, 2]], dtype="object"),
_NotAnArray(np.array([[], [1, 2]], dtype="object")),
]
def test_unique_labels():
# Empty iterable
with pytest.raises(ValueError):
unique_labels()
# Multiclass problem
assert_array_equal(unique_labels(range(10)), np.arange(10))
assert_array_equal(unique_labels(np.arange(10)), np.arange(10))
assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4]))
# Multilabel indicator
assert_array_equal(
unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3)
)
assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3))
# Several arrays passed
assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5))
assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3))
# Border line case with binary indicator matrix
with pytest.raises(ValueError):
unique_labels([4, 0, 2], np.ones((5, 5)))
with pytest.raises(ValueError):
unique_labels(np.ones((5, 4)), np.ones((5, 5)))
assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5))
def test_check_classification_targets_too_many_unique_classes():
"""Check that we raise a warning when the number of unique classes is greater than
50% of the number of samples.
    We need to check that we don't raise if we have fewer than 20 samples.
"""
# Create array of unique labels. This does raise a warning.
y = np.arange(25)
msg = r"The number of unique classes is greater than 50% of the number of samples."
with pytest.warns(UserWarning, match=msg):
check_classification_targets(y)
# less than 20 samples, no warning should be raised
y = np.arange(10)
with warnings.catch_warnings():
warnings.simplefilter("error")
check_classification_targets(y)
def test_unique_labels_non_specific():
# Test unique_labels with a variety of collected examples
    # Smoke test for all supported formats
for format in ["binary", "multiclass", "multilabel-indicator"]:
for y in EXAMPLES[format]:
unique_labels(y)
    # We don't support those formats at the moment
for example in NON_ARRAY_LIKE_EXAMPLES:
with pytest.raises(ValueError):
unique_labels(example)
for y_type in [
"unknown",
"continuous",
"continuous-multioutput",
"multiclass-multioutput",
]:
for example in EXAMPLES[y_type]:
with pytest.raises(ValueError):
unique_labels(example)
def test_unique_labels_mixed_types():
# Mix with binary or multiclass and multilabel
mix_clf_format = product(
EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"]
)
for y_multilabel, y_multiclass in mix_clf_format:
with pytest.raises(ValueError):
unique_labels(y_multiclass, y_multilabel)
with pytest.raises(ValueError):
unique_labels(y_multilabel, y_multiclass)
with pytest.raises(ValueError):
unique_labels([[1, 2]], [["a", "d"]])
with pytest.raises(ValueError):
unique_labels(["1", 2])
with pytest.raises(ValueError):
unique_labels([["1", 2], [1, 3]])
with pytest.raises(ValueError):
unique_labels([["1", "2"], [2, 3]])
def test_is_multilabel():
for group, group_examples in EXAMPLES.items():
dense_exp = group == "multilabel-indicator"
for example in group_examples:
# Only mark explicitly defined sparse examples as valid sparse
# multilabel-indicators
sparse_exp = dense_exp and issparse(example)
if issparse(example) or (
hasattr(example, "__array__")
and np.asarray(example).ndim == 2
and np.asarray(example).dtype.kind in "biuf"
and np.asarray(example).shape[1] > 0
):
examples_sparse = [
sparse_container(example)
for sparse_container in (
COO_CONTAINERS
+ CSC_CONTAINERS
+ CSR_CONTAINERS
+ DOK_CONTAINERS
+ LIL_CONTAINERS
)
]
for exmpl_sparse in examples_sparse:
assert sparse_exp == is_multilabel(exmpl_sparse), (
f"is_multilabel({exmpl_sparse!r}) should be {sparse_exp}"
)
# Densify sparse examples before testing
if issparse(example):
example = example.toarray()
assert dense_exp == is_multilabel(example), (
f"is_multilabel({example!r}) should be {dense_exp}"
)
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(),
ids=_get_namespace_device_dtype_ids,
)
def test_is_multilabel_array_api_compliance(array_namespace, device, dtype_name):
xp = _array_api_for_tests(array_namespace, device)
for group, group_examples in ARRAY_API_EXAMPLES.items():
dense_exp = group == "multilabel-indicator"
for example in group_examples:
if np.asarray(example).dtype.kind == "f":
example = np.asarray(example, dtype=dtype_name)
else:
example = np.asarray(example)
example = xp.asarray(example, device=device)
with config_context(array_api_dispatch=True):
assert dense_exp == is_multilabel(example), (
f"is_multilabel({example!r}) should be {dense_exp}"
)
def test_check_classification_targets():
for y_type in EXAMPLES.keys():
if y_type in ["unknown", "continuous", "continuous-multioutput"]:
for example in EXAMPLES[y_type]:
msg = "Unknown label type: "
with pytest.raises(ValueError, match=msg):
check_classification_targets(example)
else:
for example in EXAMPLES[y_type]:
check_classification_targets(example)
def test_type_of_target():
for group, group_examples in EXAMPLES.items():
for example in group_examples:
assert type_of_target(example) == group, (
"type_of_target(%r) should be %r, got %r"
% (
example,
group,
type_of_target(example),
)
)
for example in NON_ARRAY_LIKE_EXAMPLES:
msg_regex = r"Expected array-like \(array or non-string sequence\).*"
with pytest.raises(ValueError, match=msg_regex):
type_of_target(example)
for example in MULTILABEL_SEQUENCES:
msg = (
"You appear to be using a legacy multi-label data "
"representation. Sequence of sequences are no longer supported;"
" use a binary array or sparse matrix instead."
)
with pytest.raises(ValueError, match=msg):
type_of_target(example)
def test_type_of_target_pandas_sparse():
pd = pytest.importorskip("pandas")
y = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan])
msg = "y cannot be class 'SparseSeries' or 'SparseArray'"
with pytest.raises(ValueError, match=msg):
type_of_target(y)
def test_type_of_target_pandas_nullable():
"""Check that type_of_target works with pandas nullable dtypes."""
pd = pytest.importorskip("pandas")
for dtype in ["Int32", "Float32"]:
y_true = pd.Series([1, 0, 2, 3, 4], dtype=dtype)
assert type_of_target(y_true) == "multiclass"
y_true = pd.Series([1, 0, 1, 0], dtype=dtype)
assert type_of_target(y_true) == "binary"
y_true = pd.DataFrame([[1.4, 3.1], [3.1, 1.4]], dtype="Float32")
assert type_of_target(y_true) == "continuous-multioutput"
y_true = pd.DataFrame([[0, 1], [1, 1]], dtype="Int32")
assert type_of_target(y_true) == "multilabel-indicator"
y_true = pd.DataFrame([[1, 2], [3, 1]], dtype="Int32")
assert type_of_target(y_true) == "multiclass-multioutput"
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_unique_labels_pandas_nullable(dtype):
"""Checks that unique_labels work with pandas nullable dtypes.
Non-regression test for gh-25634.
"""
pd = pytest.importorskip("pandas")
y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64")
labels = unique_labels(y_true, y_predicted)
assert_array_equal(labels, [0, 1])
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_class_distribution(csc_container):
y = np.array(
[
[1, 0, 0, 1],
[2, 2, 0, 1],
[1, 3, 0, 1],
[4, 2, 0, 1],
[2, 0, 0, 1],
[1, 3, 0, 1],
]
)
# Define the sparse matrix with a mix of implicit and explicit zeros
data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
indptr = np.array([0, 6, 11, 11, 17])
y_sp = csc_container((data, indices, indptr), shape=(6, 4))
classes, n_classes, class_prior = class_distribution(y)
classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp)
classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]]
n_classes_expected = [3, 3, 1, 1]
class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]]
for k in range(y.shape[1]):
assert_array_almost_equal(classes[k], classes_expected[k])
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
assert_array_almost_equal(classes_sp[k], classes_expected[k])
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
# Test again with explicit sample weights
(classes, n_classes, class_prior) = class_distribution(
y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
)
    (classes_sp, n_classes_sp, class_prior_sp) = class_distribution(
        y_sp, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
    )
class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]]
for k in range(y.shape[1]):
assert_array_almost_equal(classes[k], classes_expected[k])
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
assert_array_almost_equal(classes_sp[k], classes_expected[k])
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
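# A minimal illustrative sketch (not part of the original test suite): the weighted
# class prior for the first output column of `y` above can be recomputed directly
# from the labels and the sample weights used in test_class_distribution.
def _sketch_weighted_prior_first_column():
    col = np.array([1, 2, 1, 4, 2, 1])  # first column of y
    weights = np.array([1.0, 2.0, 1.0, 2.0, 1.0, 2.0])
    classes = np.unique(col)  # array([1, 2, 4])
    prior = np.array([weights[col == c].sum() for c in classes]) / weights.sum()
    assert_array_almost_equal(prior, [4 / 9, 3 / 9, 2 / 9])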
def test_safe_split_with_precomputed_kernel():
clf = SVC()
clfp = SVC(kernel="precomputed")
iris = datasets.load_iris()
X, y = iris.data, iris.target
K = np.dot(X, X.T)
cv = ShuffleSplit(test_size=0.25, random_state=0)
train, test = next(iter(cv.split(X)))
X_train, y_train = _safe_split(clf, X, y, train)
K_train, y_train2 = _safe_split(clfp, K, y, train)
assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
assert_array_almost_equal(y_train, y_train2)
X_test, y_test = _safe_split(clf, X, y, test, train)
K_test, y_test2 = _safe_split(clfp, K, y, test, train)
assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
assert_array_almost_equal(y_test, y_test2)
def test_ovr_decision_function():
# test properties for ovr decision function
predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]])
confidences = np.array(
[[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]]
)
n_classes = 3
dec_values = _ovr_decision_function(predictions, confidences, n_classes)
    # check that the decision values are within 0.5 of the votes
votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]])
assert_allclose(votes, dec_values, atol=0.5)
    # check that the predictions are what we expect:
# highest vote or highest confidence if there is a tie.
# for the second sample we have a tie (should be won by 1)
expected_prediction = np.array([2, 1, 2, 2])
assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction)
# third and fourth sample have the same vote but third sample
# has higher confidence, this should reflect on the decision values
assert dec_values[2, 2] > dec_values[3, 2]
# assert subset invariance.
dec_values_one = [
_ovr_decision_function(
np.array([predictions[i]]), np.array([confidences[i]]), n_classes
)[0]
for i in range(4)
]
assert_allclose(dec_values, dec_values_one, atol=1e-6)
@pytest.mark.parametrize("input_type", ["list", "array"])
def test_labels_in_bytes_format_error(input_type):
# check that we raise an error with bytes encoded labels
# non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/16980
target = _convert_container([b"a", b"b"], input_type)
err_msg = "Support for labels represented as bytes is not supported"
with pytest.raises(TypeError, match=err_msg):
type_of_target(target)

View File

@@ -0,0 +1,73 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
from numpy.testing import assert_array_almost_equal, assert_array_equal
from sklearn.utils.murmurhash import murmurhash3_32
def test_mmhash3_int():
assert murmurhash3_32(3) == 847579505
assert murmurhash3_32(3, seed=0) == 847579505
assert murmurhash3_32(3, seed=42) == -1823081949
assert murmurhash3_32(3, positive=False) == 847579505
assert murmurhash3_32(3, seed=0, positive=False) == 847579505
assert murmurhash3_32(3, seed=42, positive=False) == -1823081949
assert murmurhash3_32(3, positive=True) == 847579505
assert murmurhash3_32(3, seed=0, positive=True) == 847579505
assert murmurhash3_32(3, seed=42, positive=True) == 2471885347
def test_mmhash3_int_array():
rng = np.random.RandomState(42)
keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
keys = keys.reshape((3, 2, 1))
for seed in [0, 42]:
expected = np.array([murmurhash3_32(int(k), seed) for k in keys.flat])
expected = expected.reshape(keys.shape)
assert_array_equal(murmurhash3_32(keys, seed), expected)
for seed in [0, 42]:
expected = np.array([murmurhash3_32(k, seed, positive=True) for k in keys.flat])
expected = expected.reshape(keys.shape)
assert_array_equal(murmurhash3_32(keys, seed, positive=True), expected)
def test_mmhash3_bytes():
assert murmurhash3_32(b"foo", 0) == -156908512
assert murmurhash3_32(b"foo", 42) == -1322301282
assert murmurhash3_32(b"foo", 0, positive=True) == 4138058784
assert murmurhash3_32(b"foo", 42, positive=True) == 2972666014
def test_mmhash3_unicode():
assert murmurhash3_32("foo", 0) == -156908512
assert murmurhash3_32("foo", 42) == -1322301282
assert murmurhash3_32("foo", 0, positive=True) == 4138058784
assert murmurhash3_32("foo", 42, positive=True) == 2972666014
def test_no_collision_on_byte_range():
previous_hashes = set()
for i in range(100):
h = murmurhash3_32(" " * i, 0)
        assert h not in previous_hashes, "Found collision on growing empty string"
        previous_hashes.add(h)
def test_uniform_distribution():
n_bins, n_samples = 10, 100000
bins = np.zeros(n_bins, dtype=np.float64)
for i in range(n_samples):
bins[murmurhash3_32(i, positive=True) % n_bins] += 1
means = bins / n_samples
expected = np.full(n_bins, 1.0 / n_bins)
assert_array_almost_equal(means / expected, np.ones(n_bins), 2)
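# Illustrative sketch only (not part of the original test module): murmurhash3_32 with
# positive=True is the primitive typically used for the hashing trick, mapping an
# arbitrary token to one of `n_buckets` indices. The names below are hypothetical.
def _sketch_hashing_trick(token, n_buckets=2**10, seed=0):
    return murmurhash3_32(token, seed=seed, positive=True) % n_buckets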

View File

@@ -0,0 +1,220 @@
import warnings
import numpy as np
import pytest
from scipy.optimize import fmin_ncg
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._bunch import Bunch
from sklearn.utils._testing import assert_allclose
from sklearn.utils.optimize import _check_optimize_result, _newton_cg
def test_newton_cg(global_random_seed):
# Test that newton_cg gives same result as scipy's fmin_ncg
rng = np.random.RandomState(global_random_seed)
A = rng.normal(size=(10, 10))
x0 = np.ones(10)
def func(x):
Ax = A.dot(x)
return 0.5 * (Ax).dot(Ax)
def grad(x):
return A.T.dot(A.dot(x))
def hess(x, p):
        return A.T.dot(A.dot(p))
def grad_hess(x):
return grad(x), lambda x: A.T.dot(A.dot(x))
    # func is a positive definite quadratic form, so the minimum is at x = 0,
    # hence the use of an absolute tolerance.
assert np.all(np.abs(_newton_cg(grad_hess, func, grad, x0, tol=1e-10)[0]) <= 1e-7)
assert_allclose(
_newton_cg(grad_hess, func, grad, x0, tol=1e-7)[0],
fmin_ncg(f=func, x0=x0, fprime=grad, fhess_p=hess),
atol=1e-5,
)
@pytest.mark.parametrize("verbose", [0, 1, 2])
def test_newton_cg_verbosity(capsys, verbose):
"""Test the std output of verbose newton_cg solver."""
A = np.eye(2)
b = np.array([1, 2], dtype=float)
_newton_cg(
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=np.zeros(A.shape[0]),
verbose=verbose,
    )  # returns array([1., 2.])
captured = capsys.readouterr()
if verbose == 0:
assert captured.out == ""
else:
msg = [
"Newton-CG iter = 1",
"Check Convergence",
"max |gradient|",
"Solver did converge at loss = ",
]
for m in msg:
assert m in captured.out
if verbose >= 2:
msg = [
"Inner CG solver iteration 1 stopped with",
"sum(|residuals|) <= tol",
"Line Search",
"try line search wolfe1",
"wolfe1 line search was successful",
]
for m in msg:
assert m in captured.out
if verbose >= 2:
        # Set up a badly scaled singular Hessian with a completely wrong starting
        # position. This should trigger the 2nd line search check.
A = np.array([[1.0, 2], [2, 4]]) * 1e30 # collinear columns
b = np.array([1.0, 2.0])
# Note that scipy.optimize._linesearch LineSearchWarning inherits from
# RuntimeWarning, but we do not want to import from non public APIs.
with pytest.warns(RuntimeWarning):
_newton_cg(
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=np.array([-2.0, 1]), # null space of hessian
verbose=verbose,
)
captured = capsys.readouterr()
msg = [
"wolfe1 line search was not successful",
"check loss |improvement| <= eps * |loss_old|:",
"check sum(|gradient|) < sum(|gradient_old|):",
"last resort: try line search wolfe2",
]
for m in msg:
assert m in captured.out
# Set up a badly conditioned Hessian that leads to tiny curvature.
        # X.T @ X has singular values array([1.00000400e+01, 1.00008192e-11])
A = np.array([[1.0, 2], [1, 2 + 1e-15]])
b = np.array([-2.0, 1])
with pytest.warns(ConvergenceWarning):
_newton_cg(
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=b,
verbose=verbose,
maxiter=2,
)
captured = capsys.readouterr()
msg = [
"tiny_|p| = eps * ||p||^2",
]
for m in msg:
assert m in captured.out
# Test for a case with negative Hessian.
        # We do not trigger "Inner CG solver iteration {i} stopped with negative
        # curvature" because that case is very hard to trigger.
A = np.eye(2)
b = np.array([-2.0, 1])
with pytest.warns(RuntimeWarning):
_newton_cg(
# Note the wrong sign in the hessian product.
grad_hess=lambda x: (A @ x - b, lambda z: -A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=np.array([1.0, 1.0]),
verbose=verbose,
maxiter=3,
)
captured = capsys.readouterr()
msg = [
"Inner CG solver iteration 0 fell back to steepest descent",
]
for m in msg:
assert m in captured.out
A = np.diag([1e-3, 1, 1e3])
b = np.array([-2.0, 1, 2.0])
with pytest.warns(ConvergenceWarning):
_newton_cg(
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=np.ones_like(b),
verbose=verbose,
maxiter=2,
maxinner=1,
)
captured = capsys.readouterr()
msg = [
"Inner CG solver stopped reaching maxiter=1",
]
for m in msg:
assert m in captured.out
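# Illustrative sketch (an assumption, not part of the original tests): for the
# quadratic f(x) = 0.5 * x @ A @ x - b @ x used throughout this test, the gradient
# is A @ x - b and the Hessian is A, so the Newton-CG solution should match the
# direct linear solve.
def _sketch_newton_cg_quadratic():
    A = np.diag([1.0, 2.0, 3.0])
    b = np.array([1.0, -1.0, 0.5])
    x_opt = _newton_cg(
        grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
        func=lambda x: 0.5 * x @ A @ x - b @ x,
        grad=lambda x: A @ x - b,
        x0=np.zeros_like(b),
        tol=1e-10,
    )[0]
    assert_allclose(x_opt, np.linalg.solve(A, b), atol=1e-6)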
def test_check_optimize():
# Mock some lbfgs output using a Bunch instance:
result = Bunch()
# First case: no warnings
result.nit = 1
result.status = 0
result.message = "OK"
with warnings.catch_warnings():
warnings.simplefilter("error")
_check_optimize_result("lbfgs", result)
    # Second case: warning about implicit `max_iter`: do not recommend the user
    # to increase `max_iter` because it is not a user-settable parameter.
result.status = 1
result.message = "STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT"
with pytest.warns(ConvergenceWarning) as record:
_check_optimize_result("lbfgs", result)
assert len(record) == 1
warn_msg = record[0].message.args[0]
assert "lbfgs failed to converge after 1 iteration(s)" in warn_msg
assert result.message in warn_msg
assert "Increase the number of iterations" not in warn_msg
assert "scale the data" in warn_msg
# Third case: warning about explicit `max_iter`: recommend user to increase
# `max_iter`.
with pytest.warns(ConvergenceWarning) as record:
_check_optimize_result("lbfgs", result, max_iter=1)
assert len(record) == 1
warn_msg = record[0].message.args[0]
assert "lbfgs failed to converge after 1 iteration(s)" in warn_msg
assert result.message in warn_msg
assert "Increase the number of iterations" in warn_msg
assert "scale the data" in warn_msg
# Fourth case: other convergence problem before reaching `max_iter`: do not
# recommend increasing `max_iter`.
result.nit = 2
result.status = 2
result.message = "ABNORMAL"
with pytest.warns(ConvergenceWarning) as record:
_check_optimize_result("lbfgs", result, max_iter=10)
assert len(record) == 1
warn_msg = record[0].message.args[0]
assert "lbfgs failed to converge after 2 iteration(s)" in warn_msg
assert result.message in warn_msg
assert "Increase the number of iterations" not in warn_msg
assert "scale the data" in warn_msg

View File

@@ -0,0 +1,197 @@
import itertools
import re
import time
import warnings
import joblib
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn import config_context, get_config
from sklearn.compose import make_column_transformer
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.fixes import _IS_WASM
from sklearn.utils.parallel import Parallel, delayed
def get_working_memory():
return get_config()["working_memory"]
@pytest.mark.parametrize("n_jobs", [1, 2])
@pytest.mark.parametrize("backend", ["loky", "threading", "multiprocessing"])
def test_configuration_passes_through_to_joblib(n_jobs, backend):
    # Tests that the global configuration is passed to joblib jobs
with config_context(working_memory=123):
results = Parallel(n_jobs=n_jobs, backend=backend)(
delayed(get_working_memory)() for _ in range(2)
)
assert_array_equal(results, [123] * 2)
def test_parallel_delayed_warnings():
"""Informative warnings should be raised when mixing sklearn and joblib API"""
    # We should issue a warning when one wants to use sklearn.utils.parallel.Parallel
    # with joblib.delayed. The config will not be propagated to the workers.
warn_msg = "`sklearn.utils.parallel.Parallel` needs to be used in conjunction"
with pytest.warns(UserWarning, match=warn_msg) as records:
Parallel()(joblib.delayed(time.sleep)(0) for _ in range(10))
assert len(records) == 10
    # We should issue a warning if one wants to use sklearn.utils.parallel.delayed
    # with joblib.Parallel
warn_msg = (
"`sklearn.utils.parallel.delayed` should be used with "
"`sklearn.utils.parallel.Parallel` to make it possible to propagate"
)
with pytest.warns(UserWarning, match=warn_msg) as records:
joblib.Parallel()(delayed(time.sleep)(0) for _ in range(10))
assert len(records) == 10
@pytest.mark.parametrize("n_jobs", [1, 2])
def test_dispatch_config_parallel(n_jobs):
"""Check that we properly dispatch the configuration in parallel processing.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/25239
"""
pd = pytest.importorskip("pandas")
iris = load_iris(as_frame=True)
class TransformerRequiredDataFrame(StandardScaler):
def fit(self, X, y=None):
assert isinstance(X, pd.DataFrame), "X should be a DataFrame"
return super().fit(X, y)
def transform(self, X, y=None):
assert isinstance(X, pd.DataFrame), "X should be a DataFrame"
return super().transform(X, y)
dropper = make_column_transformer(
("drop", [0]),
remainder="passthrough",
n_jobs=n_jobs,
)
param_grid = {"randomforestclassifier__max_depth": [1, 2, 3]}
search_cv = GridSearchCV(
make_pipeline(
dropper,
TransformerRequiredDataFrame(),
RandomForestClassifier(n_estimators=5, n_jobs=n_jobs),
),
param_grid,
cv=5,
n_jobs=n_jobs,
error_score="raise", # this search should not fail
)
# make sure that `fit` would fail in case we don't request dataframe
with pytest.raises(AssertionError, match="X should be a DataFrame"):
search_cv.fit(iris.data, iris.target)
with config_context(transform_output="pandas"):
# we expect each intermediate steps to output a DataFrame
search_cv.fit(iris.data, iris.target)
assert not np.isnan(search_cv.cv_results_["mean_test_score"]).any()
def raise_warning():
warnings.warn("Convergence warning", ConvergenceWarning)
def _yield_n_jobs_backend_combinations():
n_jobs_values = [1, 2]
backend_values = ["loky", "threading", "multiprocessing"]
for n_jobs, backend in itertools.product(n_jobs_values, backend_values):
if n_jobs == 2 and backend == "loky":
# XXX Mark thread-unsafe to avoid:
# RuntimeError: The executor underlying Parallel has been shutdown.
# See https://github.com/joblib/joblib/issues/1743 for more details.
yield pytest.param(n_jobs, backend, marks=pytest.mark.thread_unsafe)
else:
yield n_jobs, backend
@pytest.mark.parametrize("n_jobs, backend", _yield_n_jobs_backend_combinations())
def test_filter_warning_propagates(n_jobs, backend):
"""Check warning propagates to the job."""
with warnings.catch_warnings():
warnings.simplefilter("error", category=ConvergenceWarning)
with pytest.raises(ConvergenceWarning):
Parallel(n_jobs=n_jobs, backend=backend)(
delayed(raise_warning)() for _ in range(2)
)
def get_warning_filters():
# In free-threading Python >= 3.14, warnings filters are managed through a
# ContextVar and warnings.filters is not modified inside a
# warnings.catch_warnings context. You need to use warnings._get_filters().
# For more details, see
# https://docs.python.org/3.14/whatsnew/3.14.html#concurrent-safe-warnings-control
filters_func = getattr(warnings, "_get_filters", None)
return filters_func() if filters_func is not None else warnings.filters
def test_check_warnings_threading():
"""Check that warnings filters are set correctly in the threading backend."""
with warnings.catch_warnings():
warnings.simplefilter("error", category=ConvergenceWarning)
main_warning_filters = get_warning_filters()
assert ("error", None, ConvergenceWarning, None, 0) in main_warning_filters
all_worker_warning_filters = Parallel(n_jobs=2, backend="threading")(
delayed(get_warning_filters)() for _ in range(2)
)
def normalize_main_module(filters):
            # In free-threaded Python 3.14, there is a small discrepancy: the main
            # warning filters have an entry with module set to "__main__", whereas
            # in the workers the module entry is a compiled regex (re.Pattern).
return [
(
action,
message,
type_,
module
if "__main__" not in str(module)
or not isinstance(module, re.Pattern)
else module.pattern,
lineno,
)
for action, message, type_, module, lineno in main_warning_filters
]
for worker_warning_filter in all_worker_warning_filters:
assert normalize_main_module(
worker_warning_filter
) == normalize_main_module(main_warning_filters)
@pytest.mark.xfail(_IS_WASM, reason="Pyodide always use the sequential backend")
def test_filter_warning_propagates_no_side_effect_with_loky_backend():
with warnings.catch_warnings():
warnings.simplefilter("error", category=ConvergenceWarning)
Parallel(n_jobs=2, backend="loky")(delayed(time.sleep)(0) for _ in range(10))
# Since loky workers are reused, make sure that inside the loky workers,
# warnings filters have been reset to their original value. Using joblib
# directly should not turn ConvergenceWarning into an error.
joblib.Parallel(n_jobs=2, backend="loky")(
joblib.delayed(warnings.warn)("Convergence warning", ConvergenceWarning)
for _ in range(10)
)

View File

@@ -0,0 +1,786 @@
from numbers import Integral, Real
import numpy as np
import pytest
from scipy.sparse import csr_matrix
from sklearn._config import config_context, get_config
from sklearn.base import BaseEstimator, _fit_context
from sklearn.model_selection import LeaveOneOut
from sklearn.utils import deprecated
from sklearn.utils._param_validation import (
HasMethods,
Hidden,
Interval,
InvalidParameterError,
MissingValues,
Options,
RealNotInt,
StrOptions,
_ArrayLikes,
_Booleans,
_Callables,
_CVObjects,
_InstancesOf,
_IterablesNotString,
_NanConstraint,
_NoneConstraint,
_PandasNAConstraint,
_RandomStates,
_SparseMatrices,
_VerboseHelper,
generate_invalid_param_val,
generate_valid_param,
make_constraint,
validate_params,
)
from sklearn.utils.fixes import CSR_CONTAINERS
# Some helpers for the tests
@validate_params(
{"a": [Real], "b": [Real], "c": [Real], "d": [Real]},
prefer_skip_nested_validation=True,
)
def _func(a, b=0, *args, c, d=0, **kwargs):
"""A function to test the validation of functions."""
class _Class:
"""A class to test the _InstancesOf constraint and the validation of methods."""
@validate_params({"a": [Real]}, prefer_skip_nested_validation=True)
def _method(self, a):
"""A validated method"""
@deprecated()
@validate_params({"a": [Real]}, prefer_skip_nested_validation=True)
def _deprecated_method(self, a):
"""A deprecated validated method"""
class _Estimator(BaseEstimator):
"""An estimator to test the validation of estimator parameters."""
_parameter_constraints: dict = {"a": [Real]}
def __init__(self, a):
self.a = a
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X=None, y=None):
pass
@pytest.mark.parametrize("interval_type", [Integral, Real])
def test_interval_range(interval_type):
"""Check the range of values depending on closed."""
interval = Interval(interval_type, -2, 2, closed="left")
assert -2 in interval
assert 2 not in interval
interval = Interval(interval_type, -2, 2, closed="right")
assert -2 not in interval
assert 2 in interval
interval = Interval(interval_type, -2, 2, closed="both")
assert -2 in interval
assert 2 in interval
interval = Interval(interval_type, -2, 2, closed="neither")
assert -2 not in interval
assert 2 not in interval
@pytest.mark.parametrize("interval_type", [Integral, Real])
def test_interval_large_integers(interval_type):
"""Check that Interval constraint work with large integers.
non-regression test for #26648.
"""
interval = Interval(interval_type, 0, 2, closed="neither")
assert 2**65 not in interval
assert 2**128 not in interval
assert float(2**65) not in interval
assert float(2**128) not in interval
interval = Interval(interval_type, 0, 2**128, closed="neither")
assert 2**65 in interval
assert 2**128 not in interval
assert float(2**65) in interval
assert float(2**128) not in interval
assert 2**1024 not in interval
def test_interval_inf_in_bounds():
"""Check that inf is included iff a bound is closed and set to None.
Only valid for real intervals.
"""
interval = Interval(Real, 0, None, closed="right")
assert np.inf in interval
interval = Interval(Real, None, 0, closed="left")
assert -np.inf in interval
interval = Interval(Real, None, None, closed="neither")
assert np.inf not in interval
assert -np.inf not in interval
@pytest.mark.parametrize(
"interval",
[Interval(Real, 0, 1, closed="left"), Interval(Real, None, None, closed="both")],
)
def test_nan_not_in_interval(interval):
"""Check that np.nan is not in any interval."""
assert np.nan not in interval
@pytest.mark.parametrize(
"params, error, match",
[
(
{"type": Integral, "left": 1.0, "right": 2, "closed": "both"},
TypeError,
r"Expecting left to be an int for an interval over the integers",
),
(
{"type": Integral, "left": 1, "right": 2.0, "closed": "neither"},
TypeError,
"Expecting right to be an int for an interval over the integers",
),
(
{"type": Integral, "left": None, "right": 0, "closed": "left"},
ValueError,
r"left can't be None when closed == left",
),
(
{"type": Integral, "left": 0, "right": None, "closed": "right"},
ValueError,
r"right can't be None when closed == right",
),
(
{"type": Integral, "left": 1, "right": -1, "closed": "both"},
ValueError,
r"right can't be less than left",
),
],
)
def test_interval_errors(params, error, match):
"""Check that informative errors are raised for invalid combination of parameters"""
with pytest.raises(error, match=match):
Interval(**params)
def test_stroptions():
"""Sanity check for the StrOptions constraint"""
options = StrOptions({"a", "b", "c"}, deprecated={"c"})
assert options.is_satisfied_by("a")
assert options.is_satisfied_by("c")
assert not options.is_satisfied_by("d")
assert "'c' (deprecated)" in str(options)
def test_options():
"""Sanity check for the Options constraint"""
options = Options(Real, {-0.5, 0.5, np.inf}, deprecated={-0.5})
assert options.is_satisfied_by(-0.5)
assert options.is_satisfied_by(np.inf)
assert not options.is_satisfied_by(1.23)
assert "-0.5 (deprecated)" in str(options)
@pytest.mark.parametrize(
"type, expected_type_name",
[
(int, "int"),
(Integral, "int"),
(Real, "float"),
(np.ndarray, "numpy.ndarray"),
],
)
def test_instances_of_type_human_readable(type, expected_type_name):
"""Check the string representation of the _InstancesOf constraint."""
constraint = _InstancesOf(type)
assert str(constraint) == f"an instance of '{expected_type_name}'"
def test_hasmethods():
"""Check the HasMethods constraint."""
constraint = HasMethods(["a", "b"])
class _Good:
def a(self):
pass # pragma: no cover
def b(self):
pass # pragma: no cover
class _Bad:
def a(self):
pass # pragma: no cover
assert constraint.is_satisfied_by(_Good())
assert not constraint.is_satisfied_by(_Bad())
assert str(constraint) == "an object implementing 'a' and 'b'"
@pytest.mark.parametrize(
"constraint",
[
Interval(Real, None, 0, closed="left"),
Interval(Real, 0, None, closed="left"),
Interval(Real, None, None, closed="neither"),
StrOptions({"a", "b", "c"}),
MissingValues(),
MissingValues(numeric_only=True),
_VerboseHelper(),
HasMethods("fit"),
_IterablesNotString(),
_CVObjects(),
],
)
def test_generate_invalid_param_val(constraint):
"""Check that the value generated does not satisfy the constraint"""
bad_value = generate_invalid_param_val(constraint)
assert not constraint.is_satisfied_by(bad_value)
@pytest.mark.parametrize(
"integer_interval, real_interval",
[
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, -5, 5, closed="both"),
),
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, -5, 5, closed="neither"),
),
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, 4, 5, closed="both"),
),
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, 5, None, closed="left"),
),
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, 4, None, closed="neither"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, -5, 5, closed="both"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, -5, 5, closed="neither"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, 1, 2, closed="both"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, None, -5, closed="left"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, None, -4, closed="neither"),
),
(
Interval(Integral, -5, 5, closed="both"),
Interval(RealNotInt, None, 1, closed="right"),
),
(
Interval(Integral, -5, 5, closed="both"),
Interval(RealNotInt, 1, None, closed="left"),
),
(
Interval(Integral, -5, 5, closed="both"),
Interval(RealNotInt, -10, -4, closed="neither"),
),
(
Interval(Integral, -5, 5, closed="both"),
Interval(RealNotInt, -10, -4, closed="right"),
),
(
Interval(Integral, -5, 5, closed="neither"),
Interval(RealNotInt, 6, 10, closed="neither"),
),
(
Interval(Integral, -5, 5, closed="neither"),
Interval(RealNotInt, 6, 10, closed="left"),
),
(
Interval(Integral, 2, None, closed="left"),
Interval(RealNotInt, 0, 1, closed="both"),
),
(
Interval(Integral, 1, None, closed="left"),
Interval(RealNotInt, 0, 1, closed="both"),
),
],
)
def test_generate_invalid_param_val_2_intervals(integer_interval, real_interval):
"""Check that the value generated for an interval constraint does not satisfy any of
the interval constraints.
"""
bad_value = generate_invalid_param_val(constraint=real_interval)
assert not real_interval.is_satisfied_by(bad_value)
assert not integer_interval.is_satisfied_by(bad_value)
bad_value = generate_invalid_param_val(constraint=integer_interval)
assert not real_interval.is_satisfied_by(bad_value)
assert not integer_interval.is_satisfied_by(bad_value)
@pytest.mark.parametrize(
"constraint",
[
_ArrayLikes(),
_InstancesOf(list),
_Callables(),
_NoneConstraint(),
_RandomStates(),
_SparseMatrices(),
_Booleans(),
Interval(Integral, None, None, closed="neither"),
],
)
def test_generate_invalid_param_val_all_valid(constraint):
"""Check that the function raises NotImplementedError when there's no invalid value
for the constraint.
"""
with pytest.raises(NotImplementedError):
generate_invalid_param_val(constraint)
@pytest.mark.parametrize(
"constraint",
[
_ArrayLikes(),
_Callables(),
_InstancesOf(list),
_NoneConstraint(),
_RandomStates(),
_SparseMatrices(),
_Booleans(),
_VerboseHelper(),
MissingValues(),
MissingValues(numeric_only=True),
StrOptions({"a", "b", "c"}),
Options(Integral, {1, 2, 3}),
Interval(Integral, None, None, closed="neither"),
Interval(Integral, 0, 10, closed="neither"),
Interval(Integral, 0, None, closed="neither"),
Interval(Integral, None, 0, closed="neither"),
Interval(Real, 0, 1, closed="neither"),
Interval(Real, 0, None, closed="both"),
Interval(Real, None, 0, closed="right"),
HasMethods("fit"),
_IterablesNotString(),
_CVObjects(),
],
)
def test_generate_valid_param(constraint):
"""Check that the value generated does satisfy the constraint."""
value = generate_valid_param(constraint)
assert constraint.is_satisfied_by(value)
@pytest.mark.parametrize(
"constraint_declaration, value",
[
(Interval(Real, 0, 1, closed="both"), 0.42),
(Interval(Integral, 0, None, closed="neither"), 42),
(StrOptions({"a", "b", "c"}), "b"),
(Options(type, {np.float32, np.float64}), np.float64),
(callable, lambda x: x + 1),
(None, None),
("array-like", [[1, 2], [3, 4]]),
("array-like", np.array([[1, 2], [3, 4]])),
("sparse matrix", csr_matrix([[1, 2], [3, 4]])),
*[
("sparse matrix", container([[1, 2], [3, 4]]))
for container in CSR_CONTAINERS
],
("random_state", 0),
("random_state", np.random.RandomState(0)),
("random_state", None),
(_Class, _Class()),
(int, 1),
(Real, 0.5),
("boolean", False),
("verbose", 1),
("nan", np.nan),
(MissingValues(), -1),
(MissingValues(), -1.0),
(MissingValues(), 2**1028),
(MissingValues(), None),
(MissingValues(), float("nan")),
(MissingValues(), np.nan),
(MissingValues(), "missing"),
(HasMethods("fit"), _Estimator(a=0)),
("cv_object", 5),
],
)
def test_is_satisfied_by(constraint_declaration, value):
"""Sanity check for the is_satisfied_by method"""
constraint = make_constraint(constraint_declaration)
assert constraint.is_satisfied_by(value)
@pytest.mark.parametrize(
"constraint_declaration, expected_constraint_class",
[
(Interval(Real, 0, 1, closed="both"), Interval),
(StrOptions({"option1", "option2"}), StrOptions),
(Options(Real, {0.42, 1.23}), Options),
("array-like", _ArrayLikes),
("sparse matrix", _SparseMatrices),
("random_state", _RandomStates),
(None, _NoneConstraint),
(callable, _Callables),
(int, _InstancesOf),
("boolean", _Booleans),
("verbose", _VerboseHelper),
(MissingValues(numeric_only=True), MissingValues),
(HasMethods("fit"), HasMethods),
("cv_object", _CVObjects),
("nan", _NanConstraint),
(np.nan, _NanConstraint),
],
)
def test_make_constraint(constraint_declaration, expected_constraint_class):
"""Check that make_constraint dispatches to the appropriate constraint class"""
constraint = make_constraint(constraint_declaration)
assert constraint.__class__ is expected_constraint_class
def test_make_constraint_unknown():
"""Check that an informative error is raised when an unknown constraint is passed"""
with pytest.raises(ValueError, match="Unknown constraint"):
make_constraint("not a valid constraint")
def test_validate_params():
"""Check that validate_params works no matter how the arguments are passed"""
with pytest.raises(
InvalidParameterError, match="The 'a' parameter of _func must be"
):
_func("wrong", c=1)
with pytest.raises(
InvalidParameterError, match="The 'b' parameter of _func must be"
):
_func(*[1, "wrong"], c=1)
with pytest.raises(
InvalidParameterError, match="The 'c' parameter of _func must be"
):
_func(1, **{"c": "wrong"})
with pytest.raises(
InvalidParameterError, match="The 'd' parameter of _func must be"
):
_func(1, c=1, d="wrong")
# check in the presence of extra positional and keyword args
with pytest.raises(
InvalidParameterError, match="The 'b' parameter of _func must be"
):
_func(0, *["wrong", 2, 3], c=4, **{"e": 5})
with pytest.raises(
InvalidParameterError, match="The 'c' parameter of _func must be"
):
_func(0, *[1, 2, 3], c="four", **{"e": 5})
def test_validate_params_missing_params():
"""Check that no error is raised when there are parameters without
constraints
"""
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
def func(a, b):
pass
func(1, 2)
def test_decorate_validated_function():
"""Check that validate_params functions can be decorated"""
decorated_function = deprecated()(_func)
with pytest.warns(FutureWarning, match="Function _func is deprecated"):
decorated_function(1, 2, c=3)
# outer decorator does not interfere with validation
with pytest.warns(FutureWarning, match="Function _func is deprecated"):
with pytest.raises(
InvalidParameterError, match=r"The 'c' parameter of _func must be"
):
decorated_function(1, 2, c="wrong")
def test_validate_params_method():
"""Check that validate_params works with methods"""
with pytest.raises(
InvalidParameterError, match="The 'a' parameter of _Class._method must be"
):
_Class()._method("wrong")
# validated method can be decorated
with pytest.warns(FutureWarning, match="Function _deprecated_method is deprecated"):
with pytest.raises(
InvalidParameterError,
match="The 'a' parameter of _Class._deprecated_method must be",
):
_Class()._deprecated_method("wrong")
def test_validate_params_estimator():
"""Check that validate_params works with Estimator instances"""
# no validation in init
est = _Estimator("wrong")
with pytest.raises(
InvalidParameterError, match="The 'a' parameter of _Estimator must be"
):
est.fit()
def test_stroptions_deprecated_subset():
"""Check that the deprecated parameter must be a subset of options."""
with pytest.raises(ValueError, match="deprecated options must be a subset"):
StrOptions({"a", "b", "c"}, deprecated={"a", "d"})
def test_hidden_constraint():
"""Check that internal constraints are not exposed in the error message."""
@validate_params(
{"param": [Hidden(list), dict]}, prefer_skip_nested_validation=True
)
def f(param):
pass
# list and dict are valid params
f({"a": 1, "b": 2, "c": 3})
f([1, 2, 3])
with pytest.raises(
InvalidParameterError, match="The 'param' parameter"
) as exc_info:
f(param="bad")
# the list option is not exposed in the error message
err_msg = str(exc_info.value)
assert "an instance of 'dict'" in err_msg
assert "an instance of 'list'" not in err_msg
def test_hidden_stroptions():
"""Check that we can have 2 StrOptions constraints, one being hidden."""
@validate_params(
{"param": [StrOptions({"auto"}), Hidden(StrOptions({"warn"}))]},
prefer_skip_nested_validation=True,
)
def f(param):
pass
# "auto" and "warn" are valid params
f("auto")
f("warn")
with pytest.raises(
InvalidParameterError, match="The 'param' parameter"
) as exc_info:
f(param="bad")
# the "warn" option is not exposed in the error message
err_msg = str(exc_info.value)
assert "auto" in err_msg
assert "warn" not in err_msg
def test_validate_params_set_param_constraints_attribute():
"""Check that the validate_params decorator properly sets the parameter constraints
    as an attribute of the decorated function/method.
"""
assert hasattr(_func, "_skl_parameter_constraints")
assert hasattr(_Class()._method, "_skl_parameter_constraints")
def test_boolean_constraint_deprecated_int():
"""Check that validate_params raise a deprecation message but still passes
validation when using an int for a parameter accepting a boolean.
"""
@validate_params({"param": ["boolean"]}, prefer_skip_nested_validation=True)
def f(param):
pass
# True/False and np.bool_(True/False) are valid params
f(True)
f(np.bool_(False))
def test_no_validation():
"""Check that validation can be skipped for a parameter."""
@validate_params(
{"param1": [int, None], "param2": "no_validation"},
prefer_skip_nested_validation=True,
)
def f(param1=None, param2=None):
pass
# param1 is validated
with pytest.raises(InvalidParameterError, match="The 'param1' parameter"):
f(param1="wrong")
# param2 is not validated: any type is valid.
class SomeType:
pass
f(param2=SomeType)
f(param2=SomeType())
def test_pandas_na_constraint_with_pd_na():
"""Add a specific test for checking support for `pandas.NA`."""
pd = pytest.importorskip("pandas")
na_constraint = _PandasNAConstraint()
assert na_constraint.is_satisfied_by(pd.NA)
assert not na_constraint.is_satisfied_by(np.array([1, 2, 3]))
def test_iterable_not_string():
"""Check that a string does not satisfy the _IterableNotString constraint."""
constraint = _IterablesNotString()
assert constraint.is_satisfied_by([1, 2, 3])
assert constraint.is_satisfied_by(range(10))
assert not constraint.is_satisfied_by("some string")
def test_cv_objects():
"""Check that the _CVObjects constraint accepts all current ways
to pass cv objects."""
constraint = _CVObjects()
assert constraint.is_satisfied_by(5)
assert constraint.is_satisfied_by(LeaveOneOut())
assert constraint.is_satisfied_by([([1, 2], [3, 4]), ([3, 4], [1, 2])])
assert constraint.is_satisfied_by(None)
assert not constraint.is_satisfied_by("not a CV object")
def test_third_party_estimator():
"""Check that the validation from a scikit-learn estimator inherited by a third
party estimator does not impose a match between the dict of constraints and the
parameters of the estimator.
"""
class ThirdPartyEstimator(_Estimator):
def __init__(self, b):
self.b = b
super().__init__(a=0)
def fit(self, X=None, y=None):
super().fit(X, y)
# does not raise, even though "b" is not in the constraints dict and "a" is not
# a parameter of the estimator.
ThirdPartyEstimator(b=0).fit()
def test_interval_real_not_int():
"""Check for the type RealNotInt in the Interval constraint."""
constraint = Interval(RealNotInt, 0, 1, closed="both")
assert constraint.is_satisfied_by(1.0)
assert not constraint.is_satisfied_by(1)
def test_real_not_int():
"""Check for the RealNotInt type."""
assert isinstance(1.0, RealNotInt)
assert not isinstance(1, RealNotInt)
assert isinstance(np.float64(1), RealNotInt)
assert not isinstance(np.int64(1), RealNotInt)
def test_skip_param_validation():
"""Check that param validation can be skipped using config_context."""
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
def f(a):
pass
with pytest.raises(InvalidParameterError, match="The 'a' parameter"):
f(a="1")
# does not raise
with config_context(skip_parameter_validation=True):
f(a="1")
@pytest.mark.parametrize("prefer_skip_nested_validation", [True, False])
def test_skip_nested_validation(prefer_skip_nested_validation):
"""Check that nested validation can be skipped."""
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
def f(a):
pass
@validate_params(
{"b": [int]},
prefer_skip_nested_validation=prefer_skip_nested_validation,
)
def g(b):
# calls f with a bad parameter type
return f(a="invalid_param_value")
# Validation for g is never skipped.
with pytest.raises(InvalidParameterError, match="The 'b' parameter"):
g(b="invalid_param_value")
if prefer_skip_nested_validation:
g(b=1) # does not raise because inner f is not validated
else:
with pytest.raises(InvalidParameterError, match="The 'a' parameter"):
g(b=1)
@pytest.mark.parametrize(
"skip_parameter_validation, prefer_skip_nested_validation, expected_skipped",
[
(True, True, True),
(True, False, True),
(False, True, True),
(False, False, False),
],
)
def test_skip_nested_validation_and_config_context(
skip_parameter_validation, prefer_skip_nested_validation, expected_skipped
):
"""Check interaction between global skip and local skip."""
@validate_params(
{"a": [int]}, prefer_skip_nested_validation=prefer_skip_nested_validation
)
def g(a):
return get_config()["skip_parameter_validation"]
with config_context(skip_parameter_validation=skip_parameter_validation):
actual_skipped = g(1)
assert actual_skipped == expected_skipped
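# Illustrative sketch (not part of the original tests): a typical combination of
# constraints applied to a user-defined function, mirroring how estimators declare
# `_parameter_constraints`. The function and parameter names below are hypothetical.
@validate_params(
    {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "solver": [StrOptions({"svd", "lobpcg"})],
    },
    prefer_skip_nested_validation=True,
)
def _sketch_configure(n_components, solver="svd"):
    # _sketch_configure(2, solver="svd") passes validation, while
    # _sketch_configure(0) raises InvalidParameterError.
    return n_components, solver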

View File

@@ -0,0 +1,543 @@
import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.utils._plotting import (
_BinaryClassifierCurveDisplayMixin,
_deprecate_estimator_name,
_despine,
_interval_max_min_ratio,
_validate_score_name,
_validate_style_kwargs,
)
from sklearn.utils._response import _get_response_values_binary
from sklearn.utils._testing import assert_allclose
@pytest.mark.parametrize("ax", [None, "Ax"])
@pytest.mark.parametrize(
"name, expected_name_out", [(None, "TestEstimator"), ("CustomName", "CustomName")]
)
def test_validate_plot_params(pyplot, ax, name, expected_name_out):
"""Check `_validate_plot_params` returns the correct values."""
display = _BinaryClassifierCurveDisplayMixin()
display.estimator_name = "TestEstimator"
if ax:
_, ax = pyplot.subplots()
ax_out, _, name_out = display._validate_plot_params(ax=ax, name=name)
assert name_out == expected_name_out
if ax:
assert ax == ax_out
@pytest.mark.parametrize("pos_label", [None, 0])
@pytest.mark.parametrize("name", [None, "CustomName"])
@pytest.mark.parametrize(
"response_method", ["auto", "predict_proba", "decision_function"]
)
def test_validate_and_get_response_values(pyplot, pos_label, name, response_method):
"""Check `_validate_and_get_response_values` returns the correct values."""
X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]])
y = np.array([0, 0, 2, 2])
estimator = LogisticRegression().fit(X, y)
y_pred, pos_label, name_out = (
_BinaryClassifierCurveDisplayMixin._validate_and_get_response_values(
estimator,
X,
y,
response_method=response_method,
pos_label=pos_label,
name=name,
)
)
expected_y_pred, expected_pos_label = _get_response_values_binary(
estimator, X, response_method=response_method, pos_label=pos_label
)
assert_allclose(y_pred, expected_y_pred)
assert pos_label == expected_pos_label
# Check name is handled correctly
expected_name = name if name is not None else "LogisticRegression"
assert name_out == expected_name
@pytest.mark.parametrize(
"y_true, error_message",
[
(np.array([0, 1, 2]), "The target y is not binary."),
(np.array([0, 1]), "Found input variables with inconsistent"),
(np.array([0, 2, 0, 2]), r"y_true takes value in \{0, 2\} and pos_label"),
],
)
def test_validate_from_predictions_params_errors(pyplot, y_true, error_message):
"""Check `_validate_from_predictions_params` raises the correct errors."""
y_pred = np.array([0.1, 0.2, 0.3, 0.4])
sample_weight = np.ones(4)
with pytest.raises(ValueError, match=error_message):
_BinaryClassifierCurveDisplayMixin._validate_from_predictions_params(
y_true=y_true,
y_pred=y_pred,
sample_weight=sample_weight,
pos_label=None,
)
@pytest.mark.parametrize("name", [None, "CustomName"])
@pytest.mark.parametrize(
"pos_label, y_true",
[
(None, np.array([0, 1, 0, 1])),
(2, np.array([0, 2, 0, 2])),
],
)
def test_validate_from_predictions_params_returns(pyplot, name, pos_label, y_true):
"""Check `_validate_from_predictions_params` returns the correct values."""
y_pred = np.array([0.1, 0.2, 0.3, 0.4])
pos_label_out, name_out = (
_BinaryClassifierCurveDisplayMixin._validate_from_predictions_params(
y_true=y_true,
y_pred=y_pred,
sample_weight=None,
pos_label=pos_label,
name=name,
)
)
# Check name is handled correctly
expected_name = name if name is not None else "Classifier"
assert name_out == expected_name
# Check pos_label is handled correctly
expected_pos_label = pos_label if pos_label is not None else 1
assert pos_label_out == expected_pos_label
@pytest.mark.parametrize(
"params, err_msg",
[
(
{
# Missing "indices" key
"cv_results": {"estimator": "dummy"},
"X": np.array([[1, 2], [3, 4]]),
"y": np.array([0, 1]),
"sample_weight": None,
},
"`cv_results` does not contain one of the following",
),
(
{
"cv_results": {
"estimator": "dummy",
"indices": {"test": [[1, 2], [1, 2]], "train": [[3, 4], [3, 4]]},
},
# `X` wrong length
"X": np.array([[1, 2]]),
"y": np.array([0, 1]),
"sample_weight": None,
},
"`X` does not contain the correct number of",
),
(
{
"cv_results": {
"estimator": "dummy",
"indices": {"test": [[1, 2], [1, 2]], "train": [[3, 4], [3, 4]]},
},
"X": np.array([1, 2, 3, 4]),
# `y` not binary
"y": np.array([0, 2, 1, 3]),
"sample_weight": None,
},
"The target `y` is not binary",
),
(
{
"cv_results": {
"estimator": "dummy",
"indices": {"test": [[1, 2], [1, 2]], "train": [[3, 4], [3, 4]]},
},
"X": np.array([1, 2, 3, 4]),
"y": np.array([0, 1, 0, 1]),
# `sample_weight` wrong length
"sample_weight": np.array([0.5]),
},
"Found input variables with inconsistent",
),
],
)
def test_validate_from_cv_results_params(pyplot, params, err_msg):
"""Check parameter validation is performed correctly."""
with pytest.raises(ValueError, match=err_msg):
_BinaryClassifierCurveDisplayMixin()._validate_from_cv_results_params(**params)
@pytest.mark.parametrize(
"curve_legend_metric, curve_name, expected_label",
[
(0.85, None, "AUC = 0.85"),
(None, "Model A", "Model A"),
(0.95, "Random Forest", "Random Forest (AUC = 0.95)"),
(None, None, None),
],
)
def test_get_legend_label(curve_legend_metric, curve_name, expected_label):
"""Check `_get_legend_label` returns the correct label."""
legend_metric_name = "AUC"
label = _BinaryClassifierCurveDisplayMixin._get_legend_label(
curve_legend_metric, curve_name, legend_metric_name
)
assert label == expected_label
# TODO(1.9): Remove
@pytest.mark.parametrize("curve_kwargs", [{"alpha": 1.0}, None])
@pytest.mark.parametrize("kwargs", [{}, {"alpha": 1.0}])
def test_validate_curve_kwargs_deprecate_kwargs(curve_kwargs, kwargs):
"""Check `_validate_curve_kwargs` deprecates kwargs correctly."""
n_curves = 1
name = None
legend_metric = {"mean": 0.8, "std": 0.1}
legend_metric_name = "AUC"
if curve_kwargs and kwargs:
with pytest.raises(ValueError, match="Cannot provide both `curve_kwargs`"):
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
n_curves,
name,
legend_metric,
legend_metric_name,
curve_kwargs,
**kwargs,
)
elif kwargs:
with pytest.warns(FutureWarning, match=r"`\*\*kwargs` is deprecated and"):
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
n_curves,
name,
legend_metric,
legend_metric_name,
curve_kwargs,
**kwargs,
)
else:
# No warning or error should be raised
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
n_curves, name, legend_metric, legend_metric_name, curve_kwargs, **kwargs
)
def test_validate_curve_kwargs_error():
"""Check `_validate_curve_kwargs` performs parameter validation correctly."""
n_curves = 3
legend_metric = {"mean": 0.8, "std": 0.1}
legend_metric_name = "AUC"
with pytest.raises(ValueError, match="`curve_kwargs` must be None"):
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
n_curves=n_curves,
name=None,
legend_metric=legend_metric,
legend_metric_name=legend_metric_name,
curve_kwargs=[{"alpha": 1.0}],
)
with pytest.raises(ValueError, match="To avoid labeling individual curves"):
name = ["one", "two", "three"]
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
n_curves=n_curves,
name=name,
legend_metric=legend_metric,
legend_metric_name=legend_metric_name,
curve_kwargs=None,
)
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
n_curves=n_curves,
name=name,
legend_metric=legend_metric,
legend_metric_name=legend_metric_name,
curve_kwargs={"alpha": 1.0},
)
@pytest.mark.parametrize("name", [None, "curve_name", ["curve_name"]])
@pytest.mark.parametrize(
"legend_metric",
[{"mean": 0.8, "std": 0.2}, {"mean": None, "std": None}],
)
@pytest.mark.parametrize("legend_metric_name", ["AUC", "AP"])
@pytest.mark.parametrize("curve_kwargs", [None, {"color": "red"}])
def test_validate_curve_kwargs_single_legend(
name, legend_metric, legend_metric_name, curve_kwargs
):
"""Check `_validate_curve_kwargs` returns correct kwargs for single legend entry."""
n_curves = 3
curve_kwargs_out = _BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
n_curves=n_curves,
name=name,
legend_metric=legend_metric,
legend_metric_name=legend_metric_name,
curve_kwargs=curve_kwargs,
)
assert isinstance(curve_kwargs_out, list)
assert len(curve_kwargs_out) == n_curves
expected_label = None
if isinstance(name, list):
name = name[0]
if name is not None:
expected_label = name
if legend_metric["mean"] is not None:
expected_label = expected_label + f" ({legend_metric_name} = 0.80 +/- 0.20)"
# `name` is None
elif legend_metric["mean"] is not None:
expected_label = f"{legend_metric_name} = 0.80 +/- 0.20"
assert curve_kwargs_out[0]["label"] == expected_label
# All remaining curves should have None as "label"
assert curve_kwargs_out[1]["label"] is None
assert curve_kwargs_out[2]["label"] is None
if curve_kwargs is None:
assert all("color" not in kwargs for kwargs in curve_kwargs_out)
else:
assert all(kwargs["color"] == "red" for kwargs in curve_kwargs_out)
@pytest.mark.parametrize("name", [None, "curve_name", ["one", "two", "three"]])
@pytest.mark.parametrize(
"legend_metric", [{"metric": [1.0, 1.0, 1.0]}, {"metric": [None, None, None]}]
)
@pytest.mark.parametrize("legend_metric_name", ["AUC", "AP"])
def test_validate_curve_kwargs_multi_legend(name, legend_metric, legend_metric_name):
"""Check `_validate_curve_kwargs` returns correct kwargs for multi legend entry."""
n_curves = 3
curve_kwargs = [{"color": "red"}, {"color": "yellow"}, {"color": "blue"}]
curve_kwargs_out = _BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
n_curves=n_curves,
name=name,
legend_metric=legend_metric,
legend_metric_name=legend_metric_name,
curve_kwargs=curve_kwargs,
)
assert isinstance(curve_kwargs_out, list)
assert len(curve_kwargs_out) == n_curves
expected_labels = [None, None, None]
if isinstance(name, str):
expected_labels = "curve_name"
if legend_metric["metric"][0] is not None:
expected_labels = expected_labels + f" ({legend_metric_name} = 1.00)"
expected_labels = [expected_labels] * n_curves
elif isinstance(name, list) and legend_metric["metric"][0] is None:
expected_labels = name
elif isinstance(name, list) and legend_metric["metric"][0] is not None:
expected_labels = [
f"{name_single} ({legend_metric_name} = 1.00)" for name_single in name
]
# `name` is None
elif legend_metric["metric"][0] is not None:
expected_labels = [f"{legend_metric_name} = 1.00"] * n_curves
for idx, expected_label in enumerate(expected_labels):
assert curve_kwargs_out[idx]["label"] == expected_label
for curve_kwarg, curve_kwarg_out in zip(curve_kwargs, curve_kwargs_out):
assert curve_kwarg_out["color"] == curve_kwarg["color"]
@pytest.mark.parametrize("curve_kwargs", [None, {"color": "red"}])
@pytest.mark.parametrize("n_curves", [1, 3])
def test_validate_curve_kwargs_default_kwargs(n_curves, curve_kwargs):
"""Check default kwargs are incorporated correctly."""
curve_kwargs_out = _BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
n_curves=n_curves,
name="test",
legend_metric={"mean": 0.8, "std": 0.2},
legend_metric_name="metric",
curve_kwargs=curve_kwargs,
default_curve_kwargs={"color": "blue"},
default_multi_curve_kwargs={"alpha": 0.7, "linestyle": "--", "color": "green"},
)
if n_curves > 1:
# `default_multi_curve_kwargs` are incorporated
assert all(kwarg["alpha"] == 0.7 for kwarg in curve_kwargs_out)
assert all(kwarg["linestyle"] == "--" for kwarg in curve_kwargs_out)
if curve_kwargs is None:
            # `default_multi_curve_kwargs` overrides `default_curve_kwargs`
assert all(kwarg["color"] == "green" for kwarg in curve_kwargs_out)
else:
            # `curve_kwargs` overrides any defaults
assert all(kwarg["color"] == "red" for kwarg in curve_kwargs_out)
# Single curve
elif curve_kwargs is None:
# Use `default_curve_kwargs`
assert all(kwarg["color"] == "blue" for kwarg in curve_kwargs_out)
else:
# Use `curve_kwargs`
assert all(kwarg["color"] == "red" for kwarg in curve_kwargs_out)
def metric():
pass # pragma: no cover
def neg_metric():
pass # pragma: no cover
@pytest.mark.parametrize(
"score_name, scoring, negate_score, expected_score_name",
[
("accuracy", None, False, "accuracy"), # do not transform the name
(None, "accuracy", False, "Accuracy"), # capitalize the name
(None, "accuracy", True, "Negative accuracy"), # add "Negative"
(None, "neg_mean_absolute_error", False, "Negative mean absolute error"),
(None, "neg_mean_absolute_error", True, "Mean absolute error"), # remove "neg_"
("MAE", "neg_mean_absolute_error", True, "MAE"), # keep score_name
(None, None, False, "Score"), # default name
(None, None, True, "Negative score"), # default name but negated
("Some metric", metric, False, "Some metric"), # do not transform the name
("Some metric", metric, True, "Some metric"), # do not transform the name
(None, metric, False, "Metric"), # default name
(None, metric, True, "Negative metric"), # default name but negated
("Some metric", neg_metric, False, "Some metric"), # do not transform the name
("Some metric", neg_metric, True, "Some metric"), # do not transform the name
(None, neg_metric, False, "Negative metric"), # default name
(None, neg_metric, True, "Metric"), # default name but negated
],
)
def test_validate_score_name(score_name, scoring, negate_score, expected_score_name):
"""Check that we return the right score name."""
assert (
_validate_score_name(score_name, scoring, negate_score) == expected_score_name
)
# In the following test, we check the max-to-min ratio of the gaps between
# parameter values, to verify that a decision threshold of 5 is a good heuristic
# for choosing between linear and log scales on common ranges of parameter values.
@pytest.mark.parametrize(
"data, lower_bound, upper_bound",
[
# Such a range could be clearly displayed with either log scale or linear
# scale.
(np.geomspace(0.1, 1, 5), 5, 6),
# Checking that the ratio is still positive on a negative log scale.
(-np.geomspace(0.1, 1, 10), 7, 8),
# Evenly spaced parameter values lead to a ratio of 1.
(np.linspace(0, 1, 5), 0.9, 1.1),
# This is not exactly spaced on a log scale but we will benefit from treating
# it as such for visualization.
([1, 2, 5, 10, 20, 50], 20, 40),
],
)
def test_interval_max_min_ratio(data, lower_bound, upper_bound):
assert lower_bound < _interval_max_min_ratio(data) < upper_bound
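# A minimal sketch of the heuristic motivated above (an assumption, not part of the
# original module): choose the x-axis scale from the max/min ratio of the gaps
# between parameter values, using 5 as the decision threshold.
def _sketch_choose_axis_scale(param_values, threshold=5.0):
    return "log" if _interval_max_min_ratio(param_values) > threshold else "linear"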
@pytest.mark.parametrize(
"default_kwargs, user_kwargs, expected",
[
(
{"color": "blue", "linewidth": 2},
{"linestyle": "dashed"},
{"color": "blue", "linewidth": 2, "linestyle": "dashed"},
),
(
{"color": "blue", "linestyle": "solid"},
{"c": "red", "ls": "dashed"},
{"color": "red", "linestyle": "dashed"},
),
(
{"label": "xxx", "color": "k", "linestyle": "--"},
{"ls": "-."},
{"label": "xxx", "color": "k", "linestyle": "-."},
),
({}, {}, {}),
(
{},
{
"ls": "dashed",
"c": "red",
"ec": "black",
"fc": "yellow",
"lw": 2,
"mec": "green",
"mfcalt": "blue",
"ms": 5,
},
{
"linestyle": "dashed",
"color": "red",
"edgecolor": "black",
"facecolor": "yellow",
"linewidth": 2,
"markeredgecolor": "green",
"markerfacecoloralt": "blue",
"markersize": 5,
},
),
],
)
def test_validate_style_kwargs(default_kwargs, user_kwargs, expected):
"""Check the behaviour of `validate_style_kwargs` with various type of entries."""
result = _validate_style_kwargs(default_kwargs, user_kwargs)
assert result == expected, (
"The validation of style keywords does not provide the expected results: "
f"Got {result} instead of {expected}."
)
@pytest.mark.parametrize(
"default_kwargs, user_kwargs",
[({}, {"ls": 2, "linestyle": 3}), ({}, {"c": "r", "color": "blue"})],
)
def test_validate_style_kwargs_error(default_kwargs, user_kwargs):
"""Check that `validate_style_kwargs` raises TypeError"""
with pytest.raises(TypeError):
_validate_style_kwargs(default_kwargs, user_kwargs)
def test_despine(pyplot):
ax = pyplot.gca()
_despine(ax)
assert ax.spines["top"].get_visible() is False
assert ax.spines["right"].get_visible() is False
assert ax.spines["bottom"].get_bounds() == (0, 1)
assert ax.spines["left"].get_bounds() == (0, 1)
@pytest.mark.parametrize("estimator_name", ["my_est_name", "deprecated"])
@pytest.mark.parametrize("name", [None, "my_name"])
def test_deprecate_estimator_name(estimator_name, name):
"""Check `_deprecate_estimator_name` behaves correctly"""
version = "1.7"
version_remove = "1.9"
if estimator_name == "deprecated":
name_out = _deprecate_estimator_name(estimator_name, name, version)
assert name_out == name
# `estimator_name` is provided and `name` is:
elif name is None:
warning_message = (
f"`estimator_name` is deprecated in {version} and will be removed in "
f"{version_remove}. Use `name` instead."
)
with pytest.warns(FutureWarning, match=warning_message):
result = _deprecate_estimator_name(estimator_name, name, version)
assert result == estimator_name
elif name is not None:
error_message = (
f"Cannot provide both `estimator_name` and `name`. `estimator_name` "
f"is deprecated in {version} and will be removed in {version_remove}. "
)
with pytest.raises(ValueError, match=error_message):
_deprecate_estimator_name(estimator_name, name, version)

View File

@@ -0,0 +1,693 @@
import re
from pprint import PrettyPrinter
import numpy as np
import pytest
from sklearn import config_context
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.utils._pprint import _EstimatorPrettyPrinter
# Constructors excerpted to test pprinting
class LogisticRegression(BaseEstimator):
def __init__(
self,
C=1.0,
l1_ratio=0,
dual=False,
tol=1e-4,
fit_intercept=True,
intercept_scaling=1,
class_weight=None,
random_state=None,
solver="warn",
max_iter=100,
multi_class="warn",
verbose=0,
warm_start=False,
n_jobs=None,
):
self.C = C
self.l1_ratio = l1_ratio
self.dual = dual
self.tol = tol
self.fit_intercept = fit_intercept
self.intercept_scaling = intercept_scaling
self.class_weight = class_weight
self.random_state = random_state
self.solver = solver
self.max_iter = max_iter
self.multi_class = multi_class
self.verbose = verbose
self.warm_start = warm_start
self.n_jobs = n_jobs
def fit(self, X, y):
return self
class StandardScaler(TransformerMixin, BaseEstimator):
def __init__(self, copy=True, with_mean=True, with_std=True):
self.with_mean = with_mean
self.with_std = with_std
self.copy = copy
def transform(self, X, copy=None):
return self
class RFE(BaseEstimator):
def __init__(self, estimator, n_features_to_select=None, step=1, verbose=0):
self.estimator = estimator
self.n_features_to_select = n_features_to_select
self.step = step
self.verbose = verbose
class GridSearchCV(BaseEstimator):
def __init__(
self,
estimator,
param_grid,
scoring=None,
n_jobs=None,
iid="warn",
refit=True,
cv="warn",
verbose=0,
pre_dispatch="2*n_jobs",
error_score="raise-deprecating",
return_train_score=False,
):
self.estimator = estimator
self.param_grid = param_grid
self.scoring = scoring
self.n_jobs = n_jobs
self.iid = iid
self.refit = refit
self.cv = cv
self.verbose = verbose
self.pre_dispatch = pre_dispatch
self.error_score = error_score
self.return_train_score = return_train_score
class CountVectorizer(BaseEstimator):
def __init__(
self,
input="content",
encoding="utf-8",
decode_error="strict",
strip_accents=None,
lowercase=True,
preprocessor=None,
tokenizer=None,
stop_words=None,
token_pattern=r"(?u)\b\w\w+\b",
ngram_range=(1, 1),
analyzer="word",
max_df=1.0,
min_df=1,
max_features=None,
vocabulary=None,
binary=False,
dtype=np.int64,
):
self.input = input
self.encoding = encoding
self.decode_error = decode_error
self.strip_accents = strip_accents
self.preprocessor = preprocessor
self.tokenizer = tokenizer
self.analyzer = analyzer
self.lowercase = lowercase
self.token_pattern = token_pattern
self.stop_words = stop_words
self.max_df = max_df
self.min_df = min_df
self.max_features = max_features
self.ngram_range = ngram_range
self.vocabulary = vocabulary
self.binary = binary
self.dtype = dtype
class Pipeline(BaseEstimator):
def __init__(self, steps, memory=None):
self.steps = steps
self.memory = memory
class SVC(BaseEstimator):
def __init__(
self,
C=1.0,
kernel="rbf",
degree=3,
gamma="auto_deprecated",
coef0=0.0,
shrinking=True,
probability=False,
tol=1e-3,
cache_size=200,
class_weight=None,
verbose=False,
max_iter=-1,
decision_function_shape="ovr",
random_state=None,
):
self.kernel = kernel
self.degree = degree
self.gamma = gamma
self.coef0 = coef0
self.tol = tol
self.C = C
self.shrinking = shrinking
self.probability = probability
self.cache_size = cache_size
self.class_weight = class_weight
self.verbose = verbose
self.max_iter = max_iter
self.decision_function_shape = decision_function_shape
self.random_state = random_state
class PCA(BaseEstimator):
def __init__(
self,
n_components=None,
copy=True,
whiten=False,
svd_solver="auto",
tol=0.0,
iterated_power="auto",
random_state=None,
):
self.n_components = n_components
self.copy = copy
self.whiten = whiten
self.svd_solver = svd_solver
self.tol = tol
self.iterated_power = iterated_power
self.random_state = random_state
class NMF(BaseEstimator):
def __init__(
self,
n_components=None,
init=None,
solver="cd",
beta_loss="frobenius",
tol=1e-4,
max_iter=200,
random_state=None,
alpha=0.0,
l1_ratio=0.0,
verbose=0,
shuffle=False,
):
self.n_components = n_components
self.init = init
self.solver = solver
self.beta_loss = beta_loss
self.tol = tol
self.max_iter = max_iter
self.random_state = random_state
self.alpha = alpha
self.l1_ratio = l1_ratio
self.verbose = verbose
self.shuffle = shuffle
class SimpleImputer(BaseEstimator):
def __init__(
self,
missing_values=np.nan,
strategy="mean",
fill_value=None,
verbose=0,
copy=True,
):
self.missing_values = missing_values
self.strategy = strategy
self.fill_value = fill_value
self.verbose = verbose
self.copy = copy
@config_context(print_changed_only=False)
def test_basic():
# Basic pprint test
lr = LogisticRegression()
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=0, max_iter=100,
multi_class='warn', n_jobs=None, random_state=None,
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
expected = expected[1:] # remove first \n
assert lr.__repr__() == expected
def test_changed_only():
# Make sure the changed_only param is correctly used when True (default)
lr = LogisticRegression(C=99)
expected = """LogisticRegression(C=99)"""
assert lr.__repr__() == expected
# Check with a repr that doesn't fit on a single line
lr = LogisticRegression(
C=99, class_weight=0.4, fit_intercept=False, tol=1234, verbose=True
)
expected = """
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
verbose=True)"""
expected = expected[1:] # remove first \n
assert lr.__repr__() == expected
imputer = SimpleImputer(missing_values=0)
expected = """SimpleImputer(missing_values=0)"""
assert imputer.__repr__() == expected
# Defaults to np.nan, trying with float('NaN')
imputer = SimpleImputer(missing_values=float("NaN"))
expected = """SimpleImputer()"""
assert imputer.__repr__() == expected
# make sure array parameters don't throw error (see #13583)
repr(LogisticRegressionCV(Cs=np.array([0.1, 1]), use_legacy_attributes=False))
@config_context(print_changed_only=False)
def test_pipeline():
# Render a pipeline object
pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
expected = """
Pipeline(memory=None,
steps=[('standardscaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
('logisticregression',
LogisticRegression(C=999, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=0, max_iter=100,
multi_class='warn', n_jobs=None,
random_state=None, solver='warn',
tol=0.0001, verbose=0, warm_start=False))],
transform_input=None, verbose=False)"""
expected = expected[1:] # remove first \n
assert pipeline.__repr__() == expected
@config_context(print_changed_only=False)
def test_deeply_nested():
# Render a deeply nested estimator
rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
expected = """
RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0,
class_weight=None,
dual=False,
fit_intercept=True,
intercept_scaling=1,
l1_ratio=0,
max_iter=100,
multi_class='warn',
n_jobs=None,
random_state=None,
solver='warn',
tol=0.0001,
verbose=0,
warm_start=False),
n_features_to_select=None,
step=1,
verbose=0),
n_features_to_select=None,
step=1,
verbose=0),
n_features_to_select=None,
step=1, verbose=0),
n_features_to_select=None, step=1,
verbose=0),
n_features_to_select=None, step=1, verbose=0),
n_features_to_select=None, step=1, verbose=0),
n_features_to_select=None, step=1, verbose=0)"""
expected = expected[1:] # remove first \n
assert rfe.__repr__() == expected
@pytest.mark.parametrize(
("print_changed_only", "expected"),
[
(True, "RFE(estimator=RFE(...))"),
(
False,
"RFE(estimator=RFE(...), n_features_to_select=None, step=1, verbose=0)",
),
],
)
def test_print_estimator_max_depth(print_changed_only, expected):
with config_context(print_changed_only=print_changed_only):
pp = _EstimatorPrettyPrinter(depth=1)
rfe = RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))
assert pp.pformat(rfe) == expected
@config_context(print_changed_only=False)
def test_gridsearch():
# render a gridsearch
param_grid = [
{"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]},
{"kernel": ["linear"], "C": [1, 10, 100, 1000]},
]
gs = GridSearchCV(SVC(), param_grid, cv=5)
expected = """
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='warn', n_jobs=None,
param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
'kernel': ['rbf']},
{'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
assert gs.__repr__() == expected
@config_context(print_changed_only=False)
def test_gridsearch_pipeline():
# render a pipeline inside a gridsearch
pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)
pipeline = Pipeline([("reduce_dim", PCA()), ("classify", SVC())])
N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
{
"reduce_dim": [PCA(iterated_power=7), NMF()],
"reduce_dim__n_components": N_FEATURES_OPTIONS,
"classify__C": C_OPTIONS,
},
{
"reduce_dim": [SelectKBest(chi2)],
"reduce_dim__k": N_FEATURES_OPTIONS,
"classify__C": C_OPTIONS,
},
]
gspipeline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid)
expected = """
GridSearchCV(cv=3, error_score='raise-deprecating',
estimator=Pipeline(memory=None,
steps=[('reduce_dim',
PCA(copy=True, iterated_power='auto',
n_components=None,
random_state=None,
svd_solver='auto', tol=0.0,
whiten=False)),
('classify',
SVC(C=1.0, cache_size=200,
class_weight=None, coef0=0.0,
decision_function_shape='ovr',
degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1,
probability=False,
random_state=None, shrinking=True,
tol=0.001, verbose=False))]),
iid='warn', n_jobs=1,
param_grid=[{'classify__C': [1, 10, 100, 1000],
'reduce_dim': [PCA(copy=True, iterated_power=7,
n_components=None,
random_state=None,
svd_solver='auto', tol=0.0,
whiten=False),
NMF(alpha=0.0, beta_loss='frobenius',
init=None, l1_ratio=0.0,
max_iter=200, n_components=None,
random_state=None, shuffle=False,
solver='cd', tol=0.0001,
verbose=0)],
'reduce_dim__n_components': [2, 4, 8]},
{'classify__C': [1, 10, 100, 1000],
'reduce_dim': [SelectKBest(k=10,
score_func=<function chi2 at some_address>)],
'reduce_dim__k': [2, 4, 8]}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)""" # noqa: E501
expected = expected[1:] # remove first \n
repr_ = pp.pformat(gspipeline)
# Remove address of '<function chi2 at 0x.....>' for reproducibility
repr_ = re.sub("function chi2 at 0x.*>", "function chi2 at some_address>", repr_)
assert repr_ == expected
@config_context(print_changed_only=False)
def test_n_max_elements_to_show():
n_max_elements_to_show = 30
pp = _EstimatorPrettyPrinter(
compact=True,
indent=1,
indent_at_name=True,
n_max_elements_to_show=n_max_elements_to_show,
)
# No ellipsis
vocabulary = {i: i for i in range(n_max_elements_to_show)}
vectorizer = CountVectorizer(vocabulary=vocabulary)
expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None,
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
27: 27, 28: 28, 29: 29})"""
expected = expected[1:] # remove first \n
assert pp.pformat(vectorizer) == expected
# Now with ellipsis
vocabulary = {i: i for i in range(n_max_elements_to_show + 1)}
vectorizer = CountVectorizer(vocabulary=vocabulary)
expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None,
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
27: 27, 28: 28, 29: 29, ...})"""
expected = expected[1:] # remove first \n
assert pp.pformat(vectorizer) == expected
# Also test with lists
param_grid = {"C": list(range(n_max_elements_to_show))}
gs = GridSearchCV(SVC(), param_grid)
expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='warn', n_jobs=None,
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
assert pp.pformat(gs) == expected
# Now with ellipsis
param_grid = {"C": list(range(n_max_elements_to_show + 1))}
gs = GridSearchCV(SVC(), param_grid)
expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='warn', n_jobs=None,
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, ...]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
assert pp.pformat(gs) == expected
@config_context(print_changed_only=False)
def test_bruteforce_ellipsis():
# Check that the bruteforce ellipsis (used when the number of non-blank
# characters exceeds N_CHAR_MAX) renders correctly.
lr = LogisticRegression()
# test when the left and right side of the ellipsis aren't on the same
# line.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
in...
multi_class='warn', n_jobs=None, random_state=None,
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
expected = expected[1:] # remove first \n
assert lr.__repr__(N_CHAR_MAX=150) == expected
# test with very small N_CHAR_MAX
    # Note that N_CHAR_MAX is not strictly enforced; this is expected: to avoid
    # weird reprs we still keep the whole line of the right part (after the
    # ellipsis).
expected = """
Lo...
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
expected = expected[1:] # remove first \n
assert lr.__repr__(N_CHAR_MAX=4) == expected
# test with N_CHAR_MAX == number of non-blank characters: In this case we
# don't want ellipsis
full_repr = lr.__repr__(N_CHAR_MAX=float("inf"))
n_nonblank = len("".join(full_repr.split()))
assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
assert "..." not in full_repr
# test with N_CHAR_MAX == number of non-blank characters - 10: the left and
    # right side of the ellipsis are on different lines. In this case we
    # want to expand the whole line of the right side
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=0,...00,
multi_class='warn', n_jobs=None, random_state=None,
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
expected = expected[1:] # remove first \n
assert lr.__repr__(N_CHAR_MAX=n_nonblank - 10) == expected
    # test with N_CHAR_MAX == number of non-blank characters - 4: the left and
    # right side of the ellipsis are on the same line. In this case we don't
    # want to expand the whole line of the right side, just add the ellipsis
    # between the 2 sides.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=0, max...r=100,
multi_class='warn', n_jobs=None, random_state=None,
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
expected = expected[1:] # remove first \n
assert lr.__repr__(N_CHAR_MAX=n_nonblank - 4) == expected
# test with N_CHAR_MAX == number of non-blank characters - 2: the left and
    # right side of the ellipsis are on the same line, but adding the ellipsis
# would actually make the repr longer. So we don't add the ellipsis.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=0, max_iter=100,
multi_class='warn', n_jobs=None, random_state=None,
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
expected = expected[1:] # remove first \n
assert lr.__repr__(N_CHAR_MAX=n_nonblank - 2) == expected
def test_builtin_prettyprinter():
    # Non-regression test that ensures we can still use the builtin
    # PrettyPrinter class for estimators (as done e.g. by joblib).
    # This used to be a bug.
PrettyPrinter().pprint(LogisticRegression())
def test_kwargs_in_init():
# Make sure the changed_only=True mode is OK when an argument is passed as
# kwargs.
# Non-regression test for
# https://github.com/scikit-learn/scikit-learn/issues/17206
class WithKWargs(BaseEstimator):
# Estimator with a kwargs argument. These need to hack around
# set_params and get_params. Here we mimic what LightGBM does.
def __init__(self, a="willchange", b="unchanged", **kwargs):
self.a = a
self.b = b
self._other_params = {}
self.set_params(**kwargs)
def get_params(self, deep=True):
params = super().get_params(deep=deep)
params.update(self._other_params)
return params
def set_params(self, **params):
for key, value in params.items():
setattr(self, key, value)
self._other_params[key] = value
return self
est = WithKWargs(a="something", c="abcd", d=None)
expected = "WithKWargs(a='something', c='abcd', d=None)"
assert est.__repr__() == expected
with config_context(print_changed_only=False):
expected = "WithKWargs(a='something', b='unchanged', c='abcd', d=None)"
assert est.__repr__() == expected
def test_complexity_print_changed_only():
    # Make sure `__repr__` is called the same number of times
# whether `print_changed_only` is True or False
# Non-regression test for
# https://github.com/scikit-learn/scikit-learn/issues/18490
class DummyEstimator(TransformerMixin, BaseEstimator):
nb_times_repr_called = 0
def __init__(self, estimator=None):
self.estimator = estimator
def __repr__(self):
DummyEstimator.nb_times_repr_called += 1
return super().__repr__()
def transform(self, X, copy=None): # pragma: no cover
return X
estimator = DummyEstimator(
make_pipeline(DummyEstimator(DummyEstimator()), DummyEstimator(), "passthrough")
)
with config_context(print_changed_only=False):
repr(estimator)
nb_repr_print_changed_only_false = DummyEstimator.nb_times_repr_called
DummyEstimator.nb_times_repr_called = 0
with config_context(print_changed_only=True):
repr(estimator)
nb_repr_print_changed_only_true = DummyEstimator.nb_times_repr_called
assert nb_repr_print_changed_only_false == nb_repr_print_changed_only_true

View File

@@ -0,0 +1,192 @@
import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_array_almost_equal
from scipy.special import comb
from sklearn.utils._random import _our_rand_r_py
from sklearn.utils.random import _random_choice_csc, sample_without_replacement
###############################################################################
# test custom sampling without replacement algorithm
###############################################################################
def test_invalid_sample_without_replacement_algorithm():
with pytest.raises(ValueError):
sample_without_replacement(5, 4, "unknown")
def test_sample_without_replacement_algorithms():
methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
for m in methods:
def sample_without_replacement_method(
n_population, n_samples, random_state=None
):
return sample_without_replacement(
n_population, n_samples, method=m, random_state=random_state
)
check_edge_case_of_sample_int(sample_without_replacement_method)
check_sample_int(sample_without_replacement_method)
check_sample_int_distribution(sample_without_replacement_method)
def check_edge_case_of_sample_int(sample_without_replacement):
    # n_population < n_samples
with pytest.raises(ValueError):
sample_without_replacement(0, 1)
with pytest.raises(ValueError):
sample_without_replacement(1, 2)
# n_population == n_samples
assert sample_without_replacement(0, 0).shape == (0,)
assert sample_without_replacement(1, 1).shape == (1,)
# n_population >= n_samples
assert sample_without_replacement(5, 0).shape == (0,)
assert sample_without_replacement(5, 1).shape == (1,)
# n_population < 0 or n_samples < 0
with pytest.raises(ValueError):
sample_without_replacement(-1, 5)
with pytest.raises(ValueError):
sample_without_replacement(5, -1)
def check_sample_int(sample_without_replacement):
    # This test is heavily inspired by test_random.py of python-core.
#
# For the entire allowable range of 0 <= k <= N, validate that
# the sample is of the correct length and contains only unique items
n_population = 100
for n_samples in range(n_population + 1):
s = sample_without_replacement(n_population, n_samples)
assert len(s) == n_samples
unique = np.unique(s)
assert np.size(unique) == n_samples
assert np.all(unique < n_population)
# test edge case n_population == n_samples == 0
assert np.size(sample_without_replacement(0, 0)) == 0
def check_sample_int_distribution(sample_without_replacement):
    # This test is heavily inspired by test_random.py of python-core.
#
# For the entire allowable range of 0 <= k <= N, validate that
# sample generates all possible permutations
n_population = 10
# a large number of trials prevents false negatives without slowing normal
# case
n_trials = 10000
for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
n_expected = comb(n_population, n_samples, exact=True)
output = {}
for i in range(n_trials):
output[frozenset(sample_without_replacement(n_population, n_samples))] = (
None
)
if len(output) == n_expected:
break
else:
raise AssertionError(
"number of combinations != number of expected (%s != %s)"
% (len(output), n_expected)
)
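# Small worked illustration (added for exposition, not part of the original test
# suite) of the combinatorial bound used in check_sample_int_distribution above:
# with n_population=10 and n_samples=3 there are C(10, 3) = 120 distinct
# subsets, so at most 120 unique frozensets can ever be observed in that loop.
def test_comb_bound_illustration():
    assert comb(10, 3, exact=True) == 120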
def test_random_choice_csc(n_samples=10000, random_state=24):
# Explicit class probabilities
classes = [np.array([0, 1]), np.array([0, 1, 2])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
got = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got[:, [k]].toarray().ravel()) / float(n_samples)
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
# Implicit class probabilities
classes = [[0, 1], [1, 2]] # test for array-like support
class_probabilities = [np.array([0.5, 0.5]), np.array([0, 1 / 2, 1 / 2])]
got = _random_choice_csc(
n_samples=n_samples, classes=classes, random_state=random_state
)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got[:, [k]].toarray().ravel()) / float(n_samples)
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
# Edge case probabilities 1.0 and 0.0
classes = [np.array([0, 1]), np.array([0, 1, 2])]
class_probabilities = [np.array([0.0, 1.0]), np.array([0.0, 1.0, 0.0])]
got = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
assert sp.issparse(got)
for k in range(len(classes)):
p = (
np.bincount(
got[:, [k]].toarray().ravel(), minlength=len(class_probabilities[k])
)
/ n_samples
)
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
# One class target data
classes = [[1], [0]] # test for array-like support
class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]
got = _random_choice_csc(
n_samples=n_samples, classes=classes, random_state=random_state
)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got[:, [k]].toarray().ravel()) / n_samples
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
def test_random_choice_csc_errors():
# the length of an array in classes and class_probabilities is mismatched
classes = [np.array([0, 1]), np.array([0, 1, 2, 3])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
# the class dtype is not supported
classes = [np.array(["a", "1"]), np.array(["z", "1", "2"])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
# the class dtype is not supported
classes = [np.array([4.2, 0.1]), np.array([0.1, 0.2, 9.4])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
# Given probabilities don't sum to 1
classes = [np.array([0, 1]), np.array([0, 1, 2])]
class_probabilities = [np.array([0.5, 0.6]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
def test_our_rand_r():
assert 131541053 == _our_rand_r_py(1273642419)
assert 270369 == _our_rand_r_py(0)

View File

@@ -0,0 +1,396 @@
import warnings
import numpy as np
import pytest
from sklearn.base import clone
from sklearn.datasets import (
load_iris,
make_classification,
make_multilabel_classification,
make_regression,
)
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import (
LinearRegression,
LogisticRegression,
)
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._mocking import _MockEstimatorOnOffPrediction
from sklearn.utils._response import _get_response_values, _get_response_values_binary
from sklearn.utils._testing import assert_allclose, assert_array_equal
X, y = load_iris(return_X_y=True)
# scale the data to avoid ConvergenceWarning with LogisticRegression
X = scale(X, copy=False)
X_binary, y_binary = X[:100], y[:100]
@pytest.mark.parametrize(
"response_method", ["decision_function", "predict_proba", "predict_log_proba"]
)
def test_get_response_values_regressor_error(response_method):
"""Check the error message with regressor an not supported response
method."""
my_estimator = _MockEstimatorOnOffPrediction(response_methods=[response_method])
X = "mocking_data", "mocking_target"
err_msg = f"{my_estimator.__class__.__name__} should either be a classifier"
with pytest.raises(ValueError, match=err_msg):
_get_response_values(my_estimator, X, response_method=response_method)
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_values_regressor(return_response_method_used):
"""Check the behaviour of `_get_response_values` with regressor."""
X, y = make_regression(n_samples=10, random_state=0)
regressor = LinearRegression().fit(X, y)
results = _get_response_values(
regressor,
X,
response_method="predict",
return_response_method_used=return_response_method_used,
)
assert_array_equal(results[0], regressor.predict(X))
assert results[1] is None
if return_response_method_used:
assert results[2] == "predict"
@pytest.mark.parametrize(
"response_method",
["predict", "decision_function", ["decision_function", "predict"]],
)
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_values_outlier_detection(
response_method, return_response_method_used
):
"""Check the behaviour of `_get_response_values` with outlier detector."""
X, y = make_classification(n_samples=50, random_state=0)
outlier_detector = IsolationForest(random_state=0).fit(X, y)
results = _get_response_values(
outlier_detector,
X,
response_method=response_method,
return_response_method_used=return_response_method_used,
)
chosen_response_method = (
response_method[0] if isinstance(response_method, list) else response_method
)
prediction_method = getattr(outlier_detector, chosen_response_method)
assert_array_equal(results[0], prediction_method(X))
assert results[1] is None
if return_response_method_used:
assert results[2] == chosen_response_method
@pytest.mark.parametrize(
"response_method",
["predict_proba", "decision_function", "predict", "predict_log_proba"],
)
def test_get_response_values_classifier_unknown_pos_label(response_method):
"""Check that `_get_response_values` raises the proper error message with
classifier."""
X, y = make_classification(n_samples=10, n_classes=2, random_state=0)
classifier = LogisticRegression().fit(X, y)
# provide a `pos_label` which is not in `y`
err_msg = r"pos_label=whatever is not a valid label: It should be one of \[0 1\]"
with pytest.raises(ValueError, match=err_msg):
_get_response_values(
classifier,
X,
response_method=response_method,
pos_label="whatever",
)
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
def test_get_response_values_classifier_inconsistent_y_pred_for_binary_proba(
response_method,
):
"""Check that `_get_response_values` will raise an error when `y_pred` has a
single class with `predict_proba`."""
X, y_two_class = make_classification(n_samples=10, n_classes=2, random_state=0)
y_single_class = np.zeros_like(y_two_class)
classifier = DecisionTreeClassifier().fit(X, y_single_class)
err_msg = (
r"Got predict_proba of shape \(10, 1\), but need classifier with "
r"two classes"
)
with pytest.raises(ValueError, match=err_msg):
_get_response_values(classifier, X, response_method=response_method)
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_values_binary_classifier_decision_function(
return_response_method_used,
):
"""Check the behaviour of `_get_response_values` with `decision_function`
and binary classifier."""
X, y = make_classification(
n_samples=10,
n_classes=2,
weights=[0.3, 0.7],
random_state=0,
)
classifier = LogisticRegression().fit(X, y)
response_method = "decision_function"
# default `pos_label`
results = _get_response_values(
classifier,
X,
response_method=response_method,
pos_label=None,
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.decision_function(X))
assert results[1] == 1
if return_response_method_used:
assert results[2] == "decision_function"
# when forcing `pos_label=classifier.classes_[0]`
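    # Note: the `decision_function` scores are oriented towards `classes_[1]`,
    # so selecting `classes_[0]` as the positive class is expected to flip
    # their sign (hence the `* -1` below).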
results = _get_response_values(
classifier,
X,
response_method=response_method,
pos_label=classifier.classes_[0],
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.decision_function(X) * -1)
assert results[1] == 0
if return_response_method_used:
assert results[2] == "decision_function"
@pytest.mark.parametrize("return_response_method_used", [True, False])
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
def test_get_response_values_binary_classifier_predict_proba(
return_response_method_used, response_method
):
"""Check that `_get_response_values` with `predict_proba` and binary
classifier."""
X, y = make_classification(
n_samples=10,
n_classes=2,
weights=[0.3, 0.7],
random_state=0,
)
classifier = LogisticRegression().fit(X, y)
# default `pos_label`
results = _get_response_values(
classifier,
X,
response_method=response_method,
pos_label=None,
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], getattr(classifier, response_method)(X)[:, 1])
assert results[1] == 1
if return_response_method_used:
assert len(results) == 3
assert results[2] == response_method
else:
assert len(results) == 2
# when forcing `pos_label=classifier.classes_[0]`
y_pred, pos_label, *_ = _get_response_values(
classifier,
X,
response_method=response_method,
pos_label=classifier.classes_[0],
return_response_method_used=return_response_method_used,
)
assert_allclose(y_pred, getattr(classifier, response_method)(X)[:, 0])
assert pos_label == 0
@pytest.mark.parametrize(
"estimator, X, y, err_msg, params",
[
(
DecisionTreeRegressor(),
X_binary,
y_binary,
"Expected 'estimator' to be a binary classifier",
{"response_method": "auto"},
),
(
DecisionTreeClassifier(),
X_binary,
y_binary,
r"pos_label=unknown is not a valid label: It should be one of \[0 1\]",
{"response_method": "auto", "pos_label": "unknown"},
),
(
DecisionTreeClassifier(),
X,
y,
"be a binary classifier. Got 3 classes instead.",
{"response_method": "predict_proba"},
),
],
)
def test_get_response_error(estimator, X, y, err_msg, params):
"""Check that we raise the proper error messages in _get_response_values_binary."""
estimator = clone(estimator).fit(X, y) # clone to make test execution thread-safe
with pytest.raises(ValueError, match=err_msg):
_get_response_values_binary(estimator, X, **params)
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_predict_proba(return_response_method_used):
"""Check the behaviour of `_get_response_values_binary` using `predict_proba`."""
classifier = DecisionTreeClassifier().fit(X_binary, y_binary)
results = _get_response_values_binary(
classifier,
X_binary,
response_method="predict_proba",
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 1])
assert results[1] == 1
if return_response_method_used:
assert results[2] == "predict_proba"
results = _get_response_values_binary(
classifier,
X_binary,
response_method="predict_proba",
pos_label=0,
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 0])
assert results[1] == 0
if return_response_method_used:
assert results[2] == "predict_proba"
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_decision_function(return_response_method_used):
"""Check the behaviour of `_get_response_values_binary` using decision_function."""
classifier = LogisticRegression().fit(X_binary, y_binary)
results = _get_response_values_binary(
classifier,
X_binary,
response_method="decision_function",
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.decision_function(X_binary))
assert results[1] == 1
if return_response_method_used:
assert results[2] == "decision_function"
results = _get_response_values_binary(
classifier,
X_binary,
response_method="decision_function",
pos_label=0,
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.decision_function(X_binary) * -1)
assert results[1] == 0
if return_response_method_used:
assert results[2] == "decision_function"
@pytest.mark.parametrize(
"estimator, response_method",
[
(DecisionTreeClassifier(max_depth=2, random_state=0), "predict_proba"),
(DecisionTreeClassifier(max_depth=2, random_state=0), "predict_log_proba"),
(LogisticRegression(), "decision_function"),
],
)
def test_get_response_values_multiclass(estimator, response_method):
"""Check that we can call `_get_response_values` with a multiclass estimator.
It should return the predictions untouched.
"""
estimator = clone(estimator)
estimator.fit(X, y)
predictions, pos_label = _get_response_values(
estimator, X, response_method=response_method
)
assert pos_label is None
assert predictions.shape == (X.shape[0], len(estimator.classes_))
if response_method == "predict_proba":
assert np.logical_and(predictions >= 0, predictions <= 1).all()
elif response_method == "predict_log_proba":
assert (predictions <= 0.0).all()
def test_get_response_values_with_response_list():
"""Check the behaviour of passing a list of responses to `_get_response_values`."""
classifier = LogisticRegression().fit(X_binary, y_binary)
# it should use `predict_proba`
y_pred, pos_label, response_method = _get_response_values(
classifier,
X_binary,
response_method=["predict_proba", "decision_function"],
return_response_method_used=True,
)
assert_allclose(y_pred, classifier.predict_proba(X_binary)[:, 1])
assert pos_label == 1
assert response_method == "predict_proba"
# it should use `decision_function`
y_pred, pos_label, response_method = _get_response_values(
classifier,
X_binary,
response_method=["decision_function", "predict_proba"],
return_response_method_used=True,
)
assert_allclose(y_pred, classifier.decision_function(X_binary))
assert pos_label == 1
assert response_method == "decision_function"
@pytest.mark.parametrize(
"response_method", ["predict_proba", "decision_function", "predict"]
)
def test_get_response_values_multilabel_indicator(response_method):
X, Y = make_multilabel_classification(random_state=0)
estimator = ClassifierChain(LogisticRegression()).fit(X, Y)
y_pred, pos_label = _get_response_values(
estimator, X, response_method=response_method
)
assert pos_label is None
assert y_pred.shape == Y.shape
if response_method == "predict_proba":
assert np.logical_and(y_pred >= 0, y_pred <= 1).all()
elif response_method == "decision_function":
# values returned by `decision_function` are not bounded in [0, 1]
assert (y_pred < 0).sum() > 0
assert (y_pred > 1).sum() > 0
else: # response_method == "predict"
assert np.logical_or(y_pred == 0, y_pred == 1).all()
def test_response_values_type_of_target_on_classes_no_warning():
"""
    Ensure `_get_response_values` doesn't raise a spurious warning: the
    "The number of unique classes is greater than 50% of samples"
    warning should not be raised when calling `type_of_target(classes_)`.
Non-regression test for issue #31583.
"""
X = np.random.RandomState(0).randn(120, 3)
# 30 classes, less than 50% of number of samples
y = np.repeat(np.arange(30), 4)
clf = LogisticRegression().fit(X, y)
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
_get_response_values(clf, X, response_method="predict_proba")

View File

@@ -0,0 +1,185 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from functools import partial
from itertools import product
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn.datasets import load_iris
from sklearn.utils._seq_dataset import (
ArrayDataset32,
ArrayDataset64,
CSRDataset32,
CSRDataset64,
)
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSR_CONTAINERS
iris = load_iris()
X64 = iris.data.astype(np.float64)
y64 = iris.target.astype(np.float64)
sample_weight64 = np.arange(y64.size, dtype=np.float64)
X32 = iris.data.astype(np.float32)
y32 = iris.target.astype(np.float32)
sample_weight32 = np.arange(y32.size, dtype=np.float32)
floating = [np.float32, np.float64]
def assert_csr_equal_values(current, expected):
current.eliminate_zeros()
expected.eliminate_zeros()
expected = expected.astype(current.dtype)
assert current.shape[0] == expected.shape[0]
assert current.shape[1] == expected.shape[1]
assert_array_equal(current.data, expected.data)
assert_array_equal(current.indices, expected.indices)
assert_array_equal(current.indptr, expected.indptr)
def _make_dense_dataset(float_dtype):
if float_dtype == np.float32:
return ArrayDataset32(X32, y32, sample_weight32, seed=42)
return ArrayDataset64(X64, y64, sample_weight64, seed=42)
def _make_sparse_dataset(csr_container, float_dtype):
if float_dtype == np.float32:
X, y, sample_weight, csr_dataset = X32, y32, sample_weight32, CSRDataset32
else:
X, y, sample_weight, csr_dataset = X64, y64, sample_weight64, CSRDataset64
X = csr_container(X)
return csr_dataset(X.data, X.indptr, X.indices, y, sample_weight, seed=42)
def _dense_dataset_factories():
return [partial(_make_dense_dataset, float_dtype) for float_dtype in floating]
def _sparse_dataset_factories():
return [
partial(_make_sparse_dataset, csr_container, float_dtype)
for csr_container, float_dtype in product(CSR_CONTAINERS, floating)
]
def _fused_types_dataset_factories():
all_factories = _dense_dataset_factories() + _sparse_dataset_factories()
# group dataset by array types to get a tuple (float32, float64)
return [all_factories[idx : idx + 2] for idx in range(0, len(all_factories), 2)]
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize(
"dataset_factory", _dense_dataset_factories() + _sparse_dataset_factories()
)
def test_seq_dataset_basic_iteration(dataset_factory, csr_container):
NUMBER_OF_RUNS = 5
X_csr64 = csr_container(X64)
dataset = dataset_factory()
for _ in range(NUMBER_OF_RUNS):
# next sample
xi_, yi, swi, idx = dataset._next_py()
xi = csr_container(xi_, shape=(1, X64.shape[1]))
assert_csr_equal_values(xi, X_csr64[[idx]])
assert yi == y64[idx]
assert swi == sample_weight64[idx]
# random sample
xi_, yi, swi, idx = dataset._random_py()
xi = csr_container(xi_, shape=(1, X64.shape[1]))
assert_csr_equal_values(xi, X_csr64[[idx]])
assert yi == y64[idx]
assert swi == sample_weight64[idx]
@pytest.mark.parametrize(
"float_dtype, csr_container", product(floating, CSR_CONTAINERS)
)
def test_seq_dataset_shuffle(float_dtype, csr_container):
dense_dataset = _make_dense_dataset(float_dtype)
sparse_dataset = _make_sparse_dataset(csr_container, float_dtype)
# not shuffled
for i in range(5):
_, _, _, idx1 = dense_dataset._next_py()
_, _, _, idx2 = sparse_dataset._next_py()
assert idx1 == i
assert idx2 == i
for i in [132, 50, 9, 18, 58]:
_, _, _, idx1 = dense_dataset._random_py()
_, _, _, idx2 = sparse_dataset._random_py()
assert idx1 == i
assert idx2 == i
seed = 77
dense_dataset._shuffle_py(seed)
sparse_dataset._shuffle_py(seed)
idx_next = [63, 91, 148, 87, 29]
idx_shuffle = [137, 125, 56, 121, 127]
for i, j in zip(idx_next, idx_shuffle):
_, _, _, idx1 = dense_dataset._next_py()
_, _, _, idx2 = sparse_dataset._next_py()
assert idx1 == i
assert idx2 == i
_, _, _, idx1 = dense_dataset._random_py()
_, _, _, idx2 = sparse_dataset._random_py()
assert idx1 == j
assert idx2 == j
@pytest.mark.parametrize(
"dataset_32_factory, dataset_64_factory", _fused_types_dataset_factories()
)
def test_fused_types_consistency(dataset_32_factory, dataset_64_factory):
dataset_32, dataset_64 = dataset_32_factory(), dataset_64_factory()
NUMBER_OF_RUNS = 5
for _ in range(NUMBER_OF_RUNS):
# next sample
(xi_data32, _, _), yi32, _, _ = dataset_32._next_py()
(xi_data64, _, _), yi64, _, _ = dataset_64._next_py()
assert xi_data32.dtype == np.float32
assert xi_data64.dtype == np.float64
assert_allclose(xi_data64, xi_data32, rtol=1e-5)
assert_allclose(yi64, yi32, rtol=1e-5)
def test_buffer_dtype_mismatch_error():
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
ArrayDataset64(X32, y32, sample_weight32, seed=42)
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
ArrayDataset32(X64, y64, sample_weight64, seed=42)
for csr_container in CSR_CONTAINERS:
X_csr32 = csr_container(X32)
X_csr64 = csr_container(X64)
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
CSRDataset64(
X_csr32.data,
X_csr32.indptr,
X_csr32.indices,
y32,
sample_weight32,
seed=42,
)
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
CSRDataset32(
X_csr64.data,
X_csr64.indptr,
X_csr64.indices,
y64,
sample_weight64,
seed=42,
)

View File

@@ -0,0 +1,471 @@
import importlib
from collections import namedtuple
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn._config import config_context, get_config
from sklearn.preprocessing import StandardScaler
from sklearn.utils._set_output import (
ADAPTERS_MANAGER,
ContainerAdapterProtocol,
_get_adapter_from_container,
_get_output_config,
_safe_set_output,
_SetOutputMixin,
_wrap_data_with_container,
check_library_installed,
)
from sklearn.utils.fixes import CSR_CONTAINERS
def test_pandas_adapter():
"""Check pandas adapter has expected behavior."""
pd = pytest.importorskip("pandas")
X_np = np.asarray([[1, 0, 3], [0, 0, 1]])
columns = np.asarray(["f0", "f1", "f2"], dtype=object)
index = np.asarray([1, 2])
X_df_orig = pd.DataFrame([[1, 2], [1, 3]], index=index)
X_ser_orig = pd.Series([2, 3], index=index)
adapter = ADAPTERS_MANAGER.adapters["pandas"]
X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns)
assert isinstance(X_container, pd.DataFrame)
assert_array_equal(X_container.columns, columns)
assert_array_equal(X_container.index, index)
# use original index when the original is a series
X_container = adapter.create_container(X_np, X_ser_orig, columns=lambda: columns)
assert isinstance(X_container, pd.DataFrame)
assert_array_equal(X_container.columns, columns)
assert_array_equal(X_container.index, index)
# Input dataframe's index does not change
new_columns = np.asarray(["f0", "f1"], dtype=object)
X_df = pd.DataFrame([[1, 2], [1, 3]], index=[10, 12])
new_df = adapter.create_container(X_df, X_df_orig, columns=new_columns)
assert_array_equal(new_df.columns, new_columns)
assert_array_equal(new_df.index, X_df.index)
assert adapter.is_supported_container(X_df)
assert not adapter.is_supported_container(X_np)
    # adapter.rename_columns renames the columns
new_columns = np.array(["a", "c"], dtype=object)
new_df = adapter.rename_columns(X_df, new_columns)
assert_array_equal(new_df.columns, new_columns)
# adapter.hstack stacks the dataframes horizontally.
X_df_1 = pd.DataFrame([[1, 2, 5], [3, 4, 6]], columns=["a", "b", "e"])
X_df_2 = pd.DataFrame([[4], [5]], columns=["c"])
X_stacked = adapter.hstack([X_df_1, X_df_2])
expected_df = pd.DataFrame(
[[1, 2, 5, 4], [3, 4, 6, 5]], columns=["a", "b", "e", "c"]
)
pd.testing.assert_frame_equal(X_stacked, expected_df)
    # check that we properly update the columns even with duplicate column names;
    # this use-case can happen when using ColumnTransformer
# non-regression test for gh-28260
X_df = pd.DataFrame([[1, 2], [1, 3]], columns=["a", "a"])
new_columns = np.array(["x__a", "y__a"], dtype=object)
new_df = adapter.rename_columns(X_df, new_columns)
assert_array_equal(new_df.columns, new_columns)
# check the behavior of the inplace parameter in `create_container`
# we should trigger a copy
X_df = pd.DataFrame([[1, 2], [1, 3]], index=index)
X_output = adapter.create_container(X_df, X_df, columns=["a", "b"], inplace=False)
assert X_output is not X_df
assert list(X_df.columns) == [0, 1]
assert list(X_output.columns) == ["a", "b"]
# the operation is inplace
X_df = pd.DataFrame([[1, 2], [1, 3]], index=index)
X_output = adapter.create_container(X_df, X_df, columns=["a", "b"], inplace=True)
assert X_output is X_df
assert list(X_df.columns) == ["a", "b"]
assert list(X_output.columns) == ["a", "b"]
def test_polars_adapter():
"""Check Polars adapter has expected behavior."""
pl = pytest.importorskip("polars")
X_np = np.array([[1, 0, 3], [0, 0, 1]])
columns = ["f1", "f2", "f3"]
X_df_orig = pl.DataFrame(X_np, schema=columns, orient="row")
adapter = ADAPTERS_MANAGER.adapters["polars"]
X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns)
assert isinstance(X_container, pl.DataFrame)
assert_array_equal(X_container.columns, columns)
# Update columns with create_container
new_columns = np.asarray(["a", "b", "c"], dtype=object)
new_df = adapter.create_container(X_df_orig, X_df_orig, columns=new_columns)
assert_array_equal(new_df.columns, new_columns)
assert adapter.is_supported_container(X_df_orig)
assert not adapter.is_supported_container(X_np)
    # adapter.rename_columns renames the columns
new_columns = np.array(["a", "c", "g"], dtype=object)
new_df = adapter.rename_columns(X_df_orig, new_columns)
assert_array_equal(new_df.columns, new_columns)
# adapter.hstack stacks the dataframes horizontally.
X_df_1 = pl.DataFrame([[1, 2, 5], [3, 4, 6]], schema=["a", "b", "e"], orient="row")
X_df_2 = pl.DataFrame([[4], [5]], schema=["c"], orient="row")
X_stacked = adapter.hstack([X_df_1, X_df_2])
expected_df = pl.DataFrame(
[[1, 2, 5, 4], [3, 4, 6, 5]], schema=["a", "b", "e", "c"], orient="row"
)
from polars.testing import assert_frame_equal
assert_frame_equal(X_stacked, expected_df)
# check the behavior of the inplace parameter in `create_container`
# we should trigger a copy
X_df = pl.DataFrame([[1, 2], [1, 3]], schema=["a", "b"], orient="row")
X_output = adapter.create_container(X_df, X_df, columns=["c", "d"], inplace=False)
assert X_output is not X_df
assert list(X_df.columns) == ["a", "b"]
assert list(X_output.columns) == ["c", "d"]
# the operation is inplace
X_df = pl.DataFrame([[1, 2], [1, 3]], schema=["a", "b"], orient="row")
X_output = adapter.create_container(X_df, X_df, columns=["c", "d"], inplace=True)
assert X_output is X_df
assert list(X_df.columns) == ["c", "d"]
assert list(X_output.columns) == ["c", "d"]
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test__container_error_validation(csr_container):
"""Check errors in _wrap_data_with_container."""
X = np.asarray([[1, 0, 3], [0, 0, 1]])
X_csr = csr_container(X)
match = "The transformer outputs a scipy sparse matrix."
with config_context(transform_output="pandas"):
with pytest.raises(ValueError, match=match):
_wrap_data_with_container("transform", X_csr, X, StandardScaler())
class EstimatorWithoutSetOutputAndWithoutTransform:
pass
class EstimatorNoSetOutputWithTransform:
def transform(self, X, y=None):
return X # pragma: no cover
class EstimatorWithSetOutput(_SetOutputMixin):
def fit(self, X, y=None):
self.n_features_in_ = X.shape[1]
return self
def transform(self, X, y=None):
return X
def get_feature_names_out(self, input_features=None):
return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)
def test__safe_set_output():
"""Check _safe_set_output works as expected."""
# Estimator without transform will not raise when setting set_output for transform.
est = EstimatorWithoutSetOutputAndWithoutTransform()
_safe_set_output(est, transform="pandas")
# Estimator with transform but without set_output will raise
est = EstimatorNoSetOutputWithTransform()
with pytest.raises(ValueError, match="Unable to configure output"):
_safe_set_output(est, transform="pandas")
est = EstimatorWithSetOutput().fit(np.asarray([[1, 2, 3]]))
_safe_set_output(est, transform="pandas")
config = _get_output_config("transform", est)
assert config["dense"] == "pandas"
_safe_set_output(est, transform="default")
config = _get_output_config("transform", est)
assert config["dense"] == "default"
    # transform=None is a no-op, so the config remains "default"
_safe_set_output(est, transform=None)
config = _get_output_config("transform", est)
assert config["dense"] == "default"
class EstimatorNoSetOutputWithTransformNoFeatureNamesOut(_SetOutputMixin):
def transform(self, X, y=None):
return X # pragma: no cover
def test_set_output_mixin():
"""Estimator without get_feature_names_out does not define `set_output`."""
est = EstimatorNoSetOutputWithTransformNoFeatureNamesOut()
assert not hasattr(est, "set_output")
def test__safe_set_output_error():
"""Check transform with invalid config."""
X = np.asarray([[1, 0, 3], [0, 0, 1]])
est = EstimatorWithSetOutput()
_safe_set_output(est, transform="bad")
msg = "output config must be in"
with pytest.raises(ValueError, match=msg):
est.transform(X)
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_set_output_method(dataframe_lib):
"""Check that the output is a dataframe."""
lib = pytest.importorskip(dataframe_lib)
X = np.asarray([[1, 0, 3], [0, 0, 1]])
est = EstimatorWithSetOutput().fit(X)
# transform=None is a no-op
est2 = est.set_output(transform=None)
assert est2 is est
X_trans_np = est2.transform(X)
assert isinstance(X_trans_np, np.ndarray)
est.set_output(transform=dataframe_lib)
X_trans_pd = est.transform(X)
assert isinstance(X_trans_pd, lib.DataFrame)
def test_set_output_method_error():
"""Check transform fails with invalid transform."""
X = np.asarray([[1, 0, 3], [0, 0, 1]])
est = EstimatorWithSetOutput().fit(X)
est.set_output(transform="bad")
msg = "output config must be in"
with pytest.raises(ValueError, match=msg):
est.transform(X)
@pytest.mark.parametrize("transform_output", ["pandas", "polars"])
def test__get_output_config(transform_output):
"""Check _get_output_config works as expected."""
# Without a configuration set, the global config is used
global_config = get_config()["transform_output"]
config = _get_output_config("transform")
assert config["dense"] == global_config
with config_context(transform_output=transform_output):
# with estimator=None, the global config is used
config = _get_output_config("transform")
assert config["dense"] == transform_output
est = EstimatorNoSetOutputWithTransform()
config = _get_output_config("transform", est)
assert config["dense"] == transform_output
est = EstimatorWithSetOutput()
        # If estimator has no config, use global config
config = _get_output_config("transform", est)
assert config["dense"] == transform_output
# If estimator has a config, use local config
est.set_output(transform="default")
config = _get_output_config("transform", est)
assert config["dense"] == "default"
est.set_output(transform=transform_output)
config = _get_output_config("transform", est)
assert config["dense"] == transform_output
class EstimatorWithSetOutputNoAutoWrap(_SetOutputMixin, auto_wrap_output_keys=None):
def transform(self, X, y=None):
return X
def test_get_output_auto_wrap_false():
"""Check that auto_wrap_output_keys=None does not wrap."""
est = EstimatorWithSetOutputNoAutoWrap()
assert not hasattr(est, "set_output")
X = np.asarray([[1, 0, 3], [0, 0, 1]])
assert X is est.transform(X)
def test_auto_wrap_output_keys_errors_with_incorrect_input():
msg = "auto_wrap_output_keys must be None or a tuple of keys."
with pytest.raises(ValueError, match=msg):
class BadEstimator(_SetOutputMixin, auto_wrap_output_keys="bad_parameter"):
pass
class AnotherMixin:
def __init_subclass__(cls, custom_parameter, **kwargs):
super().__init_subclass__(**kwargs)
cls.custom_parameter = custom_parameter
def test_set_output_mixin_custom_mixin():
"""Check that multiple init_subclasses passes parameters up."""
class BothMixinEstimator(_SetOutputMixin, AnotherMixin, custom_parameter=123):
def transform(self, X, y=None):
return X
def get_feature_names_out(self, input_features=None):
return input_features
est = BothMixinEstimator()
assert est.custom_parameter == 123
assert hasattr(est, "set_output")
def test_set_output_mro():
"""Check that multi-inheritance resolves to the correct class method.
Non-regression test gh-25293.
"""
class Base(_SetOutputMixin):
def transform(self, X):
return "Base"
class A(Base):
pass
class B(Base):
def transform(self, X):
return "B"
class C(A, B):
pass
assert C().transform(None) == "B"
class EstimatorWithSetOutputIndex(_SetOutputMixin):
def fit(self, X, y=None):
self.n_features_in_ = X.shape[1]
return self
def transform(self, X, y=None):
import pandas as pd
# transform by giving output a new index.
return pd.DataFrame(X.to_numpy(), index=[f"s{i}" for i in range(X.shape[0])])
def get_feature_names_out(self, input_features=None):
return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)
def test_set_output_pandas_keep_index():
"""Check that set_output does not override index.
Non-regression test for gh-25730.
"""
pd = pytest.importorskip("pandas")
X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=[0, 1])
est = EstimatorWithSetOutputIndex().set_output(transform="pandas")
est.fit(X)
X_trans = est.transform(X)
assert_array_equal(X_trans.index, ["s0", "s1"])
class EstimatorReturnTuple(_SetOutputMixin):
def __init__(self, OutputTuple):
self.OutputTuple = OutputTuple
def transform(self, X, y=None):
return self.OutputTuple(X, 2 * X)
def test_set_output_named_tuple_out():
"""Check that namedtuples are kept by default."""
Output = namedtuple("Output", "X, Y")
X = np.asarray([[1, 2, 3]])
est = EstimatorReturnTuple(OutputTuple=Output)
X_trans = est.transform(X)
assert isinstance(X_trans, Output)
assert_array_equal(X_trans.X, X)
assert_array_equal(X_trans.Y, 2 * X)
class EstimatorWithListInput(_SetOutputMixin):
def fit(self, X, y=None):
assert isinstance(X, list)
self.n_features_in_ = len(X[0])
return self
def transform(self, X, y=None):
return X
def get_feature_names_out(self, input_features=None):
return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_set_output_list_input(dataframe_lib):
"""Check set_output for list input.
Non-regression test for #27037.
"""
lib = pytest.importorskip(dataframe_lib)
X = [[0, 1, 2, 3], [4, 5, 6, 7]]
est = EstimatorWithListInput()
est.set_output(transform=dataframe_lib)
X_out = est.fit(X).transform(X)
assert isinstance(X_out, lib.DataFrame)
assert_array_equal(X_out.columns, ["X0", "X1", "X2", "X3"])
@pytest.mark.parametrize("name", sorted(ADAPTERS_MANAGER.adapters))
def test_adapter_class_has_interface(name):
"""Check adapters have the correct interface."""
assert isinstance(ADAPTERS_MANAGER.adapters[name], ContainerAdapterProtocol)
def test_check_library_installed(monkeypatch):
"""Check import error changed."""
orig_import_module = importlib.import_module
def patched_import_module(name):
if name == "pandas":
raise ImportError()
        return orig_import_module(name, package=None)
monkeypatch.setattr(importlib, "import_module", patched_import_module)
msg = "Setting output container to 'pandas' requires"
with pytest.raises(ImportError, match=msg):
check_library_installed("pandas")
def test_get_adapter_from_container():
"""Check the behavior fo `_get_adapter_from_container`."""
pd = pytest.importorskip("pandas")
X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
adapter = _get_adapter_from_container(X)
assert adapter.container_lib == "pandas"
err_msg = "The container does not have a registered adapter in scikit-learn."
with pytest.raises(ValueError, match=err_msg):
_get_adapter_from_container(X.to_numpy())

View File

@@ -0,0 +1,65 @@
from collections import defaultdict
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.utils.graph import single_source_shortest_path_length
def floyd_warshall_slow(graph, directed=False):
N = graph.shape[0]
    # set zero entries (missing edges) to infinity
graph[np.where(graph == 0)] = np.inf
# set diagonal to zero
graph.flat[:: N + 1] = 0
if not directed:
graph = np.minimum(graph, graph.T)
for k in range(N):
for i in range(N):
for j in range(N):
graph[i, j] = min(graph[i, j], graph[i, k] + graph[k, j])
graph[np.where(np.isinf(graph))] = 0
return graph
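# Minimal sketch of the relaxation above (assumes the floyd_warshall_slow
# helper and the numpy import in this file): on a 3-node path graph with unit
# edges 0-1 and 1-2, the pass through intermediate node k=1 discovers the
# indirect 0 -> 2 path of length 2.
def _sketch_floyd_warshall_relaxation():
    tiny = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 1.0, 0.0]])
    dist = floyd_warshall_slow(tiny.copy())
    assert dist[0, 1] == 1.0
    assert dist[0, 2] == 2.0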
def generate_graph(N=20):
# sparse grid of distances
rng = np.random.RandomState(0)
dist_matrix = rng.random_sample((N, N))
# make symmetric: distances are not direction-dependent
dist_matrix = dist_matrix + dist_matrix.T
# make graph sparse
i = (rng.randint(N, size=N * N // 2), rng.randint(N, size=N * N // 2))
dist_matrix[i] = 0
# set diagonal to zero
dist_matrix.flat[:: N + 1] = 0
return dist_matrix
def test_shortest_path():
dist_matrix = generate_graph(20)
# We compare path length and not costs (-> set distances to 0 or 1)
dist_matrix[dist_matrix != 0] = 1
for directed in (True, False):
if not directed:
dist_matrix = np.minimum(dist_matrix, dist_matrix.T)
graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)
for i in range(dist_matrix.shape[0]):
# Non-reachable nodes have distance 0 in graph_py
dist_dict = defaultdict(int)
dist_dict.update(single_source_shortest_path_length(dist_matrix, i))
for j in range(graph_py[i].shape[0]):
assert_array_almost_equal(dist_dict[j], graph_py[i, j])

View File

@@ -0,0 +1,40 @@
from threadpoolctl import threadpool_info
from sklearn.utils._show_versions import _get_deps_info, _get_sys_info, show_versions
from sklearn.utils._testing import ignore_warnings
def test_get_sys_info():
sys_info = _get_sys_info()
assert "python" in sys_info
assert "executable" in sys_info
assert "machine" in sys_info
def test_get_deps_info():
with ignore_warnings():
deps_info = _get_deps_info()
assert "pip" in deps_info
assert "setuptools" in deps_info
assert "sklearn" in deps_info
assert "numpy" in deps_info
assert "scipy" in deps_info
assert "Cython" in deps_info
assert "pandas" in deps_info
assert "matplotlib" in deps_info
assert "joblib" in deps_info
def test_show_versions(capsys):
with ignore_warnings():
show_versions()
out, err = capsys.readouterr()
assert "python" in out
assert "numpy" in out
info = threadpool_info()
if info:
assert "threadpoolctl info:" in out

View File

@@ -0,0 +1,487 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from pytest import approx
from sklearn._config import config_context
from sklearn.utils._array_api import (
_convert_to_numpy,
get_namespace,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._array_api import device as array_device
from sklearn.utils.estimator_checks import _array_api_for_tests
from sklearn.utils.fixes import np_version, parse_version
from sklearn.utils.stats import _weighted_percentile
@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("size", [10, 15])
def test_weighted_percentile_matches_median(size, average):
"""Ensure `_weighted_percentile` matches `median` when expected.
With unit `sample_weight`, `_weighted_percentile` should match the median except
when `average=False` and the number of samples is even.
    For an array of even length and `average=False`, `percentile_rank=50` gives the
    lower of the two 'middle' values, which are averaged when computing the `median`.
"""
y = np.arange(size)
sample_weight = np.ones_like(y)
score = _weighted_percentile(y, sample_weight, 50, average=average)
# `_weighted_percentile(average=False)` does not match `median` when n is even
if size % 2 == 0 and average is False:
assert score != np.median(y)
else:
assert approx(score) == np.median(y)
@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("percentile_rank", [20, 35, 61, [5, 47]])
@pytest.mark.parametrize("size", [10, 15])
def test_weighted_percentile_matches_numpy(
global_random_seed, size, percentile_rank, average
):
"""Check `_weighted_percentile` with unit weights is correct.
`average=True` results should be the same as `np.percentile`'s
'averaged_inverted_cdf'.
`average=False` results should be the same as `np.percentile`'s
'inverted_cdf'.
Note `np.percentile` is the same as `np.quantile` except `q` is in range [0, 100].
We parametrize through different `percentile_rank` and `size` to
ensure we get cases where `g=0` and `g>0` (see Hyndman and Fan 1996 for details).
"""
rng = np.random.RandomState(global_random_seed)
y = rng.randint(20, size=size)
sw = np.ones_like(y)
score = _weighted_percentile(y, sw, percentile_rank, average=average)
if average:
method = "averaged_inverted_cdf"
else:
method = "inverted_cdf"
assert approx(score) == np.percentile(y, percentile_rank, method=method)
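# Minimal sketch of the two numpy methods referenced above (assumes
# numpy >= 1.22, where both methods are available): with four equally likely
# values, 'inverted_cdf' returns the lower middle value while
# 'averaged_inverted_cdf' averages the two middle values.
def _sketch_inverted_cdf_methods():
    y = np.array([1, 2, 3, 4])
    assert np.percentile(y, 50, method="inverted_cdf") == 2
    assert np.percentile(y, 50, method="averaged_inverted_cdf") == 2.5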
@pytest.mark.parametrize("percentile_rank", [50, 100])
def test_weighted_percentile_plus_one_clip_max(percentile_rank):
"""Check `j+1` index is clipped to max, when `average=True`.
`percentile_plus_one_indices` can exceed max index when `percentile_indices`
is already at max index.
Note that when `g` (Hyndman and Fan) / `fraction_above` is greater than 0,
`j+1` (Hyndman and Fan) / `percentile_plus_one_indices` is calculated but
never used, so it does not matter what this value is.
When percentile of percentile rank 100 falls exactly on the last value in the
`weighted_cdf`, `g=0` and `percentile_indices` is at max index. In this case
we set `percentile_plus_one_indices` to be max index as well, so the result is
the average of 2x the max index (i.e. last value of `weighted_cdf`).
"""
    # Note for both `percentile_rank`s 50 and 100, `percentile_indices` is already at
# max index
y = np.array([[0, 0], [1, 1]])
sw = np.array([[0.1, 0.2], [2, 3]])
score = _weighted_percentile(y, sw, percentile_rank, average=True)
for idx in range(2):
assert score[idx] == approx(1.0)
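# Worked 1D sketch of the clipping described in the docstring above (assumes
# the _weighted_percentile import in this file): at percentile_rank=100 the
# target mass equals the last entry of the weighted cdf, so both the j and
# j+1 indices resolve to the last value and their average is that value.
def _sketch_plus_one_index_clipped():
    y = np.array([0.0, 1.0])
    sw = np.array([0.1, 2.0])
    assert _weighted_percentile(y, sw, 100, average=True) == approx(1.0)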
def test_weighted_percentile_equal():
"""Check `weighted_percentile` with unit weights and all 0 values in `array`."""
y = np.zeros(102, dtype=np.float64)
sw = np.ones(102, dtype=np.float64)
score = _weighted_percentile(y, sw, 50)
assert approx(score) == 0
# XXX: is this really what we want? Shouldn't we raise instead?
# https://github.com/scikit-learn/scikit-learn/issues/31032
def test_weighted_percentile_all_zero_weights():
"""Check `weighted_percentile` with all weights equal to 0 returns last index."""
y = np.arange(10)
sw = np.zeros(10)
value = _weighted_percentile(y, sw, 50)
assert approx(value) == 9.0
@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("percentile_rank, expected_value", [(0, 2), (50, 3), (100, 5)])
def test_weighted_percentile_ignores_zero_weight(
average, percentile_rank, expected_value
):
"""Check leading, trailing and middle 0 weights behave correctly.
Check that leading zero-weight observations are ignored when `percentile_rank=0`.
See #20528 for details.
Check that when `average=True` and the `j+1` ('plus one') index has sample weight
of 0, it is ignored. Also check that trailing zero weight observations are ignored
(e.g., when `percentile_rank=100`).
"""
y = np.array([0, 1, 2, 3, 4, 5, 6])
sw = np.array([0, 0, 1, 1, 0, 1, 0])
value = _weighted_percentile(
np.vstack((y, y)).T, np.vstack((sw, sw)).T, percentile_rank, average=average
)
for idx in range(2):
assert approx(value[idx]) == expected_value
@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("percentile_rank", [20, 35, 50, 61])
def test_weighted_percentile_frequency_weight_semantics(
global_random_seed, percentile_rank, average
):
"""Check integer weights give the same result as repeating values."""
rng = np.random.RandomState(global_random_seed)
x = rng.randint(20, size=10)
weights = rng.choice(5, size=10)
x_repeated = np.repeat(x, weights)
percentile_weights = _weighted_percentile(
x, weights, percentile_rank, average=average
)
percentile_repeated = _weighted_percentile(
x_repeated, np.ones_like(x_repeated), percentile_rank, average=average
)
assert percentile_weights == approx(percentile_repeated)
# Also check `percentile_rank=50` matches `median`
if percentile_rank == 50 and average:
assert percentile_weights == approx(np.median(x_repeated))
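# Tiny concrete instance of the frequency-weight semantics checked above
# (assumes the _weighted_percentile import in this file): a weight of 2 on a
# value behaves like listing that value twice.
def _sketch_frequency_weights():
    weighted = _weighted_percentile(
        np.array([1.0, 2.0, 3.0]), np.array([1, 2, 1]), 50, average=False
    )
    repeated = _weighted_percentile(
        np.array([1.0, 2.0, 2.0, 3.0]), np.ones(4), 50, average=False
    )
    assert weighted == approx(repeated)
    assert repeated == approx(2.0)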
@pytest.mark.parametrize("constant", [5, 8])
@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("percentile_rank", [20, 35, 50, 61, [20, 35, 50, 61]])
def test_weighted_percentile_constant_multiplier(
global_random_seed, percentile_rank, average, constant
):
"""Check multiplying weights by a constant does not change the result.
Note scale invariance does not always hold when multiplying by a
    float due to cumulative-sum numerical error (which grows proportionally to n).
"""
rng = np.random.RandomState(global_random_seed)
x = rng.randint(20, size=20)
weights = rng.choice(5, size=20)
weights_multiplied = weights * constant
percentile = _weighted_percentile(x, weights, percentile_rank, average=average)
percentile_multiplier = _weighted_percentile(
x, weights_multiplied, percentile_rank, average=average
)
assert percentile == approx(percentile_multiplier)
@pytest.mark.parametrize("percentile_rank", [50, [20, 35, 50]])
@pytest.mark.parametrize("average", [True, False])
def test_weighted_percentile_2d(global_random_seed, percentile_rank, average):
"""Check `_weighted_percentile` behaviour is correct when `array` is 2D."""
# Check for when array 2D and sample_weight 1D
rng = np.random.RandomState(global_random_seed)
x1 = rng.randint(10, size=10)
w1 = rng.choice(5, size=10)
x2 = rng.randint(20, size=10)
x_2d = np.vstack((x1, x2)).T
wp = _weighted_percentile(
x_2d, w1, percentile_rank=percentile_rank, average=average
)
if isinstance(percentile_rank, list):
p_list = []
for pr in percentile_rank:
p_list.append(
[
_weighted_percentile(
x_2d[:, i], w1, percentile_rank=pr, average=average
)
for i in range(x_2d.shape[1])
]
)
p_axis_0 = np.stack(p_list, axis=-1)
assert wp.shape == (x_2d.shape[1], len(percentile_rank))
else:
# percentile_rank is scalar
p_axis_0 = [
_weighted_percentile(
x_2d[:, i], w1, percentile_rank=percentile_rank, average=average
)
for i in range(x_2d.shape[1])
]
assert wp.shape == (x_2d.shape[1],)
assert_allclose(wp, p_axis_0)
# Check when array and sample_weight both 2D
w2 = rng.choice(5, size=10)
w_2d = np.vstack((w1, w2)).T
wp = _weighted_percentile(
x_2d, w_2d, percentile_rank=percentile_rank, average=average
)
if isinstance(percentile_rank, list):
p_list = []
for pr in percentile_rank:
p_list.append(
[
_weighted_percentile(
x_2d[:, i], w_2d[:, i], percentile_rank=pr, average=average
)
for i in range(x_2d.shape[1])
]
)
p_axis_0 = np.stack(p_list, axis=-1)
assert wp.shape == (x_2d.shape[1], len(percentile_rank))
else:
# percentile_rank is scalar
p_axis_0 = [
_weighted_percentile(
x_2d[:, i], w_2d[:, i], percentile_rank=percentile_rank, average=average
)
for i in range(x_2d.shape[1])
]
assert wp.shape == (x_2d.shape[1],)
assert_allclose(wp, p_axis_0)
@pytest.mark.parametrize(
"array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
"data, weights, percentile",
[
# NumPy scalars input (handled as 0D arrays on array API)
(np.float32(42), np.int32(1), 50),
# Random 1D array, constant weights
(lambda rng: rng.rand(50), np.ones(50).astype(np.int32), 50),
# Random 2D array and random 1D weights
(lambda rng: rng.rand(50, 3), lambda rng: rng.rand(50).astype(np.float32), 75),
# Random 2D array and random 2D weights
(
lambda rng: rng.rand(20, 3),
lambda rng: rng.rand(20, 3).astype(np.float32),
[25, 75],
),
        # zero-weights and `percentile_rank=0` (#20528) (`sample_weight` dtype: int64)
(np.array([0, 1, 2, 3, 4, 5]), np.array([0, 0, 1, 1, 1, 0]), 0),
# np.nan's in data and some zero-weights (`sample_weight` dtype: int64)
(np.array([np.nan, np.nan, 0, 3, 4, 5]), np.array([0, 1, 1, 1, 1, 0]), 0),
# `sample_weight` dtype: int32
(
np.array([0, 1, 2, 3, 4, 5]),
np.array([0, 1, 1, 1, 1, 0], dtype=np.int32),
[25, 75],
),
],
)
def test_weighted_percentile_array_api_consistency(
global_random_seed, array_namespace, device, dtype_name, data, weights, percentile
):
"""Check `_weighted_percentile` gives consistent results with array API."""
xp = _array_api_for_tests(array_namespace, device)
# Skip test for percentile=0 edge case (#20528) on namespace/device where
# xp.nextafter is broken. This is the case for torch with MPS device:
# https://github.com/pytorch/pytorch/issues/150027
zero = xp.zeros(1, device=device)
one = xp.ones(1, device=device)
if percentile == 0 and xp.all(xp.nextafter(zero, one) == zero):
pytest.xfail(f"xp.nextafter is broken on {device}")
rng = np.random.RandomState(global_random_seed)
X_np = data(rng) if callable(data) else data
weights_np = weights(rng) if callable(weights) else weights
# Ensure `data` of correct dtype
X_np = X_np.astype(dtype_name)
result_np = _weighted_percentile(X_np, weights_np, percentile)
# Convert to Array API arrays
X_xp = xp.asarray(X_np, device=device)
weights_xp = xp.asarray(weights_np, device=device)
with config_context(array_api_dispatch=True):
result_xp = _weighted_percentile(X_xp, weights_xp, percentile)
assert array_device(result_xp) == array_device(X_xp)
assert get_namespace(result_xp)[0] == get_namespace(X_xp)[0]
result_xp_np = _convert_to_numpy(result_xp, xp=xp)
assert result_xp_np.dtype == result_np.dtype
assert result_xp_np.shape == result_np.shape
assert_allclose(result_np, result_xp_np)
# Check dtype correct (`sample_weight` should follow `array`)
if dtype_name == "float32":
assert result_xp_np.dtype == result_np.dtype == np.float32
else:
assert result_xp_np.dtype == np.float64
@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("sample_weight_ndim", [1, 2])
def test_weighted_percentile_nan_filtered(
global_random_seed, sample_weight_ndim, average
):
"""Test `_weighted_percentile` ignores NaNs.
Calling `_weighted_percentile` on an array with nan values returns the same
results as calling `_weighted_percentile` on a filtered version of the data.
We test both with sample_weight of the same shape as the data and with
one-dimensional sample_weight.
"""
rng = np.random.RandomState(global_random_seed)
array_with_nans = rng.rand(100, 10)
array_with_nans[rng.rand(*array_with_nans.shape) < 0.5] = np.nan
nan_mask = np.isnan(array_with_nans)
if sample_weight_ndim == 2:
sample_weight = rng.randint(1, 6, size=(100, 10))
else:
sample_weight = rng.randint(1, 6, size=(100,))
# Find the weighted percentile on the array with nans:
results = _weighted_percentile(array_with_nans, sample_weight, 30, average=average)
# Find the weighted percentile on the filtered array:
filtered_array = [
array_with_nans[~nan_mask[:, col], col]
for col in range(array_with_nans.shape[1])
]
if sample_weight.ndim == 1:
sample_weight = np.repeat(sample_weight, array_with_nans.shape[1]).reshape(
array_with_nans.shape[0], array_with_nans.shape[1]
)
filtered_weights = [
sample_weight[~nan_mask[:, col], col] for col in range(array_with_nans.shape[1])
]
expected_results = np.array(
[
_weighted_percentile(
filtered_array[col], filtered_weights[col], 30, average=average
)
for col in range(array_with_nans.shape[1])
]
)
assert_array_equal(expected_results, results)
@pytest.mark.parametrize(
"percentile_rank, expected",
[
(90, [np.nan, 5]),
([50, 90], [[np.nan, np.nan], [2.0, 5.0]]),
],
)
def test_weighted_percentile_all_nan_column(percentile_rank, expected):
"""Check that nans are ignored in general, except for all NaN columns."""
array = np.array(
[
[np.nan, 5],
[np.nan, 1],
[np.nan, np.nan],
[np.nan, np.nan],
[np.nan, 2],
[np.nan, np.nan],
]
)
weights = np.ones_like(array)
values = _weighted_percentile(array, weights, percentile_rank)
# The percentile of the second column should be `5` even though there are many nan
# values present; the percentile of the first column can only be nan, since there
# are no other possible values:
assert np.array_equal(values, expected, equal_nan=True)
@pytest.mark.skipif(
np_version < parse_version("2.0"),
reason="np.quantile only accepts weights since version 2.0",
)
@pytest.mark.parametrize("percentile", [66, 10, 50])
@pytest.mark.parametrize("average", [False, True])
@pytest.mark.parametrize("uniform_weight", [False, True])
def test_weighted_percentile_like_numpy_quantile(
percentile, average, uniform_weight, global_random_seed
):
"""Check `_weighted_percentile` is equivalent to `np.quantile` with weights."""
# TODO: remove the following skip once no longer applicable.
if average and not uniform_weight:
pytest.skip(
"np.quantile does not support weights with method='averaged_inverted_cdf'"
)
rng = np.random.RandomState(global_random_seed)
array = rng.rand(10, 100)
if uniform_weight:
sample_weight = np.ones_like(array) * rng.randint(1, 6, size=1)
else:
sample_weight = rng.randint(1, 6, size=(10, 100))
percentile_weighted_percentile = _weighted_percentile(
array, sample_weight, percentile, average=average
)
percentile_numpy_quantile = np.quantile(
array,
percentile / 100,
weights=sample_weight if not uniform_weight else None,
method="averaged_inverted_cdf" if average else "inverted_cdf",
axis=0,
)
assert_array_equal(percentile_weighted_percentile, percentile_numpy_quantile)
@pytest.mark.skipif(
np_version < parse_version("2.0"),
reason="np.nanquantile only accepts weights since version 2.0",
)
@pytest.mark.parametrize("percentile", [66, 10, 50])
@pytest.mark.parametrize("average", [False, True])
@pytest.mark.parametrize("uniform_weight", [False, True])
def test_weighted_percentile_like_numpy_nanquantile(
percentile, average, uniform_weight, global_random_seed
):
"""Check `_weighted_percentile` equivalent to `np.nanquantile` with weights."""
# TODO: remove the following skip once no longer applicable.
if average and not uniform_weight:
pytest.skip(
"np.nanquantile does not support weights with "
"method='averaged_inverted_cdf'"
)
rng = np.random.RandomState(global_random_seed)
array_with_nans = rng.rand(10, 100)
array_with_nans[rng.rand(*array_with_nans.shape) < 0.5] = np.nan
if uniform_weight:
sample_weight = np.ones_like(array_with_nans) * rng.randint(
1,
6,
size=1,
)
else:
sample_weight = rng.randint(1, 6, size=(10, 100))
percentile_weighted_percentile = _weighted_percentile(
array_with_nans, sample_weight, percentile, average=average
)
percentile_numpy_nanquantile = np.nanquantile(
array_with_nans,
percentile / 100,
weights=sample_weight if not uniform_weight else None,
method="averaged_inverted_cdf" if average else "inverted_cdf",
axis=0,
)
assert_array_equal(percentile_weighted_percentile, percentile_numpy_nanquantile)

View File

@@ -0,0 +1,142 @@
from dataclasses import dataclass, fields
import numpy as np
import pytest
from sklearn.base import (
BaseEstimator,
ClassifierMixin,
RegressorMixin,
TransformerMixin,
)
from sklearn.pipeline import Pipeline
from sklearn.utils import (
Tags,
get_tags,
)
from sklearn.utils.estimator_checks import (
check_estimator_tags_renamed,
check_valid_tag_types,
)
class EmptyClassifier(ClassifierMixin, BaseEstimator):
pass
class EmptyTransformer(TransformerMixin, BaseEstimator):
pass
class EmptyRegressor(RegressorMixin, BaseEstimator):
pass
@pytest.mark.parametrize(
"estimator, value",
[
[EmptyClassifier(), True],
[EmptyTransformer(), False],
[EmptyRegressor(), True],
[BaseEstimator(), False],
],
)
def test_requires_y(estimator, value):
assert get_tags(estimator).target_tags.required == value
def test_no___sklearn_tags__with_more_tags():
"""Test that calling `get_tags` on a class that defines `_more_tags` but not
`__sklearn_tags__` raises an error.
"""
class MoreTagsEstimator(BaseEstimator):
def _more_tags(self):
return {"requires_y": True} # pragma: no cover
with pytest.raises(
TypeError, match="has defined either `_more_tags` or `_get_tags`"
):
check_estimator_tags_renamed("MoreTagsEstimator", MoreTagsEstimator())
def test_tag_test_passes_with_inheritance():
@dataclass
class MyTags(Tags):
my_tag: bool = True # type: ignore[annotation-unchecked]
class MyEstimator(BaseEstimator):
def __sklearn_tags__(self):
tags_orig = super().__sklearn_tags__()
as_dict = {
field.name: getattr(tags_orig, field.name)
for field in fields(tags_orig)
}
tags = MyTags(**as_dict)
tags.my_tag = True
return tags
check_valid_tag_types("MyEstimator", MyEstimator())
def test_tags_no_sklearn_tags_concrete_implementation():
"""Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/30479
    Either the estimator doesn't implement `__sklearn_tags__` at all, or no class
    in its MRO implements `__sklearn_tags__` without calling
    `super().__sklearn_tags__()`. In both cases we raise an error and ask the user
    to inherit from `BaseEstimator`, which implements `__sklearn_tags__`.
"""
X = np.array([[1, 2], [2, 3], [3, 4]])
y = np.array([1, 0, 1])
# 1st case, the estimator inherits from a class that only implements
# `__sklearn_tags__` by calling `super().__sklearn_tags__()`.
class MyEstimator(ClassifierMixin):
def __init__(self, *, param=1):
self.param = param
def fit(self, X, y=None):
self.is_fitted_ = True
return self
def predict(self, X):
return np.full(shape=X.shape[0], fill_value=self.param)
my_pipeline = Pipeline([("estimator", MyEstimator(param=1))])
with pytest.raises(AttributeError, match="The following error was raised"):
my_pipeline.fit(X, y).predict(X)
# 2nd case, the estimator doesn't implement `__sklearn_tags__` at all.
class MyEstimator2:
def __init__(self, *, param=1):
self.param = param
def fit(self, X, y=None):
self.is_fitted_ = True
return self
def predict(self, X):
return np.full(shape=X.shape[0], fill_value=self.param)
my_pipeline = Pipeline([("estimator", MyEstimator2(param=1))])
with pytest.raises(AttributeError, match="The following error was raised"):
my_pipeline.fit(X, y).predict(X)
# check that we still raise an error if it is not an AttributeError or related to
# __sklearn_tags__
class MyEstimator3(MyEstimator, BaseEstimator):
def __init__(self, *, param=1, error_type=AttributeError):
self.param = param
self.error_type = error_type
def __sklearn_tags__(self):
super().__sklearn_tags__()
raise self.error_type("test")
for error_type in (AttributeError, TypeError, ValueError):
estimator = MyEstimator3(param=1, error_type=error_type)
with pytest.raises(error_type):
get_tags(estimator)

View File

@@ -0,0 +1,25 @@
import numpy as np
import pytest
from sklearn.utils._typedefs import testing_make_array_from_typed_val
@pytest.mark.parametrize(
"type_t, value, expected_dtype",
[
("float64_t", 1.0, np.float64),
("float32_t", 1.0, np.float32),
("intp_t", 1, np.intp),
("int8_t", 1, np.int8),
("int32_t", 1, np.int32),
("int64_t", 1, np.int64),
("uint8_t", 1, np.uint8),
("uint32_t", 1, np.uint32),
("uint64_t", 1, np.uint64),
],
)
def test_types(type_t, value, expected_dtype):
"""Check that the types defined in _typedefs correspond to the expected
numpy dtypes.
"""
assert testing_make_array_from_typed_val[type_t](value).dtype == expected_dtype

View File

@@ -0,0 +1,54 @@
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.utils._unique import attach_unique, cached_unique
from sklearn.utils.validation import check_array
def test_attach_unique_attaches_unique_to_array():
arr = np.array([1, 2, 2, 3, 4, 4, 5])
arr_ = attach_unique(arr)
assert_array_equal(arr_.dtype.metadata["unique"], np.array([1, 2, 3, 4, 5]))
assert_array_equal(arr_, arr)
def test_cached_unique_returns_cached_unique():
my_dtype = np.dtype(np.float64, metadata={"unique": np.array([1, 2])})
arr = np.array([1, 2, 2, 3, 4, 4, 5], dtype=my_dtype)
assert_array_equal(cached_unique(arr), np.array([1, 2]))
def test_attach_unique_not_ndarray():
"""Test that when not np.ndarray, we don't touch the array."""
arr = [1, 2, 2, 3, 4, 4, 5]
arr_ = attach_unique(arr)
assert arr_ is arr
def test_attach_unique_returns_view():
"""Test that attach_unique returns a view of the array."""
arr = np.array([1, 2, 2, 3, 4, 4, 5])
arr_ = attach_unique(arr)
assert arr_.base is arr
def test_attach_unique_return_tuple():
"""Test return_tuple argument of the function."""
arr = np.array([1, 2, 2, 3, 4, 4, 5])
arr_tuple = attach_unique(arr, return_tuple=True)
assert isinstance(arr_tuple, tuple)
assert len(arr_tuple) == 1
assert_array_equal(arr_tuple[0], arr)
arr_single = attach_unique(arr, return_tuple=False)
assert isinstance(arr_single, np.ndarray)
assert_array_equal(arr_single, arr)
def test_check_array_keeps_unique():
"""Test that check_array keeps the unique metadata."""
arr = np.array([[1, 2, 2, 3, 4, 4, 5]])
arr_ = attach_unique(arr)
arr_ = check_array(arr_)
assert_array_equal(arr_.dtype.metadata["unique"], np.array([1, 2, 3, 4, 5]))
assert_array_equal(arr_, arr)
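# Minimal sketch of the numpy feature these helpers rely on (plain numpy, no
# sklearn helpers): a dtype can carry an arbitrary ``metadata`` mapping, and a
# view of the array keeps the same dtype object, hence the same metadata.
def _sketch_dtype_metadata_survives_views():
    tagged_dtype = np.dtype(np.int64, metadata={"unique": np.array([1, 2, 3])})
    arr = np.array([1, 2, 2, 3], dtype=tagged_dtype)
    view = arr.view()
    assert_array_equal(view.dtype.metadata["unique"], np.array([1, 2, 3]))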

View File

@@ -0,0 +1,65 @@
import string
import timeit
import pytest
from sklearn.utils._user_interface import _message_with_time, _print_elapsed_time
@pytest.mark.parametrize(
["source", "message", "is_long"],
[
("ABC", string.ascii_lowercase, False),
("ABCDEF", string.ascii_lowercase, False),
("ABC", string.ascii_lowercase * 3, True),
("ABC" * 10, string.ascii_lowercase, True),
("ABC", string.ascii_lowercase + "\u1048", False),
],
)
@pytest.mark.parametrize(
["time", "time_str"],
[
(0.2, " 0.2s"),
(20, " 20.0s"),
(2000, "33.3min"),
(20000, "333.3min"),
],
)
def test_message_with_time(source, message, is_long, time, time_str):
out = _message_with_time(source, message, time)
if is_long:
assert len(out) > 70
else:
assert len(out) == 70
assert out.startswith("[" + source + "] ")
out = out[len(source) + 3 :]
assert out.endswith(time_str)
out = out[: -len(time_str)]
assert out.endswith(", total=")
out = out[: -len(", total=")]
assert out.endswith(message)
out = out[: -len(message)]
assert out.endswith(" ")
out = out[:-1]
if is_long:
assert not out
else:
assert list(set(out)) == ["."]
@pytest.mark.parametrize(
["message", "expected"],
[
("hello", _message_with_time("ABC", "hello", 0.1) + "\n"),
("", _message_with_time("ABC", "", 0.1) + "\n"),
(None, ""),
],
)
def test_print_elapsed_time(message, expected, capsys, monkeypatch):
monkeypatch.setattr(timeit, "default_timer", lambda: 0)
with _print_elapsed_time("ABC", message):
monkeypatch.setattr(timeit, "default_timer", lambda: 0.1)
assert capsys.readouterr().out == expected

View File

@@ -0,0 +1,25 @@
import numpy as np
import pytest
from sklearn.utils._weight_vector import (
WeightVector32,
WeightVector64,
)
@pytest.mark.parametrize(
"dtype, WeightVector",
[
(np.float32, WeightVector32),
(np.float64, WeightVector64),
],
)
def test_type_invariance(dtype, WeightVector):
"""Check the `dtype` consistency of `WeightVector`."""
weights = np.random.rand(100).astype(dtype)
average_weights = np.random.rand(100).astype(dtype)
weight_vector = WeightVector(weights, average_weights)
assert np.asarray(weight_vector.w).dtype is np.dtype(dtype)
assert np.asarray(weight_vector.aw).dtype is np.dtype(dtype)