Binary file not shown.
@@ -0,0 +1,16 @@
import pytest
from numpy.testing import assert_allclose

from sklearn.utils import check_random_state
from sklearn.utils._arpack import _init_arpack_v0


@pytest.mark.parametrize("seed", range(100))
def test_init_arpack_v0(seed):
    # check that the initialization samples from a uniform distribution
    # where we can fix the random state
    size = 1000
    v0 = _init_arpack_v0(size, seed)

    rng = check_random_state(seed)
    assert_allclose(v0, rng.uniform(-1, 1, size=size))
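The assertion above fully pins down the expected behaviour; as a point of reference, an equivalent pure-NumPy sketch (assuming `_init_arpack_v0` does nothing beyond the seeded uniform draw the test checks for) would be:

    from sklearn.utils import check_random_state

    def init_arpack_v0_reference(size, random_state):
        # reproducible uniform draw in [-1, 1), as asserted by the test above
        rng = check_random_state(random_state)
        return rng.uniform(-1, 1, size=size)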
@@ -0,0 +1,931 @@
import os
from functools import partial

import numpy
import pytest
import scipy
import scipy.sparse as sp
from numpy.testing import assert_allclose

from sklearn._config import config_context
from sklearn._loss import HalfMultinomialLoss
from sklearn.base import BaseEstimator
from sklearn.utils._array_api import (
    _add_to_diagonal,
    _asarray_with_order,
    _atol_for_type,
    _average,
    _convert_to_numpy,
    _count_nonzero,
    _estimator_with_converted_arrays,
    _fill_diagonal,
    _get_namespace_device_dtype_ids,
    _half_multinomial_loss,
    _is_numpy_namespace,
    _isin,
    _logsumexp,
    _max_precision_float_dtype,
    _median,
    _nanmax,
    _nanmean,
    _nanmin,
    _ravel,
    _validate_diagonal_args,
    device,
    get_namespace,
    get_namespace_and_device,
    indexing_dtype,
    move_to,
    np_compat,
    supported_float_dtypes,
    yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
    SkipTest,
    _array_api_for_tests,
    _convert_container,
    assert_array_equal,
    skip_if_array_api_compat_not_configured,
)
from sklearn.utils.fixes import _IS_32BIT, CSR_CONTAINERS, np_version, parse_version


@pytest.mark.parametrize("X", [numpy.asarray([1, 2, 3]), [1, 2, 3]])
def test_get_namespace_ndarray_default(X):
    """Check that get_namespace returns the NumPy wrapper."""
    xp_out, is_array_api_compliant = get_namespace(X)
    assert xp_out is np_compat
    assert not is_array_api_compliant


def test_get_namespace_ndarray_creation_device():
    """Check expected behavior with device and creation functions."""
    X = numpy.asarray([1, 2, 3])
    xp_out, _ = get_namespace(X)

    full_array = xp_out.full(10, fill_value=2.0, device="cpu")
    assert_allclose(full_array, [2.0] * 10)

    with pytest.raises(ValueError, match="Unsupported device"):
        xp_out.zeros(10, device="cuda")


@skip_if_array_api_compat_not_configured
def test_get_namespace_ndarray_with_dispatch():
    """Test get_namespace on NumPy ndarrays."""

    X_np = numpy.asarray([[1, 2, 3]])

    with config_context(array_api_dispatch=True):
        xp_out, is_array_api_compliant = get_namespace(X_np)
        assert is_array_api_compliant

        # In the future, NumPy should become an API compliant library and we should have
        # assert xp_out is numpy
        assert xp_out is np_compat


@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
    "constructor_name", ["pyarrow", "dataframe", "polars", "series"]
)
def test_get_namespace_df_with_dispatch(constructor_name):
    """Test get_namespace on dataframes and series."""

    df = _convert_container([[1, 4, 2], [3, 3, 6]], constructor_name)
    with config_context(array_api_dispatch=True):
        xp_out, is_array_api_compliant = get_namespace(df)
        assert not is_array_api_compliant

        # When operating on dataframes or series the NumPy namespace is
        # the right thing to use.
        assert xp_out is np_compat


@skip_if_array_api_compat_not_configured
def test_get_namespace_sparse_with_dispatch():
    """Test get_namespace on sparse arrays."""
    with config_context(array_api_dispatch=True):
        xp_out, is_array_api_compliant = get_namespace(sp.csr_array([[1, 2, 3]]))
        assert not is_array_api_compliant

        # When operating on sparse arrays the NumPy namespace is
        # the right thing to use.
        assert xp_out is np_compat


@skip_if_array_api_compat_not_configured
def test_get_namespace_array_api(monkeypatch):
    """Test get_namespace for ArrayAPI arrays."""
    xp = pytest.importorskip("array_api_strict")

    X_np = numpy.asarray([[1, 2, 3]])
    X_xp = xp.asarray(X_np)
    with config_context(array_api_dispatch=True):
        xp_out, is_array_api_compliant = get_namespace(X_xp)
        assert is_array_api_compliant

        with pytest.raises(TypeError):
            xp_out, is_array_api_compliant = get_namespace(X_xp, X_np)

        def mock_getenv(key):
            if key == "SCIPY_ARRAY_API":
                return "0"

        monkeypatch.setattr("os.environ.get", mock_getenv)
        assert os.environ.get("SCIPY_ARRAY_API") != "1"
        with pytest.raises(
            RuntimeError,
            match="scipy's own support is not enabled.",
        ):
            get_namespace(X_xp)


@pytest.mark.parametrize(
    "array_input, reference",
    [
        pytest.param(("cupy", None), ("torch", "cuda"), id="cupy to torch cuda"),
        pytest.param(("torch", "mps"), ("numpy", None), id="torch mps to numpy"),
        pytest.param(("numpy", None), ("torch", "cuda"), id="numpy to torch cuda"),
        pytest.param(("numpy", None), ("torch", "mps"), id="numpy to torch mps"),
        pytest.param(
            ("array_api_strict", None),
            ("torch", "mps"),
            id="array_api_strict to torch mps",
        ),
    ],
)
def test_move_to_array_api_conversions(array_input, reference):
    """Check conversion between various namespaces and devices."""
    if array_input[0] == "array_api_strict":
        array_api_strict = pytest.importorskip(
            "array_api_strict", reason="array-api-strict not available"
        )
    xp = _array_api_for_tests(reference[0], reference[1])
    xp_array = _array_api_for_tests(array_input[0], array_input[1])

    with config_context(array_api_dispatch=True):
        device_ = device(xp.asarray([1], device=reference[1]))

        if array_input[0] == "array_api_strict":
            array_device = array_api_strict.Device("CPU_DEVICE")
        else:
            array_device = array_input[1]
        array = xp_array.asarray([1, 2, 3], device=array_device)

        array_out = move_to(array, xp=xp, device=device_)
        assert get_namespace(array_out)[0] == xp
        assert device(array_out) == device_


def test_move_to_sparse():
    """Check sparse inputs are handled correctly."""
    xp_numpy = _array_api_for_tests("numpy", None)
    xp_torch = _array_api_for_tests("torch", "cpu")

    sparse1 = sp.csr_array([0, 1, 2, 3])
    sparse2 = sp.csr_array([0, 1, 0, 1])
    numpy_array = numpy.array([1, 2, 3])

    with config_context(array_api_dispatch=True):
        device_cpu = xp_torch.asarray([1]).device

        # sparse and None to NumPy
        result1, result2 = move_to(sparse1, None, xp=xp_numpy, device=None)
        assert result1 is sparse1
        assert result2 is None

        # sparse to non-NumPy
        msg = r"Sparse arrays are only accepted \(and passed through\)"
        with pytest.raises(TypeError, match=msg):
            move_to(sparse1, numpy_array, xp=xp_torch, device=device_cpu)
        with pytest.raises(TypeError, match=msg):
            move_to(sparse1, None, xp=xp_torch, device=device_cpu)


@pytest.mark.parametrize("array_api", ["numpy", "array_api_strict"])
def test_asarray_with_order(array_api):
    """Test _asarray_with_order passes along order for NumPy arrays."""
    xp = pytest.importorskip(array_api)

    X = xp.asarray([1.2, 3.4, 5.1])
    X_new = _asarray_with_order(X, order="F", xp=xp)

    X_new_np = numpy.asarray(X_new)
    assert X_new_np.flags["F_CONTIGUOUS"]


@pytest.mark.parametrize(
    "array_namespace, device_, dtype_name",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize(
    "weights, axis, normalize, expected",
    [
        # normalize = True
        (None, None, True, 3.5),
        (None, 0, True, [2.5, 3.5, 4.5]),
        (None, 1, True, [2, 5]),
        ([True, False], 0, True, [1, 2, 3]),  # boolean weights
        ([True, True, False], 1, True, [1.5, 4.5]),  # boolean weights
        ([0.4, 0.1], 0, True, [1.6, 2.6, 3.6]),
        ([0.4, 0.2, 0.2], 1, True, [1.75, 4.75]),
        ([1, 2], 0, True, [3, 4, 5]),
        ([1, 1, 2], 1, True, [2.25, 5.25]),
        ([[1, 2, 3], [1, 2, 3]], 0, True, [2.5, 3.5, 4.5]),
        ([[1, 2, 1], [2, 2, 2]], 1, True, [2, 5]),
        # normalize = False
        (None, None, False, 21),
        (None, 0, False, [5, 7, 9]),
        (None, 1, False, [6, 15]),
        ([True, False], 0, False, [1, 2, 3]),  # boolean weights
        ([True, True, False], 1, False, [3, 9]),  # boolean weights
        ([0.4, 0.1], 0, False, [0.8, 1.3, 1.8]),
        ([0.4, 0.2, 0.2], 1, False, [1.4, 3.8]),
        ([1, 2], 0, False, [9, 12, 15]),
        ([1, 1, 2], 1, False, [9, 21]),
        ([[1, 2, 3], [1, 2, 3]], 0, False, [5, 14, 27]),
        ([[1, 2, 1], [2, 2, 2]], 1, False, [8, 30]),
    ],
)
def test_average(
    array_namespace, device_, dtype_name, weights, axis, normalize, expected
):
    xp = _array_api_for_tests(array_namespace, device_)
    array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
    array_in = xp.asarray(array_in, device=device_)
    if weights is not None:
        weights = numpy.asarray(weights, dtype=dtype_name)
        weights = xp.asarray(weights, device=device_)

    with config_context(array_api_dispatch=True):
        result = _average(array_in, axis=axis, weights=weights, normalize=normalize)

        if np_version < parse_version("2.0.0") or np_version >= parse_version("2.1.0"):
            # NumPy 2.0 has a problem with the device attribute of scalar arrays:
            # https://github.com/numpy/numpy/issues/26850
            assert device(array_in) == device(result)

    result = _convert_to_numpy(result, xp)
    assert_allclose(result, expected, atol=_atol_for_type(dtype_name))


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name",
    yield_namespace_device_dtype_combinations(include_numpy_namespaces=False),
    ids=_get_namespace_device_dtype_ids,
)
def test_average_raises_with_wrong_dtype(array_namespace, device, dtype_name):
    xp = _array_api_for_tests(array_namespace, device)

    array_in = numpy.asarray([2, 0], dtype=dtype_name) + 1j * numpy.asarray(
        [4, 3], dtype=dtype_name
    )
    complex_type_name = array_in.dtype.name
    if not hasattr(xp, complex_type_name):
        # This is the case for cupy as of March 2024 for instance.
        pytest.skip(f"{array_namespace} does not support {complex_type_name}")

    array_in = xp.asarray(array_in, device=device)

    err_msg = "Complex floating point values are not supported by average."
    with (
        config_context(array_api_dispatch=True),
        pytest.raises(NotImplementedError, match=err_msg),
    ):
        _average(array_in)


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name",
    yield_namespace_device_dtype_combinations(include_numpy_namespaces=True),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize(
    "axis, weights, error, error_msg",
    (
        (
            None,
            [1, 2],
            TypeError,
            "Axis must be specified",
        ),
        (
            0,
            [[1, 2]],
            # NumPy 2 raises ValueError, NumPy 1 raises TypeError
            (ValueError, TypeError),
            "weights",  # the message is different for NumPy 1 and 2...
        ),
        (
            0,
            [1, 2, 3, 4],
            ValueError,
            "weights",
        ),
        (0, [-1, 1], ZeroDivisionError, "Weights sum to zero, can't be normalized"),
    ),
)
def test_average_raises_with_invalid_parameters(
    array_namespace, device, dtype_name, axis, weights, error, error_msg
):
    xp = _array_api_for_tests(array_namespace, device)

    array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
    array_in = xp.asarray(array_in, device=device)

    weights = numpy.asarray(weights, dtype=dtype_name)
    weights = xp.asarray(weights, device=device)

    with config_context(array_api_dispatch=True), pytest.raises(error, match=error_msg):
        _average(array_in, axis=axis, weights=weights)


def test_device_none_if_no_input():
    assert device() is None

    assert device(None, "name") is None


@skip_if_array_api_compat_not_configured
def test_device_inspection():
    class Device:
        def __init__(self, name):
            self.name = name

        def __eq__(self, device):
            return self.name == device.name

        def __hash__(self):
            raise TypeError("Device object is not hashable")

        def __str__(self):
            return self.name

    class Array:
        def __init__(self, device_name):
            self.device = Device(device_name)

    # Sanity check: ensure our Device mock class is non-hashable, to
    # accurately account for non-hashable device objects in some array
    # libraries, because of which the `device` inspection function shouldn't
    # make use of hash lookup tables (in particular, not use `set`)
    with pytest.raises(TypeError):
        hash(Array("device").device)

    # If array API dispatch is disabled the device should be ignored. Erroring
    # early for different devices would prevent the np.asarray conversion from
    # happening. For example, `r2_score(np.ones(5), torch.ones(5))` should work
    # fine with array API disabled.
    assert device(Array("cpu"), Array("mygpu")) is None

    # Test that ValueError is raised if on different devices and array API dispatch is
    # enabled.
    err_msg = "Input arrays use different devices: cpu, mygpu"
    with config_context(array_api_dispatch=True):
        with pytest.raises(ValueError, match=err_msg):
            device(Array("cpu"), Array("mygpu"))

    # Test expected value is returned otherwise
    array1 = Array("device")
    array2 = Array("device")

    assert array1.device == device(array1)
    assert array1.device == device(array1, array2)
    assert array1.device == device(array1, array1, array2)


# TODO: add cupy to the list of libraries once the following upstream issue
# has been fixed:
# https://github.com/cupy/cupy/issues/8180
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["numpy", "array_api_strict", "torch"])
@pytest.mark.parametrize(
    "X,reduction,expected",
    [
        ([1, 2, numpy.nan], _nanmin, 1),
        ([1, -2, -numpy.nan], _nanmin, -2),
        ([numpy.inf, numpy.inf], _nanmin, numpy.inf),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmin, axis=0),
            [1.0, 2.0, 3.0],
        ),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmin, axis=1),
            [1.0, numpy.nan, 4.0],
        ),
        ([1, 2, numpy.nan], _nanmax, 2),
        ([1, 2, numpy.nan], _nanmax, 2),
        ([-numpy.inf, -numpy.inf], _nanmax, -numpy.inf),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmax, axis=0),
            [4.0, 5.0, 6.0],
        ),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmax, axis=1),
            [3.0, numpy.nan, 6.0],
        ),
        ([1, 2, numpy.nan], _nanmean, 1.5),
        ([1, -2, -numpy.nan], _nanmean, -0.5),
        ([-numpy.inf, -numpy.inf], _nanmean, -numpy.inf),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmean, axis=0),
            [2.5, 3.5, 4.5],
        ),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmean, axis=1),
            [2.0, numpy.nan, 5.0],
        ),
    ],
)
def test_nan_reductions(library, X, reduction, expected):
    """Check NaN reductions like _nanmin and _nanmax."""
    xp = pytest.importorskip(library)

    with config_context(array_api_dispatch=True):
        result = reduction(xp.asarray(X))

    result = _convert_to_numpy(result, xp)
    assert_allclose(result, expected)


@pytest.mark.parametrize(
    "namespace, _device, _dtype",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
def test_ravel(namespace, _device, _dtype):
    xp = _array_api_for_tests(namespace, _device)

    array = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
    array_xp = xp.asarray(array, device=_device)
    with config_context(array_api_dispatch=True):
        result = _ravel(array_xp)

    result = _convert_to_numpy(result, xp)
    expected = numpy.ravel(array, order="C")

    assert_allclose(expected, result)

    if _is_numpy_namespace(xp):
        assert numpy.asarray(result).flags["C_CONTIGUOUS"]


@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["cupy", "torch"])
def test_convert_to_numpy_gpu(library):  # pragma: nocover
    """Check convert_to_numpy for GPU backed libraries."""
    xp = pytest.importorskip(library)

    if library == "torch":
        if not xp.backends.cuda.is_built():
            pytest.skip("test requires cuda")
        X_gpu = xp.asarray([1.0, 2.0, 3.0], device="cuda")
    else:
        X_gpu = xp.asarray([1.0, 2.0, 3.0])

    X_cpu = _convert_to_numpy(X_gpu, xp=xp)
    expected_output = numpy.asarray([1.0, 2.0, 3.0])
    assert_allclose(X_cpu, expected_output)


def test_convert_to_numpy_cpu():
    """Check convert_to_numpy for PyTorch CPU arrays."""
    torch = pytest.importorskip("torch")
    X_torch = torch.asarray([1.0, 2.0, 3.0], device="cpu")

    X_cpu = _convert_to_numpy(X_torch, xp=torch)
    expected_output = numpy.asarray([1.0, 2.0, 3.0])
    assert_allclose(X_cpu, expected_output)


class SimpleEstimator(BaseEstimator):
    def fit(self, X, y=None):
        self.X_ = X
        self.n_features_ = X.shape[0]
        return self


@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
    "array_namespace, converter",
    [
        ("torch", lambda array: array.cpu().numpy()),
        ("array_api_strict", lambda array: numpy.asarray(array)),
        ("cupy", lambda array: array.get()),
    ],
)
def test_convert_estimator_to_ndarray(array_namespace, converter):
    """Convert estimator attributes to ndarray."""
    xp = pytest.importorskip(array_namespace)

    X = xp.asarray([[1.3, 4.5]])
    est = SimpleEstimator().fit(X)

    new_est = _estimator_with_converted_arrays(est, converter)
    assert isinstance(new_est.X_, numpy.ndarray)


@skip_if_array_api_compat_not_configured
def test_convert_estimator_to_array_api():
    """Convert estimator attributes to ArrayAPI arrays."""
    xp = pytest.importorskip("array_api_strict")

    X_np = numpy.asarray([[1.3, 4.5]])
    est = SimpleEstimator().fit(X_np)

    new_est = _estimator_with_converted_arrays(est, lambda array: xp.asarray(array))
    assert hasattr(new_est.X_, "__array_namespace__")


@pytest.mark.parametrize(
    "namespace, _device, _dtype",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
def test_indexing_dtype(namespace, _device, _dtype):
    xp = _array_api_for_tests(namespace, _device)

    if _IS_32BIT:
        assert indexing_dtype(xp) == xp.int32
    else:
        assert indexing_dtype(xp) == xp.int64


@pytest.mark.parametrize(
    "namespace, _device, _dtype",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
def test_max_precision_float_dtype(namespace, _device, _dtype):
    xp = _array_api_for_tests(namespace, _device)
    expected_dtype = xp.float32 if _device == "mps" else xp.float64
    assert _max_precision_float_dtype(xp, _device) == expected_dtype


@pytest.mark.parametrize(
    "array_namespace, device, _",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize("invert", [True, False])
@pytest.mark.parametrize("assume_unique", [True, False])
@pytest.mark.parametrize("element_size", [6, 10, 14])
@pytest.mark.parametrize("int_dtype", ["int16", "int32", "int64", "uint8"])
def test_isin(
    array_namespace, device, _, invert, assume_unique, element_size, int_dtype
):
    xp = _array_api_for_tests(array_namespace, device)
    r = element_size // 2
    element = 2 * numpy.arange(element_size).reshape((r, 2)).astype(int_dtype)
    test_elements = numpy.array(numpy.arange(14), dtype=int_dtype)
    element_xp = xp.asarray(element, device=device)
    test_elements_xp = xp.asarray(test_elements, device=device)
    expected = numpy.isin(
        element=element,
        test_elements=test_elements,
        assume_unique=assume_unique,
        invert=invert,
    )
    with config_context(array_api_dispatch=True):
        result = _isin(
            element=element_xp,
            test_elements=test_elements_xp,
            xp=xp,
            assume_unique=assume_unique,
            invert=invert,
        )

    assert_array_equal(_convert_to_numpy(result, xp=xp), expected)


@pytest.mark.skipif(
    os.environ.get("SCIPY_ARRAY_API") != "1", reason="SCIPY_ARRAY_API not set to 1."
)
def test_get_namespace_and_device():
    # Use torch as a library with custom Device objects:
    torch = pytest.importorskip("torch")

    from sklearn.externals.array_api_compat import torch as torch_compat

    some_torch_tensor = torch.arange(3, device="cpu")
    some_numpy_array = numpy.arange(3)

    # When dispatch is disabled, get_namespace_and_device should return the
    # default NumPy wrapper namespace and a None device. Our code will handle such
    # inputs via the usual __array__ interface without attempting to dispatch
    # via the array API.
    namespace, is_array_api, device = get_namespace_and_device(some_torch_tensor)
    assert namespace is get_namespace(some_numpy_array)[0]
    assert not is_array_api
    assert device is None

    # Otherwise, expose the torch namespace and device via array API compat
    # wrapper.
    with config_context(array_api_dispatch=True):
        namespace, is_array_api, device = get_namespace_and_device(some_torch_tensor)
        assert namespace is torch_compat
        assert is_array_api
        assert device == some_torch_tensor.device


@pytest.mark.parametrize(
    "array_namespace, device_, dtype_name",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("axis", [0, 1, None, -1, -2])
@pytest.mark.parametrize("sample_weight_type", [None, "int", "float"])
def test_count_nonzero(
    array_namespace, device_, dtype_name, csr_container, axis, sample_weight_type
):
    from sklearn.utils.sparsefuncs import count_nonzero as sparse_count_nonzero

    xp = _array_api_for_tests(array_namespace, device_)
    array = numpy.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]])
    if sample_weight_type == "int":
        sample_weight = numpy.asarray([1, 2, 2, 3, 1])
    elif sample_weight_type == "float":
        sample_weight = numpy.asarray([0.5, 1.5, 0.8, 3.2, 2.4], dtype=dtype_name)
    else:
        sample_weight = None
    expected = sparse_count_nonzero(
        csr_container(array), axis=axis, sample_weight=sample_weight
    )
    array_xp = xp.asarray(array, device=device_)

    with config_context(array_api_dispatch=True):
        result = _count_nonzero(
            array_xp, axis=axis, sample_weight=sample_weight, xp=xp, device=device_
        )

        assert_allclose(_convert_to_numpy(result, xp=xp), expected)

        if np_version < parse_version("2.0.0") or np_version >= parse_version("2.1.0"):
            # NumPy 2.0 has a problem with the device attribute of scalar arrays:
            # https://github.com/numpy/numpy/issues/26850
            assert device(array_xp) == device(result)


@pytest.mark.parametrize(
    "array, value, match",
    [
        (numpy.array([1, 2, 3]), 1, "`array` should be 2D"),
        (numpy.array([[1, 2], [3, 4]]), numpy.array([1, 2, 3]), "`value` needs to be"),
        (numpy.array([[1, 2], [3, 4]]), [1, 2, 3], "`value` needs to be"),
        (
            numpy.array([[1, 2], [3, 4]]),
            numpy.array([[1, 2], [3, 4]]),
            "`value` needs to be a",
        ),
    ],
)
def test_validate_diagonal_args(array, value, match):
    """Check `_validate_diagonal_args` raises the correct errors."""
    xp = _array_api_for_tests("numpy", None)
    with pytest.raises(ValueError, match=match):
        _validate_diagonal_args(array, value, xp)


@pytest.mark.parametrize("function", ["fill", "add"])
@pytest.mark.parametrize("c_contiguity", [True, False])
def test_fill_and_add_to_diagonal(c_contiguity, function):
    """Check `_fill/add_to_diagonal` behaviour is correct with numpy arrays."""
    xp = _array_api_for_tests("numpy", None)
    if c_contiguity:
        array = numpy.zeros((3, 4))
    else:
        array = numpy.zeros((3, 4)).T
    assert array.flags["C_CONTIGUOUS"] == c_contiguity

    if function == "fill":
        func = _fill_diagonal
    else:
        func = _add_to_diagonal

    func(array, 1, xp)
    assert_allclose(array.diagonal(), numpy.ones((3,)))

    func(array, [0, 1, 2], xp)
    if function == "fill":
        expected_diag = numpy.arange(3)
    else:
        expected_diag = numpy.ones((3,)) + numpy.arange(3)
    assert_allclose(array.diagonal(), expected_diag)

    fill_array = numpy.array([11, 12, 13])
    func(array, fill_array, xp)
    if function == "fill":
        expected_diag = fill_array
    else:
        expected_diag = fill_array + numpy.arange(3) + numpy.ones((3,))
    assert_allclose(array.diagonal(), expected_diag)


@pytest.mark.parametrize("array", ["standard", "transposed", "non-contiguous"])
@pytest.mark.parametrize(
    "array_namespace, device_, dtype_name",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
def test_fill_diagonal(array, array_namespace, device_, dtype_name):
    """Check array API `_fill_diagonal` is consistent with `numpy.fill_diagonal`."""
    xp = _array_api_for_tests(array_namespace, device_)
    array_np = numpy.zeros((4, 5), dtype=dtype_name)

    if array == "transposed":
        array_xp = xp.asarray(array_np.copy(), device=device_).T
        array_np = array_np.T
    elif array == "non-contiguous":
        array_xp = xp.asarray(array_np.copy(), device=device_)[::2, ::2]
        array_np = array_np[::2, ::2]
    else:
        array_xp = xp.asarray(array_np.copy(), device=device_)

    numpy.fill_diagonal(array_np, val=1)
    with config_context(array_api_dispatch=True):
        _fill_diagonal(array_xp, value=1, xp=xp)

    assert_array_equal(_convert_to_numpy(array_xp, xp=xp), array_np)


@pytest.mark.parametrize(
    "array_namespace, device_, dtype_name",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
def test_add_to_diagonal(array_namespace, device_, dtype_name):
    """Check `_add_to_diagonal` is consistent between array API xp and numpy namespace."""
    xp = _array_api_for_tests(array_namespace, device_)
    np_xp = _array_api_for_tests("numpy", None)

    array_np = numpy.zeros((3, 4), dtype=dtype_name)
    array_xp = xp.asarray(array_np.copy(), device=device_)

    add_val = [1, 2, 3]
    _add_to_diagonal(array_np, value=add_val, xp=np_xp)
    with config_context(array_api_dispatch=True):
        _add_to_diagonal(array_xp, value=add_val, xp=xp)

    assert_array_equal(_convert_to_numpy(array_xp, xp=xp), array_np)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("dispatch", [True, False])
def test_sparse_device(csr_container, dispatch):
    np_arr = numpy.array([1])
    # For numpy < 2, the device attribute is not available on numpy arrays
    expected_numpy_array_device = getattr(np_arr, "device", None) if dispatch else None
    a, b = csr_container(numpy.array([[1]])), csr_container(numpy.array([[2]]))
    if dispatch and os.environ.get("SCIPY_ARRAY_API") is None:
        raise SkipTest("SCIPY_ARRAY_API is not set: not checking array_api input")
    with config_context(array_api_dispatch=dispatch):
        assert device(a, b) is None
        assert device(a, np_arr) == expected_numpy_array_device
        assert get_namespace_and_device(a, b)[2] is None
        assert get_namespace_and_device(a, np_arr)[2] == expected_numpy_array_device


@pytest.mark.parametrize(
    "namespace, device, dtype_name",
    yield_namespace_device_dtype_combinations(),
    ids=_get_namespace_device_dtype_ids,
)
@pytest.mark.parametrize("axis", [None, 0, 1])
def test_median(namespace, device, dtype_name, axis):
    # Note: depending on the value of `axis`, this test will compare median
    # computations on arrays of even (4) or odd (5) numbers of elements, hence
    # will test for median computation with and without interpolation to check
    # that array API namespaces yield consistent results even when the median is
    # not mathematically uniquely defined.
    xp = _array_api_for_tests(namespace, device)
    rng = numpy.random.RandomState(0)

    X_np = rng.uniform(low=0.0, high=1.0, size=(5, 4)).astype(dtype_name)
    result_np = numpy.median(X_np, axis=axis)

    X_xp = xp.asarray(X_np, device=device)
    with config_context(array_api_dispatch=True):
        result_xp = _median(X_xp, axis=axis)

        if xp.__name__ != "array_api_strict":
            # We convert array-api-strict arrays to numpy arrays as `median` is not
            # part of the Array API spec
            assert get_namespace(result_xp)[0] == xp
            assert result_xp.device == X_xp.device
        assert_allclose(result_np, _convert_to_numpy(result_xp, xp=xp))


@pytest.mark.parametrize(
    "array_namespace, device_, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize("axis", [0, 1, None])
def test_logsumexp_like_scipy_logsumexp(array_namespace, device_, dtype_name, axis):
    xp = _array_api_for_tests(array_namespace, device_)
    array_np = numpy.asarray(
        [
            [0, 3, 1000],
            [2, -1, 1000],
            [-10, 0, 0],
            [-50, 8, -numpy.inf],
            [4, 0, 5],
        ],
        dtype=dtype_name,
    )
    array_xp = xp.asarray(array_np, device=device_)

    res_np = scipy.special.logsumexp(array_np, axis=axis)

    rtol = 1e-6 if "float32" in str(dtype_name) else 1e-12

    # If torch on CPU or array-api-strict on the default device, check that
    # _logsumexp also works when array API dispatch is disabled.
    if (array_namespace == "torch" and device_ == "cpu") or (
        array_namespace == "array_api_strict" and "CPU" in str(device_)
    ):
        assert_allclose(_logsumexp(array_xp, axis=axis), res_np, rtol=rtol)

    with config_context(array_api_dispatch=True):
        res_xp = _logsumexp(array_xp, axis=axis)
        res_xp = _convert_to_numpy(res_xp, xp)
        assert_allclose(res_np, res_xp, rtol=rtol)

    # Test with NaNs and +np.inf
    array_np_2 = numpy.asarray(
        [
            [0, numpy.nan, 1000],
            [2, -1, 1000],
            [numpy.inf, 0, 0],
            [-50, 8, -numpy.inf],
            [4, 0, 5],
        ],
        dtype=dtype_name,
    )
    array_xp_2 = xp.asarray(array_np_2, device=device_)

    res_np_2 = scipy.special.logsumexp(array_np_2, axis=axis)

    with config_context(array_api_dispatch=True):
        res_xp_2 = _logsumexp(array_xp_2, axis=axis)
        res_xp_2 = _convert_to_numpy(res_xp_2, xp)
        assert_allclose(res_np_2, res_xp_2, rtol=rtol)


@pytest.mark.parametrize(
    ("namespace", "device_", "expected_types"),
    [
        ("numpy", None, ("float64", "float32", "float16")),
        ("array_api_strict", None, ("float64", "float32")),
        ("torch", "cpu", ("float64", "float32", "float16")),
        ("torch", "cuda", ("float64", "float32", "float16")),
        ("torch", "mps", ("float32", "float16")),
    ],
)
def test_supported_float_types(namespace, device_, expected_types):
    xp = _array_api_for_tests(namespace, device_)
    float_types = supported_float_dtypes(xp, device=device_)
    expected = tuple(getattr(xp, dtype_name) for dtype_name in expected_types)
    assert float_types == expected


@pytest.mark.parametrize("use_sample_weight", [False, True])
@pytest.mark.parametrize(
    "namespace, device_, dtype_name", yield_namespace_device_dtype_combinations()
)
def test_half_multinomial_loss(use_sample_weight, namespace, device_, dtype_name):
    """Check that the array API version of :func:`_half_multinomial_loss` works
    correctly and matches the results produced by :class:`HalfMultinomialLoss`
    of the private `_loss` module.
    """
    n_samples = 5
    n_classes = 3
    rng = numpy.random.RandomState(42)
    y = rng.randint(0, n_classes, n_samples).astype(dtype_name)
    pred = rng.rand(n_samples, n_classes).astype(dtype_name)
    xp = _array_api_for_tests(namespace, device_)
    y_xp = xp.asarray(y, device=device_)
    pred_xp = xp.asarray(pred, device=device_)
    if use_sample_weight:
        sample_weight = numpy.ones_like(y)
        sample_weight[1::2] = 2
        sample_weight_xp = xp.asarray(sample_weight, device=device_)
    else:
        sample_weight, sample_weight_xp = None, None

    np_loss = HalfMultinomialLoss(n_classes=n_classes)(
        y_true=y, raw_prediction=pred, sample_weight=sample_weight
    )
    with config_context(array_api_dispatch=True):
        xp_loss = _half_multinomial_loss(
            y=y_xp, pred=pred_xp, sample_weight=sample_weight_xp, xp=xp
        )

    assert numpy.isclose(np_loss, xp_loss)
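All of the tests above enable dispatch through `config_context(array_api_dispatch=True)`; a minimal usage sketch of that same pattern outside the test suite (the array values are illustrative):

    import numpy
    import sklearn
    from sklearn.utils._array_api import get_namespace

    X = numpy.asarray([[1.0, 2.0, 3.0]])
    with sklearn.config_context(array_api_dispatch=True):
        xp, is_compliant = get_namespace(X)  # xp wraps the namespace of X's array type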
@@ -0,0 +1,40 @@
import numpy as np
import pytest

from sklearn.utils._testing import assert_allclose
from sklearn.utils.arrayfuncs import _all_with_any_reduction_axis_1, min_pos


def test_min_pos():
    # Check that min_pos returns a positive value and that it's consistent
    # between float and double
    X = np.random.RandomState(0).randn(100)

    min_double = min_pos(X)
    min_float = min_pos(X.astype(np.float32))

    assert_allclose(min_double, min_float)
    assert min_double >= 0


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_min_pos_no_positive(dtype):
    # Check that the return value of min_pos is the maximum representable
    # value of the input dtype when all input elements are <= 0 (#19328)
    X = np.full(100, -1.0).astype(dtype, copy=False)

    assert min_pos(X) == np.finfo(dtype).max


@pytest.mark.parametrize(
    "dtype", [np.int16, np.int32, np.int64, np.float32, np.float64]
)
@pytest.mark.parametrize("value", [0, 1.5, -1])
def test_all_with_any_reduction_axis_1(dtype, value):
    # Check that return value is False when there is no row equal to `value`
    X = np.arange(12, dtype=dtype).reshape(3, 4)
    assert not _all_with_any_reduction_axis_1(X, value=value)

    # Make a row equal to `value`
    X[1, :] = value
    assert _all_with_any_reduction_axis_1(X, value=value)
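The two `min_pos` tests above pin down its contract; a rough NumPy equivalent (a sketch, not the actual Cython implementation):

    def min_pos_reference(X):
        # smallest strictly positive entry, or the dtype's largest
        # representable value when no entry is positive
        positive = X[X > 0]
        return positive.min() if positive.size else np.finfo(X.dtype).max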
@@ -0,0 +1,32 @@
import warnings

import numpy as np
import pytest

from sklearn.utils import Bunch


def test_bunch_attribute_deprecation():
    """Check that bunch raises deprecation message with `__getattr__`."""
    bunch = Bunch()
    values = np.asarray([1, 2, 3])
    msg = (
        "Key: 'values', is deprecated in 1.3 and will be "
        "removed in 1.5. Please use 'grid_values' instead"
    )
    bunch._set_deprecated(
        values, new_key="grid_values", deprecated_key="values", warning_message=msg
    )

    with warnings.catch_warnings():
        # Does not warn for "grid_values"
        warnings.simplefilter("error")
        v = bunch["grid_values"]

    assert v is values

    with pytest.warns(FutureWarning, match=msg):
        # Warns for "values"
        v = bunch["values"]

    assert v is values
@@ -0,0 +1,73 @@
import warnings
from itertools import chain

import pytest

from sklearn import config_context
from sklearn.utils._chunking import gen_even_slices, get_chunk_n_rows
from sklearn.utils._testing import assert_array_equal


def test_gen_even_slices():
    # check that gen_even_slices contains all samples
    some_range = range(10)
    joined_range = list(chain(*[some_range[slice] for slice in gen_even_slices(10, 3)]))
    assert_array_equal(some_range, joined_range)


@pytest.mark.parametrize(
    ("row_bytes", "max_n_rows", "working_memory", "expected"),
    [
        (1024, None, 1, 1024),
        (1024, None, 0.99999999, 1023),
        (1023, None, 1, 1025),
        (1025, None, 1, 1023),
        (1024, None, 2, 2048),
        (1024, 7, 1, 7),
        (1024 * 1024, None, 1, 1),
    ],
)
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected):
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        actual = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert actual == expected
    assert type(actual) is type(expected)
    with config_context(working_memory=working_memory):
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)


def test_get_chunk_n_rows_warns():
    """Check that warning is raised when working_memory is too low."""
    row_bytes = 1024 * 1024 + 1
    max_n_rows = None
    working_memory = 1
    expected = 1

    warn_msg = (
        "Could not adhere to working_memory config. Currently 1MiB, 2MiB required."
    )
    with pytest.warns(UserWarning, match=warn_msg):
        actual = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert actual == expected
    assert type(actual) is type(expected)

    with config_context(working_memory=working_memory):
        with pytest.warns(UserWarning, match=warn_msg):
            actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)
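The parametrization above is consistent with a simple budgeting rule; a sketch assuming `working_memory` is expressed in MiB:

    def chunk_n_rows_reference(row_bytes, max_n_rows=None, working_memory=1):
        # rows that fit in the working-memory budget, capped by max_n_rows,
        # but never fewer than one row (the warning case tested above)
        n_rows = int(working_memory * (2**20) // row_bytes)
        if max_n_rows is not None:
            n_rows = min(n_rows, max_n_rows)
        return max(n_rows, 1)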
@@ -0,0 +1,334 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose

from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.utils.fixes import CSC_CONTAINERS


def test_compute_class_weight():
    # Test (and demo) compute_class_weight.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)

    cw = compute_class_weight("balanced", classes=classes, y=y)
    # total effect of samples is preserved
    class_counts = np.bincount(y)[2:]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert cw[0] < cw[1] < cw[2]


@pytest.mark.parametrize(
    "y_type, class_weight, classes, err_msg",
    [
        (
            "numeric",
            "balanced",
            np.arange(4),
            "classes should have valid labels that are in y",
        ),
        # Non-regression for https://github.com/scikit-learn/scikit-learn/issues/8312
        (
            "numeric",
            {"label_not_present": 1.0},
            np.arange(4),
            r"The classes, \[0, 1, 2, 3\], are not in class_weight",
        ),
        (
            "numeric",
            "balanced",
            np.arange(2),
            "classes should include all valid labels",
        ),
        (
            "numeric",
            {0: 1.0, 1: 2.0},
            np.arange(2),
            "classes should include all valid labels",
        ),
        (
            "string",
            {"dogs": 3, "cat": 2},
            np.array(["dog", "cat"]),
            r"The classes, \['dog'\], are not in class_weight",
        ),
    ],
)
def test_compute_class_weight_not_present(y_type, class_weight, classes, err_msg):
    # Raise error when y does not contain all class labels
    y = (
        np.asarray([0, 0, 0, 1, 1, 2])
        if y_type == "numeric"
        else np.asarray(["dog", "cat", "dog"])
    )

    print(y)
    with pytest.raises(ValueError, match=err_msg):
        compute_class_weight(class_weight, classes=classes, y=y)


def test_compute_class_weight_dict():
    classes = np.arange(3)
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
    y = np.asarray([0, 0, 1, 2])
    cw = compute_class_weight(class_weights, classes=classes, y=y)

    # When the user specifies class weights, compute_class_weights should just
    # return them.
    assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)

    # When a class weight is specified that isn't in classes, the weight is ignored
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
    cw = compute_class_weight(class_weights, classes=classes, y=y)
    assert_allclose([1.0, 2.0, 3.0], cw)

    class_weights = {-1: 5.0, 0: 4.0, 1: 2.0, 2: 3.0}
    cw = compute_class_weight(class_weights, classes=classes, y=y)
    assert_allclose([4.0, 2.0, 3.0], cw)


def test_compute_class_weight_invariance():
    # Test that results with class_weight="balanced" are invariant wrt
    # class imbalance if the number of samples is identical.
    # The test uses a balanced two class dataset with 100 datapoints.
    # It creates three versions, one where class 1 is duplicated
    # resulting in 150 points of class 1 and 50 of class 0,
    # one where there are 50 points in class 1 and 150 in class 0,
    # and one where there are 100 points of each class (this one is balanced
    # again).
    # With balancing class weights, all three should give the same model.
    X, y = make_blobs(centers=2, random_state=0)
    # create dataset where class 1 is duplicated twice
    X_1 = np.vstack([X] + [X[y == 1]] * 2)
    y_1 = np.hstack([y] + [y[y == 1]] * 2)
    # create dataset where class 0 is duplicated twice
    X_0 = np.vstack([X] + [X[y == 0]] * 2)
    y_0 = np.hstack([y] + [y[y == 0]] * 2)
    # duplicate everything
    X_ = np.vstack([X] * 2)
    y_ = np.hstack([y] * 2)
    # results should be identical
    logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
    logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
    logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
    assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
    assert_array_almost_equal(logreg.coef_, logreg0.coef_)


def test_compute_class_weight_balanced_negative():
    # Test compute_class_weight when labels are negative
    # Test with balanced class labels.
    classes = np.array([-2, -1, 0])
    y = np.asarray([-1, -1, 0, 0, -2, -2])

    cw = compute_class_weight("balanced", classes=classes, y=y)
    assert len(cw) == len(classes)
    assert_array_almost_equal(cw, np.array([1.0, 1.0, 1.0]))


def test_compute_class_weight_balanced_sample_weight_equivalence():
    # Test with unbalanced and negative class labels for
    # equivalence between repeated and weighted samples

    classes = np.array([-2, -1, 0])
    y = np.asarray([-1, -1, 0, 0, -2, -2])
    sw = np.asarray([1, 0, 1, 1, 1, 2])

    y_rep = np.repeat(y, sw, axis=0)

    class_weights_weighted = compute_class_weight(
        "balanced", classes=classes, y=y, sample_weight=sw
    )
    class_weights_repeated = compute_class_weight("balanced", classes=classes, y=y_rep)
    assert len(class_weights_weighted) == len(classes)
    assert len(class_weights_repeated) == len(classes)

    class_counts_weighted = np.bincount(y + 2, weights=sw)
    class_counts_repeated = np.bincount(y_rep + 2)

    assert np.dot(class_weights_weighted, class_counts_weighted) == pytest.approx(
        np.dot(class_weights_repeated, class_counts_repeated)
    )

    assert_allclose(class_weights_weighted, class_weights_repeated)


def test_compute_class_weight_balanced_unordered():
    # Test compute_class_weight when classes are unordered
    classes = np.array([1, 0, 3])
    y = np.asarray([1, 0, 0, 3, 3, 3])

    cw = compute_class_weight("balanced", classes=classes, y=y)
    class_counts = np.bincount(y)[classes]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2.0, 1.0, 2.0 / 3])


def test_compute_class_weight_default():
    # Test for the case where no weight is given for a present class.
    # Current behaviour is to assign the unweighted classes a weight of 1.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)
    classes_len = len(classes)

    # Test for non specified weights
    cw = compute_class_weight(None, classes=classes, y=y)
    assert len(cw) == classes_len
    assert_array_almost_equal(cw, np.ones(3))

    # Tests for partly specified weights
    cw = compute_class_weight({2: 1.5}, classes=classes, y=y)
    assert len(cw) == classes_len
    assert_array_almost_equal(cw, [1.5, 1.0, 1.0])

    cw = compute_class_weight({2: 1.5, 4: 0.5}, classes=classes, y=y)
    assert len(cw) == classes_len
    assert_array_almost_equal(cw, [1.5, 1.0, 0.5])


def test_compute_sample_weight():
    # Test (and demo) compute_sample_weight.
    # Test with balanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with user-defined weights
    sample_weight = compute_sample_weight({1: 2, 2: 1}, y)
    assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 1.0, 1.0, 1.0])

    # Test with column vector of balanced classes
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with unbalanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    sample_weight = compute_sample_weight("balanced", y)
    expected_balanced = np.array(
        [0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 2.3333]
    )
    assert_array_almost_equal(sample_weight, expected_balanced, decimal=4)

    # Test with `None` weights
    sample_weight = compute_sample_weight(None, y)
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with multi-output of balanced classes
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with multi-output with user-defined weights
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    sample_weight = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
    assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 2.0, 2.0, 2.0])

    # Test with multi-output of unbalanced classes
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, expected_balanced**2, decimal=3)


def test_compute_sample_weight_with_subsample():
    # Test compute_sample_weight with subsamples specified.
    # Test with balanced classes and all samples present
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with column vector of balanced classes and all samples present
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with a subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y, indices=range(4))
    assert_array_almost_equal(sample_weight, [2.0 / 3, 2.0 / 3, 2.0 / 3, 2.0, 2.0, 2.0])

    # Test with a bootstrap subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
    expected_balanced = np.asarray([0.6, 0.6, 0.6, 3.0, 3.0, 3.0])
    assert_array_almost_equal(sample_weight, expected_balanced)

    # Test with a bootstrap subsample for multi-output
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
    assert_array_almost_equal(sample_weight, expected_balanced**2)

    # Test with a missing class
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])

    # Test with a missing class for multi-output
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])


@pytest.mark.parametrize(
    "y_type, class_weight, indices, err_msg",
    [
        (
            "single-output",
            {1: 2, 2: 1},
            range(4),
            "The only valid class_weight for subsampling is 'balanced'.",
        ),
        (
            "multi-output",
            {1: 2, 2: 1},
            None,
            "For multi-output, class_weight should be a list of dicts, or the string",
        ),
        (
            "multi-output",
            [{1: 2, 2: 1}],
            None,
            r"Got 1 element\(s\) while having 2 outputs",
        ),
    ],
)
def test_compute_sample_weight_errors(y_type, class_weight, indices, err_msg):
    # Test that compute_sample_weight raises the expected errors.
    # Invalid preset string
    y_single_output = np.asarray([1, 1, 1, 2, 2, 2])
    y_multi_output = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])

    y = y_single_output if y_type == "single-output" else y_multi_output
    with pytest.raises(ValueError, match=err_msg):
        compute_sample_weight(class_weight, y, indices=indices)


def test_compute_sample_weight_more_than_32():
    # Non-regression smoke test for #12146
    y = np.arange(50)  # more than 32 distinct classes
    indices = np.arange(50)  # use subsampling
    weight = compute_sample_weight("balanced", y, indices=indices)
    assert_array_almost_equal(weight, np.ones(y.shape[0]))


def test_class_weight_does_not_contains_more_classes():
    """Check that class_weight can contain more labels than in y.

    Non-regression test for #22413
    """
    tree = DecisionTreeClassifier(class_weight={0: 1, 1: 10, 2: 20})

    # Does not raise
    tree.fit([[0, 0, 1], [1, 0, 1], [1, 2, 0]], [0, 0, 1])


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_compute_sample_weight_sparse(csc_container):
    """Check that we can compute weight for sparse `y`."""
    y = csc_container(np.asarray([[0], [1], [1]]))
    sample_weight = compute_sample_weight("balanced", y)
    assert_allclose(sample_weight, [1.5, 0.75, 0.75])
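The "balanced" heuristic exercised throughout these tests corresponds to the documented formula n_samples / (n_classes * np.bincount(y)); checking it against the data of the first test above:

    y = np.asarray([2, 2, 2, 3, 3, 4])
    counts = np.bincount(y)[2:]           # [3, 2, 1]
    balanced = len(y) / (3 * counts)      # [2/3, 1.0, 2.0], larger for rarer classes
    assert np.isclose(np.dot(balanced, counts), len(y))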
@@ -0,0 +1,250 @@
import numpy as np
import pytest

from sklearn.utils._cython_blas import (
    BLAS_Order,
    BLAS_Trans,
    _asum_memview,
    _axpy_memview,
    _copy_memview,
    _dot_memview,
    _gemm_memview,
    _gemv_memview,
    _ger_memview,
    _nrm2_memview,
    _rot_memview,
    _rotg_memview,
    _scal_memview,
)
from sklearn.utils._testing import assert_allclose


def _numpy_to_cython(dtype):
    cython = pytest.importorskip("cython")
    if dtype == np.float32:
        return cython.float
    elif dtype == np.float64:
        return cython.double


RTOL = {np.float32: 1e-6, np.float64: 1e-12}
ORDER = {BLAS_Order.RowMajor: "C", BLAS_Order.ColMajor: "F"}


def _no_op(x):
    return x


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_dot(dtype):
    dot = _dot_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)

    expected = x.dot(y)
    actual = dot(x, y)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_asum(dtype):
    asum = _asum_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)

    expected = np.abs(x).sum()
    actual = asum(x)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
def test_axpy(dtype):
|
||||
axpy = _axpy_memview[_numpy_to_cython(dtype)]
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
x = rng.random_sample(10).astype(dtype, copy=False)
|
||||
y = rng.random_sample(10).astype(dtype, copy=False)
|
||||
alpha = 2.5
|
||||
|
||||
expected = alpha * x + y
|
||||
axpy(alpha, x, y)
|
||||
|
||||
assert_allclose(y, expected, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
def test_nrm2(dtype):
|
||||
nrm2 = _nrm2_memview[_numpy_to_cython(dtype)]
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
x = rng.random_sample(10).astype(dtype, copy=False)
|
||||
|
||||
expected = np.linalg.norm(x)
|
||||
actual = nrm2(x)
|
||||
|
||||
assert_allclose(actual, expected, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
def test_copy(dtype):
|
||||
copy = _copy_memview[_numpy_to_cython(dtype)]
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
x = rng.random_sample(10).astype(dtype, copy=False)
|
||||
y = np.empty_like(x)
|
||||
|
||||
expected = x.copy()
|
||||
copy(x, y)
|
||||
|
||||
assert_allclose(y, expected, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
def test_scal(dtype):
|
||||
scal = _scal_memview[_numpy_to_cython(dtype)]
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
x = rng.random_sample(10).astype(dtype, copy=False)
|
||||
alpha = 2.5
|
||||
|
||||
expected = alpha * x
|
||||
scal(alpha, x)
|
||||
|
||||
assert_allclose(x, expected, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
def test_rotg(dtype):
|
||||
rotg = _rotg_memview[_numpy_to_cython(dtype)]
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
a = dtype(rng.randn())
|
||||
b = dtype(rng.randn())
|
||||
c, s = 0.0, 0.0
|
||||
|
||||
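# Pure-Python reference of the BLAS ?rotg Givens rotation generation,
# used to check the Cython wrapper below.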
def expected_rotg(a, b):
|
||||
roe = a if abs(a) > abs(b) else b
|
||||
if a == 0 and b == 0:
|
||||
c, s, r, z = (1, 0, 0, 0)
|
||||
else:
|
||||
r = np.sqrt(a**2 + b**2) * (1 if roe >= 0 else -1)
|
||||
c, s = a / r, b / r
|
||||
z = s if roe == a else (1 if c == 0 else 1 / c)
|
||||
return r, z, c, s
|
||||
|
||||
expected = expected_rotg(a, b)
|
||||
actual = rotg(a, b, c, s)
|
||||
|
||||
assert_allclose(actual, expected, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
def test_rot(dtype):
|
||||
rot = _rot_memview[_numpy_to_cython(dtype)]
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
x = rng.random_sample(10).astype(dtype, copy=False)
|
||||
y = rng.random_sample(10).astype(dtype, copy=False)
|
||||
c = dtype(rng.randn())
|
||||
s = dtype(rng.randn())
|
||||
|
||||
expected_x = c * x + s * y
|
||||
expected_y = c * y - s * x
|
||||
|
||||
rot(x, y, c, s)
|
||||
|
||||
assert_allclose(x, expected_x)
|
||||
assert_allclose(y, expected_y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
@pytest.mark.parametrize(
|
||||
"opA, transA",
|
||||
[(_no_op, BLAS_Trans.NoTrans), (np.transpose, BLAS_Trans.Trans)],
|
||||
ids=["NoTrans", "Trans"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[BLAS_Order.RowMajor, BLAS_Order.ColMajor],
|
||||
ids=["RowMajor", "ColMajor"],
|
||||
)
|
||||
def test_gemv(dtype, opA, transA, order):
|
||||
gemv = _gemv_memview[_numpy_to_cython(dtype)]
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
A = np.asarray(
|
||||
opA(rng.random_sample((20, 10)).astype(dtype, copy=False)), order=ORDER[order]
|
||||
)
|
||||
x = rng.random_sample(10).astype(dtype, copy=False)
|
||||
y = rng.random_sample(20).astype(dtype, copy=False)
|
||||
alpha, beta = 2.5, -0.5
|
||||
|
||||
expected = alpha * opA(A).dot(x) + beta * y
|
||||
gemv(transA, alpha, A, x, beta, y)
|
||||
|
||||
assert_allclose(y, expected, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[BLAS_Order.RowMajor, BLAS_Order.ColMajor],
|
||||
ids=["BLAS_Order.RowMajor", "BLAS_Order.ColMajor"],
|
||||
)
|
||||
def test_ger(dtype, order):
|
||||
ger = _ger_memview[_numpy_to_cython(dtype)]
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
x = rng.random_sample(10).astype(dtype, copy=False)
|
||||
y = rng.random_sample(20).astype(dtype, copy=False)
|
||||
A = np.asarray(
|
||||
rng.random_sample((10, 20)).astype(dtype, copy=False), order=ORDER[order]
|
||||
)
|
||||
alpha = 2.5
|
||||
|
||||
expected = alpha * np.outer(x, y) + A
|
||||
ger(alpha, x, y, A)
|
||||
|
||||
assert_allclose(A, expected, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
@pytest.mark.parametrize(
|
||||
"opB, transB",
|
||||
[(_no_op, BLAS_Trans.NoTrans), (np.transpose, BLAS_Trans.Trans)],
|
||||
ids=["NoTrans", "Trans"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"opA, transA",
|
||||
[(_no_op, BLAS_Trans.NoTrans), (np.transpose, BLAS_Trans.Trans)],
|
||||
ids=["NoTrans", "Trans"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[BLAS_Order.RowMajor, BLAS_Order.ColMajor],
|
||||
ids=["BLAS_Order.RowMajor", "BLAS_Order.ColMajor"],
|
||||
)
|
||||
def test_gemm(dtype, opA, transA, opB, transB, order):
|
||||
gemm = _gemm_memview[_numpy_to_cython(dtype)]
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
A = np.asarray(
|
||||
opA(rng.random_sample((30, 10)).astype(dtype, copy=False)), order=ORDER[order]
|
||||
)
|
||||
B = np.asarray(
|
||||
opB(rng.random_sample((10, 20)).astype(dtype, copy=False)), order=ORDER[order]
|
||||
)
|
||||
C = np.asarray(
|
||||
rng.random_sample((30, 20)).astype(dtype, copy=False), order=ORDER[order]
|
||||
)
|
||||
alpha, beta = 2.5, -0.5
|
||||
|
||||
expected = alpha * opA(A).dot(opB(B)) + beta * C
|
||||
gemm(transA, transB, alpha, A, B, beta, C)
|
||||
|
||||
assert_allclose(C, expected, rtol=RTOL[dtype])
|
||||
@@ -0,0 +1,84 @@
|
||||
"""Tests for dataframe detection functions."""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn._min_dependencies import dependent_packages
|
||||
from sklearn.utils._dataframe import is_df_or_series, is_pandas_df, is_polars_df
|
||||
from sklearn.utils._testing import _convert_container
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["pyarrow", "dataframe", "polars"])
|
||||
def test_is_df_or_series(constructor_name):
|
||||
df = _convert_container([[1, 4, 2], [3, 3, 6]], constructor_name)
|
||||
|
||||
assert is_df_or_series(df)
|
||||
assert not is_df_or_series(np.asarray([1, 2, 3]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["pyarrow", "dataframe", "polars"])
|
||||
def test_is_pandas_df_other_libraries(constructor_name):
|
||||
df = _convert_container([[1, 4, 2], [3, 3, 6]], constructor_name)
|
||||
if constructor_name in ("pyarrow", "polars"):
|
||||
assert not is_pandas_df(df)
|
||||
else:
|
||||
assert is_pandas_df(df)
|
||||
|
||||
|
||||
def test_is_pandas_df():
|
||||
"""Check behavior of is_pandas_df when pandas is installed."""
|
||||
pd = pytest.importorskip("pandas")
|
||||
df = pd.DataFrame([[1, 2, 3]])
|
||||
assert is_pandas_df(df)
|
||||
assert not is_pandas_df(np.asarray([1, 2, 3]))
|
||||
assert not is_pandas_df(1)
|
||||
|
||||
|
||||
def test_is_pandas_df_pandas_not_installed(hide_available_pandas):
|
||||
"""Check is_pandas_df when pandas is not installed."""
|
||||
|
||||
assert not is_pandas_df(np.asarray([1, 2, 3]))
|
||||
assert not is_pandas_df(1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"constructor_name, minversion",
|
||||
[
|
||||
("pyarrow", dependent_packages["pyarrow"][0]),
|
||||
("dataframe", dependent_packages["pandas"][0]),
|
||||
("polars", dependent_packages["polars"][0]),
|
||||
],
|
||||
)
|
||||
def test_is_polars_df_other_libraries(constructor_name, minversion):
|
||||
df = _convert_container(
|
||||
[[1, 4, 2], [3, 3, 6]],
|
||||
constructor_name,
|
||||
minversion=minversion,
|
||||
)
|
||||
if constructor_name in ("pyarrow", "dataframe"):
|
||||
assert not is_polars_df(df)
|
||||
else:
|
||||
assert is_polars_df(df)
|
||||
|
||||
|
||||
def test_is_polars_df_for_duck_typed_polars_dataframe():
|
||||
"""Check is_polars_df for object that looks like a polars dataframe"""
|
||||
|
||||
class NotAPolarsDataFrame:
|
||||
def __init__(self):
|
||||
self.columns = [1, 2, 3]
|
||||
self.schema = "my_schema"
|
||||
|
||||
not_a_polars_df = NotAPolarsDataFrame()
|
||||
assert not is_polars_df(not_a_polars_df)
|
||||
|
||||
|
||||
def test_is_polars_df():
|
||||
"""Check that is_polars_df return False for non-dataframe objects."""
|
||||
|
||||
class LooksLikePolars:
|
||||
def __init__(self):
|
||||
self.columns = ["a", "b"]
|
||||
self.schema = ["a", "b"]
|
||||
|
||||
assert not is_polars_df(LooksLikePolars())
|
||||
@@ -0,0 +1,98 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
|
||||
import pickle
|
||||
from inspect import signature
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.utils.deprecation import _is_deprecated, deprecated
|
||||
|
||||
|
||||
@deprecated("qwerty")
|
||||
class MockClass1:
|
||||
pass
|
||||
|
||||
|
||||
class MockClass2:
|
||||
@deprecated("mockclass2_method")
|
||||
def method(self):
|
||||
pass
|
||||
|
||||
@deprecated("n_features_ is deprecated") # type: ignore[prop-decorator]
|
||||
@property
|
||||
def n_features_(self):
|
||||
"""Number of input features."""
|
||||
return 10
|
||||
|
||||
|
||||
class MockClass3:
|
||||
@deprecated()
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
|
||||
class MockClass4:
|
||||
pass
|
||||
|
||||
|
||||
class MockClass5(MockClass1):
|
||||
"""Inherit from deprecated class but does not call super().__init__."""
|
||||
|
||||
def __init__(self, a):
|
||||
self.a = a
|
||||
|
||||
|
||||
@deprecated("a message")
|
||||
class MockClass6:
|
||||
"""A deprecated class that overrides __new__."""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
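# check that the deprecation wrapper forwards positional arguments
# to the original __new__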
|
||||
assert len(args) > 0
|
||||
return super().__new__(cls)
|
||||
|
||||
|
||||
@deprecated()
|
||||
def mock_function():
|
||||
return 10
|
||||
|
||||
|
||||
def test_deprecated():
|
||||
with pytest.warns(FutureWarning, match="qwerty"):
|
||||
MockClass1()
|
||||
with pytest.warns(FutureWarning, match="mockclass2_method"):
|
||||
MockClass2().method()
|
||||
with pytest.warns(FutureWarning, match="deprecated"):
|
||||
MockClass3()
|
||||
with pytest.warns(FutureWarning, match="qwerty"):
|
||||
MockClass5(42)
|
||||
with pytest.warns(FutureWarning, match="a message"):
|
||||
MockClass6(42)
|
||||
with pytest.warns(FutureWarning, match="deprecated"):
|
||||
val = mock_function()
|
||||
assert val == 10
|
||||
|
||||
|
||||
def test_is_deprecated():
|
||||
# Test that the _is_deprecated helper identifies wrapping via deprecated.
# NOTE: it only works for class methods and functions.
|
||||
assert _is_deprecated(MockClass1.__new__)
|
||||
assert _is_deprecated(MockClass2().method)
|
||||
assert _is_deprecated(MockClass3.__init__)
|
||||
assert not _is_deprecated(MockClass4.__init__)
|
||||
assert _is_deprecated(MockClass5.__new__)
|
||||
assert _is_deprecated(mock_function)
|
||||
|
||||
|
||||
def test_pickle():
|
||||
pickle.loads(pickle.dumps(mock_function))
|
||||
|
||||
|
||||
def test_deprecated_class_signature():
|
||||
@deprecated()
|
||||
class MockClass:
|
||||
def __init__(self, a, b=1, c=2):
|
||||
pass
|
||||
|
||||
assert list(signature(MockClass).parameters.keys()) == ["a", "b", "c"]
|
||||
@@ -0,0 +1,274 @@
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, expected",
|
||||
[
|
||||
(np.array([2, 1, 3, 1, 3], dtype="int64"), np.array([1, 2, 3], dtype="int64")),
|
||||
(
|
||||
np.array([2, 1, np.nan, 1, np.nan], dtype="float32"),
|
||||
np.array([1, 2, np.nan], dtype="float32"),
|
||||
),
|
||||
(
|
||||
np.array(["b", "a", "c", "a", "c"], dtype=object),
|
||||
np.array(["a", "b", "c"], dtype=object),
|
||||
),
|
||||
(
|
||||
np.array(["b", "a", None, "a", None], dtype=object),
|
||||
np.array(["a", "b", None], dtype=object),
|
||||
),
|
||||
(np.array(["b", "a", "c", "a", "c"]), np.array(["a", "b", "c"])),
|
||||
],
|
||||
ids=["int64", "float32-nan", "object", "object-None", "str"],
|
||||
)
|
||||
def test_encode_util(values, expected):
|
||||
uniques = _unique(values)
|
||||
assert_array_equal(uniques, expected)
|
||||
|
||||
result, encoded = _unique(values, return_inverse=True)
|
||||
assert_array_equal(result, expected)
|
||||
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
|
||||
|
||||
encoded = _encode(values, uniques=uniques)
|
||||
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
|
||||
|
||||
result, counts = _unique(values, return_counts=True)
|
||||
assert_array_equal(result, expected)
|
||||
assert_array_equal(counts, np.array([2, 1, 2]))
|
||||
|
||||
result, encoded, counts = _unique(values, return_inverse=True, return_counts=True)
|
||||
assert_array_equal(result, expected)
|
||||
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
|
||||
assert_array_equal(counts, np.array([2, 1, 2]))
|
||||
|
||||
|
||||
def test_encode_with_check_unknown():
|
||||
# test for the check_unknown parameter of _encode()
|
||||
uniques = np.array([1, 2, 3])
|
||||
values = np.array([1, 2, 3, 4])
|
||||
|
||||
# Default is True, raise error
|
||||
with pytest.raises(ValueError, match="y contains previously unseen labels"):
|
||||
_encode(values, uniques=uniques, check_unknown=True)
|
||||
|
||||
# don't raise an error if False
|
||||
_encode(values, uniques=uniques, check_unknown=False)
|
||||
|
||||
# parameter is ignored for object dtype
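# (for object dtype, _encode always detects unseen values, so it raises anyway)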
|
||||
uniques = np.array(["a", "b", "c"], dtype=object)
|
||||
values = np.array(["a", "b", "c", "d"], dtype=object)
|
||||
with pytest.raises(ValueError, match="y contains previously unseen labels"):
|
||||
_encode(values, uniques=uniques, check_unknown=False)
|
||||
|
||||
|
||||
def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
|
||||
diff = _check_unknown(values, uniques)
|
||||
assert_array_equal(diff, expected_diff)
|
||||
|
||||
diff, valid_mask = _check_unknown(values, uniques, return_mask=True)
|
||||
assert_array_equal(diff, expected_diff)
|
||||
assert_array_equal(valid_mask, expected_mask)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, uniques, expected_diff, expected_mask",
|
||||
[
|
||||
(np.array([1, 2, 3, 4]), np.array([1, 2, 3]), [4], [True, True, True, False]),
|
||||
(np.array([2, 1, 4, 5]), np.array([2, 5, 1]), [4], [True, True, False, True]),
|
||||
(np.array([2, 1, np.nan]), np.array([2, 5, 1]), [np.nan], [True, True, False]),
|
||||
(
|
||||
np.array([2, 1, 4, np.nan]),
|
||||
np.array([2, 5, 1, np.nan]),
|
||||
[4],
|
||||
[True, True, False, True],
|
||||
),
|
||||
(
|
||||
np.array([2, 1, 4, np.nan]),
|
||||
np.array([2, 5, 1]),
|
||||
[4, np.nan],
|
||||
[True, True, False, False],
|
||||
),
|
||||
(
|
||||
np.array([2, 1, 4, 5]),
|
||||
np.array([2, 5, 1, np.nan]),
|
||||
[4],
|
||||
[True, True, False, True],
|
||||
),
|
||||
(
|
||||
np.array(["a", "b", "c", "d"], dtype=object),
|
||||
np.array(["a", "b", "c"], dtype=object),
|
||||
np.array(["d"], dtype=object),
|
||||
[True, True, True, False],
|
||||
),
|
||||
(
|
||||
np.array(["d", "c", "a", "b"], dtype=object),
|
||||
np.array(["a", "c", "b"], dtype=object),
|
||||
np.array(["d"], dtype=object),
|
||||
[False, True, True, True],
|
||||
),
|
||||
(
|
||||
np.array(["a", "b", "c", "d"]),
|
||||
np.array(["a", "b", "c"]),
|
||||
np.array(["d"]),
|
||||
[True, True, True, False],
|
||||
),
|
||||
(
|
||||
np.array(["d", "c", "a", "b"]),
|
||||
np.array(["a", "c", "b"]),
|
||||
np.array(["d"]),
|
||||
[False, True, True, True],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_check_unknown(values, uniques, expected_diff, expected_mask):
|
||||
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("missing_value", [None, np.nan, float("nan")])
|
||||
@pytest.mark.parametrize("pickle_uniques", [True, False])
|
||||
def test_check_unknown_missing_values(missing_value, pickle_uniques):
|
||||
# check for check_unknown with missing values with object dtypes
|
||||
values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
|
||||
uniques = np.array(["c", "a", "b", missing_value], dtype=object)
|
||||
if pickle_uniques:
|
||||
uniques = pickle.loads(pickle.dumps(uniques))
|
||||
|
||||
expected_diff = ["d"]
|
||||
expected_mask = [False, True, True, True, True]
|
||||
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
|
||||
|
||||
values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
|
||||
uniques = np.array(["c", "a", "b"], dtype=object)
|
||||
if pickle_uniques:
|
||||
uniques = pickle.loads(pickle.dumps(uniques))
|
||||
|
||||
expected_diff = ["d", missing_value]
|
||||
|
||||
expected_mask = [False, True, True, True, False]
|
||||
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
|
||||
|
||||
values = np.array(["a", missing_value], dtype=object)
|
||||
uniques = np.array(["a", "b", "z"], dtype=object)
|
||||
if pickle_uniques:
|
||||
uniques = pickle.loads(pickle.dumps(uniques))
|
||||
|
||||
expected_diff = [missing_value]
|
||||
expected_mask = [True, False]
|
||||
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
|
||||
@pytest.mark.parametrize("pickle_uniques", [True, False])
|
||||
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
|
||||
# check for _unique and _encode with missing values with object dtypes
|
||||
values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
|
||||
expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)
|
||||
|
||||
uniques = _unique(values)
|
||||
|
||||
if missing_value is None:
|
||||
assert_array_equal(uniques, expected_uniques)
|
||||
else: # missing_value == np.nan
|
||||
assert_array_equal(uniques[:-1], expected_uniques[:-1])
|
||||
assert np.isnan(uniques[-1])
|
||||
|
||||
if pickle_uniques:
|
||||
uniques = pickle.loads(pickle.dumps(uniques))
|
||||
|
||||
encoded = _encode(values, uniques=uniques)
|
||||
assert_array_equal(encoded, np.array([0, 2, 2, 3, 1]))
|
||||
|
||||
|
||||
def test_unique_util_missing_values_numeric():
|
||||
# Check missing values in numerical values
|
||||
values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
|
||||
expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
|
||||
expected_inverse = np.array([1, 0, 3, 2, 1, 3])
|
||||
|
||||
uniques = _unique(values)
|
||||
assert_array_equal(uniques, expected_uniques)
|
||||
|
||||
uniques, inverse = _unique(values, return_inverse=True)
|
||||
assert_array_equal(uniques, expected_uniques)
|
||||
assert_array_equal(inverse, expected_inverse)
|
||||
|
||||
encoded = _encode(values, uniques=uniques)
|
||||
assert_array_equal(encoded, expected_inverse)
|
||||
|
||||
|
||||
def test_unique_util_with_all_missing_values():
|
||||
# test for all types of missing values for object dtype
|
||||
values = np.array([np.nan, "a", "c", "c", None, float("nan"), None], dtype=object)
|
||||
|
||||
uniques = _unique(values)
|
||||
assert_array_equal(uniques[:-1], ["a", "c", None])
|
||||
# last value is nan
|
||||
assert np.isnan(uniques[-1])
|
||||
|
||||
expected_inverse = [3, 0, 1, 1, 2, 3, 2]
|
||||
_, inverse = _unique(values, return_inverse=True)
|
||||
assert_array_equal(inverse, expected_inverse)
|
||||
|
||||
|
||||
def test_check_unknown_with_both_missing_values():
|
||||
# test for both types of missing values for object dtype
|
||||
values = np.array([np.nan, "a", "c", "c", None, np.nan, None], dtype=object)
|
||||
|
||||
diff = _check_unknown(values, known_values=np.array(["a", "c"], dtype=object))
|
||||
assert diff[0] is None
|
||||
assert np.isnan(diff[1])
|
||||
|
||||
diff, valid_mask = _check_unknown(
|
||||
values, known_values=np.array(["a", "c"], dtype=object), return_mask=True
|
||||
)
|
||||
|
||||
assert diff[0] is None
|
||||
assert np.isnan(diff[1])
|
||||
assert_array_equal(valid_mask, [False, True, True, True, False, False, False])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, uniques, expected_counts",
|
||||
[
|
||||
(np.array([1] * 10 + [2] * 4 + [3] * 15), np.array([1, 2, 3]), [10, 4, 15]),
|
||||
(
|
||||
np.array([1] * 10 + [2] * 4 + [3] * 15),
|
||||
np.array([1, 2, 3, 5]),
|
||||
[10, 4, 15, 0],
|
||||
),
|
||||
(
|
||||
np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
|
||||
np.array([2, 3, np.nan]),
|
||||
[4, 15, 10],
|
||||
),
|
||||
(
|
||||
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
|
||||
["a", "b", "c"],
|
||||
[16, 4, 20],
|
||||
),
|
||||
(
|
||||
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
|
||||
["c", "b", "a"],
|
||||
[20, 4, 16],
|
||||
),
|
||||
(
|
||||
np.array([np.nan] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
|
||||
["c", np.nan, "a"],
|
||||
[20, 4, 16],
|
||||
),
|
||||
(
|
||||
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
|
||||
["a", "b", "c", "e"],
|
||||
[16, 4, 20, 0],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_get_counts(values, uniques, expected_counts):
|
||||
counts = _get_counts(values, uniques)
|
||||
assert_array_equal(counts, expected_counts)
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,47 @@
|
||||
"""Test fast_dict."""
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose, assert_array_equal
|
||||
|
||||
from sklearn.utils._fast_dict import IntFloatDict, argmin
|
||||
|
||||
|
||||
def test_int_float_dict():
|
||||
rng = np.random.RandomState(0)
|
||||
keys = np.unique(rng.randint(100, size=10).astype(np.intp))
|
||||
values = rng.rand(len(keys))
|
||||
|
||||
d = IntFloatDict(keys, values)
|
||||
for key, value in zip(keys, values):
|
||||
assert d[key] == value
|
||||
assert len(d) == len(keys)
|
||||
|
||||
d.append(120, 3.0)
|
||||
assert d[120] == 3.0
|
||||
assert len(d) == len(keys) + 1
|
||||
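# append many more entries to exercise growth of the underlying map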
for i in range(2000):
|
||||
d.append(i + 1000, 4.0)
|
||||
assert d[1100] == 4.0
|
||||
|
||||
|
||||
def test_int_float_dict_argmin():
|
||||
# Test the argmin implementation on the IntFloatDict
|
||||
keys = np.arange(100, dtype=np.intp)
|
||||
values = np.arange(100, dtype=np.float64)
|
||||
d = IntFloatDict(keys, values)
|
||||
assert argmin(d) == (0, 0)
|
||||
|
||||
|
||||
def test_to_arrays():
|
||||
# Test that an IntFloatDict is converted into arrays
|
||||
# of keys and values correctly
|
||||
keys_in = np.array([1, 2, 3], dtype=np.intp)
|
||||
values_in = np.array([4, 5, 6], dtype=np.float64)
|
||||
|
||||
d = IntFloatDict(keys_in, values_in)
|
||||
keys_out, values_out = d.to_arrays()
|
||||
|
||||
assert keys_out.dtype == keys_in.dtype
|
||||
assert values_in.dtype == values_out.dtype
|
||||
assert_array_equal(keys_out, keys_in)
|
||||
assert_allclose(values_out, values_in)
|
||||
@@ -0,0 +1,160 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils.fixes import _object_dtype_isnan, _smallest_admissible_index_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype, val", ([object, 1], [object, "a"], [float, 1]))
|
||||
def test_object_dtype_isnan(dtype, val):
|
||||
X = np.array([[val, np.nan], [np.nan, val]], dtype=dtype)
|
||||
|
||||
expected_mask = np.array([[False, True], [True, False]])
|
||||
|
||||
mask = _object_dtype_isnan(X)
|
||||
|
||||
assert_array_equal(mask, expected_mask)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, expected_dtype",
|
||||
[
|
||||
({}, np.int32), # default behaviour
|
||||
({"maxval": np.iinfo(np.int32).max}, np.int32),
|
||||
({"maxval": np.iinfo(np.int32).max + 1}, np.int64),
|
||||
],
|
||||
)
|
||||
def test_smallest_admissible_index_dtype_max_val(params, expected_dtype):
|
||||
"""Check the behaviour of `smallest_admissible_index_dtype` depending only on the
|
||||
`maxval` parameter.
|
||||
"""
|
||||
assert _smallest_admissible_index_dtype(**params) == expected_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, expected_dtype",
|
||||
[
|
||||
# Arrays dtype is int64 and thus should not be downcasted to int32 without
# checking the contents or providing maxval.
|
||||
({"arrays": np.array([1, 2], dtype=np.int64)}, np.int64),
|
||||
# One of the array is int64 and should not be downcasted to int32
|
||||
# for the same reasons.
|
||||
(
|
||||
{
|
||||
"arrays": (
|
||||
np.array([1, 2], dtype=np.int32),
|
||||
np.array([1, 2], dtype=np.int64),
|
||||
)
|
||||
},
|
||||
np.int64,
|
||||
),
|
||||
# Both arrays are already int32: we can just keep this dtype.
|
||||
(
|
||||
{
|
||||
"arrays": (
|
||||
np.array([1, 2], dtype=np.int32),
|
||||
np.array([1, 2], dtype=np.int32),
|
||||
)
|
||||
},
|
||||
np.int32,
|
||||
),
|
||||
# Arrays should be upcasted to at least int32 precision.
|
||||
({"arrays": np.array([1, 2], dtype=np.int8)}, np.int32),
|
||||
# Check that `maxval` takes precedence over the arrays and thus upcast to
|
||||
# int64.
|
||||
(
|
||||
{
|
||||
"arrays": np.array([1, 2], dtype=np.int32),
|
||||
"maxval": np.iinfo(np.int32).max + 1,
|
||||
},
|
||||
np.int64,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_smallest_admissible_index_dtype_without_checking_contents(
|
||||
params, expected_dtype
|
||||
):
|
||||
"""Check the behaviour of `smallest_admissible_index_dtype` using the passed
|
||||
arrays but without checking the contents of the arrays.
|
||||
"""
|
||||
assert _smallest_admissible_index_dtype(**params) == expected_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, expected_dtype",
|
||||
[
|
||||
# empty arrays should always be converted to int32 indices
|
||||
(
|
||||
{
|
||||
"arrays": (np.array([], dtype=np.int64), np.array([], dtype=np.int64)),
|
||||
"check_contents": True,
|
||||
},
|
||||
np.int32,
|
||||
),
|
||||
# arrays respecting np.iinfo(np.int32).min < x < np.iinfo(np.int32).max should
|
||||
# be converted to int32,
|
||||
(
|
||||
{"arrays": np.array([1], dtype=np.int64), "check_contents": True},
|
||||
np.int32,
|
||||
),
|
||||
# otherwise, it should be converted to int64. We need to create a uint32
# array to accommodate a value > np.iinfo(np.int32).max
|
||||
(
|
||||
{
|
||||
"arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
|
||||
"check_contents": True,
|
||||
},
|
||||
np.int64,
|
||||
),
|
||||
# maxval should take precedence over the arrays contents and thus upcast to
|
||||
# int64.
|
||||
(
|
||||
{
|
||||
"arrays": np.array([1], dtype=np.int32),
|
||||
"check_contents": True,
|
||||
"maxval": np.iinfo(np.int32).max + 1,
|
||||
},
|
||||
np.int64,
|
||||
),
|
||||
# when maxval is small, but check_contents is True and the contents
|
||||
# require np.int64, we still require np.int64 indexing in the end.
|
||||
(
|
||||
{
|
||||
"arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
|
||||
"check_contents": True,
|
||||
"maxval": 1,
|
||||
},
|
||||
np.int64,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_smallest_admissible_index_dtype_by_checking_contents(params, expected_dtype):
|
||||
"""Check the behaviour of `smallest_admissible_index_dtype` using the dtype of the
|
||||
arrays but as well the contents.
|
||||
"""
|
||||
assert _smallest_admissible_index_dtype(**params) == expected_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, err_type, err_msg",
|
||||
[
|
||||
(
|
||||
{"maxval": np.iinfo(np.int64).max + 1},
|
||||
ValueError,
|
||||
"is to large to be represented as np.int64",
|
||||
),
|
||||
(
|
||||
{"arrays": np.array([1, 2], dtype=np.float64)},
|
||||
ValueError,
|
||||
"Array dtype float64 is not supported",
|
||||
),
|
||||
({"arrays": [1, 2]}, TypeError, "Arrays should be of type np.ndarray"),
|
||||
],
|
||||
)
|
||||
def test_smallest_admissible_index_dtype_error(params, err_type, err_msg):
|
||||
"""Check that we raise the proper error message."""
|
||||
with pytest.raises(err_type, match=err_msg):
|
||||
_smallest_admissible_index_dtype(**params)
|
||||
@@ -0,0 +1,80 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse.csgraph import connected_components
|
||||
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from sklearn.neighbors import kneighbors_graph
|
||||
from sklearn.utils.graph import _fix_connected_components
|
||||
|
||||
|
||||
def test_fix_connected_components():
|
||||
# Test that _fix_connected_components reduces the number of components to 1.
|
||||
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
|
||||
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
|
||||
|
||||
n_connected_components, labels = connected_components(graph)
|
||||
assert n_connected_components > 1
|
||||
|
||||
graph = _fix_connected_components(X, graph, n_connected_components, labels)
|
||||
|
||||
n_connected_components, labels = connected_components(graph)
|
||||
assert n_connected_components == 1
|
||||
|
||||
|
||||
def test_fix_connected_components_precomputed():
|
||||
# Test that _fix_connected_components accepts precomputed distance matrix.
|
||||
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
|
||||
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
|
||||
|
||||
n_connected_components, labels = connected_components(graph)
|
||||
assert n_connected_components > 1
|
||||
|
||||
distances = pairwise_distances(X)
|
||||
graph = _fix_connected_components(
|
||||
distances, graph, n_connected_components, labels, metric="precomputed"
|
||||
)
|
||||
|
||||
n_connected_components, labels = connected_components(graph)
|
||||
assert n_connected_components == 1
|
||||
|
||||
# but it does not work with precomputed neighbors graph
|
||||
with pytest.raises(RuntimeError, match="does not work with a sparse"):
|
||||
_fix_connected_components(
|
||||
graph, graph, n_connected_components, labels, metric="precomputed"
|
||||
)
|
||||
|
||||
|
||||
def test_fix_connected_components_wrong_mode():
|
||||
# Test that an error is raised if the mode string is incorrect.
|
||||
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
|
||||
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
|
||||
n_connected_components, labels = connected_components(graph)
|
||||
|
||||
with pytest.raises(ValueError, match="Unknown mode"):
|
||||
graph = _fix_connected_components(
|
||||
X, graph, n_connected_components, labels, mode="foo"
|
||||
)
|
||||
|
||||
|
||||
def test_fix_connected_components_connectivity_mode():
|
||||
# Test that the connectivity mode fills new connections with ones.
|
||||
X = np.array([0, 1, 6, 7])[:, None]
|
||||
graph = kneighbors_graph(X, n_neighbors=1, mode="connectivity")
|
||||
n_connected_components, labels = connected_components(graph)
|
||||
graph = _fix_connected_components(
|
||||
X, graph, n_connected_components, labels, mode="connectivity"
|
||||
)
|
||||
assert np.all(graph.data == 1)
|
||||
|
||||
|
||||
def test_fix_connected_components_distance_mode():
|
||||
# Test that the distance mode does not fill new connections with ones.
|
||||
X = np.array([0, 1, 6, 7])[:, None]
|
||||
graph = kneighbors_graph(X, n_neighbors=1, mode="distance")
|
||||
assert np.all(graph.data == 1)
|
||||
|
||||
n_connected_components, labels = connected_components(graph)
|
||||
graph = _fix_connected_components(
|
||||
X, graph, n_connected_components, labels, mode="distance"
|
||||
)
|
||||
assert not np.all(graph.data == 1)
|
||||
@@ -0,0 +1,703 @@
|
||||
import warnings
|
||||
from copy import copy
|
||||
from unittest import SkipTest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.stats import kstest
|
||||
|
||||
import sklearn
|
||||
from sklearn.externals._packaging.version import parse as parse_version
|
||||
from sklearn.utils import _safe_indexing, resample, shuffle
|
||||
from sklearn.utils._array_api import (
|
||||
_convert_to_numpy,
|
||||
_get_namespace_device_dtype_ids,
|
||||
device,
|
||||
move_to,
|
||||
yield_namespace_device_dtype_combinations,
|
||||
)
|
||||
from sklearn.utils._indexing import (
|
||||
_determine_key_type,
|
||||
_get_column_indices,
|
||||
_safe_assign,
|
||||
)
|
||||
from sklearn.utils._mocking import MockDataFrame
|
||||
from sklearn.utils._testing import (
|
||||
_array_api_for_tests,
|
||||
_convert_container,
|
||||
assert_allclose,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_equal,
|
||||
skip_if_array_api_compat_not_configured,
|
||||
)
|
||||
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
|
||||
|
||||
# toy array
|
||||
X_toy = np.arange(9).reshape((3, 3))
|
||||
|
||||
|
||||
def test_polars_indexing():
|
||||
"""Check _safe_indexing for polars as expected."""
|
||||
pl = pytest.importorskip("polars", minversion="0.18.2")
|
||||
df = pl.DataFrame(
|
||||
{"a": [1, 2, 3, 4], "b": [4, 5, 6, 8], "c": [1, 4, 1, 10]}, orient="row"
|
||||
)
|
||||
|
||||
from polars.testing import assert_frame_equal
|
||||
|
||||
str_keys = [["b"], ["a", "b"], ["b", "a", "c"], ["c"], ["a"]]
|
||||
|
||||
for key in str_keys:
|
||||
out = _safe_indexing(df, key, axis=1)
|
||||
assert_frame_equal(df[key], out)
|
||||
|
||||
bool_keys = [([True, False, True], ["a", "c"]), ([False, False, True], ["c"])]
|
||||
|
||||
for bool_key, str_key in bool_keys:
|
||||
out = _safe_indexing(df, bool_key, axis=1)
|
||||
assert_frame_equal(df[:, str_key], out)
|
||||
|
||||
int_keys = [([0, 1], ["a", "b"]), ([2], ["c"])]
|
||||
|
||||
for int_key, str_key in int_keys:
|
||||
out = _safe_indexing(df, int_key, axis=1)
|
||||
assert_frame_equal(df[:, str_key], out)
|
||||
|
||||
axis_0_keys = [[0, 1], [1, 3], [3, 2]]
|
||||
for key in axis_0_keys:
|
||||
out = _safe_indexing(df, key, axis=0)
|
||||
assert_frame_equal(df[key], out)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"key, dtype",
|
||||
[
|
||||
(0, "int"),
|
||||
("0", "str"),
|
||||
(True, "bool"),
|
||||
(np.bool_(True), "bool"),
|
||||
([0, 1, 2], "int"),
|
||||
(["0", "1", "2"], "str"),
|
||||
((0, 1, 2), "int"),
|
||||
(("0", "1", "2"), "str"),
|
||||
(slice(None, None), None),
|
||||
(slice(0, 2), "int"),
|
||||
(np.array([0, 1, 2], dtype=np.int32), "int"),
|
||||
(np.array([0, 1, 2], dtype=np.int64), "int"),
|
||||
(np.array([0, 1, 2], dtype=np.uint8), "int"),
|
||||
([True, False], "bool"),
|
||||
((True, False), "bool"),
|
||||
(np.array([True, False]), "bool"),
|
||||
("col_0", "str"),
|
||||
(["col_0", "col_1", "col_2"], "str"),
|
||||
(("col_0", "col_1", "col_2"), "str"),
|
||||
(slice("begin", "end"), "str"),
|
||||
(np.array(["col_0", "col_1", "col_2"]), "str"),
|
||||
(np.array(["col_0", "col_1", "col_2"], dtype=object), "str"),
|
||||
],
|
||||
)
|
||||
def test_determine_key_type(key, dtype):
|
||||
assert _determine_key_type(key) == dtype
|
||||
|
||||
|
||||
def test_determine_key_type_error():
|
||||
with pytest.raises(ValueError, match="No valid specification of the"):
|
||||
_determine_key_type(1.0)
|
||||
|
||||
|
||||
def test_determine_key_type_slice_error():
|
||||
with pytest.raises(TypeError, match="Only array-like or scalar are"):
|
||||
_determine_key_type(slice(0, 2, 1), accept_slice=False)
|
||||
|
||||
|
||||
@skip_if_array_api_compat_not_configured
|
||||
@pytest.mark.parametrize(
|
||||
"array_namespace, device_, dtype_name",
|
||||
yield_namespace_device_dtype_combinations(),
|
||||
ids=_get_namespace_device_dtype_ids,
|
||||
)
|
||||
def test_determine_key_type_array_api(array_namespace, device_, dtype_name):
|
||||
xp = _array_api_for_tests(array_namespace, device_)
|
||||
|
||||
with sklearn.config_context(array_api_dispatch=True):
|
||||
int_array_key = xp.asarray([1, 2, 3], device=device_)
|
||||
assert _determine_key_type(int_array_key) == "int"
|
||||
|
||||
bool_array_key = xp.asarray([True, False, True], device=device_)
|
||||
assert _determine_key_type(bool_array_key) == "bool"
|
||||
|
||||
try:
|
||||
complex_array_key = xp.asarray([1 + 1j, 2 + 2j, 3 + 3j], device=device_)
|
||||
except TypeError:
|
||||
# Complex numbers are not supported by all Array API libraries.
|
||||
complex_array_key = None
|
||||
|
||||
if complex_array_key is not None:
|
||||
with pytest.raises(ValueError, match="No valid specification of the"):
|
||||
_determine_key_type(complex_array_key)
|
||||
|
||||
|
||||
@skip_if_array_api_compat_not_configured
|
||||
@pytest.mark.parametrize(
|
||||
"array_namespace, device_, dtype_name",
|
||||
yield_namespace_device_dtype_combinations(),
|
||||
ids=_get_namespace_device_dtype_ids,
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"indexing_key",
|
||||
(
|
||||
0,
|
||||
-1,
|
||||
[1, 3],
|
||||
np.array([1, 3]),
|
||||
slice(1, 2),
|
||||
[True, False, True, True],
|
||||
np.asarray([False, False, False, False]),
|
||||
),
|
||||
)
|
||||
@pytest.mark.parametrize("axis", [0, 1])
|
||||
def test_safe_indexing_array_api_support(
|
||||
array_namespace, device_, dtype_name, indexing_key, axis
|
||||
):
|
||||
xp = _array_api_for_tests(array_namespace, device_)
|
||||
|
||||
array_to_index_np = np.arange(16).reshape(4, 4)
|
||||
expected_result = _safe_indexing(array_to_index_np, indexing_key, axis=axis)
|
||||
array_to_index_xp = move_to(array_to_index_np, xp=xp, device=device_)
|
||||
|
||||
with sklearn.config_context(array_api_dispatch=True):
|
||||
indexed_array_xp = _safe_indexing(array_to_index_xp, indexing_key, axis=axis)
|
||||
assert device(indexed_array_xp) == device(array_to_index_xp)
|
||||
assert indexed_array_xp.dtype == array_to_index_xp.dtype
|
||||
|
||||
assert_allclose(_convert_to_numpy(indexed_array_xp, xp=xp), expected_result)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_type", ["list", "array", "sparse", "dataframe", "polars", "pyarrow"]
|
||||
)
|
||||
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
|
||||
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
|
||||
indices = [1, 2]
|
||||
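# slice stops are exclusive, so bump the end to select the same rows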
if indices_type == "slice" and isinstance(indices[1], int):
|
||||
indices[1] += 1
|
||||
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
|
||||
indices = _convert_container(indices, indices_type)
|
||||
subset = _safe_indexing(array, indices, axis=0)
|
||||
assert_allclose_dense_sparse(
|
||||
subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
|
||||
)
|
||||
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
|
||||
def test_safe_indexing_1d_container(array_type, indices_type):
|
||||
indices = [1, 2]
|
||||
if indices_type == "slice" and isinstance(indices[1], int):
|
||||
indices[1] += 1
|
||||
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
|
||||
indices = _convert_container(indices, indices_type)
|
||||
subset = _safe_indexing(array, indices, axis=0)
|
||||
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_type", ["array", "sparse", "dataframe", "polars", "pyarrow"]
|
||||
)
|
||||
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
|
||||
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
|
||||
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
|
||||
# validation of the indices
|
||||
# we make a copy because indices is mutable and shared between tests
|
||||
indices_converted = copy(indices)
|
||||
if indices_type == "slice" and isinstance(indices[1], int):
|
||||
indices_converted[1] += 1
|
||||
|
||||
columns_name = ["col_0", "col_1", "col_2"]
|
||||
array = _convert_container(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
|
||||
)
|
||||
indices_converted = _convert_container(indices_converted, indices_type)
|
||||
|
||||
if isinstance(indices[0], str) and array_type in ("array", "sparse"):
|
||||
err_msg = (
|
||||
"Specifying the columns using strings is only supported for dataframes"
|
||||
)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(array, indices_converted, axis=1)
|
||||
else:
|
||||
subset = _safe_indexing(array, indices_converted, axis=1)
|
||||
assert_allclose_dense_sparse(
|
||||
subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_read_only", [True, False])
|
||||
@pytest.mark.parametrize("indices_read_only", [True, False])
|
||||
@pytest.mark.parametrize(
|
||||
"array_type", ["array", "sparse", "dataframe", "polars", "pyarrow"]
|
||||
)
|
||||
@pytest.mark.parametrize("indices_type", ["array", "series"])
|
||||
@pytest.mark.parametrize(
|
||||
"axis, expected_array", [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])]
|
||||
)
|
||||
def test_safe_indexing_2d_read_only_axis_1(
|
||||
array_read_only, indices_read_only, array_type, indices_type, axis, expected_array
|
||||
):
|
||||
array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
|
||||
if array_read_only:
|
||||
array.setflags(write=False)
|
||||
array = _convert_container(array, array_type)
|
||||
indices = np.array([1, 2])
|
||||
if indices_read_only:
|
||||
indices.setflags(write=False)
|
||||
indices = _convert_container(indices, indices_type)
|
||||
subset = _safe_indexing(array, indices, axis=axis)
|
||||
assert_allclose_dense_sparse(subset, _convert_container(expected_array, array_type))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
|
||||
)
|
||||
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
|
||||
def test_safe_indexing_1d_container_mask(array_type, indices_type):
|
||||
indices = [False] + [True] * 2 + [False] * 6
|
||||
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
|
||||
indices = _convert_container(indices, indices_type)
|
||||
subset = _safe_indexing(array, indices, axis=0)
|
||||
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_type", ["array", "sparse", "dataframe", "polars", "pyarrow"]
|
||||
)
|
||||
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
|
||||
@pytest.mark.parametrize(
|
||||
"axis, expected_subset",
|
||||
[(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])],
|
||||
)
|
||||
def test_safe_indexing_2d_mask(array_type, indices_type, axis, expected_subset):
|
||||
columns_name = ["col_0", "col_1", "col_2"]
|
||||
array = _convert_container(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
|
||||
)
|
||||
indices = [False, True, True]
|
||||
indices = _convert_container(indices, indices_type)
|
||||
|
||||
subset = _safe_indexing(array, indices, axis=axis)
|
||||
assert_allclose_dense_sparse(
|
||||
subset, _convert_container(expected_subset, array_type)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_type, expected_output_type",
|
||||
[
|
||||
("list", "list"),
|
||||
("array", "array"),
|
||||
("sparse", "sparse"),
|
||||
("dataframe", "series"),
|
||||
("polars", "polars_series"),
|
||||
("pyarrow", "pyarrow_array"),
|
||||
],
|
||||
)
|
||||
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
|
||||
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
|
||||
indices = 2
|
||||
subset = _safe_indexing(array, indices, axis=0)
|
||||
expected_array = _convert_container([7, 8, 9], expected_output_type)
|
||||
assert_allclose_dense_sparse(subset, expected_array)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
|
||||
)
|
||||
def test_safe_indexing_1d_scalar(array_type):
|
||||
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
|
||||
indices = 2
|
||||
subset = _safe_indexing(array, indices, axis=0)
|
||||
assert subset == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_type, expected_output_type",
|
||||
[
|
||||
("array", "array"),
|
||||
("sparse", "sparse"),
|
||||
("dataframe", "series"),
|
||||
("polars", "polars_series"),
|
||||
("pyarrow", "pyarrow_array"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("indices", [2, "col_2"])
|
||||
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, indices):
|
||||
columns_name = ["col_0", "col_1", "col_2"]
|
||||
array = _convert_container(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
|
||||
)
|
||||
|
||||
if isinstance(indices, str) and array_type in ("array", "sparse"):
|
||||
err_msg = (
|
||||
"Specifying the columns using strings is only supported for dataframes"
|
||||
)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(array, indices, axis=1)
|
||||
else:
|
||||
subset = _safe_indexing(array, indices, axis=1)
|
||||
expected_output = [3, 6, 9]
|
||||
if expected_output_type == "sparse":
|
||||
# sparse matrices keep the 2D shape
|
||||
expected_output = [[3], [6], [9]]
|
||||
expected_array = _convert_container(expected_output, expected_output_type)
|
||||
assert_allclose_dense_sparse(subset, expected_array)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["list", "array", "sparse"])
|
||||
def test_safe_indexing_None_axis_0(array_type):
|
||||
X = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
|
||||
X_subset = _safe_indexing(X, None, axis=0)
|
||||
assert_allclose_dense_sparse(X_subset, X)
|
||||
|
||||
|
||||
def test_safe_indexing_pandas_no_matching_cols_error():
|
||||
pd = pytest.importorskip("pandas")
|
||||
err_msg = "No valid specification of the columns."
|
||||
X = pd.DataFrame(X_toy)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(X, [1.0], axis=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("axis", [None, 3])
|
||||
def test_safe_indexing_error_axis(axis):
|
||||
with pytest.raises(ValueError, match="'axis' should be either 0"):
|
||||
_safe_indexing(X_toy, [0, 1], axis=axis)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"X_constructor", ["array", "series", "polars_series", "pyarrow_array"]
|
||||
)
|
||||
def test_safe_indexing_1d_array_error(X_constructor):
|
||||
# check that we are raising an error if the array-like passed is 1D and
|
||||
# we try to index on the 2nd dimension
|
||||
X = list(range(5))
|
||||
if X_constructor == "array":
|
||||
X_constructor = np.asarray(X)
|
||||
elif X_constructor == "series":
|
||||
pd = pytest.importorskip("pandas")
|
||||
X_constructor = pd.Series(X)
|
||||
elif X_constructor == "polars_series":
|
||||
pl = pytest.importorskip("polars")
|
||||
X_constructor = pl.Series(values=X)
|
||||
elif X_constructor == "pyarrow_array":
|
||||
pa = pytest.importorskip("pyarrow")
|
||||
X_constructor = pa.array(X)
|
||||
|
||||
err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or dataframe"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(X_constructor, [0, 1], axis=1)
|
||||
|
||||
|
||||
def test_safe_indexing_container_axis_0_unsupported_type():
|
||||
indices = ["col_1", "col_2"]
|
||||
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
|
||||
err_msg = r"String indexing.*is not supported with 'axis=0'"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(array, indices, axis=0)
|
||||
|
||||
|
||||
def test_safe_indexing_pandas_no_settingwithcopy_warning():
|
||||
# Using safe_indexing with an array-like indexer gives a copy of the
|
||||
# DataFrame -> ensure it doesn't raise a warning if modified
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
pd_version = parse_version(pd.__version__)
|
||||
pd_base_version = parse_version(pd_version.base_version)
|
||||
|
||||
if pd_base_version >= parse_version("3"):
|
||||
raise SkipTest("SettingWithCopyWarning has been removed in pandas 3.0.0.dev")
|
||||
|
||||
X = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
|
||||
subset = _safe_indexing(X, [0, 1], axis=0)
|
||||
if hasattr(pd.errors, "SettingWithCopyWarning"):
|
||||
SettingWithCopyWarning = pd.errors.SettingWithCopyWarning
|
||||
else:
|
||||
# backward compatibility for pandas < 1.5
|
||||
SettingWithCopyWarning = pd.core.common.SettingWithCopyWarning
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", SettingWithCopyWarning)
|
||||
subset.iloc[0, 0] = 10
|
||||
# The original dataframe is unaffected by the assignment on the subset:
|
||||
assert X.iloc[0, 0] == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("indices", [0, [0, 1], slice(0, 2), np.array([0, 1])])
|
||||
def test_safe_indexing_list_axis_1_unsupported(indices):
|
||||
"""Check that we raise a ValueError when axis=1 with input as list."""
|
||||
X = [[1, 2], [4, 5], [7, 8]]
|
||||
err_msg = "axis=1 is not supported for lists"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(X, indices, axis=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
|
||||
def test_safe_assign(array_type):
|
||||
"""Check that `_safe_assign` works as expected."""
|
||||
rng = np.random.RandomState(0)
|
||||
X_array = rng.randn(10, 5)
|
||||
|
||||
row_indexer = [1, 2]
|
||||
values = rng.randn(len(row_indexer), X_array.shape[1])
|
||||
X = _convert_container(X_array, array_type)
|
||||
_safe_assign(X, values, row_indexer=row_indexer)
|
||||
|
||||
assigned_portion = _safe_indexing(X, row_indexer, axis=0)
|
||||
assert_allclose_dense_sparse(
|
||||
assigned_portion, _convert_container(values, array_type)
|
||||
)
|
||||
|
||||
column_indexer = [1, 2]
|
||||
values = rng.randn(X_array.shape[0], len(column_indexer))
|
||||
X = _convert_container(X_array, array_type)
|
||||
_safe_assign(X, values, column_indexer=column_indexer)
|
||||
|
||||
assigned_portion = _safe_indexing(X, column_indexer, axis=1)
|
||||
assert_allclose_dense_sparse(
|
||||
assigned_portion, _convert_container(values, array_type)
|
||||
)
|
||||
|
||||
row_indexer, column_indexer = None, None
|
||||
values = rng.randn(*X.shape)
|
||||
X = _convert_container(X_array, array_type)
|
||||
_safe_assign(X, values, column_indexer=column_indexer)
|
||||
|
||||
assert_allclose_dense_sparse(X, _convert_container(values, array_type))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"key, err_msg",
|
||||
[
|
||||
(10, r"all features must be in \[0, 2\]"),
|
||||
("whatever", "A given column is not a column of the dataframe"),
|
||||
(object(), "No valid specification of the columns"),
|
||||
],
|
||||
)
|
||||
def test_get_column_indices_error(key, err_msg):
|
||||
pd = pytest.importorskip("pandas")
|
||||
X_df = pd.DataFrame(X_toy, columns=["col_0", "col_1", "col_2"])
|
||||
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_column_indices(X_df, key)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"key", [["col1"], ["col2"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]
|
||||
)
|
||||
def test_get_column_indices_pandas_nonunique_columns_error(key):
|
||||
pd = pytest.importorskip("pandas")
|
||||
toy = np.zeros((1, 5), dtype=int)
|
||||
columns = ["col1", "col1", "col2", "col3", "col2"]
|
||||
X = pd.DataFrame(toy, columns=columns)
|
||||
|
||||
err_msg = "Selected columns, {}, are not unique in dataframe".format(key)
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
_get_column_indices(X, key)
|
||||
assert str(exc_info.value) == err_msg
|
||||
|
||||
|
||||
def test_get_column_indices_interchange():
|
||||
"""Check _get_column_indices for edge cases with the interchange"""
|
||||
pl = pytest.importorskip("polars")
|
||||
|
||||
# Polars dataframes go down the interchange path.
|
||||
df = pl.DataFrame([[1, 2, 3], [4, 5, 6]], schema=["a", "b", "c"])
|
||||
|
||||
key_results = [
|
||||
(slice(1, None), [1, 2]),
|
||||
(slice(None, 2), [0, 1]),
|
||||
(slice(1, 2), [1]),
|
||||
(["b", "c"], [1, 2]),
|
||||
(slice("a", "b"), [0, 1]),
|
||||
(slice("a", None), [0, 1, 2]),
|
||||
(slice(None, "a"), [0]),
|
||||
(["c", "a"], [2, 0]),
|
||||
([], []),
|
||||
]
|
||||
for key, result in key_results:
|
||||
assert _get_column_indices(df, key) == result
|
||||
|
||||
msg = "A given column is not a column of the dataframe"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
_get_column_indices(df, ["not_a_column"])
|
||||
|
||||
msg = "key.step must be 1 or None"
|
||||
with pytest.raises(NotImplementedError, match=msg):
|
||||
_get_column_indices(df, slice("a", None, 2))
|
||||
|
||||
|
||||
def test_resample():
|
||||
# Border case not worth mentioning in doctests
|
||||
assert resample() is None
|
||||
|
||||
# Check that invalid arguments yield ValueError
|
||||
with pytest.raises(ValueError):
|
||||
resample([0], [0, 1])
|
||||
with pytest.raises(ValueError):
|
||||
resample([0, 1], [0, 1], replace=False, n_samples=3)
|
||||
|
||||
# Issue:6581, n_samples can be more when replace is True (default).
|
||||
assert len(resample([1, 2], n_samples=5)) == 5
|
||||
|
||||
|
||||
def test_resample_weighted():
|
||||
# Check that sampling with replacement with integer weights yields
# samples from the same distribution as sampling uniformly with
# repeated data points.
|
||||
data = np.array([-1, 0, 1])
|
||||
sample_weight = np.asarray([0, 100, 1])
|
||||
|
||||
mean_repeated = []
|
||||
mean_reweighted = []
|
||||
|
||||
for seed in range(100):
|
||||
mean_repeated.append(
|
||||
resample(
|
||||
data.repeat(sample_weight),
|
||||
replace=True,
|
||||
random_state=seed,
|
||||
n_samples=data.shape[0],
|
||||
).mean()
|
||||
)
|
||||
mean_reweighted.append(
|
||||
resample(
|
||||
data,
|
||||
sample_weight=sample_weight,
|
||||
replace=True,
|
||||
random_state=seed,
|
||||
n_samples=data.shape[0],
|
||||
).mean()
|
||||
)
|
||||
|
||||
mean_repeated = np.asarray(mean_repeated)
|
||||
mean_reweighted = np.asarray(mean_reweighted)
|
||||
|
||||
test_result = kstest(mean_repeated, mean_reweighted)
|
||||
# Should never be negative because -1 has a 0 weight.
|
||||
assert np.all(mean_reweighted >= 0)
|
||||
# The null-hypothesis (the computed means are identically distributed)
|
||||
# cannot be rejected.
|
||||
assert test_result.pvalue > 0.05
|
||||
|
||||
|
||||
def test_resample_stratified():
|
||||
# Make sure resample can stratify
|
||||
rng = np.random.RandomState(0)
|
||||
n_samples = 100
|
||||
p = 0.9
|
||||
X = rng.normal(size=(n_samples, 1))
|
||||
y = rng.binomial(1, p, size=n_samples)
|
||||
|
||||
_, y_not_stratified = resample(X, y, n_samples=10, random_state=0, stratify=None)
|
||||
assert np.all(y_not_stratified == 1)
|
||||
|
||||
_, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
|
||||
assert not np.all(y_stratified == 1)
|
||||
assert np.sum(y_stratified) == 9  # nine 1s and one 0
|
||||
|
||||
|
||||
def test_resample_stratified_replace():
|
||||
# Make sure stratified resampling supports the replace parameter
|
||||
rng = np.random.RandomState(0)
|
||||
n_samples = 100
|
||||
X = rng.normal(size=(n_samples, 1))
|
||||
y = rng.randint(0, 2, size=n_samples)
|
||||
|
||||
X_replace, _ = resample(
|
||||
X, y, replace=True, n_samples=50, random_state=rng, stratify=y
|
||||
)
|
||||
X_no_replace, _ = resample(
|
||||
X, y, replace=False, n_samples=50, random_state=rng, stratify=y
|
||||
)
|
||||
assert np.unique(X_replace).shape[0] < 50
|
||||
assert np.unique(X_no_replace).shape[0] == 50
|
||||
|
||||
# make sure n_samples can be greater than X.shape[0] if we sample with
|
||||
# replacement
|
||||
X_replace, _ = resample(
|
||||
X, y, replace=True, n_samples=1000, random_state=rng, stratify=y
|
||||
)
|
||||
assert X_replace.shape[0] == 1000
|
||||
assert np.unique(X_replace).shape[0] == 100
|
||||
|
||||
|
||||
def test_resample_stratify_2dy():
|
||||
# Make sure y can be 2d when stratifying
|
||||
rng = np.random.RandomState(0)
|
||||
n_samples = 100
|
||||
X = rng.normal(size=(n_samples, 1))
|
||||
y = rng.randint(0, 2, size=(n_samples, 2))
|
||||
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
|
||||
assert y.ndim == 2
|
||||
|
||||
|
||||
def test_notimplementederror():
|
||||
with pytest.raises(
|
||||
NotImplementedError,
|
||||
match="Resampling with sample_weight is only implemented for replace=True.",
|
||||
):
|
||||
resample([0, 1], [0, 1], sample_weight=[1, 1], replace=False)
|
||||
|
||||
with pytest.raises(
|
||||
NotImplementedError,
|
||||
match="Resampling with sample_weight is only implemented for stratify=None",
|
||||
):
|
||||
resample([0, 1], [0, 1], sample_weight=[1, 1], stratify=[0, 1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_resample_stratify_sparse_error(csr_container):
|
||||
# resample must be ndarray
|
||||
rng = np.random.RandomState(0)
|
||||
n_samples = 100
|
||||
X = rng.normal(size=(n_samples, 2))
|
||||
y = rng.randint(0, 2, size=n_samples)
|
||||
stratify = csr_container(y.reshape(-1, 1))
|
||||
with pytest.raises(TypeError, match="Sparse data was passed"):
|
||||
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=stratify)
|
||||
|
||||
|
||||
def test_shuffle_on_ndim_equals_three():
|
||||
def to_tuple(A): # to make the inner arrays hashable
|
||||
return tuple(tuple(tuple(C) for C in B) for B in A)
|
||||
|
||||
A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2)
|
||||
S = set(to_tuple(A))
|
||||
shuffle(A) # shouldn't raise a ValueError for dim = 3
|
||||
assert set(to_tuple(A)) == S
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_shuffle_dont_convert_to_array(csc_container):
|
||||
# Check that shuffle does not try to convert to numpy arrays with float
|
||||
# dtypes can let any indexable datastructure pass-through.
|
||||
a = ["a", "b", "c"]
|
||||
b = np.array(["a", "b", "c"], dtype=object)
|
||||
c = [1, 2, 3]
|
||||
d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object))
|
||||
e = csc_container(np.arange(6).reshape(3, 2))
|
||||
a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)
|
||||
|
||||
assert a_s == ["c", "b", "a"]
|
||||
assert type(a_s) == list
|
||||
|
||||
assert_array_equal(b_s, ["c", "b", "a"])
|
||||
assert b_s.dtype == object
|
||||
|
||||
assert c_s == [3, 2, 1]
|
||||
assert type(c_s) == list
|
||||
|
||||
assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
|
||||
assert type(d_s) == MockDataFrame
|
||||
|
||||
assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))
|
||||
@@ -0,0 +1,19 @@
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._mask import safe_mask
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
from sklearn.utils.validation import check_random_state
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_safe_mask(csr_container):
|
||||
random_state = check_random_state(0)
|
||||
X = random_state.rand(5, 4)
|
||||
X_csr = csr_container(X)
|
||||
mask = [False, False, True, True, True]
|
||||
|
||||
mask = safe_mask(X, mask)
|
||||
assert X[mask].shape[0] == 3
|
||||
|
||||
mask = safe_mask(X_csr, mask)
|
||||
assert X_csr[mask].shape[0] == 3
|
||||
@@ -0,0 +1,63 @@
|
||||
import pickle
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.utils.metaestimators import available_if
|
||||
|
||||
|
||||
class AvailableParameterEstimator:
|
||||
"""This estimator's `available` parameter toggles the presence of a method"""
|
||||
|
||||
def __init__(self, available=True, return_value=1):
|
||||
self.available = available
|
||||
self.return_value = return_value
|
||||
|
||||
@available_if(lambda est: est.available)
|
||||
def available_func(self):
|
||||
"""This is a mock available_if function"""
|
||||
return self.return_value
|
||||
|
||||
|
||||
def test_available_if_docstring():
|
||||
assert "This is a mock available_if function" in str(
|
||||
AvailableParameterEstimator.__dict__["available_func"].__doc__
|
||||
)
|
||||
assert "This is a mock available_if function" in str(
|
||||
AvailableParameterEstimator.available_func.__doc__
|
||||
)
|
||||
assert "This is a mock available_if function" in str(
|
||||
AvailableParameterEstimator().available_func.__doc__
|
||||
)
|
||||
|
||||
|
||||
def test_available_if():
|
||||
assert hasattr(AvailableParameterEstimator(), "available_func")
|
||||
assert not hasattr(AvailableParameterEstimator(available=False), "available_func")
|
||||
|
||||
|
||||
def test_available_if_unbound_method():
|
||||
# This is a non regression test for:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/20614
|
||||
# to make sure that decorated functions can be used as an unbound method,
|
||||
# for instance when monkeypatching.
|
||||
est = AvailableParameterEstimator()
|
||||
AvailableParameterEstimator.available_func(est)
|
||||
|
||||
est = AvailableParameterEstimator(available=False)
|
||||
with pytest.raises(
|
||||
AttributeError,
|
||||
match="This 'AvailableParameterEstimator' has no attribute 'available_func'",
|
||||
):
|
||||
AvailableParameterEstimator.available_func(est)
|
||||
|
||||
|
||||
def test_available_if_methods_can_be_pickled():
|
||||
"""Check that available_if methods can be pickled.
|
||||
|
||||
Non-regression test for #21344.
|
||||
"""
|
||||
return_value = 10
|
||||
est = AvailableParameterEstimator(available=True, return_value=return_value)
|
||||
pickled_bytes = pickle.dumps(est.available_func)
|
||||
unpickled_func = pickle.loads(pickled_bytes)
|
||||
assert unpickled_func() == return_value
|
||||
@@ -0,0 +1,27 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._missing import is_scalar_nan
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"value, result",
|
||||
[
|
||||
(float("nan"), True),
|
||||
(np.nan, True),
|
||||
(float(np.nan), True),
|
||||
(np.float32(np.nan), True),
|
||||
(np.float64(np.nan), True),
|
||||
(0, False),
|
||||
(0.0, False),
|
||||
(None, False),
|
||||
("", False),
|
||||
("nan", False),
|
||||
([np.nan], False),
|
||||
(9867966753463435747313673, False), # Python int that overflows with C type
|
||||
],
|
||||
)
|
||||
def test_is_scalar_nan(value, result):
|
||||
assert is_scalar_nan(value) is result
|
||||
# make sure that we are returning a Python bool
|
||||
assert isinstance(is_scalar_nan(value), bool)
|
||||
@@ -0,0 +1,205 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
from scipy import sparse
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.utils import _safe_indexing, check_array
|
||||
from sklearn.utils._mocking import (
|
||||
CheckingClassifier,
|
||||
_MockEstimatorOnOffPrediction,
|
||||
)
|
||||
from sklearn.utils._testing import _convert_container
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def iris():
|
||||
return load_iris(return_X_y=True)
|
||||
|
||||
|
||||
def _success(x):
|
||||
return True
|
||||
|
||||
|
||||
def _fail(x):
|
||||
return False
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{},
|
||||
{"check_X": _success},
|
||||
{"check_y": _success},
|
||||
{"check_X": _success, "check_y": _success},
|
||||
],
|
||||
)
|
||||
def test_check_on_fit_success(iris, kwargs):
|
||||
X, y = iris
|
||||
CheckingClassifier(**kwargs).fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{"check_X": _fail},
|
||||
{"check_y": _fail},
|
||||
{"check_X": _success, "check_y": _fail},
|
||||
{"check_X": _fail, "check_y": _success},
|
||||
{"check_X": _fail, "check_y": _fail},
|
||||
],
|
||||
)
|
||||
def test_check_on_fit_fail(iris, kwargs):
|
||||
X, y = iris
|
||||
clf = CheckingClassifier(**kwargs)
|
||||
with pytest.raises(AssertionError):
|
||||
clf.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"pred_func", ["predict", "predict_proba", "decision_function", "score"]
|
||||
)
|
||||
def test_check_X_on_predict_success(iris, pred_func):
|
||||
X, y = iris
|
||||
clf = CheckingClassifier(check_X=_success).fit(X, y)
|
||||
getattr(clf, pred_func)(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"pred_func", ["predict", "predict_proba", "decision_function", "score"]
|
||||
)
|
||||
def test_check_X_on_predict_fail(iris, pred_func):
|
||||
X, y = iris
|
||||
clf = CheckingClassifier(check_X=_success).fit(X, y)
|
||||
clf.set_params(check_X=_fail)
|
||||
with pytest.raises(AssertionError):
|
||||
getattr(clf, pred_func)(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_type", ["list", "array", "sparse", "dataframe"])
|
||||
def test_checking_classifier(iris, input_type):
|
||||
# Check that the CheckingClassifier outputs what we expect
|
||||
X, y = iris
|
||||
X = _convert_container(X, input_type)
|
||||
clf = CheckingClassifier()
|
||||
clf.fit(X, y)
|
||||
|
||||
assert_array_equal(clf.classes_, np.unique(y))
|
||||
assert len(clf.classes_) == 3
|
||||
assert clf.n_features_in_ == 4
|
||||
|
||||
y_pred = clf.predict(X)
|
||||
assert all(pred in clf.classes_ for pred in y_pred)
|
||||
|
||||
assert clf.score(X) == pytest.approx(0)
|
||||
clf.set_params(foo_param=10)
|
||||
assert clf.fit(X, y).score(X) == pytest.approx(1)
|
||||
|
||||
y_proba = clf.predict_proba(X)
|
||||
assert y_proba.shape == (150, 3)
|
||||
assert np.logical_and(y_proba >= 0, y_proba <= 1).all()
|
||||
|
||||
y_decision = clf.decision_function(X)
|
||||
assert y_decision.shape == (150, 3)
|
||||
|
||||
# check the shape in case of binary classification
|
||||
first_2_classes = np.logical_or(y == 0, y == 1)
|
||||
X = _safe_indexing(X, first_2_classes)
|
||||
y = _safe_indexing(y, first_2_classes)
|
||||
clf.fit(X, y)
|
||||
|
||||
y_proba = clf.predict_proba(X)
|
||||
assert y_proba.shape == (100, 2)
|
||||
assert np.logical_and(y_proba >= 0, y_proba <= 1).all()
|
||||
|
||||
y_decision = clf.decision_function(X)
|
||||
assert y_decision.shape == (100,)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_checking_classifier_with_params(iris, csr_container):
|
||||
X, y = iris
|
||||
X_sparse = csr_container(X)
|
||||
|
||||
clf = CheckingClassifier(check_X=sparse.issparse)
|
||||
with pytest.raises(AssertionError):
|
||||
clf.fit(X, y)
|
||||
clf.fit(X_sparse, y)
|
||||
|
||||
clf = CheckingClassifier(
|
||||
check_X=check_array, check_X_params={"accept_sparse": False}
|
||||
)
|
||||
clf.fit(X, y)
|
||||
with pytest.raises(TypeError, match="Sparse data was passed"):
|
||||
clf.fit(X_sparse, y)
|
||||
|
||||
|
||||
def test_checking_classifier_fit_params(iris):
|
||||
# check the error raised when the number of samples is not the one expected
|
||||
X, y = iris
|
||||
clf = CheckingClassifier(expected_sample_weight=True)
|
||||
sample_weight = np.ones(len(X) // 2)
|
||||
|
||||
msg = f"sample_weight.shape == ({len(X) // 2},), expected ({len(X)},)!"
|
||||
with pytest.raises(ValueError) as exc:
|
||||
clf.fit(X, y, sample_weight=sample_weight)
|
||||
assert exc.value.args[0] == msg
|
||||
|
||||
|
||||
def test_checking_classifier_missing_fit_params(iris):
|
||||
X, y = iris
|
||||
clf = CheckingClassifier(expected_sample_weight=True)
|
||||
err_msg = "Expected sample_weight to be passed"
|
||||
with pytest.raises(AssertionError, match=err_msg):
|
||||
clf.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"methods_to_check",
|
||||
[["predict"], ["predict", "predict_proba"]],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"predict_method", ["predict", "predict_proba", "decision_function", "score"]
|
||||
)
|
||||
def test_checking_classifier_methods_to_check(iris, methods_to_check, predict_method):
|
||||
# check that methods_to_check allows to bypass checks
|
||||
X, y = iris
|
||||
|
||||
clf = CheckingClassifier(
|
||||
check_X=sparse.issparse,
|
||||
methods_to_check=methods_to_check,
|
||||
)
|
||||
|
||||
clf.fit(X, y)
|
||||
if predict_method in methods_to_check:
|
||||
with pytest.raises(AssertionError):
|
||||
getattr(clf, predict_method)(X)
|
||||
else:
|
||||
getattr(clf, predict_method)(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response_methods",
|
||||
[
|
||||
["predict"],
|
||||
["predict", "predict_proba"],
|
||||
["predict", "decision_function"],
|
||||
["predict", "predict_proba", "decision_function"],
|
||||
],
|
||||
)
|
||||
def test_mock_estimator_on_off_prediction(iris, response_methods):
|
||||
X, y = iris
|
||||
estimator = _MockEstimatorOnOffPrediction(response_methods=response_methods)
|
||||
|
||||
estimator.fit(X, y)
|
||||
assert hasattr(estimator, "classes_")
|
||||
assert_array_equal(estimator.classes_, np.unique(y))
|
||||
|
||||
possible_responses = ["predict", "predict_proba", "decision_function"]
|
||||
for response in possible_responses:
|
||||
if response in response_methods:
|
||||
assert hasattr(estimator, response)
|
||||
assert getattr(estimator, response)(X) == response
|
||||
else:
|
||||
assert not hasattr(estimator, response)
|
||||
@@ -0,0 +1,634 @@
|
||||
import warnings
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from sklearn import config_context, datasets
|
||||
from sklearn.model_selection import ShuffleSplit
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.utils._array_api import (
|
||||
_get_namespace_device_dtype_ids,
|
||||
yield_namespace_device_dtype_combinations,
|
||||
)
|
||||
from sklearn.utils._testing import (
|
||||
_array_api_for_tests,
|
||||
_convert_container,
|
||||
assert_allclose,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.estimator_checks import _NotAnArray
|
||||
from sklearn.utils.fixes import (
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
DOK_CONTAINERS,
|
||||
LIL_CONTAINERS,
|
||||
)
|
||||
from sklearn.utils.metaestimators import _safe_split
|
||||
from sklearn.utils.multiclass import (
|
||||
_ovr_decision_function,
|
||||
check_classification_targets,
|
||||
class_distribution,
|
||||
is_multilabel,
|
||||
type_of_target,
|
||||
unique_labels,
|
||||
)
|
||||
|
||||
multilabel_explicit_zero = np.array([[0, 1], [1, 0]])
|
||||
multilabel_explicit_zero[:, 0] = 0
|
||||
|
||||
|
||||
def _generate_sparse(
|
||||
data,
|
||||
sparse_containers=tuple(
|
||||
COO_CONTAINERS
|
||||
+ CSC_CONTAINERS
|
||||
+ CSR_CONTAINERS
|
||||
+ DOK_CONTAINERS
|
||||
+ LIL_CONTAINERS
|
||||
),
|
||||
dtypes=(bool, int, np.int8, np.uint8, float, np.float32),
|
||||
):
|
||||
return [
|
||||
sparse_container(data, dtype=dtype)
|
||||
for sparse_container in sparse_containers
|
||||
for dtype in dtypes
|
||||
]
|
||||
|
||||
|
||||
EXAMPLES = {
|
||||
"multilabel-indicator": [
|
||||
# valid when the data is formatted as sparse or dense, identified
|
||||
# by CSR format when the testing takes place
|
||||
*_generate_sparse(
|
||||
np.random.RandomState(42).randint(2, size=(10, 10)),
|
||||
sparse_containers=CSR_CONTAINERS,
|
||||
dtypes=(int,),
|
||||
),
|
||||
[[0, 1], [1, 0]],
|
||||
[[0, 1]],
|
||||
*_generate_sparse(
|
||||
multilabel_explicit_zero, sparse_containers=CSC_CONTAINERS, dtypes=(int,)
|
||||
),
|
||||
*_generate_sparse([[0, 1], [1, 0]]),
|
||||
*_generate_sparse([[0, 0], [0, 0]]),
|
||||
*_generate_sparse([[0, 1]]),
|
||||
# Only valid when data is dense
|
||||
[[-1, 1], [1, -1]],
|
||||
np.array([[-1, 1], [1, -1]]),
|
||||
np.array([[-3, 3], [3, -3]]),
|
||||
_NotAnArray(np.array([[-3, 3], [3, -3]])),
|
||||
],
|
||||
"multiclass": [
|
||||
[1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
|
||||
np.array([1, 0, 2]),
|
||||
np.array([1, 0, 2], dtype=np.int8),
|
||||
np.array([1, 0, 2], dtype=np.uint8),
|
||||
np.array([1, 0, 2], dtype=float),
|
||||
np.array([1, 0, 2], dtype=np.float32),
|
||||
np.array([[1], [0], [2]]),
|
||||
_NotAnArray(np.array([1, 0, 2])),
|
||||
[0, 1, 2],
|
||||
["a", "b", "c"],
|
||||
np.array(["a", "b", "c"]),
|
||||
np.array(["a", "b", "c"], dtype=object),
|
||||
np.array(["a", "b", "c"], dtype=object),
|
||||
],
|
||||
"multiclass-multioutput": [
|
||||
[[1, 0, 2, 2], [1, 4, 2, 4]],
|
||||
[["a", "b"], ["c", "d"]],
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
|
||||
*_generate_sparse(
|
||||
[[1, 0, 2, 2], [1, 4, 2, 4]],
|
||||
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
|
||||
dtypes=(int, np.int8, np.uint8, float, np.float32),
|
||||
),
|
||||
np.array([["a", "b"], ["c", "d"]]),
|
||||
np.array([["a", "b"], ["c", "d"]]),
|
||||
np.array([["a", "b"], ["c", "d"]], dtype=object),
|
||||
np.array([[1, 0, 2]]),
|
||||
_NotAnArray(np.array([[1, 0, 2]])),
|
||||
],
|
||||
"binary": [
|
||||
[0, 1],
|
||||
[1, 1],
|
||||
[],
|
||||
[0],
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
|
||||
np.array([[0], [1]]),
|
||||
_NotAnArray(np.array([[0], [1]])),
|
||||
[1, -1],
|
||||
[3, 5],
|
||||
["a"],
|
||||
["a", "b"],
|
||||
["abc", "def"],
|
||||
np.array(["abc", "def"]),
|
||||
["a", "b"],
|
||||
np.array(["abc", "def"], dtype=object),
|
||||
],
|
||||
"continuous": [
|
||||
[1e-5],
|
||||
[0, 0.5],
|
||||
np.array([[0], [0.5]]),
|
||||
np.array([[0], [0.5]], dtype=np.float32),
|
||||
],
|
||||
"continuous-multioutput": [
|
||||
np.array([[0, 0.5], [0.5, 0]]),
|
||||
np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
|
||||
np.array([[0, 0.5]]),
|
||||
*_generate_sparse(
|
||||
[[0, 0.5], [0.5, 0]],
|
||||
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
|
||||
dtypes=(float, np.float32),
|
||||
),
|
||||
*_generate_sparse(
|
||||
[[0, 0.5]],
|
||||
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
|
||||
dtypes=(float, np.float32),
|
||||
),
|
||||
],
|
||||
"unknown": [
|
||||
[[]],
|
||||
np.array([[]], dtype=object),
|
||||
[()],
|
||||
# sequence of sequences that weren't supported even before deprecation
|
||||
np.array([np.array([]), np.array([1, 2, 3])], dtype=object),
|
||||
[np.array([]), np.array([1, 2, 3])],
|
||||
[{1, 2, 3}, {1, 2}],
|
||||
[frozenset([1, 2, 3]), frozenset([1, 2])],
|
||||
# and also confusable as sequences of sequences
|
||||
[{0: "a", 1: "b"}, {0: "a"}],
|
||||
# ndim 0
|
||||
np.array(0),
|
||||
# empty second dimension
|
||||
np.array([[], []]),
|
||||
# 3d
|
||||
np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
|
||||
],
|
||||
}
|
||||
|
||||
ARRAY_API_EXAMPLES = {
|
||||
"multilabel-indicator": [
|
||||
np.random.RandomState(42).randint(2, size=(10, 10)),
|
||||
[[0, 1], [1, 0]],
|
||||
[[0, 1]],
|
||||
multilabel_explicit_zero,
|
||||
[[0, 0], [0, 0]],
|
||||
[[-1, 1], [1, -1]],
|
||||
np.array([[-1, 1], [1, -1]]),
|
||||
np.array([[-3, 3], [3, -3]]),
|
||||
_NotAnArray(np.array([[-3, 3], [3, -3]])),
|
||||
],
|
||||
"multiclass": [
|
||||
[1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
|
||||
np.array([1, 0, 2]),
|
||||
np.array([1, 0, 2], dtype=np.int8),
|
||||
np.array([1, 0, 2], dtype=np.uint8),
|
||||
np.array([1, 0, 2], dtype=float),
|
||||
np.array([1, 0, 2], dtype=np.float32),
|
||||
np.array([[1], [0], [2]]),
|
||||
_NotAnArray(np.array([1, 0, 2])),
|
||||
[0, 1, 2],
|
||||
],
|
||||
"multiclass-multioutput": [
|
||||
[[1, 0, 2, 2], [1, 4, 2, 4]],
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
|
||||
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
|
||||
np.array([[1, 0, 2]]),
|
||||
_NotAnArray(np.array([[1, 0, 2]])),
|
||||
],
|
||||
"binary": [
|
||||
[0, 1],
|
||||
[1, 1],
|
||||
[],
|
||||
[0],
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
|
||||
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
|
||||
np.array([[0], [1]]),
|
||||
_NotAnArray(np.array([[0], [1]])),
|
||||
[1, -1],
|
||||
[3, 5],
|
||||
],
|
||||
"continuous": [
|
||||
[1e-5],
|
||||
[0, 0.5],
|
||||
np.array([[0], [0.5]]),
|
||||
np.array([[0], [0.5]], dtype=np.float32),
|
||||
],
|
||||
"continuous-multioutput": [
|
||||
np.array([[0, 0.5], [0.5, 0]]),
|
||||
np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
|
||||
np.array([[0, 0.5]]),
|
||||
],
|
||||
"unknown": [
|
||||
[[]],
|
||||
[()],
|
||||
np.array(0),
|
||||
np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
NON_ARRAY_LIKE_EXAMPLES = [
|
||||
{1, 2, 3},
|
||||
{0: "a", 1: "b"},
|
||||
{0: [5], 1: [5]},
|
||||
"abc",
|
||||
frozenset([1, 2, 3]),
|
||||
None,
|
||||
]
|
||||
|
||||
MULTILABEL_SEQUENCES = [
|
||||
[[1], [2], [0, 1]],
|
||||
[(), (2), (0, 1)],
|
||||
np.array([[], [1, 2]], dtype="object"),
|
||||
_NotAnArray(np.array([[], [1, 2]], dtype="object")),
|
||||
]
|
||||
|
||||
|
||||
def test_unique_labels():
|
||||
# Empty iterable
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels()
|
||||
|
||||
# Multiclass problem
|
||||
assert_array_equal(unique_labels(range(10)), np.arange(10))
|
||||
assert_array_equal(unique_labels(np.arange(10)), np.arange(10))
|
||||
assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4]))
|
||||
|
||||
# Multilabel indicator
|
||||
assert_array_equal(
|
||||
unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3)
|
||||
)
|
||||
|
||||
assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3))
|
||||
|
||||
# Several arrays passed
|
||||
assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5))
|
||||
assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3))
|
||||
|
||||
# Border line case with binary indicator matrix
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels([4, 0, 2], np.ones((5, 5)))
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(np.ones((5, 4)), np.ones((5, 5)))
|
||||
|
||||
assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5))
|
||||
|
||||
|
||||
def test_check_classification_targets_too_many_unique_classes():
|
||||
"""Check that we raise a warning when the number of unique classes is greater than
|
||||
50% of the number of samples.
|
||||
|
||||
We need to check that we don't raise if we have less than 20 samples.
|
||||
"""
|
||||
|
||||
# Create array of unique labels. This does raise a warning.
|
||||
y = np.arange(25)
|
||||
msg = r"The number of unique classes is greater than 50% of the number of samples."
|
||||
with pytest.warns(UserWarning, match=msg):
|
||||
check_classification_targets(y)
|
||||
|
||||
# less than 20 samples, no warning should be raised
|
||||
y = np.arange(10)
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error")
|
||||
check_classification_targets(y)
|
||||
|
||||
|
||||
def test_unique_labels_non_specific():
|
||||
# Test unique_labels with a variety of collected examples
|
||||
|
||||
# Smoke test for all supported format
|
||||
for format in ["binary", "multiclass", "multilabel-indicator"]:
|
||||
for y in EXAMPLES[format]:
|
||||
unique_labels(y)
|
||||
|
||||
# We don't support those format at the moment
|
||||
for example in NON_ARRAY_LIKE_EXAMPLES:
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(example)
|
||||
|
||||
for y_type in [
|
||||
"unknown",
|
||||
"continuous",
|
||||
"continuous-multioutput",
|
||||
"multiclass-multioutput",
|
||||
]:
|
||||
for example in EXAMPLES[y_type]:
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(example)
|
||||
|
||||
|
||||
def test_unique_labels_mixed_types():
|
||||
# Mix with binary or multiclass and multilabel
|
||||
mix_clf_format = product(
|
||||
EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"]
|
||||
)
|
||||
|
||||
for y_multilabel, y_multiclass in mix_clf_format:
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(y_multiclass, y_multilabel)
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(y_multilabel, y_multiclass)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels([[1, 2]], [["a", "d"]])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels(["1", 2])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels([["1", 2], [1, 3]])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
unique_labels([["1", "2"], [2, 3]])
|
||||
|
||||
|
||||
def test_is_multilabel():
|
||||
for group, group_examples in EXAMPLES.items():
|
||||
dense_exp = group == "multilabel-indicator"
|
||||
|
||||
for example in group_examples:
|
||||
# Only mark explicitly defined sparse examples as valid sparse
|
||||
# multilabel-indicators
|
||||
sparse_exp = dense_exp and issparse(example)
|
||||
|
||||
if issparse(example) or (
|
||||
hasattr(example, "__array__")
|
||||
and np.asarray(example).ndim == 2
|
||||
and np.asarray(example).dtype.kind in "biuf"
|
||||
and np.asarray(example).shape[1] > 0
|
||||
):
|
||||
examples_sparse = [
|
||||
sparse_container(example)
|
||||
for sparse_container in (
|
||||
COO_CONTAINERS
|
||||
+ CSC_CONTAINERS
|
||||
+ CSR_CONTAINERS
|
||||
+ DOK_CONTAINERS
|
||||
+ LIL_CONTAINERS
|
||||
)
|
||||
]
|
||||
for exmpl_sparse in examples_sparse:
|
||||
assert sparse_exp == is_multilabel(exmpl_sparse), (
|
||||
f"is_multilabel({exmpl_sparse!r}) should be {sparse_exp}"
|
||||
)
|
||||
|
||||
# Densify sparse examples before testing
|
||||
if issparse(example):
|
||||
example = example.toarray()
|
||||
|
||||
assert dense_exp == is_multilabel(example), (
|
||||
f"is_multilabel({example!r}) should be {dense_exp}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_namespace, device, dtype_name",
|
||||
yield_namespace_device_dtype_combinations(),
|
||||
ids=_get_namespace_device_dtype_ids,
|
||||
)
|
||||
def test_is_multilabel_array_api_compliance(array_namespace, device, dtype_name):
|
||||
xp = _array_api_for_tests(array_namespace, device)
|
||||
|
||||
for group, group_examples in ARRAY_API_EXAMPLES.items():
|
||||
dense_exp = group == "multilabel-indicator"
|
||||
for example in group_examples:
|
||||
if np.asarray(example).dtype.kind == "f":
|
||||
example = np.asarray(example, dtype=dtype_name)
|
||||
else:
|
||||
example = np.asarray(example)
|
||||
example = xp.asarray(example, device=device)
|
||||
|
||||
with config_context(array_api_dispatch=True):
|
||||
assert dense_exp == is_multilabel(example), (
|
||||
f"is_multilabel({example!r}) should be {dense_exp}"
|
||||
)
|
||||
|
||||
|
||||
def test_check_classification_targets():
|
||||
for y_type in EXAMPLES.keys():
|
||||
if y_type in ["unknown", "continuous", "continuous-multioutput"]:
|
||||
for example in EXAMPLES[y_type]:
|
||||
msg = "Unknown label type: "
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
check_classification_targets(example)
|
||||
else:
|
||||
for example in EXAMPLES[y_type]:
|
||||
check_classification_targets(example)
|
||||
|
||||
|
||||
def test_type_of_target():
|
||||
for group, group_examples in EXAMPLES.items():
|
||||
for example in group_examples:
|
||||
assert type_of_target(example) == group, (
|
||||
"type_of_target(%r) should be %r, got %r"
|
||||
% (
|
||||
example,
|
||||
group,
|
||||
type_of_target(example),
|
||||
)
|
||||
)
|
||||
|
||||
for example in NON_ARRAY_LIKE_EXAMPLES:
|
||||
msg_regex = r"Expected array-like \(array or non-string sequence\).*"
|
||||
with pytest.raises(ValueError, match=msg_regex):
|
||||
type_of_target(example)
|
||||
|
||||
for example in MULTILABEL_SEQUENCES:
|
||||
msg = (
|
||||
"You appear to be using a legacy multi-label data "
|
||||
"representation. Sequence of sequences are no longer supported;"
|
||||
" use a binary array or sparse matrix instead."
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
type_of_target(example)
|
||||
|
||||
|
||||
def test_type_of_target_pandas_sparse():
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
y = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan])
|
||||
msg = "y cannot be class 'SparseSeries' or 'SparseArray'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
type_of_target(y)
|
||||
|
||||
|
||||
def test_type_of_target_pandas_nullable():
|
||||
"""Check that type_of_target works with pandas nullable dtypes."""
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
for dtype in ["Int32", "Float32"]:
|
||||
y_true = pd.Series([1, 0, 2, 3, 4], dtype=dtype)
|
||||
assert type_of_target(y_true) == "multiclass"
|
||||
|
||||
y_true = pd.Series([1, 0, 1, 0], dtype=dtype)
|
||||
assert type_of_target(y_true) == "binary"
|
||||
|
||||
y_true = pd.DataFrame([[1.4, 3.1], [3.1, 1.4]], dtype="Float32")
|
||||
assert type_of_target(y_true) == "continuous-multioutput"
|
||||
|
||||
y_true = pd.DataFrame([[0, 1], [1, 1]], dtype="Int32")
|
||||
assert type_of_target(y_true) == "multilabel-indicator"
|
||||
|
||||
y_true = pd.DataFrame([[1, 2], [3, 1]], dtype="Int32")
|
||||
assert type_of_target(y_true) == "multiclass-multioutput"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
|
||||
def test_unique_labels_pandas_nullable(dtype):
|
||||
"""Checks that unique_labels work with pandas nullable dtypes.
|
||||
|
||||
Non-regression test for gh-25634.
|
||||
"""
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
|
||||
y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64")
|
||||
|
||||
labels = unique_labels(y_true, y_predicted)
|
||||
assert_array_equal(labels, [0, 1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
def test_class_distribution(csc_container):
|
||||
y = np.array(
|
||||
[
|
||||
[1, 0, 0, 1],
|
||||
[2, 2, 0, 1],
|
||||
[1, 3, 0, 1],
|
||||
[4, 2, 0, 1],
|
||||
[2, 0, 0, 1],
|
||||
[1, 3, 0, 1],
|
||||
]
|
||||
)
|
||||
# Define the sparse matrix with a mix of implicit and explicit zeros
|
||||
data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
|
||||
indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
|
||||
indptr = np.array([0, 6, 11, 11, 17])
|
||||
y_sp = csc_container((data, indices, indptr), shape=(6, 4))
|
||||
|
||||
classes, n_classes, class_prior = class_distribution(y)
|
||||
classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp)
|
||||
classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]]
|
||||
n_classes_expected = [3, 3, 1, 1]
|
||||
class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]]
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
assert_array_almost_equal(classes[k], classes_expected[k])
|
||||
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
|
||||
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
|
||||
|
||||
assert_array_almost_equal(classes_sp[k], classes_expected[k])
|
||||
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
|
||||
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
|
||||
|
||||
# Test again with explicit sample weights
|
||||
(classes, n_classes, class_prior) = class_distribution(
|
||||
y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
|
||||
)
|
||||
(classes_sp, n_classes_sp, class_prior_sp) = class_distribution(
|
||||
y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
|
||||
)
|
||||
class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]]
|
||||
|
||||
for k in range(y.shape[1]):
|
||||
assert_array_almost_equal(classes[k], classes_expected[k])
|
||||
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
|
||||
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
|
||||
|
||||
assert_array_almost_equal(classes_sp[k], classes_expected[k])
|
||||
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
|
||||
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
|
||||
|
||||
|
||||
def test_safe_split_with_precomputed_kernel():
|
||||
clf = SVC()
|
||||
clfp = SVC(kernel="precomputed")
|
||||
|
||||
iris = datasets.load_iris()
|
||||
X, y = iris.data, iris.target
|
||||
K = np.dot(X, X.T)
|
||||
|
||||
cv = ShuffleSplit(test_size=0.25, random_state=0)
|
||||
train, test = next(iter(cv.split(X)))
|
||||
|
||||
X_train, y_train = _safe_split(clf, X, y, train)
|
||||
K_train, y_train2 = _safe_split(clfp, K, y, train)
|
||||
assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
|
||||
assert_array_almost_equal(y_train, y_train2)
|
||||
|
||||
X_test, y_test = _safe_split(clf, X, y, test, train)
|
||||
K_test, y_test2 = _safe_split(clfp, K, y, test, train)
|
||||
assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
|
||||
assert_array_almost_equal(y_test, y_test2)
|
||||
|
||||
|
||||
def test_ovr_decision_function():
|
||||
# test properties for ovr decision function
|
||||
|
||||
predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]])
|
||||
|
||||
confidences = np.array(
|
||||
[[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]]
|
||||
)
|
||||
|
||||
n_classes = 3
|
||||
|
||||
dec_values = _ovr_decision_function(predictions, confidences, n_classes)
|
||||
|
||||
# check that the decision values are within 0.5 range of the votes
|
||||
votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]])
|
||||
|
||||
assert_allclose(votes, dec_values, atol=0.5)
|
||||
|
||||
# check that the prediction are what we expect
|
||||
# highest vote or highest confidence if there is a tie.
|
||||
# for the second sample we have a tie (should be won by 1)
|
||||
expected_prediction = np.array([2, 1, 2, 2])
|
||||
assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction)
|
||||
|
||||
# third and fourth sample have the same vote but third sample
|
||||
# has higher confidence, this should reflect on the decision values
|
||||
assert dec_values[2, 2] > dec_values[3, 2]
|
||||
|
||||
# assert subset invariance.
|
||||
dec_values_one = [
|
||||
_ovr_decision_function(
|
||||
np.array([predictions[i]]), np.array([confidences[i]]), n_classes
|
||||
)[0]
|
||||
for i in range(4)
|
||||
]
|
||||
|
||||
assert_allclose(dec_values, dec_values_one, atol=1e-6)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_type", ["list", "array"])
|
||||
def test_labels_in_bytes_format_error(input_type):
|
||||
# check that we raise an error with bytes encoded labels
|
||||
# non-regression test for:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/16980
|
||||
target = _convert_container([b"a", b"b"], input_type)
|
||||
err_msg = "Support for labels represented as bytes is not supported"
|
||||
with pytest.raises(TypeError, match=err_msg):
|
||||
type_of_target(target)
|
||||
@@ -0,0 +1,73 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_array_almost_equal, assert_array_equal
|
||||
|
||||
from sklearn.utils.murmurhash import murmurhash3_32
|
||||
|
||||
|
||||
def test_mmhash3_int():
|
||||
assert murmurhash3_32(3) == 847579505
|
||||
assert murmurhash3_32(3, seed=0) == 847579505
|
||||
assert murmurhash3_32(3, seed=42) == -1823081949
|
||||
|
||||
assert murmurhash3_32(3, positive=False) == 847579505
|
||||
assert murmurhash3_32(3, seed=0, positive=False) == 847579505
|
||||
assert murmurhash3_32(3, seed=42, positive=False) == -1823081949
|
||||
|
||||
assert murmurhash3_32(3, positive=True) == 847579505
|
||||
assert murmurhash3_32(3, seed=0, positive=True) == 847579505
|
||||
assert murmurhash3_32(3, seed=42, positive=True) == 2471885347
|
||||
|
||||
|
||||
def test_mmhash3_int_array():
|
||||
rng = np.random.RandomState(42)
|
||||
keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
|
||||
keys = keys.reshape((3, 2, 1))
|
||||
|
||||
for seed in [0, 42]:
|
||||
expected = np.array([murmurhash3_32(int(k), seed) for k in keys.flat])
|
||||
expected = expected.reshape(keys.shape)
|
||||
assert_array_equal(murmurhash3_32(keys, seed), expected)
|
||||
|
||||
for seed in [0, 42]:
|
||||
expected = np.array([murmurhash3_32(k, seed, positive=True) for k in keys.flat])
|
||||
expected = expected.reshape(keys.shape)
|
||||
assert_array_equal(murmurhash3_32(keys, seed, positive=True), expected)
|
||||
|
||||
|
||||
def test_mmhash3_bytes():
|
||||
assert murmurhash3_32(b"foo", 0) == -156908512
|
||||
assert murmurhash3_32(b"foo", 42) == -1322301282
|
||||
|
||||
assert murmurhash3_32(b"foo", 0, positive=True) == 4138058784
|
||||
assert murmurhash3_32(b"foo", 42, positive=True) == 2972666014
|
||||
|
||||
|
||||
def test_mmhash3_unicode():
|
||||
assert murmurhash3_32("foo", 0) == -156908512
|
||||
assert murmurhash3_32("foo", 42) == -1322301282
|
||||
|
||||
assert murmurhash3_32("foo", 0, positive=True) == 4138058784
|
||||
assert murmurhash3_32("foo", 42, positive=True) == 2972666014
|
||||
|
||||
|
||||
def test_no_collision_on_byte_range():
|
||||
previous_hashes = set()
|
||||
for i in range(100):
|
||||
h = murmurhash3_32(" " * i, 0)
|
||||
assert h not in previous_hashes, "Found collision on growing empty string"
|
||||
|
||||
|
||||
def test_uniform_distribution():
|
||||
n_bins, n_samples = 10, 100000
|
||||
bins = np.zeros(n_bins, dtype=np.float64)
|
||||
|
||||
for i in range(n_samples):
|
||||
bins[murmurhash3_32(i, positive=True) % n_bins] += 1
|
||||
|
||||
means = bins / n_samples
|
||||
expected = np.full(n_bins, 1.0 / n_bins)
|
||||
|
||||
assert_array_almost_equal(means / expected, np.ones(n_bins), 2)
|
||||
@@ -0,0 +1,220 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.optimize import fmin_ncg
|
||||
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.utils._bunch import Bunch
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils.optimize import _check_optimize_result, _newton_cg
|
||||
|
||||
|
||||
def test_newton_cg(global_random_seed):
|
||||
# Test that newton_cg gives same result as scipy's fmin_ncg
|
||||
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
A = rng.normal(size=(10, 10))
|
||||
x0 = np.ones(10)
|
||||
|
||||
def func(x):
|
||||
Ax = A.dot(x)
|
||||
return 0.5 * (Ax).dot(Ax)
|
||||
|
||||
def grad(x):
|
||||
return A.T.dot(A.dot(x))
|
||||
|
||||
def hess(x, p):
|
||||
return p.dot(A.T.dot(A.dot(x.all())))
|
||||
|
||||
def grad_hess(x):
|
||||
return grad(x), lambda x: A.T.dot(A.dot(x))
|
||||
|
||||
# func is a definite positive quadratic form, so the minimum is at x = 0
|
||||
# hence the use of absolute tolerance.
|
||||
assert np.all(np.abs(_newton_cg(grad_hess, func, grad, x0, tol=1e-10)[0]) <= 1e-7)
|
||||
assert_allclose(
|
||||
_newton_cg(grad_hess, func, grad, x0, tol=1e-7)[0],
|
||||
fmin_ncg(f=func, x0=x0, fprime=grad, fhess_p=hess),
|
||||
atol=1e-5,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("verbose", [0, 1, 2])
|
||||
def test_newton_cg_verbosity(capsys, verbose):
|
||||
"""Test the std output of verbose newton_cg solver."""
|
||||
A = np.eye(2)
|
||||
b = np.array([1, 2], dtype=float)
|
||||
|
||||
_newton_cg(
|
||||
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
|
||||
func=lambda x: 0.5 * x @ A @ x - b @ x,
|
||||
grad=lambda x: A @ x - b,
|
||||
x0=np.zeros(A.shape[0]),
|
||||
verbose=verbose,
|
||||
) # returns array([1., 2])
|
||||
captured = capsys.readouterr()
|
||||
|
||||
if verbose == 0:
|
||||
assert captured.out == ""
|
||||
else:
|
||||
msg = [
|
||||
"Newton-CG iter = 1",
|
||||
"Check Convergence",
|
||||
"max |gradient|",
|
||||
"Solver did converge at loss = ",
|
||||
]
|
||||
for m in msg:
|
||||
assert m in captured.out
|
||||
|
||||
if verbose >= 2:
|
||||
msg = [
|
||||
"Inner CG solver iteration 1 stopped with",
|
||||
"sum(|residuals|) <= tol",
|
||||
"Line Search",
|
||||
"try line search wolfe1",
|
||||
"wolfe1 line search was successful",
|
||||
]
|
||||
for m in msg:
|
||||
assert m in captured.out
|
||||
|
||||
if verbose >= 2:
|
||||
# Set up a badly scaled singular Hessian with a completely wrong starting
|
||||
# position. This should trigger 2nd line search check
|
||||
A = np.array([[1.0, 2], [2, 4]]) * 1e30 # collinear columns
|
||||
b = np.array([1.0, 2.0])
|
||||
# Note that scipy.optimize._linesearch LineSearchWarning inherits from
|
||||
# RuntimeWarning, but we do not want to import from non public APIs.
|
||||
with pytest.warns(RuntimeWarning):
|
||||
_newton_cg(
|
||||
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
|
||||
func=lambda x: 0.5 * x @ A @ x - b @ x,
|
||||
grad=lambda x: A @ x - b,
|
||||
x0=np.array([-2.0, 1]), # null space of hessian
|
||||
verbose=verbose,
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
msg = [
|
||||
"wolfe1 line search was not successful",
|
||||
"check loss |improvement| <= eps * |loss_old|:",
|
||||
"check sum(|gradient|) < sum(|gradient_old|):",
|
||||
"last resort: try line search wolfe2",
|
||||
]
|
||||
for m in msg:
|
||||
assert m in captured.out
|
||||
|
||||
# Set up a badly conditioned Hessian that leads to tiny curvature.
|
||||
# X.T @ X have singular values array([1.00000400e+01, 1.00008192e-11])
|
||||
A = np.array([[1.0, 2], [1, 2 + 1e-15]])
|
||||
b = np.array([-2.0, 1])
|
||||
with pytest.warns(ConvergenceWarning):
|
||||
_newton_cg(
|
||||
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
|
||||
func=lambda x: 0.5 * x @ A @ x - b @ x,
|
||||
grad=lambda x: A @ x - b,
|
||||
x0=b,
|
||||
verbose=verbose,
|
||||
maxiter=2,
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
msg = [
|
||||
"tiny_|p| = eps * ||p||^2",
|
||||
]
|
||||
for m in msg:
|
||||
assert m in captured.out
|
||||
|
||||
# Test for a case with negative Hessian.
|
||||
# We do not trigger "Inner CG solver iteration {i} stopped with negative
|
||||
# curvature", but that is very hard to trigger.
|
||||
A = np.eye(2)
|
||||
b = np.array([-2.0, 1])
|
||||
with pytest.warns(RuntimeWarning):
|
||||
_newton_cg(
|
||||
# Note the wrong sign in the hessian product.
|
||||
grad_hess=lambda x: (A @ x - b, lambda z: -A @ z),
|
||||
func=lambda x: 0.5 * x @ A @ x - b @ x,
|
||||
grad=lambda x: A @ x - b,
|
||||
x0=np.array([1.0, 1.0]),
|
||||
verbose=verbose,
|
||||
maxiter=3,
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
msg = [
|
||||
"Inner CG solver iteration 0 fell back to steepest descent",
|
||||
]
|
||||
for m in msg:
|
||||
assert m in captured.out
|
||||
|
||||
A = np.diag([1e-3, 1, 1e3])
|
||||
b = np.array([-2.0, 1, 2.0])
|
||||
with pytest.warns(ConvergenceWarning):
|
||||
_newton_cg(
|
||||
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
|
||||
func=lambda x: 0.5 * x @ A @ x - b @ x,
|
||||
grad=lambda x: A @ x - b,
|
||||
x0=np.ones_like(b),
|
||||
verbose=verbose,
|
||||
maxiter=2,
|
||||
maxinner=1,
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
msg = [
|
||||
"Inner CG solver stopped reaching maxiter=1",
|
||||
]
|
||||
for m in msg:
|
||||
assert m in captured.out
|
||||
|
||||
|
||||
def test_check_optimize():
|
||||
# Mock some lbfgs output using a Bunch instance:
|
||||
result = Bunch()
|
||||
|
||||
# First case: no warnings
|
||||
result.nit = 1
|
||||
result.status = 0
|
||||
result.message = "OK"
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error")
|
||||
_check_optimize_result("lbfgs", result)
|
||||
|
||||
# Second case: warning about implicit `max_iter`: do not recommend the user
|
||||
# to increase `max_iter` this is not a user settable parameter.
|
||||
result.status = 1
|
||||
result.message = "STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT"
|
||||
with pytest.warns(ConvergenceWarning) as record:
|
||||
_check_optimize_result("lbfgs", result)
|
||||
|
||||
assert len(record) == 1
|
||||
warn_msg = record[0].message.args[0]
|
||||
assert "lbfgs failed to converge after 1 iteration(s)" in warn_msg
|
||||
assert result.message in warn_msg
|
||||
assert "Increase the number of iterations" not in warn_msg
|
||||
assert "scale the data" in warn_msg
|
||||
|
||||
# Third case: warning about explicit `max_iter`: recommend user to increase
|
||||
# `max_iter`.
|
||||
with pytest.warns(ConvergenceWarning) as record:
|
||||
_check_optimize_result("lbfgs", result, max_iter=1)
|
||||
|
||||
assert len(record) == 1
|
||||
warn_msg = record[0].message.args[0]
|
||||
assert "lbfgs failed to converge after 1 iteration(s)" in warn_msg
|
||||
assert result.message in warn_msg
|
||||
assert "Increase the number of iterations" in warn_msg
|
||||
assert "scale the data" in warn_msg
|
||||
|
||||
# Fourth case: other convergence problem before reaching `max_iter`: do not
|
||||
# recommend increasing `max_iter`.
|
||||
result.nit = 2
|
||||
result.status = 2
|
||||
result.message = "ABNORMAL"
|
||||
with pytest.warns(ConvergenceWarning) as record:
|
||||
_check_optimize_result("lbfgs", result, max_iter=10)
|
||||
|
||||
assert len(record) == 1
|
||||
warn_msg = record[0].message.args[0]
|
||||
assert "lbfgs failed to converge after 2 iteration(s)" in warn_msg
|
||||
assert result.message in warn_msg
|
||||
assert "Increase the number of iterations" not in warn_msg
|
||||
assert "scale the data" in warn_msg
|
||||
@@ -0,0 +1,197 @@
|
||||
import itertools
|
||||
import re
|
||||
import time
|
||||
import warnings
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn import config_context, get_config
|
||||
from sklearn.compose import make_column_transformer
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.utils.fixes import _IS_WASM
|
||||
from sklearn.utils.parallel import Parallel, delayed
|
||||
|
||||
|
||||
def get_working_memory():
|
||||
return get_config()["working_memory"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_jobs", [1, 2])
|
||||
@pytest.mark.parametrize("backend", ["loky", "threading", "multiprocessing"])
|
||||
def test_configuration_passes_through_to_joblib(n_jobs, backend):
|
||||
# Tests that the global global configuration is passed to joblib jobs
|
||||
|
||||
with config_context(working_memory=123):
|
||||
results = Parallel(n_jobs=n_jobs, backend=backend)(
|
||||
delayed(get_working_memory)() for _ in range(2)
|
||||
)
|
||||
|
||||
assert_array_equal(results, [123] * 2)
|
||||
|
||||
|
||||
def test_parallel_delayed_warnings():
|
||||
"""Informative warnings should be raised when mixing sklearn and joblib API"""
|
||||
# We should issue a warning when one wants to use sklearn.utils.fixes.Parallel
|
||||
# with joblib.delayed. The config will not be propagated to the workers.
|
||||
warn_msg = "`sklearn.utils.parallel.Parallel` needs to be used in conjunction"
|
||||
with pytest.warns(UserWarning, match=warn_msg) as records:
|
||||
Parallel()(joblib.delayed(time.sleep)(0) for _ in range(10))
|
||||
assert len(records) == 10
|
||||
|
||||
# We should issue a warning if one wants to use sklearn.utils.fixes.delayed with
|
||||
# joblib.Parallel
|
||||
warn_msg = (
|
||||
"`sklearn.utils.parallel.delayed` should be used with "
|
||||
"`sklearn.utils.parallel.Parallel` to make it possible to propagate"
|
||||
)
|
||||
with pytest.warns(UserWarning, match=warn_msg) as records:
|
||||
joblib.Parallel()(delayed(time.sleep)(0) for _ in range(10))
|
||||
assert len(records) == 10
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_jobs", [1, 2])
|
||||
def test_dispatch_config_parallel(n_jobs):
|
||||
"""Check that we properly dispatch the configuration in parallel processing.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/25239
|
||||
"""
|
||||
pd = pytest.importorskip("pandas")
|
||||
iris = load_iris(as_frame=True)
|
||||
|
||||
class TransformerRequiredDataFrame(StandardScaler):
|
||||
def fit(self, X, y=None):
|
||||
assert isinstance(X, pd.DataFrame), "X should be a DataFrame"
|
||||
return super().fit(X, y)
|
||||
|
||||
def transform(self, X, y=None):
|
||||
assert isinstance(X, pd.DataFrame), "X should be a DataFrame"
|
||||
return super().transform(X, y)
|
||||
|
||||
dropper = make_column_transformer(
|
||||
("drop", [0]),
|
||||
remainder="passthrough",
|
||||
n_jobs=n_jobs,
|
||||
)
|
||||
param_grid = {"randomforestclassifier__max_depth": [1, 2, 3]}
|
||||
search_cv = GridSearchCV(
|
||||
make_pipeline(
|
||||
dropper,
|
||||
TransformerRequiredDataFrame(),
|
||||
RandomForestClassifier(n_estimators=5, n_jobs=n_jobs),
|
||||
),
|
||||
param_grid,
|
||||
cv=5,
|
||||
n_jobs=n_jobs,
|
||||
error_score="raise", # this search should not fail
|
||||
)
|
||||
|
||||
# make sure that `fit` would fail in case we don't request dataframe
|
||||
with pytest.raises(AssertionError, match="X should be a DataFrame"):
|
||||
search_cv.fit(iris.data, iris.target)
|
||||
|
||||
with config_context(transform_output="pandas"):
|
||||
# we expect each intermediate steps to output a DataFrame
|
||||
search_cv.fit(iris.data, iris.target)
|
||||
|
||||
assert not np.isnan(search_cv.cv_results_["mean_test_score"]).any()
|
||||
|
||||
|
||||
def raise_warning():
|
||||
warnings.warn("Convergence warning", ConvergenceWarning)
|
||||
|
||||
|
||||
def _yield_n_jobs_backend_combinations():
|
||||
n_jobs_values = [1, 2]
|
||||
backend_values = ["loky", "threading", "multiprocessing"]
|
||||
for n_jobs, backend in itertools.product(n_jobs_values, backend_values):
|
||||
if n_jobs == 2 and backend == "loky":
|
||||
# XXX Mark thread-unsafe to avoid:
|
||||
# RuntimeError: The executor underlying Parallel has been shutdown.
|
||||
# See https://github.com/joblib/joblib/issues/1743 for more details.
|
||||
yield pytest.param(n_jobs, backend, marks=pytest.mark.thread_unsafe)
|
||||
else:
|
||||
yield n_jobs, backend
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_jobs, backend", _yield_n_jobs_backend_combinations())
|
||||
def test_filter_warning_propagates(n_jobs, backend):
|
||||
"""Check warning propagates to the job."""
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", category=ConvergenceWarning)
|
||||
|
||||
with pytest.raises(ConvergenceWarning):
|
||||
Parallel(n_jobs=n_jobs, backend=backend)(
|
||||
delayed(raise_warning)() for _ in range(2)
|
||||
)
|
||||
|
||||
|
||||
def get_warning_filters():
|
||||
# In free-threading Python >= 3.14, warnings filters are managed through a
|
||||
# ContextVar and warnings.filters is not modified inside a
|
||||
# warnings.catch_warnings context. You need to use warnings._get_filters().
|
||||
# For more details, see
|
||||
# https://docs.python.org/3.14/whatsnew/3.14.html#concurrent-safe-warnings-control
|
||||
filters_func = getattr(warnings, "_get_filters", None)
|
||||
return filters_func() if filters_func is not None else warnings.filters
|
||||
|
||||
|
||||
def test_check_warnings_threading():
|
||||
"""Check that warnings filters are set correctly in the threading backend."""
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", category=ConvergenceWarning)
|
||||
|
||||
main_warning_filters = get_warning_filters()
|
||||
|
||||
assert ("error", None, ConvergenceWarning, None, 0) in main_warning_filters
|
||||
|
||||
all_worker_warning_filters = Parallel(n_jobs=2, backend="threading")(
|
||||
delayed(get_warning_filters)() for _ in range(2)
|
||||
)
|
||||
|
||||
def normalize_main_module(filters):
|
||||
# In Python 3.14 free-threaded, there is a small discrepancy main
|
||||
# warning filters have an entry with module = "__main__" whereas it
|
||||
# is a regex in the workers
|
||||
return [
|
||||
(
|
||||
action,
|
||||
message,
|
||||
type_,
|
||||
module
|
||||
if "__main__" not in str(module)
|
||||
or not isinstance(module, re.Pattern)
|
||||
else module.pattern,
|
||||
lineno,
|
||||
)
|
||||
for action, message, type_, module, lineno in main_warning_filters
|
||||
]
|
||||
|
||||
for worker_warning_filter in all_worker_warning_filters:
|
||||
assert normalize_main_module(
|
||||
worker_warning_filter
|
||||
) == normalize_main_module(main_warning_filters)
|
||||
|
||||
|
||||
@pytest.mark.xfail(_IS_WASM, reason="Pyodide always use the sequential backend")
|
||||
def test_filter_warning_propagates_no_side_effect_with_loky_backend():
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", category=ConvergenceWarning)
|
||||
|
||||
Parallel(n_jobs=2, backend="loky")(delayed(time.sleep)(0) for _ in range(10))
|
||||
|
||||
# Since loky workers are reused, make sure that inside the loky workers,
|
||||
# warnings filters have been reset to their original value. Using joblib
|
||||
# directly should not turn ConvergenceWarning into an error.
|
||||
joblib.Parallel(n_jobs=2, backend="loky")(
|
||||
joblib.delayed(warnings.warn)("Convergence warning", ConvergenceWarning)
|
||||
for _ in range(10)
|
||||
)
|
||||
@@ -0,0 +1,786 @@
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import csr_matrix
|
||||
|
||||
from sklearn._config import config_context, get_config
|
||||
from sklearn.base import BaseEstimator, _fit_context
|
||||
from sklearn.model_selection import LeaveOneOut
|
||||
from sklearn.utils import deprecated
|
||||
from sklearn.utils._param_validation import (
|
||||
HasMethods,
|
||||
Hidden,
|
||||
Interval,
|
||||
InvalidParameterError,
|
||||
MissingValues,
|
||||
Options,
|
||||
RealNotInt,
|
||||
StrOptions,
|
||||
_ArrayLikes,
|
||||
_Booleans,
|
||||
_Callables,
|
||||
_CVObjects,
|
||||
_InstancesOf,
|
||||
_IterablesNotString,
|
||||
_NanConstraint,
|
||||
_NoneConstraint,
|
||||
_PandasNAConstraint,
|
||||
_RandomStates,
|
||||
_SparseMatrices,
|
||||
_VerboseHelper,
|
||||
generate_invalid_param_val,
|
||||
generate_valid_param,
|
||||
make_constraint,
|
||||
validate_params,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
|
||||
# Some helpers for the tests
|
||||
@validate_params(
|
||||
{"a": [Real], "b": [Real], "c": [Real], "d": [Real]},
|
||||
prefer_skip_nested_validation=True,
|
||||
)
|
||||
def _func(a, b=0, *args, c, d=0, **kwargs):
|
||||
"""A function to test the validation of functions."""
|
||||
|
||||
|
||||
class _Class:
|
||||
"""A class to test the _InstancesOf constraint and the validation of methods."""
|
||||
|
||||
@validate_params({"a": [Real]}, prefer_skip_nested_validation=True)
|
||||
def _method(self, a):
|
||||
"""A validated method"""
|
||||
|
||||
@deprecated()
|
||||
@validate_params({"a": [Real]}, prefer_skip_nested_validation=True)
|
||||
def _deprecated_method(self, a):
|
||||
"""A deprecated validated method"""
|
||||
|
||||
|
||||
class _Estimator(BaseEstimator):
|
||||
"""An estimator to test the validation of estimator parameters."""
|
||||
|
||||
_parameter_constraints: dict = {"a": [Real]}
|
||||
|
||||
def __init__(self, a):
|
||||
self.a = a
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X=None, y=None):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("interval_type", [Integral, Real])
|
||||
def test_interval_range(interval_type):
|
||||
"""Check the range of values depending on closed."""
|
||||
interval = Interval(interval_type, -2, 2, closed="left")
|
||||
assert -2 in interval
|
||||
assert 2 not in interval
|
||||
|
||||
interval = Interval(interval_type, -2, 2, closed="right")
|
||||
assert -2 not in interval
|
||||
assert 2 in interval
|
||||
|
||||
interval = Interval(interval_type, -2, 2, closed="both")
|
||||
assert -2 in interval
|
||||
assert 2 in interval
|
||||
|
||||
interval = Interval(interval_type, -2, 2, closed="neither")
|
||||
assert -2 not in interval
|
||||
assert 2 not in interval
|
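# Expository sketch (added for illustration, not part of the original commit): the
# `closed` argument of `Interval` maps to standard interval notation, which is what
# the test above exercises. It relies only on names already imported in this module.
def _example_interval_notation():
    assert -2 in Interval(Real, -2, 2, closed="left")  # [-2, 2)
    assert 2 in Interval(Real, -2, 2, closed="right")  # (-2, 2]
    assert -2 in Interval(Real, -2, 2, closed="both")  # [-2, 2]
    assert 2 not in Interval(Real, -2, 2, closed="neither")  # (-2, 2)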
||||
|
||||
|
||||
@pytest.mark.parametrize("interval_type", [Integral, Real])
|
||||
def test_interval_large_integers(interval_type):
|
||||
"""Check that Interval constraint work with large integers.
|
||||
|
||||
Non-regression test for #26648.
|
||||
"""
|
||||
interval = Interval(interval_type, 0, 2, closed="neither")
|
||||
assert 2**65 not in interval
|
||||
assert 2**128 not in interval
|
||||
assert float(2**65) not in interval
|
||||
assert float(2**128) not in interval
|
||||
|
||||
interval = Interval(interval_type, 0, 2**128, closed="neither")
|
||||
assert 2**65 in interval
|
||||
assert 2**128 not in interval
|
||||
assert float(2**65) in interval
|
||||
assert float(2**128) not in interval
|
||||
|
||||
assert 2**1024 not in interval
|
||||
|
||||
|
||||
def test_interval_inf_in_bounds():
|
||||
"""Check that inf is included iff a bound is closed and set to None.
|
||||
|
||||
Only valid for real intervals.
|
||||
"""
|
||||
interval = Interval(Real, 0, None, closed="right")
|
||||
assert np.inf in interval
|
||||
|
||||
interval = Interval(Real, None, 0, closed="left")
|
||||
assert -np.inf in interval
|
||||
|
||||
interval = Interval(Real, None, None, closed="neither")
|
||||
assert np.inf not in interval
|
||||
assert -np.inf not in interval
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"interval",
|
||||
[Interval(Real, 0, 1, closed="left"), Interval(Real, None, None, closed="both")],
|
||||
)
|
||||
def test_nan_not_in_interval(interval):
|
||||
"""Check that np.nan is not in any interval."""
|
||||
assert np.nan not in interval
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, error, match",
|
||||
[
|
||||
(
|
||||
{"type": Integral, "left": 1.0, "right": 2, "closed": "both"},
|
||||
TypeError,
|
||||
r"Expecting left to be an int for an interval over the integers",
|
||||
),
|
||||
(
|
||||
{"type": Integral, "left": 1, "right": 2.0, "closed": "neither"},
|
||||
TypeError,
|
||||
"Expecting right to be an int for an interval over the integers",
|
||||
),
|
||||
(
|
||||
{"type": Integral, "left": None, "right": 0, "closed": "left"},
|
||||
ValueError,
|
||||
r"left can't be None when closed == left",
|
||||
),
|
||||
(
|
||||
{"type": Integral, "left": 0, "right": None, "closed": "right"},
|
||||
ValueError,
|
||||
r"right can't be None when closed == right",
|
||||
),
|
||||
(
|
||||
{"type": Integral, "left": 1, "right": -1, "closed": "both"},
|
||||
ValueError,
|
||||
r"right can't be less than left",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_interval_errors(params, error, match):
|
||||
"""Check that informative errors are raised for invalid combination of parameters"""
|
||||
with pytest.raises(error, match=match):
|
||||
Interval(**params)
|
||||
|
||||
|
||||
def test_stroptions():
|
||||
"""Sanity check for the StrOptions constraint"""
|
||||
options = StrOptions({"a", "b", "c"}, deprecated={"c"})
|
||||
assert options.is_satisfied_by("a")
|
||||
assert options.is_satisfied_by("c")
|
||||
assert not options.is_satisfied_by("d")
|
||||
|
||||
assert "'c' (deprecated)" in str(options)
|
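# Expository sketch (added for illustration, not part of the original commit): in
# practice StrOptions is used inside a @validate_params specification. The parameter
# name `solver`, the option set, and the function below are hypothetical.
@validate_params(
    {"solver": [StrOptions({"auto", "lbfgs"}, deprecated={"lbfgs"})]},
    prefer_skip_nested_validation=True,
)
def _example_solver_option(solver="auto"):
    """Calling this with a string outside the declared set raises InvalidParameterError."""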
||||
|
||||
|
||||
def test_options():
|
||||
"""Sanity check for the Options constraint"""
|
||||
options = Options(Real, {-0.5, 0.5, np.inf}, deprecated={-0.5})
|
||||
assert options.is_satisfied_by(-0.5)
|
||||
assert options.is_satisfied_by(np.inf)
|
||||
assert not options.is_satisfied_by(1.23)
|
||||
|
||||
assert "-0.5 (deprecated)" in str(options)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"type, expected_type_name",
|
||||
[
|
||||
(int, "int"),
|
||||
(Integral, "int"),
|
||||
(Real, "float"),
|
||||
(np.ndarray, "numpy.ndarray"),
|
||||
],
|
||||
)
|
||||
def test_instances_of_type_human_readable(type, expected_type_name):
|
||||
"""Check the string representation of the _InstancesOf constraint."""
|
||||
constraint = _InstancesOf(type)
|
||||
assert str(constraint) == f"an instance of '{expected_type_name}'"
|
||||
|
||||
|
||||
def test_hasmethods():
|
||||
"""Check the HasMethods constraint."""
|
||||
constraint = HasMethods(["a", "b"])
|
||||
|
||||
class _Good:
|
||||
def a(self):
|
||||
pass # pragma: no cover
|
||||
|
||||
def b(self):
|
||||
pass # pragma: no cover
|
||||
|
||||
class _Bad:
|
||||
def a(self):
|
||||
pass # pragma: no cover
|
||||
|
||||
assert constraint.is_satisfied_by(_Good())
|
||||
assert not constraint.is_satisfied_by(_Bad())
|
||||
assert str(constraint) == "an object implementing 'a' and 'b'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"constraint",
|
||||
[
|
||||
Interval(Real, None, 0, closed="left"),
|
||||
Interval(Real, 0, None, closed="left"),
|
||||
Interval(Real, None, None, closed="neither"),
|
||||
StrOptions({"a", "b", "c"}),
|
||||
MissingValues(),
|
||||
MissingValues(numeric_only=True),
|
||||
_VerboseHelper(),
|
||||
HasMethods("fit"),
|
||||
_IterablesNotString(),
|
||||
_CVObjects(),
|
||||
],
|
||||
)
|
||||
def test_generate_invalid_param_val(constraint):
|
||||
"""Check that the value generated does not satisfy the constraint"""
|
||||
bad_value = generate_invalid_param_val(constraint)
|
||||
assert not constraint.is_satisfied_by(bad_value)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"integer_interval, real_interval",
|
||||
[
|
||||
(
|
||||
Interval(Integral, None, 3, closed="right"),
|
||||
Interval(RealNotInt, -5, 5, closed="both"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, None, 3, closed="right"),
|
||||
Interval(RealNotInt, -5, 5, closed="neither"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, None, 3, closed="right"),
|
||||
Interval(RealNotInt, 4, 5, closed="both"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, None, 3, closed="right"),
|
||||
Interval(RealNotInt, 5, None, closed="left"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, None, 3, closed="right"),
|
||||
Interval(RealNotInt, 4, None, closed="neither"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, 3, None, closed="left"),
|
||||
Interval(RealNotInt, -5, 5, closed="both"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, 3, None, closed="left"),
|
||||
Interval(RealNotInt, -5, 5, closed="neither"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, 3, None, closed="left"),
|
||||
Interval(RealNotInt, 1, 2, closed="both"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, 3, None, closed="left"),
|
||||
Interval(RealNotInt, None, -5, closed="left"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, 3, None, closed="left"),
|
||||
Interval(RealNotInt, None, -4, closed="neither"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, -5, 5, closed="both"),
|
||||
Interval(RealNotInt, None, 1, closed="right"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, -5, 5, closed="both"),
|
||||
Interval(RealNotInt, 1, None, closed="left"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, -5, 5, closed="both"),
|
||||
Interval(RealNotInt, -10, -4, closed="neither"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, -5, 5, closed="both"),
|
||||
Interval(RealNotInt, -10, -4, closed="right"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, -5, 5, closed="neither"),
|
||||
Interval(RealNotInt, 6, 10, closed="neither"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, -5, 5, closed="neither"),
|
||||
Interval(RealNotInt, 6, 10, closed="left"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, 2, None, closed="left"),
|
||||
Interval(RealNotInt, 0, 1, closed="both"),
|
||||
),
|
||||
(
|
||||
Interval(Integral, 1, None, closed="left"),
|
||||
Interval(RealNotInt, 0, 1, closed="both"),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_generate_invalid_param_val_2_intervals(integer_interval, real_interval):
|
||||
"""Check that the value generated for an interval constraint does not satisfy any of
|
||||
the interval constraints.
|
||||
"""
|
||||
bad_value = generate_invalid_param_val(constraint=real_interval)
|
||||
assert not real_interval.is_satisfied_by(bad_value)
|
||||
assert not integer_interval.is_satisfied_by(bad_value)
|
||||
|
||||
bad_value = generate_invalid_param_val(constraint=integer_interval)
|
||||
assert not real_interval.is_satisfied_by(bad_value)
|
||||
assert not integer_interval.is_satisfied_by(bad_value)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"constraint",
|
||||
[
|
||||
_ArrayLikes(),
|
||||
_InstancesOf(list),
|
||||
_Callables(),
|
||||
_NoneConstraint(),
|
||||
_RandomStates(),
|
||||
_SparseMatrices(),
|
||||
_Booleans(),
|
||||
Interval(Integral, None, None, closed="neither"),
|
||||
],
|
||||
)
|
||||
def test_generate_invalid_param_val_all_valid(constraint):
|
||||
"""Check that the function raises NotImplementedError when there's no invalid value
|
||||
for the constraint.
|
||||
"""
|
||||
with pytest.raises(NotImplementedError):
|
||||
generate_invalid_param_val(constraint)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"constraint",
|
||||
[
|
||||
_ArrayLikes(),
|
||||
_Callables(),
|
||||
_InstancesOf(list),
|
||||
_NoneConstraint(),
|
||||
_RandomStates(),
|
||||
_SparseMatrices(),
|
||||
_Booleans(),
|
||||
_VerboseHelper(),
|
||||
MissingValues(),
|
||||
MissingValues(numeric_only=True),
|
||||
StrOptions({"a", "b", "c"}),
|
||||
Options(Integral, {1, 2, 3}),
|
||||
Interval(Integral, None, None, closed="neither"),
|
||||
Interval(Integral, 0, 10, closed="neither"),
|
||||
Interval(Integral, 0, None, closed="neither"),
|
||||
Interval(Integral, None, 0, closed="neither"),
|
||||
Interval(Real, 0, 1, closed="neither"),
|
||||
Interval(Real, 0, None, closed="both"),
|
||||
Interval(Real, None, 0, closed="right"),
|
||||
HasMethods("fit"),
|
||||
_IterablesNotString(),
|
||||
_CVObjects(),
|
||||
],
|
||||
)
|
||||
def test_generate_valid_param(constraint):
|
||||
"""Check that the value generated does satisfy the constraint."""
|
||||
value = generate_valid_param(constraint)
|
||||
assert constraint.is_satisfied_by(value)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"constraint_declaration, value",
|
||||
[
|
||||
(Interval(Real, 0, 1, closed="both"), 0.42),
|
||||
(Interval(Integral, 0, None, closed="neither"), 42),
|
||||
(StrOptions({"a", "b", "c"}), "b"),
|
||||
(Options(type, {np.float32, np.float64}), np.float64),
|
||||
(callable, lambda x: x + 1),
|
||||
(None, None),
|
||||
("array-like", [[1, 2], [3, 4]]),
|
||||
("array-like", np.array([[1, 2], [3, 4]])),
|
||||
("sparse matrix", csr_matrix([[1, 2], [3, 4]])),
|
||||
*[
|
||||
("sparse matrix", container([[1, 2], [3, 4]]))
|
||||
for container in CSR_CONTAINERS
|
||||
],
|
||||
("random_state", 0),
|
||||
("random_state", np.random.RandomState(0)),
|
||||
("random_state", None),
|
||||
(_Class, _Class()),
|
||||
(int, 1),
|
||||
(Real, 0.5),
|
||||
("boolean", False),
|
||||
("verbose", 1),
|
||||
("nan", np.nan),
|
||||
(MissingValues(), -1),
|
||||
(MissingValues(), -1.0),
|
||||
(MissingValues(), 2**1028),
|
||||
(MissingValues(), None),
|
||||
(MissingValues(), float("nan")),
|
||||
(MissingValues(), np.nan),
|
||||
(MissingValues(), "missing"),
|
||||
(HasMethods("fit"), _Estimator(a=0)),
|
||||
("cv_object", 5),
|
||||
],
|
||||
)
|
||||
def test_is_satisfied_by(constraint_declaration, value):
|
||||
"""Sanity check for the is_satisfied_by method"""
|
||||
constraint = make_constraint(constraint_declaration)
|
||||
assert constraint.is_satisfied_by(value)
|
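# Expository sketch (added for illustration, not part of the original commit): a
# parameter value is accepted as soon as one of its declared constraints is
# satisfied, which roughly mirrors what @validate_params does internally.
def _example_param_is_valid(value, constraint_declarations):
    return any(
        make_constraint(declaration).is_satisfied_by(value)
        for declaration in constraint_declarations
    )
# Example: _example_param_is_valid(0.5, [Interval(Real, 0, 1, closed="both"), None]) is True.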
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"constraint_declaration, expected_constraint_class",
|
||||
[
|
||||
(Interval(Real, 0, 1, closed="both"), Interval),
|
||||
(StrOptions({"option1", "option2"}), StrOptions),
|
||||
(Options(Real, {0.42, 1.23}), Options),
|
||||
("array-like", _ArrayLikes),
|
||||
("sparse matrix", _SparseMatrices),
|
||||
("random_state", _RandomStates),
|
||||
(None, _NoneConstraint),
|
||||
(callable, _Callables),
|
||||
(int, _InstancesOf),
|
||||
("boolean", _Booleans),
|
||||
("verbose", _VerboseHelper),
|
||||
(MissingValues(numeric_only=True), MissingValues),
|
||||
(HasMethods("fit"), HasMethods),
|
||||
("cv_object", _CVObjects),
|
||||
("nan", _NanConstraint),
|
||||
(np.nan, _NanConstraint),
|
||||
],
|
||||
)
|
||||
def test_make_constraint(constraint_declaration, expected_constraint_class):
|
||||
"""Check that make_constraint dispatches to the appropriate constraint class"""
|
||||
constraint = make_constraint(constraint_declaration)
|
||||
assert constraint.__class__ is expected_constraint_class
|
||||
|
||||
|
||||
def test_make_constraint_unknown():
|
||||
"""Check that an informative error is raised when an unknown constraint is passed"""
|
||||
with pytest.raises(ValueError, match="Unknown constraint"):
|
||||
make_constraint("not a valid constraint")
|
||||
|
||||
|
||||
def test_validate_params():
|
||||
"""Check that validate_params works no matter how the arguments are passed"""
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'a' parameter of _func must be"
|
||||
):
|
||||
_func("wrong", c=1)
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'b' parameter of _func must be"
|
||||
):
|
||||
_func(*[1, "wrong"], c=1)
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'c' parameter of _func must be"
|
||||
):
|
||||
_func(1, **{"c": "wrong"})
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'd' parameter of _func must be"
|
||||
):
|
||||
_func(1, c=1, d="wrong")
|
||||
|
||||
# check in the presence of extra positional and keyword args
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'b' parameter of _func must be"
|
||||
):
|
||||
_func(0, *["wrong", 2, 3], c=4, **{"e": 5})
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'c' parameter of _func must be"
|
||||
):
|
||||
_func(0, *[1, 2, 3], c="four", **{"e": 5})
|
||||
|
||||
|
||||
def test_validate_params_missing_params():
|
||||
"""Check that no error is raised when there are parameters without
|
||||
constraints
|
||||
"""
|
||||
|
||||
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
|
||||
def func(a, b):
|
||||
pass
|
||||
|
||||
func(1, 2)
|
||||
|
||||
|
||||
def test_decorate_validated_function():
|
||||
"""Check that validate_params functions can be decorated"""
|
||||
decorated_function = deprecated()(_func)
|
||||
|
||||
with pytest.warns(FutureWarning, match="Function _func is deprecated"):
|
||||
decorated_function(1, 2, c=3)
|
||||
|
||||
# outer decorator does not interfere with validation
|
||||
with pytest.warns(FutureWarning, match="Function _func is deprecated"):
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match=r"The 'c' parameter of _func must be"
|
||||
):
|
||||
decorated_function(1, 2, c="wrong")
|
||||
|
||||
|
||||
def test_validate_params_method():
|
||||
"""Check that validate_params works with methods"""
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'a' parameter of _Class._method must be"
|
||||
):
|
||||
_Class()._method("wrong")
|
||||
|
||||
# validated method can be decorated
|
||||
with pytest.warns(FutureWarning, match="Function _deprecated_method is deprecated"):
|
||||
with pytest.raises(
|
||||
InvalidParameterError,
|
||||
match="The 'a' parameter of _Class._deprecated_method must be",
|
||||
):
|
||||
_Class()._deprecated_method("wrong")
|
||||
|
||||
|
||||
def test_validate_params_estimator():
|
||||
"""Check that validate_params works with Estimator instances"""
|
||||
# no validation in init
|
||||
est = _Estimator("wrong")
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'a' parameter of _Estimator must be"
|
||||
):
|
||||
est.fit()
|
||||
|
||||
|
||||
def test_stroptions_deprecated_subset():
|
||||
"""Check that the deprecated parameter must be a subset of options."""
|
||||
with pytest.raises(ValueError, match="deprecated options must be a subset"):
|
||||
StrOptions({"a", "b", "c"}, deprecated={"a", "d"})
|
||||
|
||||
|
||||
def test_hidden_constraint():
|
||||
"""Check that internal constraints are not exposed in the error message."""
|
||||
|
||||
@validate_params(
|
||||
{"param": [Hidden(list), dict]}, prefer_skip_nested_validation=True
|
||||
)
|
||||
def f(param):
|
||||
pass
|
||||
|
||||
# list and dict are valid params
|
||||
f({"a": 1, "b": 2, "c": 3})
|
||||
f([1, 2, 3])
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'param' parameter"
|
||||
) as exc_info:
|
||||
f(param="bad")
|
||||
|
||||
# the list option is not exposed in the error message
|
||||
err_msg = str(exc_info.value)
|
||||
assert "an instance of 'dict'" in err_msg
|
||||
assert "an instance of 'list'" not in err_msg
|
||||
|
||||
|
||||
def test_hidden_stroptions():
|
||||
"""Check that we can have 2 StrOptions constraints, one being hidden."""
|
||||
|
||||
@validate_params(
|
||||
{"param": [StrOptions({"auto"}), Hidden(StrOptions({"warn"}))]},
|
||||
prefer_skip_nested_validation=True,
|
||||
)
|
||||
def f(param):
|
||||
pass
|
||||
|
||||
# "auto" and "warn" are valid params
|
||||
f("auto")
|
||||
f("warn")
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'param' parameter"
|
||||
) as exc_info:
|
||||
f(param="bad")
|
||||
|
||||
# the "warn" option is not exposed in the error message
|
||||
err_msg = str(exc_info.value)
|
||||
assert "auto" in err_msg
|
||||
assert "warn" not in err_msg
|
||||
|
||||
|
||||
def test_validate_params_set_param_constraints_attribute():
|
||||
"""Check that the validate_params decorator properly sets the parameter constraints
|
||||
as an attribute of the decorated function/method.
|
||||
"""
|
||||
assert hasattr(_func, "_skl_parameter_constraints")
|
||||
assert hasattr(_Class()._method, "_skl_parameter_constraints")
|
||||
|
||||
|
||||
def test_boolean_constraint_deprecated_int():
|
||||
"""Check that validate_params raise a deprecation message but still passes
|
||||
validation when using an int for a parameter accepting a boolean.
|
||||
"""
|
||||
|
||||
@validate_params({"param": ["boolean"]}, prefer_skip_nested_validation=True)
|
||||
def f(param):
|
||||
pass
|
||||
|
||||
# True/False and np.bool_(True/False) are valid params
|
||||
f(True)
|
||||
f(np.bool_(False))
|
||||
|
||||
|
||||
def test_no_validation():
|
||||
"""Check that validation can be skipped for a parameter."""
|
||||
|
||||
@validate_params(
|
||||
{"param1": [int, None], "param2": "no_validation"},
|
||||
prefer_skip_nested_validation=True,
|
||||
)
|
||||
def f(param1=None, param2=None):
|
||||
pass
|
||||
|
||||
# param1 is validated
|
||||
with pytest.raises(InvalidParameterError, match="The 'param1' parameter"):
|
||||
f(param1="wrong")
|
||||
|
||||
# param2 is not validated: any type is valid.
|
||||
class SomeType:
|
||||
pass
|
||||
|
||||
f(param2=SomeType)
|
||||
f(param2=SomeType())
|
||||
|
||||
|
||||
def test_pandas_na_constraint_with_pd_na():
|
||||
"""Add a specific test for checking support for `pandas.NA`."""
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
na_constraint = _PandasNAConstraint()
|
||||
assert na_constraint.is_satisfied_by(pd.NA)
|
||||
assert not na_constraint.is_satisfied_by(np.array([1, 2, 3]))
|
||||
|
||||
|
||||
def test_iterable_not_string():
|
||||
"""Check that a string does not satisfy the _IterableNotString constraint."""
|
||||
constraint = _IterablesNotString()
|
||||
assert constraint.is_satisfied_by([1, 2, 3])
|
||||
assert constraint.is_satisfied_by(range(10))
|
||||
assert not constraint.is_satisfied_by("some string")
|
||||
|
||||
|
||||
def test_cv_objects():
|
||||
"""Check that the _CVObjects constraint accepts all current ways
|
||||
to pass cv objects."""
|
||||
constraint = _CVObjects()
|
||||
assert constraint.is_satisfied_by(5)
|
||||
assert constraint.is_satisfied_by(LeaveOneOut())
|
||||
assert constraint.is_satisfied_by([([1, 2], [3, 4]), ([3, 4], [1, 2])])
|
||||
assert constraint.is_satisfied_by(None)
|
||||
assert not constraint.is_satisfied_by("not a CV object")
|
||||
|
||||
|
||||
def test_third_party_estimator():
|
||||
"""Check that the validation from a scikit-learn estimator inherited by a third
|
||||
party estimator does not impose a match between the dict of constraints and the
|
||||
parameters of the estimator.
|
||||
"""
|
||||
|
||||
class ThirdPartyEstimator(_Estimator):
|
||||
def __init__(self, b):
|
||||
self.b = b
|
||||
super().__init__(a=0)
|
||||
|
||||
def fit(self, X=None, y=None):
|
||||
super().fit(X, y)
|
||||
|
||||
# does not raise, even though "b" is not in the constraints dict and "a" is not
|
||||
# a parameter of the estimator.
|
||||
ThirdPartyEstimator(b=0).fit()
|
||||
|
||||
|
||||
def test_interval_real_not_int():
|
||||
"""Check for the type RealNotInt in the Interval constraint."""
|
||||
constraint = Interval(RealNotInt, 0, 1, closed="both")
|
||||
assert constraint.is_satisfied_by(1.0)
|
||||
assert not constraint.is_satisfied_by(1)
|
||||
|
||||
|
||||
def test_real_not_int():
|
||||
"""Check for the RealNotInt type."""
|
||||
assert isinstance(1.0, RealNotInt)
|
||||
assert not isinstance(1, RealNotInt)
|
||||
assert isinstance(np.float64(1), RealNotInt)
|
||||
assert not isinstance(np.int64(1), RealNotInt)
|
||||
|
||||
|
||||
def test_skip_param_validation():
|
||||
"""Check that param validation can be skipped using config_context."""
|
||||
|
||||
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
|
||||
def f(a):
|
||||
pass
|
||||
|
||||
with pytest.raises(InvalidParameterError, match="The 'a' parameter"):
|
||||
f(a="1")
|
||||
|
||||
# does not raise
|
||||
with config_context(skip_parameter_validation=True):
|
||||
f(a="1")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("prefer_skip_nested_validation", [True, False])
|
||||
def test_skip_nested_validation(prefer_skip_nested_validation):
|
||||
"""Check that nested validation can be skipped."""
|
||||
|
||||
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
|
||||
def f(a):
|
||||
pass
|
||||
|
||||
@validate_params(
|
||||
{"b": [int]},
|
||||
prefer_skip_nested_validation=prefer_skip_nested_validation,
|
||||
)
|
||||
def g(b):
|
||||
# calls f with a bad parameter type
|
||||
return f(a="invalid_param_value")
|
||||
|
||||
# Validation for g is never skipped.
|
||||
with pytest.raises(InvalidParameterError, match="The 'b' parameter"):
|
||||
g(b="invalid_param_value")
|
||||
|
||||
if prefer_skip_nested_validation:
|
||||
g(b=1) # does not raise because inner f is not validated
|
||||
else:
|
||||
with pytest.raises(InvalidParameterError, match="The 'a' parameter"):
|
||||
g(b=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"skip_parameter_validation, prefer_skip_nested_validation, expected_skipped",
|
||||
[
|
||||
(True, True, True),
|
||||
(True, False, True),
|
||||
(False, True, True),
|
||||
(False, False, False),
|
||||
],
|
||||
)
|
||||
def test_skip_nested_validation_and_config_context(
|
||||
skip_parameter_validation, prefer_skip_nested_validation, expected_skipped
|
||||
):
|
||||
"""Check interaction between global skip and local skip."""
|
||||
|
||||
@validate_params(
|
||||
{"a": [int]}, prefer_skip_nested_validation=prefer_skip_nested_validation
|
||||
)
|
||||
def g(a):
|
||||
return get_config()["skip_parameter_validation"]
|
||||
|
||||
with config_context(skip_parameter_validation=skip_parameter_validation):
|
||||
actual_skipped = g(1)
|
||||
|
||||
assert actual_skipped == expected_skipped
|
||||
@@ -0,0 +1,543 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.utils._plotting import (
|
||||
_BinaryClassifierCurveDisplayMixin,
|
||||
_deprecate_estimator_name,
|
||||
_despine,
|
||||
_interval_max_min_ratio,
|
||||
_validate_score_name,
|
||||
_validate_style_kwargs,
|
||||
)
|
||||
from sklearn.utils._response import _get_response_values_binary
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
|
||||
@pytest.mark.parametrize("ax", [None, "Ax"])
|
||||
@pytest.mark.parametrize(
|
||||
"name, expected_name_out", [(None, "TestEstimator"), ("CustomName", "CustomName")]
|
||||
)
|
||||
def test_validate_plot_params(pyplot, ax, name, expected_name_out):
|
||||
"""Check `_validate_plot_params` returns the correct values."""
|
||||
display = _BinaryClassifierCurveDisplayMixin()
|
||||
display.estimator_name = "TestEstimator"
|
||||
if ax:
|
||||
_, ax = pyplot.subplots()
|
||||
ax_out, _, name_out = display._validate_plot_params(ax=ax, name=name)
|
||||
|
||||
assert name_out == expected_name_out
|
||||
|
||||
if ax:
|
||||
assert ax == ax_out
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pos_label", [None, 0])
|
||||
@pytest.mark.parametrize("name", [None, "CustomName"])
|
||||
@pytest.mark.parametrize(
|
||||
"response_method", ["auto", "predict_proba", "decision_function"]
|
||||
)
|
||||
def test_validate_and_get_response_values(pyplot, pos_label, name, response_method):
|
||||
"""Check `_validate_and_get_response_values` returns the correct values."""
|
||||
X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]])
|
||||
y = np.array([0, 0, 2, 2])
|
||||
estimator = LogisticRegression().fit(X, y)
|
||||
|
||||
y_pred, pos_label, name_out = (
|
||||
_BinaryClassifierCurveDisplayMixin._validate_and_get_response_values(
|
||||
estimator,
|
||||
X,
|
||||
y,
|
||||
response_method=response_method,
|
||||
pos_label=pos_label,
|
||||
name=name,
|
||||
)
|
||||
)
|
||||
|
||||
expected_y_pred, expected_pos_label = _get_response_values_binary(
|
||||
estimator, X, response_method=response_method, pos_label=pos_label
|
||||
)
|
||||
|
||||
assert_allclose(y_pred, expected_y_pred)
|
||||
assert pos_label == expected_pos_label
|
||||
|
||||
# Check name is handled correctly
|
||||
expected_name = name if name is not None else "LogisticRegression"
|
||||
assert name_out == expected_name
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"y_true, error_message",
|
||||
[
|
||||
(np.array([0, 1, 2]), "The target y is not binary."),
|
||||
(np.array([0, 1]), "Found input variables with inconsistent"),
|
||||
(np.array([0, 2, 0, 2]), r"y_true takes value in \{0, 2\} and pos_label"),
|
||||
],
|
||||
)
|
||||
def test_validate_from_predictions_params_errors(pyplot, y_true, error_message):
|
||||
"""Check `_validate_from_predictions_params` raises the correct errors."""
|
||||
y_pred = np.array([0.1, 0.2, 0.3, 0.4])
|
||||
sample_weight = np.ones(4)
|
||||
|
||||
with pytest.raises(ValueError, match=error_message):
|
||||
_BinaryClassifierCurveDisplayMixin._validate_from_predictions_params(
|
||||
y_true=y_true,
|
||||
y_pred=y_pred,
|
||||
sample_weight=sample_weight,
|
||||
pos_label=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("name", [None, "CustomName"])
|
||||
@pytest.mark.parametrize(
|
||||
"pos_label, y_true",
|
||||
[
|
||||
(None, np.array([0, 1, 0, 1])),
|
||||
(2, np.array([0, 2, 0, 2])),
|
||||
],
|
||||
)
|
||||
def test_validate_from_predictions_params_returns(pyplot, name, pos_label, y_true):
|
||||
"""Check `_validate_from_predictions_params` returns the correct values."""
|
||||
y_pred = np.array([0.1, 0.2, 0.3, 0.4])
|
||||
pos_label_out, name_out = (
|
||||
_BinaryClassifierCurveDisplayMixin._validate_from_predictions_params(
|
||||
y_true=y_true,
|
||||
y_pred=y_pred,
|
||||
sample_weight=None,
|
||||
pos_label=pos_label,
|
||||
name=name,
|
||||
)
|
||||
)
|
||||
|
||||
# Check name is handled correctly
|
||||
expected_name = name if name is not None else "Classifier"
|
||||
assert name_out == expected_name
|
||||
|
||||
# Check pos_label is handled correctly
|
||||
expected_pos_label = pos_label if pos_label is not None else 1
|
||||
assert pos_label_out == expected_pos_label
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, err_msg",
|
||||
[
|
||||
(
|
||||
{
|
||||
# Missing "indices" key
|
||||
"cv_results": {"estimator": "dummy"},
|
||||
"X": np.array([[1, 2], [3, 4]]),
|
||||
"y": np.array([0, 1]),
|
||||
"sample_weight": None,
|
||||
},
|
||||
"`cv_results` does not contain one of the following",
|
||||
),
|
||||
(
|
||||
{
|
||||
"cv_results": {
|
||||
"estimator": "dummy",
|
||||
"indices": {"test": [[1, 2], [1, 2]], "train": [[3, 4], [3, 4]]},
|
||||
},
|
||||
# `X` wrong length
|
||||
"X": np.array([[1, 2]]),
|
||||
"y": np.array([0, 1]),
|
||||
"sample_weight": None,
|
||||
},
|
||||
"`X` does not contain the correct number of",
|
||||
),
|
||||
(
|
||||
{
|
||||
"cv_results": {
|
||||
"estimator": "dummy",
|
||||
"indices": {"test": [[1, 2], [1, 2]], "train": [[3, 4], [3, 4]]},
|
||||
},
|
||||
"X": np.array([1, 2, 3, 4]),
|
||||
# `y` not binary
|
||||
"y": np.array([0, 2, 1, 3]),
|
||||
"sample_weight": None,
|
||||
},
|
||||
"The target `y` is not binary",
|
||||
),
|
||||
(
|
||||
{
|
||||
"cv_results": {
|
||||
"estimator": "dummy",
|
||||
"indices": {"test": [[1, 2], [1, 2]], "train": [[3, 4], [3, 4]]},
|
||||
},
|
||||
"X": np.array([1, 2, 3, 4]),
|
||||
"y": np.array([0, 1, 0, 1]),
|
||||
# `sample_weight` wrong length
|
||||
"sample_weight": np.array([0.5]),
|
||||
},
|
||||
"Found input variables with inconsistent",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_validate_from_cv_results_params(pyplot, params, err_msg):
|
||||
"""Check parameter validation is performed correctly."""
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_BinaryClassifierCurveDisplayMixin()._validate_from_cv_results_params(**params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"curve_legend_metric, curve_name, expected_label",
|
||||
[
|
||||
(0.85, None, "AUC = 0.85"),
|
||||
(None, "Model A", "Model A"),
|
||||
(0.95, "Random Forest", "Random Forest (AUC = 0.95)"),
|
||||
(None, None, None),
|
||||
],
|
||||
)
|
||||
def test_get_legend_label(curve_legend_metric, curve_name, expected_label):
|
||||
"""Check `_get_legend_label` returns the correct label."""
|
||||
legend_metric_name = "AUC"
|
||||
label = _BinaryClassifierCurveDisplayMixin._get_legend_label(
|
||||
curve_legend_metric, curve_name, legend_metric_name
|
||||
)
|
||||
assert label == expected_label
|
||||
|
||||
|
||||
# TODO(1.9) : Remove
|
||||
@pytest.mark.parametrize("curve_kwargs", [{"alpha": 1.0}, None])
|
||||
@pytest.mark.parametrize("kwargs", [{}, {"alpha": 1.0}])
|
||||
def test_validate_curve_kwargs_deprecate_kwargs(curve_kwargs, kwargs):
|
||||
"""Check `_validate_curve_kwargs` deprecates kwargs correctly."""
|
||||
n_curves = 1
|
||||
name = None
|
||||
legend_metric = {"mean": 0.8, "std": 0.1}
|
||||
legend_metric_name = "AUC"
|
||||
|
||||
if curve_kwargs and kwargs:
|
||||
with pytest.raises(ValueError, match="Cannot provide both `curve_kwargs`"):
|
||||
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
|
||||
n_curves,
|
||||
name,
|
||||
legend_metric,
|
||||
legend_metric_name,
|
||||
curve_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
elif kwargs:
|
||||
with pytest.warns(FutureWarning, match=r"`\*\*kwargs` is deprecated and"):
|
||||
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
|
||||
n_curves,
|
||||
name,
|
||||
legend_metric,
|
||||
legend_metric_name,
|
||||
curve_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
# No warning or error should be raised
|
||||
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
|
||||
n_curves, name, legend_metric, legend_metric_name, curve_kwargs, **kwargs
|
||||
)
|
||||
|
||||
|
||||
def test_validate_curve_kwargs_error():
|
||||
"""Check `_validate_curve_kwargs` performs parameter validation correctly."""
|
||||
n_curves = 3
|
||||
legend_metric = {"mean": 0.8, "std": 0.1}
|
||||
legend_metric_name = "AUC"
|
||||
with pytest.raises(ValueError, match="`curve_kwargs` must be None"):
|
||||
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
|
||||
n_curves=n_curves,
|
||||
name=None,
|
||||
legend_metric=legend_metric,
|
||||
legend_metric_name=legend_metric_name,
|
||||
curve_kwargs=[{"alpha": 1.0}],
|
||||
)
|
||||
with pytest.raises(ValueError, match="To avoid labeling individual curves"):
|
||||
name = ["one", "two", "three"]
|
||||
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
|
||||
n_curves=n_curves,
|
||||
name=name,
|
||||
legend_metric=legend_metric,
|
||||
legend_metric_name=legend_metric_name,
|
||||
curve_kwargs=None,
|
||||
)
|
||||
_BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
|
||||
n_curves=n_curves,
|
||||
name=name,
|
||||
legend_metric=legend_metric,
|
||||
legend_metric_name=legend_metric_name,
|
||||
curve_kwargs={"alpha": 1.0},
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("name", [None, "curve_name", ["curve_name"]])
|
||||
@pytest.mark.parametrize(
|
||||
"legend_metric",
|
||||
[{"mean": 0.8, "std": 0.2}, {"mean": None, "std": None}],
|
||||
)
|
||||
@pytest.mark.parametrize("legend_metric_name", ["AUC", "AP"])
|
||||
@pytest.mark.parametrize("curve_kwargs", [None, {"color": "red"}])
|
||||
def test_validate_curve_kwargs_single_legend(
|
||||
name, legend_metric, legend_metric_name, curve_kwargs
|
||||
):
|
||||
"""Check `_validate_curve_kwargs` returns correct kwargs for single legend entry."""
|
||||
n_curves = 3
|
||||
curve_kwargs_out = _BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
|
||||
n_curves=n_curves,
|
||||
name=name,
|
||||
legend_metric=legend_metric,
|
||||
legend_metric_name=legend_metric_name,
|
||||
curve_kwargs=curve_kwargs,
|
||||
)
|
||||
|
||||
assert isinstance(curve_kwargs_out, list)
|
||||
assert len(curve_kwargs_out) == n_curves
|
||||
|
||||
expected_label = None
|
||||
if isinstance(name, list):
|
||||
name = name[0]
|
||||
if name is not None:
|
||||
expected_label = name
|
||||
if legend_metric["mean"] is not None:
|
||||
expected_label = expected_label + f" ({legend_metric_name} = 0.80 +/- 0.20)"
|
||||
# `name` is None
|
||||
elif legend_metric["mean"] is not None:
|
||||
expected_label = f"{legend_metric_name} = 0.80 +/- 0.20"
|
||||
|
||||
assert curve_kwargs_out[0]["label"] == expected_label
|
||||
# All remaining curves should have None as "label"
|
||||
assert curve_kwargs_out[1]["label"] is None
|
||||
assert curve_kwargs_out[2]["label"] is None
|
||||
|
||||
if curve_kwargs is None:
|
||||
assert all("color" not in kwargs for kwargs in curve_kwargs_out)
|
||||
else:
|
||||
assert all(kwargs["color"] == "red" for kwargs in curve_kwargs_out)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("name", [None, "curve_name", ["one", "two", "three"]])
|
||||
@pytest.mark.parametrize(
|
||||
"legend_metric", [{"metric": [1.0, 1.0, 1.0]}, {"metric": [None, None, None]}]
|
||||
)
|
||||
@pytest.mark.parametrize("legend_metric_name", ["AUC", "AP"])
|
||||
def test_validate_curve_kwargs_multi_legend(name, legend_metric, legend_metric_name):
|
||||
"""Check `_validate_curve_kwargs` returns correct kwargs for multi legend entry."""
|
||||
n_curves = 3
|
||||
curve_kwargs = [{"color": "red"}, {"color": "yellow"}, {"color": "blue"}]
|
||||
curve_kwargs_out = _BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
|
||||
n_curves=n_curves,
|
||||
name=name,
|
||||
legend_metric=legend_metric,
|
||||
legend_metric_name=legend_metric_name,
|
||||
curve_kwargs=curve_kwargs,
|
||||
)
|
||||
|
||||
assert isinstance(curve_kwargs_out, list)
|
||||
assert len(curve_kwargs_out) == n_curves
|
||||
|
||||
expected_labels = [None, None, None]
|
||||
if isinstance(name, str):
|
||||
expected_labels = "curve_name"
|
||||
if legend_metric["metric"][0] is not None:
|
||||
expected_labels = expected_labels + f" ({legend_metric_name} = 1.00)"
|
||||
expected_labels = [expected_labels] * n_curves
|
||||
elif isinstance(name, list) and legend_metric["metric"][0] is None:
|
||||
expected_labels = name
|
||||
elif isinstance(name, list) and legend_metric["metric"][0] is not None:
|
||||
expected_labels = [
|
||||
f"{name_single} ({legend_metric_name} = 1.00)" for name_single in name
|
||||
]
|
||||
# `name` is None
|
||||
elif legend_metric["metric"][0] is not None:
|
||||
expected_labels = [f"{legend_metric_name} = 1.00"] * n_curves
|
||||
|
||||
for idx, expected_label in enumerate(expected_labels):
|
||||
assert curve_kwargs_out[idx]["label"] == expected_label
|
||||
|
||||
for curve_kwarg, curve_kwarg_out in zip(curve_kwargs, curve_kwargs_out):
|
||||
assert curve_kwarg_out["color"] == curve_kwarg["color"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("curve_kwargs", [None, {"color": "red"}])
|
||||
@pytest.mark.parametrize("n_curves", [1, 3])
|
||||
def test_validate_curve_kwargs_default_kwargs(n_curves, curve_kwargs):
|
||||
"""Check default kwargs are incorporated correctly."""
|
||||
curve_kwargs_out = _BinaryClassifierCurveDisplayMixin._validate_curve_kwargs(
|
||||
n_curves=n_curves,
|
||||
name="test",
|
||||
legend_metric={"mean": 0.8, "std": 0.2},
|
||||
legend_metric_name="metric",
|
||||
curve_kwargs=curve_kwargs,
|
||||
default_curve_kwargs={"color": "blue"},
|
||||
default_multi_curve_kwargs={"alpha": 0.7, "linestyle": "--", "color": "green"},
|
||||
)
|
||||
if n_curves > 1:
|
||||
# `default_multi_curve_kwargs` are incorporated
|
||||
assert all(kwarg["alpha"] == 0.7 for kwarg in curve_kwargs_out)
|
||||
assert all(kwarg["linestyle"] == "--" for kwarg in curve_kwargs_out)
|
||||
if curve_kwargs is None:
|
||||
# `default_multi_curve_kwargs` overrides `default_curve_kwargs`
|
||||
assert all(kwarg["color"] == "green" for kwarg in curve_kwargs_out)
|
||||
else:
|
||||
# `curve_kwargs` overrides any defaults
|
||||
assert all(kwarg["color"] == "red" for kwarg in curve_kwargs_out)
|
||||
# Single curve
|
||||
elif curve_kwargs is None:
|
||||
# Use `default_curve_kwargs`
|
||||
assert all(kwarg["color"] == "blue" for kwarg in curve_kwargs_out)
|
||||
else:
|
||||
# Use `curve_kwargs`
|
||||
assert all(kwarg["color"] == "red" for kwarg in curve_kwargs_out)
|
||||
|
||||
|
||||
def metric():
|
||||
pass # pragma: no cover
|
||||
|
||||
|
||||
def neg_metric():
|
||||
pass # pragma: no cover
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"score_name, scoring, negate_score, expected_score_name",
|
||||
[
|
||||
("accuracy", None, False, "accuracy"), # do not transform the name
|
||||
(None, "accuracy", False, "Accuracy"), # capitalize the name
|
||||
(None, "accuracy", True, "Negative accuracy"), # add "Negative"
|
||||
(None, "neg_mean_absolute_error", False, "Negative mean absolute error"),
|
||||
(None, "neg_mean_absolute_error", True, "Mean absolute error"), # remove "neg_"
|
||||
("MAE", "neg_mean_absolute_error", True, "MAE"), # keep score_name
|
||||
(None, None, False, "Score"), # default name
|
||||
(None, None, True, "Negative score"), # default name but negated
|
||||
("Some metric", metric, False, "Some metric"), # do not transform the name
|
||||
("Some metric", metric, True, "Some metric"), # do not transform the name
|
||||
(None, metric, False, "Metric"), # default name
|
||||
(None, metric, True, "Negative metric"), # default name but negated
|
||||
("Some metric", neg_metric, False, "Some metric"), # do not transform the name
|
||||
("Some metric", neg_metric, True, "Some metric"), # do not transform the name
|
||||
(None, neg_metric, False, "Negative metric"), # default name
|
||||
(None, neg_metric, True, "Metric"), # default name but negated
|
||||
],
|
||||
)
|
||||
def test_validate_score_name(score_name, scoring, negate_score, expected_score_name):
|
||||
"""Check that we return the right score name."""
|
||||
assert (
|
||||
_validate_score_name(score_name, scoring, negate_score) == expected_score_name
|
||||
)
|
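# Expository sketch (added for illustration, not part of the original commit): a
# plausible re-implementation of the naming rules encoded in the parametrization
# above; the real `_validate_score_name` may differ in its details.
def _example_score_name(score_name, scoring, negate_score):
    if score_name is not None:
        return score_name  # an explicit name is never transformed
    name = getattr(scoring, "__name__", scoring) or "score"
    negate = negate_score ^ name.startswith("neg_")
    name = name.removeprefix("neg_").replace("_", " ")
    return ("negative " + name if negate else name).capitalize()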
||||
|
||||
|
||||
# In the following test, we check the value of the max to min ratio
|
||||
# for parameter value intervals to check that using a decision threshold
|
||||
# of 5. is a good heuristic to decide between linear and log scales on
|
||||
# common ranges of parameter values.
|
||||
@pytest.mark.parametrize(
|
||||
"data, lower_bound, upper_bound",
|
||||
[
|
||||
# Such a range could be clearly displayed with either log scale or linear
|
||||
# scale.
|
||||
(np.geomspace(0.1, 1, 5), 5, 6),
|
||||
# Checking that the ratio is still positive on a negative log scale.
|
||||
(-np.geomspace(0.1, 1, 10), 7, 8),
|
||||
# Evenly spaced parameter values lead to a ratio of 1.
|
||||
(np.linspace(0, 1, 5), 0.9, 1.1),
|
||||
# This is not exactly spaced on a log scale but we will benefit from treating
|
||||
# it as such for visualization.
|
||||
([1, 2, 5, 10, 20, 50], 20, 40),
|
||||
],
|
||||
)
|
||||
def test_interval_max_min_ratio(data, lower_bound, upper_bound):
|
||||
assert lower_bound < _interval_max_min_ratio(data) < upper_bound
|
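# Expository sketch (added for illustration, not part of the original commit): a
# plausible definition of the quantity bounded above is the ratio between the
# largest and smallest gap of the sorted parameter values; the actual
# implementation of `_interval_max_min_ratio` may differ.
def _example_interval_max_min_ratio(data):
    gaps = np.diff(np.sort(np.asarray(data, dtype=float)))
    return gaps.max() / gaps.min()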
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"default_kwargs, user_kwargs, expected",
|
||||
[
|
||||
(
|
||||
{"color": "blue", "linewidth": 2},
|
||||
{"linestyle": "dashed"},
|
||||
{"color": "blue", "linewidth": 2, "linestyle": "dashed"},
|
||||
),
|
||||
(
|
||||
{"color": "blue", "linestyle": "solid"},
|
||||
{"c": "red", "ls": "dashed"},
|
||||
{"color": "red", "linestyle": "dashed"},
|
||||
),
|
||||
(
|
||||
{"label": "xxx", "color": "k", "linestyle": "--"},
|
||||
{"ls": "-."},
|
||||
{"label": "xxx", "color": "k", "linestyle": "-."},
|
||||
),
|
||||
({}, {}, {}),
|
||||
(
|
||||
{},
|
||||
{
|
||||
"ls": "dashed",
|
||||
"c": "red",
|
||||
"ec": "black",
|
||||
"fc": "yellow",
|
||||
"lw": 2,
|
||||
"mec": "green",
|
||||
"mfcalt": "blue",
|
||||
"ms": 5,
|
||||
},
|
||||
{
|
||||
"linestyle": "dashed",
|
||||
"color": "red",
|
||||
"edgecolor": "black",
|
||||
"facecolor": "yellow",
|
||||
"linewidth": 2,
|
||||
"markeredgecolor": "green",
|
||||
"markerfacecoloralt": "blue",
|
||||
"markersize": 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_validate_style_kwargs(default_kwargs, user_kwargs, expected):
|
||||
"""Check the behaviour of `validate_style_kwargs` with various type of entries."""
|
||||
result = _validate_style_kwargs(default_kwargs, user_kwargs)
|
||||
assert result == expected, (
|
||||
"The validation of style keywords does not provide the expected results: "
|
||||
f"Got {result} instead of {expected}."
|
||||
)
|
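# Expository sketch (added for illustration, not part of the original commit): the
# merging behaviour checked above amounts to expanding matplotlib-style aliases in
# the user kwargs and letting them override the defaults. The alias table below is
# a partial, assumed subset; conflict detection (tested below) is omitted.
def _example_merge_style_kwargs(default_kwargs, user_kwargs):
    aliases = {"c": "color", "ls": "linestyle", "lw": "linewidth"}
    expanded = {aliases.get(key, key): value for key, value in user_kwargs.items()}
    return {**default_kwargs, **expanded}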
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"default_kwargs, user_kwargs",
|
||||
[({}, {"ls": 2, "linestyle": 3}), ({}, {"c": "r", "color": "blue"})],
|
||||
)
|
||||
def test_validate_style_kwargs_error(default_kwargs, user_kwargs):
|
||||
"""Check that `validate_style_kwargs` raises TypeError"""
|
||||
with pytest.raises(TypeError):
|
||||
_validate_style_kwargs(default_kwargs, user_kwargs)
|
||||
|
||||
|
||||
def test_despine(pyplot):
|
||||
ax = pyplot.gca()
|
||||
_despine(ax)
|
||||
assert ax.spines["top"].get_visible() is False
|
||||
assert ax.spines["right"].get_visible() is False
|
||||
assert ax.spines["bottom"].get_bounds() == (0, 1)
|
||||
assert ax.spines["left"].get_bounds() == (0, 1)
|
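# Expository sketch (added for illustration, not part of the original commit): the
# effect asserted above roughly corresponds to the following matplotlib calls.
def _example_despine(ax):
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_bounds(0, 1)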
||||
|
||||
|
||||
@pytest.mark.parametrize("estimator_name", ["my_est_name", "deprecated"])
|
||||
@pytest.mark.parametrize("name", [None, "my_name"])
|
||||
def test_deprecate_estimator_name(estimator_name, name):
|
||||
"""Check `_deprecate_estimator_name` behaves correctly"""
|
||||
version = "1.7"
|
||||
version_remove = "1.9"
|
||||
|
||||
if estimator_name == "deprecated":
|
||||
name_out = _deprecate_estimator_name(estimator_name, name, version)
|
||||
assert name_out == name
|
||||
# `estimator_name` is provided and `name` is:
|
||||
elif name is None:
|
||||
warning_message = (
|
||||
f"`estimator_name` is deprecated in {version} and will be removed in "
|
||||
f"{version_remove}. Use `name` instead."
|
||||
)
|
||||
with pytest.warns(FutureWarning, match=warning_message):
|
||||
result = _deprecate_estimator_name(estimator_name, name, version)
|
||||
assert result == estimator_name
|
||||
elif name is not None:
|
||||
error_message = (
|
||||
f"Cannot provide both `estimator_name` and `name`. `estimator_name` "
|
||||
f"is deprecated in {version} and will be removed in {version_remove}. "
|
||||
)
|
||||
with pytest.raises(ValueError, match=error_message):
|
||||
_deprecate_estimator_name(estimator_name, name, version)
|
||||
@@ -0,0 +1,693 @@
|
||||
import re
|
||||
from pprint import PrettyPrinter
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import config_context
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
from sklearn.feature_selection import SelectKBest, chi2
|
||||
from sklearn.linear_model import LogisticRegressionCV
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.utils._pprint import _EstimatorPrettyPrinter
|
||||
|
||||
|
||||
# Constructors excerpted to test pprinting
|
||||
class LogisticRegression(BaseEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
C=1.0,
|
||||
l1_ratio=0,
|
||||
dual=False,
|
||||
tol=1e-4,
|
||||
fit_intercept=True,
|
||||
intercept_scaling=1,
|
||||
class_weight=None,
|
||||
random_state=None,
|
||||
solver="warn",
|
||||
max_iter=100,
|
||||
multi_class="warn",
|
||||
verbose=0,
|
||||
warm_start=False,
|
||||
n_jobs=None,
|
||||
):
|
||||
self.C = C
|
||||
self.l1_ratio = l1_ratio
|
||||
self.dual = dual
|
||||
self.tol = tol
|
||||
self.fit_intercept = fit_intercept
|
||||
self.intercept_scaling = intercept_scaling
|
||||
self.class_weight = class_weight
|
||||
self.random_state = random_state
|
||||
self.solver = solver
|
||||
self.max_iter = max_iter
|
||||
self.multi_class = multi_class
|
||||
self.verbose = verbose
|
||||
self.warm_start = warm_start
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, X, y):
|
||||
return self
|
||||
|
||||
|
||||
class StandardScaler(TransformerMixin, BaseEstimator):
|
||||
def __init__(self, copy=True, with_mean=True, with_std=True):
|
||||
self.with_mean = with_mean
|
||||
self.with_std = with_std
|
||||
self.copy = copy
|
||||
|
||||
def transform(self, X, copy=None):
|
||||
return self
|
||||
|
||||
|
||||
class RFE(BaseEstimator):
|
||||
def __init__(self, estimator, n_features_to_select=None, step=1, verbose=0):
|
||||
self.estimator = estimator
|
||||
self.n_features_to_select = n_features_to_select
|
||||
self.step = step
|
||||
self.verbose = verbose
|
||||
|
||||
|
||||
class GridSearchCV(BaseEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
estimator,
|
||||
param_grid,
|
||||
scoring=None,
|
||||
n_jobs=None,
|
||||
iid="warn",
|
||||
refit=True,
|
||||
cv="warn",
|
||||
verbose=0,
|
||||
pre_dispatch="2*n_jobs",
|
||||
error_score="raise-deprecating",
|
||||
return_train_score=False,
|
||||
):
|
||||
self.estimator = estimator
|
||||
self.param_grid = param_grid
|
||||
self.scoring = scoring
|
||||
self.n_jobs = n_jobs
|
||||
self.iid = iid
|
||||
self.refit = refit
|
||||
self.cv = cv
|
||||
self.verbose = verbose
|
||||
self.pre_dispatch = pre_dispatch
|
||||
self.error_score = error_score
|
||||
self.return_train_score = return_train_score
|
||||
|
||||
|
||||
class CountVectorizer(BaseEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
input="content",
|
||||
encoding="utf-8",
|
||||
decode_error="strict",
|
||||
strip_accents=None,
|
||||
lowercase=True,
|
||||
preprocessor=None,
|
||||
tokenizer=None,
|
||||
stop_words=None,
|
||||
token_pattern=r"(?u)\b\w\w+\b",
|
||||
ngram_range=(1, 1),
|
||||
analyzer="word",
|
||||
max_df=1.0,
|
||||
min_df=1,
|
||||
max_features=None,
|
||||
vocabulary=None,
|
||||
binary=False,
|
||||
dtype=np.int64,
|
||||
):
|
||||
self.input = input
|
||||
self.encoding = encoding
|
||||
self.decode_error = decode_error
|
||||
self.strip_accents = strip_accents
|
||||
self.preprocessor = preprocessor
|
||||
self.tokenizer = tokenizer
|
||||
self.analyzer = analyzer
|
||||
self.lowercase = lowercase
|
||||
self.token_pattern = token_pattern
|
||||
self.stop_words = stop_words
|
||||
self.max_df = max_df
|
||||
self.min_df = min_df
|
||||
self.max_features = max_features
|
||||
self.ngram_range = ngram_range
|
||||
self.vocabulary = vocabulary
|
||||
self.binary = binary
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
class Pipeline(BaseEstimator):
|
||||
def __init__(self, steps, memory=None):
|
||||
self.steps = steps
|
||||
self.memory = memory
|
||||
|
||||
|
||||
class SVC(BaseEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
C=1.0,
|
||||
kernel="rbf",
|
||||
degree=3,
|
||||
gamma="auto_deprecated",
|
||||
coef0=0.0,
|
||||
shrinking=True,
|
||||
probability=False,
|
||||
tol=1e-3,
|
||||
cache_size=200,
|
||||
class_weight=None,
|
||||
verbose=False,
|
||||
max_iter=-1,
|
||||
decision_function_shape="ovr",
|
||||
random_state=None,
|
||||
):
|
||||
self.kernel = kernel
|
||||
self.degree = degree
|
||||
self.gamma = gamma
|
||||
self.coef0 = coef0
|
||||
self.tol = tol
|
||||
self.C = C
|
||||
self.shrinking = shrinking
|
||||
self.probability = probability
|
||||
self.cache_size = cache_size
|
||||
self.class_weight = class_weight
|
||||
self.verbose = verbose
|
||||
self.max_iter = max_iter
|
||||
self.decision_function_shape = decision_function_shape
|
||||
self.random_state = random_state
|
||||
|
||||
|
||||
class PCA(BaseEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
n_components=None,
|
||||
copy=True,
|
||||
whiten=False,
|
||||
svd_solver="auto",
|
||||
tol=0.0,
|
||||
iterated_power="auto",
|
||||
random_state=None,
|
||||
):
|
||||
self.n_components = n_components
|
||||
self.copy = copy
|
||||
self.whiten = whiten
|
||||
self.svd_solver = svd_solver
|
||||
self.tol = tol
|
||||
self.iterated_power = iterated_power
|
||||
self.random_state = random_state
|
||||
|
||||
|
||||
class NMF(BaseEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
n_components=None,
|
||||
init=None,
|
||||
solver="cd",
|
||||
beta_loss="frobenius",
|
||||
tol=1e-4,
|
||||
max_iter=200,
|
||||
random_state=None,
|
||||
alpha=0.0,
|
||||
l1_ratio=0.0,
|
||||
verbose=0,
|
||||
shuffle=False,
|
||||
):
|
||||
self.n_components = n_components
|
||||
self.init = init
|
||||
self.solver = solver
|
||||
self.beta_loss = beta_loss
|
||||
self.tol = tol
|
||||
self.max_iter = max_iter
|
||||
self.random_state = random_state
|
||||
self.alpha = alpha
|
||||
self.l1_ratio = l1_ratio
|
||||
self.verbose = verbose
|
||||
self.shuffle = shuffle
|
||||
|
||||
|
||||
class SimpleImputer(BaseEstimator):
|
||||
def __init__(
|
||||
self,
|
||||
missing_values=np.nan,
|
||||
strategy="mean",
|
||||
fill_value=None,
|
||||
verbose=0,
|
||||
copy=True,
|
||||
):
|
||||
self.missing_values = missing_values
|
||||
self.strategy = strategy
|
||||
self.fill_value = fill_value
|
||||
self.verbose = verbose
|
||||
self.copy = copy
|
||||
|
||||
|
||||
@config_context(print_changed_only=False)
|
||||
def test_basic():
|
||||
# Basic pprint test
|
||||
lr = LogisticRegression()
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
intercept_scaling=1, l1_ratio=0, max_iter=100,
|
||||
multi_class='warn', n_jobs=None, random_state=None,
|
||||
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert lr.__repr__() == expected
|
||||
|
||||
|
||||
def test_changed_only():
|
||||
# Make sure the changed_only param is correctly used when True (default)
|
||||
lr = LogisticRegression(C=99)
|
||||
expected = """LogisticRegression(C=99)"""
|
||||
assert lr.__repr__() == expected
|
||||
|
||||
# Check with a repr that doesn't fit on a single line
|
||||
lr = LogisticRegression(
|
||||
C=99, class_weight=0.4, fit_intercept=False, tol=1234, verbose=True
|
||||
)
|
||||
expected = """
|
||||
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
|
||||
verbose=True)"""
|
||||
expected = expected[1:] # remove first \n
|
||||
assert lr.__repr__() == expected
|
||||
|
||||
imputer = SimpleImputer(missing_values=0)
|
||||
expected = """SimpleImputer(missing_values=0)"""
|
||||
assert imputer.__repr__() == expected
|
||||
|
||||
# Defaults to np.nan, trying with float('NaN')
|
||||
imputer = SimpleImputer(missing_values=float("NaN"))
|
||||
expected = """SimpleImputer()"""
|
||||
assert imputer.__repr__() == expected
|
||||
|
||||
# make sure array parameters don't throw an error (see #13583)
|
||||
repr(LogisticRegressionCV(Cs=np.array([0.1, 1]), use_legacy_attributes=False))
|
||||
|
||||
|
||||
@config_context(print_changed_only=False)
|
||||
def test_pipeline():
|
||||
# Render a pipeline object
|
||||
pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
|
||||
expected = """
|
||||
Pipeline(memory=None,
|
||||
steps=[('standardscaler',
|
||||
StandardScaler(copy=True, with_mean=True, with_std=True)),
|
||||
('logisticregression',
|
||||
LogisticRegression(C=999, class_weight=None, dual=False,
|
||||
fit_intercept=True, intercept_scaling=1,
|
||||
l1_ratio=0, max_iter=100,
|
||||
multi_class='warn', n_jobs=None,
|
||||
random_state=None, solver='warn',
|
||||
tol=0.0001, verbose=0, warm_start=False))],
|
||||
transform_input=None, verbose=False)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pipeline.__repr__() == expected
|
||||
|
||||
|
||||
@config_context(print_changed_only=False)
|
||||
def test_deeply_nested():
|
||||
# Render a deeply nested estimator
|
||||
rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
|
||||
expected = """
|
||||
RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0,
|
||||
class_weight=None,
|
||||
dual=False,
|
||||
fit_intercept=True,
|
||||
intercept_scaling=1,
|
||||
l1_ratio=0,
|
||||
max_iter=100,
|
||||
multi_class='warn',
|
||||
n_jobs=None,
|
||||
random_state=None,
|
||||
solver='warn',
|
||||
tol=0.0001,
|
||||
verbose=0,
|
||||
warm_start=False),
|
||||
n_features_to_select=None,
|
||||
step=1,
|
||||
verbose=0),
|
||||
n_features_to_select=None,
|
||||
step=1,
|
||||
verbose=0),
|
||||
n_features_to_select=None,
|
||||
step=1, verbose=0),
|
||||
n_features_to_select=None, step=1,
|
||||
verbose=0),
|
||||
n_features_to_select=None, step=1, verbose=0),
|
||||
n_features_to_select=None, step=1, verbose=0),
|
||||
n_features_to_select=None, step=1, verbose=0)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert rfe.__repr__() == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("print_changed_only", "expected"),
|
||||
[
|
||||
(True, "RFE(estimator=RFE(...))"),
|
||||
(
|
||||
False,
|
||||
"RFE(estimator=RFE(...), n_features_to_select=None, step=1, verbose=0)",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_print_estimator_max_depth(print_changed_only, expected):
|
||||
with config_context(print_changed_only=print_changed_only):
|
||||
pp = _EstimatorPrettyPrinter(depth=1)
|
||||
|
||||
rfe = RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))
|
||||
assert pp.pformat(rfe) == expected
|
||||
|
||||
|
||||
@config_context(print_changed_only=False)
|
||||
def test_gridsearch():
|
||||
# render a gridsearch
|
||||
param_grid = [
|
||||
{"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]},
|
||||
{"kernel": ["linear"], "C": [1, 10, 100, 1000]},
|
||||
]
|
||||
gs = GridSearchCV(SVC(), param_grid, cv=5)
|
||||
|
||||
expected = """
|
||||
GridSearchCV(cv=5, error_score='raise-deprecating',
|
||||
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
|
||||
decision_function_shape='ovr', degree=3,
|
||||
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
|
||||
probability=False, random_state=None, shrinking=True,
|
||||
tol=0.001, verbose=False),
|
||||
iid='warn', n_jobs=None,
|
||||
param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
|
||||
'kernel': ['rbf']},
|
||||
{'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
|
||||
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
|
||||
scoring=None, verbose=0)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert gs.__repr__() == expected
|
||||
|
||||
|
||||
@config_context(print_changed_only=False)
|
||||
def test_gridsearch_pipeline():
|
||||
# render a pipeline inside a gridsearch
|
||||
pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)
|
||||
|
||||
pipeline = Pipeline([("reduce_dim", PCA()), ("classify", SVC())])
|
||||
N_FEATURES_OPTIONS = [2, 4, 8]
|
||||
C_OPTIONS = [1, 10, 100, 1000]
|
||||
param_grid = [
|
||||
{
|
||||
"reduce_dim": [PCA(iterated_power=7), NMF()],
|
||||
"reduce_dim__n_components": N_FEATURES_OPTIONS,
|
||||
"classify__C": C_OPTIONS,
|
||||
},
|
||||
{
|
||||
"reduce_dim": [SelectKBest(chi2)],
|
||||
"reduce_dim__k": N_FEATURES_OPTIONS,
|
||||
"classify__C": C_OPTIONS,
|
||||
},
|
||||
]
|
||||
gspipeline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid)
|
||||
expected = """
|
||||
GridSearchCV(cv=3, error_score='raise-deprecating',
|
||||
estimator=Pipeline(memory=None,
|
||||
steps=[('reduce_dim',
|
||||
PCA(copy=True, iterated_power='auto',
|
||||
n_components=None,
|
||||
random_state=None,
|
||||
svd_solver='auto', tol=0.0,
|
||||
whiten=False)),
|
||||
('classify',
|
||||
SVC(C=1.0, cache_size=200,
|
||||
class_weight=None, coef0=0.0,
|
||||
decision_function_shape='ovr',
|
||||
degree=3, gamma='auto_deprecated',
|
||||
kernel='rbf', max_iter=-1,
|
||||
probability=False,
|
||||
random_state=None, shrinking=True,
|
||||
tol=0.001, verbose=False))]),
|
||||
iid='warn', n_jobs=1,
|
||||
param_grid=[{'classify__C': [1, 10, 100, 1000],
|
||||
'reduce_dim': [PCA(copy=True, iterated_power=7,
|
||||
n_components=None,
|
||||
random_state=None,
|
||||
svd_solver='auto', tol=0.0,
|
||||
whiten=False),
|
||||
NMF(alpha=0.0, beta_loss='frobenius',
|
||||
init=None, l1_ratio=0.0,
|
||||
max_iter=200, n_components=None,
|
||||
random_state=None, shuffle=False,
|
||||
solver='cd', tol=0.0001,
|
||||
verbose=0)],
|
||||
'reduce_dim__n_components': [2, 4, 8]},
|
||||
{'classify__C': [1, 10, 100, 1000],
|
||||
'reduce_dim': [SelectKBest(k=10,
|
||||
score_func=<function chi2 at some_address>)],
|
||||
'reduce_dim__k': [2, 4, 8]}],
|
||||
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
|
||||
scoring=None, verbose=0)""" # noqa: E501
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
repr_ = pp.pformat(gspipeline)
|
||||
# Remove address of '<function chi2 at 0x.....>' for reproducibility
|
||||
repr_ = re.sub("function chi2 at 0x.*>", "function chi2 at some_address>", repr_)
|
||||
assert repr_ == expected
|
||||
|
||||
|
||||
@config_context(print_changed_only=False)
|
||||
def test_n_max_elements_to_show():
|
||||
n_max_elements_to_show = 30
|
||||
pp = _EstimatorPrettyPrinter(
|
||||
compact=True,
|
||||
indent=1,
|
||||
indent_at_name=True,
|
||||
n_max_elements_to_show=n_max_elements_to_show,
|
||||
)
|
||||
|
||||
# No ellipsis
|
||||
vocabulary = {i: i for i in range(n_max_elements_to_show)}
|
||||
vectorizer = CountVectorizer(vocabulary=vocabulary)
|
||||
|
||||
expected = r"""
|
||||
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
|
||||
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
|
||||
lowercase=True, max_df=1.0, max_features=None, min_df=1,
|
||||
ngram_range=(1, 1), preprocessor=None, stop_words=None,
|
||||
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
|
||||
tokenizer=None,
|
||||
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
|
||||
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
|
||||
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
|
||||
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
|
||||
27: 27, 28: 28, 29: 29})"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pp.pformat(vectorizer) == expected
|
||||
|
||||
# Now with ellipsis
|
||||
vocabulary = {i: i for i in range(n_max_elements_to_show + 1)}
|
||||
vectorizer = CountVectorizer(vocabulary=vocabulary)
|
||||
|
||||
expected = r"""
|
||||
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
|
||||
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
|
||||
lowercase=True, max_df=1.0, max_features=None, min_df=1,
|
||||
ngram_range=(1, 1), preprocessor=None, stop_words=None,
|
||||
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
|
||||
tokenizer=None,
|
||||
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
|
||||
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
|
||||
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
|
||||
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
|
||||
27: 27, 28: 28, 29: 29, ...})"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pp.pformat(vectorizer) == expected
|
||||
|
||||
# Also test with lists
|
||||
param_grid = {"C": list(range(n_max_elements_to_show))}
|
||||
gs = GridSearchCV(SVC(), param_grid)
|
||||
expected = """
|
||||
GridSearchCV(cv='warn', error_score='raise-deprecating',
|
||||
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
|
||||
decision_function_shape='ovr', degree=3,
|
||||
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
|
||||
probability=False, random_state=None, shrinking=True,
|
||||
tol=0.001, verbose=False),
|
||||
iid='warn', n_jobs=None,
|
||||
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
||||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
|
||||
27, 28, 29]},
|
||||
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
|
||||
scoring=None, verbose=0)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pp.pformat(gs) == expected
|
||||
|
||||
# Now with ellipsis
|
||||
param_grid = {"C": list(range(n_max_elements_to_show + 1))}
|
||||
gs = GridSearchCV(SVC(), param_grid)
|
||||
expected = """
|
||||
GridSearchCV(cv='warn', error_score='raise-deprecating',
|
||||
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
|
||||
decision_function_shape='ovr', degree=3,
|
||||
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
|
||||
probability=False, random_state=None, shrinking=True,
|
||||
tol=0.001, verbose=False),
|
||||
iid='warn', n_jobs=None,
|
||||
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
||||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
|
||||
27, 28, 29, ...]},
|
||||
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
|
||||
scoring=None, verbose=0)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pp.pformat(gs) == expected
|
||||
|
||||
|
||||
@config_context(print_changed_only=False)
|
||||
def test_bruteforce_ellipsis():
|
||||
# Check that the bruteforce ellipsis (used when the number of non-blank
|
||||
# characters exceeds N_CHAR_MAX) renders correctly.
|
||||
|
||||
lr = LogisticRegression()
|
||||
|
||||
# test when the left and right side of the ellipsis aren't on the same
|
||||
# line.
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
in...
|
||||
multi_class='warn', n_jobs=None, random_state=None,
|
||||
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert lr.__repr__(N_CHAR_MAX=150) == expected
|
||||
|
||||
# test with very small N_CHAR_MAX
|
||||
    # Note that N_CHAR_MAX is not strictly enforced; this is expected: to avoid
    # weird reprs we still keep the whole line of the right part (after the
    # ellipsis).
|
||||
expected = """
|
||||
Lo...
|
||||
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert lr.__repr__(N_CHAR_MAX=4) == expected
|
||||
|
||||
    # test with N_CHAR_MAX == number of non-blank characters: In this case we
    # don't want any ellipsis
|
||||
full_repr = lr.__repr__(N_CHAR_MAX=float("inf"))
|
||||
n_nonblank = len("".join(full_repr.split()))
|
||||
assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
|
||||
assert "..." not in full_repr
    # test with N_CHAR_MAX == number of non-blank characters - 10: the left and
    # right side of the ellipsis are on different lines. In this case we
    # want to expand the whole line of the right side
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
intercept_scaling=1, l1_ratio=0,...00,
|
||||
multi_class='warn', n_jobs=None, random_state=None,
|
||||
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
|
||||
expected = expected[1:] # remove first \n
|
||||
assert lr.__repr__(N_CHAR_MAX=n_nonblank - 10) == expected
    # test with N_CHAR_MAX == number of non-blank characters - 4: the left and
    # right side of the ellipsis are on the same line. In this case we don't
    # want to expand the whole line of the right side, just add the ellipsis
    # between the 2 sides.
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
intercept_scaling=1, l1_ratio=0, max...r=100,
|
||||
multi_class='warn', n_jobs=None, random_state=None,
|
||||
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
|
||||
expected = expected[1:] # remove first \n
|
||||
assert lr.__repr__(N_CHAR_MAX=n_nonblank - 4) == expected
    # test with N_CHAR_MAX == number of non-blank characters - 2: the left and
    # right side of the ellipsis are on the same line, but adding the ellipsis
    # would actually make the repr longer. So we don't add the ellipsis.
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
intercept_scaling=1, l1_ratio=0, max_iter=100,
|
||||
multi_class='warn', n_jobs=None, random_state=None,
|
||||
solver='warn', tol=0.0001, verbose=0, warm_start=False)"""
|
||||
expected = expected[1:] # remove first \n
|
||||
assert lr.__repr__(N_CHAR_MAX=n_nonblank - 2) == expected
|
||||
|
||||
|
||||
def test_builtin_prettyprinter():
|
||||
    # Non-regression test that ensures we can still use the builtin
    # PrettyPrinter class for estimators (as done e.g. by joblib).
    # This used to be a bug.
|
||||
|
||||
PrettyPrinter().pprint(LogisticRegression())
|
||||
|
||||
|
||||
def test_kwargs_in_init():
|
||||
# Make sure the changed_only=True mode is OK when an argument is passed as
|
||||
# kwargs.
|
||||
# Non-regression test for
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/17206
|
||||
|
||||
class WithKWargs(BaseEstimator):
|
||||
# Estimator with a kwargs argument. These need to hack around
|
||||
# set_params and get_params. Here we mimic what LightGBM does.
|
||||
def __init__(self, a="willchange", b="unchanged", **kwargs):
|
||||
self.a = a
|
||||
self.b = b
|
||||
self._other_params = {}
|
||||
self.set_params(**kwargs)
|
||||
|
||||
def get_params(self, deep=True):
|
||||
params = super().get_params(deep=deep)
|
||||
params.update(self._other_params)
|
||||
return params
|
||||
|
||||
def set_params(self, **params):
|
||||
for key, value in params.items():
|
||||
setattr(self, key, value)
|
||||
self._other_params[key] = value
|
||||
return self
|
||||
|
||||
est = WithKWargs(a="something", c="abcd", d=None)
|
||||
|
||||
expected = "WithKWargs(a='something', c='abcd', d=None)"
|
||||
assert est.__repr__() == expected
|
||||
|
||||
with config_context(print_changed_only=False):
|
||||
expected = "WithKWargs(a='something', b='unchanged', c='abcd', d=None)"
|
||||
assert est.__repr__() == expected
|
||||
|
||||
|
||||
def test_complexity_print_changed_only():
|
||||
# Make sure `__repr__` is called the same amount of times
|
||||
# whether `print_changed_only` is True or False
|
||||
# Non-regression test for
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/18490
|
||||
|
||||
class DummyEstimator(TransformerMixin, BaseEstimator):
|
||||
nb_times_repr_called = 0
|
||||
|
||||
def __init__(self, estimator=None):
|
||||
self.estimator = estimator
|
||||
|
||||
def __repr__(self):
|
||||
DummyEstimator.nb_times_repr_called += 1
|
||||
return super().__repr__()
|
||||
|
||||
def transform(self, X, copy=None): # pragma: no cover
|
||||
return X
|
||||
|
||||
estimator = DummyEstimator(
|
||||
make_pipeline(DummyEstimator(DummyEstimator()), DummyEstimator(), "passthrough")
|
||||
)
|
||||
with config_context(print_changed_only=False):
|
||||
repr(estimator)
|
||||
nb_repr_print_changed_only_false = DummyEstimator.nb_times_repr_called
|
||||
|
||||
DummyEstimator.nb_times_repr_called = 0
|
||||
with config_context(print_changed_only=True):
|
||||
repr(estimator)
|
||||
nb_repr_print_changed_only_true = DummyEstimator.nb_times_repr_called
|
||||
|
||||
assert nb_repr_print_changed_only_false == nb_repr_print_changed_only_true
|
||||
@@ -0,0 +1,192 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
from numpy.testing import assert_array_almost_equal
|
||||
from scipy.special import comb
|
||||
|
||||
from sklearn.utils._random import _our_rand_r_py
|
||||
from sklearn.utils.random import _random_choice_csc, sample_without_replacement
|
||||
|
||||
|
||||
###############################################################################
|
||||
# test custom sampling without replacement algorithm
|
||||
###############################################################################
|
||||
def test_invalid_sample_without_replacement_algorithm():
|
||||
with pytest.raises(ValueError):
|
||||
sample_without_replacement(5, 4, "unknown")
|
||||
|
||||
|
||||
def test_sample_without_replacement_algorithms():
|
||||
methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
|
||||
|
||||
for m in methods:
|
||||
|
||||
def sample_without_replacement_method(
|
||||
n_population, n_samples, random_state=None
|
||||
):
|
||||
return sample_without_replacement(
|
||||
n_population, n_samples, method=m, random_state=random_state
|
||||
)
|
||||
|
||||
check_edge_case_of_sample_int(sample_without_replacement_method)
|
||||
check_sample_int(sample_without_replacement_method)
|
||||
check_sample_int_distribution(sample_without_replacement_method)
|
||||
|
||||
|
||||
def check_edge_case_of_sample_int(sample_without_replacement):
|
||||
# n_population < n_sample
|
||||
with pytest.raises(ValueError):
|
||||
sample_without_replacement(0, 1)
|
||||
with pytest.raises(ValueError):
|
||||
sample_without_replacement(1, 2)
|
||||
|
||||
# n_population == n_samples
|
||||
assert sample_without_replacement(0, 0).shape == (0,)
|
||||
|
||||
assert sample_without_replacement(1, 1).shape == (1,)
|
||||
|
||||
# n_population >= n_samples
|
||||
assert sample_without_replacement(5, 0).shape == (0,)
|
||||
assert sample_without_replacement(5, 1).shape == (1,)
|
||||
|
||||
# n_population < 0 or n_samples < 0
|
||||
with pytest.raises(ValueError):
|
||||
sample_without_replacement(-1, 5)
|
||||
with pytest.raises(ValueError):
|
||||
sample_without_replacement(5, -1)
|
||||
|
||||
|
||||
def check_sample_int(sample_without_replacement):
|
||||
    # This test is heavily inspired by test_random.py of python-core.
|
||||
#
|
||||
# For the entire allowable range of 0 <= k <= N, validate that
|
||||
# the sample is of the correct length and contains only unique items
|
||||
n_population = 100
|
||||
|
||||
for n_samples in range(n_population + 1):
|
||||
s = sample_without_replacement(n_population, n_samples)
|
||||
assert len(s) == n_samples
|
||||
unique = np.unique(s)
|
||||
assert np.size(unique) == n_samples
|
||||
assert np.all(unique < n_population)
|
||||
|
||||
# test edge case n_population == n_samples == 0
|
||||
assert np.size(sample_without_replacement(0, 0)) == 0
|
||||
|
||||
|
||||
def check_sample_int_distribution(sample_without_replacement):
|
||||
    # This test is heavily inspired by test_random.py of python-core.
|
||||
#
|
||||
# For the entire allowable range of 0 <= k <= N, validate that
|
||||
# sample generates all possible permutations
|
||||
n_population = 10
|
||||
|
||||
    # a large number of trials prevents false negatives without slowing down
    # the normal case
|
||||
n_trials = 10000
|
||||
|
||||
for n_samples in range(n_population):
|
||||
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
|
||||
n_expected = comb(n_population, n_samples, exact=True)
|
||||
|
||||
output = {}
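        # count the distinct combinations observed by keying a dict with the
        # sampled subset converted to a frozenset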
|
||||
for i in range(n_trials):
|
||||
output[frozenset(sample_without_replacement(n_population, n_samples))] = (
|
||||
None
|
||||
)
|
||||
|
||||
if len(output) == n_expected:
|
||||
break
|
||||
else:
|
||||
raise AssertionError(
|
||||
"number of combinations != number of expected (%s != %s)"
|
||||
% (len(output), n_expected)
|
||||
)
|
||||
|
||||
|
||||
def test_random_choice_csc(n_samples=10000, random_state=24):
|
||||
# Explicit class probabilities
|
||||
classes = [np.array([0, 1]), np.array([0, 1, 2])]
|
||||
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
|
||||
|
||||
got = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
|
||||
assert sp.issparse(got)
|
||||
|
||||
for k in range(len(classes)):
|
||||
p = np.bincount(got[:, [k]].toarray().ravel()) / float(n_samples)
|
||||
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
|
||||
|
||||
# Implicit class probabilities
|
||||
classes = [[0, 1], [1, 2]] # test for array-like support
|
||||
class_probabilities = [np.array([0.5, 0.5]), np.array([0, 1 / 2, 1 / 2])]
|
||||
|
||||
got = _random_choice_csc(
|
||||
n_samples=n_samples, classes=classes, random_state=random_state
|
||||
)
|
||||
assert sp.issparse(got)
|
||||
|
||||
for k in range(len(classes)):
|
||||
p = np.bincount(got[:, [k]].toarray().ravel()) / float(n_samples)
|
||||
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
|
||||
|
||||
# Edge case probabilities 1.0 and 0.0
|
||||
classes = [np.array([0, 1]), np.array([0, 1, 2])]
|
||||
class_probabilities = [np.array([0.0, 1.0]), np.array([0.0, 1.0, 0.0])]
|
||||
|
||||
got = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
|
||||
assert sp.issparse(got)
|
||||
|
||||
for k in range(len(classes)):
|
||||
p = (
|
||||
np.bincount(
|
||||
got[:, [k]].toarray().ravel(), minlength=len(class_probabilities[k])
|
||||
)
|
||||
/ n_samples
|
||||
)
|
||||
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
|
||||
|
||||
# One class target data
|
||||
classes = [[1], [0]] # test for array-like support
|
||||
class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]
|
||||
|
||||
got = _random_choice_csc(
|
||||
n_samples=n_samples, classes=classes, random_state=random_state
|
||||
)
|
||||
assert sp.issparse(got)
|
||||
|
||||
for k in range(len(classes)):
|
||||
p = np.bincount(got[:, [k]].toarray().ravel()) / n_samples
|
||||
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
|
||||
|
||||
|
||||
def test_random_choice_csc_errors():
|
||||
# the length of an array in classes and class_probabilities is mismatched
|
||||
classes = [np.array([0, 1]), np.array([0, 1, 2, 3])]
|
||||
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
|
||||
with pytest.raises(ValueError):
|
||||
_random_choice_csc(4, classes, class_probabilities, 1)
|
||||
|
||||
# the class dtype is not supported
|
||||
classes = [np.array(["a", "1"]), np.array(["z", "1", "2"])]
|
||||
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
|
||||
with pytest.raises(ValueError):
|
||||
_random_choice_csc(4, classes, class_probabilities, 1)
|
||||
|
||||
# the class dtype is not supported
|
||||
classes = [np.array([4.2, 0.1]), np.array([0.1, 0.2, 9.4])]
|
||||
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
|
||||
with pytest.raises(ValueError):
|
||||
_random_choice_csc(4, classes, class_probabilities, 1)
|
||||
|
||||
# Given probabilities don't sum to 1
|
||||
classes = [np.array([0, 1]), np.array([0, 1, 2])]
|
||||
class_probabilities = [np.array([0.5, 0.6]), np.array([0.6, 0.1, 0.3])]
|
||||
with pytest.raises(ValueError):
|
||||
_random_choice_csc(4, classes, class_probabilities, 1)
|
||||
|
||||
|
||||
def test_our_rand_r():
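    # hard-coded values are the expected outputs of our rand_r implementation
    # for these seeds; they guard against unintended changes to the generator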
|
||||
assert 131541053 == _our_rand_r_py(1273642419)
|
||||
assert 270369 == _our_rand_r_py(0)
|
||||
@@ -0,0 +1,396 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.datasets import (
|
||||
load_iris,
|
||||
make_classification,
|
||||
make_multilabel_classification,
|
||||
make_regression,
|
||||
)
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.linear_model import (
|
||||
LinearRegression,
|
||||
LogisticRegression,
|
||||
)
|
||||
from sklearn.multioutput import ClassifierChain
|
||||
from sklearn.preprocessing import scale
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.utils._mocking import _MockEstimatorOnOffPrediction
|
||||
from sklearn.utils._response import _get_response_values, _get_response_values_binary
|
||||
from sklearn.utils._testing import assert_allclose, assert_array_equal
|
||||
|
||||
X, y = load_iris(return_X_y=True)
|
||||
# scale the data to avoid ConvergenceWarning with LogisticRegression
|
||||
X = scale(X, copy=False)
|
||||
X_binary, y_binary = X[:100], y[:100]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response_method", ["decision_function", "predict_proba", "predict_log_proba"]
|
||||
)
|
||||
def test_get_response_values_regressor_error(response_method):
|
||||
"""Check the error message with regressor an not supported response
|
||||
method."""
|
||||
my_estimator = _MockEstimatorOnOffPrediction(response_methods=[response_method])
|
||||
X = "mocking_data", "mocking_target"
|
||||
err_msg = f"{my_estimator.__class__.__name__} should either be a classifier"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_response_values(my_estimator, X, response_method=response_method)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_values_regressor(return_response_method_used):
|
||||
"""Check the behaviour of `_get_response_values` with regressor."""
|
||||
X, y = make_regression(n_samples=10, random_state=0)
|
||||
regressor = LinearRegression().fit(X, y)
|
||||
results = _get_response_values(
|
||||
regressor,
|
||||
X,
|
||||
response_method="predict",
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_array_equal(results[0], regressor.predict(X))
|
||||
assert results[1] is None
|
||||
if return_response_method_used:
|
||||
assert results[2] == "predict"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response_method",
|
||||
["predict", "decision_function", ["decision_function", "predict"]],
|
||||
)
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_values_outlier_detection(
|
||||
response_method, return_response_method_used
|
||||
):
|
||||
"""Check the behaviour of `_get_response_values` with outlier detector."""
|
||||
X, y = make_classification(n_samples=50, random_state=0)
|
||||
outlier_detector = IsolationForest(random_state=0).fit(X, y)
|
||||
results = _get_response_values(
|
||||
outlier_detector,
|
||||
X,
|
||||
response_method=response_method,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
chosen_response_method = (
|
||||
response_method[0] if isinstance(response_method, list) else response_method
|
||||
)
|
||||
prediction_method = getattr(outlier_detector, chosen_response_method)
|
||||
assert_array_equal(results[0], prediction_method(X))
|
||||
assert results[1] is None
|
||||
if return_response_method_used:
|
||||
assert results[2] == chosen_response_method
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response_method",
|
||||
["predict_proba", "decision_function", "predict", "predict_log_proba"],
|
||||
)
|
||||
def test_get_response_values_classifier_unknown_pos_label(response_method):
|
||||
"""Check that `_get_response_values` raises the proper error message with
|
||||
classifier."""
|
||||
X, y = make_classification(n_samples=10, n_classes=2, random_state=0)
|
||||
classifier = LogisticRegression().fit(X, y)
|
||||
|
||||
# provide a `pos_label` which is not in `y`
|
||||
err_msg = r"pos_label=whatever is not a valid label: It should be one of \[0 1\]"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label="whatever",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
|
||||
def test_get_response_values_classifier_inconsistent_y_pred_for_binary_proba(
|
||||
response_method,
|
||||
):
|
||||
"""Check that `_get_response_values` will raise an error when `y_pred` has a
|
||||
single class with `predict_proba`."""
|
||||
X, y_two_class = make_classification(n_samples=10, n_classes=2, random_state=0)
|
||||
y_single_class = np.zeros_like(y_two_class)
|
||||
classifier = DecisionTreeClassifier().fit(X, y_single_class)
|
||||
|
||||
err_msg = (
|
||||
r"Got predict_proba of shape \(10, 1\), but need classifier with "
|
||||
r"two classes"
|
||||
)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_response_values(classifier, X, response_method=response_method)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_values_binary_classifier_decision_function(
|
||||
return_response_method_used,
|
||||
):
|
||||
"""Check the behaviour of `_get_response_values` with `decision_function`
|
||||
and binary classifier."""
|
||||
X, y = make_classification(
|
||||
n_samples=10,
|
||||
n_classes=2,
|
||||
weights=[0.3, 0.7],
|
||||
random_state=0,
|
||||
)
|
||||
classifier = LogisticRegression().fit(X, y)
|
||||
response_method = "decision_function"
|
||||
|
||||
# default `pos_label`
|
||||
results = _get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label=None,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.decision_function(X))
|
||||
assert results[1] == 1
|
||||
if return_response_method_used:
|
||||
assert results[2] == "decision_function"
|
||||
|
||||
# when forcing `pos_label=classifier.classes_[0]`
|
||||
results = _get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label=classifier.classes_[0],
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.decision_function(X) * -1)
|
||||
assert results[1] == 0
|
||||
if return_response_method_used:
|
||||
assert results[2] == "decision_function"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
|
||||
def test_get_response_values_binary_classifier_predict_proba(
|
||||
return_response_method_used, response_method
|
||||
):
|
||||
"""Check that `_get_response_values` with `predict_proba` and binary
|
||||
classifier."""
|
||||
X, y = make_classification(
|
||||
n_samples=10,
|
||||
n_classes=2,
|
||||
weights=[0.3, 0.7],
|
||||
random_state=0,
|
||||
)
|
||||
classifier = LogisticRegression().fit(X, y)
|
||||
|
||||
# default `pos_label`
|
||||
results = _get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label=None,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], getattr(classifier, response_method)(X)[:, 1])
|
||||
assert results[1] == 1
|
||||
if return_response_method_used:
|
||||
assert len(results) == 3
|
||||
assert results[2] == response_method
|
||||
else:
|
||||
assert len(results) == 2
|
||||
|
||||
# when forcing `pos_label=classifier.classes_[0]`
|
||||
y_pred, pos_label, *_ = _get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label=classifier.classes_[0],
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(y_pred, getattr(classifier, response_method)(X)[:, 0])
|
||||
assert pos_label == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"estimator, X, y, err_msg, params",
|
||||
[
|
||||
(
|
||||
DecisionTreeRegressor(),
|
||||
X_binary,
|
||||
y_binary,
|
||||
"Expected 'estimator' to be a binary classifier",
|
||||
{"response_method": "auto"},
|
||||
),
|
||||
(
|
||||
DecisionTreeClassifier(),
|
||||
X_binary,
|
||||
y_binary,
|
||||
r"pos_label=unknown is not a valid label: It should be one of \[0 1\]",
|
||||
{"response_method": "auto", "pos_label": "unknown"},
|
||||
),
|
||||
(
|
||||
DecisionTreeClassifier(),
|
||||
X,
|
||||
y,
|
||||
"be a binary classifier. Got 3 classes instead.",
|
||||
{"response_method": "predict_proba"},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_get_response_error(estimator, X, y, err_msg, params):
|
||||
"""Check that we raise the proper error messages in _get_response_values_binary."""
|
||||
|
||||
estimator = clone(estimator).fit(X, y) # clone to make test execution thread-safe
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_response_values_binary(estimator, X, **params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_predict_proba(return_response_method_used):
|
||||
"""Check the behaviour of `_get_response_values_binary` using `predict_proba`."""
|
||||
classifier = DecisionTreeClassifier().fit(X_binary, y_binary)
|
||||
results = _get_response_values_binary(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method="predict_proba",
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 1])
|
||||
assert results[1] == 1
|
||||
if return_response_method_used:
|
||||
assert results[2] == "predict_proba"
|
||||
|
||||
results = _get_response_values_binary(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method="predict_proba",
|
||||
pos_label=0,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 0])
|
||||
assert results[1] == 0
|
||||
if return_response_method_used:
|
||||
assert results[2] == "predict_proba"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_decision_function(return_response_method_used):
|
||||
"""Check the behaviour of `_get_response_values_binary` using decision_function."""
|
||||
classifier = LogisticRegression().fit(X_binary, y_binary)
|
||||
results = _get_response_values_binary(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method="decision_function",
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.decision_function(X_binary))
|
||||
assert results[1] == 1
|
||||
if return_response_method_used:
|
||||
assert results[2] == "decision_function"
|
||||
|
||||
results = _get_response_values_binary(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method="decision_function",
|
||||
pos_label=0,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.decision_function(X_binary) * -1)
|
||||
assert results[1] == 0
|
||||
if return_response_method_used:
|
||||
assert results[2] == "decision_function"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"estimator, response_method",
|
||||
[
|
||||
(DecisionTreeClassifier(max_depth=2, random_state=0), "predict_proba"),
|
||||
(DecisionTreeClassifier(max_depth=2, random_state=0), "predict_log_proba"),
|
||||
(LogisticRegression(), "decision_function"),
|
||||
],
|
||||
)
|
||||
def test_get_response_values_multiclass(estimator, response_method):
|
||||
"""Check that we can call `_get_response_values` with a multiclass estimator.
|
||||
It should return the predictions untouched.
|
||||
"""
|
||||
estimator = clone(estimator)
|
||||
estimator.fit(X, y)
|
||||
predictions, pos_label = _get_response_values(
|
||||
estimator, X, response_method=response_method
|
||||
)
|
||||
|
||||
assert pos_label is None
|
||||
assert predictions.shape == (X.shape[0], len(estimator.classes_))
|
||||
if response_method == "predict_proba":
|
||||
assert np.logical_and(predictions >= 0, predictions <= 1).all()
|
||||
elif response_method == "predict_log_proba":
|
||||
assert (predictions <= 0.0).all()
|
||||
|
||||
|
||||
def test_get_response_values_with_response_list():
|
||||
"""Check the behaviour of passing a list of responses to `_get_response_values`."""
|
||||
classifier = LogisticRegression().fit(X_binary, y_binary)
|
||||
|
||||
# it should use `predict_proba`
|
||||
y_pred, pos_label, response_method = _get_response_values(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method=["predict_proba", "decision_function"],
|
||||
return_response_method_used=True,
|
||||
)
|
||||
assert_allclose(y_pred, classifier.predict_proba(X_binary)[:, 1])
|
||||
assert pos_label == 1
|
||||
assert response_method == "predict_proba"
|
||||
|
||||
# it should use `decision_function`
|
||||
y_pred, pos_label, response_method = _get_response_values(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method=["decision_function", "predict_proba"],
|
||||
return_response_method_used=True,
|
||||
)
|
||||
assert_allclose(y_pred, classifier.decision_function(X_binary))
|
||||
assert pos_label == 1
|
||||
assert response_method == "decision_function"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response_method", ["predict_proba", "decision_function", "predict"]
|
||||
)
|
||||
def test_get_response_values_multilabel_indicator(response_method):
|
||||
X, Y = make_multilabel_classification(random_state=0)
|
||||
estimator = ClassifierChain(LogisticRegression()).fit(X, Y)
|
||||
|
||||
y_pred, pos_label = _get_response_values(
|
||||
estimator, X, response_method=response_method
|
||||
)
|
||||
assert pos_label is None
|
||||
assert y_pred.shape == Y.shape
|
||||
|
||||
if response_method == "predict_proba":
|
||||
assert np.logical_and(y_pred >= 0, y_pred <= 1).all()
|
||||
elif response_method == "decision_function":
|
||||
# values returned by `decision_function` are not bounded in [0, 1]
|
||||
assert (y_pred < 0).sum() > 0
|
||||
assert (y_pred > 1).sum() > 0
|
||||
else: # response_method == "predict"
|
||||
assert np.logical_or(y_pred == 0, y_pred == 1).all()
|
||||
|
||||
|
||||
def test_response_values_type_of_target_on_classes_no_warning():
|
||||
"""
|
||||
    Ensure `_get_response_values` does not raise a spurious warning.
|
||||
|
||||
"The number of unique classes is greater than > 50% of samples"
|
||||
warning should not be raised when calling `type_of_target(classes_)`.
|
||||
|
||||
Non-regression test for issue #31583.
|
||||
"""
|
||||
X = np.random.RandomState(0).randn(120, 3)
|
||||
# 30 classes, less than 50% of number of samples
|
||||
y = np.repeat(np.arange(30), 4)
|
||||
|
||||
clf = LogisticRegression().fit(X, y)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", UserWarning)
|
||||
|
||||
_get_response_values(clf, X, response_method="predict_proba")
|
||||
@@ -0,0 +1,185 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
from functools import partial
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.utils._seq_dataset import (
|
||||
ArrayDataset32,
|
||||
ArrayDataset64,
|
||||
CSRDataset32,
|
||||
CSRDataset64,
|
||||
)
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
iris = load_iris()
|
||||
X64 = iris.data.astype(np.float64)
|
||||
y64 = iris.target.astype(np.float64)
|
||||
sample_weight64 = np.arange(y64.size, dtype=np.float64)
|
||||
|
||||
X32 = iris.data.astype(np.float32)
|
||||
y32 = iris.target.astype(np.float32)
|
||||
sample_weight32 = np.arange(y32.size, dtype=np.float32)
|
||||
|
||||
floating = [np.float32, np.float64]
|
||||
|
||||
|
||||
def assert_csr_equal_values(current, expected):
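    # compare two CSR matrices entry by entry after dropping explicit zeros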
|
||||
current.eliminate_zeros()
|
||||
expected.eliminate_zeros()
|
||||
expected = expected.astype(current.dtype)
|
||||
assert current.shape[0] == expected.shape[0]
|
||||
assert current.shape[1] == expected.shape[1]
|
||||
assert_array_equal(current.data, expected.data)
|
||||
assert_array_equal(current.indices, expected.indices)
|
||||
assert_array_equal(current.indptr, expected.indptr)
|
||||
|
||||
|
||||
def _make_dense_dataset(float_dtype):
|
||||
if float_dtype == np.float32:
|
||||
return ArrayDataset32(X32, y32, sample_weight32, seed=42)
|
||||
return ArrayDataset64(X64, y64, sample_weight64, seed=42)
|
||||
|
||||
|
||||
def _make_sparse_dataset(csr_container, float_dtype):
|
||||
if float_dtype == np.float32:
|
||||
X, y, sample_weight, csr_dataset = X32, y32, sample_weight32, CSRDataset32
|
||||
else:
|
||||
X, y, sample_weight, csr_dataset = X64, y64, sample_weight64, CSRDataset64
|
||||
X = csr_container(X)
|
||||
return csr_dataset(X.data, X.indptr, X.indices, y, sample_weight, seed=42)
|
||||
|
||||
|
||||
def _dense_dataset_factories():
|
||||
return [partial(_make_dense_dataset, float_dtype) for float_dtype in floating]
|
||||
|
||||
|
||||
def _sparse_dataset_factories():
|
||||
return [
|
||||
partial(_make_sparse_dataset, csr_container, float_dtype)
|
||||
for csr_container, float_dtype in product(CSR_CONTAINERS, floating)
|
||||
]
|
||||
|
||||
|
||||
def _fused_types_dataset_factories():
|
||||
all_factories = _dense_dataset_factories() + _sparse_dataset_factories()
|
||||
# group dataset by array types to get a tuple (float32, float64)
|
||||
return [all_factories[idx : idx + 2] for idx in range(0, len(all_factories), 2)]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
@pytest.mark.parametrize(
|
||||
"dataset_factory", _dense_dataset_factories() + _sparse_dataset_factories()
|
||||
)
|
||||
def test_seq_dataset_basic_iteration(dataset_factory, csr_container):
|
||||
NUMBER_OF_RUNS = 5
|
||||
X_csr64 = csr_container(X64)
|
||||
dataset = dataset_factory()
|
||||
for _ in range(NUMBER_OF_RUNS):
|
||||
# next sample
|
||||
xi_, yi, swi, idx = dataset._next_py()
|
||||
xi = csr_container(xi_, shape=(1, X64.shape[1]))
|
||||
|
||||
assert_csr_equal_values(xi, X_csr64[[idx]])
|
||||
assert yi == y64[idx]
|
||||
assert swi == sample_weight64[idx]
|
||||
|
||||
# random sample
|
||||
xi_, yi, swi, idx = dataset._random_py()
|
||||
xi = csr_container(xi_, shape=(1, X64.shape[1]))
|
||||
|
||||
assert_csr_equal_values(xi, X_csr64[[idx]])
|
||||
assert yi == y64[idx]
|
||||
assert swi == sample_weight64[idx]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"float_dtype, csr_container", product(floating, CSR_CONTAINERS)
|
||||
)
|
||||
def test_seq_dataset_shuffle(float_dtype, csr_container):
|
||||
dense_dataset = _make_dense_dataset(float_dtype)
|
||||
sparse_dataset = _make_sparse_dataset(csr_container, float_dtype)
|
||||
# not shuffled
|
||||
for i in range(5):
|
||||
_, _, _, idx1 = dense_dataset._next_py()
|
||||
_, _, _, idx2 = sparse_dataset._next_py()
|
||||
assert idx1 == i
|
||||
assert idx2 == i
|
||||
|
||||
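    # hard-coded indices follow the deterministic pseudo-random stream given by
    # the seed=42 used to build the datasets above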
for i in [132, 50, 9, 18, 58]:
|
||||
_, _, _, idx1 = dense_dataset._random_py()
|
||||
_, _, _, idx2 = sparse_dataset._random_py()
|
||||
assert idx1 == i
|
||||
assert idx2 == i
|
||||
|
||||
seed = 77
|
||||
dense_dataset._shuffle_py(seed)
|
||||
sparse_dataset._shuffle_py(seed)
|
||||
|
||||
idx_next = [63, 91, 148, 87, 29]
|
||||
idx_shuffle = [137, 125, 56, 121, 127]
|
||||
for i, j in zip(idx_next, idx_shuffle):
|
||||
_, _, _, idx1 = dense_dataset._next_py()
|
||||
_, _, _, idx2 = sparse_dataset._next_py()
|
||||
assert idx1 == i
|
||||
assert idx2 == i
|
||||
|
||||
_, _, _, idx1 = dense_dataset._random_py()
|
||||
_, _, _, idx2 = sparse_dataset._random_py()
|
||||
assert idx1 == j
|
||||
assert idx2 == j
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dataset_32_factory, dataset_64_factory", _fused_types_dataset_factories()
|
||||
)
|
||||
def test_fused_types_consistency(dataset_32_factory, dataset_64_factory):
|
||||
dataset_32, dataset_64 = dataset_32_factory(), dataset_64_factory()
|
||||
NUMBER_OF_RUNS = 5
|
||||
for _ in range(NUMBER_OF_RUNS):
|
||||
# next sample
|
||||
(xi_data32, _, _), yi32, _, _ = dataset_32._next_py()
|
||||
(xi_data64, _, _), yi64, _, _ = dataset_64._next_py()
|
||||
|
||||
assert xi_data32.dtype == np.float32
|
||||
assert xi_data64.dtype == np.float64
|
||||
|
||||
assert_allclose(xi_data64, xi_data32, rtol=1e-5)
|
||||
assert_allclose(yi64, yi32, rtol=1e-5)
|
||||
|
||||
|
||||
def test_buffer_dtype_mismatch_error():
|
||||
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
|
||||
ArrayDataset64(X32, y32, sample_weight32, seed=42)
|
||||
|
||||
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
|
||||
ArrayDataset32(X64, y64, sample_weight64, seed=42)
|
||||
|
||||
for csr_container in CSR_CONTAINERS:
|
||||
X_csr32 = csr_container(X32)
|
||||
X_csr64 = csr_container(X64)
|
||||
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
|
||||
CSRDataset64(
|
||||
X_csr32.data,
|
||||
X_csr32.indptr,
|
||||
X_csr32.indices,
|
||||
y32,
|
||||
sample_weight32,
|
||||
seed=42,
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
|
||||
CSRDataset32(
|
||||
X_csr64.data,
|
||||
X_csr64.indptr,
|
||||
X_csr64.indices,
|
||||
y64,
|
||||
sample_weight64,
|
||||
seed=42,
|
||||
)
|
||||
@@ -0,0 +1,471 @@
|
||||
import importlib
|
||||
from collections import namedtuple
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn._config import config_context, get_config
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.utils._set_output import (
|
||||
ADAPTERS_MANAGER,
|
||||
ContainerAdapterProtocol,
|
||||
_get_adapter_from_container,
|
||||
_get_output_config,
|
||||
_safe_set_output,
|
||||
_SetOutputMixin,
|
||||
_wrap_data_with_container,
|
||||
check_library_installed,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
|
||||
def test_pandas_adapter():
|
||||
"""Check pandas adapter has expected behavior."""
|
||||
pd = pytest.importorskip("pandas")
|
||||
X_np = np.asarray([[1, 0, 3], [0, 0, 1]])
|
||||
columns = np.asarray(["f0", "f1", "f2"], dtype=object)
|
||||
index = np.asarray([1, 2])
|
||||
X_df_orig = pd.DataFrame([[1, 2], [1, 3]], index=index)
|
||||
X_ser_orig = pd.Series([2, 3], index=index)
|
||||
|
||||
adapter = ADAPTERS_MANAGER.adapters["pandas"]
|
||||
X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns)
|
||||
assert isinstance(X_container, pd.DataFrame)
|
||||
assert_array_equal(X_container.columns, columns)
|
||||
assert_array_equal(X_container.index, index)
|
||||
|
||||
# use original index when the original is a series
|
||||
X_container = adapter.create_container(X_np, X_ser_orig, columns=lambda: columns)
|
||||
assert isinstance(X_container, pd.DataFrame)
|
||||
assert_array_equal(X_container.columns, columns)
|
||||
assert_array_equal(X_container.index, index)
|
||||
|
||||
# Input dataframe's index does not change
|
||||
new_columns = np.asarray(["f0", "f1"], dtype=object)
|
||||
X_df = pd.DataFrame([[1, 2], [1, 3]], index=[10, 12])
|
||||
new_df = adapter.create_container(X_df, X_df_orig, columns=new_columns)
|
||||
assert_array_equal(new_df.columns, new_columns)
|
||||
assert_array_equal(new_df.index, X_df.index)
|
||||
|
||||
assert adapter.is_supported_container(X_df)
|
||||
assert not adapter.is_supported_container(X_np)
|
||||
|
||||
    # adapter.rename_columns renames the columns
|
||||
new_columns = np.array(["a", "c"], dtype=object)
|
||||
new_df = adapter.rename_columns(X_df, new_columns)
|
||||
assert_array_equal(new_df.columns, new_columns)
|
||||
|
||||
# adapter.hstack stacks the dataframes horizontally.
|
||||
X_df_1 = pd.DataFrame([[1, 2, 5], [3, 4, 6]], columns=["a", "b", "e"])
|
||||
X_df_2 = pd.DataFrame([[4], [5]], columns=["c"])
|
||||
X_stacked = adapter.hstack([X_df_1, X_df_2])
|
||||
|
||||
expected_df = pd.DataFrame(
|
||||
[[1, 2, 5, 4], [3, 4, 6, 5]], columns=["a", "b", "e", "c"]
|
||||
)
|
||||
pd.testing.assert_frame_equal(X_stacked, expected_df)
|
||||
|
||||
    # check that we properly update the columns even with duplicate column
    # names; this use case can happen when using ColumnTransformer
|
||||
# non-regression test for gh-28260
|
||||
X_df = pd.DataFrame([[1, 2], [1, 3]], columns=["a", "a"])
|
||||
new_columns = np.array(["x__a", "y__a"], dtype=object)
|
||||
new_df = adapter.rename_columns(X_df, new_columns)
|
||||
assert_array_equal(new_df.columns, new_columns)
|
||||
|
||||
# check the behavior of the inplace parameter in `create_container`
|
||||
# we should trigger a copy
|
||||
X_df = pd.DataFrame([[1, 2], [1, 3]], index=index)
|
||||
X_output = adapter.create_container(X_df, X_df, columns=["a", "b"], inplace=False)
|
||||
assert X_output is not X_df
|
||||
assert list(X_df.columns) == [0, 1]
|
||||
assert list(X_output.columns) == ["a", "b"]
|
||||
|
||||
# the operation is inplace
|
||||
X_df = pd.DataFrame([[1, 2], [1, 3]], index=index)
|
||||
X_output = adapter.create_container(X_df, X_df, columns=["a", "b"], inplace=True)
|
||||
assert X_output is X_df
|
||||
assert list(X_df.columns) == ["a", "b"]
|
||||
assert list(X_output.columns) == ["a", "b"]
|
||||
|
||||
|
||||
def test_polars_adapter():
|
||||
"""Check Polars adapter has expected behavior."""
|
||||
pl = pytest.importorskip("polars")
|
||||
X_np = np.array([[1, 0, 3], [0, 0, 1]])
|
||||
columns = ["f1", "f2", "f3"]
|
||||
X_df_orig = pl.DataFrame(X_np, schema=columns, orient="row")
|
||||
|
||||
adapter = ADAPTERS_MANAGER.adapters["polars"]
|
||||
X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns)
|
||||
|
||||
assert isinstance(X_container, pl.DataFrame)
|
||||
assert_array_equal(X_container.columns, columns)
|
||||
|
||||
# Update columns with create_container
|
||||
new_columns = np.asarray(["a", "b", "c"], dtype=object)
|
||||
new_df = adapter.create_container(X_df_orig, X_df_orig, columns=new_columns)
|
||||
assert_array_equal(new_df.columns, new_columns)
|
||||
|
||||
assert adapter.is_supported_container(X_df_orig)
|
||||
assert not adapter.is_supported_container(X_np)
|
||||
|
||||
    # adapter.rename_columns renames the columns
|
||||
new_columns = np.array(["a", "c", "g"], dtype=object)
|
||||
new_df = adapter.rename_columns(X_df_orig, new_columns)
|
||||
assert_array_equal(new_df.columns, new_columns)
|
||||
|
||||
# adapter.hstack stacks the dataframes horizontally.
|
||||
X_df_1 = pl.DataFrame([[1, 2, 5], [3, 4, 6]], schema=["a", "b", "e"], orient="row")
|
||||
X_df_2 = pl.DataFrame([[4], [5]], schema=["c"], orient="row")
|
||||
X_stacked = adapter.hstack([X_df_1, X_df_2])
|
||||
|
||||
expected_df = pl.DataFrame(
|
||||
[[1, 2, 5, 4], [3, 4, 6, 5]], schema=["a", "b", "e", "c"], orient="row"
|
||||
)
|
||||
from polars.testing import assert_frame_equal
|
||||
|
||||
assert_frame_equal(X_stacked, expected_df)
|
||||
|
||||
# check the behavior of the inplace parameter in `create_container`
|
||||
# we should trigger a copy
|
||||
X_df = pl.DataFrame([[1, 2], [1, 3]], schema=["a", "b"], orient="row")
|
||||
X_output = adapter.create_container(X_df, X_df, columns=["c", "d"], inplace=False)
|
||||
assert X_output is not X_df
|
||||
assert list(X_df.columns) == ["a", "b"]
|
||||
assert list(X_output.columns) == ["c", "d"]
|
||||
|
||||
# the operation is inplace
|
||||
X_df = pl.DataFrame([[1, 2], [1, 3]], schema=["a", "b"], orient="row")
|
||||
X_output = adapter.create_container(X_df, X_df, columns=["c", "d"], inplace=True)
|
||||
assert X_output is X_df
|
||||
assert list(X_df.columns) == ["c", "d"]
|
||||
assert list(X_output.columns) == ["c", "d"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test__container_error_validation(csr_container):
|
||||
"""Check errors in _wrap_data_with_container."""
|
||||
X = np.asarray([[1, 0, 3], [0, 0, 1]])
|
||||
X_csr = csr_container(X)
|
||||
match = "The transformer outputs a scipy sparse matrix."
|
||||
with config_context(transform_output="pandas"):
|
||||
with pytest.raises(ValueError, match=match):
|
||||
_wrap_data_with_container("transform", X_csr, X, StandardScaler())
|
||||
|
||||
|
||||
class EstimatorWithoutSetOutputAndWithoutTransform:
|
||||
pass
|
||||
|
||||
|
||||
class EstimatorNoSetOutputWithTransform:
|
||||
def transform(self, X, y=None):
|
||||
return X # pragma: no cover
|
||||
|
||||
|
||||
class EstimatorWithSetOutput(_SetOutputMixin):
|
||||
def fit(self, X, y=None):
|
||||
self.n_features_in_ = X.shape[1]
|
||||
return self
|
||||
|
||||
def transform(self, X, y=None):
|
||||
return X
|
||||
|
||||
def get_feature_names_out(self, input_features=None):
|
||||
return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)
|
||||
|
||||
|
||||
def test__safe_set_output():
|
||||
"""Check _safe_set_output works as expected."""
|
||||
|
||||
# Estimator without transform will not raise when setting set_output for transform.
|
||||
est = EstimatorWithoutSetOutputAndWithoutTransform()
|
||||
_safe_set_output(est, transform="pandas")
|
||||
|
||||
# Estimator with transform but without set_output will raise
|
||||
est = EstimatorNoSetOutputWithTransform()
|
||||
with pytest.raises(ValueError, match="Unable to configure output"):
|
||||
_safe_set_output(est, transform="pandas")
|
||||
|
||||
est = EstimatorWithSetOutput().fit(np.asarray([[1, 2, 3]]))
|
||||
_safe_set_output(est, transform="pandas")
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == "pandas"
|
||||
|
||||
_safe_set_output(est, transform="default")
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == "default"
|
||||
|
||||
# transform is None is a no-op, so the config remains "default"
|
||||
_safe_set_output(est, transform=None)
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == "default"
|
||||
|
||||
|
||||
class EstimatorNoSetOutputWithTransformNoFeatureNamesOut(_SetOutputMixin):
|
||||
def transform(self, X, y=None):
|
||||
return X # pragma: no cover
|
||||
|
||||
|
||||
def test_set_output_mixin():
|
||||
"""Estimator without get_feature_names_out does not define `set_output`."""
|
||||
est = EstimatorNoSetOutputWithTransformNoFeatureNamesOut()
|
||||
assert not hasattr(est, "set_output")
|
||||
|
||||
|
||||
def test__safe_set_output_error():
|
||||
"""Check transform with invalid config."""
|
||||
X = np.asarray([[1, 0, 3], [0, 0, 1]])
|
||||
|
||||
est = EstimatorWithSetOutput()
|
||||
_safe_set_output(est, transform="bad")
|
||||
|
||||
msg = "output config must be in"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
est.transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
|
||||
def test_set_output_method(dataframe_lib):
|
||||
"""Check that the output is a dataframe."""
|
||||
lib = pytest.importorskip(dataframe_lib)
|
||||
|
||||
X = np.asarray([[1, 0, 3], [0, 0, 1]])
|
||||
est = EstimatorWithSetOutput().fit(X)
|
||||
|
||||
# transform=None is a no-op
|
||||
est2 = est.set_output(transform=None)
|
||||
assert est2 is est
|
||||
X_trans_np = est2.transform(X)
|
||||
assert isinstance(X_trans_np, np.ndarray)
|
||||
|
||||
est.set_output(transform=dataframe_lib)
|
||||
|
||||
X_trans_pd = est.transform(X)
|
||||
|
||||
assert isinstance(X_trans_pd, lib.DataFrame)
|
||||
|
||||
|
||||
def test_set_output_method_error():
|
||||
"""Check transform fails with invalid transform."""
|
||||
|
||||
X = np.asarray([[1, 0, 3], [0, 0, 1]])
|
||||
est = EstimatorWithSetOutput().fit(X)
|
||||
est.set_output(transform="bad")
|
||||
|
||||
msg = "output config must be in"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
est.transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("transform_output", ["pandas", "polars"])
|
||||
def test__get_output_config(transform_output):
|
||||
"""Check _get_output_config works as expected."""
|
||||
|
||||
# Without a configuration set, the global config is used
|
||||
global_config = get_config()["transform_output"]
|
||||
config = _get_output_config("transform")
|
||||
assert config["dense"] == global_config
|
||||
|
||||
with config_context(transform_output=transform_output):
|
||||
# with estimator=None, the global config is used
|
||||
config = _get_output_config("transform")
|
||||
assert config["dense"] == transform_output
|
||||
|
||||
est = EstimatorNoSetOutputWithTransform()
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == transform_output
|
||||
|
||||
est = EstimatorWithSetOutput()
|
||||
        # If the estimator has no config, use the global config
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == transform_output
|
||||
|
||||
# If estimator has a config, use local config
|
||||
est.set_output(transform="default")
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == "default"
|
||||
|
||||
est.set_output(transform=transform_output)
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == transform_output
|
||||
|
||||
|
||||
class EstimatorWithSetOutputNoAutoWrap(_SetOutputMixin, auto_wrap_output_keys=None):
|
||||
def transform(self, X, y=None):
|
||||
return X
|
||||
|
||||
|
||||
def test_get_output_auto_wrap_false():
|
||||
"""Check that auto_wrap_output_keys=None does not wrap."""
|
||||
est = EstimatorWithSetOutputNoAutoWrap()
|
||||
assert not hasattr(est, "set_output")
|
||||
|
||||
X = np.asarray([[1, 0, 3], [0, 0, 1]])
|
||||
assert X is est.transform(X)
|
||||
|
||||
|
||||
def test_auto_wrap_output_keys_errors_with_incorrect_input():
|
||||
msg = "auto_wrap_output_keys must be None or a tuple of keys."
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
|
||||
class BadEstimator(_SetOutputMixin, auto_wrap_output_keys="bad_parameter"):
|
||||
pass
|
||||
|
||||
|
||||
class AnotherMixin:
|
||||
def __init_subclass__(cls, custom_parameter, **kwargs):
|
||||
super().__init_subclass__(**kwargs)
|
||||
cls.custom_parameter = custom_parameter
|
||||
|
||||
|
||||
def test_set_output_mixin_custom_mixin():
|
||||
"""Check that multiple init_subclasses passes parameters up."""
|
||||
|
||||
class BothMixinEstimator(_SetOutputMixin, AnotherMixin, custom_parameter=123):
|
||||
def transform(self, X, y=None):
|
||||
return X
|
||||
|
||||
def get_feature_names_out(self, input_features=None):
|
||||
return input_features
|
||||
|
||||
est = BothMixinEstimator()
|
||||
assert est.custom_parameter == 123
|
||||
assert hasattr(est, "set_output")
|
||||
|
||||
|
||||
def test_set_output_mro():
|
||||
"""Check that multi-inheritance resolves to the correct class method.
|
||||
|
||||
    Non-regression test for gh-25293.
|
||||
"""
|
||||
|
||||
class Base(_SetOutputMixin):
|
||||
def transform(self, X):
|
||||
return "Base"
|
||||
|
||||
class A(Base):
|
||||
pass
|
||||
|
||||
class B(Base):
|
||||
def transform(self, X):
|
||||
return "B"
|
||||
|
||||
class C(A, B):
|
||||
pass
|
||||
|
||||
assert C().transform(None) == "B"
|
||||
|
||||
|
||||
class EstimatorWithSetOutputIndex(_SetOutputMixin):
    def fit(self, X, y=None):
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        import pandas as pd

        # transform by giving output a new index.
        return pd.DataFrame(X.to_numpy(), index=[f"s{i}" for i in range(X.shape[0])])

    def get_feature_names_out(self, input_features=None):
        return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)


def test_set_output_pandas_keep_index():
    """Check that set_output does not override index.

    Non-regression test for gh-25730.
    """
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=[0, 1])
    est = EstimatorWithSetOutputIndex().set_output(transform="pandas")
    est.fit(X)

    X_trans = est.transform(X)
    assert_array_equal(X_trans.index, ["s0", "s1"])


class EstimatorReturnTuple(_SetOutputMixin):
    def __init__(self, OutputTuple):
        self.OutputTuple = OutputTuple

    def transform(self, X, y=None):
        return self.OutputTuple(X, 2 * X)


def test_set_output_named_tuple_out():
    """Check that namedtuples are kept by default."""
    Output = namedtuple("Output", "X, Y")
    X = np.asarray([[1, 2, 3]])
    est = EstimatorReturnTuple(OutputTuple=Output)
    X_trans = est.transform(X)

    assert isinstance(X_trans, Output)
    assert_array_equal(X_trans.X, X)
    assert_array_equal(X_trans.Y, 2 * X)


class EstimatorWithListInput(_SetOutputMixin):
    def fit(self, X, y=None):
        assert isinstance(X, list)
        self.n_features_in_ = len(X[0])
        return self

    def transform(self, X, y=None):
        return X

    def get_feature_names_out(self, input_features=None):
        return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)


@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_set_output_list_input(dataframe_lib):
    """Check set_output for list input.

    Non-regression test for #27037.
    """
    lib = pytest.importorskip(dataframe_lib)

    X = [[0, 1, 2, 3], [4, 5, 6, 7]]
    est = EstimatorWithListInput()
    est.set_output(transform=dataframe_lib)

    X_out = est.fit(X).transform(X)
    assert isinstance(X_out, lib.DataFrame)
    assert_array_equal(X_out.columns, ["X0", "X1", "X2", "X3"])


@pytest.mark.parametrize("name", sorted(ADAPTERS_MANAGER.adapters))
def test_adapter_class_has_interface(name):
    """Check adapters have the correct interface."""
    assert isinstance(ADAPTERS_MANAGER.adapters[name], ContainerAdapterProtocol)


def test_check_library_installed(monkeypatch):
    """Check import error changed."""
    orig_import_module = importlib.import_module

    def patched_import_module(name):
        if name == "pandas":
            raise ImportError()
        orig_import_module(name, package=None)

    monkeypatch.setattr(importlib, "import_module", patched_import_module)

    msg = "Setting output container to 'pandas' requires"
    with pytest.raises(ImportError, match=msg):
        check_library_installed("pandas")


def test_get_adapter_from_container():
    """Check the behavior of `_get_adapter_from_container`."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
    adapter = _get_adapter_from_container(X)
    assert adapter.container_lib == "pandas"
    err_msg = "The container does not have a registered adapter in scikit-learn."
    with pytest.raises(ValueError, match=err_msg):
        _get_adapter_from_container(X.to_numpy())
@@ -0,0 +1,65 @@
from collections import defaultdict

import numpy as np
from numpy.testing import assert_array_almost_equal

from sklearn.utils.graph import single_source_shortest_path_length


def floyd_warshall_slow(graph, directed=False):
    N = graph.shape[0]

    # set zero entries (absent edges) to infinity
    graph[np.where(graph == 0)] = np.inf

    # set diagonal to zero
    graph.flat[:: N + 1] = 0

    if not directed:
        graph = np.minimum(graph, graph.T)

    for k in range(N):
        for i in range(N):
            for j in range(N):
                graph[i, j] = min(graph[i, j], graph[i, k] + graph[k, j])

    graph[np.where(np.isinf(graph))] = 0

    return graph
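

# Illustrative note: `floyd_warshall_slow` above is a brute-force O(N^3)
# all-pairs shortest-path reference; `test_shortest_path` below uses it to
# cross-check the BFS-based `single_source_shortest_path_length` on a graph
# whose edge weights are set to 0 or 1.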


def generate_graph(N=20):
    # sparse grid of distances
    rng = np.random.RandomState(0)
    dist_matrix = rng.random_sample((N, N))

    # make symmetric: distances are not direction-dependent
    dist_matrix = dist_matrix + dist_matrix.T

    # make graph sparse
    i = (rng.randint(N, size=N * N // 2), rng.randint(N, size=N * N // 2))
    dist_matrix[i] = 0

    # set diagonal to zero
    dist_matrix.flat[:: N + 1] = 0

    return dist_matrix


def test_shortest_path():
    dist_matrix = generate_graph(20)
    # We compare path length and not costs (-> set distances to 0 or 1)
    dist_matrix[dist_matrix != 0] = 1

    for directed in (True, False):
        if not directed:
            dist_matrix = np.minimum(dist_matrix, dist_matrix.T)

        graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)
        for i in range(dist_matrix.shape[0]):
            # Non-reachable nodes have distance 0 in graph_py
            dist_dict = defaultdict(int)
            dist_dict.update(single_source_shortest_path_length(dist_matrix, i))

            for j in range(graph_py[i].shape[0]):
                assert_array_almost_equal(dist_dict[j], graph_py[i, j])
@@ -0,0 +1,40 @@
from threadpoolctl import threadpool_info

from sklearn.utils._show_versions import _get_deps_info, _get_sys_info, show_versions
from sklearn.utils._testing import ignore_warnings


def test_get_sys_info():
    sys_info = _get_sys_info()

    assert "python" in sys_info
    assert "executable" in sys_info
    assert "machine" in sys_info


def test_get_deps_info():
    with ignore_warnings():
        deps_info = _get_deps_info()

    assert "pip" in deps_info
    assert "setuptools" in deps_info
    assert "sklearn" in deps_info
    assert "numpy" in deps_info
    assert "scipy" in deps_info
    assert "Cython" in deps_info
    assert "pandas" in deps_info
    assert "matplotlib" in deps_info
    assert "joblib" in deps_info


def test_show_versions(capsys):
    with ignore_warnings():
        show_versions()
        out, err = capsys.readouterr()

    assert "python" in out
    assert "numpy" in out

    info = threadpool_info()
    if info:
        assert "threadpoolctl info:" in out
File diff suppressed because it is too large
@@ -0,0 +1,487 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from pytest import approx

from sklearn._config import config_context
from sklearn.utils._array_api import (
    _convert_to_numpy,
    get_namespace,
    yield_namespace_device_dtype_combinations,
)
from sklearn.utils._array_api import device as array_device
from sklearn.utils.estimator_checks import _array_api_for_tests
from sklearn.utils.fixes import np_version, parse_version
from sklearn.utils.stats import _weighted_percentile


@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("size", [10, 15])
def test_weighted_percentile_matches_median(size, average):
    """Ensure `_weighted_percentile` matches `median` when expected.

    With unit `sample_weight`, `_weighted_percentile` should match the median except
    when `average=False` and the number of samples is even.
    For an even array and `average=False`, `percentile_rank=50` gives the lower
    of the two 'middle' values, which are averaged when calculating the `median`.
    """
    y = np.arange(size)
    sample_weight = np.ones_like(y)

    score = _weighted_percentile(y, sample_weight, 50, average=average)

    # `_weighted_percentile(average=False)` does not match `median` when n is even
    if size % 2 == 0 and average is False:
        assert score != np.median(y)
    else:
        assert approx(score) == np.median(y)
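

# Illustrative example of the even-size case above: with y = [0, 1, 2, 3] and
# unit weights, np.median(y) == 1.5, while the "inverted_cdf" definition used
# when average=False returns the lower middle value, e.g.
# np.percentile([0, 1, 2, 3], 50, method="inverted_cdf") == 1.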


@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("percentile_rank", [20, 35, 61, [5, 47]])
@pytest.mark.parametrize("size", [10, 15])
def test_weighted_percentile_matches_numpy(
    global_random_seed, size, percentile_rank, average
):
    """Check `_weighted_percentile` with unit weights is correct.

    `average=True` results should be the same as `np.percentile`'s
    'averaged_inverted_cdf'.
    `average=False` results should be the same as `np.percentile`'s
    'inverted_cdf'.
    Note `np.percentile` is the same as `np.quantile` except `q` is in range [0, 100].

    We parametrize through different `percentile_rank` and `size` to
    ensure we get cases where `g=0` and `g>0` (see Hyndman and Fan 1996 for details).
    """
    rng = np.random.RandomState(global_random_seed)
    y = rng.randint(20, size=size)
    sw = np.ones_like(y)

    score = _weighted_percentile(y, sw, percentile_rank, average=average)

    if average:
        method = "averaged_inverted_cdf"
    else:
        method = "inverted_cdf"

    assert approx(score) == np.percentile(y, percentile_rank, method=method)


@pytest.mark.parametrize("percentile_rank", [50, 100])
def test_weighted_percentile_plus_one_clip_max(percentile_rank):
    """Check `j+1` index is clipped to max, when `average=True`.

    `percentile_plus_one_indices` can exceed max index when `percentile_indices`
    is already at max index.
    Note that when `g` (Hyndman and Fan) / `fraction_above` is greater than 0,
    `j+1` (Hyndman and Fan) / `percentile_plus_one_indices` is calculated but
    never used, so it does not matter what this value is.
    When percentile of percentile rank 100 falls exactly on the last value in the
    `weighted_cdf`, `g=0` and `percentile_indices` is at max index. In this case
    we set `percentile_plus_one_indices` to be max index as well, so the result is
    the average of 2x the max index (i.e. last value of `weighted_cdf`).
    """
    # Note for both `percentile_rank`s 50 and 100, `percentile_indices` is already at
    # max index
    y = np.array([[0, 0], [1, 1]])
    sw = np.array([[0.1, 0.2], [2, 3]])
    score = _weighted_percentile(y, sw, percentile_rank, average=True)
    for idx in range(2):
        assert score[idx] == approx(1.0)


def test_weighted_percentile_equal():
    """Check `weighted_percentile` with unit weights and all 0 values in `array`."""
    y = np.zeros(102, dtype=np.float64)
    sw = np.ones(102, dtype=np.float64)
    score = _weighted_percentile(y, sw, 50)
    assert approx(score) == 0


# XXX: is this really what we want? Shouldn't we raise instead?
# https://github.com/scikit-learn/scikit-learn/issues/31032
def test_weighted_percentile_all_zero_weights():
    """Check `weighted_percentile` with all weights equal to 0 returns last index."""
    y = np.arange(10)
    sw = np.zeros(10)
    value = _weighted_percentile(y, sw, 50)
    assert approx(value) == 9.0


@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("percentile_rank, expected_value", [(0, 2), (50, 3), (100, 5)])
def test_weighted_percentile_ignores_zero_weight(
    average, percentile_rank, expected_value
):
    """Check leading, trailing and middle 0 weights behave correctly.

    Check that leading zero-weight observations are ignored when `percentile_rank=0`.
    See #20528 for details.
    Check that when `average=True` and the `j+1` ('plus one') index has sample weight
    of 0, it is ignored. Also check that trailing zero weight observations are ignored
    (e.g., when `percentile_rank=100`).
    """
    y = np.array([0, 1, 2, 3, 4, 5, 6])
    sw = np.array([0, 0, 1, 1, 0, 1, 0])

    value = _weighted_percentile(
        np.vstack((y, y)).T, np.vstack((sw, sw)).T, percentile_rank, average=average
    )
    for idx in range(2):
        assert approx(value[idx]) == expected_value


@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("percentile_rank", [20, 35, 50, 61])
def test_weighted_percentile_frequency_weight_semantics(
    global_random_seed, percentile_rank, average
):
    """Check integer weights give the same result as repeating values."""
    rng = np.random.RandomState(global_random_seed)
    x = rng.randint(20, size=10)
    weights = rng.choice(5, size=10)

    x_repeated = np.repeat(x, weights)
    percentile_weights = _weighted_percentile(
        x, weights, percentile_rank, average=average
    )
    percentile_repeated = _weighted_percentile(
        x_repeated, np.ones_like(x_repeated), percentile_rank, average=average
    )
    assert percentile_weights == approx(percentile_repeated)
    # Also check `percentile_rank=50` matches `median`
    if percentile_rank == 50 and average:
        assert percentile_weights == approx(np.median(x_repeated))
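

# Illustrative note: integer weights act as frequencies, e.g. x = [1, 2] with
# weights [2, 1] is expected to behave like the expanded sample [1, 1, 2],
# which is exactly what np.repeat(x, weights) constructs in the test above.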


@pytest.mark.parametrize("constant", [5, 8])
@pytest.mark.parametrize("average", [True, False])
@pytest.mark.parametrize("percentile_rank", [20, 35, 50, 61, [20, 35, 50, 61]])
def test_weighted_percentile_constant_multiplier(
    global_random_seed, percentile_rank, average, constant
):
    """Check multiplying weights by a constant does not change the result.

    Note scale invariance does not always hold when multiplying by a
    float due to cumulative sum numerical error (which grows proportional to n).
    """
    rng = np.random.RandomState(global_random_seed)
    x = rng.randint(20, size=20)
    weights = rng.choice(5, size=20)
    weights_multiplied = weights * constant

    percentile = _weighted_percentile(x, weights, percentile_rank, average=average)
    percentile_multiplier = _weighted_percentile(
        x, weights_multiplied, percentile_rank, average=average
    )
    assert percentile == approx(percentile_multiplier)


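# Illustrative note: multiplying every weight by the same positive constant
# rescales the weighted CDF without moving its crossing points, e.g. weights
# [1, 2, 1] and [5, 10, 5] should select the same percentile values.

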
@pytest.mark.parametrize("percentile_rank", [50, [20, 35, 50]])
|
||||
@pytest.mark.parametrize("average", [True, False])
|
||||
def test_weighted_percentile_2d(global_random_seed, percentile_rank, average):
|
||||
"""Check `_weighted_percentile` behaviour is correct when `array` is 2D."""
|
||||
# Check for when array 2D and sample_weight 1D
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
x1 = rng.randint(10, size=10)
|
||||
w1 = rng.choice(5, size=10)
|
||||
|
||||
x2 = rng.randint(20, size=10)
|
||||
x_2d = np.vstack((x1, x2)).T
|
||||
|
||||
wp = _weighted_percentile(
|
||||
x_2d, w1, percentile_rank=percentile_rank, average=average
|
||||
)
|
||||
|
||||
if isinstance(percentile_rank, list):
|
||||
p_list = []
|
||||
for pr in percentile_rank:
|
||||
p_list.append(
|
||||
[
|
||||
_weighted_percentile(
|
||||
x_2d[:, i], w1, percentile_rank=pr, average=average
|
||||
)
|
||||
for i in range(x_2d.shape[1])
|
||||
]
|
||||
)
|
||||
p_axis_0 = np.stack(p_list, axis=-1)
|
||||
assert wp.shape == (x_2d.shape[1], len(percentile_rank))
|
||||
else:
|
||||
# percentile_rank is scalar
|
||||
p_axis_0 = [
|
||||
_weighted_percentile(
|
||||
x_2d[:, i], w1, percentile_rank=percentile_rank, average=average
|
||||
)
|
||||
for i in range(x_2d.shape[1])
|
||||
]
|
||||
assert wp.shape == (x_2d.shape[1],)
|
||||
|
||||
assert_allclose(wp, p_axis_0)
|
||||
|
||||
# Check when array and sample_weight both 2D
|
||||
w2 = rng.choice(5, size=10)
|
||||
w_2d = np.vstack((w1, w2)).T
|
||||
|
||||
wp = _weighted_percentile(
|
||||
x_2d, w_2d, percentile_rank=percentile_rank, average=average
|
||||
)
|
||||
|
||||
if isinstance(percentile_rank, list):
|
||||
p_list = []
|
||||
for pr in percentile_rank:
|
||||
p_list.append(
|
||||
[
|
||||
_weighted_percentile(
|
||||
x_2d[:, i], w_2d[:, i], percentile_rank=pr, average=average
|
||||
)
|
||||
for i in range(x_2d.shape[1])
|
||||
]
|
||||
)
|
||||
p_axis_0 = np.stack(p_list, axis=-1)
|
||||
assert wp.shape == (x_2d.shape[1], len(percentile_rank))
|
||||
else:
|
||||
# percentile_rank is scalar
|
||||
p_axis_0 = [
|
||||
_weighted_percentile(
|
||||
x_2d[:, i], w_2d[:, i], percentile_rank=percentile_rank, average=average
|
||||
)
|
||||
for i in range(x_2d.shape[1])
|
||||
]
|
||||
assert wp.shape == (x_2d.shape[1],)
|
||||
|
||||
assert_allclose(wp, p_axis_0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"data, weights, percentile",
|
||||
[
|
||||
# NumPy scalars input (handled as 0D arrays on array API)
|
||||
(np.float32(42), np.int32(1), 50),
|
||||
# Random 1D array, constant weights
|
||||
(lambda rng: rng.rand(50), np.ones(50).astype(np.int32), 50),
|
||||
# Random 2D array and random 1D weights
|
||||
(lambda rng: rng.rand(50, 3), lambda rng: rng.rand(50).astype(np.float32), 75),
|
||||
# Random 2D array and random 2D weights
|
||||
(
|
||||
lambda rng: rng.rand(20, 3),
|
||||
lambda rng: rng.rand(20, 3).astype(np.float32),
|
||||
[25, 75],
|
||||
),
|
||||
# zero-weights and `rank_percentile=0` (#20528) (`sample_weight` dtype: int64)
|
||||
(np.array([0, 1, 2, 3, 4, 5]), np.array([0, 0, 1, 1, 1, 0]), 0),
|
||||
# np.nan's in data and some zero-weights (`sample_weight` dtype: int64)
|
||||
(np.array([np.nan, np.nan, 0, 3, 4, 5]), np.array([0, 1, 1, 1, 1, 0]), 0),
|
||||
# `sample_weight` dtype: int32
|
||||
(
|
||||
np.array([0, 1, 2, 3, 4, 5]),
|
||||
np.array([0, 1, 1, 1, 1, 0], dtype=np.int32),
|
||||
[25, 75],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_weighted_percentile_array_api_consistency(
|
||||
global_random_seed, array_namespace, device, dtype_name, data, weights, percentile
|
||||
):
|
||||
"""Check `_weighted_percentile` gives consistent results with array API."""
|
||||
xp = _array_api_for_tests(array_namespace, device)
|
||||
|
||||
# Skip test for percentile=0 edge case (#20528) on namespace/device where
|
||||
# xp.nextafter is broken. This is the case for torch with MPS device:
|
||||
# https://github.com/pytorch/pytorch/issues/150027
|
||||
zero = xp.zeros(1, device=device)
|
||||
one = xp.ones(1, device=device)
|
||||
if percentile == 0 and xp.all(xp.nextafter(zero, one) == zero):
|
||||
pytest.xfail(f"xp.nextafter is broken on {device}")
|
||||
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
X_np = data(rng) if callable(data) else data
|
||||
weights_np = weights(rng) if callable(weights) else weights
|
||||
# Ensure `data` of correct dtype
|
||||
X_np = X_np.astype(dtype_name)
|
||||
|
||||
result_np = _weighted_percentile(X_np, weights_np, percentile)
|
||||
# Convert to Array API arrays
|
||||
X_xp = xp.asarray(X_np, device=device)
|
||||
weights_xp = xp.asarray(weights_np, device=device)
|
||||
|
||||
with config_context(array_api_dispatch=True):
|
||||
result_xp = _weighted_percentile(X_xp, weights_xp, percentile)
|
||||
assert array_device(result_xp) == array_device(X_xp)
|
||||
assert get_namespace(result_xp)[0] == get_namespace(X_xp)[0]
|
||||
result_xp_np = _convert_to_numpy(result_xp, xp=xp)
|
||||
|
||||
assert result_xp_np.dtype == result_np.dtype
|
||||
assert result_xp_np.shape == result_np.shape
|
||||
assert_allclose(result_np, result_xp_np)
|
||||
|
||||
# Check dtype correct (`sample_weight` should follow `array`)
|
||||
if dtype_name == "float32":
|
||||
assert result_xp_np.dtype == result_np.dtype == np.float32
|
||||
else:
|
||||
assert result_xp_np.dtype == np.float64
|
||||
|
||||
|
||||
@pytest.mark.parametrize("average", [True, False])
|
||||
@pytest.mark.parametrize("sample_weight_ndim", [1, 2])
|
||||
def test_weighted_percentile_nan_filtered(
|
||||
global_random_seed, sample_weight_ndim, average
|
||||
):
|
||||
"""Test `_weighted_percentile` ignores NaNs.
|
||||
|
||||
Calling `_weighted_percentile` on an array with nan values returns the same
|
||||
results as calling `_weighted_percentile` on a filtered version of the data.
|
||||
We test both with sample_weight of the same shape as the data and with
|
||||
one-dimensional sample_weight.
|
||||
"""
|
||||
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
array_with_nans = rng.rand(100, 10)
|
||||
array_with_nans[rng.rand(*array_with_nans.shape) < 0.5] = np.nan
|
||||
nan_mask = np.isnan(array_with_nans)
|
||||
|
||||
if sample_weight_ndim == 2:
|
||||
sample_weight = rng.randint(1, 6, size=(100, 10))
|
||||
else:
|
||||
sample_weight = rng.randint(1, 6, size=(100,))
|
||||
|
||||
# Find the weighted percentile on the array with nans:
|
||||
results = _weighted_percentile(array_with_nans, sample_weight, 30, average=average)
|
||||
|
||||
# Find the weighted percentile on the filtered array:
|
||||
filtered_array = [
|
||||
array_with_nans[~nan_mask[:, col], col]
|
||||
for col in range(array_with_nans.shape[1])
|
||||
]
|
||||
if sample_weight.ndim == 1:
|
||||
sample_weight = np.repeat(sample_weight, array_with_nans.shape[1]).reshape(
|
||||
array_with_nans.shape[0], array_with_nans.shape[1]
|
||||
)
|
||||
filtered_weights = [
|
||||
sample_weight[~nan_mask[:, col], col] for col in range(array_with_nans.shape[1])
|
||||
]
|
||||
|
||||
expected_results = np.array(
|
||||
[
|
||||
_weighted_percentile(
|
||||
filtered_array[col], filtered_weights[col], 30, average=average
|
||||
)
|
||||
for col in range(array_with_nans.shape[1])
|
||||
]
|
||||
)
|
||||
|
||||
assert_array_equal(expected_results, results)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"percentile_rank, expected",
|
||||
[
|
||||
(90, [np.nan, 5]),
|
||||
([50, 90], [[np.nan, np.nan], [2.0, 5.0]]),
|
||||
],
|
||||
)
|
||||
def test_weighted_percentile_all_nan_column(percentile_rank, expected):
|
||||
"""Check that nans are ignored in general, except for all NaN columns."""
|
||||
|
||||
array = np.array(
|
||||
[
|
||||
[np.nan, 5],
|
||||
[np.nan, 1],
|
||||
[np.nan, np.nan],
|
||||
[np.nan, np.nan],
|
||||
[np.nan, 2],
|
||||
[np.nan, np.nan],
|
||||
]
|
||||
)
|
||||
weights = np.ones_like(array)
|
||||
values = _weighted_percentile(array, weights, percentile_rank)
|
||||
|
||||
# The percentile of the second column should be `5` even though there are many nan
|
||||
# values present; the percentile of the first column can only be nan, since there
|
||||
# are no other possible values:
|
||||
assert np.array_equal(values, expected, equal_nan=True)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
np_version < parse_version("2.0"),
|
||||
reason="np.quantile only accepts weights since version 2.0",
|
||||
)
|
||||
@pytest.mark.parametrize("percentile", [66, 10, 50])
|
||||
@pytest.mark.parametrize("average", [False, True])
|
||||
@pytest.mark.parametrize("uniform_weight", [False, True])
|
||||
def test_weighted_percentile_like_numpy_quantile(
|
||||
percentile, average, uniform_weight, global_random_seed
|
||||
):
|
||||
"""Check `_weighted_percentile` is equivalent to `np.quantile` with weights."""
|
||||
# TODO: remove the following skip once no longer applicable.
|
||||
if average and not uniform_weight:
|
||||
pytest.skip(
|
||||
"np.quantile does not support weights with method='averaged_inverted_cdf'"
|
||||
)
|
||||
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
array = rng.rand(10, 100)
|
||||
if uniform_weight:
|
||||
sample_weight = np.ones_like(array) * rng.randint(1, 6, size=1)
|
||||
else:
|
||||
sample_weight = rng.randint(1, 6, size=(10, 100))
|
||||
|
||||
percentile_weighted_percentile = _weighted_percentile(
|
||||
array, sample_weight, percentile, average=average
|
||||
)
|
||||
percentile_numpy_quantile = np.quantile(
|
||||
array,
|
||||
percentile / 100,
|
||||
weights=sample_weight if not uniform_weight else None,
|
||||
method="averaged_inverted_cdf" if average else "inverted_cdf",
|
||||
axis=0,
|
||||
)
|
||||
|
||||
assert_array_equal(percentile_weighted_percentile, percentile_numpy_quantile)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
np_version < parse_version("2.0"),
|
||||
reason="np.nanquantile only accepts weights since version 2.0",
|
||||
)
|
||||
@pytest.mark.parametrize("percentile", [66, 10, 50])
|
||||
@pytest.mark.parametrize("average", [False, True])
|
||||
@pytest.mark.parametrize("uniform_weight", [False, True])
|
||||
def test_weighted_percentile_like_numpy_nanquantile(
|
||||
percentile, average, uniform_weight, global_random_seed
|
||||
):
|
||||
"""Check `_weighted_percentile` equivalent to `np.nanquantile` with weights."""
|
||||
# TODO: remove the following skip once no longer applicable.
|
||||
if average and not uniform_weight:
|
||||
pytest.skip(
|
||||
"np.nanquantile does not support weights with "
|
||||
"method='averaged_inverted_cdf'"
|
||||
)
|
||||
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
array_with_nans = rng.rand(10, 100)
|
||||
array_with_nans[rng.rand(*array_with_nans.shape) < 0.5] = np.nan
|
||||
if uniform_weight:
|
||||
sample_weight = np.ones_like(array_with_nans) * rng.randint(
|
||||
1,
|
||||
6,
|
||||
size=1,
|
||||
)
|
||||
else:
|
||||
sample_weight = rng.randint(1, 6, size=(10, 100))
|
||||
|
||||
percentile_weighted_percentile = _weighted_percentile(
|
||||
array_with_nans, sample_weight, percentile, average=average
|
||||
)
|
||||
percentile_numpy_nanquantile = np.nanquantile(
|
||||
array_with_nans,
|
||||
percentile / 100,
|
||||
weights=sample_weight if not uniform_weight else None,
|
||||
method="averaged_inverted_cdf" if average else "inverted_cdf",
|
||||
axis=0,
|
||||
)
|
||||
|
||||
assert_array_equal(percentile_weighted_percentile, percentile_numpy_nanquantile)
|
||||
@@ -0,0 +1,142 @@
from dataclasses import dataclass, fields

import numpy as np
import pytest

from sklearn.base import (
    BaseEstimator,
    ClassifierMixin,
    RegressorMixin,
    TransformerMixin,
)
from sklearn.pipeline import Pipeline
from sklearn.utils import (
    Tags,
    get_tags,
)
from sklearn.utils.estimator_checks import (
    check_estimator_tags_renamed,
    check_valid_tag_types,
)


class EmptyClassifier(ClassifierMixin, BaseEstimator):
    pass


class EmptyTransformer(TransformerMixin, BaseEstimator):
    pass


class EmptyRegressor(RegressorMixin, BaseEstimator):
    pass


@pytest.mark.parametrize(
    "estimator, value",
    [
        [EmptyClassifier(), True],
        [EmptyTransformer(), False],
        [EmptyRegressor(), True],
        [BaseEstimator(), False],
    ],
)
def test_requires_y(estimator, value):
    assert get_tags(estimator).target_tags.required == value


def test_no___sklearn_tags__with_more_tags():
    """Test that calling `get_tags` on a class that defines `_more_tags` but not
    `__sklearn_tags__` raises an error.
    """

    class MoreTagsEstimator(BaseEstimator):
        def _more_tags(self):
            return {"requires_y": True}  # pragma: no cover

    with pytest.raises(
        TypeError, match="has defined either `_more_tags` or `_get_tags`"
    ):
        check_estimator_tags_renamed("MoreTagsEstimator", MoreTagsEstimator())


def test_tag_test_passes_with_inheritance():
    @dataclass
    class MyTags(Tags):
        my_tag: bool = True  # type: ignore[annotation-unchecked]

    class MyEstimator(BaseEstimator):
        def __sklearn_tags__(self):
            tags_orig = super().__sklearn_tags__()
            as_dict = {
                field.name: getattr(tags_orig, field.name)
                for field in fields(tags_orig)
            }
            tags = MyTags(**as_dict)
            tags.my_tag = True
            return tags

    check_valid_tag_types("MyEstimator", MyEstimator())


def test_tags_no_sklearn_tags_concrete_implementation():
    """Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/30479

    Either the estimator doesn't implement `__sklearn_tags__` or there is no class
    implementing `__sklearn_tags__` without calling `super().__sklearn_tags__()` in
    its mro. Thus, we raise an error and request to inherit from
    `BaseEstimator` that implements `__sklearn_tags__`.
    """

    X = np.array([[1, 2], [2, 3], [3, 4]])
    y = np.array([1, 0, 1])

    # 1st case, the estimator inherits from a class that only implements
    # `__sklearn_tags__` by calling `super().__sklearn_tags__()`.
    class MyEstimator(ClassifierMixin):
        def __init__(self, *, param=1):
            self.param = param

        def fit(self, X, y=None):
            self.is_fitted_ = True
            return self

        def predict(self, X):
            return np.full(shape=X.shape[0], fill_value=self.param)

    my_pipeline = Pipeline([("estimator", MyEstimator(param=1))])
    with pytest.raises(AttributeError, match="The following error was raised"):
        my_pipeline.fit(X, y).predict(X)

    # 2nd case, the estimator doesn't implement `__sklearn_tags__` at all.
    class MyEstimator2:
        def __init__(self, *, param=1):
            self.param = param

        def fit(self, X, y=None):
            self.is_fitted_ = True
            return self

        def predict(self, X):
            return np.full(shape=X.shape[0], fill_value=self.param)

    my_pipeline = Pipeline([("estimator", MyEstimator2(param=1))])
    with pytest.raises(AttributeError, match="The following error was raised"):
        my_pipeline.fit(X, y).predict(X)

    # check that we still raise an error if it is not an AttributeError or related to
    # __sklearn_tags__
    class MyEstimator3(MyEstimator, BaseEstimator):
        def __init__(self, *, param=1, error_type=AttributeError):
            self.param = param
            self.error_type = error_type

        def __sklearn_tags__(self):
            super().__sklearn_tags__()
            raise self.error_type("test")

    for error_type in (AttributeError, TypeError, ValueError):
        estimator = MyEstimator3(param=1, error_type=error_type)
        with pytest.raises(error_type):
            get_tags(estimator)
File diff suppressed because it is too large
@@ -0,0 +1,25 @@
import numpy as np
import pytest

from sklearn.utils._typedefs import testing_make_array_from_typed_val


@pytest.mark.parametrize(
    "type_t, value, expected_dtype",
    [
        ("float64_t", 1.0, np.float64),
        ("float32_t", 1.0, np.float32),
        ("intp_t", 1, np.intp),
        ("int8_t", 1, np.int8),
        ("int32_t", 1, np.int32),
        ("int64_t", 1, np.int64),
        ("uint8_t", 1, np.uint8),
        ("uint32_t", 1, np.uint32),
        ("uint64_t", 1, np.uint64),
    ],
)
def test_types(type_t, value, expected_dtype):
    """Check that the types defined in _typedefs correspond to the expected
    numpy dtypes.
    """
    assert testing_make_array_from_typed_val[type_t](value).dtype == expected_dtype
@@ -0,0 +1,54 @@
import numpy as np
from numpy.testing import assert_array_equal

from sklearn.utils._unique import attach_unique, cached_unique
from sklearn.utils.validation import check_array


def test_attach_unique_attaches_unique_to_array():
    arr = np.array([1, 2, 2, 3, 4, 4, 5])
    arr_ = attach_unique(arr)
    assert_array_equal(arr_.dtype.metadata["unique"], np.array([1, 2, 3, 4, 5]))
    assert_array_equal(arr_, arr)


def test_cached_unique_returns_cached_unique():
    my_dtype = np.dtype(np.float64, metadata={"unique": np.array([1, 2])})
    arr = np.array([1, 2, 2, 3, 4, 4, 5], dtype=my_dtype)
    assert_array_equal(cached_unique(arr), np.array([1, 2]))


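# Illustrative note (based on the two tests above): `attach_unique` stores the
# precomputed unique values in the dtype metadata of a view, and `cached_unique`
# reads them back instead of recomputing np.unique, e.g.
#   arr_ = attach_unique(np.array([1, 2, 2]))
#   arr_.dtype.metadata["unique"]  # expected to be array([1, 2])

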
def test_attach_unique_not_ndarray():
    """Test that when not np.ndarray, we don't touch the array."""
    arr = [1, 2, 2, 3, 4, 4, 5]
    arr_ = attach_unique(arr)
    assert arr_ is arr


def test_attach_unique_returns_view():
    """Test that attach_unique returns a view of the array."""
    arr = np.array([1, 2, 2, 3, 4, 4, 5])
    arr_ = attach_unique(arr)
    assert arr_.base is arr


def test_attach_unique_return_tuple():
    """Test return_tuple argument of the function."""
    arr = np.array([1, 2, 2, 3, 4, 4, 5])
    arr_tuple = attach_unique(arr, return_tuple=True)
    assert isinstance(arr_tuple, tuple)
    assert len(arr_tuple) == 1
    assert_array_equal(arr_tuple[0], arr)

    arr_single = attach_unique(arr, return_tuple=False)
    assert isinstance(arr_single, np.ndarray)
    assert_array_equal(arr_single, arr)


def test_check_array_keeps_unique():
    """Test that check_array keeps the unique metadata."""
    arr = np.array([[1, 2, 2, 3, 4, 4, 5]])
    arr_ = attach_unique(arr)
    arr_ = check_array(arr_)
    assert_array_equal(arr_.dtype.metadata["unique"], np.array([1, 2, 3, 4, 5]))
    assert_array_equal(arr_, arr)
@@ -0,0 +1,65 @@
import string
import timeit

import pytest

from sklearn.utils._user_interface import _message_with_time, _print_elapsed_time


@pytest.mark.parametrize(
    ["source", "message", "is_long"],
    [
        ("ABC", string.ascii_lowercase, False),
        ("ABCDEF", string.ascii_lowercase, False),
        ("ABC", string.ascii_lowercase * 3, True),
        ("ABC" * 10, string.ascii_lowercase, True),
        ("ABC", string.ascii_lowercase + "\u1048", False),
    ],
)
@pytest.mark.parametrize(
    ["time", "time_str"],
    [
        (0.2, " 0.2s"),
        (20, " 20.0s"),
        (2000, "33.3min"),
        (20000, "333.3min"),
    ],
)
def test_message_with_time(source, message, is_long, time, time_str):
    out = _message_with_time(source, message, time)
    if is_long:
        assert len(out) > 70
    else:
        assert len(out) == 70

    assert out.startswith("[" + source + "] ")
    out = out[len(source) + 3 :]

    assert out.endswith(time_str)
    out = out[: -len(time_str)]
    assert out.endswith(", total=")
    out = out[: -len(", total=")]
    assert out.endswith(message)
    out = out[: -len(message)]
    assert out.endswith(" ")
    out = out[:-1]

    if is_long:
        assert not out
    else:
        assert list(set(out)) == ["."]


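# As implied by the assertions above, the message has a fixed-width layout of
# roughly "[<source>] <message><padding dots>, total=<time_str>", padded with
# '.' to 70 characters when source and message are short (is_long=False).

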
@pytest.mark.parametrize(
    ["message", "expected"],
    [
        ("hello", _message_with_time("ABC", "hello", 0.1) + "\n"),
        ("", _message_with_time("ABC", "", 0.1) + "\n"),
        (None, ""),
    ],
)
def test_print_elapsed_time(message, expected, capsys, monkeypatch):
    monkeypatch.setattr(timeit, "default_timer", lambda: 0)
    with _print_elapsed_time("ABC", message):
        monkeypatch.setattr(timeit, "default_timer", lambda: 0.1)
    assert capsys.readouterr().out == expected
File diff suppressed because it is too large
@@ -0,0 +1,25 @@
import numpy as np
import pytest

from sklearn.utils._weight_vector import (
    WeightVector32,
    WeightVector64,
)


@pytest.mark.parametrize(
    "dtype, WeightVector",
    [
        (np.float32, WeightVector32),
        (np.float64, WeightVector64),
    ],
)
def test_type_invariance(dtype, WeightVector):
    """Check the `dtype` consistency of `WeightVector`."""
    weights = np.random.rand(100).astype(dtype)
    average_weights = np.random.rand(100).astype(dtype)

    weight_vector = WeightVector(weights, average_weights)

    assert np.asarray(weight_vector.w).dtype is np.dtype(dtype)
    assert np.asarray(weight_vector.aw).dtype is np.dtype(dtype)