Videre

2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions
--- a/linedance-app/venv/lib/python3.12/site-packages/sklearn/utils/class_weight.py
+++ b/linedance-app/venv/lib/python3.12/site-packages/sklearn/utils/class_weight.py
@@ -0,0 +1,231 @@
+"""Utilities for handling weights based on class labels."""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import numpy as np
+from scipy import sparse
+
+from sklearn.utils._param_validation import StrOptions, validate_params
+from sklearn.utils.validation import _check_sample_weight
+
+
+@validate_params(
+    {
+        "class_weight": [dict, StrOptions({"balanced"}), None],
+        "classes": [np.ndarray],
+        "y": ["array-like"],
+        "sample_weight": ["array-like", None],
+    },
+    prefer_skip_nested_validation=True,
+)
+def compute_class_weight(class_weight, *, classes, y, sample_weight=None):
+    """Estimate class weights for unbalanced datasets.
+
+    Parameters
+    ----------
+    class_weight : dict, "balanced" or None
+        If "balanced", class weights will be given by
+        `n_samples / (n_classes * np.bincount(y))` or their weighted equivalent if
+        `sample_weight` is provided.
+        If a dictionary is given, keys are classes and values are corresponding class
+        weights.
+        If `None` is given, the class weights will be uniform.
+
+    classes : ndarray
+        Array of the classes occurring in the data, as given by
+        `np.unique(y_org)` with `y_org` the original class labels.
+
+    y : array-like of shape (n_samples,)
+        Array of original class labels per sample.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Array of weights that are assigned to individual samples. Only used when
+        `class_weight='balanced'`.
+
+    Returns
+    -------
+    class_weight_vect : ndarray of shape (n_classes,)
+        Array with `class_weight_vect[i]` the weight for i-th class.
+
+    References
+    ----------
+    The "balanced" heuristic is inspired by
+    Logistic Regression in Rare Events Data, King, Zen, 2001.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.utils.class_weight import compute_class_weight
+    >>> y = [1, 1, 1, 1, 0, 0]
+    >>> compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
+    array([1.5 , 0.75])
+    """
+    # Import error caused by circular imports.
+    from sklearn.preprocessing import LabelEncoder
+
+    if set(y) - set(classes):
+        raise ValueError("classes should include all valid labels that can be in y")
+    if class_weight is None or len(class_weight) == 0:
+        # uniform class weights
+        weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
+    elif class_weight == "balanced":
+        # Find the weight of each class as present in y.
+        le = LabelEncoder()
+        y_ind = le.fit_transform(y)
+        if not all(np.isin(classes, le.classes_)):
+            raise ValueError("classes should have valid labels that are in y")
+
+        sample_weight = _check_sample_weight(sample_weight, y)
+        weighted_class_counts = np.bincount(y_ind, weights=sample_weight)
+        recip_freq = weighted_class_counts.sum() / (
+            len(le.classes_) * weighted_class_counts
+        )
+        weight = recip_freq[le.transform(classes)]
+    else:
+        # user-defined dictionary
+        weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
+        unweighted_classes = []
+        for i, c in enumerate(classes):
+            if c in class_weight:
+                weight[i] = class_weight[c]
+            else:
+                unweighted_classes.append(c)
+
+        n_weighted_classes = len(classes) - len(unweighted_classes)
+        if unweighted_classes and n_weighted_classes != len(class_weight):
+            unweighted_classes_user_friendly_str = np.array(unweighted_classes).tolist()
+            raise ValueError(
+                f"The classes, {unweighted_classes_user_friendly_str}, are not in"
+                " class_weight"
+            )
+
+    return weight
+
+
+@validate_params(
+    {
+        "class_weight": [dict, list, StrOptions({"balanced"}), None],
+        "y": ["array-like", "sparse matrix"],
+        "indices": ["array-like", None],
+    },
+    prefer_skip_nested_validation=True,
+)
+def compute_sample_weight(class_weight, y, *, indices=None):
+    """Estimate sample weights by class for unbalanced datasets.
+
+    Parameters
+    ----------
+    class_weight : dict, list of dicts, "balanced", or None
+        Weights associated with classes in the form `{class_label: weight}`.
+        If not given, all classes are supposed to have weight one. For
+        multi-output problems, a list of dicts can be provided in the same
+        order as the columns of y.
+
+        Note that for multioutput (including multilabel) weights should be
+        defined for each class of every column in its own dict. For example,
+        for four-class multilabel classification weights should be
+        `[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}]` instead of
+        `[{1:1}, {2:5}, {3:1}, {4:1}]`.
+
+        The `"balanced"` mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data:
+        `n_samples / (n_classes * np.bincount(y))`.
+
+        For multi-output, the weights of each column of y will be multiplied.
+
+    y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)
+        Array of original class labels per sample.
+
+    indices : array-like of shape (n_subsample,), default=None
+        Array of indices to be used in a subsample. Can be of length less than
+        `n_samples` in the case of a subsample, or equal to `n_samples` in the
+        case of a bootstrap subsample with repeated indices. If `None`, the
+        sample weight will be calculated over the full sample. Only `"balanced"`
+        is supported for `class_weight` if this is provided.
+
+    Returns
+    -------
+    sample_weight_vect : ndarray of shape (n_samples,)
+        Array with sample weights as applied to the original `y`.
+
+    Examples
+    --------
+    >>> from sklearn.utils.class_weight import compute_sample_weight
+    >>> y = [1, 1, 1, 1, 0, 0]
+    >>> compute_sample_weight(class_weight="balanced", y=y)
+    array([0.75, 0.75, 0.75, 0.75, 1.5 , 1.5 ])
+    """
+
+    # Ensure y is 2D. Sparse matrices are already 2D.
+    if not sparse.issparse(y):
+        y = np.atleast_1d(y)
+        if y.ndim == 1:
+            y = np.reshape(y, (-1, 1))
+    n_outputs = y.shape[1]
+
+    if indices is not None and class_weight != "balanced":
+        raise ValueError(
+            "The only valid class_weight for subsampling is 'balanced'. "
+            f"Given {class_weight}."
+        )
+    elif n_outputs > 1:
+        if class_weight is None or isinstance(class_weight, dict):
+            raise ValueError(
+                "For multi-output, class_weight should be a list of dicts, or the "
+                "string 'balanced'."
+            )
+        elif isinstance(class_weight, list) and len(class_weight) != n_outputs:
+            raise ValueError(
+                "For multi-output, number of elements in class_weight should match "
+                f"number of outputs. Got {len(class_weight)} element(s) while having "
+                f"{n_outputs} outputs."
+            )
+
+    expanded_class_weight = []
+    for k in range(n_outputs):
+        if sparse.issparse(y):
+            # Ok to densify a single column at a time
+            y_full = y[:, [k]].toarray().flatten()
+        else:
+            y_full = y[:, k]
+        classes_full = np.unique(y_full)
+        classes_missing = None
+
+        if class_weight == "balanced" or n_outputs == 1:
+            class_weight_k = class_weight
+        else:
+            class_weight_k = class_weight[k]
+
+        if indices is not None:
+            # Get class weights for the subsample, covering all classes in
+            # case some labels that were present in the original data are
+            # missing from the sample.
+            y_subsample = y_full[indices]
+            classes_subsample = np.unique(y_subsample)
+
+            weight_k = np.take(
+                compute_class_weight(
+                    class_weight_k, classes=classes_subsample, y=y_subsample
+                ),
+                np.searchsorted(classes_subsample, classes_full),
+                mode="clip",
+            )
+
+            classes_missing = set(classes_full) - set(classes_subsample)
+        else:
+            weight_k = compute_class_weight(
+                class_weight_k, classes=classes_full, y=y_full
+            )
+
+        weight_k = weight_k[np.searchsorted(classes_full, y_full)]
+
+        if classes_missing:
+            # Make missing classes' weight zero
+            weight_k[np.isin(y_full, list(classes_missing))] = 0.0
+
+        expanded_class_weight.append(weight_k)
+
+    expanded_class_weight = np.prod(expanded_class_weight, axis=0, dtype=np.float64)
+
+    return expanded_class_weight