Videre

2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions
--- a/linedance-app/venv/lib/python3.12/site-packages/sklearn/datasets/_lfw.py
+++ b/linedance-app/venv/lib/python3.12/site-packages/sklearn/datasets/_lfw.py
@@ -0,0 +1,654 @@
+"""Labeled Faces in the Wild (LFW) dataset
+
+This dataset is a collection of JPEG pictures of famous people collected
+over the internet, all details are available on the official website:
+
+    http://vis-www.cs.umass.edu/lfw/
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import logging
+from numbers import Integral, Real
+from os import PathLike, listdir, makedirs, remove
+from os.path import exists, isdir, join
+
+import numpy as np
+from joblib import Memory
+
+from sklearn.datasets._base import (
+    RemoteFileMetadata,
+    _fetch_remote,
+    get_data_home,
+    load_descr,
+)
+from sklearn.utils import Bunch
+from sklearn.utils._param_validation import (
+    Hidden,
+    Interval,
+    StrOptions,
+    validate_params,
+)
+from sklearn.utils.fixes import tarfile_extractall
+
+logger = logging.getLogger(__name__)
+
+# Download descriptor for the non-funneled image archive.
+# NOTE(review): the 64-hex-digit `checksum` is presumably the SHA-256 digest
+# checked by `_fetch_remote` -- confirm against sklearn.datasets._base.
+# The original data can be found in:
+# http://vis-www.cs.umass.edu/lfw/lfw.tgz
+ARCHIVE = RemoteFileMetadata(
+    filename="lfw.tgz",
+    url="https://ndownloader.figshare.com/files/5976018",
+    checksum="055f7d9c632d7370e6fb4afc7468d40f970c34a80d4c6f50ffec63f5a8d536c0",
+)
+
+# Download descriptor for the funneled image archive.
+# The original funneled data can be found in:
+# http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz
+FUNNELED_ARCHIVE = RemoteFileMetadata(
+    filename="lfw-funneled.tgz",
+    url="https://ndownloader.figshare.com/files/5976015",
+    checksum="b47c8422c8cded889dc5a13418c4bc2abbda121092b3533a83306f90d900100a",
+)
+
+# Download descriptors for the three pair-index text files. All of them are
+# fetched by `_check_fetch_lfw`, whichever subset is requested later.
+# The original target data can be found in:
+# http://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt
+# http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt
+# http://vis-www.cs.umass.edu/lfw/pairs.txt
+TARGETS = (
+    RemoteFileMetadata(
+        filename="pairsDevTrain.txt",
+        url="https://ndownloader.figshare.com/files/5976012",
+        checksum="1d454dada7dfeca0e7eab6f65dc4e97a6312d44cf142207be28d688be92aabfa",
+    ),
+    RemoteFileMetadata(
+        filename="pairsDevTest.txt",
+        url="https://ndownloader.figshare.com/files/5976009",
+        checksum="7cb06600ea8b2814ac26e946201cdb304296262aad67d046a16a7ec85d0ff87c",
+    ),
+    RemoteFileMetadata(
+        filename="pairs.txt",
+        url="https://ndownloader.figshare.com/files/5976006",
+        checksum="ea42330c62c92989f9d7c03237ed5d591365e89b3e649747777b70e692dc1592",
+    ),
+)
+
+
+#
+# Common private utilities for data fetching from the original LFW website,
+# local disk caching, and image decoding.
+#
+
+
+def _check_fetch_lfw(
+    data_home=None, funneled=True, download_if_missing=True, n_retries=3, delay=1.0
+):
+    """Make sure the LFW metadata files and image folder are available locally.
+
+    The pair-index files listed in ``TARGETS`` and the image folder are looked
+    up under ``<data_home>/lfw_home``. Anything missing is downloaded with
+    ``_fetch_remote`` when `download_if_missing` is True. A freshly obtained
+    archive is extracted into ``lfw_home`` and the archive file is deleted
+    afterwards.
+
+    Parameters
+    ----------
+    data_home : str, path-like or None, default=None
+        Forwarded to ``get_data_home`` to resolve the cache root.
+
+    funneled : bool, default=True
+        If True, use ``FUNNELED_ARCHIVE`` and the ``lfw_funneled`` folder,
+        otherwise ``ARCHIVE`` and the ``lfw`` folder.
+
+    download_if_missing : bool, default=True
+        If False, raise instead of downloading a missing file.
+
+    n_retries : int, default=3
+        Forwarded to ``_fetch_remote``.
+
+    delay : float, default=1.0
+        Forwarded to ``_fetch_remote``.
+
+    Returns
+    -------
+    lfw_home : str
+        Folder holding the metadata files and the image folder.
+
+    data_folder_path : str
+        Path at which the extracted images are expected.
+
+    Raises
+    ------
+    OSError
+        If a required file is missing and `download_if_missing` is False.
+    """
+
+    data_home = get_data_home(data_home=data_home)
+    lfw_home = join(data_home, "lfw_home")
+
+    # `exist_ok=True` instead of an `exists()` check followed by `makedirs()`:
+    # with the two-step form, concurrent processes fetching the dataset could
+    # both see the folder as missing and one of them would then crash.
+    makedirs(lfw_home, exist_ok=True)
+
+    for target in TARGETS:
+        target_filepath = join(lfw_home, target.filename)
+        if not exists(target_filepath):
+            if download_if_missing:
+                logger.info("Downloading LFW metadata: %s", target.url)
+                _fetch_remote(
+                    target, dirname=lfw_home, n_retries=n_retries, delay=delay
+                )
+            else:
+                raise OSError("%s is missing" % target_filepath)
+
+    if funneled:
+        data_folder_path = join(lfw_home, "lfw_funneled")
+        archive = FUNNELED_ARCHIVE
+    else:
+        data_folder_path = join(lfw_home, "lfw")
+        archive = ARCHIVE
+
+    # An existing image folder is taken as proof that a previous extraction
+    # happened; the archive is only needed when that folder is absent.
+    if not exists(data_folder_path):
+        archive_path = join(lfw_home, archive.filename)
+        if not exists(archive_path):
+            if download_if_missing:
+                logger.info("Downloading LFW data (~200MB): %s", archive.url)
+                _fetch_remote(
+                    archive, dirname=lfw_home, n_retries=n_retries, delay=delay
+                )
+            else:
+                raise OSError("%s is missing" % archive_path)
+
+        # imported lazily: only needed when an archive has to be unpacked
+        import tarfile
+
+        logger.debug("Decompressing the data archive to %s", data_folder_path)
+        with tarfile.open(archive_path, "r:gz") as fp:
+            tarfile_extractall(fp, path=lfw_home)
+
+        # the compressed archive is no longer needed once extracted
+        remove(archive_path)
+
+    return lfw_home, data_folder_path
+
+
+def _load_imgs(file_paths, slice_, color, resize):
+    """Decode, crop and optionally rescale a sequence of image files.
+
+    Parameters
+    ----------
+    file_paths : sequence of str
+        Paths of the pictures to decode, in output order.
+
+    slice_ : tuple of slice or None
+        ``(height_slice, width_slice)`` window to keep. ``None``, or a
+        ``None`` entry inside the tuple, stands for ``slice(0, 250)``.
+
+    color : bool
+        If truthy keep the three channels, otherwise average them.
+
+    resize : float or None
+        Scaling ratio applied to the cropped window; ``None`` disables it.
+
+    Returns
+    -------
+    faces : ndarray of dtype float32
+        Shape ``(n_faces, h, w)``, or ``(n_faces, h, w, 3)`` when `color` is
+        truthy. Pixel values are divided by 255.
+
+    Raises
+    ------
+    ImportError
+        If PIL cannot be imported.
+    RuntimeError
+        If a decoded picture comes back as a 0-dimensional array.
+    """
+    try:
+        from PIL import Image
+    except ImportError:
+        raise ImportError(
+            "The Python Imaging Library (PIL) is required to load data "
+            "from jpeg files. Please refer to "
+            "https://pillow.readthedocs.io/en/stable/installation.html "
+            "for installing PIL."
+        )
+
+    # resolve the crop window, falling back to the full 250 x 250 frame for
+    # anything the caller left unspecified
+    full_frame = (slice(0, 250), slice(0, 250))
+    if slice_ is None:
+        rows, cols = full_frame
+    else:
+        rows, cols = tuple(
+            given or fallback for given, fallback in zip(slice_, full_frame)
+        )
+
+    # NOTE(review): the slice step only shrinks the computed output size; the
+    # crop below uses start/stop alone, so a step other than 1 looks usable
+    # only together with `resize` -- confirm.
+    out_h = (rows.stop - rows.start) // (rows.step or 1)
+    out_w = (cols.stop - cols.start) // (cols.step or 1)
+
+    if resize is not None:
+        resize = float(resize)
+        out_h = int(resize * out_h)
+        out_w = int(resize * out_w)
+
+    # one contiguous buffer receives every decoded picture
+    n_faces = len(file_paths)
+    buffer_shape = (n_faces, out_h, out_w, 3) if color else (n_faces, out_h, out_w)
+    faces = np.zeros(buffer_shape, dtype=np.float32)
+
+    # PIL boxes are (left, upper, right, lower)
+    crop_box = (cols.start, rows.start, cols.stop, rows.stop)
+
+    for position, path in enumerate(file_paths):
+        if position % 1000 == 0:
+            logger.debug("Loading face #%05d / %05d", position + 1, n_faces)
+
+        with Image.open(path) as picture:
+            picture = picture.crop(crop_box)
+            if resize is not None:
+                picture = picture.resize((out_w, out_h))
+            pixels = np.asarray(picture, dtype=np.float32)
+
+        # A 0-d result means the jpeg could not be decoded. Refer to issue
+        # #3594 for more details.
+        if pixels.ndim == 0:
+            raise RuntimeError(
+                "Failed to read the image file %s, "
+                "Please make sure that libjpeg is installed" % path
+            )
+
+        pixels /= 255.0  # uint8 coded colors -> floats in [0.0, 1.0]
+
+        # gray levels are the plain average of the color channels
+        faces[position, ...] = pixels if color else pixels.mean(axis=2)
+
+    return faces
+
+
+#
+# Task #1:  Face Identification on pictures with names
+#
+
+
+def _fetch_lfw_people(
+    data_folder_path, slice_=None, color=False, resize=None, min_faces_per_person=0
+):
+    """Load every retained face picture together with its person label.
+
+    Each sub-folder of `data_folder_path` is treated as one person; its name,
+    with underscores turned into spaces, becomes the label. Persons with fewer
+    than `min_faces_per_person` pictures are dropped.
+
+    This operation is meant to be cached by a joblib wrapper.
+
+    Returns
+    -------
+    faces : ndarray
+        Pictures decoded by ``_load_imgs``, in shuffled order.
+    target : ndarray
+        Index into `target_names` for each picture, shuffled like `faces`.
+    target_names : ndarray
+        Sorted unique person names.
+
+    Raises
+    ------
+    ValueError
+        If no picture is retained.
+    """
+    # walk the person folders in sorted order so the result is reproducible
+    person_names = []
+    file_paths = []
+    for entry in sorted(listdir(data_folder_path)):
+        person_dir = join(data_folder_path, entry)
+        if isdir(person_dir):
+            pictures = [
+                join(person_dir, fname) for fname in sorted(listdir(person_dir))
+            ]
+            if len(pictures) >= min_faces_per_person:
+                display_name = entry.replace("_", " ")
+                person_names += [display_name] * len(pictures)
+                file_paths += pictures
+
+    n_faces = len(file_paths)
+    if not n_faces:
+        raise ValueError(
+            "min_faces_per_person=%d is too restrictive" % min_faces_per_person
+        )
+
+    # integer labels are positions in the sorted array of unique names
+    target_names = np.unique(person_names)
+    target = np.searchsorted(target_names, person_names)
+
+    faces = _load_imgs(file_paths, slice_, color, resize)
+
+    # Apply a fixed-seed permutation: leaving all pictures of one person in a
+    # row would break some cross validation and learning algorithms such as
+    # SGD and online k-means that make an IID assumption.
+    order = np.arange(n_faces)
+    np.random.RandomState(42).shuffle(order)
+    return faces[order], target[order], target_names
+
+
+@validate_params(
+    {
+        "data_home": [str, PathLike, None],
+        "funneled": ["boolean"],
+        "resize": [Interval(Real, 0, None, closed="neither"), None],
+        "min_faces_per_person": [Interval(Integral, 0, None, closed="left"), None],
+        "color": ["boolean"],
+        "slice_": [tuple, Hidden(None)],
+        "download_if_missing": ["boolean"],
+        "return_X_y": ["boolean"],
+        "n_retries": [Interval(Integral, 1, None, closed="left")],
+        "delay": [Interval(Real, 0.0, None, closed="neither")],
+    },
+    prefer_skip_nested_validation=True,
+)
+def fetch_lfw_people(
+    *,
+    data_home=None,
+    funneled=True,
+    resize=0.5,
+    min_faces_per_person=0,
+    color=False,
+    slice_=(slice(70, 195), slice(78, 172)),
+    download_if_missing=True,
+    return_X_y=False,
+    n_retries=3,
+    delay=1.0,
+):
+    """Load the Labeled Faces in the Wild (LFW) people dataset \
+(classification).
+
+    Download it if necessary.
+
+    =================   =======================
+    Classes                                5749
+    Samples total                         13233
+    Dimensionality                         2914
+    Features              real, between 0 and 1
+    =================   =======================
+
+    For a usage example of this dataset, see
+    :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py`.
+
+    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.
+
+    Parameters
+    ----------
+    data_home : str or path-like, default=None
+        Specify another download and cache folder for the datasets. By default
+        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
+
+    funneled : bool, default=True
+        Download and use the funneled variant of the dataset.
+
+    resize : float or None, default=0.5
+        Ratio used to resize each face picture. If `None`, no resizing is
+        performed.
+
+    min_faces_per_person : int or None, default=0
+        The extracted dataset will only retain pictures of people that have at
+        least `min_faces_per_person` different pictures. `None` is treated
+        as 0, i.e. nobody is filtered out.
+
+    color : bool, default=False
+        Keep the 3 RGB channels instead of averaging them to a single
+        gray level channel. If color is True the shape of the data has
+        one more dimension than the shape with color = False.
+
+    slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))
+        Provide a custom 2D slice (height, width) to extract the
+        'interesting' part of the jpeg files and avoid using statistical
+        correlation from the background.
+
+    download_if_missing : bool, default=True
+        If False, raise an OSError if the data is not locally available
+        instead of trying to download the data from the source site.
+
+    return_X_y : bool, default=False
+        If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch
+        object. See below for more information about the `dataset.data` and
+        `dataset.target` object.
+
+        .. versionadded:: 0.20
+
+    n_retries : int, default=3
+        Number of retries when HTTP errors are encountered.
+
+        .. versionadded:: 1.5
+
+    delay : float, default=1.0
+        Number of seconds between retries.
+
+        .. versionadded:: 1.5
+
+    Returns
+    -------
+    dataset : :class:`~sklearn.utils.Bunch`
+        Dictionary-like object, with the following attributes.
+
+        data : numpy array of shape (13233, 2914)
+            Each row corresponds to a ravelled face image
+            of original size 62 x 47 pixels.
+            Changing the ``slice_`` or resize parameters will change the
+            shape of the output.
+        images : numpy array of shape (13233, 62, 47)
+            Each row is a face image corresponding to one of the 5749 people in
+            the dataset. Changing the ``slice_``
+            or resize parameters will change the shape of the output.
+        target : numpy array of shape (13233,)
+            Labels associated to each face image.
+            Those labels range from 0-5748 and correspond to the person IDs.
+        target_names : numpy array of shape (5749,)
+            Names of all persons in the dataset.
+            Position in array corresponds to the person ID in the target array.
+        DESCR : str
+            Description of the Labeled Faces in the Wild (LFW) dataset.
+
+    (data, target) : tuple if ``return_X_y`` is True
+        A tuple of two ndarray. The first containing a 2D array of
+        shape (n_samples, n_features) with each row representing one
+        sample and each column representing the features. The second
+        ndarray of shape (n_samples,) containing the target samples.
+
+        .. versionadded:: 0.20
+
+    Examples
+    --------
+    >>> from sklearn.datasets import fetch_lfw_people
+    >>> lfw_people = fetch_lfw_people()
+    >>> lfw_people.data.shape
+    (13233, 2914)
+    >>> lfw_people.target.shape
+    (13233,)
+    >>> for name in lfw_people.target_names[:5]:
+    ...    print(name)
+    AJ Cook
+    AJ Lamas
+    Aaron Eckhart
+    Aaron Guiel
+    Aaron Patterson
+    """
+    # Parameter validation accepts None for `min_faces_per_person`, but the
+    # loader compares picture counts against it and would raise TypeError.
+    # Mapping None to 0 here also makes both spellings share one cache entry.
+    if min_faces_per_person is None:
+        min_faces_per_person = 0
+
+    lfw_home, data_folder_path = _check_fetch_lfw(
+        data_home=data_home,
+        funneled=funneled,
+        download_if_missing=download_if_missing,
+        n_retries=n_retries,
+        delay=delay,
+    )
+    logger.debug("Loading LFW people faces from %s", lfw_home)
+
+    # wrap the loader in a joblib memoizing function backed by `lfw_home`
+    m = Memory(location=lfw_home, compress=6, verbose=0)
+    load_func = m.cache(_fetch_lfw_people)
+
+    # load and memoize the faces as np arrays
+    faces, target, target_names = load_func(
+        data_folder_path,
+        resize=resize,
+        min_faces_per_person=min_faces_per_person,
+        color=color,
+        slice_=slice_,
+    )
+
+    # one flat feature row per picture
+    X = faces.reshape(len(faces), -1)
+
+    if return_X_y:
+        return X, target
+
+    # the description is only needed for the Bunch, so load it only here
+    fdescr = load_descr("lfw.rst")
+
+    # pack the results as a Bunch instance
+    return Bunch(
+        data=X, images=faces, target=target, target_names=target_names, DESCR=fdescr
+    )
+
+
+#
+# Task #2:  Face Verification on pairs of face pictures
+#
+
+
+def _fetch_lfw_pairs(
+    index_file_path, data_folder_path, slice_=None, color=False, resize=None
+):
+    """Perform the actual data loading for the LFW pairs dataset
+
+    This operation is meant to be cached by a joblib wrapper.
+
+    The index file is read as tab-separated lines. Lines with fewer than three
+    fields are skipped. Three fields ``name, i, j`` describe two pictures of
+    the same person (target 1). Four fields ``name_a, i, name_b, j`` describe
+    pictures of two persons (target 0). ``i`` and ``j`` are 1-based positions
+    in the sorted listing of the person's folder.
+
+    Parameters
+    ----------
+    index_file_path : str
+        Path of the pair-index text file.
+
+    data_folder_path : str
+        Folder containing one sub-folder of pictures per person.
+
+    slice_, color, resize
+        Forwarded unchanged to ``_load_imgs``.
+
+    Returns
+    -------
+    pairs : ndarray
+        Array of shape ``(n_pairs, 2, ...)`` where the trailing dimensions are
+        those of one picture returned by ``_load_imgs``.
+    target : ndarray of shape (n_pairs,)
+        1 for a same-person pair, 0 otherwise.
+    target_names : ndarray of shape (2,)
+        ``["Different persons", "Same person"]``, indexed by target value.
+
+    Raises
+    ------
+    ValueError
+        If a retained line has more than four fields.
+    """
+    # parse the index file to find the number of pairs to be able to allocate
+    # the right amount of memory before starting to decode the jpeg files
+    with open(index_file_path, "rb") as index_file:
+        split_lines = [ln.decode().strip().split("\t") for ln in index_file]
+    pair_specs = [sl for sl in split_lines if len(sl) > 2]
+    n_pairs = len(pair_specs)
+
+    # iterating over the metadata lines for each pair to find the filename to
+    # decode and load in memory
+    target = np.zeros(n_pairs, dtype=int)
+    file_paths = []
+    # The same person shows up in many pairs: keep each folder's sorted
+    # listing so that it is read from disk and sorted only once.
+    listings = {}
+    for i, components in enumerate(pair_specs):
+        if len(components) == 3:
+            target[i] = 1
+            pair = (
+                (components[0], int(components[1]) - 1),
+                (components[0], int(components[2]) - 1),
+            )
+        elif len(components) == 4:
+            target[i] = 0
+            pair = (
+                (components[0], int(components[1]) - 1),
+                (components[2], int(components[3]) - 1),
+            )
+        else:
+            raise ValueError("invalid line %d: %r" % (i + 1, components))
+        for name, idx in pair:
+            # `name` is always `str` here (decoded above), so no bytes
+            # fallback is needed when building the path
+            person_folder = join(data_folder_path, name)
+            filenames = listings.get(name)
+            if filenames is None:
+                filenames = listings[name] = sorted(listdir(person_folder))
+            file_paths.append(join(person_folder, filenames[idx]))
+
+    # pictures come back flat as (2 * n_pairs, ...); regroup them two by two
+    faces = _load_imgs(file_paths, slice_, color, resize)
+    pairs = faces.reshape((faces.shape[0] // 2, 2) + faces.shape[1:])
+
+    return pairs, target, np.array(["Different persons", "Same person"])
+
+
+@validate_params(
+    {
+        "subset": [StrOptions({"train", "test", "10_folds"})],
+        "data_home": [str, PathLike, None],
+        "funneled": ["boolean"],
+        "resize": [Interval(Real, 0, None, closed="neither"), None],
+        "color": ["boolean"],
+        "slice_": [tuple, Hidden(None)],
+        "download_if_missing": ["boolean"],
+        "n_retries": [Interval(Integral, 1, None, closed="left")],
+        "delay": [Interval(Real, 0.0, None, closed="neither")],
+    },
+    prefer_skip_nested_validation=True,
+)
+def fetch_lfw_pairs(
+    *,
+    subset="train",
+    data_home=None,
+    funneled=True,
+    resize=0.5,
+    color=False,
+    slice_=(slice(70, 195), slice(78, 172)),
+    download_if_missing=True,
+    n_retries=3,
+    delay=1.0,
+):
+    """Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).
+
+    Download it if necessary.
+
+    =================   =======================
+    Classes                                   2
+    Samples total                         13233
+    Dimensionality                         5828
+    Features              real, between 0 and 1
+    =================   =======================
+
+    In the `original paper <https://people.cs.umass.edu/~elm/papers/lfw.pdf>`_
+    the "pairs" version corresponds to the "restricted task", where
+    the experimenter should not use the name of a person to infer
+    the equivalence or non-equivalence of two face images that
+    are not explicitly given in the training set.
+
+    The original images are 250 x 250 pixels, but the default slice and resize
+    arguments reduce them to 62 x 47.
+
+    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.
+
+    Parameters
+    ----------
+    subset : {'train', 'test', '10_folds'}, default='train'
+        Which pair list to load: 'train' is the development training set,
+        'test' the development test set, and '10_folds' the official
+        evaluation set meant for 10-folds cross validation.
+
+    data_home : str or path-like, default=None
+        Alternative download and cache folder for the datasets. By
+        default all scikit-learn data is stored in '~/scikit_learn_data'
+        subfolders.
+
+    funneled : bool, default=True
+        Download and use the funneled variant of the dataset.
+
+    resize : float or None, default=0.5
+        Ratio used to resize each face picture. If `None`, no resizing is
+        performed.
+
+    color : bool, default=False
+        Keep the 3 RGB channels instead of averaging them to a single
+        gray level channel. If color is True the shape of the data has
+        one more dimension than the shape with color = False.
+
+    slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))
+        Custom 2D slice (height, width) selecting the 'interesting' part
+        of the jpeg files, to avoid using statistical correlation from
+        the background.
+
+    download_if_missing : bool, default=True
+        If False, raise an OSError if the data is not locally available
+        instead of trying to download the data from the source site.
+
+    n_retries : int, default=3
+        Number of retries when HTTP errors are encountered.
+
+        .. versionadded:: 1.5
+
+    delay : float, default=1.0
+        Number of seconds between retries.
+
+        .. versionadded:: 1.5
+
+    Returns
+    -------
+    data : :class:`~sklearn.utils.Bunch`
+        Dictionary-like object, with the following attributes.
+
+        data : ndarray of shape (2200, 5828). Shape depends on ``subset``.
+            Each row holds the 2 ravelled face images of one pair,
+            each of original size 62 x 47 pixels.
+            Changing the ``slice_``, ``resize`` or ``subset`` parameters
+            will change the shape of the output.
+        pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset``
+            Each row has 2 face images showing either the same person or
+            two different persons from the dataset containing 5749 people.
+            Changing the ``slice_``, ``resize`` or ``subset`` parameters
+            will change the shape of the output.
+        target : numpy array of shape (2200,). Shape depends on ``subset``.
+            Label of each pair of images: 1 when both pictures show the
+            same person, 0 otherwise.
+        target_names : numpy array of shape (2,)
+            Explains the target values of the target array.
+            0 corresponds to "Different persons", 1 corresponds to
+            "Same person".
+        DESCR : str
+            Description of the Labeled Faces in the Wild (LFW) dataset.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import fetch_lfw_pairs
+    >>> lfw_pairs_train = fetch_lfw_pairs(subset='train')
+    >>> list(lfw_pairs_train.target_names)
+    [np.str_('Different persons'), np.str_('Same person')]
+    >>> lfw_pairs_train.pairs.shape
+    (2200, 2, 62, 47)
+    >>> lfw_pairs_train.data.shape
+    (2200, 5828)
+    >>> lfw_pairs_train.target.shape
+    (2200,)
+    """
+    lfw_home, data_folder_path = _check_fetch_lfw(
+        data_home=data_home,
+        funneled=funneled,
+        download_if_missing=download_if_missing,
+        n_retries=n_retries,
+        delay=delay,
+    )
+    logger.debug("Loading %s LFW pairs from %s", subset, lfw_home)
+
+    # memoize the loader with joblib, using `lfw_home` as the cache location
+    cached_loader = Memory(location=lfw_home, compress=6, verbose=0).cache(
+        _fetch_lfw_pairs
+    )
+
+    # each subset is backed by its own pair-index file
+    index_files = {
+        "train": "pairsDevTrain.txt",
+        "test": "pairsDevTest.txt",
+        "10_folds": "pairs.txt",
+    }
+    if subset not in index_files:
+        raise ValueError(
+            "subset='%s' is invalid: should be one of %r"
+            % (subset, sorted(index_files))
+        )
+    index_file_path = join(lfw_home, index_files[subset])
+
+    # decode the pairs, or fetch them from the joblib cache
+    pairs, target, target_names = cached_loader(
+        index_file_path,
+        data_folder_path,
+        resize=resize,
+        color=color,
+        slice_=slice_,
+    )
+
+    fdescr = load_descr("lfw.rst")
+
+    # one flat feature row per pair
+    flat_pairs = pairs.reshape(len(pairs), -1)
+    return Bunch(
+        data=flat_pairs,
+        pairs=pairs,
+        target=target,
+        target_names=target_names,
+        DESCR=fdescr,
+    )