File size: 6,075 Bytes
7885a28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
"""Modified Olivetti faces dataset.
The original database was available from (now defunct)
https://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html
The version retrieved here comes in MATLAB format from the personal
web page of Sam Roweis:
https://cs.nyu.edu/~roweis/
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from numbers import Integral, Real
from os import PathLike, makedirs, remove
from os.path import exists
import joblib
import numpy as np
from scipy.io import loadmat
from ..utils import Bunch, check_random_state
from ..utils._param_validation import Interval, validate_params
from . import get_data_home
from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath, load_descr
# The original data can be found at:
# https://cs.nyu.edu/~roweis/data/olivettifaces.mat
FACES = RemoteFileMetadata(
filename="olivettifaces.mat",
url="https://ndownloader.figshare.com/files/5976027",
checksum="b612fb967f2dc77c9c62d3e1266e0c73d5fca46a4b8906c18e454d41af987794",
)
@validate_params(
{
"data_home": [str, PathLike, None],
"shuffle": ["boolean"],
"random_state": ["random_state"],
"download_if_missing": ["boolean"],
"return_X_y": ["boolean"],
"n_retries": [Interval(Integral, 1, None, closed="left")],
"delay": [Interval(Real, 0.0, None, closed="neither")],
},
prefer_skip_nested_validation=True,
)
def fetch_olivetti_faces(
*,
data_home=None,
shuffle=False,
random_state=0,
download_if_missing=True,
return_X_y=False,
n_retries=3,
delay=1.0,
):
"""Load the Olivetti faces data-set from AT&T (classification).
Download it if necessary.
================= =====================
Classes 40
Samples total 400
Dimensionality 4096
Features real, between 0 and 1
================= =====================
Read more in the :ref:`User Guide <olivetti_faces_dataset>`.
Parameters
----------
data_home : str or path-like, default=None
Specify another download and cache folder for the datasets. By default
all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
shuffle : bool, default=False
If True the order of the dataset is shuffled to avoid having
images of the same person grouped.
random_state : int, RandomState instance or None, default=0
Determines random number generation for dataset shuffling. Pass an int
for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.
download_if_missing : bool, default=True
If False, raise an OSError if the data is not locally available
instead of trying to download the data from the source site.
return_X_y : bool, default=False
If True, returns `(data, target)` instead of a `Bunch` object. See
below for more information about the `data` and `target` object.
.. versionadded:: 0.22
n_retries : int, default=3
Number of retries when HTTP errors are encountered.
.. versionadded:: 1.5
delay : float, default=1.0
Number of seconds between retries.
.. versionadded:: 1.5
Returns
-------
data : :class:`~sklearn.utils.Bunch`
Dictionary-like object, with the following attributes.
data: ndarray, shape (400, 4096)
Each row corresponds to a ravelled
face image of original size 64 x 64 pixels.
images : ndarray, shape (400, 64, 64)
Each row is a face image
corresponding to one of the 40 subjects of the dataset.
target : ndarray, shape (400,)
Labels associated to each face image.
Those labels are ranging from 0-39 and correspond to the
Subject IDs.
DESCR : str
Description of the modified Olivetti Faces Dataset.
(data, target) : tuple if `return_X_y=True`
Tuple with the `data` and `target` objects described above.
.. versionadded:: 0.22
Examples
--------
>>> from sklearn.datasets import fetch_olivetti_faces
>>> olivetti_faces = fetch_olivetti_faces()
>>> olivetti_faces.data.shape
(400, 4096)
>>> olivetti_faces.target.shape
(400,)
>>> olivetti_faces.images.shape
(400, 64, 64)
"""
data_home = get_data_home(data_home=data_home)
if not exists(data_home):
makedirs(data_home)
filepath = _pkl_filepath(data_home, "olivetti.pkz")
if not exists(filepath):
if not download_if_missing:
raise OSError("Data not found and `download_if_missing` is False")
print("downloading Olivetti faces from %s to %s" % (FACES.url, data_home))
mat_path = _fetch_remote(
FACES, dirname=data_home, n_retries=n_retries, delay=delay
)
mfile = loadmat(file_name=mat_path)
# delete raw .mat data
remove(mat_path)
faces = mfile["faces"].T.copy()
joblib.dump(faces, filepath, compress=6)
del mfile
else:
faces = joblib.load(filepath)
# We want floating point data, but float32 is enough (there is only
# one byte of precision in the original uint8s anyway)
faces = np.float32(faces)
faces = faces - faces.min()
faces /= faces.max()
faces = faces.reshape((400, 64, 64)).transpose(0, 2, 1)
# 10 images per class, 400 images total, each class is contiguous.
target = np.array([i // 10 for i in range(400)])
if shuffle:
random_state = check_random_state(random_state)
order = random_state.permutation(len(faces))
faces = faces[order]
target = target[order]
faces_vectorized = faces.reshape(len(faces), -1)
fdescr = load_descr("olivetti_faces.rst")
if return_X_y:
return faces_vectorized, target
return Bunch(data=faces_vectorized, images=faces, target=target, DESCR=fdescr)
|