Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 34 additions & 37 deletions examples/applications/plot_face_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,42 +10,23 @@

.. _LFW: http://vis-www.cs.umass.edu/lfw/

Expected results for the top 5 most represented people in the dataset:

================== ============ ======= ========== =======
precision recall f1-score support
================== ============ ======= ========== =======
Ariel Sharon 0.67 0.92 0.77 13
Colin Powell 0.75 0.78 0.76 60
Donald Rumsfeld 0.78 0.67 0.72 27
George W Bush 0.86 0.86 0.86 146
Gerhard Schroeder 0.76 0.76 0.76 25
Hugo Chavez 0.67 0.67 0.67 15
Tony Blair 0.81 0.69 0.75 36

avg / total 0.80 0.80 0.80 322
================== ============ ======= ========== =======

"""

# %%
from time import time
import logging
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.utils.fixes import loguniform


# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")


# #############################################################################
# %%
# Download the data, if not already on disk and load it as numpy arrays

lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
Expand All @@ -69,18 +50,21 @@
print("n_classes: %d" % n_classes)


# #############################################################################
# Split into a training set and a test set using a stratified k fold
# %%
# Split into a training set and a test and keep 25% of the data for testing.

# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.25, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# #############################################################################
# %%
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction

n_components = 150

print(
Expand All @@ -99,23 +83,25 @@
print("done in %0.3fs" % (time() - t0))


# #############################################################################
# %%
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {
"C": [1e3, 5e3, 1e4, 5e4, 1e5],
"gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
"C": loguniform(1e3, 1e5),
"gamma": loguniform(1e-4, 1e-1),
}
clf = GridSearchCV(SVC(kernel="rbf", class_weight="balanced"), param_grid)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively trying RandomizedSearchCV with n_iter=10 should work well here and be faster than an exhaustive grid search of the 5x5 combinations.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition, I think that it could be good to add a StandardScaler before applying the PCA.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The RandomizedSearch looks like a good idea using the loguniform to sample C and gamma.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for this idea! Yeah, totally makes sense! I will try this option and report performance differences I register.

clf = RandomizedSearchCV(
SVC(kernel="rbf", class_weight="balanced"), param_grid, n_iter=10
)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)


# #############################################################################
# %%
# Quantitative evaluation of the model quality on the test set

print("Predicting people's names on the test set")
Expand All @@ -124,10 +110,14 @@
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
ConfusionMatrixDisplay.from_estimator(
clf, X_test_pca, y_test, display_labels=target_names, xticks_rotation="vertical"
)
plt.tight_layout()
plt.show()


# #############################################################################
# %%
# Qualitative evaluation of the predictions using matplotlib


Expand All @@ -143,6 +133,7 @@ def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
plt.yticks(())


# %%
# plot the result of the prediction on a portion of the test set


Expand All @@ -157,10 +148,16 @@ def title(y_pred, y_test, target_names, i):
]

plot_gallery(X_test, prediction_titles, h, w)

# %%
# plot the gallery of the most significative eigenfaces

eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)

plt.show()

# %%
# Face recognition problem would be much more effectively solved by training
# convolutional neural networks but this family of models is outside of the scope of
# the scikit-learn library. Interested readers should instead try to use pytorch or
# tensorflow to implement such models.