[MRG] FIX LMNN gradient and cost function #201
Changes from all commits: 86d26d3, 61eea28, d5c9dd0, b238b65, f9511a0, 562f33b, 65057e3, 69c328c
Diff in metric_learn/lmnn.py:

@@ -105,7 +105,7 @@ def fit(self, X, y):
       # objective than the previous L, following the gradient:
       while True:
         # the next point next_L to try out is found by a gradient step
-        L_next = L - 2 * learn_rate * G
+        L_next = L - learn_rate * G
         # we compute the objective at next point
         # we copy variables that can be modified by _loss_grad, because if we
         # retry we don t want to modify them several times
@@ -191,10 +191,11 @@ def _loss_grad(self, X, L, dfG, impostors, it, k, reg, target_neighbors, df,
     # do the gradient update
     assert not np.isnan(df).any()
     G = dfG * reg + df * (1 - reg)
+    G = L.dot(G)
     # compute the objective function
     objective = total_active * (1 - reg)
-    objective += G.flatten().dot(L.T.dot(L).flatten())
-    return G, objective, total_active, df, a1, a2
+    objective += G.flatten().dot(L.flatten())
+    return 2 * G, objective, total_active, df, a1, a2

   def _select_targets(self, X, label_inds):
     target_neighbors = np.empty((X.shape[0], self.k), dtype=int)

Review comment: I guess the factor 2 comes from differentiating L.T.dot(L).
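A side note on that factor of 2 (my own sketch, not part of the PR): writing M = dfG * reg + df * (1 - reg), which is a weighted sum of outer products and hence symmetric, the old objective term G.flatten().dot(L.T.dot(L).flatten()) equals tr(M L^T L). Its gradient with respect to L is L(M + M^T) = 2 L M, which is exactly the 2 * L.dot(G) now returned. A quick numerical check with scipy (names here are illustrative only):

```python
import numpy as np
from scipy.optimize import approx_fprime

rng = np.random.RandomState(0)
d, D = 3, 5
L = rng.randn(d, D)          # rectangular transformation, like LMNN's L
A = rng.randn(D, D)
M = A + A.T                  # symmetric, like a sum of outer products

def objective(flat_L):
    # same quantity as G.flatten().dot(L.T.dot(L).flatten()) with G == M
    L_ = flat_L.reshape(d, D)
    return (M * L_.T.dot(L_)).sum()

eps = np.sqrt(np.finfo(float).eps)
numeric = approx_fprime(L.ravel(), objective, eps).reshape(d, D)
analytic = 2 * L.dot(M)      # the "2 * L.dot(G)" form returned by the patch
print(np.allclose(numeric, analytic, atol=1e-5))  # expected: True
```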
Diff in the test file:
@@ -2,7 +2,7 @@
 import re
 import pytest
 import numpy as np
-from scipy.optimize import check_grad
+from scipy.optimize import check_grad, approx_fprime
 from six.moves import xrange
 from sklearn.metrics import pairwise_distances
 from sklearn.datasets import load_iris, make_classification, make_regression

@@ -21,7 +21,7 @@
                            RCA_Supervised, MMC_Supervised, SDML)
 # Import this specially for testing.
 from metric_learn.constraints import wrap_pairs
-from metric_learn.lmnn import python_LMNN
+from metric_learn.lmnn import python_LMNN, _sum_outer_products


 def class_separation(X, labels):
@@ -120,6 +120,98 @@ def test_iris(self):
                                  self.iris_labels)
     self.assertLess(csep, 0.25)

+  def test_loss_grad_lbfgs(self):
+    """Test gradient of loss function
+    Assert that the gradient is almost equal to its finite differences
+    approximation.
+    """
+    rng = np.random.RandomState(42)
+    X, y = make_classification(random_state=rng)
+    L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1])
+    lmnn = LMNN()
+
+    k = lmnn.k
+    reg = lmnn.regularization
+
+    X, y = lmnn._prepare_inputs(X, y, dtype=float,
+                                ensure_min_samples=2)
+    num_pts, num_dims = X.shape
+    unique_labels, label_inds = np.unique(y, return_inverse=True)
+    lmnn.labels_ = np.arange(len(unique_labels))
+    lmnn.transformer_ = np.eye(num_dims)
+
+    target_neighbors = lmnn._select_targets(X, label_inds)
+    impostors = lmnn._find_impostors(target_neighbors[:, -1], X, label_inds)
+
+    # sum outer products
+    dfG = _sum_outer_products(X, target_neighbors.flatten(),
+                              np.repeat(np.arange(X.shape[0]), k))
+    df = np.zeros_like(dfG)
+
+    # storage
+    a1 = [None]*k
+    a2 = [None]*k
+    for nn_idx in xrange(k):
+      a1[nn_idx] = np.array([])
+      a2[nn_idx] = np.array([])
+
+    # initialize L
+    def loss_grad(flat_L):
+      return lmnn._loss_grad(X, flat_L.reshape(-1, X.shape[1]), dfG, impostors,
+                             1, k, reg, target_neighbors, df.copy(),
+                             list(a1), list(a2))
+
+    def fun(x):
+      return loss_grad(x)[1]
+
+    def grad(x):
+      return loss_grad(x)[0].ravel()
+
+    # compute relative error
+    epsilon = np.sqrt(np.finfo(float).eps)
+    rel_diff = (check_grad(fun, grad, L.ravel()) /
+                np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon)))
+    np.testing.assert_almost_equal(rel_diff, 0., decimal=5)
+
+
+@pytest.mark.parametrize('X, y, loss', [(np.array([[0], [1], [2], [3]]),
+                                         [1, 1, 0, 0], 3.0),
+                                        (np.array([[0], [1], [2], [3]]),
+                                         [1, 0, 0, 1], 26.)])
+def test_toy_ex_lmnn(X, y, loss):
+  """Test that the loss gives the right result on a toy example"""
+  L = np.array([[1]])
+  lmnn = LMNN(k=1, regularization=0.5)
+
+  k = lmnn.k
+  reg = lmnn.regularization
+
+  X, y = lmnn._prepare_inputs(X, y, dtype=float,
+                              ensure_min_samples=2)
+  num_pts, num_dims = X.shape
+  unique_labels, label_inds = np.unique(y, return_inverse=True)
+  lmnn.labels_ = np.arange(len(unique_labels))
+  lmnn.transformer_ = np.eye(num_dims)
+
+  target_neighbors = lmnn._select_targets(X, label_inds)
+  impostors = lmnn._find_impostors(target_neighbors[:, -1], X, label_inds)
+
+  # sum outer products
+  dfG = _sum_outer_products(X, target_neighbors.flatten(),
+                            np.repeat(np.arange(X.shape[0]), k))
+  df = np.zeros_like(dfG)
+
+  # storage
+  a1 = [None]*k
+  a2 = [None]*k
+  for nn_idx in xrange(k):
+    a1[nn_idx] = np.array([])
+    a2[nn_idx] = np.array([])
+
+  # assert that the loss equals the one computed by hand
+  assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, impostors, 1, k,
+                         reg, target_neighbors, df, a1, a2)[1] == loss
+
+
 def test_convergence_simple_example(capsys):
   # LMNN should converge on this simple example, which it did not with

Review comment (on test_toy_ex_lmnn): Where did you get these values from?
Reply: I computed them by hand, according to the paper (note that it's only the loss function, and I set L to the identity, so the distances are easy to get from this 1D example).
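For reference, a possible by-hand derivation of these two values (my own reconstruction, assuming the standard LMNN objective with L = I, k = 1, both terms weighted by 1/2 since regularization = 0.5, and each impostor pair contributing a hinge term from both of its endpoints):

```latex
\[
\mathcal{L} \;=\; \tfrac{1}{2}\sum_{(i,\,j(i)) \in \text{targets}} \|x_i - x_{j(i)}\|^2
\;+\; \tfrac{1}{2}\sum_{(i,\,l) \in \text{impostors}} \big[\, 1 + \|x_i - x_{j(i)}\|^2 - \|x_i - x_l\|^2 \,\big]_+
\]
% X = (0, 1, 2, 3)^T, y = (1, 1, 0, 0):
%   pull = 1 + 1 + 1 + 1 = 4
%   push = [1 + 1 - 1]_+ + [1 + 1 - 1]_+ = 2            (only impostor pair is (1, 2))
%   loss = 0.5 * 4 + 0.5 * 2 = 3
% X = (0, 1, 2, 3)^T, y = (1, 0, 0, 1):
%   pull = 9 + 9 + 1 + 1 = 20
%   push = (9 + 1) + (6 + 0) + (6 + 0) + (9 + 1) = 32   (pairs (0,1), (0,2), (3,1), (3,2))
%   loss = 0.5 * 20 + 0.5 * 32 = 26
```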
@@ -421,7 +513,9 @@ def grad(M):
       return nca._loss_grad_lbfgs(M, X, mask)[1].ravel()

     # compute relative error
-    rel_diff = check_grad(fun, grad, M.ravel()) / np.linalg.norm(grad(M))
+    epsilon = np.sqrt(np.finfo(float).eps)
+    rel_diff = (check_grad(fun, grad, M.ravel()) /
+                np.linalg.norm(approx_fprime(M.ravel(), fun, epsilon)))
     np.testing.assert_almost_equal(rel_diff, 0., decimal=6)

   def test_simple_example(self):
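Aside (not part of the PR): the relative-error pattern used here divides scipy's check_grad result by the norm of the finite-difference gradient from approx_fprime, rather than by the norm of the analytic gradient, so the normalization does not depend on the gradient being tested. A minimal self-contained sketch of the same pattern on a toy quadratic:

```python
import numpy as np
from scipy.optimize import check_grad, approx_fprime

# toy objective f(x) = x^T A x, with analytic gradient (A + A^T) x
rng = np.random.RandomState(0)
A = rng.randn(4, 4)

def fun(x):
    return x.dot(A).dot(x)

def grad(x):
    return (A + A.T).dot(x)

x0 = rng.randn(4)
epsilon = np.sqrt(np.finfo(float).eps)
# check_grad returns the 2-norm of (analytic - finite-difference) gradient;
# dividing by the finite-difference gradient's norm gives a relative error
rel_diff = (check_grad(fun, grad, x0) /
            np.linalg.norm(approx_fprime(x0, fun, epsilon)))
np.testing.assert_almost_equal(rel_diff, 0., decimal=6)
```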
Review comment: In fact this was right; it's the term coming from the +1 that is in the hinge loss (cf. the question from #46).