
[MRG] FIX LMNN gradient and cost function #201


Merged
7 changes: 4 additions & 3 deletions metric_learn/lmnn.py
@@ -105,7 +105,7 @@ def fit(self, X, y):
# objective than the previous L, following the gradient:
while True:
# the next point next_L to try out is found by a gradient step
-        L_next = L - 2 * learn_rate * G
+        L_next = L - learn_rate * G
# we compute the objective at next point
# we copy variables that can be modified by _loss_grad, because if we
# retry we don't want to modify them several times
@@ -191,10 +191,11 @@ def _loss_grad(self, X, L, dfG, impostors, it, k, reg, target_neighbors, df,
# do the gradient update
assert not np.isnan(df).any()
G = dfG * reg + df * (1 - reg)
+    G = L.dot(G)
# compute the objective function
objective = total_active * (1 - reg)

Member Author:
In fact this was right: it's the term coming from the +1 in the hinge loss (cf. the question in #46).
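For reference, a sketch of where that constant comes from, assuming the standard LMNN objective (Weinberger & Saul) with the pull and push terms weighted by `reg` and `1 - reg`:

```latex
% Push part of the LMNN objective: j ~> i means j is a target neighbor of i,
% and l ranges over differently-labelled points (potential impostors).
\varepsilon_{\mathrm{push}}(L) =
  \sum_{j \rightsquigarrow i,\; l}
  \bigl[\, 1 + \| L(x_i - x_j) \|^2 - \| L(x_i - x_l) \|^2 \,\bigr]_{+}
```

Each active triplet contributes the constant +1 inside the hinge, so those constants sum to `total_active`, which is why the objective starts from `total_active * (1 - reg)` before the quadratic terms are added.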

-    objective += G.flatten().dot(L.T.dot(L).flatten())
-    return G, objective, total_active, df, a1, a2
+    objective += G.flatten().dot(L.flatten())
+    return 2 * G, objective, total_active, df, a1, a2

Member:
I guess the factor 2 comes from differentiating L.T.dot(L). It would be clearer to put it directly in G then.
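A sketch of that derivative, writing C for the symmetric weighted sum of outer products `dfG * reg + df * (1 - reg)`:

```latex
% C is symmetric, so d/dL tr(L C L^T) = L (C + C^T) = 2 L C.
\frac{\partial}{\partial L} \operatorname{tr}\bigl(L C L^{\top}\bigr) = 2 L C
```

With `G = L.dot(C)` as computed above, the quadratic part of the objective equals `G.flatten().dot(L.flatten()) = tr(L C L.T)`, and its gradient is `2 * G`, which is what `_loss_grad` now returns while `fit` takes a plain `learn_rate * G` step.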


def _select_targets(self, X, label_inds):
target_neighbors = np.empty((X.shape[0], self.k), dtype=int)
100 changes: 97 additions & 3 deletions test/metric_learn_test.py
@@ -2,7 +2,7 @@
import re
import pytest
import numpy as np
-from scipy.optimize import check_grad
+from scipy.optimize import check_grad, approx_fprime
from six.moves import xrange
from sklearn.metrics import pairwise_distances
from sklearn.datasets import load_iris, make_classification, make_regression
@@ -21,7 +21,7 @@
RCA_Supervised, MMC_Supervised, SDML)
# Import this specially for testing.
from metric_learn.constraints import wrap_pairs
-from metric_learn.lmnn import python_LMNN
+from metric_learn.lmnn import python_LMNN, _sum_outer_products


def class_separation(X, labels):
@@ -120,6 +120,98 @@ def test_iris(self):
self.iris_labels)
self.assertLess(csep, 0.25)

def test_loss_grad_lbfgs(self):
"""Test gradient of loss function
Assert that the gradient is almost equal to its finite differences
approximation.
"""
rng = np.random.RandomState(42)
X, y = make_classification(random_state=rng)
L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1])
lmnn = LMNN()

k = lmnn.k
reg = lmnn.regularization

X, y = lmnn._prepare_inputs(X, y, dtype=float,
ensure_min_samples=2)
num_pts, num_dims = X.shape
unique_labels, label_inds = np.unique(y, return_inverse=True)
lmnn.labels_ = np.arange(len(unique_labels))
lmnn.transformer_ = np.eye(num_dims)

target_neighbors = lmnn._select_targets(X, label_inds)
impostors = lmnn._find_impostors(target_neighbors[:, -1], X, label_inds)

# sum outer products
dfG = _sum_outer_products(X, target_neighbors.flatten(),
np.repeat(np.arange(X.shape[0]), k))
df = np.zeros_like(dfG)

# storage
a1 = [None]*k
a2 = [None]*k
for nn_idx in xrange(k):
a1[nn_idx] = np.array([])
a2[nn_idx] = np.array([])

# wrap the loss and gradient as functions of the flattened L
def loss_grad(flat_L):
return lmnn._loss_grad(X, flat_L.reshape(-1, X.shape[1]), dfG, impostors,
1, k, reg, target_neighbors, df.copy(),
list(a1), list(a2))

def fun(x):
return loss_grad(x)[1]

def grad(x):
return loss_grad(x)[0].ravel()

# compute relative error
epsilon = np.sqrt(np.finfo(float).eps)
rel_diff = (check_grad(fun, grad, L.ravel()) /
np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon)))
np.testing.assert_almost_equal(rel_diff, 0., decimal=5)


@pytest.mark.parametrize('X, y, loss', [(np.array([[0], [1], [2], [3]]),
[1, 1, 0, 0], 3.0),
(np.array([[0], [1], [2], [3]]),
[1, 0, 0, 1], 26.)])
def test_toy_ex_lmnn(X, y, loss):
"""Test that the loss give the right result on a toy example"""

Contributor:
Where did you get these values from?

Member Author:
I computed them by hand, following the paper (note that it's only the loss function, and I set L to the identity, so the distances are easy to read off from this 1D example). Maybe I can add a small detail of the computation as a comment?
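For the first parametrized case, a sketch of that hand computation (my own working, assuming the objective `reg * pull + (1 - reg) * push` with a hinge margin of 1 and `L` equal to the identity):

```python
# X = [[0], [1], [2], [3]], y = [1, 1, 0, 0], k = 1, reg = 0.5, L = identity.

# Pull term: each point's single target neighbor is its same-class neighbor
# at distance 1, so the pulled squared distances sum to 4.
pull = 1 + 1 + 1 + 1

# Push term: hinge [1 + d(i, j)**2 - d(i, l)**2]_+ over impostor triplets.
# Only (i=1, j=0, l=2) and (i=2, j=3, l=1) are active, each giving 1 + 1 - 1.
push = 1 + 1

loss = 0.5 * pull + 0.5 * push
assert loss == 3.0  # the first expected value in the parametrization
```

The same bookkeeping on the second labelling [1, 0, 0, 1] gives a pull term of 20 and a push term of 32, hence 0.5 * 20 + 0.5 * 32 = 26.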

L = np.array([[1]])
lmnn = LMNN(k=1, regularization=0.5)

k = lmnn.k
reg = lmnn.regularization

X, y = lmnn._prepare_inputs(X, y, dtype=float,
ensure_min_samples=2)
num_pts, num_dims = X.shape
unique_labels, label_inds = np.unique(y, return_inverse=True)
lmnn.labels_ = np.arange(len(unique_labels))
lmnn.transformer_ = np.eye(num_dims)

target_neighbors = lmnn._select_targets(X, label_inds)
impostors = lmnn._find_impostors(target_neighbors[:, -1], X, label_inds)

# sum outer products
dfG = _sum_outer_products(X, target_neighbors.flatten(),
np.repeat(np.arange(X.shape[0]), k))
df = np.zeros_like(dfG)

# storage
a1 = [None]*k
a2 = [None]*k
for nn_idx in xrange(k):
a1[nn_idx] = np.array([])
a2[nn_idx] = np.array([])

# assert that the loss equals the one computed by hand
assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, impostors, 1, k,
reg, target_neighbors, df, a1, a2)[1] == loss


def test_convergence_simple_example(capsys):
# LMNN should converge on this simple example, which it did not with
@@ -421,7 +513,9 @@ def grad(M):
return nca._loss_grad_lbfgs(M, X, mask)[1].ravel()

# compute relative error
-    rel_diff = check_grad(fun, grad, M.ravel()) / np.linalg.norm(grad(M))
+    epsilon = np.sqrt(np.finfo(float).eps)
+    rel_diff = (check_grad(fun, grad, M.ravel()) /
+                np.linalg.norm(approx_fprime(M.ravel(), fun, epsilon)))
np.testing.assert_almost_equal(rel_diff, 0., decimal=6)

def test_simple_example(self):