diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
index 20eeea3b..bb046a95 100644
--- a/metric_learn/lmnn.py
+++ b/metric_learn/lmnn.py
@@ -14,17 +14,25 @@
 from __future__ import print_function, absolute_import
 import numpy as np
+import scipy
 import warnings
 from collections import Counter
 from six.moves import xrange
 from sklearn.exceptions import ChangedBehaviorWarning
-from sklearn.metrics import euclidean_distances
+from sklearn.metrics import euclidean_distances, pairwise_distances
 from sklearn.base import TransformerMixin
 
 from ._util import _initialize_transformer, _check_n_components
 from .base_metric import MahalanobisMixin
 
 
+def re_order_target_neighbors(L, X, target_neighbors):
+  Xl = np.dot(X, L.T)
+  dd = np.sum((Xl[:, None, :] - Xl[target_neighbors])**2, axis=2)
+  sorted_neighbors = np.take_along_axis(target_neighbors, dd.argsort(axis=1), 1)
+  return sorted_neighbors
+
+
 class LMNN(MahalanobisMixin, TransformerMixin):
   def __init__(self, init=None, k=3, min_iter=50, max_iter=1000,
                learn_rate=1e-7, regularization=0.5, convergence_tol=0.001,
@@ -155,8 +163,9 @@ def fit(self, X, y):
       raise ValueError('not enough class labels for specified k'
                        ' (smallest class has %d)' % required_k)
 
-    target_neighbors = self._select_targets(X, label_inds)
-    impostors = self._find_impostors(target_neighbors[:, -1], X, label_inds)
+    target_neighbors = self._select_targets(self.transformer_, X, label_inds)
+    impostors = self._find_impostors(self.transformer_,
+                                     target_neighbors[:, -1], X, label_inds)
     if len(impostors) == 0:
       # L has already been initialized to an identity matrix
       return
@@ -164,26 +173,21 @@ def fit(self, X, y):
     # sum outer products
     dfG = _sum_outer_products(X, target_neighbors.flatten(),
                               np.repeat(np.arange(X.shape[0]), k))
-    df = np.zeros_like(dfG)
-
-    # storage
-    a1 = [None]*k
-    a2 = [None]*k
-    for nn_idx in xrange(k):
-      a1[nn_idx] = np.array([])
-      a2[nn_idx] = np.array([])
 
     # initialize L
     L = self.transformer_
 
     # first iteration: we compute variables (including objective and gradient)
     # at initialization point
-    G, objective, total_active, df, a1, a2 = (
-        self._loss_grad(X, L, dfG, impostors, 1, k, reg, target_neighbors, df,
-                        a1, a2))
+    G, objective, total_active = self._loss_grad(X, L, y, dfG, 1,
+                                                 k, reg, target_neighbors)
+
+    # TODO: need to print the log here
 
     it = 1  # we already made one iteration
+    print(it, objective, 0, total_active, 1.05e-5)  # TODO: replace the
+    # hard-coded value by the real learning rate; it is only there to avoid a
+    # bug when printing
 
     # main loop
     for it in xrange(2, self.max_iter):
       # then at each iteration, we try to find a value of L that has better
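
Editorial note (not part of the patch): the new re_order_target_neighbors helper added at the top of this file, and the re-ordering TODO in the next hunk, rely on the fact that which target neighbor is closest depends on the current transformation L. The standalone sketch below copies the helper from the patch and runs it on made-up data (X, target_neighbors and the diagonal L are illustrative values only), to show that the per-row neighbor order can change once L stretches one axis.

    import numpy as np

    def re_order_target_neighbors(L, X, target_neighbors):
      # project the data with the current linear transformation
      Xl = np.dot(X, L.T)
      # squared distance from each sample to each of its target neighbors
      dd = np.sum((Xl[:, None, :] - Xl[target_neighbors]) ** 2, axis=2)
      # re-order each row of target_neighbors from closest to furthest
      return np.take_along_axis(target_neighbors, dd.argsort(axis=1), 1)

    X = np.array([[0., 0.], [2., 0.], [0., 1.], [5., 5.]])
    target_neighbors = np.array([[1, 2], [0, 2], [0, 1], [0, 1]])
    print(re_order_target_neighbors(np.eye(2), X, target_neighbors))
    # -> [[2 1], [0 2], [0 1], [1 0]]: under the identity, sample 0's closest
    #    target neighbor is sample 2
    print(re_order_target_neighbors(np.diag([1., 10.]), X, target_neighbors))
    # -> [[1 2], [0 2], [0 1], [1 0]]: stretching the second axis makes
    #    sample 1 the closest target neighbor of sample 0
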
@@ -194,10 +198,12 @@ def fit(self, X, y):
       # we compute the objective at next point
       # we copy variables that can be modified by _loss_grad, because if we
       # retry we don t want to modify them several times
-      (G_next, objective_next, total_active_next, df_next, a1_next,
-       a2_next) = (
-          self._loss_grad(X, L_next, dfG, impostors, it, k, reg,
-                          target_neighbors, df.copy(), list(a1), list(a2)))
+      # target_neighbors_next = self._select_targets(L_next, X, label_inds)
+      # TODO: just re-order the target neighbors instead of re-selecting them
+
+      (G_next, objective_next, total_active_next) = (
+          self._loss_grad(X, L_next, label_inds, dfG, it, k, reg,
+                          target_neighbors))
       assert not np.isnan(objective)
       delta_obj = objective_next - objective
       if delta_obj > 0:
@@ -212,8 +218,7 @@ def fit(self, X, y):
         # old variables to these new ones before next iteration and we
         # slightly increase the learning rate
         L = L_next
-        G, df, objective, total_active, a1, a2 = (
-          G_next, df_next, objective_next, total_active_next, a1_next, a2_next)
+        G, objective, total_active = G_next, objective_next, total_active_next
         learn_rate *= 1.01
 
       if self.verbose:
@@ -231,74 +236,97 @@ def fit(self, X, y):
     # store the last L
     self.transformer_ = L
     self.n_iter_ = it
+    self.targets_ = target_neighbors
+    self.impostors_ = impostors
     return self
 
-  def _loss_grad(self, X, L, dfG, impostors, it, k, reg, target_neighbors, df,
-                 a1, a2):
+  def _loss_grad(self, X, L, y, dfG, it, k, reg, target_neighbors):
     # Compute pairwise distances under current metric
-    Lx = L.dot(X.T).T
-    g0 = _inplace_paired_L2(*Lx[impostors])
-    Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:, None, :])
-    g1, g2 = Ni[impostors]
-    # compute the gradient
-    total_active = 0
-    for nn_idx in reversed(xrange(k)):
-      act1 = g0 < g1[:, nn_idx]
-      act2 = g0 < g2[:, nn_idx]
-      total_active += act1.sum() + act2.sum()
-
-      if it > 1:
-        plus1 = act1 & ~a1[nn_idx]
-        minus1 = a1[nn_idx] & ~act1
-        plus2 = act2 & ~a2[nn_idx]
-        minus2 = a2[nn_idx] & ~act2
-      else:
-        plus1 = act1
-        plus2 = act2
-        minus1 = np.zeros(0, dtype=int)
-        minus2 = np.zeros(0, dtype=int)
-
-      targets = target_neighbors[:, nn_idx]
-      PLUS, pweight = _count_edges(plus1, plus2, impostors, targets)
-      df += _sum_outer_products(X, PLUS[:, 0], PLUS[:, 1], pweight)
-      MINUS, mweight = _count_edges(minus1, minus2, impostors, targets)
-      df -= _sum_outer_products(X, MINUS[:, 0], MINUS[:, 1], mweight)
-
-      in_imp, out_imp = impostors
-      df += _sum_outer_products(X, in_imp[minus1], out_imp[minus1])
-      df += _sum_outer_products(X, in_imp[minus2], out_imp[minus2])
-
-      df -= _sum_outer_products(X, in_imp[plus1], out_imp[plus1])
-      df -= _sum_outer_products(X, in_imp[plus2], out_imp[plus2])
-
-      a1[nn_idx] = act1
-      a2[nn_idx] = act2
-    # do the gradient update
-    assert not np.isnan(df).any()
-    G = dfG * reg + df * (1 - reg)
-    G = L.dot(G)
-    # compute the objective function
-    objective = total_active * (1 - reg)
-    objective += G.flatten().dot(L.flatten())
-    return 2 * G, objective, total_active, df, a1, a2
-
-  def _select_targets(self, X, label_inds):
+    Lx = X.dot(L.T)
+    n_samples = X.shape[0]
+    target_dist = np.sum((Lx[:, None] - Lx[target_neighbors])**2, axis=2)
+    # TODO: re-ordering the target neighbors in place might be more efficient
+    target_idx_sorted = np.take_along_axis(target_neighbors,
+                                           target_dist.argsort(axis=1), 1)
+    target_dist = np.sort(target_dist, axis=1)
+    total_active, push_loss = 0, 0
+    weights = np.zeros((n_samples, n_samples))
+    for c in np.unique(y):  # could maybe avoid this loop and vectorize
+      same_label = np.where(y == c)[0]  # TODO: this could be pre-computed
+      diff_label = np.where(y != c)[0]
+      imp_dist = pairwise_distances(Lx[same_label], Lx[diff_label],
+                                    squared=True)
+      # TODO: maybe do these computations with some kind of count
+      for nn_idx in reversed(xrange(k)):  # could maybe avoid this loop and
+        # vectorize
+        # TODO: simplify indexing when possible
+        margins = target_dist[same_label][:, nn_idx][:, None] + 1 - imp_dist
+        active = margins > 0
+        # impostors farther than the margin do not need to be considered
+        # anymore
+        actives = np.sum(active, axis=1)  # number of active impostors for
+        # each sample of the current class
+        current_total_actives = np.sum(actives)
+        total_active += current_total_actives
+        pos_margins = margins[active]
+        push_loss += (1 - reg) * np.sum(pos_margins)
+
+        weights[same_label,
+                target_idx_sorted[same_label][:, nn_idx].ravel()] -= actives
+        weights[target_idx_sorted[same_label][:, nn_idx].ravel(),
+                same_label] -= actives
+        weights[target_idx_sorted[same_label][:, nn_idx].ravel(),
+                target_idx_sorted[same_label][:, nn_idx].ravel()] += actives
+        weights[diff_label, diff_label] -= np.sum(active, axis=0)
+        # TODO: be careful, the line above may be wrong
+        weights[diff_label[:, None], same_label[None]] += active.T
+        weights[same_label[:, None], diff_label[None]] += active
+
+    # TODO: some of these updates could maybe be accumulated once at the end
+    # of the loop on nn_idx
+    # TODO: the diagonal updates could be optimized further (something like
+    # 3 * X instead of 3 * np.eye(n).dot(X))
+    push_grad = ((1 - reg) * weights.T.dot(Lx)).T.dot(X)  # TODO: optimize the
+    # order of operations, as in NCA
+    # TODO: do a better sparse multiplication (avoid the transpose)
+    pull_grad = L.dot(dfG * reg)  # could be computed from Lx when d >> n
+
+    pull_loss = reg * np.sum(target_dist)
+    grad = push_grad + pull_grad
+    grad *= 2
+    it += 1
+    objective = pull_loss + push_loss
+
+    return grad, objective, total_active
+
+  def _select_targets(self, L, X, label_inds):
     target_neighbors = np.empty((X.shape[0], self.k), dtype=int)
     for label in self.labels_:
       inds, = np.nonzero(label_inds == label)
-      dd = euclidean_distances(X[inds], squared=True)
+      dd = euclidean_distances(X.dot(L.T)[inds], squared=True)
       np.fill_diagonal(dd, np.inf)
       nn = np.argsort(dd)[..., :self.k]
       target_neighbors[inds] = inds[nn]
     return target_neighbors
 
-  def _find_impostors(self, furthest_neighbors, X, label_inds):
-    Lx = self.transform(X)
+  def _find_impostors(self, L, furthest_neighbors, X, label_inds):
+    Lx = X.dot(L.T)
     margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx)
     impostors = []
     for label in self.labels_[:-1]:
       in_inds, = np.nonzero(label_inds == label)
-      out_inds, = np.nonzero(label_inds > label)
+      out_inds, = np.nonzero(label_inds > label)  # TODO: not sure why '>':
+      # probably to go through each pair of labels only once and avoid
+      # symmetric duplicates
       dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True)
       i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None])
       i2,j2 = np.nonzero(dist < margin_radii[in_inds])
@@ -335,6 +363,7 @@ def _count_edges(act1, act2, impostors, targets):
 
 
 def _sum_outer_products(data, a_inds, b_inds, weights=None):
+  # TODO: now that this is only used once, consider inlining or replacing it
   Xab = data[a_inds] - data[b_inds]
   if weights is not None:
     return np.dot(Xab.T, Xab * weights[:,None])
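
Editorial note (not part of the patch): in the new _loss_grad above, for each class the margin of a (sample, target neighbor, impostor) triplet is target_dist + 1 - imp_dist, and only the triplets with a positive margin are active and contribute to the push loss. The standalone snippet below sketches that computation for a single class and a single target neighbor; the variable names mirror the patch, the numbers are made up, and it assumes (as the patch does) that pairwise_distances forwards squared=True to the euclidean metric.

    import numpy as np
    from sklearn.metrics import pairwise_distances

    # toy projected data Lx (already multiplied by L) with two classes
    Lx = np.array([[0.], [0.2], [1.], [3.]])
    y = np.array([0, 0, 1, 1])
    same_label = np.where(y == 0)[0]   # samples of the class at hand
    diff_label = np.where(y != 0)[0]   # candidate impostors
    # squared distance of each class-0 sample to its (single) target neighbor
    target_dist = np.array([0.04, 0.04])
    # squared distances to the points of the other classes
    imp_dist = pairwise_distances(Lx[same_label], Lx[diff_label], squared=True)
    margins = target_dist[:, None] + 1 - imp_dist
    active = margins > 0               # impostors strictly inside the margin
    print(np.sum(active, axis=1))      # [1 1]: one active impostor per sample
    print(np.sum(margins[active]))     # 0.44: hinge part of the push loss,
                                       # before the (1 - reg) factor
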
diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index c49c9ef5..2efb395e 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -2,11 +2,12 @@
 import re
 import pytest
 import numpy as np
+import scipy
 from scipy.optimize import check_grad, approx_fprime
 from six.moves import xrange
-from sklearn.metrics import pairwise_distances
+from sklearn.metrics import pairwise_distances, euclidean_distances
 from sklearn.datasets import (load_iris, make_classification, make_regression,
-                              make_spd_matrix)
+                              make_spd_matrix, make_blobs)
 from numpy.testing import (assert_array_almost_equal, assert_array_equal,
                            assert_allclose)
 from sklearn.utils.testing import assert_warns_message
@@ -292,6 +293,147 @@ def test_changed_behaviour_warning(self):
     assert any(msg == str(wrn.message) for wrn in raised_warning)
 
 
+def test_loss_func(capsys):
+  """Test the loss function (and its gradient) on a simple example, by
+  comparing the results of the actual metric-learn implementation with those
+  of a very simple (but non-performant) reference implementation."""
+  # TODO: we need to find an example where there are still some impostors at
+  # the beginning and their number decreases
+  # TODO: ideally we would like a test where the number of active constraints
+  # decreases
+  # TODO: apparently this worked on master, which does not recompute the
+  # impostors, so see if this test can be improved to really check the
+  # impostors recomputation (or at least that the impostors at init are not
+  # all the impostors)
+  def hinge(a):
+    if a > 0:
+      return a, 1
+    else:
+      return 0, 0
+
+  def loss_fn(L, X, y, target_neighbors, regularization):
+    # warning to self: this is probably wrong, see the test that was there
+    # before
+    L = L.reshape(-1, X.shape[1])
+    Lx = np.dot(X, L.T)
+    loss = 0
+    total_active = 0
+    grad = np.zeros_like(L)
+    outer = np.zeros((X.shape[0], X.shape[0]))
+    for i in range(X.shape[0]):
+      for j in target_neighbors[i]:
+        loss += (1 - regularization) * np.sum((Lx[i] - Lx[j])**2)
+        grad += (1 - regularization) * np.outer(Lx[i] - Lx[j], X[i] - X[j])
+        for l in range(X.shape[0]):
+          if y[i] != y[l]:
+            hin, active = hinge(1 + np.sum((Lx[i] - Lx[j])**2) -
                                np.sum((Lx[i] - Lx[l])**2))
+            total_active += active  # count this constraint as active
+            if active:
+              loss += regularization * hin
+              grad += (regularization *
+                       (np.outer(Lx[i] - Lx[j], X[i] - X[j])
+                        - np.outer(Lx[i] - Lx[l], X[i] - X[l])))
+              outer[i, j] -= 1
+              outer[j, i] -= 1
+              outer[j, j] += 1
+              outer[l, l] -= 1
+              outer[l, i] += 1
+              outer[i, l] += 1
+    grad = 2 * grad
+    return grad, loss, total_active
+
+  # TODO: keep this but make it lighter
+  # # we check that the gradient we have computed in the test is indeed the
+  # # true gradient on a toy example:
+  # X, y = make_classification(random_state=42, class_sep=0.1, n_features=20)
+  #
+  # def _select_targets(X, y, k):
+  #   target_neighbors = np.empty((X.shape[0], k), dtype=int)
+  #   for label in np.unique(y):
+  #     inds, = np.nonzero(y == label)
+  #     dd = euclidean_distances(X[inds], squared=True)
+  #     np.fill_diagonal(dd, np.inf)
+  #     nn = np.argsort(dd)[..., :k]
+  #     target_neighbors[inds] = inds[nn]
+  #   return target_neighbors
+  #
+  # target_neighbors = _select_targets(X, y, 2)
+  # regularization = 0.5
+  # x0 = np.random.randn(5, 20)  # TODO: take smaller x0, X, y
+  #
+  # def loss(x0):
+  #   return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors,
+  #                  regularization)[1]
+  #
+  # def grad(x0):
+  #   return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors,
+  #                  regularization)[0].ravel()
+  #
+  # scipy.optimize.check_grad(loss, grad, x0.ravel())
+
+  class LMNN_nonperformant(LMNN):
+
+    def fit(self, X, y):
+      self.y = y
+      return super(LMNN_nonperformant, self).fit(X, y)
+
+    def _loss_grad(self, X, L, y, dfG, it, k, reg, target_neighbors):
+      grad, loss, total_active = loss_fn(L.ravel(), X, self.y,
+                                         target_neighbors,
+                                         self.regularization)
+      return grad, loss, total_active
+
+  # test that the objective function never has twice the same value
+  # see https://github.com/metric-learn/metric-learn/issues/88
+  X, y = make_classification(n_samples=10, n_classes=2, n_features=2,
+                             n_redundant=0, shuffle=True,
+                             scale=[1, 20], random_state=42)
+  lmnn_perf = LMNN(verbose=True, random_state=42, init='identity',
+                   max_iter=10)
+  lmnn_nonperf = LMNN_nonperformant(verbose=True, random_state=42,
+                                    init='identity', max_iter=10)
+  objectives, obj_diffs, grads, total_active = dict(), dict(), dict(), dict()
+  for algo, name in zip([lmnn_perf, lmnn_nonperf], ['perf', 'nonperf']):
+    algo.fit(X, y)
+    out, _ = capsys.readouterr()
+    lines = re.split("\n+", out)
+    # we extract the numbers from each verbose line: the regexp matches a
+    # float that follows an integer (the iteration number), and which is
+    # followed by a (signed) float (the objective delta). It matches for
+    # instance:
+    # 3 **1113.7665747189938** -3.182774197440267 46431 1.0200999999999998e-06
+    # regex for a signed number, allowing scientific notation
+    num = r'(-?\d+\.?\d*(e[+-]\d+)?)'
+    strings = [re.search(r"\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})"
+                         .format(num, num, num), s) for s in lines]
+    objectives[name] = [float(match.group(1)) for match in strings
+                        if match is not None]
+    obj_diffs[name] = [float(match.group(3)) for match in strings
+                       if match is not None]
+    total_active[name] = [float(match.group(5)) for match in strings
+                          if match is not None]
+    grads[name] = [float(match.group(6)) for match in strings
+                   if match is not None]
+    assert len(strings) >= 10  # we ensure that we actually did at least 10
+    # iterations
+    assert total_active[name][0] >= 2  # we ensure that we have some active
+    # constraints (that's the case we want to test)
+    # we remove the last element because it can be equal to the penultimate
+    # if the last gradient update is null
+  np.testing.assert_allclose(objectives['perf'], objectives['nonperf'])
+  np.testing.assert_allclose(obj_diffs['perf'], obj_diffs['nonperf'])
+  np.testing.assert_allclose(total_active['perf'], total_active['nonperf'])
+  np.testing.assert_allclose(grads['perf'], grads['nonperf'])
+
+
 @pytest.mark.parametrize('X, y, loss',
                          [(np.array([[0], [1], [2], [3]]),
                            [1, 1, 0, 0], 3.0),
                          (np.array([[0], [1], [2], [3]]),