diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
index 20eeea3b..bb046a95 100644
--- a/metric_learn/lmnn.py
+++ b/metric_learn/lmnn.py
@@ -14,17 +14,25 @@
 from __future__ import print_function, absolute_import
 import numpy as np
+import scipy
 import warnings
 from collections import Counter
 from six.moves import xrange
 from sklearn.exceptions import ChangedBehaviorWarning
-from sklearn.metrics import euclidean_distances
+from sklearn.metrics import euclidean_distances, pairwise_distances
 from sklearn.base import TransformerMixin
 
 from ._util import _initialize_transformer, _check_n_components
 from .base_metric import MahalanobisMixin
 
 
+def re_order_target_neighbors(L, X, target_neighbors):
+  Xl = np.dot(X, L.T)
+  dd = np.sum((Xl[:, None, :] - Xl[target_neighbors])**2, axis=2)
+  sorted_neighbors = np.take_along_axis(target_neighbors, dd.argsort(axis=1), 1)
+  return sorted_neighbors
+
+
 class LMNN(MahalanobisMixin, TransformerMixin):
   def __init__(self, init=None, k=3, min_iter=50, max_iter=1000,
                learn_rate=1e-7, regularization=0.5, convergence_tol=0.001,
@@ -155,8 +163,9 @@ def fit(self, X, y):
       raise ValueError('not enough class labels for specified k'
                        ' (smallest class has %d)' % required_k)
 
-    target_neighbors = self._select_targets(X, label_inds)
-    impostors = self._find_impostors(target_neighbors[:, -1], X, label_inds)
+    target_neighbors = self._select_targets(self.transformer_, X, label_inds)
+    impostors = self._find_impostors(self.transformer_,
+                                     target_neighbors[:, -1], X, label_inds)
     if len(impostors) == 0:
       # L has already been initialized to an identity matrix
       return
@@ -164,26 +173,21 @@ def fit(self, X, y):
     # sum outer products
     dfG = _sum_outer_products(X, target_neighbors.flatten(),
                               np.repeat(np.arange(X.shape[0]), k))
-    df = np.zeros_like(dfG)
-
-    # storage
-    a1 = [None]*k
-    a2 = [None]*k
-    for nn_idx in xrange(k):
-      a1[nn_idx] = np.array([])
-      a2[nn_idx] = np.array([])
 
     # initialize L
     L = self.transformer_
 
     # first iteration: we compute variables (including objective and gradient)
     # at initialization point
-    G, objective, total_active, df, a1, a2 = (
-        self._loss_grad(X, L, dfG, impostors, 1, k, reg, target_neighbors, df,
-                        a1, a2))
+    G, objective, total_active = self._loss_grad(X, L, y, dfG, 1,
+                                                 k, reg, target_neighbors)
+
+    # TODO: need to print the log here
 
     it = 1  # we already made one iteration
+    print(it, objective, 0, total_active, 1.05e-5)  # TODO: replace the
+    # hard-coded value by the real learning rate; it is only there to avoid a
+    # bug when printing
 
     # main loop
     for it in xrange(2, self.max_iter):
       # then at each iteration, we try to find a value of L that has better
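
Editorial note (not part of the patch): the new re_order_target_neighbors helper added at the top of this file, and the re-ordering TODO in the next hunk, rely on the fact that which target neighbor is closest depends on the current transformation L. The standalone sketch below copies the helper from the patch and runs it on made-up data (X, target_neighbors and the diagonal L are illustrative values only), to show that the per-row neighbor order can change once L stretches one axis.

    import numpy as np

    def re_order_target_neighbors(L, X, target_neighbors):
      # project the data with the current linear transformation
      Xl = np.dot(X, L.T)
      # squared distance from each sample to each of its target neighbors
      dd = np.sum((Xl[:, None, :] - Xl[target_neighbors]) ** 2, axis=2)
      # re-order each row of target_neighbors from closest to furthest
      return np.take_along_axis(target_neighbors, dd.argsort(axis=1), 1)

    X = np.array([[0., 0.], [2., 0.], [0., 1.], [5., 5.]])
    target_neighbors = np.array([[1, 2], [0, 2], [0, 1], [0, 1]])
    print(re_order_target_neighbors(np.eye(2), X, target_neighbors))
    # -> [[2 1], [0 2], [0 1], [1 0]]: under the identity, sample 0's closest
    #    target neighbor is sample 2
    print(re_order_target_neighbors(np.diag([1., 10.]), X, target_neighbors))
    # -> [[1 2], [0 2], [0 1], [1 0]]: stretching the second axis makes
    #    sample 1 the closest target neighbor of sample 0
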
@@ -194,10 +198,12 @@ def fit(self, X, y):
       # we compute the objective at next point
       # we copy variables that can be modified by _loss_grad, because if we
       # retry we don t want to modify them several times
-      (G_next, objective_next, total_active_next, df_next, a1_next,
-       a2_next) = (
-          self._loss_grad(X, L_next, dfG, impostors, it, k, reg,
-                          target_neighbors, df.copy(), list(a1), list(a2)))
+      # target_neighbors_next = self._select_targets(L_next, X, label_inds)
+      # TODO: just re-order the target neighbors instead of re-selecting them
+
+      (G_next, objective_next, total_active_next) = (
+          self._loss_grad(X, L_next, label_inds, dfG, it, k, reg,
+                          target_neighbors))
       assert not np.isnan(objective)
       delta_obj = objective_next - objective
       if delta_obj > 0:
@@ -212,8 +218,7 @@ def fit(self, X, y):
         # old variables to these new ones before next iteration and we
         # slightly increase the learning rate
         L = L_next
-        G, df, objective, total_active, a1, a2 = (
-          G_next, df_next, objective_next, total_active_next, a1_next, a2_next)
+        G, objective, total_active = G_next, objective_next, total_active_next
         learn_rate *= 1.01
 
       if self.verbose:
@@ -231,74 +236,97 @@ def fit(self, X, y):
     # store the last L
     self.transformer_ = L
     self.n_iter_ = it
+    self.targets_ = target_neighbors
+    self.impostors_ = impostors
     return self
 
-  def _loss_grad(self, X, L, dfG, impostors, it, k, reg, target_neighbors, df,
-                 a1, a2):
+  def _loss_grad(self, X, L, y, dfG, it, k, reg, target_neighbors):
     # Compute pairwise distances under current metric
-    Lx = L.dot(X.T).T
-    g0 = _inplace_paired_L2(*Lx[impostors])
-    Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:, None, :])
-    g1, g2 = Ni[impostors]
-    # compute the gradient
-    total_active = 0
-    for nn_idx in reversed(xrange(k)):
-      act1 = g0 < g1[:, nn_idx]
-      act2 = g0 < g2[:, nn_idx]
-      total_active += act1.sum() + act2.sum()
-
-      if it > 1:
-        plus1 = act1 & ~a1[nn_idx]
-        minus1 = a1[nn_idx] & ~act1
-        plus2 = act2 & ~a2[nn_idx]
-        minus2 = a2[nn_idx] & ~act2
-      else:
-        plus1 = act1
-        plus2 = act2
-        minus1 = np.zeros(0, dtype=int)
-        minus2 = np.zeros(0, dtype=int)
-
-      targets = target_neighbors[:, nn_idx]
-      PLUS, pweight = _count_edges(plus1, plus2, impostors, targets)
-      df += _sum_outer_products(X, PLUS[:, 0], PLUS[:, 1], pweight)
-      MINUS, mweight = _count_edges(minus1, minus2, impostors, targets)
-      df -= _sum_outer_products(X, MINUS[:, 0], MINUS[:, 1], mweight)
-
-      in_imp, out_imp = impostors
-      df += _sum_outer_products(X, in_imp[minus1], out_imp[minus1])
-      df += _sum_outer_products(X, in_imp[minus2], out_imp[minus2])
-
-      df -= _sum_outer_products(X, in_imp[plus1], out_imp[plus1])
-      df -= _sum_outer_products(X, in_imp[plus2], out_imp[plus2])
-
-      a1[nn_idx] = act1
-      a2[nn_idx] = act2
-    # do the gradient update
-    assert not np.isnan(df).any()
-    G = dfG * reg + df * (1 - reg)
-    G = L.dot(G)
-    # compute the objective function
-    objective = total_active * (1 - reg)
-    objective += G.flatten().dot(L.flatten())
-    return 2 * G, objective, total_active, df, a1, a2
-
-  def _select_targets(self, X, label_inds):
+    Lx = X.dot(L.T)
+    n_samples = X.shape[0]
+    target_dist = np.sum((Lx[:, None] - Lx[target_neighbors])**2, axis=2)
+    # TODO: re-ordering the target neighbors in place might be more efficient
+    target_idx_sorted = np.take_along_axis(target_neighbors,
+                                           target_dist.argsort(axis=1), 1)
+    target_dist = np.sort(target_dist, axis=1)
+    total_active, push_loss = 0, 0
+    weights = np.zeros((n_samples, n_samples))
+    for c in np.unique(y):  # could maybe avoid this loop and vectorize
+      same_label = np.where(y == c)[0]  # TODO: this could be pre-computed
+      diff_label = np.where(y != c)[0]
+      imp_dist = pairwise_distances(Lx[same_label], Lx[diff_label],
+                                    squared=True)
+      # TODO: maybe do these computations with some kind of count
+      for nn_idx in reversed(xrange(k)):  # could maybe avoid this loop and
+        # vectorize
+        # TODO: simplify indexing when possible
+        margins = target_dist[same_label][:, nn_idx][:, None] + 1 - imp_dist
+        active = margins > 0
+        # impostors farther than the margin do not need to be considered
+        # anymore
+        actives = np.sum(active, axis=1)  # number of active impostors for
+        # each sample of the current class
+        current_total_actives = np.sum(actives)
+        total_active += current_total_actives
+        pos_margins = margins[active]
+        push_loss += (1 - reg) * np.sum(pos_margins)
+
+        weights[same_label,
+                target_idx_sorted[same_label][:, nn_idx].ravel()] -= actives
+        weights[target_idx_sorted[same_label][:, nn_idx].ravel(),
+                same_label] -= actives
+        weights[target_idx_sorted[same_label][:, nn_idx].ravel(),
+                target_idx_sorted[same_label][:, nn_idx].ravel()] += actives
+        weights[diff_label, diff_label] -= np.sum(active, axis=0)
+        # TODO: be careful, the line above may be wrong
+        weights[diff_label[:, None], same_label[None]] += active.T
+        weights[same_label[:, None], diff_label[None]] += active
+
+    # TODO: some of these updates could maybe be accumulated once at the end
+    # of the loop on nn_idx
+    # TODO: the diagonal updates could be optimized further (something like
+    # 3 * X instead of 3 * np.eye(n).dot(X))
+    push_grad = ((1 - reg) * weights.T.dot(Lx)).T.dot(X)  # TODO: optimize the
+    # order of operations, as in NCA
+    # TODO: do a better sparse multiplication (avoid the transpose)
+    pull_grad = L.dot(dfG * reg)  # could be computed from Lx when d >> n
+
+    pull_loss = reg * np.sum(target_dist)
+    grad = push_grad + pull_grad
+    grad *= 2
+    it += 1
+    objective = pull_loss + push_loss
+
+    return grad, objective, total_active
+
+  def _select_targets(self, L, X, label_inds):
     target_neighbors = np.empty((X.shape[0], self.k), dtype=int)
     for label in self.labels_:
       inds, = np.nonzero(label_inds == label)
-      dd = euclidean_distances(X[inds], squared=True)
+      dd = euclidean_distances(X.dot(L.T)[inds], squared=True)
       np.fill_diagonal(dd, np.inf)
       nn = np.argsort(dd)[..., :self.k]
       target_neighbors[inds] = inds[nn]
     return target_neighbors
 
-  def _find_impostors(self, furthest_neighbors, X, label_inds):
-    Lx = self.transform(X)
+  def _find_impostors(self, L, furthest_neighbors, X, label_inds):
+    Lx = X.dot(L.T)
     margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx)
     impostors = []
     for label in self.labels_[:-1]:
       in_inds, = np.nonzero(label_inds == label)
-      out_inds, = np.nonzero(label_inds > label)
+      out_inds, = np.nonzero(label_inds > label)  # TODO: not sure why '>':
+      # probably to go through each pair of labels only once and avoid
+      # symmetric duplicates
       dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True)
       i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None])
       i2,j2 = np.nonzero(dist < margin_radii[in_inds])
@@ -335,6 +363,7 @@ def _count_edges(act1, act2, impostors, targets):
 
 
 def _sum_outer_products(data, a_inds, b_inds, weights=None):
+  # TODO: now that this is only used once, consider inlining or replacing it
   Xab = data[a_inds] - data[b_inds]
   if weights is not None:
     return np.dot(Xab.T, Xab * weights[:,None])
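
Editorial note (not part of the patch): in the new _loss_grad above, for each class the margin of a (sample, target neighbor, impostor) triplet is target_dist + 1 - imp_dist, and only the triplets with a positive margin are active and contribute to the push loss. The standalone snippet below sketches that computation for a single class and a single target neighbor; the variable names mirror the patch, the numbers are made up, and it assumes (as the patch does) that pairwise_distances forwards squared=True to the euclidean metric.

    import numpy as np
    from sklearn.metrics import pairwise_distances

    # toy projected data Lx (already multiplied by L) with two classes
    Lx = np.array([[0.], [0.2], [1.], [3.]])
    y = np.array([0, 0, 1, 1])
    same_label = np.where(y == 0)[0]   # samples of the class at hand
    diff_label = np.where(y != 0)[0]   # candidate impostors
    # squared distance of each class-0 sample to its (single) target neighbor
    target_dist = np.array([0.04, 0.04])
    # squared distances to the points of the other classes
    imp_dist = pairwise_distances(Lx[same_label], Lx[diff_label], squared=True)
    margins = target_dist[:, None] + 1 - imp_dist
    active = margins > 0               # impostors strictly inside the margin
    print(np.sum(active, axis=1))      # [1 1]: one active impostor per sample
    print(np.sum(margins[active]))     # 0.44: hinge part of the push loss,
                                       # before the (1 - reg) factor
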
diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index c49c9ef5..2efb395e 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -2,11 +2,12 @@
 import re
 import pytest
 import numpy as np
+import scipy
 from scipy.optimize import check_grad, approx_fprime
 from six.moves import xrange
-from sklearn.metrics import pairwise_distances
+from sklearn.metrics import pairwise_distances, euclidean_distances
 from sklearn.datasets import (load_iris, make_classification, make_regression,
-                              make_spd_matrix)
+                              make_spd_matrix, make_blobs)
 from numpy.testing import (assert_array_almost_equal, assert_array_equal,
                            assert_allclose)
 from sklearn.utils.testing import assert_warns_message
@@ -292,6 +293,147 @@ def test_changed_behaviour_warning(self):
     assert any(msg == str(wrn.message) for wrn in raised_warning)
 
 
+def test_loss_func(capsys):
+  """Test the loss function (and its gradient) on a simple example, by
+  comparing the results of the actual metric-learn implementation with those
+  of a very simple (but non-performant) reference implementation."""
+  # TODO: we need to find an example where there are still some impostors at
+  # the beginning and their number decreases
+  # TODO: ideally we would like a test where the number of active constraints
+  # decreases
+  # TODO: apparently this worked on master, which does not recompute the
+  # impostors, so see if this test can be improved to really check the
+  # impostors recomputation (or at least that the impostors at init are not
+  # all the impostors)
+  def hinge(a):
+    if a > 0:
+      return a, 1
+    else:
+      return 0, 0
+
+  def loss_fn(L, X, y, target_neighbors, regularization):
+    # warning to self: this is probably wrong, see the test that was there
+    # before
+    L = L.reshape(-1, X.shape[1])
+    Lx = np.dot(X, L.T)
+    loss = 0
+    total_active = 0
+    grad = np.zeros_like(L)
+    outer = np.zeros((X.shape[0], X.shape[0]))
+    for i in range(X.shape[0]):
+      for j in target_neighbors[i]:
+        loss += (1 - regularization) * np.sum((Lx[i] - Lx[j])**2)
+        grad += (1 - regularization) * np.outer(Lx[i] - Lx[j], X[i] - X[j])
+        for l in range(X.shape[0]):
+          if y[i] != y[l]:
+            hin, active = hinge(1 + np.sum((Lx[i] - Lx[j])**2) -
                                np.sum((Lx[i] - Lx[l])**2))
+            total_active += active  # count this constraint as active
+            if active:
+              loss += regularization * hin
+              grad += (regularization *
+                       (np.outer(Lx[i] - Lx[j], X[i] - X[j])
+                        - np.outer(Lx[i] - Lx[l], X[i] - X[l])))
+              outer[i, j] -= 1
+              outer[j, i] -= 1
+              outer[j, j] += 1
+              outer[l, l] -= 1
+              outer[l, i] += 1
+              outer[i, l] += 1
+    grad = 2 * grad
+    return grad, loss, total_active
+
+  # TODO: keep this but make it lighter
+  # # we check that the gradient we have computed in the test is indeed the
+  # # true gradient on a toy example:
+  # X, y = make_classification(random_state=42, class_sep=0.1, n_features=20)
+  #
+  # def _select_targets(X, y, k):
+  #   target_neighbors = np.empty((X.shape[0], k), dtype=int)
+  #   for label in np.unique(y):
+  #     inds, = np.nonzero(y == label)
+  #     dd = euclidean_distances(X[inds], squared=True)
+  #     np.fill_diagonal(dd, np.inf)
+  #     nn = np.argsort(dd)[..., :k]
+  #     target_neighbors[inds] = inds[nn]
+  #   return target_neighbors
+  #
+  # target_neighbors = _select_targets(X, y, 2)
+  # regularization = 0.5
+  # x0 = np.random.randn(5, 20)  # TODO: take smaller x0, X, y
+  #
+  # def loss(x0):
+  #   return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors,
+  #                  regularization)[1]
+  #
+  # def grad(x0):
+  #   return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors,
+  #                  regularization)[0].ravel()
+  #
+  # scipy.optimize.check_grad(loss, grad, x0.ravel())
+
+  class LMNN_nonperformant(LMNN):
+
+    def fit(self, X, y):
+      self.y = y
+      return super(LMNN_nonperformant, self).fit(X, y)
+
+    def _loss_grad(self, X, L, y, dfG, it, k, reg, target_neighbors):
+      grad, loss, total_active = loss_fn(L.ravel(), X, self.y,
+                                         target_neighbors,
+                                         self.regularization)
+      return grad, loss, total_active
+
+  # test that the objective function never has twice the same value
+  # see https://github.com/metric-learn/metric-learn/issues/88
+  X, y = make_classification(n_samples=10, n_classes=2, n_features=2,
+                             n_redundant=0, shuffle=True,
+                             scale=[1, 20], random_state=42)
+  lmnn_perf = LMNN(verbose=True, random_state=42, init='identity',
+                   max_iter=10)
+  lmnn_nonperf = LMNN_nonperformant(verbose=True, random_state=42,
+                                    init='identity', max_iter=10)
+  objectives, obj_diffs, grads, total_active = dict(), dict(), dict(), dict()
+  for algo, name in zip([lmnn_perf, lmnn_nonperf], ['perf', 'nonperf']):
+    algo.fit(X, y)
+    out, _ = capsys.readouterr()
+    lines = re.split("\n+", out)
+    # we extract the numbers from each verbose line: the regexp matches a
+    # float that follows an integer (the iteration number), and which is
+    # followed by a (signed) float (the objective delta). It matches for
+    # instance:
+    # 3 **1113.7665747189938** -3.182774197440267 46431 1.0200999999999998e-06
+    # regex for a signed number, allowing scientific notation
+    num = r'(-?\d+\.?\d*(e[+-]\d+)?)'
+    strings = [re.search(r"\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})"
+                         .format(num, num, num), s) for s in lines]
+    objectives[name] = [float(match.group(1)) for match in strings
+                        if match is not None]
+    obj_diffs[name] = [float(match.group(3)) for match in strings
+                       if match is not None]
+    total_active[name] = [float(match.group(5)) for match in strings
+                          if match is not None]
+    grads[name] = [float(match.group(6)) for match in strings
+                   if match is not None]
+    assert len(strings) >= 10  # we ensure that we actually did at least 10
+    # iterations
+    assert total_active[name][0] >= 2  # we ensure that we have some active
+    # constraints (that's the case we want to test)
+    # we remove the last element because it can be equal to the penultimate
+    # if the last gradient update is null
+  np.testing.assert_allclose(objectives['perf'], objectives['nonperf'])
+  np.testing.assert_allclose(obj_diffs['perf'], obj_diffs['nonperf'])
+  np.testing.assert_allclose(total_active['perf'], total_active['nonperf'])
+  np.testing.assert_allclose(grads['perf'], grads['nonperf'])
+
+
 @pytest.mark.parametrize('X, y, loss',
                          [(np.array([[0], [1], [2], [3]]),
                            [1, 1, 0, 0], 3.0),
                          (np.array([[0], [1], [2], [3]]),