diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index ff2206ff5617b..6218e21a7bb19 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -66,7 +66,7 @@
            'linear_model', 'manifold', 'metrics', 'mixture',
            'multiclass', 'naive_bayes', 'neighbors', 'neural_network',
            'pipeline', 'preprocessing', 'qda', 'random_projection',
            'semi_supervised',
-           'svm', 'tree', 'discriminant_analysis',
+           'svm', 'tree', 'discriminant_analysis', 'metric_learning',
            # Non-modules:
            'clone']
diff --git a/sklearn/metric_learning/NCA.py b/sklearn/metric_learning/NCA.py
new file mode 100644
index 0000000000000..dd324a3760c5c
--- /dev/null
+++ b/sklearn/metric_learning/NCA.py
@@ -0,0 +1,270 @@
+# -*- coding: utf-8 -*-
+"""
+@author: thiolliere and Yuan Tang (terrytangyuan)
+"""
+import numpy as np
+import scipy.optimize as opt
+from sklearn.base import BaseEstimator
+
+
+class NCAcost(object):
+
+    @staticmethod
+    def cost(A, X, y, threshold=None):
+        """Compute the cost function and the gradient.
+
+        This is the objective function to be minimized.
+
+        Parameters:
+        -----------
+        A : array-like
+            Projection matrix, shape = [dim, n_features] with
+            dim <= n_features
+        X : array-like
+            Training data, shape = [n_features, n_samples]
+        y : array-like
+            Target values, shape = [n_samples]
+        threshold : float, optional
+            Softmax probabilities below ``threshold`` are clipped to 0 and
+            probabilities above ``1 - threshold`` are clipped to 1.
+
+        Returns:
+        --------
+        f : float
+            The value of the objective function
+        gradf : array-like
+            The gradient of the objective function,
+            shape = [dim * n_features]
+        """
+        (D, N) = np.shape(X)
+        A = np.reshape(A, (np.size(A) // np.size(X, axis=0),
+                           np.size(X, axis=0)))
+        (d, aux) = np.shape(A)
+        assert D == aux
+
+        AX = np.dot(A, X)
+        normAX = np.linalg.norm(AX[:, :, None] - AX[:, None, :], axis=0)
+
+        denomSum = np.sum(np.exp(-normAX), axis=0)
+        Pij = np.exp(-normAX) / denomSum[:, None]
+        if threshold is not None:
+            Pij[Pij < threshold] = 0
+            Pij[Pij > 1 - threshold] = 1
+
+        mask = (y != y[:, None])
+        Pijmask = np.ma.masked_array(Pij, mask)
+        P = np.array(np.sum(Pijmask, axis=1))
+        mask = np.logical_not(mask)
+
+        f = np.sum(P)
+
+        Xi = X[:, :, None] - X[:, None, :]
+        Xi = np.swapaxes(Xi, 0, 2)
+
+        Xi = Pij[:, :, None] * Xi
+
+        Xij = Xi[:, :, :, None] * Xi[:, :, None, :]
+
+        gradf = np.sum(P[:, None, None] * np.sum(Xij, axis=1), axis=0)
+
+        # To optimize (use mask?)
+        for i in range(N):
+            aux = np.sum(Xij[i, mask[i]], axis=0)
+            gradf -= aux
+
+        gradf = 2 * np.dot(A, gradf)
+        gradf = -np.reshape(gradf, np.size(gradf))
+        f = np.size(X, 1) - f
+
+        return [f, gradf]
+
+    @staticmethod
+    def f(A, X, y):
+        return NCAcost.cost(A, X, y)[0]
+
+    @staticmethod
+    def grad(A, X, y):
+        return NCAcost.cost(A, X, y)[1]
+
+    @staticmethod
+    def cost_g(A, X, y, threshold=None):
+        """Compute the cost function and the gradient for the K-L divergence.
+
+        Parameters:
+        -----------
+        A : array-like
+            Projection matrix, shape = [dim, n_features] with
+            dim <= n_features
+        X : array-like
+            Training data, shape = [n_features, n_samples]
+        y : array-like
+            Target values, shape = [n_samples]
+        threshold : float, optional
+            Softmax probabilities below ``threshold`` are clipped to 0 and
+            probabilities above ``1 - threshold`` are clipped to 1.
+
+        Returns:
+        --------
+        g : float
+            The value of the objective function
+        gradg : array-like
+            The gradient of the objective function,
+            shape = [dim * n_features]
+        """
+        (D, N) = np.shape(X)
+        A = np.reshape(A, (np.size(A) // np.size(X, axis=0),
+                           np.size(X, axis=0)))
+        (d, aux) = np.shape(A)
+        assert D == aux
+
+        AX = np.dot(A, X)
+        normAX = np.linalg.norm(AX[:, :, None] - AX[:, None, :], axis=0)
+
+        denomSum = np.sum(np.exp(-normAX), axis=0)
+        Pij = np.exp(-normAX) / denomSum[:, None]
+        if threshold is not None:
+            Pij[Pij < threshold] = 0
+            Pij[Pij > 1 - threshold] = 1
+
+        mask = (y != y[:, None])
+        Pijmask = np.ma.masked_array(Pij, mask)
+        P = np.array(np.sum(Pijmask, axis=1))
+        mask = np.logical_not(mask)
+
+        g = np.sum(np.log(P))
+
+        Xi = X[:, :, None] - X[:, None, :]
+        Xi = np.swapaxes(Xi, 0, 2)
+
+        Xi = Pij[:, :, None] * Xi
+
+        Xij = Xi[:, :, :, None] * Xi[:, :, None, :]
+
+        gradg = np.sum(np.sum(Xij, axis=1), axis=0)
+
+        # To optimize (use mask?)
+        for i in range(N):
+            aux = np.sum(Xij[i, mask[i]], axis=0) / P[i]
+            gradg -= aux
+
+        gradg = 2 * np.dot(A, gradg)
+        gradg = -np.reshape(gradg, np.size(gradg))
+        g = -g
+
+        return [g, gradg]
+
+
+class NCA(BaseEstimator):
+
+    def __init__(self, metric=None, dim=None,
+                 threshold=None, objective='mahalanobis', **kwargs):
+        """Classification and/or dimensionality reduction with
+        Neighborhood Components Analysis (NCA).
+
+        The algorithm applies a softmax function in the transformed space
+        and tries to maximize the expected leave-one-out classification
+        accuracy.
+
+        Parameters:
+        -----------
+        metric : array-like, optional
+            The initial projection matrix,
+            shape = [dim, n_features] with dim <= n_features being the
+            dimension of the output space. If not given, the algorithm
+            starts from an identity-like projection.
+        dim : int, optional
+            The number of dimensions to keep for dimensionality reduction.
+            If not given, the algorithm won't perform dimensionality
+            reduction.
+        threshold : float, optional
+            Threshold for the softmax function, set it higher to discard
+            further neighbors.
+        objective : string, optional
+            The objective function to optimize. The two implemented cost
+            functions are for the Mahalanobis distance ('mahalanobis') and
+            the KL-divergence ('kl-divergence').
+        **kwargs : keyword arguments, optional
+            See scipy.optimize.minimize for the list of additional
+            arguments. Those arguments include:
+
+            method : string
+                The algorithm to use for optimization.
+            options : dict
+                A dictionary of solver options
+            hess, hessp : callable
+                Hessian matrix or Hessian-vector product
+            bounds : sequence
+                Bounds for variables
+            constraints : dict or sequence of dict
+                Constraints definition
+            tol : float
+                Tolerance for termination
+
+        Attributes:
+        -----------
+        metric : array-like
+            The learned distance metric (projection matrix)
+        """
+        self.metric = metric
+        self.dim = dim
+        self.threshold = threshold
+        if objective == 'mahalanobis':
+            self.objective = NCAcost.cost
+        elif objective == 'kl-divergence':
+            self.objective = NCAcost.cost_g
+        else:
+            raise ValueError("objective must be 'mahalanobis' or "
+                             "'kl-divergence', got %r" % objective)
+        self.kwargs = kwargs
+
+    def fit(self, X, y):
+        """Fit the model using X as training data and y as target values.
+
+        Parameters:
+        -----------
+        X : array-like
+            Training data, shape = [n_features, n_samples]
+        y : array-like
+            Target values, shape = [n_samples]
+        """
+        if self.metric is None:
+            if self.dim is None:
+                self.metric = np.eye(np.size(X, 0))
+                self.dim = np.size(X, 0)
+            else:
+                self.metric = np.eye(self.dim, np.size(X, 0))
+
+        res = opt.minimize(fun=self.objective,
+                           x0=np.ravel(self.metric),
+                           args=(X, y, self.threshold),
+                           jac=True,
+                           **self.kwargs
+                           )
+
+        self.metric = np.reshape(res.x,
+                                 (np.size(res.x) // np.size(X, 0),
+                                  np.size(X, 0)))
+
+        return self
+
+    def fit_transform(self, X, y):
+        """Fit the model with X and apply the dimensionality reduction on X.
+
+        Parameters:
+        -----------
+        X : array-like
+            Training data, shape = [n_features, n_samples]
+        y : array-like
+            Target values, shape = [n_samples]
+
+        Returns:
+        --------
+        X_new : array-like
+            shape = [dim, n_samples]
+        """
+        self.fit(X, y)
+        return np.dot(self.metric, X)
+
+    def score(self, X, y):
+        """Return the proportion of X correctly classified by the
+        leave-one-out classification.
+
+        Parameters:
+        -----------
+        X : array-like
+            Training data, shape = [n_features, n_samples]
+        y : array-like
+            Target values, shape = [n_samples]
+
+        Returns:
+        --------
+        score : float
+            The proportion of X correctly classified
+        """
+        return 1 - NCAcost.cost(self.metric, X, y)[0] / np.size(X, 1)
+
+    def getParameters(self):
+        """Return a dictionary of the parameters."""
+        return dict(metric=self.metric, dim=self.dim,
+                    objective=self.objective, threshold=self.threshold,
+                    **self.kwargs)
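Note on verification (not part of the diff): since fit passes jac=True to scipy.optimize.minimize, the analytic gradients returned by NCAcost.cost and NCAcost.cost_g must be consistent with their objectives. One way to sanity-check this during review is scipy.optimize.check_grad together with the NCAcost.f / NCAcost.grad helpers defined above. The snippet below is only a sketch; the toy data, shapes, and random seed are made up for illustration:

    import numpy as np
    from scipy.optimize import check_grad
    from sklearn.metric_learning.NCA import NCAcost

    rng = np.random.RandomState(0)
    X = rng.rand(3, 40)             # [n_features, n_samples], as in this module
    y = rng.randint(0, 2, size=40)  # binary labels
    A0 = rng.rand(2, 3).ravel()     # flattened initial projection matrix

    # check_grad compares the analytic gradient with finite differences;
    # a small return value indicates the two are consistent.
    print(check_grad(NCAcost.f, NCAcost.grad, A0, X, y))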
diff --git a/sklearn/metric_learning/__init__.py b/sklearn/metric_learning/__init__.py
new file mode 100644
index 0000000000000..f6b2439aa4434
--- /dev/null
+++ b/sklearn/metric_learning/__init__.py
@@ -0,0 +1,8 @@
+"""
+The :mod:`sklearn.metric_learning` module implements metric learning models.
+
+The algorithms that have been implemented are:
+Neighborhood Components Analysis (NCA)
+"""
+
+__all__ = ['NCA']
diff --git a/sklearn/tests/test_nca.py b/sklearn/tests/test_nca.py
new file mode 100644
index 0000000000000..f7d7a27aadf27
--- /dev/null
+++ b/sklearn/tests/test_nca.py
@@ -0,0 +1,24 @@
+import numpy as np
+
+from sklearn.metric_learning import NCA
+
+N = 300
+aux = np.concatenate([0.5 * np.ones((N // 2, 1)),
+                      np.zeros((N // 2, 1)),
+                      1.1 * np.ones((N // 2, 1))], axis=1)
+X = np.concatenate([np.random.rand(N // 2, 3),
+                    np.random.rand(N // 2, 3) + aux])
+
+y = np.concatenate([np.concatenate([np.ones((N // 2, 1)),
+                                    np.zeros((N // 2, 1))]),
+                    np.concatenate([np.zeros((N // 2, 1)),
+                                    np.ones((N // 2, 1))])],
+                   axis=1)
+X = X.T
+y = y[:, 0]
+A = np.array([[1, 0, 0], [0, 1, 0]])
+
+
+def test_NCA():
+    # Training
+    nca = NCA.NCA(metric=A, method='BFGS', objective='kl-divergence',
+                  options={'maxiter': 10, 'disp': True})
+    print(nca.score(X, y))
+    nca.fit(X, y)
+    print(nca.score(X, y))
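For context, a minimal usage sketch of the estimator added in this PR, assuming the branch is installed; note that, unlike most scikit-learn estimators, this module expects X with shape [n_features, n_samples]. The data and initial projection here are made up for illustration:

    import numpy as np
    from sklearn.metric_learning.NCA import NCA

    rng = np.random.RandomState(0)
    # Two classes in 3 features, stored as [n_features, n_samples].
    X = np.concatenate([rng.rand(50, 3),
                        rng.rand(50, 3) + [1.0, 0.0, 1.0]]).T
    y = np.concatenate([np.zeros(50), np.ones(50)])

    # Start from a 2 x 3 projection to also reduce dimensionality.
    nca = NCA(metric=np.array([[1., 0., 0.], [0., 1., 0.]]),
              method='BFGS', options={'maxiter': 10})
    X_2d = nca.fit_transform(X, y)   # shape (2, 100)
    print(nca.score(X, y))           # leave-one-out score on training data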