diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 00000000..f01db0a4 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,27 @@ +ignore: + - "test" + +# taken from scikit-learn: +# https://github.com/scikit-learn/scikit-learn/blob/a7e17117bb15eb3f51ebccc1bd53e42fcb4e6cd8/.codecov.yml +comment: false + +coverage: + status: + project: + default: + # Commits pushed to master should not make the overall + # project coverage decrease by more than 1%: + target: auto + threshold: 1% + patch: + default: + # Be tolerant on slight code coverage diff on PRs to limit + # noisy red coverage status on github PRs. + # Note The coverage stats are still uploaded + # to codecov so that PR reviewers can see uncovered lines + # in the github diff if they install the codecov browser + # extension: + # https://github.com/codecov/browser-extension + target: auto + threshold: 1% + diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..ae757838 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,56 @@ +--- +name: Reproducible bug report +about: Create a reproducible bug report. Not for support requests. +labels: 'bug' +--- + +#### Description + + +#### Steps/Code to Reproduce + + +#### Expected Results + + +#### Actual Results + + +#### Versions + + + +--- + +**Message from the maintainers**: + +Impacted by this bug? Give it a 👍. We prioritise the issues with the most 👍. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..415acfcd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,18 @@ +blank_issues_enabled: false + +contact_links: + - name: Have you read the docs? 
+ url: http://contrib.scikit-learn.org/metric-learn/ + about: Much help can be found in the docs + - name: Ask a question + url: https://github.com/scikit-learn-contrib/metric-learn/discussions/new + about: Ask a question or start a discussion about metric-learn + - name: Stack Overflow + url: https://stackoverflow.com + about: Please ask and answer metric-learn usage questions (API, installation...) on Stack Overflow + - name: Cross Validated + url: https://stats.stackexchange.com + about: Please ask and answer metric learning questions (use cases, algorithms & theory...) on Cross Validated + - name: Blank issue + url: https://github.com/scikit-learn-contrib/metric-learn/issues/new + about: Please note that Github Discussions should be used in most cases instead diff --git a/.github/ISSUE_TEMPLATE/doc_improvement.md b/.github/ISSUE_TEMPLATE/doc_improvement.md new file mode 100644 index 00000000..753cf2f7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/doc_improvement.md @@ -0,0 +1,23 @@ +--- +name: Documentation improvement +about: Create a report to help us improve the documentation. Alternatively you can just open a pull request with the suggested change. +labels: Documentation +--- + +#### Describe the issue linked to the documentation + + + +#### Suggest a potential alternative/fix + + + +--- + +**Message from the maintainers**: + +Confused by this part of the doc too? Give it a 👍. We prioritise the issues with the most 👍. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/enhancement_proposal.md b/.github/ISSUE_TEMPLATE/enhancement_proposal.md new file mode 100644 index 00000000..01dfb1d7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/enhancement_proposal.md @@ -0,0 +1,18 @@ +--- +name: Enhancement proposal +about: Propose an enhancement for metric-learn +labels: 'enhancement' +--- +# Summary + +What change needs making? + +# Use Cases + +When would you use this? + +--- + +**Message from the maintainers**: + +Want to see this feature happen? Give it a 👍. 
We prioritise the issues with the most 👍. \ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 00000000..0935a109 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,42 @@ +name: CI + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the master branch + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + # Run normal testing with the latest versions of all dependencies + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + python-version: ['3.8', '3.9', '3.10', '3.11'] + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Run Tests without skggm + run: | + sudo apt-get install liblapack-dev + pip install --upgrade pip pytest + pip install wheel cython numpy scipy codecov pytest-cov scikit-learn + pytest test --cov + bash <(curl -s https://codecov.io/bash) + - name: Run Tests with skggm + env: + SKGGM_VERSION: a0ed406586c4364ea3297a658f415e13b5cbdaf8 + run: | + pip install git+https://github.com/skggm/skggm.git@${SKGGM_VERSION} + pytest test --cov + bash <(curl -s https://codecov.io/bash) + - name: Syntax checking with flake8 + run: | + pip install flake8 + flake8 --extend-ignore=E111,E114 --show-source; diff --git a/.gitignore b/.gitignore index 32ed7270..66eb3551 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,10 @@ build/ dist/ *.egg-info .coverage +htmlcov/ +.cache/ +.pytest_cache/ +doc/auto_examples/* +doc/generated/* +venv/ +.vscode/ diff --git a/.landscape.yml b/.landscape.yml new file mode 100644 index 00000000..ae342735 --- /dev/null +++ b/.landscape.yml @@ -0,0 +1,16 @@ +strictness: medium +pep8: + disable: + - E111 + - E114 + - E231 + - E225 + - E402 + - W503 +pylint: + disable: + - bad-indentation + - invalid-name + - too-many-arguments +ignore-paths: + 
- bench/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 378cc5f5..00000000 --- a/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -language: python -sudo: false -cache: pip -python: - - "2.7" - - "3.4" -before_install: - - pip install --upgrade pip - - pip install wheel - - pip install numpy scipy scikit-learn -script: python setup.py test diff --git a/README.rst b/README.rst index 38c088aa..b2f6e6d4 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,9 @@ -|Travis-CI Build Status| |License| |PyPI version| +|GitHub Actions Build Status| |License| |PyPI version| |Code coverage| -metric-learn -============= +metric-learn: Metric Learning in Python +======================================= -Metric Learning algorithms in Python. +metric-learn contains efficient Python implementations of several popular supervised and weakly-supervised metric learning algorithms. As part of `scikit-learn-contrib `_, the API of metric-learn is compatible with `scikit-learn `_, the leading library for machine learning in Python. This allows to use all the scikit-learn routines (for pipelining, model selection, etc) with metric learning algorithms through a unified interface. **Algorithms** @@ -11,60 +11,67 @@ Metric Learning algorithms in Python. 
- Information Theoretic Metric Learning (ITML) - Sparse Determinant Metric Learning (SDML) - Least Squares Metric Learning (LSML) +- Sparse Compositional Metric Learning (SCML) - Neighborhood Components Analysis (NCA) - Local Fisher Discriminant Analysis (LFDA) - Relative Components Analysis (RCA) +- Metric Learning for Kernel Regression (MLKR) +- Mahalanobis Metric for Clustering (MMC) **Dependencies** -- Python 2.7+, 3.4+ -- numpy, scipy, scikit-learn -- (for running the examples only: matplotlib) +- Python 3.6+ (the last version supporting Python 2 and Python 3.5 was + `v0.5.0 `_) +- numpy>= 1.11.0, scipy>= 0.17.0, scikit-learn>=0.21.3 -**Installation/Setup** +**Optional dependencies** -Run ``pip install metric-learn`` to download and install from PyPI. +- For SDML, using skggm will allow the algorithm to solve problematic cases + (install from commit `a0ed406 `_). + ``pip install 'git+https://github.com/skggm/skggm.git@a0ed406586c4364ea3297a658f415e13b5cbdaf8'`` to install the required version of skggm from GitHub. +- For running the examples only: matplotlib -Run ``python setup.py install`` for default installation. +**Installation/Setup** -Run ``python setup.py test`` to run all tests. +- If you use Anaconda: ``conda install -c conda-forge metric-learn``. See more options `here `_. -**Usage** +- To install from PyPI: ``pip install metric-learn``. -For full usage examples, see the `sphinx documentation`_. +- For a manual install of the latest code, download the source repository and run ``python setup.py install``. You may then run ``pytest test`` to run all tests (you will need to have the ``pytest`` package installed). -Each metric is a subclass of ``BaseMetricLearner``, which provides -default implementations for the methods ``metric``, ``transformer``, and -``transform``. Subclasses must provide an implementation for either -``metric`` or ``transformer``. 
+**Usage** -For an instance of a metric learner named ``foo`` learning from a set of -``d``-dimensional points, ``foo.metric()`` returns a ``d`` by ``d`` -matrix ``M`` such that a distance between vectors ``x`` and ``y`` is -expressed ``(x-y).dot(M).dot(x-y)``. +See the `sphinx documentation`_ for full documentation about installation, API, usage, and examples. -In the same scenario, ``foo.transformer()`` returns a ``d`` by ``d`` -matrix ``L`` such that a vector ``x`` can be represented in the learned -space as the vector ``x.dot(L.T)``. +**Citation** -For convenience, the function ``foo.transform(X)`` is provided for -converting a matrix of points (``X``) into the learned space, in which -standard Euclidean distance can be used. +If you use metric-learn in a scientific publication, we would appreciate +citations to the following paper: -**Notes** +`metric-learn: Metric Learning Algorithms in Python +`_, de Vazelhes +*et al.*, Journal of Machine Learning Research, 21(138):1-6, 2020. -If a recent version of the Shogun Python modular (``modshogun``) library -is available, the LMNN implementation will use the fast C++ version from -there. The two implementations differ slightly, and the C++ version is -more complete. +Bibtex entry:: + @article{metric-learn, + title = {metric-learn: {M}etric {L}earning {A}lgorithms in {P}ython}, + author = {{de Vazelhes}, William and {Carey}, CJ and {Tang}, Yuan and + {Vauquier}, Nathalie and {Bellet}, Aur{\'e}lien}, + journal = {Journal of Machine Learning Research}, + year = {2020}, + volume = {21}, + number = {138}, + pages = {1--6} + } -.. _sphinx documentation: http://all-umass.github.io/metric-learn/ +.. _sphinx documentation: http://contrib.scikit-learn.org/metric-learn/ -.. |Travis-CI Build Status| image:: https://api.travis-ci.org/all-umass/metric-learn.svg?branch=master - :target: https://travis-ci.org/all-umass/metric-learn +.. 
|GitHub Actions Build Status| image:: https://github.com/scikit-learn-contrib/metric-learn/workflows/CI/badge.svg + :target: https://github.com/scikit-learn-contrib/metric-learn/actions?query=event%3Apush+branch%3Amaster .. |License| image:: http://img.shields.io/:license-mit-blue.svg?style=flat :target: http://badges.mit-license.org .. |PyPI version| image:: https://badge.fury.io/py/metric-learn.svg :target: http://badge.fury.io/py/metric-learn - +.. |Code coverage| image:: https://codecov.io/gh/scikit-learn-contrib/metric-learn/branch/master/graph/badge.svg + :target: https://codecov.io/gh/scikit-learn-contrib/metric-learn diff --git a/bench/.gitignore b/bench/.gitignore new file mode 100644 index 00000000..824e23ac --- /dev/null +++ b/bench/.gitignore @@ -0,0 +1,4 @@ +results +env +metric-learn +html diff --git a/bench/asv.conf.json b/bench/asv.conf.json new file mode 100644 index 00000000..782d3ab2 --- /dev/null +++ b/bench/asv.conf.json @@ -0,0 +1,74 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "metric-learn", + + // The project's homepage + "project_url": "https://github.com/all-umass/metric-learn", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "tip" (for mercurial). + "branches": ["master"], // for git + // "branches": ["tip"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. 
+ // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "virtualenv", + + // the base URL to show a commit for the project. + "show_commit_url": "http://github.com/all-umass/metric-learn/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["2.7", "3.3"], + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty + // list indicates to just test against the default (latest) + // version. + "matrix": { + "numpy": ["1.12"], + "scipy": ["0.18"], + "scikit-learn": ["0.18"] + }, + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + // "env_dir": "env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + // "results_dir": "results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + // "html_dir": "html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache wheels of the recent builds in each + // environment, making them faster to install next time. This is + // number of builds to keep, per environment. 
+ "wheel_cache_size": 4 +} diff --git a/bench/benchmarks/__init__.py b/bench/benchmarks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bench/benchmarks/iris.py b/bench/benchmarks/iris.py new file mode 100644 index 00000000..05035085 --- /dev/null +++ b/bench/benchmarks/iris.py @@ -0,0 +1,31 @@ +import numpy as np +from sklearn.datasets import load_iris + +import metric_learn + +CLASSES = { + 'Covariance': metric_learn.Covariance(), + 'ITML_Supervised': metric_learn.ITML_Supervised(n_constraints=200), + 'LFDA': metric_learn.LFDA(k=2, dim=2), + 'LMNN': metric_learn.LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False), + 'LSML_Supervised': metric_learn.LSML_Supervised(n_constraints=200), + 'MLKR': metric_learn.MLKR(), + 'NCA': metric_learn.NCA(max_iter=700, n_components=2), + 'RCA_Supervised': metric_learn.RCA_Supervised(dim=2, n_chunks=30, + chunk_size=2), + 'SDML_Supervised': metric_learn.SDML_Supervised(n_constraints=1500) +} + + +class IrisDataset(object): + params = [sorted(CLASSES)] + param_names = ['alg'] + + def setup(self, alg): + iris_data = load_iris() + self.iris_points = iris_data['data'] + self.iris_labels = iris_data['target'] + + def time_fit(self, alg): + np.random.seed(5555) + CLASSES[alg].fit(self.iris_points, self.iris_labels) diff --git a/doc/_static/css/styles.css b/doc/_static/css/styles.css new file mode 100644 index 00000000..6d350ae4 --- /dev/null +++ b/doc/_static/css/styles.css @@ -0,0 +1,36 @@ +.hatnote { + border-color: #e1e4e5 ; + border-style: solid ; + border-width: 1px ; + font-size: x-small ; + font-style: italic ; + margin-left: auto ; + margin-right: auto ; + margin-bottom: 24px; + padding: 12px; +} +.hatnote-gray { + background-color: #f5f5f5 +} +.hatnote li { + list-style-type: square; + margin-left: 12px !important; +} +.hatnote ul { + list-style-type: square; + margin-left: 0px !important; + margin-bottom: 0px !important; +} +.deprecated { + color: #b94a48; + background-color: #F3E5E5; + border-color: 
#eed3d7; + margin-top: 0.5rem; + padding: 0.5rem; + border-radius: 0.5rem; + margin-bottom: 0.5rem; +} + +.deprecated p { + margin-bottom: 0 !important; +} \ No newline at end of file diff --git a/doc/_templates/class.rst b/doc/_templates/class.rst new file mode 100644 index 00000000..f0c1b5bc --- /dev/null +++ b/doc/_templates/class.rst @@ -0,0 +1,16 @@ +:mod:`{{module}}`.{{objname}} +{{ underline }}============== + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :members: + :undoc-members: + :inherited-members: + :special-members: __init__ + +.. include:: {{module}}.{{objname}}.examples + +.. raw:: html + +
diff --git a/doc/conf.py b/doc/conf.py index 5e3f2cd9..c472cc21 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +import sys +import os +import warnings extensions = [ 'sphinx.ext.autodoc', @@ -7,6 +10,9 @@ 'sphinx.ext.viewcode', 'sphinx.ext.mathjax', 'numpydoc', + 'sphinx_gallery.gen_gallery', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx' ] templates_path = ['_templates'] @@ -15,19 +21,62 @@ # General information about the project. project = u'metric-learn' -copyright = u'2015, CJ Carey and Yuan Tang' -author = u'CJ Carey and Yuan Tang' -version = '0.2.1' -release = '0.2.1' +copyright = (u'2015-2023, CJ Carey, Yuan Tang, William de Vazelhes, Aurélien ' + u'Bellet and Nathalie Vauquier') +author = (u'CJ Carey, Yuan Tang, William de Vazelhes, Aurélien Bellet and ' + u'Nathalie Vauquier') +version = '0.7.0' +release = '0.7.0' language = 'en' exclude_patterns = ['_build'] pygments_style = 'sphinx' todo_include_todos = True -numpydoc_show_class_members = False # Options for HTML output html_theme = 'sphinx_rtd_theme' html_static_path = ['_static'] htmlhelp_basename = 'metric-learndoc' +# Option to hide doctests comments in the documentation (like # doctest: +# +NORMALIZE_WHITESPACE for instance) +trim_doctest_flags = True + +# intersphinx configuration +intersphinx_mapping = { + 'python': ('https://docs.python.org/{.major}'.format( + sys.version_info), None), + 'numpy': ('https://docs.scipy.org/doc/numpy/', None), + 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), + 'scikit-learn': ('https://scikit-learn.org/stable/', None) +} + + +# sphinx-gallery configuration +sphinx_gallery_conf = { + # to generate mini-galleries at the end of each docstring in the API + # section: (see https://sphinx-gallery.github.io/configuration.html + # #references-to-examples) + 'doc_module': 'metric_learn', + 'backreferences_dir': os.path.join('generated'), +} + +# generate autosummary even if no references +autosummary_generate = True + + 
+# Temporary work-around for spacing problem between parameter and parameter +# type in the doc, see https://github.com/numpy/numpydoc/issues/215. The bug +# has been fixed in sphinx (https://github.com/sphinx-doc/sphinx/pull/5976) but +# through a change in sphinx basic.css except rtd_theme does not use basic.css. +# In an ideal world, this would get fixed in this PR: +# https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files +def setup(app): + app.add_js_file('js/copybutton.js') + app.add_css_file('css/styles.css') + + +# Remove matplotlib agg warnings from generated doc when using plt.show +warnings.filterwarnings("ignore", category=UserWarning, + message='Matplotlib is currently using agg, which is a' + ' non-GUI backend, so cannot show the figure.') diff --git a/doc/getting_started.rst b/doc/getting_started.rst new file mode 100644 index 00000000..90b7c7ee --- /dev/null +++ b/doc/getting_started.rst @@ -0,0 +1,47 @@ +############### +Getting started +############### + +Installation and Setup +====================== + +**Installation** + +metric-learn can be installed in either of the following ways: + +- If you use Anaconda: ``conda install -c conda-forge metric-learn``. See more options `here `_. + +- To install from PyPI: ``pip install metric-learn``. + +- For a manual install of the latest code, download the source repository and run ``python setup.py install``. You may then run ``pytest test`` to run all tests (you will need to have the ``pytest`` package installed). + +**Dependencies** + +- Python 3.6+ (the last version supporting Python 2 and Python 3.5 was + `v0.5.0 `_) +- numpy>= 1.11.0, scipy>= 0.17.0, scikit-learn>=0.21.3 + +**Optional dependencies** + +- For SDML, using skggm will allow the algorithm to solve problematic cases + (install from commit `a0ed406 `_). + ``pip install 'git+https://github.com/skggm/skggm.git@a0ed406586c4364ea3297a658f415e13b5cbdaf8'`` to install the required version of skggm from GitHub. 
+- For running the examples only: matplotlib + +Quick start +=========== + +This example loads the iris dataset, and evaluates a k-nearest neighbors +algorithm on an embedding space learned with `NCA`. + +:: + + from metric_learn import NCA + from sklearn.datasets import load_iris + from sklearn.model_selection import cross_val_score + from sklearn.pipeline import make_pipeline + from sklearn.neighbors import KNeighborsClassifier + + X, y = load_iris(return_X_y=True) + clf = make_pipeline(NCA(), KNeighborsClassifier()) + cross_val_score(clf, X, y) diff --git a/doc/index.rst b/doc/index.rst index df4ed8a6..f9dfd83d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,80 +1,67 @@ metric-learn: Metric Learning in Python ======================================= -|License| |PyPI version| +|GitHub Actions Build Status| |License| |PyPI version| |Code coverage| -Distance metrics are widely used in the machine learning literature. -Traditionally, practicioners would choose a standard distance metric -(Euclidean, City-Block, Cosine, etc.) using a priori knowledge of -the domain. -Distance metric learning (or simply, metric learning) is the sub-field of -machine learning dedicated to automatically constructing optimal distance -metrics. +`metric-learn `_ +contains efficient Python implementations of several popular supervised and +weakly-supervised metric learning algorithms. As part of `scikit-learn-contrib +`_, the API of metric-learn is compatible with `scikit-learn +`_, the leading library for machine learning in +Python. This allows to use all the scikit-learn routines (for pipelining, +model selection, etc) with metric learning algorithms through a unified +interface. -This package contains efficient Python implementations of several popular -metric learning algorithms. - -.. 
toctree:: - :caption: Algorithms - :maxdepth: 1 - - metric_learn.covariance - metric_learn.lmnn - metric_learn.itml - metric_learn.sdml - metric_learn.lsml - metric_learn.nca - metric_learn.lfda - metric_learn.rca - -Each metric supports the following methods: - -- ``fit(...)``, which learns the model. -- ``transformer()``, which returns a transformation matrix - :math:`L \in \mathbb{R}^{D \times d}`, which can be used to convert a - data matrix :math:`X \in \mathbb{R}^{n \times d}` to the - :math:`D`-dimensional learned metric space :math:`X L^{\top}`, - in which standard Euclidean distances may be used. -- ``transform(X)``, which applies the aforementioned transformation. -- ``metric()``, which returns a Mahalanobis matrix - :math:`M = L^{\top}L` such that distance between vectors ``x`` and - ``y`` can be computed as :math:`\left(x-y\right)M\left(x-y\right)`. +If you use metric-learn in a scientific publication, we would appreciate +citations to the following paper: +`metric-learn: Metric Learning Algorithms in Python +`_, de Vazelhes +*et al.*, Journal of Machine Learning Research, 21(138):1-6, 2020. -Installation and Setup -====================== +Bibtex entry:: -Run ``pip install metric-learn`` to download and install from PyPI. + @article{metric-learn, + title = {metric-learn: {M}etric {L}earning {A}lgorithms in {P}ython}, + author = {{de Vazelhes}, William and {Carey}, CJ and {Tang}, Yuan and + {Vauquier}, Nathalie and {Bellet}, Aur{\'e}lien}, + journal = {Journal of Machine Learning Research}, + year = {2020}, + volume = {21}, + number = {138}, + pages = {1--6} + } -Alternately, download the source repository and run: -- ``python setup.py install`` for default installation. -- ``python setup.py test`` to run all tests. +Documentation outline +--------------------- -**Dependencies** +.. toctree:: + :maxdepth: 2 -- Python 2.7+, 3.4+ -- numpy, scipy, scikit-learn -- (for running the examples only: matplotlib) + getting_started -**Notes** +.. 
toctree:: + :maxdepth: 2 -If a recent version of the Shogun Python modular (``modshogun``) library -is available, the LMNN implementation will use the fast C++ version from -there. The two implementations differ slightly, and the C++ version is -more complete. + user_guide -Naviagtion ----------- +.. toctree:: + :maxdepth: 2 -:ref:`genindex` | :ref:`modindex` | :ref:`search` + Package Contents .. toctree:: - :maxdepth: 4 - :hidden: + :maxdepth: 2 + + auto_examples/index - Package Overview +:ref:`genindex` | :ref:`search` +.. |GitHub Actions Build Status| image:: https://github.com/scikit-learn-contrib/metric-learn/workflows/CI/badge.svg + :target: https://github.com/scikit-learn-contrib/metric-learn/actions?query=event%3Apush+branch%3Amaster .. |PyPI version| image:: https://badge.fury.io/py/metric-learn.svg :target: http://badge.fury.io/py/metric-learn .. |License| image:: http://img.shields.io/:license-mit-blue.svg?style=flat :target: http://badges.mit-license.org +.. |Code coverage| image:: https://codecov.io/gh/scikit-learn-contrib/metric-learn/branch/master/graph/badge.svg + :target: https://codecov.io/gh/scikit-learn-contrib/metric-learn diff --git a/doc/introduction.rst b/doc/introduction.rst new file mode 100644 index 00000000..e9ff0015 --- /dev/null +++ b/doc/introduction.rst @@ -0,0 +1,125 @@ +.. _intro_metric_learning: + +======================== +What is Metric Learning? +======================== + +Many approaches in machine learning require a measure of distance between data +points. Traditionally, practitioners would choose a standard distance metric +(Euclidean, City-Block, Cosine, etc.) using a priori knowledge of the +domain. However, it is often difficult to design metrics that are well-suited +to the particular data and task of interest. + +Distance metric learning (or simply, metric learning) aims at +automatically constructing task-specific distance metrics from (weakly) +supervised data, in a machine learning manner. 
The learned distance metric can +then be used to perform various tasks (e.g., k-NN classification, clustering, +information retrieval). + +Problem Setting +=============== + +Metric learning problems fall into two main categories depending on the type +of supervision available about the training data: + +- :doc:`Supervised learning `: the algorithm has access to + a set of data points, each of them belonging to a class (label) as in a + standard classification problem. + Broadly speaking, the goal in this setting is to learn a distance metric + that puts points with the same label close together while pushing away + points with different labels. +- :doc:`Weakly supervised learning `: the + algorithm has access to a set of data points with supervision only + at the tuple level (typically pairs, triplets, or quadruplets of + data points). A classic example of such weaker supervision is a set of + positive and negative pairs: in this case, the goal is to learn a distance + metric that puts positive pairs close together and negative pairs far away. + +Based on the above (weakly) supervised data, the metric learning problem is +generally formulated as an optimization problem where one seeks to find the +parameters of a distance function that optimize some objective function +measuring the agreement with the training data. + +.. _mahalanobis_distances: + +Mahalanobis Distances +===================== + +In the metric-learn package, all algorithms currently implemented learn +so-called Mahalanobis distances. Given a real-valued parameter matrix +:math:`L` of shape ``(num_dims, n_features)`` where ``n_features`` is the +number features describing the data, the Mahalanobis distance associated with +:math:`L` is defined as follows: + +.. 
math:: D(x, x') = \sqrt{(Lx-Lx')^\top(Lx-Lx')} + +In other words, a Mahalanobis distance is a Euclidean distance after a +linear transformation of the feature space defined by :math:`L` (taking +:math:`L` to be the identity matrix recovers the standard Euclidean distance). +Mahalanobis distance metric learning can thus be seen as learning a new +embedding space of dimension ``num_dims``. Note that when ``num_dims`` is +smaller than ``n_features``, this achieves dimensionality reduction. + +Strictly speaking, Mahalanobis distances are "pseudo-metrics": they satisfy +three of the `properties of a metric `_ (non-negativity, symmetry, triangle inequality) but not +necessarily the identity of indiscernibles. + +.. note:: + + Mahalanobis distances can also be parameterized by a `positive semi-definite + (PSD) matrix + `_ + :math:`M`: + + .. math:: D(x, x') = \sqrt{(x-x')^\top M(x-x')} + + Using the fact that a PSD matrix :math:`M` can always be decomposed as + :math:`M=L^\top L` for some :math:`L`, one can show that both + parameterizations are equivalent. In practice, an algorithm may thus solve + the metric learning problem with respect to either :math:`M` or :math:`L`. + +.. _use_cases: + +Use-cases +========= + +There are many use-cases for metric learning. We list here a few popular +examples (for code illustrating some of these use-cases, see the +:doc:`examples ` section of the documentation): + +- `Nearest neighbors models + `_: the learned + metric can be used to improve nearest neighbors learning models for + classification, regression, anomaly detection... +- `Clustering `_: + metric learning provides a way to bias the clusters found by algorithms like + K-Means towards the intended semantics. +- Information retrieval: the learned metric can be used to retrieve the + elements of a database that are semantically closest to a query element. 
+- Dimensionality reduction: metric learning may be seen as a way to reduce the + data dimension in a (weakly) supervised setting. +- More generally, the learned transformation :math:`L` can be used to project + the data into a new embedding space before feeding it into another machine + learning algorithm. + +The API of metric-learn is compatible with `scikit-learn +`_, the leading library for machine +learning in Python. This allows to easily pipeline metric learners with other +scikit-learn estimators to realize the above use-cases, to perform joint +hyperparameter tuning, etc. + +Further reading +=============== + +For more information about metric learning and its applications, one can refer +to the following resources: + +- **Tutorial:** `Similarity and Distance Metric Learning with Applications to + Computer Vision + `_ (2015) +- **Surveys:** `A Survey on Metric Learning for Feature Vectors and Structured + Data `_ (2013), `Metric Learning: A + Survey `_ (2012) +- **Book:** `Metric Learning + `_ (2015) diff --git a/doc/metric_learn.base_metric.rst b/doc/metric_learn.base_metric.rst deleted file mode 100644 index 050a360b..00000000 --- a/doc/metric_learn.base_metric.rst +++ /dev/null @@ -1,7 +0,0 @@ -metric_learn.base_metric module -=============================== - -.. automodule:: metric_learn.base_metric - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/metric_learn.covariance.rst b/doc/metric_learn.covariance.rst deleted file mode 100644 index d24229a3..00000000 --- a/doc/metric_learn.covariance.rst +++ /dev/null @@ -1,21 +0,0 @@ -Covariance metric (baseline method) -=================================== - -.. 
automodule:: metric_learn.covariance - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import Covariance - from sklearn.datasets import load_iris - - iris_data = load_iris() - - cov = Covariance() - x = cov.fit_transform(iris_data['data']) diff --git a/doc/metric_learn.itml.rst b/doc/metric_learn.itml.rst deleted file mode 100644 index d6fb2221..00000000 --- a/doc/metric_learn.itml.rst +++ /dev/null @@ -1,27 +0,0 @@ -Information Theoretic Metric Learning (ITML) -============================================ - -.. automodule:: metric_learn.itml - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import ITML_Supervised - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - itml = ITML_Supervised(num_constraints=200) - itml.fit(X, Y) - -References ----------- -`Information-theoretic Metric Learning `_ Jason V. Davis, et al. diff --git a/doc/metric_learn.lfda.rst b/doc/metric_learn.lfda.rst deleted file mode 100644 index 95cde90d..00000000 --- a/doc/metric_learn.lfda.rst +++ /dev/null @@ -1,30 +0,0 @@ -Local Fisher Discriminant Analysis (LFDA) -========================================= - -.. automodule:: metric_learn.lfda - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - import numpy as np - from metric_learn import LFDA - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - lfda = LFDA(k=2, dim=2) - lfda.fit(X, Y) - -References ------------------- -`Dimensionality Reduction of Multimodal Labeled Data by Local Fisher Discriminant Analysis `_ Masashi Sugiyama. - -`Local Fisher Discriminant Analysis on Beer Style Clustering `_ Yuan Tang. 
diff --git a/doc/metric_learn.lmnn.rst b/doc/metric_learn.lmnn.rst deleted file mode 100644 index 4062bfa0..00000000 --- a/doc/metric_learn.lmnn.rst +++ /dev/null @@ -1,33 +0,0 @@ -Large Margin Nearest Neighbor (LMNN) -==================================== - -.. automodule:: metric_learn.lmnn - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - import numpy as np - from metric_learn import LMNN - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - lmnn = LMNN(k=5, learn_rate=1e-6) - lmnn.fit(X, Y, verbose=False) - -If a recent version of the Shogun Python modular (``modshogun``) library -is available, the LMNN implementation will use the fast C++ version from -there. Otherwise, the included pure-Python version will be used. -The two implementations differ slightly, and the C++ version is more complete. - -References ----------- -`Distance Metric Learning for Large Margin Nearest Neighbor Classification `_ Kilian Q. Weinberger, John Blitzer, Lawrence K. Saul diff --git a/doc/metric_learn.lsml.rst b/doc/metric_learn.lsml.rst deleted file mode 100644 index 12be71b8..00000000 --- a/doc/metric_learn.lsml.rst +++ /dev/null @@ -1,27 +0,0 @@ -Least Squares Metric Learning (LSML) -==================================== - -.. 
automodule:: metric_learn.lsml - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import LSML_Supervised - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - lsml = LSML_Supervised(num_constraints=200) - isml.fit(X, Y) - -References ----------- - diff --git a/doc/metric_learn.nca.rst b/doc/metric_learn.nca.rst deleted file mode 100644 index 6a2675e5..00000000 --- a/doc/metric_learn.nca.rst +++ /dev/null @@ -1,28 +0,0 @@ -Neighborhood Components Analysis (NCA) -====================================== - -.. automodule:: metric_learn.nca - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - import numpy as np - from metric_learn import NCA - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - nca = NCA(max_iter=1000, learning_rate=0.01) - nca.fit(X, Y) - -References ----------- - diff --git a/doc/metric_learn.rca.rst b/doc/metric_learn.rca.rst deleted file mode 100644 index 2430cd82..00000000 --- a/doc/metric_learn.rca.rst +++ /dev/null @@ -1,27 +0,0 @@ -Relative Components Analysis (RCA) -================================== - -.. automodule:: metric_learn.rca - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import RCA_Supervised - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - rca = RCA_Supervised(num_chunks=30, chunk_size=2) - rca.fit(X, Y) - -References ------------------- -`Adjustment learning and relevant component analysis `_ Noam Shental, et al. 
diff --git a/doc/metric_learn.rst b/doc/metric_learn.rst index 226fd324..4d0676b9 100644 --- a/doc/metric_learn.rst +++ b/doc/metric_learn.rst @@ -1,24 +1,60 @@ metric_learn package ==================== -Submodules ----------- +Module Contents +--------------- -.. toctree:: +Base Classes +------------ - metric_learn.base_metric - metric_learn.itml - metric_learn.lfda - metric_learn.lmnn - metric_learn.lsml - metric_learn.nca - metric_learn.rca - metric_learn.sdml +.. autosummary:: + :toctree: generated/ + :template: class.rst -Module contents ---------------- + metric_learn.Constraints + metric_learn.base_metric.BaseMetricLearner + metric_learn.base_metric.MetricTransformer + metric_learn.base_metric.MahalanobisMixin + metric_learn.base_metric._PairsClassifierMixin + metric_learn.base_metric._TripletsClassifierMixin + metric_learn.base_metric._QuadrupletsClassifierMixin + +Supervised Learning Algorithms +------------------------------ +.. autosummary:: + :toctree: generated/ + :template: class.rst + + metric_learn.LFDA + metric_learn.LMNN + metric_learn.MLKR + metric_learn.NCA + metric_learn.RCA + metric_learn.ITML_Supervised + metric_learn.LSML_Supervised + metric_learn.MMC_Supervised + metric_learn.SDML_Supervised + metric_learn.RCA_Supervised + metric_learn.SCML_Supervised + +Weakly Supervised Learning Algorithms +------------------------------------- + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + metric_learn.ITML + metric_learn.LSML + metric_learn.MMC + metric_learn.SDML + metric_learn.SCML + +Unsupervised Learning Algorithms +-------------------------------- + +.. autosummary:: + :toctree: generated/ + :template: class.rst -.. 
automodule:: metric_learn - :members: - :undoc-members: - :show-inheritance: + metric_learn.Covariance \ No newline at end of file diff --git a/doc/metric_learn.sdml.rst b/doc/metric_learn.sdml.rst deleted file mode 100644 index 83570483..00000000 --- a/doc/metric_learn.sdml.rst +++ /dev/null @@ -1,26 +0,0 @@ -Sparse Determinant Metric Learning (SDML) -========================================= - -.. automodule:: metric_learn.sdml - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import SDML_Supervised - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - sdml = SDML_Supervised(num_constraints=200) - sdml.fit(X, Y) - -References ------------------- diff --git a/doc/preprocessor.rst b/doc/preprocessor.rst new file mode 100644 index 00000000..ad1ffd8f --- /dev/null +++ b/doc/preprocessor.rst @@ -0,0 +1,111 @@ +.. _preprocessor_section: + +============ +Preprocessor +============ + +Estimators in metric-learn all have a ``preprocessor`` option at instantiation. +Filling this argument allows them to take more compact input representation +when fitting, predicting etc... + +If ``preprocessor=None``, no preprocessor will be used and the user must +provide the classical representation to the fit/predict/score/etc... methods of +the estimators (see the documentation of the particular estimator to know the +type of input it accepts). Otherwise, two types of objects can be put in this +argument: + +Array-like +---------- +You can specify ``preprocessor=X`` where ``X`` is an array-like containing the +dataset of points. In this case, the fit/predict/score/etc... methods of the +estimator will be able to take as inputs an array-like of indices, replacing +under the hood each index by the corresponding sample. 
+ + +Example with a supervised metric learner: + +>>> from metric_learn import NCA +>>> +>>> X = np.array([[-0.7 , -0.23], +>>> [-0.43, -0.49], +>>> [ 0.14, -0.37]]) # array of 3 samples of 2 features +>>> points_indices = np.array([2, 0, 1, 0]) +>>> y = np.array([1, 0, 1, 1]) +>>> +>>> nca = NCA(preprocessor=X) +>>> nca.fit(points_indices, y) +>>> # under the hood the algorithm will create +>>> # points = np.array([[ 0.14, -0.37], +>>> # [-0.7 , -0.23], +>>> # [-0.43, -0.49], +>>> # [ 0.14, -0.37]]) and fit on it + + +Example with a weakly supervised metric learner: + +>>> from metric_learn import MMC +>>> X = np.array([[-0.7 , -0.23], +>>> [-0.43, -0.49], +>>> [ 0.14, -0.37]]) # array of 3 samples of 2 features +>>> pairs_indices = np.array([[2, 0], [1, 0]]) +>>> y_pairs = np.array([1, -1]) +>>> +>>> mmc = MMC(preprocessor=X) +>>> mmc.fit(pairs_indices, y_pairs) +>>> # under the hood the algorithm will create +>>> # pairs = np.array([[[ 0.14, -0.37], [-0.7 , -0.23]], +>>> # [[-0.43, -0.49], [-0.7 , -0.23]]]) and fit on it + +Callable +-------- +Alternatively, you can provide a callable as ``preprocessor``. Then the +estimator will accept indicators of points instead of points. Under the hood, +the estimator will call this callable on the indicators you provide as input +when fitting, predicting etc... Using a callable can be really useful to +represent lazily a dataset of images stored on the file system for instance. +The callable should take as an input a 1D array-like, and return a 2D +array-like. For supervised learners it will be applied on the whole 1D array of +indicators at once, and for weakly supervised learners it will be applied on +each column of the 2D array of tuples. 
+ +Example with a supervised metric learner: + +>>> def find_images(file_paths): +>>> # each file contains a small image to use as an input datapoint +>>> return np.row_stack([imread(f).ravel() for f in file_paths]) +>>> +>>> nca = NCA(preprocessor=find_images) +>>> nca.fit(['img01.png', 'img00.png', 'img02.png'], [1, 0, 1]) +>>> # under the hood preprocessor(indicators) will be called + + +Example with a weakly supervised metric learner: + +>>> pairs_images_paths = [['img02.png', 'img00.png'], +>>> ['img01.png', 'img00.png']] +>>> y_pairs = np.array([1, -1]) +>>> +>>> mmc = NCA(preprocessor=find_images) +>>> mmc.fit(pairs_images_paths, y_pairs) +>>> # under the hood preprocessor(pairs_indicators[i]) will be called for each +>>> # i in [0, 1] + + +.. note:: Note that when you fill the ``preprocessor`` option, it allows you + to give more compact inputs, but the classical way of providing inputs + stays valid (2D array-like for supervised learners and 3D array-like of + tuples for weakly supervised learners). If a classical input + is provided, the metric learner will not use the preprocessor. 
+ + Example: This will work: + + >>> from metric_learn import MMC + >>> def preprocessor_wip(array): + >>> raise NotImplementedError("This preprocessor does nothing yet.") + >>> + >>> pairs = np.array([[[ 0.14, -0.37], [-0.7 , -0.23]], + >>> [[-0.43, -0.49], [-0.7 , -0.23]]]) + >>> y_pairs = np.array([1, -1]) + >>> + >>> mmc = MMC(preprocessor=preprocessor_wip) + >>> mmc.fit(pairs, y_pairs) # preprocessor_wip will not be called here diff --git a/doc/supervised.rst b/doc/supervised.rst new file mode 100644 index 00000000..49548b83 --- /dev/null +++ b/doc/supervised.rst @@ -0,0 +1,434 @@ +========================== +Supervised Metric Learning +========================== + +Supervised metric learning algorithms take as inputs points `X` and target +labels `y`, and learn a distance matrix that make points from the same class +(for classification) or with close target value (for regression) close to each +other, and points from different classes or with distant target values far away +from each other. + +General API +=========== + +Supervised metric learning algorithms essentially use the same API as +scikit-learn. + +Input data +---------- +In order to train a model, you need two `array-like `_ objects, `X` and `y`. `X` +should be a 2D array-like of shape `(n_samples, n_features)`, where +`n_samples` is the number of points of your dataset and `n_features` is the +number of attributes describing each point. `y` should be a 1D +array-like +of shape `(n_samples,)`, containing for each point in `X` the class it +belongs to (or the value to regress for this sample, if you use `MLKR` for +instance). + +Here is an example of a dataset of two dogs and one +cat (the classes are 'dog' and 'cat') an animal being represented by +two numbers. + +>>> import numpy as np +>>> X = np.array([[2.3, 3.6], [0.2, 0.5], [6.7, 2.1]]) +>>> y = np.array(['dog', 'cat', 'dog']) + +.. note:: + + You can also use a preprocessor instead of directly giving the inputs as + 2D arrays. 
See the :ref:`preprocessor_section` section for more details. + +Fit, transform, and so on +------------------------- +The goal of supervised metric-learning algorithms is to transform +points in a new space, in which the distance between two points from the +same class will be small, and the distance between two points from different +classes will be large. To do so, we fit the metric learner (example: +`NCA`). + +>>> from metric_learn import NCA +>>> nca = NCA(random_state=42) +>>> nca.fit(X, y) +NCA(init='auto', max_iter=100, n_components=None, + preprocessor=None, random_state=42, tol=None, verbose=False) + + +Now that the estimator is fitted, you can use it on new data for several +purposes. + +First, you can transform the data in the learned space, using `transform`: +Here we transform two points in the new embedding space. + +>>> X_new = np.array([[9.4, 4.1], [2.1, 4.4]]) +>>> nca.transform(X_new) +array([[ 5.91884732, 10.25406973], + [ 3.1545886 , 6.80350083]]) + +Also, as explained before, our metric learner has learned a distance between +points. You can use this distance in two main ways: + +- You can either return the distance between pairs of points using the + `pair_distance` function: + +>>> nca.pair_distance([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]], [[3.3, 7.8], [10.9, 0.1]]]) +array([0.49627072, 3.65287282, 6.06079877]) + +- Or you can return a function that will return the distance (in the new + space) between two 1D arrays (the coordinates of the points in the original + space), similarly to distance functions in `scipy.spatial.distance`. + +>>> metric_fun = nca.get_metric() +>>> metric_fun([3.5, 3.6], [5.6, 2.4]) +0.4962707194621285 + +- Alternatively, you can use `pair_score` to return the **score** between + pairs of points (the larger the score, the more similar the pair). + For Mahalanobis learners, it is equal to the opposite of the distance. 
+ +>>> score = nca.pair_score([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]], [[3.3, 7.8], [10.9, 0.1]]]) +>>> score +array([-0.49627072, -3.65287282, -6.06079877]) + +This is useful because `pair_score` matches the **score** semantic of +scikit-learn's `Classification metrics +`_. + +.. note:: + + If the metric learner that you use learns a :ref:`Mahalanobis distance + ` (like it is the case for all algorithms + currently in metric-learn), you can get the plain learned Mahalanobis + matrix using `get_mahalanobis_matrix`. + + >>> nca.get_mahalanobis_matrix() + array([[0.43680409, 0.89169412], + [0.89169412, 1.9542479 ]]) + + +Scikit-learn compatibility +-------------------------- + +All supervised algorithms are scikit-learn estimators +(`sklearn.base.BaseEstimator`) and transformers +(`sklearn.base.TransformerMixin`) so they are compatible with pipelines +(`sklearn.pipeline.Pipeline`) and +scikit-learn model selection routines +(`sklearn.model_selection.cross_val_score`, +`sklearn.model_selection.GridSearchCV`, etc). +You can also use some of the scoring functions from `sklearn.metrics`. + +Algorithms +========== + +.. _lmnn: + +:py:class:`LMNN ` +----------------------------------------- + +Large Margin Nearest Neighbor Metric Learning +(:py:class:`LMNN `) + +`LMNN` learns a Mahalanobis distance metric in the kNN classification +setting. The learned metric attempts to keep close k-nearest neighbors +from the same class, while keeping examples from different classes +separated by a large margin. This algorithm makes no assumptions about +the distribution of the data. + +The distance is learned by solving the following optimization problem: + +.. 
math:: + + \min_\mathbf{L}\sum_{i, j}\eta_{ij}||\mathbf{L(x_i-x_j)}||^2 + + c\sum_{i, j, l}\eta_{ij}(1-y_{ij})[1+||\mathbf{L(x_i-x_j)}||^2-|| + \mathbf{L(x_i-x_l)}||^2]_+) + +where :math:`\mathbf{x}_i` is a data point, :math:`\mathbf{x}_j` is one +of its k-nearest neighbors sharing the same label, and :math:`\mathbf{x}_l` +are all the other instances within that region with different labels, +:math:`\eta_{ij}, y_{ij} \in \{0, 1\}` are both the indicators, +:math:`\eta_{ij}` represents :math:`\mathbf{x}_{j}` is the k-nearest +neighbors (with same labels) of :math:`\mathbf{x}_{i}`, :math:`y_{ij}=0` +indicates :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong to different classes, +:math:`[\cdot]_+=\max(0, \cdot)` is the Hinge loss. + +.. rubric:: Example Code + +:: + + import numpy as np + from metric_learn import LMNN + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + lmnn.fit(X, Y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Weinberger et al. `Distance Metric Learning for Large Margin Nearest Neighbor Classification `_. JMLR 2009. + + [2]. `Wikipedia entry on Large Margin Nearest Neighbor `_. + + +.. _nca: + +:py:class:`NCA ` +-------------------------------------- + +Neighborhood Components Analysis (:py:class:`NCA `) + +`NCA` is a distance metric learning algorithm which aims to improve the +accuracy of nearest neighbors classification compared to the standard +Euclidean distance. The algorithm directly maximizes a stochastic variant +of the leave-one-out k-nearest neighbors (KNN) score on the training set. +It can also learn a low-dimensional linear transformation of data that can +be used for data visualization and fast classification. 
 + +They use the decomposition :math:`\mathbf{M} = \mathbf{L}^T\mathbf{L}` and +define the probability :math:`p_{ij}` that :math:`\mathbf{x}_i` is the +neighbor of :math:`\mathbf{x}_j` by calculating the softmax likelihood of +the Mahalanobis distance: + +.. math:: + + p_{ij} = \frac{\exp(-|| \mathbf{Lx}_i - \mathbf{Lx}_j ||_2^2)} + {\sum_{l\neq i}\exp(-||\mathbf{Lx}_i - \mathbf{Lx}_l||_2^2)}, + \qquad p_{ii}=0 + +Then the probability that :math:`\mathbf{x}_i` will be correctly classified +by the stochastic nearest neighbors rule is: + +.. math:: + + p_{i} = \sum_{j:j\neq i, y_j=y_i}p_{ij} + +The optimization problem is to find matrix :math:`\mathbf{L}` that maximizes +the sum of probability of being correctly classified: + +.. math:: + + \mathbf{L} = \text{argmax}\sum_i p_i + +.. rubric:: Example Code + +:: + + import numpy as np + from metric_learn import NCA + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + nca = NCA(max_iter=1000) + nca.fit(X, Y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Goldberger et al. `Neighbourhood Components Analysis `_. NIPS 2005. + + [2]. `Wikipedia entry on Neighborhood Components Analysis `_. + + +.. _lfda: + +:py:class:`LFDA ` +----------------------------------------- + +Local Fisher Discriminant Analysis (:py:class:`LFDA `) + +`LFDA` is a linear supervised dimensionality reduction method which effectively combines the ideas of Linear Discriminant Analysis and Locality-Preserving Projection. It is +particularly useful when dealing with multi-modality, where one or more classes +consist of separate clusters in input space. The core optimization problem of +LFDA is solved as a generalized eigenvalue problem. + + +The algorithm defines the Fisher local within-/between-class scatter matrix +:math:`\mathbf{S}^{(w)}/ \mathbf{S}^{(b)}` in a pairwise fashion: + +.. 
math:: + + \mathbf{S}^{(w)} = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(w)}(\mathbf{x}_i - + \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T,\\ + \mathbf{S}^{(b)} = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(b)}(\mathbf{x}_i - + \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T,\\ + +where + +.. math:: + + W_{ij}^{(w)} = \left\{\begin{aligned}0 \qquad y_i\neq y_j \\ + \,\,\mathbf{A}_{i,j}/n_l \qquad y_i = y_j\end{aligned}\right.\\ + W_{ij}^{(b)} = \left\{\begin{aligned}1/n \qquad y_i\neq y_j \\ + \,\,\mathbf{A}_{i,j}(1/n-1/n_l) \qquad y_i = y_j\end{aligned}\right.\\ + +here :math:`\mathbf{A}_{i,j}` is the :math:`(i,j)`-th entry of the affinity +matrix :math:`\mathbf{A}`, which can be calculated with local scaling methods, `n` and `n_l` are the total number of points and the number of points per cluster `l` respectively. + +Then the learning problem becomes deriving the LFDA transformation matrix +:math:`\mathbf{L}_{LFDA}`: + +.. math:: + + \mathbf{L}_{LFDA} = \arg\max_\mathbf{L} + [\text{tr}((\mathbf{L}^T\mathbf{S}^{(w)} + \mathbf{L})^{-1}\mathbf{L}^T\mathbf{S}^{(b)}\mathbf{L})] + +That is, it is looking for a transformation matrix :math:`\mathbf{L}` such that +nearby data pairs in the same class are made close and the data pairs in +different classes are separated from each other; far apart data pairs in the +same class are not imposed to be close. + +.. rubric:: Example Code + +:: + + import numpy as np + from metric_learn import LFDA + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + lfda = LFDA(k=2, dim=2) + lfda.fit(X, Y) + +.. note:: + LFDA suffers from a problem called “sign indeterminacy”, which means the sign of the ``components`` and the output from transform depend on a random state. This is directly related to the calculation of eigenvectors in the algorithm. The same input run at different times might lead to different transforms, but both are valid. 
+ + To work around this, fit instances of this class to data once, then keep the instance around to do transformations. + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Sugiyama. `Dimensionality Reduction of Multimodal Labeled Data by Local Fisher Discriminant Analysis `_. JMLR 2007. + + [2]. Tang. `Local Fisher Discriminant Analysis on Beer Style Clustering `_. + +.. _mlkr: + +:py:class:`MLKR ` +----------------------------------------- + +Metric Learning for Kernel Regression (:py:class:`MLKR `) + +`MLKR` is an algorithm for supervised metric learning, which learns a +distance function by directly minimizing the leave-one-out regression error. +This algorithm can also be viewed as a supervised variation of PCA and can be +used for dimensionality reduction and high dimensional data visualization. + +Theoretically, `MLKR` can be applied with many types of kernel functions and +distance metrics, we hereafter focus the exposition on a particular instance +of the Gaussian kernel and Mahalanobis metric, as these are used in our +empirical development. The Gaussian kernel is denoted as: + +.. math:: + + k_{ij} = \frac{1}{\sqrt{2\pi}\sigma}\exp(-\frac{d(\mathbf{x}_i, + \mathbf{x}_j)}{\sigma^2}) + +where :math:`d(\cdot, \cdot)` is the squared distance under some metrics, +here in the fashion of Mahalanobis, it should be :math:`d(\mathbf{x}_i, +\mathbf{x}_j) = ||\mathbf{L}(\mathbf{x}_i - \mathbf{x}_j)||`, the transition +matrix :math:`\mathbf{L}` is derived from the decomposition of Mahalanobis +matrix :math:`\mathbf{M=L^TL}`. + +Since :math:`\sigma^2` can be integrated into :math:`d(\cdot)`, we can set +:math:`\sigma^2=1` for the sake of simplicity. Here we use the cumulative +leave-one-out quadratic regression error of the training samples as the +loss function: + +.. 
math:: + + \mathcal{L} = \sum_i(y_i - \hat{y}_i)^2 + +where the prediction :math:`\hat{y}_i` is derived from kernel regression by +calculating a weighted average of all the training samples: + +.. math:: + + \hat{y}_i = \frac{\sum_{j\neq i}y_jk_{ij}}{\sum_{j\neq i}k_{ij}} + +.. rubric:: Example Code + +:: + + from metric_learn import MLKR + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + mlkr = MLKR() + mlkr.fit(X, Y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Weinberger et al. `Metric Learning for Kernel Regression `_. AISTATS 2007. + + +.. _supervised_version: + +Supervised versions of weakly-supervised algorithms +--------------------------------------------------- + +Each :ref:`weakly-supervised algorithm ` +has a supervised version of the form `*_Supervised` where similarity tuples are +randomly generated from the labels information and passed to the underlying +algorithm. + +.. warning:: + Supervised versions of weakly-supervised algorithms interpret label -1 + (or any negative label) as a point with unknown label. + Those points are discarded in the learning process. + +For pairs learners (see :ref:`learning_on_pairs`), pairs (tuple of two points +from the dataset), and pair labels (`int` indicating whether the two points +are similar (+1) or dissimilar (-1)), are sampled with the function +`metric_learn.constraints.positive_negative_pairs`. To sample positive pairs +(of label +1), this method will look at all the samples from the same label and +sample randomly a pair among them. To sample negative pairs (of label -1), this +method will look at all the samples from a different class and sample randomly +a pair among them. 
The method will try to build `n_constraints` positive +pairs and `n_constraints` negative pairs, but sometimes it cannot find enough +of one of those, so forcing `same_length=True` will return both times the +minimum of the two lengths. + +For using quadruplets learners (see :ref:`learning_on_quadruplets`) in a +supervised way, positive and negative pairs are sampled as above and +concatenated so that we have a 3D array of +quadruplets, where for each quadruplet the two first points are from the same +class, and the two last points are from a different class (so indeed the two +last points should be less similar than the two first points). + +.. rubric:: Example Code + +:: + + from metric_learn import MMC_Supervised + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + mmc = MMC_Supervised(n_constraints=200) + mmc.fit(X, Y) diff --git a/doc/unsupervised.rst b/doc/unsupervised.rst new file mode 100644 index 00000000..110b07f9 --- /dev/null +++ b/doc/unsupervised.rst @@ -0,0 +1,40 @@ +============================ +Unsupervised Metric Learning +============================ + +Unsupervised metric learning algorithms only take as input an (unlabeled) +dataset `X`. For now, in metric-learn, there is only `Covariance`, which is a +simple baseline algorithm (see below). + + +Algorithms +========== +.. _covariance: + +Covariance +---------- + +`Covariance` does not "learn" anything, rather it calculates +the covariance matrix of the input data. This is a simple baseline method. +It can be used for ZCA whitening of the data (see the Wikipedia page of +`whitening transformation `_). + +.. rubric:: Example Code + +:: + + from metric_learn import Covariance + from sklearn.datasets import load_iris + + iris = load_iris()['data'] + + cov = Covariance().fit(iris) + x = cov.transform(iris) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. 
On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936. \ No newline at end of file diff --git a/doc/user_guide.rst b/doc/user_guide.rst new file mode 100644 index 00000000..5472107a --- /dev/null +++ b/doc/user_guide.rst @@ -0,0 +1,16 @@ +.. title:: User guide: contents + +.. _user_guide: + +========== +User Guide +========== + +.. toctree:: + :numbered: + + introduction.rst + supervised.rst + weakly_supervised.rst + unsupervised.rst + preprocessor.rst \ No newline at end of file diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst new file mode 100644 index 00000000..76f7c14e --- /dev/null +++ b/doc/weakly_supervised.rst @@ -0,0 +1,974 @@ +.. _weakly_supervised_section: + +================================= +Weakly Supervised Metric Learning +================================= + +Weakly supervised algorithms work on weaker information about the data points +than supervised algorithms. Rather than labeled points, they take as input +similarity judgments on tuples of data points, for instance pairs of similar +and dissimilar points. Refer to the documentation of each algorithm for its +particular form of input data. + + +General API +=========== + +Input data +---------- + +In the following paragraph we talk about tuples for the sake of generality. These +can be pairs, triplets, quadruplets etc, depending on the particular metric +learning algorithm we use. + +Basic form +^^^^^^^^^^ + +Every weakly supervised algorithm will take as input tuples of +points, and if needed labels for these tuples. The tuples of points can +also be called "constraints". They are a set of points that we consider (ex: +two points, three points, etc...). The label is some information we have +about this set of points (e.g. "these two points are similar"). Note that +some information can be contained in the ordering of these tuples (see for +instance the section :ref:`learning_on_quadruplets`). 
For more details about +specific forms of tuples, refer to the appropriate sections +(:ref:`learning_on_pairs` or :ref:`learning_on_quadruplets`). + +The `tuples` argument is the first argument of every method (like the `X` +argument for classical algorithms in scikit-learn). The second argument is the +label of the tuple: its semantic depends on the algorithm used. For instance +for pairs learners `y` is a label indicating whether the pair is of similar +samples or dissimilar samples. + +Then one can fit a Weakly Supervised Metric Learner on this tuple, like this: + +>>> my_algo.fit(tuples, y) + +Like in a classical setting we split the points `X` between train and test, +here we split the `tuples` between train and test. + +>>> from sklearn.model_selection import train_test_split +>>> pairs_train, pairs_test, y_train, y_test = train_test_split(pairs, y) + +These are two data structures that can be used to represent tuple in metric +learn: + +3D array of tuples +^^^^^^^^^^^^^^^^^^ + +The most intuitive way to represent tuples is to provide the algorithm with a +3D array-like of tuples of shape `(n_tuples, tuple_size, n_features)`, where +`n_tuples` is the number of tuples, `tuple_size` is the number of elements +in a tuple (2 for pairs, 3 for triplets for instance), and `n_features` is +the number of features of each point. + +.. rubric:: Example Code + +Here is an artificial dataset of 4 pairs of 2 points of 3 features each: + +>>> import numpy as np +>>> tuples = np.array([[[-0.12, -1.21, -0.20], +>>> [+0.05, -0.19, -0.05]], +>>> +>>> [[-2.16, +0.11, -0.02], +>>> [+1.58, +0.16, +0.93]], +>>> +>>> [[+1.58, +0.16, +0.93], # same as tuples[1, 1, :] +>>> [+0.89, -0.34, +2.41]], +>>> +>>> [[-0.12, -1.21, -0.20], # same as tuples[0, 0, :] +>>> [-2.16, +0.11, -0.02]]]) # same as tuples[1, 0, :] +>>> y = np.array([-1, 1, 1, -1]) + +.. 
warning:: This way of specifying pairs is not recommended for a large number
+   of tuples, as it is redundant (see the comments in the example) and hence
+   takes a lot of memory. Indeed each feature vector of a point will be
+   replicated as many times as a point is involved in a tuple. The second way
+   to specify pairs is more efficient.
+
+
+2D array of indicators + preprocessor
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Instead of forming each point in each tuple, a more efficient representation
+would be to keep the dataset of points `X` aside, and just represent tuples
+as a collection of tuples of *indices* from the points in `X`. Since we lose
+the feature dimension there, the resulting array is 2D.
+
+.. rubric:: Example Code
+
+An equivalent representation of the above pairs would be:
+
+>>> X = np.array([[-0.12, -1.21, -0.20],
+>>>               [+0.05, -0.19, -0.05],
+>>>               [-2.16, +0.11, -0.02],
+>>>               [+1.58, +0.16, +0.93],
+>>>               [+0.89, -0.34, +2.41]])
+>>>
+>>> tuples_indices = np.array([[0, 1],
+>>>                            [2, 3],
+>>>                            [3, 4],
+>>>                            [0, 2]])
+>>> y = np.array([-1, 1, 1, -1])
+
+In order to fit metric learning algorithms with this type of input, we need to
+give the original dataset of points `X` to the estimator so that it knows
+the points the indices refer to. We do this when initializing the estimator,
+through the argument `preprocessor` (see below :ref:`fit_ws`)
+
+
+.. note::
+
+   Instead of an array-like, you can give a callable in the argument
+   `preprocessor`, which will go fetch and form the tuples. This allows to
+   give more general indicators than just indices from an array (for instance
+   paths in the filesystem, name of records in a database etc...) See section
+   :ref:`preprocessor_section` for more details on how to use the preprocessor.
+
+.. 
_fit_ws:
+
+Fit, transform, and so on
+-------------------------
+
+The goal of weakly-supervised metric-learning algorithms is to transform
+points in a new space, in which the tuple-wise constraints between points
+are respected.
+
+>>> from metric_learn import MMC
+>>> mmc = MMC(random_state=42)
+>>> mmc.fit(tuples, y)
+MMC(A0='deprecated', tol=0.001, diagonal=False,
+    diagonal_c=1.0, init='auto', max_iter=100, max_proj=10000,
+    preprocessor=None, random_state=42, verbose=False)
+
+Or alternatively (using a preprocessor):
+
+>>> from metric_learn import MMC
+>>> mmc = MMC(preprocessor=X, random_state=42)
+>>> mmc.fit(pairs_indices, y)
+
+
+Now that the estimator is fitted, you can use it on new data for several
+purposes.
+
+First, you can transform the data in the learned space, using `transform`:
+Here we transform two points in the new embedding space.
+
+>>> X_new = np.array([[9.4, 4.1, 4.2], [2.1, 4.4, 2.3]])
+>>> mmc.transform(X_new)
+array([[-3.24667162e+01,  4.62622348e-07,  3.88325421e-08],
+       [-3.61531114e+01,  4.86778289e-07,  2.12654397e-08]])
+
+Also, as explained before, our metric learner has learned a distance between
+points. You can use this distance in two main ways:
+
+- You can either return the distance between pairs of points using the
+  `pair_distance` function:
+
+>>> mmc.pair_distance([[[3.5, 3.6, 5.2], [5.6, 2.4, 6.7]],
+...                    [[1.2, 4.2, 7.7], [2.1, 6.4, 0.9]]])
+array([7.27607365, 0.88853014])
+
+- Or you can return a function that will return the distance
+  (in the new space) between two 1D arrays (the coordinates of the points in
+  the original space), similarly to distance functions in
+  `scipy.spatial.distance`. To do that, use the `get_metric` method.
+
+>>> metric_fun = mmc.get_metric()
+>>> metric_fun([3.5, 3.6, 5.2], [5.6, 2.4, 6.7])
+7.276073646278203
+
+- Alternatively, you can use `pair_score` to return the **score** between
+  pairs of points (the larger the score, the more similar the pair). 
+ For Mahalanobis learners, it is equal to the opposite of the distance. + +>>> score = mmc.pair_score([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]], [[3.3, 7.8], [10.9, 0.1]]]) +>>> score +array([-0.49627072, -3.65287282, -6.06079877]) + + This is useful because `pair_score` matches the **score** semantic of + scikit-learn's `Classification metrics + `_. + +.. note:: + + If the metric learner that you use learns a :ref:`Mahalanobis distance + ` (like it is the case for all algorithms + currently in metric-learn), you can get the plain Mahalanobis matrix using + `get_mahalanobis_matrix`. + +>>> mmc.get_mahalanobis_matrix() +array([[ 0.58603894, -5.69883982, -1.66614919], + [-5.69883982, 55.41743549, 16.20219519], + [-1.66614919, 16.20219519, 4.73697721]]) + +.. _sklearn_compat_ws: + +Prediction and scoring +---------------------- + +Since weakly supervised are also able, after being fitted, to predict for a +given tuple what is its label (for pairs) or ordering (for quadruplets). See +the appropriate section for more details, either :ref:`this +one ` for pairs, or :ref:`this one +` for quadruplets. + +They also implement a default scoring method, `score`, that can be +used to evaluate the performance of a metric-learner on a test dataset. See +the appropriate section for more details, either :ref:`this +one ` for pairs, or :ref:`this one ` +for quadruplets. + +Scikit-learn compatibility +-------------------------- + +Weakly supervised estimators are compatible with scikit-learn routines for +model selection (`sklearn.model_selection.cross_val_score`, +`sklearn.model_selection.GridSearchCV`, etc). 
+ +Example: + +>>> from metric_learn import MMC +>>> import numpy as np +>>> from sklearn.datasets import load_iris +>>> from sklearn.model_selection import cross_val_score +>>> rng = np.random.RandomState(42) +>>> X, _ = load_iris(return_X_y=True) +>>> # let's sample 30 random pairs and labels of pairs +>>> pairs_indices = rng.randint(X.shape[0], size=(30, 2)) +>>> y = 2 * rng.randint(2, size=30) - 1 +>>> mmc = MMC(preprocessor=X) +>>> cross_val_score(mmc, pairs_indices, y) + +.. _learning_on_pairs: + +Learning on pairs +================= + +Some metric learning algorithms learn on pairs of samples. In this case, one +should provide the algorithm with `n_samples` pairs of points, with a +corresponding target containing `n_samples` values being either +1 or -1. +These values indicate whether the given pairs are similar points or +dissimilar points. + +Fitting +------- +Here is an example for fitting on pairs (see :ref:`fit_ws` for more details on +the input data format and how to fit, in the general case of learning on +tuples). + +>>> from metric_learn import MMC +>>> pairs = np.array([[[1.2, 3.2], [2.3, 5.5]], +>>> [[4.5, 2.3], [2.1, 2.3]]]) +>>> y_pairs = np.array([1, -1]) +>>> mmc = MMC(random_state=42) +>>> mmc.fit(pairs, y_pairs) +MMC(tol=0.001, diagonal=False, + diagonal_c=1.0, init='auto', max_iter=100, max_proj=10000, preprocessor=None, + random_state=42, verbose=False) + +Here, we learned a metric that puts the two first points closer +together in the transformed space, and the two next points further away from +each other. + +.. _pairs_predicting: + +Prediction +---------- + +When a pairs learner is fitted, it is also able to predict, for an unseen +pair, whether it is a pair of similar or dissimilar points. + +>>> mmc.predict([[[0.6, 1.6], [1.15, 2.75]], +... [[3.2, 1.1], [5.4, 6.1]]]) +array([1, -1]) + +.. 
_calibration: + +Prediction threshold +^^^^^^^^^^^^^^^^^^^^ + +Predicting whether a new pair represents similar or dissimilar +samples requires to set a threshold on the learned distance, so that points +closer (in the learned space) than this threshold are predicted as similar, +and points further away are predicted as dissimilar. Several methods are +possible for this thresholding. + +- **Calibration at fit time**: The threshold is set with `calibrate_threshold` + (see below) on the training set. You can specify the calibration + parameters directly + in the `fit` method with the `threshold_params` parameter (see the + documentation of the `fit` method of any metric learner that learns on pairs + of points for more information). Note that calibrating on the training set + may cause some overfitting. If you want to avoid that, calibrate the + threshold after fitting, on a validation set. + + >>> mmc.fit(pairs, y) # will fit the threshold automatically after fitting + +- **Calibration on validation set**: calling `calibrate_threshold` will + calibrate the threshold to achieve a particular score on a validation set, + the score being among the classical scores for classification (accuracy, f1 + score...). + + >>> mmc.calibrate_threshold(pairs, y) + +- **Manual threshold**: calling `set_threshold` will set the threshold to a + particular value. + + >>> mmc.set_threshold(0.4) + +See also: `sklearn.calibration`. + +.. _pairs_scoring: + +Scoring +------- + +Pair metric learners can also return a `decision_function` for a set of pairs. +It is basically the "score" that will be thresholded to find the prediction +for the pair. This score corresponds to the opposite of the distance in the +new space (higher score means points are similar, and lower score dissimilar). + +>>> mmc.decision_function([[[0.6, 1.6], [1.15, 2.75]], +... 
[[3.2, 1.1], [5.4, 6.1]]]) +array([-0.12811124, -0.74750256]) + +This allows to use common scoring functions for binary classification, like +`sklearn.metrics.accuracy_score` for instance, which +can be used inside cross-validation routines: + +>>> from sklearn.model_selection import cross_val_score +>>> pairs_test = np.array([[[0.6, 1.6], [1.15, 2.75]], +... [[3.2, 1.1], [5.4, 6.1]], +... [[7.7, 5.6], [1.23, 8.4]]]) +>>> y_test = np.array([-1., 1., -1.]) +>>> cross_val_score(mmc, pairs_test, y_test, scoring='accuracy') +array([1., 0., 1.]) + +Pairs learners also have a default score, which basically +returns the `sklearn.metrics.roc_auc_score` (which is threshold-independent). + +>>> pairs_test = np.array([[[0.6, 1.6], [1.15, 2.75]], +... [[3.2, 1.1], [5.4, 6.1]], +... [[7.7, 5.6], [1.23, 8.4]]]) +>>> y_test = np.array([1., -1., -1.]) +>>> mmc.score(pairs_test, y_test) +1.0 + +.. note:: + See :ref:`fit_ws` for more details on metric learners functions that are + not specific to learning on pairs, like `transform`, `pair_distance`, + `pair_score`, `get_metric` and `get_mahalanobis_matrix`. + +Algorithms +---------- + +.. _itml: + +:py:class:`ITML ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Information Theoretic Metric Learning (:py:class:`ITML `) + +`ITML` minimizes the (differential) relative entropy, aka Kullback–Leibler +divergence, between two multivariate Gaussians subject to constraints on the +associated Mahalanobis distance, which can be formulated into a Bregman +optimization problem by minimizing the LogDet divergence subject to +linear constraints. This algorithm can handle a wide variety of constraints +and can optionally incorporate a prior on the distance function. Unlike some +other methods, `ITML` does not rely on an eigenvalue computation or +semi-definite programming. + + +Given a Mahalanobis distance parameterized by :math:`M`, its corresponding +multivariate Gaussian is denoted as: + +.. 
math:: + p(\mathbf{x}; \mathbf{M}) = \frac{1}{Z}\exp(-\frac{1}{2}d_\mathbf{M} + (\mathbf{x}, \mu)) + = \frac{1}{Z}\exp(-\frac{1}{2}((\mathbf{x} - \mu)^T\mathbf{M} + (\mathbf{x} - \mu)) + +where :math:`Z` is the normalization constant, the inverse of Mahalanobis +matrix :math:`\mathbf{M}^{-1}` is the covariance of the Gaussian. + +Given pairs of similar points :math:`S` and pairs of dissimilar points +:math:`D`, the distance metric learning problem is to minimize the LogDet +divergence, which is equivalent as minimizing :math:`\textbf{KL}(p(\mathbf{x}; +\mathbf{M}_0) || p(\mathbf{x}; \mathbf{M}))`: + +.. math:: + + \min_\mathbf{A} D_{\ell \mathrm{d}}\left(M, M_{0}\right) = + \operatorname{tr}\left(M M_{0}^{-1}\right)-\log \operatorname{det} + \left(M M_{0}^{-1}\right)-n\\ + \text{subject to } \quad d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j) + \leq u \qquad (\mathbf{x}_i, \mathbf{x}_j)\in S \\ + d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j) \geq l \qquad (\mathbf{x}_i, + \mathbf{x}_j)\in D + + +where :math:`u` and :math:`l` is the upper and the lower bound of distance +for similar and dissimilar pairs respectively, and :math:`\mathbf{M}_0` +is the prior distance metric, set to identity matrix by default, +:math:`D_{\ell \mathrm{d}}(\cdot)` is the log determinant. + +.. rubric:: Example Code + +:: + + from metric_learn import ITML + + pairs = [[[1.2, 7.5], [1.3, 1.5]], + [[6.4, 2.6], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6]], + [[6.2, 5.5], [5.4, 5.4]]] + y = [1, 1, -1, -1] + + # in this task we want points where the first feature is close to be closer + # to each other, no matter how close the second feature is + + + itml = ITML() + itml.fit(pairs, y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Jason V. Davis, et al. `Information-theoretic Metric Learning `_. ICML 2007. + + [2]. Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/itml/ . + + +.. 
_sdml: + +:py:class:`SDML ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sparse High-Dimensional Metric Learning +(:py:class:`SDML `) + +`SDML` is an efficient sparse metric learning in high-dimensional space via +double regularization: an L1-penalization on the off-diagonal elements of the +Mahalanobis matrix :math:`\mathbf{M}`, and a log-determinant divergence between +:math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either :math:`\mathbf{I}` +or :math:`\mathbf{\Omega}^{-1}`, where :math:`\mathbf{\Omega}` is the +covariance matrix). + +The formulated optimization on the semidefinite matrix :math:`\mathbf{M}` +is convex: + +.. math:: + + \min_{\mathbf{M}} = \text{tr}((\mathbf{M}_0 + \eta \mathbf{XLX}^{T}) + \cdot \mathbf{M}) - \log\det \mathbf{M} + \lambda ||\mathbf{M}||_{1, off} + +where :math:`\mathbf{X}=[\mathbf{x}_1, \mathbf{x}_2, ..., \mathbf{x}_n]` is +the training data, the incidence matrix :math:`\mathbf{K}_{ij} = 1` if +:math:`(\mathbf{x}_i, \mathbf{x}_j)` is a similar pair, otherwise -1. The +Laplacian matrix :math:`\mathbf{L}=\mathbf{D}-\mathbf{K}` is calculated from +:math:`\mathbf{K}` and :math:`\mathbf{D}`, a diagonal matrix whose entries are +the sums of the row elements of :math:`\mathbf{K}`., :math:`||\cdot||_{1, off}` +is the off-diagonal L1 norm. + + +.. rubric:: Example Code + +:: + + from metric_learn import SDML + + pairs = [[[1.2, 7.5], [1.3, 1.5]], + [[6.4, 2.6], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6]], + [[6.2, 5.5], [5.4, 5.4]]] + y = [1, 1, -1, -1] + + # in this task we want points where the first feature is close to be closer + # to each other, no matter how close the second feature is + + sdml = SDML() + sdml.fit(pairs, y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Qi et al. `An efficient sparse metric learning in high-dimensional space via L1-penalized log-determinant regularization `_. ICML 2009. + + [2]. Code adapted from https://gist.github.com/kcarnold/5439945 . + +.. 
_rca: + +:py:class:`RCA ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Relative Components Analysis (:py:class:`RCA `) + +`RCA` learns a full rank Mahalanobis distance metric based on a weighted sum of +in-chunklets covariance matrices. It applies a global linear transformation to +assign large weights to relevant dimensions and low weights to irrelevant +dimensions. Those relevant dimensions are estimated using "chunklets", subsets +of points that are known to belong to the same class. + +For a training set with :math:`n` training points in :math:`k` chunklets, the +algorithm is efficient since it simply amounts to computing + +.. math:: + + \mathbf{C} = \frac{1}{n}\sum_{j=1}^k\sum_{i=1}^{n_j} + (\mathbf{x}_{ji}-\hat{\mathbf{m}}_j) + (\mathbf{x}_{ji}-\hat{\mathbf{m}}_j)^T + + +where chunklet :math:`j` consists of :math:`\{\mathbf{x}_{ji}\}_{i=1}^{n_j}` +with a mean :math:`\hat{m}_j`. The inverse of :math:`\mathbf{C}^{-1}` is used +as the Mahalanobis matrix. + +.. rubric:: Example Code + +:: + + from metric_learn import RCA + + X = [[-0.05, 3.0],[0.05, -3.0], + [0.1, -3.55],[-0.1, 3.55], + [-0.95, -0.05],[0.95, 0.05], + [0.4, 0.05],[-0.4, -0.05]] + chunks = [0, 0, 1, 1, 2, 2, 3, 3] + + rca = RCA() + rca.fit(X, chunks) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Shental et al. `Adjustment learning and relevant component analysis `_. ECCV 2002. + + [2]. Bar-Hillel et al. `Learning distance functions using equivalence relations `_. ICML 2003. + + [3]. Bar-Hillel et al. `Learning a Mahalanobis metric from equivalence constraints `_. JMLR 2005. + +.. _mmc: + +:py:class:`MMC ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Metric Learning with Application for Clustering with Side Information +(:py:class:`MMC `) + +`MMC` minimizes the sum of squared distances between similar points, while +enforcing the sum of distances between dissimilar ones to be greater than one. 
+This leads to a convex and, thus, local-minima-free optimization problem that +can be solved efficiently. +However, the algorithm involves the computation of eigenvalues, which is the +main speed-bottleneck. Since it has initially been designed for clustering +applications, one of the implicit assumptions of MMC is that all classes form +a compact set, i.e., follow a unimodal distribution, which restricts the +possible use-cases of this method. However, it is one of the earliest and a +still often cited technique. + +The algorithm aims at minimizing the sum of distances between all the similar +points, while constrains the sum of distances between dissimilar points: + +.. math:: + + \min_{\mathbf{M}\in\mathbb{S}_+^d}\sum_{(\mathbf{x}_i, + \mathbf{x}_j)\in S} d_{\mathbf{M}}(\mathbf{x}_i, \mathbf{x}_j) + \qquad \qquad \text{s.t.} \qquad \sum_{(\mathbf{x}_i, \mathbf{x}_j) + \in D} d^2_{\mathbf{M}}(\mathbf{x}_i, \mathbf{x}_j) \geq 1 + +.. rubric:: Example Code + +:: + + from metric_learn import MMC + + pairs = [[[1.2, 7.5], [1.3, 1.5]], + [[6.4, 2.6], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6]], + [[6.2, 5.5], [5.4, 5.4]]] + y = [1, 1, -1, -1] + + # in this task we want points where the first feature is close to be closer + # to each other, no matter how close the second feature is + + mmc = MMC() + mmc.fit(pairs, y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Xing et al. `Distance metric learning with application to clustering with side-information `_. NIPS 2002. + + [2]. Adapted from Matlab code http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz . + +.. _learning_on_triplets: + +Learning on triplets +==================== + +Some metric learning algorithms learn on triplets of samples. In this case, +one should provide the algorithm with `n_samples` triplets of points. The +semantic of each triplet is that the first point should be closer to the +second point than to the third one. 
+
+Fitting
+-------
+Here is an example for fitting on triplets (see :ref:`fit_ws` for more
+details on the input data format and how to fit, in the general case of
+learning on tuples).
+
+>>> from metric_learn import SCML
+>>> triplets = np.array([[[1.2, 3.2], [2.3, 5.5], [2.1, 0.6]],
+>>>                      [[4.5, 2.3], [2.1, 2.3], [7.3, 3.4]]])
+>>> scml = SCML(random_state=42)
+>>> scml.fit(triplets)
+SCML(beta=1e-5, B=None, max_iter=100000, verbose=False,
+     preprocessor=None, random_state=None)
+
+Or alternatively (using a preprocessor):
+
+>>> X = np.array([[1.2, 3.2],
+>>>               [2.3, 5.5],
+>>>               [2.1, 0.6],
+>>>               [4.5, 2.3],
+>>>               [2.1, 2.3],
+>>>               [7.3, 3.4]])
+>>> triplets_indices = np.array([[0, 1, 2], [3, 4, 5]])
+>>> scml = SCML(preprocessor=X, random_state=42)
+>>> scml.fit(triplets_indices)
+SCML(beta=1e-5, B=None, max_iter=100000, verbose=False,
+     preprocessor=array([[1.2, 3.2],
+       [2.3, 5.5],
+       [2.1, 0.6],
+       [4.5, 2.3],
+       [2.1, 2.3],
+       [7.3, 3.4]]),
+     random_state=None)
+
+
+Here, we want to learn a metric that, for each of the two
+`triplets`, will make the first point closer to the
+second point than to the third one.
+
+.. _triplets_predicting:
+
+Prediction
+----------
+
+When a triplets learner is fitted, it is also able to predict, for an
+upcoming triplet, whether the first point is closer to the second point
+than to the third one (+1), or not (-1).
+
+>>> triplets_test = np.array(
+... [[[5.6, 5.3], [2.2, 2.1], [1.2, 3.4]],
+...  [[6.0, 4.2], [4.3, 1.2], [0.1, 7.8]]])
+>>> scml.predict(triplets_test)
+array([-1.,  1.])
+
+.. _triplets_scoring:
+
+Scoring
+-------
+
+Triplet metric learners can also return a `decision_function` for a set of triplets,
+which corresponds to the distance between the first two points minus the distance
+between the first and last points of the triplet (the higher the value, the more
+similar the first point to the second point compared to the last one). 
This "score" +can be interpreted as a measure of likeliness of having a +1 prediction for this +triplet. + +>>> scml.decision_function(triplets_test) +array([-1.75700306, 4.98982131]) + +In the above example, for the first triplet in `triplets_test`, the first +point is predicted less similar to the second point than to the last point +(they are further away in the transformed space). + +Unlike pairs learners, triplets learners do not allow to give a `y` when fitting: we +assume that the ordering of points within triplets is such that the training triplets +are all positive. Therefore, it is not possible to use scikit-learn scoring functions +(such as 'f1_score') for triplets learners. + +However, triplets learners do have a default scoring function, which will +basically return the accuracy score on a given test set, i.e. the proportion +of triplets that have the right predicted ordering. + +>>> scml.score(triplets_test) +0.5 + +.. note:: + See :ref:`fit_ws` for more details on metric learners functions that are + not specific to learning on pairs, like `transform`, `pair_distance`, + `pair_score`, `get_metric` and `get_mahalanobis_matrix`. + + + + +Algorithms +---------- + +.. _scml: + +:py:class:`SCML ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sparse Compositional Metric Learning +(:py:class:`SCML `) + +`SCML` learns a squared Mahalanobis distance from triplet constraints by +optimizing sparse positive weights assigned to a set of :math:`K` rank-one +PSD bases. This can be formulated as an optimization problem with only +:math:`K` parameters, that can be solved with an efficient stochastic +composite scheme. + +The Mahalanobis matrix :math:`M` is built from a basis set :math:`B = \{b_i\}_{i=\{1,...,K\}}` +weighted by a :math:`K` dimensional vector :math:`w = \{w_i\}_{i=\{1,...,K\}}` as: + +.. 
math:: + + M = \sum_{i=1}^K w_i b_i b_i^T = B \cdot diag(w) \cdot B^T \quad w_i \geq 0 + +Learning :math:`M` in this form makes it PSD by design, as it is a +nonnegative sum of PSD matrices. The basis set :math:`B` is fixed in advance +and it is possible to construct it from the data. The optimization problem +over :math:`w` is formulated as a classic margin-based hinge loss function +involving the set :math:`C` of triplets. A regularization :math:`\ell_1` +is added to yield a sparse combination. The formulation is the following: + +.. math:: + + \min_{w\geq 0} \sum_{(x_i,x_j,x_k)\in C} [1 + d_w(x_i,x_j)-d_w(x_i,x_k)]_+ + \beta||w||_1 + +where :math:`[\cdot]_+` is the hinge loss. + +.. rubric:: Example Code + +:: + + from metric_learn import SCML + + triplets = [[[1.2, 7.5], [1.3, 1.5], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6], [5.4, 5.4]], + [[3.2, 7.5], [3.3, 1.5], [8.2, 9.7]], + [[3.3, 4.5], [5.2, 4.6], [7.4, 5.4]]] + + scml = SCML() + scml.fit(triplets) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. `_. (AAAI), 2014. + + [2]. Adapted from original `Matlab implementation. `_. + + +.. _learning_on_quadruplets: + +Learning on quadruplets +======================= + +Some metric learning algorithms learn on quadruplets of samples. In this case, +one should provide the algorithm with `n_samples` quadruplets of points. The +semantic of each quadruplet is that the first two points should be closer +together than the last two points. + +Fitting +------- +Here is an example for fitting on quadruplets (see :ref:`fit_ws` for more +details on the input data format and how to fit, in the general case of +learning on tuples). 
+ +>>> from metric_learn import LSML +>>> quadruplets = np.array([[[1.2, 3.2], [2.3, 5.5], [2.4, 6.7], [2.1, 0.6]], +>>> [[4.5, 2.3], [2.1, 2.3], [0.6, 1.2], [7.3, 3.4]]]) +>>> lsml = LSML(random_state=42) +>>> lsml.fit(quadruplets) +LSML(max_iter=1000, preprocessor=None, prior=None, random_state=42, tol=0.001, + verbose=False) + +Or alternatively (using a preprocessor): + +>>> X = np.array([[1.2, 3.2], +>>> [2.3, 5.5], +>>> [2.4, 6.7], +>>> [2.1, 0.6], +>>> [4.5, 2.3], +>>> [2.1, 2.3], +>>> [0.6, 1.2], +>>> [7.3, 3.4]]) +>>> quadruplets_indices = np.array([[0, 1, 2, 3], [4, 5, 6, 7]]) +>>> lsml = LSML(preprocessor=X, random_state=42) +>>> lsml.fit(quadruplets_indices) +LSML(max_iter=1000, + preprocessor=array([[1.2, 3.2], + [2.3, 5.5], + [2.4, 6.7], + [2.1, 0.6], + [4.5, 2.3], + [2.1, 2.3], + [0.6, 1.2], + [7.3, 3.4]]), + prior=None, random_state=42, tol=0.001, verbose=False) + + +Here, we want to learn a metric that, for each of the two +`quadruplets`, will put the two first points closer together than the two +last points. + +.. _quadruplets_predicting: + +Prediction +---------- + +When a quadruplets learner is fitted, it is also able to predict, for an +upcoming quadruplet, whether the two first points are more similar than the +two last points (+1), or not (-1). + +>>> quadruplets_test = np.array( +... [[[5.6, 5.3], [2.2, 2.1], [0.4, 0.6], [1.2, 3.4]], +... [[6.0, 4.2], [4.3, 1.2], [4.5, 0.6], [0.1, 7.8]]]) +>>> lsml.predict(quadruplets_test) +array([-1., 1.]) + +.. _quadruplets_scoring: + +Scoring +------- + +Quadruplet metric learners can also return a `decision_function` for a set of +quadruplets, which corresponds to the distance between the first pair of points minus +the distance between the second pair of points of the triplet (the higher the value, +the more similar the first pair is than the last pair). +This "score" can be interpreted as a measure of likeliness of having a +1 prediction +for this quadruplet. 
+
+>>> lsml.decision_function(quadruplets_test)
+array([-1.75700306,  4.98982131])
+
+In the above example, for the first quadruplet in `quadruplets_test`, the
+two first points are predicted less similar than the two last points (they
+are further away in the transformed space).
+
+Like triplet learners, quadruplets learners do not allow to give a `y` when fitting: we
+assume that the ordering of points within quadruplets is such that the training
+quadruplets are all positive. Therefore, it is not possible to use scikit-learn
+scoring functions (such as 'f1_score') for quadruplets learners.
+
+However, quadruplets learners do have a default scoring function, which will
+basically return the accuracy score on a given test set, i.e. the proportion
+of quadruplets that have the right predicted ordering.
+
+>>> lsml.score(quadruplets_test)
+0.5
+
+.. note::
+   See :ref:`fit_ws` for more details on metric learners functions that are
+   not specific to learning on pairs, like `transform`, `pair_distance`,
+   `pair_score`, `get_metric` and `get_mahalanobis_matrix`.
+
+
+
+
+Algorithms
+----------
+
+.. _lsml:
+
+:py:class:`LSML <metric_learn.LSML>`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Metric Learning from Relative Comparisons by Minimizing Squared Residual
+(:py:class:`LSML <metric_learn.LSML>`)
+
+`LSML` proposes a simple, yet effective, algorithm that minimizes a convex
+objective function corresponding to the sum of squared residuals of
+constraints. This algorithm uses the constraints in the form of the
+relative distance comparisons, such method is especially useful where
+pairwise constraints are not natural to obtain, thus pairwise constraints
+based algorithms become infeasible to be deployed. Furthermore, its sparsity
+extension leads to more stable estimation when the dimension is high and
+only a small amount of constraints is given.
+
+The loss function of each constraint
+:math:`d(\mathbf{x}_i, \mathbf{x}_j) < d(\mathbf{x}_k, \mathbf{x}_l)` is
+denoted as:
+
+.. 
math:: + + H(d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j) + - d_\mathbf{M}(\mathbf{x}_k, \mathbf{x}_l)) + +where :math:`H(\cdot)` is the squared Hinge loss function defined as: + +.. math:: + + H(x) = \left\{\begin{aligned}0 \qquad x\leq 0 \\ + \,\,x^2 \qquad x>0\end{aligned}\right.\\ + +The summed loss function :math:`L(C)` is the simple sum over all constraints +:math:`C = \{(\mathbf{x}_i , \mathbf{x}_j , \mathbf{x}_k , \mathbf{x}_l) +: d(\mathbf{x}_i , \mathbf{x}_j) < d(\mathbf{x}_k , \mathbf{x}_l)\}`. The +original paper suggested here should be a weighted sum since the confidence +or probability of each constraint might differ. However, for the sake of +simplicity and assumption of no extra knowledge provided, we just deploy +the simple sum here as well as what the authors did in the experiments. + +The distance metric learning problem becomes minimizing the summed loss +function of all constraints plus a regularization term w.r.t. the prior +knowledge: + +.. math:: + + \min_\mathbf{M}(D_{ld}(\mathbf{M, M_0}) + \sum_{(\mathbf{x}_i, + \mathbf{x}_j, \mathbf{x}_k, \mathbf{x}_l)\in C}H(d_\mathbf{M}( + \mathbf{x}_i, \mathbf{x}_j) - d_\mathbf{M}(\mathbf{x}_k, \mathbf{x}_l))\\ + +where :math:`\mathbf{M}_0` is the prior metric matrix, set as identity +by default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence: + +.. math:: + + D_{ld}(\mathbf{M, M_0}) = \text{tr}(\mathbf{MM_0}) − \text{logdet} + (\mathbf{M}) + +.. rubric:: Example Code + +:: + + from metric_learn import LSML + + quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]], + [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]], + [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]] + + # we want to make closer points where the first feature is close, and + # further if the second feature is close + + lsml = LSML() + lsml.fit(quadruplets) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Liu et al. 
`Metric Learning from Relative Comparisons by Minimizing Squared Residual `_. ICDM 2012. + + [2]. Code adapted from https://gist.github.com/kcarnold/5439917 . + + diff --git a/examples/README.txt b/examples/README.txt new file mode 100644 index 00000000..10dbe0d5 --- /dev/null +++ b/examples/README.txt @@ -0,0 +1,4 @@ +Examples +======== + +Below is a gallery of example metric-learn use cases. \ No newline at end of file diff --git a/examples/plot_metric_learning_examples.py b/examples/plot_metric_learning_examples.py new file mode 100644 index 00000000..32759636 --- /dev/null +++ b/examples/plot_metric_learning_examples.py @@ -0,0 +1,495 @@ +""" +Algorithms walkthrough +~~~~~~~~~~~~~~~~~~~~~~ + +This is a small walkthrough which illustrates most of the Metric Learning +algorithms implemented in metric-learn by using them on synthetic data, +with some visualizations to provide intuitions into what they are designed +to achieve. +""" + +# License: BSD 3 clause +# Authors: Bhargav Srinivasa Desikan +# William de Vazelhes + +###################################################################### +# Imports +# ^^^^^^^ +# .. note:: +# +# In order to show the charts of the examples you need a graphical +# ``matplotlib`` backend installed. For intance, use ``pip install pyqt5`` +# to get Qt graphical interface or use your favorite one. + +from sklearn.manifold import TSNE + +import metric_learn +import numpy as np +from sklearn.datasets import make_classification, make_regression + +# visualisation imports +import matplotlib.pyplot as plt +np.random.seed(42) + + +###################################################################### +# Loading our dataset and setting up plotting +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# We will be using a synthetic dataset to illustrate the plotting, +# using the function `sklearn.datasets.make_classification` from +# scikit-learn. 
The dataset will contain: +# - 100 points in 3 classes with 2 clusters per class +# - 5 features, among which 3 are informative (correlated with the class +# labels) and two are random noise with large magnitude + +X, y = make_classification(n_samples=100, n_classes=3, n_clusters_per_class=2, + n_informative=3, class_sep=4., n_features=5, + n_redundant=0, shuffle=True, + scale=[1, 1, 20, 20, 20]) + +########################################################################### +# Note that the dimensionality of the data is 5, so to plot the +# transformed data in 2D, we will use the t-sne algorithm. (See +# `sklearn.manifold.TSNE`). + + +def plot_tsne(X, y, colormap=plt.cm.Paired): + plt.figure(figsize=(8, 6)) + + # clean the figure + plt.clf() + + tsne = TSNE() + X_embedded = tsne.fit_transform(X) + plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, cmap=colormap) + + plt.xticks(()) + plt.yticks(()) + + plt.show() + +################################### +# Let's now plot the dataset as is. + + +plot_tsne(X, y) + +######################################################################### +# We can see that the classes appear mixed up: this is because t-sne +# is based on preserving the original neighborhood of points in the embedding +# space, but this original neighborhood is based on the euclidean +# distance in the input space, in which the contribution of the noisy +# features is high. So even if points from the same class are close to each +# other in some subspace of the input space, this is not the case when +# considering all dimensions of the input space. +# +# Metric Learning +# ^^^^^^^^^^^^^^^ +# +# Why is Metric Learning useful? We can, with prior knowledge of which +# points are supposed to be closer, figure out a better way to compute +# distances between points for the task at hand. Especially in higher +# dimensions when Euclidean distances are a poor way to measure distance, this +# becomes very useful. 
+# +# Basically, we learn this distance: +# :math:`D(x, x') = \sqrt{(x-x')^\top M(x-x')}`. And we learn the parameters +# :math:`M` of this distance to satisfy certain constraints on the distance +# between points, for example requiring that points of the same class are +# close together and points of different class are far away. +# +# For more information, check the :ref:`intro_metric_learning` section +# from the documentation. Some good reading material can also be found +# `here `__. It serves as a +# good literature review of Metric Learning. +# +# We will briefly explain the metric learning algorithms implemented by +# metric-learn, before providing some examples for its usage, and also +# discuss how to perform metric learning with weaker supervision than class +# labels. +# +# Metric-learn can be easily integrated with your other machine learning +# pipelines, and follows scikit-learn conventions. +# + + +###################################################################### +# Large Margin Nearest Neighbour +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# LMNN is a metric learning algorithm primarily designed for k-nearest +# neighbor classification. The algorithm is based on semidefinite +# programming, a sub-class of convex programming (as most Metric Learning +# algorithms are). +# +# The main intuition behind LMNN is to learn a pseudometric under which +# all data instances in the training set are surrounded by at least k +# instances that share the same class label. If this is achieved, the +# leave-one-out error (a special case of cross validation) is minimized. +# You'll notice that the points from the same labels are closer together, +# but they are not necessary in a same cluster. This is particular to LMNN +# and we'll see that some other algorithms implicitly enforce points from +# the same class to cluster together. 
+# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`LMNN +# ` + + +###################################################################### +# Fit and then transform! +# ----------------------- +# + +# setting up LMNN +lmnn = metric_learn.LMNN(n_neighbors=5, learn_rate=1e-6) + +# fit the data! +lmnn.fit(X, y) + +# transform our input space +X_lmnn = lmnn.transform(X) + + +###################################################################### +# So what have we learned? The matrix :math:`M` we talked about before. + + +###################################################################### +# Now let us plot the transformed space - this tells us what the original +# space looks like after being transformed with the new learned metric. +# + +plot_tsne(X_lmnn, y) + + +###################################################################### +# Pretty neat, huh? +# +# The rest of this notebook will briefly explain the other Metric Learning +# algorithms before plotting them. Also, while we have first run ``fit`` +# and then ``transform`` to see our data transformed, we can also use +# ``fit_transform``. The rest of the examples and illustrations will use +# ``fit_transform``. + +###################################################################### +# Information Theoretic Metric Learning +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# ITML uses a regularizer that automatically enforces a Semi-Definite +# Positive Matrix condition - the LogDet divergence. It uses soft +# must-link or cannot-link constraints, and a simple algorithm based on +# Bregman projections. Unlike LMNN, ITML will implicitly enforce points from +# the same class to belong to the same cluster, as you can see below. 
+# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`ITML +# ` + +itml = metric_learn.ITML_Supervised() +X_itml = itml.fit_transform(X, y) + +plot_tsne(X_itml, y) + + +###################################################################### +# Mahalanobis Metric for Clustering +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# MMC is an algorithm that will try to minimize the distance between similar +# points, while ensuring that the sum of distances between dissimilar points is +# higher than a threshold. This is done by optimizing a cost function +# subject to an inequality constraint. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`MMC +# ` + +mmc = metric_learn.MMC_Supervised() +X_mmc = mmc.fit_transform(X, y) + +plot_tsne(X_mmc, y) + +###################################################################### +# Sparse Determinant Metric Learning +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Implements an efficient sparse metric learning algorithm in high +# dimensional space via an :math:`l_1`-penalized log-determinant +# regularization. Compared to the most existing distance metric learning +# algorithms, the algorithm exploits the sparsity nature underlying the +# intrinsic high dimensional feature space. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`SDML +# ` + +sdml = metric_learn.SDML_Supervised(sparsity_param=0.1, balance_param=0.0015, + prior='covariance') +X_sdml = sdml.fit_transform(X, y) + +plot_tsne(X_sdml, y) + + +###################################################################### +# Least Squares Metric Learning +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# LSML is a simple, yet effective, algorithm that learns a Mahalanobis +# metric from a given set of relative comparisons. 
This is done by +# formulating and minimizing a convex loss function that corresponds to +# the sum of squared hinge loss of violated constraints. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`LSML +# ` + +lsml = metric_learn.LSML_Supervised(tol=0.0001, max_iter=10000, + prior='covariance') +X_lsml = lsml.fit_transform(X, y) + +plot_tsne(X_lsml, y) + + +###################################################################### +# Neighborhood Components Analysis +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# NCA is an extremly popular metric learning algorithm. +# +# Neighborhood components analysis aims at "learning" a distance metric +# by finding a linear transformation of input data such that the average +# leave-one-out (LOO) classification performance of a soft-nearest +# neighbors rule is maximized in the transformed space. The key insight to +# the algorithm is that a matrix :math:`A` corresponding to the +# transformation can be found by defining a differentiable objective function +# for :math:`A`, followed by use of an iterative solver such as +# `scipy.optimize.fmin_l_bfgs_b`. Like LMNN, this algorithm does not try to +# cluster points from the same class in a unique cluster, because it +# enforces conditions at a local neighborhood scale. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`NCA +# ` + +nca = metric_learn.NCA(max_iter=1000) +X_nca = nca.fit_transform(X, y) + +plot_tsne(X_nca, y) + +###################################################################### +# Local Fisher Discriminant Analysis +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# LFDA is a linear supervised dimensionality reduction method. It is +# particularly useful when dealing with multimodality, where one ore more +# classes consist of separate clusters in input space. The core +# optimization problem of LFDA is solved as a generalized eigenvalue +# problem. 
Like LMNN, and NCA, this algorithm does not try to cluster points +# from the same class in a unique cluster. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`LFDA +# ` + +lfda = metric_learn.LFDA(k=2, n_components=2) +X_lfda = lfda.fit_transform(X, y) + +plot_tsne(X_lfda, y) + + +###################################################################### +# Relative Components Analysis +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# RCA is another one of the older algorithms. It learns a full rank +# Mahalanobis distance metric based on a weighted sum of in-class +# covariance matrices. It applies a global linear transformation to assign +# large weights to relevant dimensions and low weights to irrelevant +# dimensions. Those relevant dimensions are estimated using "chunklets", +# subsets of points that are known to belong to the same class. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`RCA +# ` + +rca = metric_learn.RCA_Supervised(n_chunks=30, chunk_size=2) +X_rca = rca.fit_transform(X, y) + +plot_tsne(X_rca, y) + +###################################################################### +# Regression example: Metric Learning for Kernel Regression +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# The previous algorithms took as input a dataset with class labels. Metric +# learning can also be useful for regression, when the labels are real numbers. +# An algorithm very similar to NCA but for regression is Metric +# Learning for Kernel Regression (MLKR). It will optimize for the average +# leave-one-out *regression* performance from a soft-nearest neighbors +# regression. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`MLKR +# ` +# +# To illustrate MLKR, let's use the dataset +# `sklearn.datasets.make_regression` the same way as we did with the +# classification before. 
The dataset will contain: 100 points of 5 features +# each, among which 3 are informative (i.e., used to generate the +# regression target from a linear model), and two are random noise with the +# same magnitude. + +X_reg, y_reg = make_regression(n_samples=100, n_informative=3, n_features=5, + shuffle=True) + +###################################################################### +# Let's plot the dataset as is + +plot_tsne(X_reg, y_reg, plt.cm.Oranges) + +###################################################################### +# And let's plot the dataset after transformation by MLKR: +mlkr = metric_learn.MLKR() +X_mlkr = mlkr.fit_transform(X_reg, y_reg) +plot_tsne(X_mlkr, y_reg, plt.cm.Oranges) + +###################################################################### +# Points that have the same value to regress are now closer to each +# other ! This would improve the performance of +# `sklearn.neighbors.KNeighborsRegressor` for instance. + + +###################################################################### +# Metric Learning from Weaker Supervision +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# To learn the metric, so far we have always given the labels of the +# data to supervise the algorithms. However, in many applications, +# it is easier to obtain information about whether two samples are +# similar or dissimilar. For instance, when annotating a dataset of face +# images, it is easier for an annotator to tell if two faces belong to the same +# person or not, rather than finding the ID of the face among a huge database +# of every person's faces. +# Note that for some problems (e.g., in information +# retrieval where the goal is to rank documents by similarity to a query +# document), there is no notion of individual label but one can gather +# information on which pairs of points are similar or dissimilar. +# Fortunately, one of the strength of metric learning is the ability to +# learn from such weaker supervision. 
Indeed, some of the algorithms we've +# used above have alternate ways to pass some supervision about the metric +# we want to learn. The way to go is to pass a 2D array `pairs` of pairs, +# as well as an array of labels `pairs_labels` such that for each `i` between +# `0` and `n_pairs` we want `X[pairs[i, 0], :]` and `X[pairs[i, 1], :]` to be +# similar if `pairs_labels[i] == 1`, and we want them to be dissimilar if +# `pairs_labels[i] == -1`. In other words, we +# want to enforce a metric that projects similar points closer together and +# dissimilar points further away from each other. This kind of input is +# possible for ITML, SDML, and MMC. See :ref:`weakly_supervised_section` for +# details on other kinds of weak supervision that some algorithms can work +# with. +# +# For the purpose of this example, we're going to explicitly create these +# pairwise constraints through the labels we have, i.e. `y`. +# Do keep in mind that we are doing this method because we know the labels +# - we can actually create the constraints any way we want to depending on +# the data! +# +# Note that this is what metric-learn did under the hood in the previous +# examples (do check out the +# `constraints` module!) - but we'll try our own version of this. We're +# going to go ahead and assume that two points labeled the same will be +# closer than two points in different labels. + + +def create_constraints(labels): + import itertools + import random + + # aggregate indices of same class + zeros = np.where(y == 0)[0] + ones = np.where(y == 1)[0] + twos = np.where(y == 2)[0] + # make permutations of all those points in the same class + zeros_ = list(itertools.combinations(zeros, 2)) + ones_ = list(itertools.combinations(ones, 2)) + twos_ = list(itertools.combinations(twos, 2)) + # put them together! 
+ sim = np.array(zeros_ + ones_ + twos_) + + # similarily, put together indices in different classes + dis = [] + for zero in zeros: + for one in ones: + dis.append((zero, one)) + for two in twos: + dis.append((zero, two)) + for one in ones: + for two in twos: + dis.append((one, two)) + + # pick up just enough dissimilar examples as we have similar examples + dis = np.array(random.sample(dis, len(sim))) + + # return an array of pairs of indices of shape=(2*len(sim), 2), and the + # corresponding labels, array of shape=(2*len(sim)) + # Each pair of similar points have a label of +1 and each pair of + # dissimilar points have a label of -1 + return (np.vstack([np.column_stack([sim[:, 0], sim[:, 1]]), + np.column_stack([dis[:, 0], dis[:, 1]])]), + np.concatenate([np.ones(len(sim)), -np.ones(len(sim))])) + + +pairs, pairs_labels = create_constraints(y) + + +###################################################################### +# Now that we've created our constraints, let's see what it looks like! +# + +print(pairs) +print(pairs_labels) + + +###################################################################### +# Using our constraints, let's now train ITML again. Note that we are no +# longer calling the supervised class :py:class:`ITML_Supervised +# ` but the more generic +# (weakly-supervised) :py:class:`ITML `, which +# takes the dataset `X` through the `preprocessor` argument (see +# :ref:`this section ` of the documentation to learn +# about more advanced uses of `preprocessor`) and the pair information `pairs` +# and `pairs_labels` in the fit method. + +itml = metric_learn.ITML(preprocessor=X) +itml.fit(pairs, pairs_labels) + +X_itml = itml.transform(X) + +plot_tsne(X_itml, y) + + +###################################################################### +# And that's the result of ITML after being trained on our manually +# constructed constraints! A bit different from our old result, but not too +# different. 
+# +# RCA and LSML also have their own specific ways of taking in inputs - +# it's worth one's while to poke around in the constraints.py file to see +# how exactly this is going on. +# +# Finally, one of the main advantages of metric-learn is its out-of-the box +# compatibility with scikit-learn, for doing `model selection +# `__, +# cross-validation, and scoring for instance. Indeed, supervised algorithms are +# regular `sklearn.base.TransformerMixin` that can be plugged into any +# pipeline or cross-validation procedure. And weakly-supervised estimators are +# also compatible with scikit-learn, since their input dataset format described +# above allows to be sliced along the first dimension when doing +# cross-validations (see also this :ref:`section `). You +# can also look at some :ref:`use cases ` where you could combine +# metric-learn with scikit-learn estimators. + +######################################################################## +# This brings us to the end of this tutorial! Have fun Metric Learning :) diff --git a/examples/sandwich.py b/examples/plot_sandwich.py similarity index 74% rename from examples/sandwich.py rename to examples/plot_sandwich.py index 34b48a00..740852be 100644 --- a/examples/sandwich.py +++ b/examples/plot_sandwich.py @@ -1,13 +1,25 @@ +# -*- coding: utf-8 -*- """ +Sandwich demo +============= + Sandwich demo based on code from http://nbviewer.ipython.org/6576096 """ +###################################################################### +# .. note:: +# +# In order to show the charts of the examples you need a graphical +# ``matplotlib`` backend installed. For intance, use ``pip install pyqt5`` +# to get Qt graphical interface or use your favorite one. 
+ import numpy as np from matplotlib import pyplot as plt from sklearn.metrics import pairwise_distances from sklearn.neighbors import NearestNeighbors -from metric_learn import LMNN, ITML_Supervised, LSML_Supervised, SDML_Supervised +from metric_learn import (LMNN, ITML_Supervised, LSML_Supervised, + SDML_Supervised) def sandwich_demo(): @@ -23,14 +35,14 @@ def sandwich_demo(): mls = [ LMNN(), - ITML_Supervised(num_constraints=200), - SDML_Supervised(num_constraints=200), - LSML_Supervised(num_constraints=200), + ITML_Supervised(n_constraints=200), + SDML_Supervised(n_constraints=200, balance_param=0.001), + LSML_Supervised(n_constraints=200), ] for ax_num, ml in enumerate(mls, start=3): ml.fit(x, y) - tx = ml.transform() + tx = ml.transform(x) ml_knn = nearest_neighbors(tx, k=2) ax = plt.subplot(3, 2, ax_num) plot_sandwich_data(tx, y, axis=ax) @@ -43,10 +55,10 @@ def sandwich_demo(): # TODO: use this somewhere def visualize_class_separation(X, labels): - _, (ax1,ax2) = plt.subplots(ncols=2) + _, (ax1, ax2) = plt.subplots(ncols=2) label_order = np.argsort(labels) ax1.imshow(pairwise_distances(X[label_order]), interpolation='nearest') - ax2.imshow(pairwise_distances(labels[label_order,None]), + ax2.imshow(pairwise_distances(labels[label_order, None]), interpolation='nearest') @@ -73,19 +85,19 @@ def sandwich_data(): for k, xc in enumerate(x_centers): data[i, k, 0] = np.random.normal(xc, 0.1) data[i, k, 1] = np.random.normal(yc, 0.1) - labels[i,:] = i + labels[i, :] = i return data.reshape((-1, 2)), labels.ravel() def plot_sandwich_data(x, y, axis=plt, colors='rbgmky'): for idx, val in enumerate(np.unique(y)): - xi = x[y==val] + xi = x[y == val] axis.scatter(*xi.T, s=50, facecolors='none', edgecolors=colors[idx]) def plot_neighborhood_graph(x, nn, y, axis=plt, colors='rbgmky'): for i, a in enumerate(x): - b = x[nn[i,1]] + b = x[nn[i, 1]] axis.plot((a[0], b[0]), (a[1], b[1]), colors[y[i]]) diff --git a/metric_learn/__init__.py b/metric_learn/__init__.py index 
cc60049d..92823fb1 100644 --- a/metric_learn/__init__.py +++ b/metric_learn/__init__.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from .constraints import Constraints from .covariance import Covariance from .itml import ITML, ITML_Supervised @@ -9,3 +7,14 @@ from .nca import NCA from .lfda import LFDA from .rca import RCA, RCA_Supervised +from .mlkr import MLKR +from .mmc import MMC, MMC_Supervised +from .scml import SCML, SCML_Supervised + +from ._version import __version__ + +__all__ = ['Constraints', 'Covariance', 'ITML', 'ITML_Supervised', + 'LMNN', 'LSML', 'LSML_Supervised', 'SDML', + 'SDML_Supervised', 'NCA', 'LFDA', 'RCA', 'RCA_Supervised', + 'MLKR', 'MMC', 'MMC_Supervised', 'SCML', + 'SCML_Supervised', '__version__'] diff --git a/metric_learn/_util.py b/metric_learn/_util.py new file mode 100644 index 00000000..868ececa --- /dev/null +++ b/metric_learn/_util.py @@ -0,0 +1,787 @@ +import numpy as np +from numpy.linalg import LinAlgError +from sklearn.datasets import make_spd_matrix +from sklearn.decomposition import PCA +from sklearn.utils import check_array +from sklearn.utils.validation import check_X_y, check_random_state +from .exceptions import PreprocessorError, NonPSDError +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from scipy.linalg import pinvh, eigh +import sys +import time +import warnings + +# hack around lack of axis kwarg in older numpy versions +try: + np.linalg.norm([[4]], axis=1) +except TypeError: + def vector_norm(X): + return np.apply_along_axis(np.linalg.norm, 1, X) +else: + def vector_norm(X): + return np.linalg.norm(X, axis=1) + + +def check_input(input_data, y=None, preprocessor=None, + type_of_inputs='classic', tuple_size=None, accept_sparse=False, + dtype='numeric', order=None, + copy=False, force_all_finite=True, + multi_output=False, ensure_min_samples=1, + ensure_min_features=1, y_numeric=False, estimator=None): + """Checks that the input format is valid, and converts it if specified + 
(this is the equivalent of scikit-learn's `check_array` or `check_X_y`). + All arguments following tuple_size are scikit-learn's `check_X_y` + arguments that will be enforced on the data and labels array. If + indicators are given as an input data array, the returned data array + will be the formed points/tuples, using the given preprocessor. + + Parameters + ---------- + input : array-like + The input data array to check. + + y : array-like + The input labels array to check. + + preprocessor : callable (default=`None`) + The preprocessor to use. If None, no preprocessor is used. + + type_of_inputs : `str` {'classic', 'tuples'} + The type of inputs to check. If 'classic', the input should be + a 2D array-like of points or a 1D array like of indicators of points. If + 'tuples', the input should be a 3D array-like of tuples or a 2D + array-like of indicators of tuples. + + accept_sparse : `bool` + Set to true to allow sparse inputs (only works for sparse inputs with + dim < 3). + + tuple_size : int + The number of elements in a tuple (e.g. 2 for pairs). + + dtype : string, type, list of types or None (default='numeric') + Data type of result. If None, the dtype of the input is preserved. + If 'numeric', dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=`None`) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in X. This parameter + does not influence whether y can have np.inf or np.nan values. + The possibilities are: + - True: Force all values of X to be finite. + - False: accept both np.inf and np.nan in X. 
+ - 'allow-nan': accept only np.nan values in X. Values cannot be + infinite. + + ensure_min_samples : int (default=1) + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when X has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. + + estimator : str or estimator instance (default=`None`) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + X : `numpy.ndarray` + The checked input data array. + + y: `numpy.ndarray` (optional) + The checked input labels array. + """ + + context = make_context(estimator) + + args_for_sk_checks = dict(accept_sparse=accept_sparse, + dtype=dtype, order=order, + copy=copy, force_all_finite=force_all_finite, + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, + estimator=estimator) + + # We need to convert input_data into a numpy.ndarray if possible, before + # any further checks or conversions, and deal with y if needed. Therefore + # we use check_array/check_X_y with fixed permissive arguments. 
+ if y is None: + input_data = check_array(input_data, ensure_2d=False, allow_nd=True, + copy=False, force_all_finite=False, + accept_sparse=True, dtype=None, + ensure_min_features=0, ensure_min_samples=0) + else: + input_data, y = check_X_y(input_data, y, ensure_2d=False, allow_nd=True, + copy=False, force_all_finite=False, + accept_sparse=True, dtype=None, + ensure_min_features=0, ensure_min_samples=0, + multi_output=multi_output, + y_numeric=y_numeric) + + if type_of_inputs == 'classic': + input_data = check_input_classic(input_data, context, preprocessor, + args_for_sk_checks) + + elif type_of_inputs == 'tuples': + input_data = check_input_tuples(input_data, context, preprocessor, + args_for_sk_checks, tuple_size) + + # if we have y and the input data are pairs, we need to ensure + # the labels are in [-1, 1]: + if y is not None and input_data.shape[1] == 2: + check_y_valid_values_for_pairs(y) + + else: + raise ValueError("Unknown value {} for type_of_inputs. Valid values are " + "'classic' or 'tuples'.".format(type_of_inputs)) + + return input_data if y is None else (input_data, y) + + +def check_input_tuples(input_data, context, preprocessor, args_for_sk_checks, + tuple_size): + preprocessor_has_been_applied = False + if input_data.ndim == 2: + if preprocessor is not None: + input_data = preprocess_tuples(input_data, preprocessor) + preprocessor_has_been_applied = True + else: + make_error_input(201, input_data, context) + elif input_data.ndim == 3: + pass + else: + if preprocessor is not None: + make_error_input(420, input_data, context) + else: + make_error_input(200, input_data, context) + input_data = check_array(input_data, allow_nd=True, ensure_2d=False, + **args_for_sk_checks) + # we need to check num_features because check_array does not check it + # for 3D inputs: + if args_for_sk_checks['ensure_min_features'] > 0: + n_features = input_data.shape[2] + if n_features < args_for_sk_checks['ensure_min_features']: + raise ValueError("Found array with {} 
feature(s) (shape={}) while" + " a minimum of {} is required{}." + .format(n_features, input_data.shape, + args_for_sk_checks['ensure_min_features'], + context)) + # normally we don't need to check_tuple_size too because tuple_size + # shouldn't be able to be modified by any preprocessor + if input_data.ndim != 3: + # we have to ensure this because check_array above does not + if preprocessor_has_been_applied: + make_error_input(211, input_data, context) + else: + make_error_input(201, input_data, context) + check_tuple_size(input_data, tuple_size, context) + return input_data + + +def check_input_classic(input_data, context, preprocessor, args_for_sk_checks): + preprocessor_has_been_applied = False + if input_data.ndim == 1: + if preprocessor is not None: + input_data = preprocess_points(input_data, preprocessor) + preprocessor_has_been_applied = True + else: + make_error_input(101, input_data, context) + elif input_data.ndim == 2: + pass # OK + else: + if preprocessor is not None: + make_error_input(320, input_data, context) + else: + make_error_input(100, input_data, context) + + input_data = check_array(input_data, allow_nd=True, ensure_2d=False, + **args_for_sk_checks) + if input_data.ndim != 2: + # we have to ensure this because check_array above does not + if preprocessor_has_been_applied: + make_error_input(111, input_data, context) + else: + make_error_input(101, input_data, context) + return input_data + + +def make_error_input(code, input_data, context): + code_str = {'expected_input': {'1': '2D array of formed points', + '2': '3D array of formed tuples', + '3': ('1D array of indicators or 2D array of ' + 'formed points'), + '4': ('2D array of indicators or 3D array ' + 'of formed tuples')}, + 'additional_context': {'0': '', + '2': ' when using a preprocessor', + '1': (' after the preprocessor has been ' + 'applied')}, + 'possible_preprocessor': {'0': '', + '1': ' and/or use a preprocessor' + }} + code_list = str(code) + err_args = 
dict(expected_input=code_str['expected_input'][code_list[0]], + additional_context=code_str['additional_context'] + [code_list[1]], + possible_preprocessor=code_str['possible_preprocessor'] + [code_list[2]], + input_data=input_data, context=context, + found_size=input_data.ndim) + err_msg = ('{expected_input} expected' + '{context}{additional_context}. Found {found_size}D array ' + 'instead:\ninput={input_data}. Reshape your data' + '{possible_preprocessor}.\n') + raise ValueError(err_msg.format(**err_args)) + + +def preprocess_tuples(tuples, preprocessor): + try: + tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for + i in range(tuples.shape[1])]) + except Exception as e: + raise PreprocessorError(e) + return tuples + + +def preprocess_points(points, preprocessor): + """form points if there is a preprocessor else keep them as such (assumes + that check_points has already been called)""" + try: + points = preprocessor(points) + except Exception as e: + raise PreprocessorError(e) + return points + + +def make_context(estimator): + """Helper function to create a string with the estimator name. + Taken from check_array function in scikit-learn. + Will return the following for instance: + NCA: ' by NCA' + 'NCA': ' by NCA' + None: '' + """ + estimator_name = make_name(estimator) + context = (' by ' + estimator_name) if estimator_name is not None else '' + return context + + +def make_name(estimator): + """Helper function that returns the name of estimator or the given string + if a string is given + """ + if estimator is not None: + if isinstance(estimator, str): + estimator_name = estimator + else: + estimator_name = estimator.__class__.__name__ + else: + estimator_name = None + return estimator_name + + +def check_tuple_size(tuples, tuple_size, context): + """Helper function to check that the number of points in each tuple is + equal to tuple_size (e.g. 
2 for pairs), and raise a `ValueError` otherwise""" + if tuple_size is not None and tuples.shape[1] != tuple_size: + msg_t = (("Tuples of {} element(s) expected{}. Got tuples of {} " + "element(s) instead (shape={}):\ninput={}.\n") + .format(tuple_size, context, tuples.shape[1], tuples.shape, + tuples)) + raise ValueError(msg_t) + + +def check_y_valid_values_for_pairs(y): + """Checks that y values are in [-1, 1]""" + if not np.array_equal(np.abs(y), np.ones_like(y)): + raise ValueError("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. Found an incorrect value.") + + +class ArrayIndexer: + + def __init__(self, X): + # we check the array-like preprocessor here, and we as much permissive + # as possible (because the user will check for the desired + # format with arguments in check_input, and only this latter function + # should return the appropriate errors). We do this only to have a numpy + # array object which can be indexed by another numpy array object. + X = check_array(X, + accept_sparse=True, dtype=None, + force_all_finite=False, + ensure_2d=False, allow_nd=True, + ensure_min_samples=0, ensure_min_features=0, + estimator=None) + self.X = X + + def __call__(self, indices): + return self.X[indices] + + +def check_collapsed_pairs(pairs): + num_ident = (vector_norm(pairs[:, 0] - pairs[:, 1]) < 1e-9).sum() + if num_ident: + raise ValueError("{} collapsed pairs found (where the left element is " + "the same as the right element), out of {} pairs " + "in total.".format(num_ident, pairs.shape[0])) + + +def _check_sdp_from_eigen(w, tol=None): + """Checks if some of the eigenvalues given are negative, up to a tolerance + level, with a default value of the tolerance depending on the eigenvalues. + It also returns whether the matrix is positive definite, up to the above + tolerance. + + Parameters + ---------- + w : array-like, shape=(n_eigenvalues,) + Eigenvalues to check for non semidefinite positiveness. 
+ + tol : positive `float`, optional + Absolute eigenvalues below tol are considered zero. If + tol is None, and eps is the epsilon value for datatype of w, then tol + is set to abs(w).max() * len(w) * eps. + + Returns + ------- + is_definite : bool + Whether the matrix is positive definite or not. + + See Also + -------- + np.linalg.matrix_rank for more details on the choice of tolerance (the same + strategy is applied here) + """ + if tol is None: + tol = np.abs(w).max() * len(w) * np.finfo(w.dtype).eps + if tol < 0: + raise ValueError("tol should be positive.") + if any(w < - tol): + raise NonPSDError() + if any(abs(w) < tol): + return False + return True + + +def components_from_metric(metric, tol=None): + """Returns the transformation matrix from the Mahalanobis matrix. + + Returns the transformation matrix from the Mahalanobis matrix, i.e. the + matrix L such that metric=L.T.dot(L). + + Parameters + ---------- + metric : symmetric `np.ndarray`, shape=(d x d) + The input metric, from which we want to extract a transformation matrix. + + tol : positive `float`, optional + Eigenvalues of `metric` between 0 and - tol are considered zero. If tol is + None, and w_max is `metric`'s largest eigenvalue, and eps is the epsilon + value for datatype of w, then tol is set to w_max * metric.shape[0] * eps. + + Returns + ------- + L : np.ndarray, shape=(d x d) + The transformation matrix, such that L.T.dot(L) == metric. 
+ """ + if not np.allclose(metric, metric.T): + raise ValueError("The input metric should be symmetric.") + # If M is diagonal, we will just return the elementwise square root: + if np.array_equal(metric, np.diag(np.diag(metric))): + _check_sdp_from_eigen(np.diag(metric), tol) + return np.diag(np.sqrt(np.maximum(0, np.diag(metric)))) + else: + try: + # if `M` is positive semi-definite, it will admit a Cholesky + # decomposition: L = cholesky(M).T + return np.linalg.cholesky(metric).T + except LinAlgError: + # However, currently np.linalg.cholesky does not support indefinite + # matrices. So if the latter does not work we will return L = V.T w^( + # -1/2), with M = V*w*V.T being the eigenvector decomposition of M with + # the eigenvalues in the diagonal matrix w and the columns of V being the + # eigenvectors. + w, V = np.linalg.eigh(metric) + _check_sdp_from_eigen(w, tol) + return V.T * np.sqrt(np.maximum(0, w[:, None])) + + +def validate_vector(u, dtype=None): + # replica of scipy.spatial.distance._validate_vector, for making scipy + # compatible functions on vectors (such as distances computations) + u = np.asarray(u, dtype=dtype, order='c').squeeze() + # Ensure values such as u=1 and u=[1] still return 1-D arrays. + u = np.atleast_1d(u) + if u.ndim > 1: + raise ValueError("Input vector should be 1-D.") + return u + + +def _initialize_components(n_components, input, y=None, init='auto', + verbose=False, random_state=None, + has_classes=True): + """Returns the initial transformation to be used depending on the arguments. + + Parameters + ---------- + n_components : int + The number of components to take. (Note: it should have been checked + before, meaning it should not be None and it should be a value in + [1, X.shape[1]]) + + input : array-like + The input samples (can be tuples or regular samples). + + y : array-like or None + The input labels (or not if there are no labels). 
+ + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda' (see + the description of 'lda' init), as it uses labels information. If + not, but ``n_components < min(n_features, n_samples)``, we use 'pca', + as it projects data onto meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`). + This initialization is possible only if `has_classes == True`. + + 'identity' + The identity matrix. If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + verbose : bool + Whether to print the details of the initialization or not. 
+ + random_state : int or `numpy.RandomState` or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + has_classes : bool (default=True) + Whether the labels are in fact classes. If true, this will allow to use + the 'lda' initialization. + + Returns + ------- + init_components : `numpy.ndarray` + The initial transformation to use. + """ + # if we are doing a regression we cannot use lda: + n_features = input.shape[-1] + authorized_inits = ['auto', 'pca', 'identity', 'random'] + if has_classes: + authorized_inits.append('lda') + + if isinstance(init, np.ndarray): + # we copy the array, so that if we update the metric, we don't want to + # update the init + init = check_array(init, copy=True) + + # Assert that init.shape[1] = X.shape[1] + if init.shape[1] != n_features: + raise ValueError('The input dimensionality ({}) of the given ' + 'linear transformation `init` must match the ' + 'dimensionality of the given inputs `X` ({}).' + .format(init.shape[1], n_features)) + + # Assert that init.shape[0] <= init.shape[1] + if init.shape[0] > init.shape[1]: + raise ValueError('The output dimensionality ({}) of the given ' + 'linear transformation `init` cannot be ' + 'greater than its input dimensionality ({}).' + .format(init.shape[0], init.shape[1])) + + # Assert that self.n_components = init.shape[0] + if n_components != init.shape[0]: + raise ValueError('The preferred dimensionality of the ' + 'projected space `n_components` ({}) does' + ' not match the output dimensionality of ' + 'the given linear transformation ' + '`init` ({})!' + .format(n_components, + init.shape[0])) + elif init not in authorized_inits: + raise ValueError( + "`init` must be '{}' " + "or a numpy array of shape (n_components, n_features)." 
+ .format("', '".join(authorized_inits))) + + random_state = check_random_state(random_state) + if isinstance(init, np.ndarray): + return init + n_samples = input.shape[0] + if init == 'auto': + if has_classes: + n_classes = len(np.unique(y)) + else: + n_classes = -1 + init = _auto_select_init(has_classes, n_features, n_samples, n_components, + n_classes) + if init == 'identity': + return np.eye(n_components, input.shape[-1]) + elif init == 'random': + return random_state.randn(n_components, input.shape[-1]) + elif init in {'pca', 'lda'}: + init_time = time.time() + if init == 'pca': + pca = PCA(n_components=n_components, + random_state=random_state) + if verbose: + print('Finding principal components... ') + sys.stdout.flush() + pca.fit(input) + transformation = pca.components_ + elif init == 'lda': + lda = LinearDiscriminantAnalysis(n_components=n_components) + if verbose: + print('Finding most discriminative components... ') + sys.stdout.flush() + lda.fit(input, y) + transformation = lda.scalings_.T[:n_components] + if verbose: + print('done in {:5.2f}s'.format(time.time() - init_time)) + return transformation + + +def _auto_select_init(has_classes, n_features, n_samples, n_components, + n_classes): + if has_classes and n_components <= min(n_features, n_classes - 1): + init = 'lda' + elif n_components < min(n_features, n_samples): + init = 'pca' + else: + init = 'identity' + return init + + +def _initialize_metric_mahalanobis(input, init='identity', random_state=None, + return_inverse=False, strict_pd=False, + matrix_name='matrix'): + """Returns a PSD matrix that can be used as a prior or an initialization + for the Mahalanobis distance + + Parameters + ---------- + input : array-like + The input samples (can be tuples or regular samples). + + init : string or numpy array, optional (default='identity') + Specification for the matrix to initialize. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). 
+ + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The (pseudo-)inverse covariance matrix (raises an error if the + covariance matrix is not definite and `strict_pd == True`) + + 'random' + A random positive definite (PD) matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A PSD matrix (or strictly PD if strict_pd==True) of + shape (n_features, n_features), that will be used as such to + initialize the metric, or set the prior. + + random_state : int or `numpy.RandomState` or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random Mahalanobis + matrix. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the matrix. + + return_inverse : bool, optional (default=False) + Whether to return the inverse of the specified matrix. This + can be sometimes useful. It will return the pseudo-inverse (which is the + same as the inverse if the matrix is definite (i.e. invertible)). If + `strict_pd == True` and the matrix is not definite, it will return an + error. + + strict_pd : bool, optional (default=False) + Whether to enforce that the provided matrix is definite (in addition to + being PSD). + + param_name : str, optional (default='matrix') + The name of the matrix used (example: 'init', 'prior'). Will be used in + error messages. + + Returns + ------- + M, or (M, M_inv) : `numpy.ndarray` + The initial matrix to use M, and its inverse if `return_inverse=True`. 
+ """ + n_features = input.shape[-1] + if isinstance(init, np.ndarray): + # we copy the array, so that if we update the metric, we don't want to + # update the init + init = check_array(init, copy=True) + + # Assert that init.shape[1] = n_features + if init.shape != (n_features,) * 2: + raise ValueError('The input dimensionality {} of the given ' + 'mahalanobis matrix `{}` must match the ' + 'dimensionality of the given inputs ({}).' + .format(init.shape, matrix_name, n_features)) + + # Assert that the matrix is symmetric + if not np.allclose(init, init.T): + raise ValueError("`{}` is not symmetric.".format(matrix_name)) + + elif init not in ['identity', 'covariance', 'random']: + raise ValueError( + "`{}` must be 'identity', 'covariance', 'random' " + "or a numpy array of shape (n_features, n_features)." + .format(matrix_name)) + + random_state = check_random_state(random_state) + M = init + if isinstance(M, np.ndarray): + w, V = eigh(M, check_finite=False) + init_is_definite = _check_sdp_from_eigen(w) + if strict_pd and not init_is_definite: + raise LinAlgError("You should provide a strictly positive definite " + "matrix as `{}`. This one is not definite. Try another" + " {}, or an algorithm that does not " + "require the {} to be strictly positive definite." 
+ .format(*((matrix_name,) * 3))) + elif return_inverse and not init_is_definite: + warnings.warn('The initialization matrix is not invertible: ' + 'using the pseudo-inverse instead.') + if return_inverse: + M_inv = _pseudo_inverse_from_eig(w, V) + return M, M_inv + else: + return M + elif init == 'identity': + M = np.eye(n_features, n_features) + if return_inverse: + M_inv = M.copy() + return M, M_inv + else: + return M + elif init == 'covariance': + if input.ndim == 3: + # if the input are tuples, we need to form an X by deduplication + X = np.unique(np.vstack(input), axis=0) + else: + X = input + # atleast2d is necessary to deal with scalar covariance matrices + M_inv = np.atleast_2d(np.cov(X, rowvar=False)) + w, V = eigh(M_inv, check_finite=False) + cov_is_definite = _check_sdp_from_eigen(w) + if strict_pd and not cov_is_definite: + raise LinAlgError("Unable to get a true inverse of the covariance " + "matrix since it is not definite. Try another " + "`{}`, or an algorithm that does not " + "require the `{}` to be strictly positive definite." + .format(*((matrix_name,) * 2))) + elif not cov_is_definite: + warnings.warn('The covariance matrix is not invertible: ' + 'using the pseudo-inverse instead.' 
+ 'To make the covariance matrix invertible' + ' you can remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + M = _pseudo_inverse_from_eig(w, V) + if return_inverse: + return M, M_inv + else: + return M + elif init == 'random': + # we need to create a random symmetric matrix + M = make_spd_matrix(n_features, random_state=random_state) + if return_inverse: + # we use pinvh even if we know the matrix is definite, just because + # we need the returned matrix to be symmetric (and sometimes + # np.linalg.inv returns not symmetric inverses of symmetric matrices) + # TODO: there might be a more efficient method to do so + M_inv = pinvh(M) + return M, M_inv + else: + return M + + +def _check_n_components(n_features, n_components): + """Checks that n_components is less than n_features and deal with the None + case""" + if n_components is None: + return n_features + if 0 < n_components <= n_features: + return n_components + raise ValueError('Invalid n_components, must be in [1, %d]' % n_features) + + +def _pseudo_inverse_from_eig(w, V, tol=None): + """Compute the (Moore-Penrose) pseudo-inverse of the EVD of a symetric + matrix. + + Parameters + ---------- + w : (..., M) ndarray + The eigenvalues in ascending order, each repeated according to + its multiplicity. + + v : {(..., M, M) ndarray, (..., M, M) matrix} + The column ``v[:, i]`` is the normalized eigenvector corresponding + to the eigenvalue ``w[i]``. Will return a matrix object if `a` is + a matrix object. + + tol : positive `float`, optional + Absolute eigenvalues below tol are considered zero. + + Returns + ------- + output : (..., M, N) array_like + The pseudo-inverse given by the EVD. 
+ """ + if tol is None: + tol = np.amax(w) * np.max(w.shape) * np.finfo(w.dtype).eps + # discard small eigenvalues and invert the rest + large = np.abs(w) > tol + w = np.divide(1, w, where=large, out=w) + w[~large] = 0 + + return np.dot(V * w, np.conjugate(V).T) diff --git a/metric_learn/_version.py b/metric_learn/_version.py new file mode 100644 index 00000000..a71c5c7f --- /dev/null +++ b/metric_learn/_version.py @@ -0,0 +1 @@ +__version__ = '0.7.0' diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 5fe2ca14..47efe4b7 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,75 +1,926 @@ -from numpy.linalg import inv,cholesky +""" +Base module. +""" +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils.extmath import stable_cumsum +from sklearn.utils.validation import _is_arraylike, check_is_fitted +from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve +import numpy as np +from abc import ABCMeta, abstractmethod +from ._util import ArrayIndexer, check_input, validate_vector +import warnings -class BaseMetricLearner(object): - def __init__(self): - raise NotImplementedError('BaseMetricLearner should not be instantiated') - def metric(self): - """Computes the Mahalanobis matrix from the transformation matrix. +class BaseMetricLearner(BaseEstimator, metaclass=ABCMeta): + """ + Base class for all metric-learners. - .. math:: M = L^{\\top} L + Parameters + ---------- + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + """ + + def __init__(self, preprocessor=None): + self.preprocessor = preprocessor + + @abstractmethod + def score_pairs(self, pairs): + """ + Returns the score between pairs + (can be a similarity, or a distance/metric depending on the algorithm) + + .. deprecated:: 0.7.0 + Refer to `pair_distance` and `pair_score`. + + .. 
warning:: + This method will be removed in 0.8.0. Please refer to `pair_distance` + or `pair_score`. This change will occur in order to add learners + that don't necessarily learn a Mahalanobis distance. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- - M : (d x d) matrix + scores : `numpy.ndarray` of shape=(n_pairs,) + The score of every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference between `score_pairs` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. """ - L = self.transformer() - return L.T.dot(L) - def transformer(self): - """Computes the transformation matrix from the Mahalanobis matrix. + @abstractmethod + def pair_score(self, pairs): + """ + .. versionadded:: 0.7.0 Compute the similarity score between pairs - L = inv(cholesky(M)) + Returns the similarity score between pairs of points (the larger the score, + the more similar the pair). For metric learners that learn a distance, + the score is simply the opposite of the distance between pairs. All + learners have access to this method. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- - L : (d x d) matrix + scores : `numpy.ndarray` of shape=(n_pairs,) + The score of every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. 
The difference with `pair_score` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. """ - return inv(cholesky(self.metric())) - def transform(self, X=None): + @abstractmethod + def pair_distance(self, pairs): + """ + .. versionadded:: 0.7.0 Compute the distance between pairs + + Returns the (pseudo) distance between pairs, when available. For metric + learners that do not learn a (pseudo) distance, an error is thrown + instead. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs for which to compute the distance, with each + row corresponding to two points, for 2D array of indices of pairs + if the metric learner uses a preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The distance between every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `pair_distance` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + """ + + def _check_preprocessor(self): + """Initializes the preprocessor""" + if _is_arraylike(self.preprocessor): + self.preprocessor_ = ArrayIndexer(self.preprocessor) + elif callable(self.preprocessor) or self.preprocessor is None: + self.preprocessor_ = self.preprocessor + else: + raise ValueError("Invalid type for the preprocessor: {}. You should " + "provide either None, an array-like object, " + "or a callable.".format(type(self.preprocessor))) + + def _prepare_inputs(self, X, y=None, type_of_inputs='classic', + **kwargs): + """Initializes the preprocessor and processes inputs. See `check_input` + for more details. 
+ + Parameters + ---------- + X : array-like + The input data array to check. + + y : array-like + The input labels array to check. + + type_of_inputs : `str` {'classic', 'tuples'} + The type of inputs to check. If 'classic', the input should be + a 2D array-like of points or a 1D array like of indicators of points. If + 'tuples', the input should be a 3D array-like of tuples or a 2D + array-like of indicators of tuples. + + **kwargs : dict + Arguments to pass to check_input. + + Returns + ------- + X : `numpy.ndarray` + The checked input data array. + + y : `numpy.ndarray` (optional) + The checked input labels array. + """ + self._check_preprocessor() + + check_is_fitted(self, ['preprocessor_']) + outs = check_input(X, y, + type_of_inputs=type_of_inputs, + preprocessor=self.preprocessor_, + estimator=self, + tuple_size=getattr(self, '_tuple_size', None), + **kwargs) + # Conform to SLEP010 + if not hasattr(self, 'n_features_in_'): + self.n_features_in_ = (outs if y is None else outs[0]).shape[1] + return outs + + @abstractmethod + def get_metric(self): + """Returns a function that takes as input two 1D arrays and outputs + the value of the learned metric on these two points. Depending on the + algorithm, it can return a distance or a similarity function between + pairs. + + This function will be independent from the metric learner that learned it + (it will not be modified if the initial metric learner is modified), + and it can be directly plugged into the `metric` argument of + scikit-learn's estimators. + + Returns + ------- + metric_fun : function + The function described above. + + + Examples + -------- + .. 
doctest:: + + >>> from metric_learn import NCA + >>> from sklearn.datasets import make_classification + >>> from sklearn.neighbors import KNeighborsClassifier + >>> nca = NCA() + >>> X, y = make_classification() + >>> nca.fit(X, y) + >>> knn = KNeighborsClassifier(metric=nca.get_metric()) + >>> knn.fit(X, y) # doctest: +NORMALIZE_WHITESPACE + KNeighborsClassifier(algorithm='auto', leaf_size=30, + metric=.metric_fun + at 0x...>, + metric_params=None, n_jobs=None, n_neighbors=5, p=2, + weights='uniform') + + See Also + -------- + pair_distance : a method that returns the distance between several + pairs of points. Unlike `get_metric`, this is a method of the metric + learner and therefore can change if the metric learner changes. Besides, + it can use the metric learner's preprocessor, and works on concatenated + arrays. + + pair_score : a method that returns the similarity score between + several pairs of points. Unlike `get_metric`, this is a method of the + metric learner and therefore can change if the metric learner changes. + Besides, it can use the metric learner's preprocessor, and works on + concatenated arrays. + """ + + +class MetricTransformer(metaclass=ABCMeta): + """ + Base class for all learners that can transform data into a new space + with the metric learned. + """ + @abstractmethod + def transform(self, X): """Applies the metric transformation. Parameters ---------- - X : (n x d) matrix, optional - Data to transform. If not supplied, the training data will be used. + X : (n x d) matrix + Data to transform. Returns ------- transformed : (n x d) matrix - Input data transformed to the metric space by :math:`XL^{\\top}` + Input data transformed to the metric space by :math:`XL^{\\top}` + """ + + +class MahalanobisMixin(BaseMetricLearner, MetricTransformer, + metaclass=ABCMeta): + r"""Mahalanobis metric learning algorithms. 
+ + Algorithm that learns a Mahalanobis (pseudo) distance :math:`d_M(x, x')`, + defined between two column vectors :math:`x` and :math:`x'` by: :math:`d_M(x, + x') = \sqrt{(x-x')^T M (x-x')}`, where :math:`M` is a learned symmetric + positive semi-definite (PSD) matrix. The metric between points can then be + expressed as the euclidean distance between points embedded in a new space + through a linear transformation. Indeed, the above matrix can be decomposed + into the product of two transpose matrices (through SVD or Cholesky + decomposition): :math:`d_M(x, x')^2 = (x-x')^T M (x-x') = (x-x')^T L^T L + (x-x') = (L x - L x')^T (L x- L x')` + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + """ + + def score_pairs(self, pairs): + r""" + Returns the learned Mahalanobis distance between pairs. + + This distance is defined as: :math:`d_M(x, x') = \\sqrt{(x-x')^T M (x-x')}` + where ``M`` is the learned Mahalanobis matrix, for every pair of points + ``x`` and ``x'``. This corresponds to the euclidean distance between + embeddings of the points in a new space, obtained through a linear + transformation. Indeed, we have also: :math:`d_M(x, x') = \\sqrt{(x_e - + x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See + :class:`MahalanobisMixin`). + + .. deprecated:: 0.7.0 + Please use `pair_distance` instead. + + .. warning:: + This method will be removed in 0.8.0. Please refer to `pair_distance` + or `pair_score`. This change will occur in order to add learners + that don't necessarily learn a Mahalanobis distance. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The learned Mahalanobis distance for every pair. 
+ + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `score_pairs` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + + :ref:`mahalanobis_distances` : The section of the project documentation + that describes Mahalanobis Distances. + """ + dpr_msg = ("score_pairs will be deprecated in release 0.7.0. " + "Use pair_score to compute similarity scores, or " + "pair_distances to compute distances.") + warnings.warn(dpr_msg, category=FutureWarning) + return self.pair_distance(pairs) + + def pair_score(self, pairs): + """ + Returns the opposite of the learned Mahalanobis distance between pairs. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The opposite of the learned Mahalanobis distance for every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `pair_score` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + + :ref:`mahalanobis_distances` : The section of the project documentation + that describes Mahalanobis Distances. + """ + return -1 * self.pair_distance(pairs) + + def pair_distance(self, pairs): + """ + Returns the learned Mahalanobis distance between pairs. + + This distance is defined as: :math:`d_M(x, x') = \\sqrt{(x-x')^T M (x-x')}` + where ``M`` is the learned Mahalanobis matrix, for every pair of points + ``x`` and ``x'``. 
This corresponds to the euclidean distance between + embeddings of the points in a new space, obtained through a linear + transformation. Indeed, we have also: :math:`d_M(x, x') = \\sqrt{(x_e - + x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See + :class:`MahalanobisMixin`). + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The learned Mahalanobis distance for every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `pair_distance` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + + :ref:`mahalanobis_distances` : The section of the project documentation + that describes Mahalanobis Distances. + """ + check_is_fitted(self, ['preprocessor_']) + pairs = check_input(pairs, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=2) + pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :]) + # (for MahalanobisMixin, the embedding is linear so we can just embed the + # difference) + return np.sqrt(np.sum(pairwise_diffs**2, axis=-1)) + + def transform(self, X): + """Embeds data points in the learned linear embedding space. + + Transforms samples in ``X`` into ``X_embedded``, samples inside a new + embedding space such that: ``X_embedded = X.dot(L.T)``, where ``L`` is + the learned linear transformation (See :class:`MahalanobisMixin`). + + Parameters + ---------- + X : `numpy.ndarray`, shape=(n_samples, n_features) + The data points to embed. 
+ + Returns + ------- + X_embedded : `numpy.ndarray`, shape=(n_samples, n_components) + The embedded data points. + """ + check_is_fitted(self, ['preprocessor_', 'components_']) + X_checked = check_input(X, type_of_inputs='classic', estimator=self, + preprocessor=self.preprocessor_, + accept_sparse=True) + return X_checked.dot(self.components_.T) + + def get_metric(self): + check_is_fitted(self, 'components_') + components_T = self.components_.T.copy() + + def metric_fun(u, v, squared=False): + """This function computes the metric between u and v, according to the + previously learned metric. + + Parameters + ---------- + u : array-like, shape=(n_features,) + The first point involved in the distance computation. + + v : array-like, shape=(n_features,) + The second point involved in the distance computation. + + squared : `bool` + If True, the function will return the squared metric between u and + v, which is faster to compute. + + Returns + ------- + distance : float + The distance between u and v according to the new metric. + """ + u = validate_vector(u) + v = validate_vector(v) + transformed_diff = (u - v).dot(components_T) + dist = np.dot(transformed_diff, transformed_diff.T) + if not squared: + dist = np.sqrt(dist) + return dist + + return metric_fun + + get_metric.__doc__ = BaseMetricLearner.get_metric.__doc__ + + def get_mahalanobis_matrix(self): + """Returns a copy of the Mahalanobis matrix learned by the metric learner. + + Returns + ------- + M : `numpy.ndarray`, shape=(n_features, n_features) + The copy of the learned Mahalanobis matrix. + """ + check_is_fitted(self, 'components_') + return self.components_.T.dot(self.components_) + + +class _PairsClassifierMixin(BaseMetricLearner, ClassifierMixin): + """Base class for pairs learners. + + Attributes + ---------- + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. 
+ """ + + classes_ = np.array([0, 1]) + _tuple_size = 2 # number of points in a tuple, 2 for pairs + + def predict(self, pairs): + """Predicts the learned metric between input pairs. (For now it just + calls decision function). + + Returns the learned metric value between samples in every pair. It should + ideally be low for similar samples and high for dissimilar samples. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to predict, with each row corresponding to two + points, or 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) + The predicted learned metric value between samples in every pair. """ - if X is None: - X = self.X - L = self.transformer() - return X.dot(L.T) + check_is_fitted(self, 'preprocessor_') - def get_params(self, deep=False): - """Get parameters for this metric learner. + if "threshold_" not in vars(self): + msg = ("A threshold for this estimator has not been set, " + "call its set_threshold or calibrate_threshold method.") + raise AttributeError(msg) + return 2 * (- self.decision_function(pairs) <= self.threshold_) - 1 + + def decision_function(self, pairs): + """Returns the decision function used to classify the pairs. + + Returns the opposite of the learned metric value between samples in every + pair, to be consistent with scikit-learn conventions. Hence it should + ideally be low for dissimilar samples and high for similar samples. + This is the decision function that is used to classify pairs as similar + (+1), or dissimilar (-1). Parameters ---------- - deep: boolean, optional - @WARNING doesn't do anything, only exists because - scikit-learn has this on BaseEstimator. 
+ pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to predict, with each row corresponding to two + points, or 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- - params : mapping of string to any - Parameter names mapped to their values. + y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) + The predicted decision function value for each pair. """ - return self.params + check_is_fitted(self, 'preprocessor_') + pairs = check_input(pairs, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=self._tuple_size) + return self.pair_score(pairs) - def set_params(self, **kwarg): - """Set the parameters of this metric learner. + def score(self, pairs, y): + """Computes score of pairs similarity prediction. - Overwrites any default parameters or parameters specified in constructor. + Returns the ``roc_auc`` score of the fitted metric learner. It is + computed in the following way: for every value of a threshold + ``t`` we classify all pairs of samples where the predicted distance is + inferior to ``t`` as belonging to the "similar" class, and the other as + belonging to the "dissimilar" class, and we count false positive and + true positives as in a classical ``roc_auc`` curve. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs, with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. + + y : array-like, shape=(n_constraints,) + The corresponding labels. + + Returns + ------- + score : float + The ``roc_auc`` score. + """ + return roc_auc_score(y, self.decision_function(pairs)) + + def set_threshold(self, threshold): + """Sets the threshold of the metric learner to the given value `threshold`. + + See more in the :ref:`User Guide `. + + Parameters + ---------- + threshold : float + The threshold value we want to set. 
It is the value to which the + predicted distance for test pairs will be compared. If they are superior + to the threshold they will be classified as similar (+1), + and dissimilar (-1) if not. Returns ------- - self + self : `_PairsClassifier` + The pairs classifier with the new threshold set. """ - self.params.update(kwarg) + check_is_fitted(self, 'preprocessor_') + try: + self.threshold_ = float(threshold) + except TypeError: + raise ValueError('Parameter threshold must be a real number. ' + 'Got {} instead.'.format(type(threshold))) + except ValueError: + raise ValueError('Parameter threshold must be a real number. ' + 'Got {} instead.'.format(type(threshold))) return self + + def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', + min_rate=None, beta=1.): + """Decision threshold calibration for pairwise binary classification + + Method that calibrates the decision threshold (cutoff point) of the metric + learner. This threshold will then be used when calling the method + `predict`. The methods for picking cutoff points make use of traditional + binary classification evaluation statistics such as the true positive and + true negative rates and F-scores. The threshold will be found to maximize + the chosen score on the validation set ``(pairs_valid, y_valid)``. + + See more in the :ref:`User Guide `. + + Parameters + ---------- + strategy : str, optional (default='accuracy') + The strategy to use for choosing the cutoff threshold. + + 'accuracy' + Selects a decision threshold that maximizes the accuracy. + 'f_beta' + Selects a decision threshold that maximizes the f_beta score, + with beta given by the parameter `beta`. + 'max_tpr' + Selects a decision threshold that yields the highest true positive + rate with true negative rate at least equal to the value of the + parameter `min_rate`. 
+ 'max_tnr' + Selects a decision threshold that yields the highest true negative + rate with true positive rate at least equal to the value of the + parameter `min_rate`. + + beta : float in [0, 1], optional (default=None) + Beta value to be used in case strategy == 'f_beta'. + + min_rate : float in [0, 1] or None, (default=None) + In case strategy is 'max_tpr' or 'max_tnr' this parameter must be set + to specify the minimal value for the true negative rate or true positive + rate respectively that needs to be achieved. + + pairs_valid : array-like, shape=(n_pairs_valid, 2, n_features) + The validation set of pairs to use to set the threshold. + + y_valid : array-like, shape=(n_pairs_valid,) + The labels of the pairs of the validation set to use to set the + threshold. They must be +1 for positive pairs and -1 for negative pairs. + + References + ---------- + .. [1] Receiver-operating characteristic (ROC) plots: a fundamental + evaluation tool in clinical medicine, MH Zweig, G Campbell - + Clinical chemistry, 1993 + + .. 
[2] Most of the code of this function is from scikit-learn's PR #10117 + + See Also + -------- + sklearn.calibration : scikit-learn's module for calibrating classifiers + """ + check_is_fitted(self, 'preprocessor_') + + self._validate_calibration_params(strategy, min_rate, beta) + + pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid, + type_of_inputs='tuples') + + n_samples = pairs_valid.shape[0] + if strategy == 'accuracy': + scores = self.decision_function(pairs_valid) + scores_sorted_idces = np.argsort(scores)[::-1] + scores_sorted = scores[scores_sorted_idces] + # true labels ordered by decision_function value: (higher first) + y_ordered = y_valid[scores_sorted_idces] + # we need to add a threshold that will reject all points + scores_sorted = np.concatenate([[scores_sorted[0] + 1], scores_sorted]) + + # finds the threshold that maximizes the accuracy: + cum_tp = stable_cumsum(y_ordered == 1) # cumulative number of true + # positives + # we need to add the point where all samples are rejected: + cum_tp = np.concatenate([[0.], cum_tp]) + cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1) + cum_tn = np.concatenate([[0.], cum_tn_inverted])[::-1] + cum_accuracy = (cum_tp + cum_tn) / n_samples + imax = np.argmax(cum_accuracy) + # we set the threshold to the lowest accepted score + # note: we are working with negative distances but we want the threshold + # to be with respect to the actual distances so we take minus sign + self.threshold_ = - scores_sorted[imax] + # note: if the best is to reject all points it's already one of the + # thresholds (scores_sorted[0]) + return self + + if strategy == 'f_beta': + precision, recall, thresholds = precision_recall_curve( + y_valid, self.decision_function(pairs_valid), pos_label=1) + + # here the thresholds are decreasing + # We ignore the warnings here, in the same taste as + # https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065 + # 
f4332fd74eee57fcf73/sklearn/metrics/classification.py#L1284 + with np.errstate(divide='ignore', invalid='ignore'): + f_beta = ((1 + beta**2) * (precision * recall) / + (beta**2 * precision + recall)) + # We need to set nans to zero otherwise they will be considered higher + # than the others (also discussed in https://github.com/scikit-learn/ + # scikit-learn/pull/10117/files#r262115773) + f_beta[np.isnan(f_beta)] = 0. + imax = np.argmax(f_beta) + # we set the threshold to the lowest accepted score + # note: we are working with negative distances but we want the threshold + # to be with respect to the actual distances so we take minus sign + self.threshold_ = - thresholds[imax] + # Note: we don't need to deal with rejecting all points (i.e. threshold = + # max_scores + 1), since this can never happen to be optimal + # (see a more detailed discussion in test_calibrate_threshold_extreme) + return self + + fpr, tpr, thresholds = roc_curve(y_valid, + self.decision_function(pairs_valid), + pos_label=1) + # here the thresholds are decreasing + fpr, tpr, thresholds = fpr, tpr, thresholds + + if strategy in ['max_tpr', 'max_tnr']: + if strategy == 'max_tpr': + indices = np.where(1 - fpr >= min_rate)[0] + imax = np.argmax(tpr[indices]) + + if strategy == 'max_tnr': + indices = np.where(tpr >= min_rate)[0] + imax = np.argmax(1 - fpr[indices]) + + imax_valid = indices[imax] + # note: we are working with negative distances but we want the threshold + # to be with respect to the actual distances so we take minus sign + if indices[imax] == len(thresholds): # we want to accept everything + self.threshold_ = - (thresholds[imax_valid] - 1) + else: + # thanks to roc_curve, the first point will always be max_scores + # + 1, see: https://github.com/scikit-learn/scikit-learn/pull/13523 + self.threshold_ = - thresholds[imax_valid] + return self + + @staticmethod + def _validate_calibration_params(strategy='accuracy', min_rate=None, + beta=1.): + """Ensure that calibration parameters 
have allowed values""" + if strategy not in ('accuracy', 'f_beta', 'max_tpr', + 'max_tnr'): + raise ValueError('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "{}" instead.' + .format(strategy)) + if strategy == 'max_tpr' or strategy == 'max_tnr': + if (min_rate is None or not isinstance(min_rate, (int, float)) or + not min_rate >= 0 or not min_rate <= 1): + raise ValueError('Parameter min_rate must be a number in' + '[0, 1]. ' + 'Got {} instead.'.format(min_rate)) + if strategy == 'f_beta': + if beta is None or not isinstance(beta, (int, float)): + raise ValueError('Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + + +class _TripletsClassifierMixin(BaseMetricLearner, ClassifierMixin): + """ + Base class for triplets learners. + """ + + classes_ = np.array([0, 1]) + _tuple_size = 3 # number of points in a tuple, 3 for triplets + + def predict(self, triplets): + """Predicts the ordering between sample distances in input triplets. + + For each triplets, returns 1 if the first element is closer to the second + than to the last and -1 if not. + + Parameters + ---------- + triplets : array-like, shape=(n_triplets, 3, n_features) or (n_triplets, 3) + 3D array of triplets to predict, with each row corresponding to three + points, or 2D array of indices of triplets if the metric learner + uses a preprocessor. + + Returns + ------- + prediction : `numpy.ndarray` of floats, shape=(n_constraints,) + Predictions of the ordering of pairs, for each triplet. + """ + return 2 * (self.decision_function(triplets) > 0) - 1 + + def decision_function(self, triplets): + """Predicts differences between sample distances in input triplets. + + For each triplet (X_a, X_b, X_c) in the samples, computes the difference + between the learned distance of the second pair (X_a, X_c) minus the + learned distance of the first pair (X_a, X_b). 
The higher it is, the more + probable it is that the pairs in the triplets are presented in the right + order, i.e. that the label of the triplet is 1. The lower it is, the more + probable it is that the label of the triplet is -1. + + Parameters + ---------- + triplet : array-like, shape=(n_triplets, 3, n_features) or \ + (n_triplets, 3) + 3D array of triplets to predict, with each row corresponding to three + points, or 2D array of indices of triplets if the metric learner + uses a preprocessor. + + Returns + ------- + decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) + Metric differences. + """ + check_is_fitted(self, 'preprocessor_') + triplets = check_input(triplets, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=self._tuple_size) + return (self.pair_score(triplets[:, :2]) - + self.pair_score(triplets[:, [0, 2]])) + + def score(self, triplets): + """Computes score on input triplets. + + Returns the accuracy score of the following classification task: a triplet + (X_a, X_b, X_c) is correctly classified if the predicted similarity between + the first pair (X_a, X_b) is higher than that of the second pair (X_a, X_c) + + Parameters + ---------- + triplets : array-like, shape=(n_triplets, 3, n_features) or \ + (n_triplets, 3) + 3D array of triplets to score, with each row corresponding to three + points, or 2D array of indices of triplets if the metric learner + uses a preprocessor. + + Returns + ------- + score : float + The triplets score. + """ + # Since the prediction is a vector of values in {-1, +1}, we need to + # rescale them to {0, 1} to compute the accuracy using the mean (because + # then 1 means a correctly classified result (pairs are in the right + # order), and a 0 an incorrectly classified result (pairs are in the + # wrong order). 
+ return self.predict(triplets).mean() / 2 + 0.5 + + +class _QuadrupletsClassifierMixin(BaseMetricLearner, ClassifierMixin): + """ + Base class for quadruplets learners. + """ + + classes_ = np.array([0, 1]) + _tuple_size = 4 # number of points in a tuple, 4 for quadruplets + + def predict(self, quadruplets): + """Predicts the ordering between sample distances in input quadruplets. + + For each quadruplet, returns 1 if the quadruplet is in the right order ( + first pair is more similar than second pair), and -1 if not. + + Parameters + ---------- + quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \ + (n_quadruplets, 4) + 3D Array of quadruplets to predict, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. + + Returns + ------- + prediction : `numpy.ndarray` of floats, shape=(n_constraints,) + Predictions of the ordering of pairs, for each quadruplet. + """ + return np.sign(self.decision_function(quadruplets)) + + def decision_function(self, quadruplets): + """Predicts differences between sample distances in input quadruplets. + + For each quadruplet in the samples, computes the difference between the + learned metric of the second pair minus the learned metric of the first + pair. The higher it is, the more probable it is that the pairs in the + quadruplet are presented in the right order, i.e. that the label of the + quadruplet is 1. The lower it is, the more probable it is that the label of + the quadruplet is -1. + + Parameters + ---------- + quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \ + (n_quadruplets, 4) + 3D Array of quadruplets to predict, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. + + Returns + ------- + decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) + Metric differences. 
+ """ + check_is_fitted(self, 'preprocessor_') + quadruplets = check_input(quadruplets, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=self._tuple_size) + return (self.pair_score(quadruplets[:, :2]) - + self.pair_score(quadruplets[:, 2:])) + + def score(self, quadruplets): + """Computes score on input quadruplets + + Returns the accuracy score of the following classification task: a record + is correctly classified if the predicted similarity between the first two + samples is higher than that of the last two. + + Parameters + ---------- + quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \ + (n_quadruplets, 4) + 3D Array of quadruplets to score, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. + + Returns + ------- + score : float + The quadruplets score. + """ + # Since the prediction is a vector of values in {-1, +1}, we need to + # rescale them to {0, 1} to compute the accuracy using the mean (because + # then 1 means a correctly classified result (pairs are in the right + # order), and a 0 an incorrectly classified result (pairs are in the + # wrong order). + return self.predict(quadruplets).mean() / 2 + 0.5 diff --git a/metric_learn/constraints.py b/metric_learn/constraints.py index 0f57b3e8..4993e9ef 100644 --- a/metric_learn/constraints.py +++ b/metric_learn/constraints.py @@ -3,87 +3,309 @@ from supervised data labels. """ import numpy as np -import random import warnings -from six.moves import xrange -from scipy.sparse import coo_matrix +from sklearn.utils import check_random_state +from sklearn.neighbors import NearestNeighbors + __all__ = ['Constraints'] class Constraints(object): + """ + Class to build constraints from labeled data. + + See more in the :ref:`User Guide `. + + Parameters + ---------- + partial_labels : `numpy.ndarray` of ints, shape=(n_samples,) + Array of labels, with -1 indicating unknown label. 
+ + Attributes + ---------- + partial_labels : `numpy.ndarray` of ints, shape=(n_samples,) + Array of labels, with -1 indicating unknown label. + """ + def __init__(self, partial_labels): - '''partial_labels : int arraylike, -1 indicating unknown label''' - partial_labels = np.asanyarray(partial_labels) - self.num_points, = partial_labels.shape - self.known_label_idx, = np.where(partial_labels >= 0) - self.known_labels = partial_labels[self.known_label_idx] - - def adjacency_matrix(self, num_constraints): - a, b, c, d = self.positive_negative_pairs(num_constraints) - row = np.concatenate((a, c)) - col = np.concatenate((b, d)) - data = np.ones_like(row, dtype=int) - data[len(a):] = -1 - adj = coo_matrix((data, (row, col)), shape=(self.num_points,)*2) - # symmetrize - return adj + adj.T - - def positive_negative_pairs(self, num_constraints, same_length=False): - a, b = self._pairs(num_constraints, same_label=True) - c, d = self._pairs(num_constraints, same_label=False) + partial_labels = np.asanyarray(partial_labels, dtype=int) + self.partial_labels = partial_labels + + def positive_negative_pairs(self, n_constraints, same_length=False, + random_state=None, num_constraints='deprecated'): + """ + Generates positive pairs and negative pairs from labeled data. + + Positive pairs are formed by randomly drawing ``n_constraints`` pairs of + points with the same label. Negative pairs are formed by randomly drawing + ``n_constraints`` pairs of points with different label. + + In the case where it is not possible to generate enough positive or + negative pairs, a smaller number of pairs will be returned with a warning. + + Parameters + ---------- + n_constraints : int + Number of positive and negative constraints to generate. + + same_length : bool, optional (default=False) + If True, forces the number of positive and negative pairs to be + equal by ignoring some pairs from the larger set. 
+ + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + Returns + ------- + a : array-like, shape=(n_constraints,) + 1D array of indicators for the left elements of positive pairs. + + b : array-like, shape=(n_constraints,) + 1D array of indicators for the right elements of positive pairs. + + c : array-like, shape=(n_constraints,) + 1D array of indicators for the left elements of negative pairs. + + d : array-like, shape=(n_constraints,) + 1D array of indicators for the right elements of negative pairs. + """ + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + random_state = check_random_state(random_state) + a, b = self._pairs(n_constraints, same_label=True, + random_state=random_state) + c, d = self._pairs(n_constraints, same_label=False, + random_state=random_state) if same_length and len(a) != len(c): n = min(len(a), len(c)) return a[:n], b[:n], c[:n], d[:n] return a, b, c, d - def _pairs(self, num_constraints, same_label=True, max_iter=10): - num_labels = len(self.known_labels) + def generate_knntriplets(self, X, k_genuine, k_impostor): + """ + Generates triplets from labeled data. + + For every point (X_a) the triplets (X_a, X_b, X_c) are constructed from all + the combinations of taking one of its `k_genuine`-nearest neighbors of the + same class (X_b) and taking one of its `k_impostor`-nearest neighbors of + other classes (X_c). 
+ + In the case a class doesn't have enough points in the same class (other + classes) to yield `k_genuine` (`k_impostor`) neighbors a warning will be + raised and the maximum value of genuine (impostor) neighbors will be used + for that class. + + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + k_genuine : int + Number of neighbors of the same class to be taken into account. + + k_impostor : int + Number of neighbors of different classes to be taken into account. + + Returns + ------- + triplets : array-like, shape=(n_constraints, 3) + 2D array of triplets of indicators. + """ + # Ignore unlabeled samples + known_labels_mask = self.partial_labels >= 0 + known_labels = self.partial_labels[known_labels_mask] + X = X[known_labels_mask] + + labels, labels_count = np.unique(known_labels, return_counts=True) + len_input = known_labels.shape[0] + + # Handle the case where there are too few elements to yield k_genuine or + # k_impostor neighbors for every class. + + k_genuine_vec = np.full_like(labels, k_genuine) + k_impostor_vec = np.full_like(labels, k_impostor) + + for i, count in enumerate(labels_count): + if k_genuine + 1 > count: + k_genuine_vec[i] = count-1 + warnings.warn("The class {} has {} elements, which is not sufficient " + "to generate {} genuine neighbors as specified by " + "k_genuine. Will generate {} genuine neighbors instead." + "\n" + .format(labels[i], count, k_genuine+1, + k_genuine_vec[i])) + if k_impostor > len_input - count: + k_impostor_vec[i] = len_input - count + warnings.warn("The class {} has {} elements of other classes, which is" + " not sufficient to generate {} impostor neighbors as " + "specified by k_impostor. 
Will generate {} impostor " + "neighbors instead.\n" + .format(labels[i], k_impostor_vec[i], k_impostor, + k_impostor_vec[i])) + + # The total number of possible triplets combinations per label comes from + # taking one of the k_genuine_vec[i] genuine neighbors and one of the + # k_impostor_vec[i] impostor neighbors for the labels_count[i] elements + comb_per_label = labels_count * k_genuine_vec * k_impostor_vec + + # Get start and finish for later triplet assigning + # append zero at the begining for start and get cumulative sum + start_finish_indices = np.hstack((0, comb_per_label)).cumsum() + + # Total number of triplets is the sum of all possible combinations per + # label + num_triplets = start_finish_indices[-1] + triplets = np.empty((num_triplets, 3), dtype=np.intp) + + neigh = NearestNeighbors() + + for i, label in enumerate(labels): + + # generate mask for current label + gen_mask = known_labels == label + gen_indx = np.where(gen_mask) + + # get k_genuine genuine neighbors + neigh.fit(X=X[gen_indx]) + # Take elements of gen_indx according to the yielded k-neighbors + gen_relative_indx = neigh.kneighbors(n_neighbors=k_genuine_vec[i], + return_distance=False) + gen_neigh = np.take(gen_indx, gen_relative_indx) + + # generate mask for impostors of current label + imp_indx = np.where(~gen_mask) + + # get k_impostor impostor neighbors + neigh.fit(X=X[imp_indx]) + # Take elements of imp_indx according to the yielded k-neighbors + imp_relative_indx = neigh.kneighbors(n_neighbors=k_impostor_vec[i], + X=X[gen_mask], + return_distance=False) + imp_neigh = np.take(imp_indx, imp_relative_indx) + + # length = len_label*k_genuine*k_impostor + start, finish = start_finish_indices[i:i+2] + + triplets[start:finish, :] = comb(gen_indx, gen_neigh, imp_neigh, + k_genuine_vec[i], + k_impostor_vec[i]) + + return triplets + + def _pairs(self, n_constraints, same_label=True, max_iter=10, + random_state=np.random): + known_label_idx, = np.where(self.partial_labels >= 0) + 
known_labels = self.partial_labels[known_label_idx] + num_labels = len(known_labels) ab = set() it = 0 - while it < max_iter and len(ab) < num_constraints: - nc = num_constraints - len(ab) - for aidx in np.random.randint(num_labels, size=nc): + while it < max_iter and len(ab) < n_constraints: + nc = n_constraints - len(ab) + for aidx in random_state.randint(num_labels, size=nc): if same_label: - mask = self.known_labels[aidx] == self.known_labels + mask = known_labels[aidx] == known_labels mask[aidx] = False # avoid identity pairs else: - mask = self.known_labels[aidx] != self.known_labels + mask = known_labels[aidx] != known_labels b_choices, = np.where(mask) if len(b_choices) > 0: - ab.add((aidx, np.random.choice(b_choices))) + ab.add((aidx, random_state.choice(b_choices))) it += 1 - if len(ab) < num_constraints: + if len(ab) < n_constraints: warnings.warn("Only generated %d %s constraints (requested %d)" % ( - len(ab), 'positive' if same_label else 'negative', num_constraints)) - ab = np.array(list(ab)[:num_constraints], dtype=int) - return self.known_label_idx[ab.T] - - def chunks(self, num_chunks=100, chunk_size=2): - chunks = -np.ones_like(self.known_label_idx, dtype=int) - uniq, lookup = np.unique(self.known_labels, return_inverse=True) - all_inds = [set(np.where(lookup==c)[0]) for c in xrange(len(uniq))] + len(ab), 'positive' if same_label else 'negative', n_constraints)) + ab = np.array(list(ab)[:n_constraints], dtype=int) + return known_label_idx[ab.T] + + def chunks(self, n_chunks=100, chunk_size=2, random_state=None, + num_chunks='deprecated'): + """ + Generates chunks from labeled data. + + Each of ``n_chunks`` chunks is composed of ``chunk_size`` points from + the same class drawn at random. Each point can belong to at most 1 chunk. + + In the case where there is not enough points to generate ``n_chunks`` + chunks of size ``chunk_size``, a ValueError will be raised. 
+ + Parameters + ---------- + n_chunks : int, optional (default=100) + Number of chunks to generate. + + chunk_size : int, optional (default=2) + Number of points in each chunk. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + num_chunks : Renamed to n_chunks. Will be deprecated in 0.7.0 + + Returns + ------- + chunks : array-like, shape=(n_samples,) + 1D array of chunk indicators, where -1 indicates that the point does not + belong to any chunk. + """ + if num_chunks != 'deprecated': + warnings.warn('"num_chunks" parameter has been renamed to' + ' "n_chunks". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_chunks = num_chunks + random_state = check_random_state(random_state) + chunks = -np.ones_like(self.partial_labels, dtype=int) + uniq, lookup = np.unique(self.partial_labels, return_inverse=True) + unknown_uniq = np.where(uniq < 0)[0] + all_inds = [set(np.where(lookup == c)[0]) for c in range(len(uniq)) + if c not in unknown_uniq] + max_chunks = int(np.sum([len(s) // chunk_size for s in all_inds])) + if max_chunks < n_chunks: + raise ValueError(('Not enough possible chunks of %d elements in each' + ' class to form expected %d chunks - maximum number' + ' of chunks is %d' + ) % (chunk_size, n_chunks, max_chunks)) idx = 0 - while idx < num_chunks and all_inds: - c = random.randint(0, len(all_inds)-1) + while idx < n_chunks and all_inds: + if len(all_inds) == 1: + c = 0 + else: + c = random_state.randint(0, high=len(all_inds) - 1) inds = all_inds[c] if len(inds) < chunk_size: del all_inds[c] continue - ii = random.sample(inds, chunk_size) + ii = random_state.choice(list(inds), chunk_size, replace=False) inds.difference_update(ii) chunks[ii] = idx idx += 1 - if idx < num_chunks: - raise ValueError('Unable to make %d chunks of %d examples each' % - (num_chunks, chunk_size)) return chunks - @staticmethod - def 
random_subset(all_labels, num_preserved=np.inf): - n = len(all_labels) - num_ignored = max(0, n - num_preserved) - idx = np.random.randint(n, size=num_ignored) - partial_labels = np.array(all_labels, copy=True) - partial_labels[idx] = -1 - return Constraints(partial_labels) + +def comb(A, B, C, sizeB, sizeC): + # generate_knntriplets helper function + # generate an array with all combinations of choosing + # an element from A, B and C + return np.vstack((np.tile(A, (sizeB*sizeC, 1)).ravel(order='F'), + np.tile(np.hstack(B), (sizeC, 1)).ravel(order='F'), + np.tile(C, (1, sizeB)).ravel())).T + + +def wrap_pairs(X, constraints): + a = np.array(constraints[0]) + b = np.array(constraints[1]) + c = np.array(constraints[2]) + d = np.array(constraints[3]) + constraints = np.vstack((np.column_stack((a, b)), np.column_stack((c, d)))) + y = np.concatenate([np.ones_like(a), -np.ones_like(c)]) + pairs = X[constraints] + return pairs, y diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 541cbfa9..2c05b28d 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -1,30 +1,60 @@ """ Covariance metric (baseline method) - -This method does not "learn" anything, rather it calculates -the covariance matrix of the input data. - -This is a simple baseline method first introduced in -On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936 """ -from __future__ import absolute_import import numpy as np +import scipy +from sklearn.base import TransformerMixin + +from .base_metric import MahalanobisMixin +from ._util import components_from_metric + + +class Covariance(MahalanobisMixin, TransformerMixin): + """Covariance metric (baseline method) -from .base_metric import BaseMetricLearner + This method does not "learn" anything, rather it calculates + the covariance matrix of the input data. 
+ This is a simple baseline method first introduced in + On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936 -class Covariance(BaseMetricLearner): - def __init__(self): - self.params = {} + Read more in the :ref:`User Guide `. - def metric(self): - return self.M + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + Examples + -------- + >>> from metric_learn import Covariance + >>> from sklearn.datasets import load_iris + >>> iris = load_iris()['data'] + >>> cov = Covariance().fit(iris) + >>> x = cov.transform(iris) + + """ + + def __init__(self, preprocessor=None): + super(Covariance, self).__init__(preprocessor) def fit(self, X, y=None): """ - X: data matrix, (n x d) - y: unused, optional + Calculates the covariance matrix of the input data. + + Parameters + ---------- + X : data matrix, (n x d) + y : unused """ - self.M = np.cov(X.T) + X = self._prepare_inputs(X, ensure_min_samples=2) + M = np.atleast_2d(np.cov(X, rowvar=False)) + if M.size == 1: + M = 1. / M + else: + M = scipy.linalg.pinvh(M) + + self.components_ = components_from_metric(np.atleast_2d(M)) return self diff --git a/metric_learn/exceptions.py b/metric_learn/exceptions.py new file mode 100644 index 00000000..76f09778 --- /dev/null +++ b/metric_learn/exceptions.py @@ -0,0 +1,20 @@ +""" +The :mod:`metric_learn.exceptions` module includes all custom warnings and +error classes used across metric-learn. +""" +from numpy.linalg import LinAlgError + + +class PreprocessorError(Exception): + + def __init__(self, original_error): + err_msg = ("An error occurred when trying to use the " + "preprocessor: {}").format(repr(original_error)) + super(PreprocessorError, self).__init__(err_msg) + + +class NonPSDError(LinAlgError): + + def __init__(self): + err_msg = "Matrix is not positive semidefinite (PSD)." 
+ super(LinAlgError, self).__init__(err_msg) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 7f2118bd..9537eec2 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -1,184 +1,407 @@ """ -Information Theoretic Metric Learning, Kulis et al., ICML 2007 - -ITML minimizes the differential relative entropy between two multivariate -Gaussians under constraints on the distance function, -which can be formulated into a Bregman optimization problem by minimizing the -LogDet divergence subject to linear constraints. -This algorithm can handle a wide variety of constraints and can optionally -incorporate a prior on the distance function. -Unlike some other methods, ITML does not rely on an eigenvalue computation -or semi-definite programming. +Information Theoretic Metric Learning (ITML) """ -from __future__ import print_function, absolute_import import numpy as np -from six.moves import xrange from sklearn.metrics import pairwise_distances +from sklearn.utils.validation import check_array +from sklearn.base import TransformerMixin +from .base_metric import _PairsClassifierMixin, MahalanobisMixin +from .constraints import Constraints, wrap_pairs +from ._util import components_from_metric, _initialize_metric_mahalanobis +import warnings -from .base_metric import BaseMetricLearner -from .constraints import Constraints - -class ITML(BaseMetricLearner): +class _BaseITML(MahalanobisMixin): """Information Theoretic Metric Learning (ITML)""" - def __init__(self, gamma=1., max_iters=1000, convergence_threshold=1e-3, - verbose=False): - """Initialize the learner. 
- Parameters - ---------- - gamma : float, optional - value for slack variables - max_iters : int, optional - convergence_threshold : float, optional - verbose : bool, optional - if True, prints information while learning - """ - self.params = { - 'gamma': gamma, - 'max_iters': max_iters, - 'convergence_threshold': convergence_threshold, - 'verbose': verbose, - } - - def _process_inputs(self, X, constraints, bounds, A0): - self.X = X - # check to make sure that no two constrained vectors are identical - a,b,c,d = constraints - ident = _vector_norm(self.X[a] - self.X[b]) > 1e-9 - a, b = a[ident], b[ident] - ident = _vector_norm(self.X[c] - self.X[d]) > 1e-9 - c, d = c[ident], d[ident] + _tuple_size = 2 # constraints are pairs + + def __init__(self, gamma=1., max_iter=1000, tol=1e-3, + prior='identity', verbose=False, + preprocessor=None, random_state=None, + convergence_threshold='deprecated'): + if convergence_threshold != 'deprecated': + warnings.warn('"convergence_threshold" parameter has been ' + ' renamed to "tol". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + tol = convergence_threshold + self.convergence_threshold = 'deprecated' # Avoid errors + self.gamma = gamma + self.max_iter = max_iter + self.tol = tol + self.prior = prior + self.verbose = verbose + self.random_state = random_state + super(_BaseITML, self).__init__(preprocessor) + + def _fit(self, pairs, y, bounds=None): + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') # init bounds if bounds is None: - self.bounds = np.percentile(pairwise_distances(X), (5, 95)) - else: - assert len(bounds) == 2 - self.bounds = bounds - # init metric - if A0 is None: - self.A = np.identity(X.shape[1]) + X = np.unique(np.vstack(pairs), axis=0) + self.bounds_ = np.percentile(pairwise_distances(X), (5, 95)) else: - self.A = A0 - return a,b,c,d - - def fit(self, X, constraints, bounds=None, A0=None): - """Learn the ITML model. 
- - Parameters - ---------- - X : (n x d) data matrix - each row corresponds to a single instance - constraints : 4-tuple of arrays - (a,b,c,d) indices into X, such that d(X[a],X[b]) < d(X[c],X[d]) - bounds : list (pos,neg) pairs, optional - bounds on similarity, s.t. d(X[a],X[b]) < pos and d(X[c],X[d]) > neg - A0 : (d x d) matrix, optional - initial regularization matrix, defaults to identity - """ - verbose = self.params['verbose'] - a,b,c,d = self._process_inputs(X, constraints, bounds, A0) - gamma = self.params['gamma'] - conv_thresh = self.params['convergence_threshold'] - num_pos = len(a) - num_neg = len(c) + bounds = check_array(bounds, allow_nd=False, ensure_min_samples=0, + ensure_2d=False) + bounds = bounds.ravel() + if bounds.size != 2: + raise ValueError("`bounds` should be an array-like of two elements.") + self.bounds_ = bounds + self.bounds_[self.bounds_ == 0] = 1e-9 + # set the prior + # pairs will be deduplicated into X two times, TODO: avoid that + A = _initialize_metric_mahalanobis(pairs, self.prior, self.random_state, + strict_pd=True, + matrix_name='prior') + gamma = self.gamma + pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] + num_pos = len(pos_pairs) + num_neg = len(neg_pairs) _lambda = np.zeros(num_pos + num_neg) lambdaold = np.zeros_like(_lambda) - gamma_proj = 1. if gamma is np.inf else gamma/(gamma+1.) - pos_bhat = np.zeros(num_pos) + self.bounds[0] - neg_bhat = np.zeros(num_neg) + self.bounds[1] - A = self.A + gamma_proj = 1. if gamma is np.inf else gamma / (gamma + 1.) 
+ pos_bhat = np.zeros(num_pos) + self.bounds_[0] + neg_bhat = np.zeros(num_neg) + self.bounds_[1] + pos_vv = pos_pairs[:, 0, :] - pos_pairs[:, 1, :] + neg_vv = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] - for it in xrange(self.params['max_iters']): + for it in range(self.max_iter): # update positives - vv = self.X[a] - self.X[b] - for i,v in enumerate(vv): + for i, v in enumerate(pos_vv): wtw = v.dot(A).dot(v) # scalar - alpha = min(_lambda[i], gamma_proj*(1./wtw - 1./pos_bhat[i])) + alpha = min(_lambda[i], gamma_proj * (1. / wtw - 1. / pos_bhat[i])) _lambda[i] -= alpha - beta = alpha/(1 - alpha*wtw) - pos_bhat[i] = 1./((1 / pos_bhat[i]) + (alpha / gamma)) - A += beta * A.dot(np.outer(v,v)).dot(A) + beta = alpha / (1 - alpha * wtw) + pos_bhat[i] = 1. / ((1 / pos_bhat[i]) + (alpha / gamma)) + Av = A.dot(v) + A += np.outer(Av, Av * beta) # update negatives - vv = self.X[c] - self.X[d] - for i,v in enumerate(vv): + for i, v in enumerate(neg_vv): wtw = v.dot(A).dot(v) # scalar - alpha = min(_lambda[i+num_pos],gamma_proj*(1./neg_bhat[i] - 1./wtw)) - _lambda[i+num_pos] -= alpha - beta = -alpha/(1 + alpha*wtw) - neg_bhat[i] = 1./((1 / neg_bhat[i]) - (alpha / gamma)) - A += beta * A.dot(np.outer(v,v)).dot(A) + alpha = min(_lambda[i + num_pos], + gamma_proj * (1. / neg_bhat[i] - 1. / wtw)) + _lambda[i + num_pos] -= alpha + beta = -alpha / (1 + alpha * wtw) + neg_bhat[i] = 1. 
/ ((1 / neg_bhat[i]) - (alpha / gamma)) + Av = A.dot(v) + A += np.outer(Av, Av * beta) normsum = np.linalg.norm(_lambda) + np.linalg.norm(lambdaold) if normsum == 0: conv = np.inf break conv = np.abs(lambdaold - _lambda).sum() / normsum - if conv < conv_thresh: + if conv < self.tol: break lambdaold = _lambda.copy() - if verbose: + if self.verbose: print('itml iter: %d, conv = %f' % (it, conv)) - if verbose: + + if self.verbose: print('itml converged at iter: %d, conv = %f' % (it, conv)) + self.n_iter_ = it + + self.components_ = components_from_metric(A) return self - def metric(self): - return self.A -# hack around lack of axis kwarg in older numpy versions -try: - np.linalg.norm([[4]], axis=1) -except TypeError: - def _vector_norm(X): - return np.apply_along_axis(np.linalg.norm, 1, X) -else: - def _vector_norm(X): - return np.linalg.norm(X, axis=1) +class ITML(_BaseITML, _PairsClassifierMixin): + """Information Theoretic Metric Learning (ITML) + `ITML` minimizes the (differential) relative entropy, aka Kullback-Leibler + divergence, between two multivariate Gaussians subject to constraints on the + associated Mahalanobis distance, which can be formulated into a Bregman + optimization problem by minimizing the LogDet divergence subject to + linear constraints. This algorithm can handle a wide variety of constraints + and can optionally incorporate a prior on the distance function. Unlike some + other methods, `ITML` does not rely on an eigenvalue computation or + semi-definite programming. -class ITML_Supervised(ITML): - """Information Theoretic Metric Learning (ITML)""" - def __init__(self, gamma=1., max_iters=1000, convergence_threshold=1e-3, - num_labeled=np.inf, num_constraints=None, bounds=None, A0=None, - verbose=False): - """Initialize the learner. + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + gamma : float, optional (default=1.0) + Value for slack variables + + max_iter : int, optional (default=1000) + Maximum number of iteration of the optimization procedure. + + tol : float, optional (default=1e-3) + Convergence tolerance. + + prior : string or numpy array, optional (default='identity') + The Mahalanobis matrix to use as a prior. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). For ITML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 + + Attributes + ---------- + bounds_ : `numpy.ndarray`, shape=(2,) + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. 
If + not provided at initialization, bounds_[0] and bounds_[1] are set at + train time to the 5th and 95th percentile of the pairwise distances among + all points present in the input `pairs`. + + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. + + Examples + -------- + >>> from metric_learn import ITML + >>> pairs = [[[1.2, 7.5], [1.3, 1.5]], + >>> [[6.4, 2.6], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6]], + >>> [[6.2, 5.5], [5.4, 5.4]]] + >>> y = [1, 1, -1, -1] + >>> # in this task we want points where the first feature is close to be + >>> # closer to each other, no matter how close the second feature is + >>> itml = ITML() + >>> itml.fit(pairs, y) + + References + ---------- + .. [1] Jason V. Davis, et al. `Information-theoretic Metric Learning + `_. ICML 2007. + """ + + def fit(self, pairs, y, bounds=None, calibration_params=None): + """Learn the ITML model. + + The threshold will be calibrated on the trainset using the parameters + `calibration_params`. Parameters ---------- - gamma : float, optional - value for slack variables - max_iters : int, optional - convergence_threshold : float, optional - num_labeled : int, optional - number of labels to preserve for training - num_constraints: int, optional - number of constraints to generate - verbose : bool, optional - if True, prints information while learning + pairs: array-like, shape=(n_constraints, 2, n_features) or \ + (n_constraints, 2) + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. 
+ + y: array-like, of shape (n_constraints,) + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + + bounds : array-like of two numbers + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] will be + set to the 5th and 95th percentile of the pairwise distances among all + points present in the input `pairs`. + + calibration_params : `dict` or `None` + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. + + Returns + ------- + self : object + Returns the instance. """ - ITML.__init__(self, gamma=gamma, max_iters=max_iters, - convergence_threshold=convergence_threshold, verbose=verbose) - self.params.update(num_labeled=num_labeled, num_constraints=num_constraints, - bounds=bounds, A0=A0) + calibration_params = (calibration_params if calibration_params is not + None else dict()) + self._validate_calibration_params(**calibration_params) + self._fit(pairs, y, bounds=bounds) + self.calibrate_threshold(pairs, y, **calibration_params) + return self + + +class ITML_Supervised(_BaseITML, TransformerMixin): + """Supervised version of Information Theoretic Metric Learning (ITML) + + `ITML_Supervised` creates pairs of similar sample by taking same class + samples, and pairs of dissimilar samples by taking different class + samples. It then passes these pairs to `ITML` for training. + + Parameters + ---------- + gamma : float, optional (default=1.0) + Value for slack variables + + max_iter : int, optional (default=1000) + Maximum number of iterations of the optimization procedure. 
+ + tol : float, optional (default=1e-3) + Tolerance of the optimization procedure. + + n_constraints : int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. + + prior : string or numpy array, optional (default='identity') + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). For ITML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. - def fit(self, X, labels): + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. In any + case, `random_state` is also used to randomly sample constraints from + labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 + + Attributes + ---------- + bounds_ : `numpy.ndarray`, shape=(2,) + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. 
+ If not provided at initialization, bounds_[0] and bounds_[1] are set at + train time to the 5th and 95th percentile of the pairwise distances + among all points in the training data `X`. + + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + Examples + -------- + >>> from metric_learn import ITML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> itml = ITML_Supervised(n_constraints=200) + >>> itml.fit(X, Y) + + See Also + -------- + metric_learn.ITML : The original weakly-supervised algorithm + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def __init__(self, gamma=1.0, max_iter=1000, tol=1e-3, + n_constraints=None, prior='identity', + verbose=False, preprocessor=None, random_state=None, + num_constraints='deprecated', + convergence_threshold='deprecated'): + _BaseITML.__init__(self, gamma=gamma, max_iter=max_iter, + tol=tol, + prior=prior, verbose=verbose, + preprocessor=preprocessor, + random_state=random_state, + convergence_threshold=convergence_threshold) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_constraints = num_constraints + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed sholud be set) + self.num_constraints = 'deprecated' + + def fit(self, X, y, bounds=None): """Create constraints from labels and learn the ITML model. - Needs num_constraints specified in constructor. 
+ Parameters ---------- - X : (n x d) data matrix - each row corresponds to a single instance - labels : (n) data labels + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. + + bounds : array-like of two numbers + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] will be + set to the 5th and 95th percentile of the pairwise distances among all + points in the training data `X`. """ - num_constraints = self.params['num_constraints'] - if num_constraints is None: - num_classes = np.unique(labels) - num_constraints = 20*(len(num_classes))**2 - - c = Constraints.random_subset(labels, self.params['num_labeled']) - return ITML.fit(self, X, c.positive_negative_pairs(num_constraints), - bounds=self.params['bounds'], A0=self.params['A0']) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + n_constraints = self.n_constraints + if n_constraints is None: + num_classes = len(np.unique(y)) + n_constraints = 20 * num_classes**2 + + c = Constraints(y) + pos_neg = c.positive_negative_pairs(n_constraints, + random_state=self.random_state) + pairs, y = wrap_pairs(X, pos_neg) + return _BaseITML._fit(self, pairs, y, bounds=bounds) diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 097379de..82ae20eb 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -1,122 +1,171 @@ """ Local Fisher Discriminant Analysis (LFDA) - -Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction -Sugiyama, ICML 2006 - -LFDA is a linear supervised dimensionality reduction method. -It is particularly useful when dealing with multimodality, -where one ore more classes consist of separate clusters in input space. 
-The core optimization problem of LFDA is solved as a generalized -eigenvalue problem. """ -from __future__ import division, absolute_import import numpy as np import scipy -from six.moves import xrange +import warnings from sklearn.metrics import pairwise_distances +from sklearn.base import TransformerMixin -from .base_metric import BaseMetricLearner +from ._util import _check_n_components +from .base_metric import MahalanobisMixin -class LFDA(BaseMetricLearner): +class LFDA(MahalanobisMixin, TransformerMixin): ''' Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction - Sugiyama, ICML 2006 - ''' - def __init__(self, dim=None, k=7, metric='weighted'): - ''' - dim : dimensionality of reduced space (defaults to dimension of X) - k : nearest neighbor used in local scaling method (default: 7) - metric : type of metric in the embedding space (default: 'weighted') - 'weighted' - weighted eigenvectors - 'orthonormalized' - orthonormalized - 'plain' - raw eigenvectors - ''' - if metric not in ('weighted', 'orthonormalized', 'plain'): - raise ValueError('Invalid metric: %r' % metric) - self.params = { - 'dim': dim, - 'metric': metric, - 'k': k, - } - - def transformer(self): - return self._transformer + LFDA is a linear supervised dimensionality reduction method. It is + particularly useful when dealing with multimodality, where one ore more + classes consist of separate clusters in input space. The core optimization + problem of LFDA is solved as a generalized eigenvalue problem. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + k : int, optional (default=None) + Number of nearest neighbors used in local scaling method. If None, + defaults to min(7, n_features - 1). + + embedding_type : str, optional (default: 'weighted') + Type of metric in the embedding space. 
+ + 'weighted' + weighted eigenvectors + + 'orthonormalized' + orthonormalized + + 'plain' + raw eigenvectors + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + Examples + -------- + + >>> import numpy as np + >>> from metric_learn import LFDA + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> lfda = LFDA(k=2, dim=2) + >>> lfda.fit(X, Y) + + References + ---------- + .. [1] Masashi Sugiyama. `Dimensionality Reduction of Multimodal Labeled + Data by Local Fisher Discriminant Analysis + `_. JMLR 2007. + + .. [2] Yuan Tang. `Local Fisher Discriminant Analysis on Beer Style + Clustering + `_. + ''' - def _process_inputs(self, X, Y): - X = np.asanyarray(X) - self.X = X + def __init__(self, n_components=None, + k=None, embedding_type='weighted', preprocessor=None): + if embedding_type not in ('weighted', 'orthonormalized', 'plain'): + raise ValueError('Invalid embedding_type: %r' % embedding_type) + self.n_components = n_components + self.embedding_type = embedding_type + self.k = k + super(LFDA, self).__init__(preprocessor) + + def fit(self, X, y): + '''Fit the LFDA model. + + Parameters + ---------- + X : (n, d) array-like + Input data. + + y : (n,) array-like + Class labels, one per point of data. 
+ ''' + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + unique_classes, y = np.unique(y, return_inverse=True) n, d = X.shape - unique_classes, Y = np.unique(Y, return_inverse=True) num_classes = len(unique_classes) - if self.params['dim'] is None: - self.params['dim'] = d - elif not 0 < self.params['dim'] <= d: - raise ValueError('Invalid embedding dimension, must be in [1,%d]' % d) - - if not 0 < self.params['k'] < d: - raise ValueError('Invalid k, must be in [0,%d]' % (d-1)) + dim = _check_n_components(d, self.n_components) - return X, Y, num_classes, n, d - - def fit(self, X, Y): - ''' - X: (n, d) array-like of samples - Y: (n,) array-like of class labels - ''' - X, Y, num_classes, n, d = self._process_inputs(X, Y) - tSb = np.zeros((d,d)) - tSw = np.zeros((d,d)) + if self.k is None: + k = min(7, d - 1) + elif self.k >= d: + warnings.warn('Chosen k (%d) too large, using %d instead.' + % (self.k, d - 1)) + k = d - 1 + else: + k = int(self.k) + tSb = np.zeros((d, d)) + tSw = np.zeros((d, d)) - for c in xrange(num_classes): - Xc = X[Y==c] + for c in range(num_classes): + Xc = X[y == c] nc = Xc.shape[0] # classwise affinity matrix dist = pairwise_distances(Xc, metric='l2', squared=True) # distances to k-th nearest neighbor - k = min(self.params['k'], nc-1) - sigma = np.sqrt(np.partition(dist, k, axis=0)[:,k]) + k = min(k, nc - 1) + sigma = np.sqrt(np.partition(dist, k, axis=0)[:, k]) local_scale = np.outer(sigma, sigma) with np.errstate(divide='ignore', invalid='ignore'): - A = np.exp(-dist/local_scale) - A[local_scale==0] = 0 + A = np.exp(-dist / local_scale) + A[local_scale == 0] = 0 - G = Xc.T.dot(A.sum(axis=0)[:,None] * Xc) - Xc.T.dot(A).dot(Xc) - tSb += G/n + (1-nc/n)*Xc.T.dot(Xc) + _sum_outer(Xc)/n - tSw += G/nc + G = Xc.T.dot(A.sum(axis=0)[:, None] * Xc) - Xc.T.dot(A).dot(Xc) + tSb += G / n + (1 - nc / n) * Xc.T.dot(Xc) + _sum_outer(Xc) / n + tSw += G / nc - tSb -= _sum_outer(X)/n - tSw + tSb -= _sum_outer(X) / n - tSw # symmetrize - tSb += tSb.T - 
tSb /= 2 - tSw += tSw.T - tSw /= 2 + tSb = (tSb + tSb.T) / 2 + tSw = (tSw + tSw.T) / 2 - if self.params['dim'] == d: - vals, vecs = scipy.linalg.eigh(tSb, tSw) - else: - vals, vecs = scipy.sparse.linalg.eigsh(tSb, k=self.params['dim'], M=tSw, - which='LA') - - order = np.argsort(-vals)[:self.params['dim']] - vals = vals[order] - vecs = vecs[:,order] + vals, vecs = _eigh(tSb, tSw, dim) + order = np.argsort(-vals)[:dim] + vals = vals[order].real + vecs = vecs[:, order] - if self.params['metric'] == 'weighted': + if self.embedding_type == 'weighted': vecs *= np.sqrt(vals) - elif self.params['metric'] == 'orthonormalized': + elif self.embedding_type == 'orthonormalized': vecs, _ = np.linalg.qr(vecs) - self._transformer = vecs.T + self.components_ = vecs.T return self def _sum_outer(x): s = x.sum(axis=0) return np.outer(s, s) + + +def _eigh(a, b, dim): + try: + return scipy.sparse.linalg.eigsh(a, k=dim, M=b, which='LA') + except np.linalg.LinAlgError: + pass # scipy already tried eigh for us + except (ValueError, scipy.sparse.linalg.ArpackNoConvergence): + try: + return scipy.linalg.eigh(a, b) + except np.linalg.LinAlgError: + pass + return scipy.linalg.eig(a, b) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 757d1be5..47bb065f 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,208 +1,320 @@ """ -Large-margin nearest neighbor metric learning. (Weinberger 2005) - -LMNN learns a Mahanalobis distance metric in the kNN classification setting -using semidefinite programming. -The learned metric attempts to keep k-nearest neighbors in the same class, -while keeping examples from different classes separated by a large margin. -This algorithm makes no assumptions about the distribution of the data. 
+Large Margin Nearest Neighbor Metric learning (LMNN) """ -#TODO: periodic recalculation of impostors, PCA initialization - -from __future__ import print_function, absolute_import import numpy as np from collections import Counter -from six.moves import xrange -from sklearn.metrics import pairwise_distances - -from .base_metric import BaseMetricLearner - - -# commonality between LMNN implementations -class _base_LMNN(BaseMetricLearner): - def __init__(self, **kwargs): - self.params = kwargs - - def transformer(self): - return self.L - - -# slower Python version -class python_LMNN(_base_LMNN): - def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, - regularization=0.5, convergence_tol=0.001, verbose=False): - """Initialize the LMNN object - - k: number of neighbors to consider. (does not include self-edges) - regularization: weighting of pull and push terms - """ - _base_LMNN.__init__(self, k=k, min_iter=min_iter, max_iter=max_iter, - learn_rate=learn_rate, regularization=regularization, - convergence_tol=convergence_tol, verbose=verbose) - - def _process_inputs(self, X, labels): - num_pts = X.shape[0] - assert len(labels) == num_pts - unique_labels, self.label_inds = np.unique(labels, return_inverse=True) - self.labels = np.arange(len(unique_labels)) - self.X = X - self.L = np.eye(X.shape[1]) - required_k = np.bincount(self.label_inds).min() - assert self.params['k'] <= required_k, ( - 'not enough class labels for specified k' - ' (smallest class has %d)' % required_k) - - def fit(self, X, labels): - k = self.params['k'] - verbose = self.params['verbose'] - reg = self.params['regularization'] - learn_rate = self.params['learn_rate'] - convergence_tol = self.params['convergence_tol'] - min_iter = self.params['min_iter'] - self._process_inputs(X, labels) - - target_neighbors = self._select_targets() - impostors = self._find_impostors(target_neighbors[:,-1]) +from sklearn.metrics import euclidean_distances +from sklearn.base import TransformerMixin 
+import warnings + +from ._util import _initialize_components, _check_n_components +from .base_metric import MahalanobisMixin + + +class LMNN(MahalanobisMixin, TransformerMixin): + """Large Margin Nearest Neighbor (LMNN) + + LMNN learns a Mahalanobis distance metric in the kNN classification + setting. The learned metric attempts to keep close k-nearest neighbors + from the same class, while keeping examples from different classes + separated by a large margin. This algorithm makes no assumptions about + the distribution of the data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda', as + it uses labels information. If not, but + ``n_components < min(n_features, n_samples)``, we use 'pca', as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. 
Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + n_neighbors : int, optional (default=3) + Number of neighbors to consider, not including self-edges. + + min_iter : int, optional (default=50) + Minimum number of iterations of the optimization procedure. + + max_iter : int, optional (default=1000) + Maximum number of iterations of the optimization procedure. + + learn_rate : float, optional (default=1e-7) + Learning rate of the optimization procedure + + tol : float, optional (default=0.001) + Tolerance of the optimization procedure. If the objective value varies + less than `tol`, we consider the algorithm has converged and stop it. + + verbose : bool, optional (default=False) + Whether to print the progress of the optimization procedure. + + regularization: float, optional (default=0.5) + Relative weight between pull and push terms, with 0.5 meaning equal + weight. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + k : Renamed to n_neighbors. Will be deprecated in 0.7.0 + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. 
+ + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + Examples + -------- + + >>> import numpy as np + >>> from metric_learn import LMNN + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> lmnn = LMNN(n_neighbors=5, learn_rate=1e-6) + >>> lmnn.fit(X, Y, verbose=False) + + References + ---------- + .. [1] K. Q. Weinberger, J. Blitzer, L. K. Saul. `Distance Metric + Learning for Large Margin Nearest Neighbor Classification + `_. NIPS + 2005. + """ + + def __init__(self, init='auto', n_neighbors=3, min_iter=50, max_iter=1000, + learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, + verbose=False, preprocessor=None, + n_components=None, random_state=None, k='deprecated'): + self.init = init + if k != 'deprecated': + warnings.warn('"k" parameter has been renamed to' + ' "n_neighbors". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_neighbors = k + self.k = 'deprecated' # To avoid no_attribute error + self.n_neighbors = n_neighbors + self.min_iter = min_iter + self.max_iter = max_iter + self.learn_rate = learn_rate + self.regularization = regularization + self.convergence_tol = convergence_tol + self.verbose = verbose + self.n_components = n_components + self.random_state = random_state + super(LMNN, self).__init__(preprocessor) + + def fit(self, X, y): + k = self.n_neighbors + reg = self.regularization + learn_rate = self.learn_rate + + X, y = self._prepare_inputs(X, y, dtype=float, + ensure_min_samples=2) + num_pts, d = X.shape + output_dim = _check_n_components(d, self.n_components) + unique_labels, label_inds = np.unique(y, return_inverse=True) + if len(label_inds) != num_pts: + raise ValueError('Must have one label per point.') + self.labels_ = np.arange(len(unique_labels)) + + self.components_ = _initialize_components(output_dim, X, y, self.init, + 
self.verbose, + random_state=self.random_state) + required_k = np.bincount(label_inds).min() + if self.n_neighbors > required_k: + raise ValueError('not enough class labels for specified k' + ' (smallest class has %d)' % required_k) + + target_neighbors = self._select_targets(X, label_inds) # sum outer products - dfG = _sum_outer_products(self.X, target_neighbors.flatten(), - np.repeat(np.arange(self.X.shape[0]), k)) - df = np.zeros_like(dfG) - - # storage - a1 = [None]*k - a2 = [None]*k - for nn_idx in xrange(k): - a1[nn_idx] = np.array([]) - a2[nn_idx] = np.array([]) - - # initialize gradient and L - G = dfG * reg + df * (1-reg) - L = self.L - objective = np.inf + dfG = _sum_outer_products(X, target_neighbors.flatten(), + np.repeat(np.arange(X.shape[0]), k)) + + # initialize L + L = self.components_ + + # first iteration: we compute variables (including objective and gradient) + # at initialization point + G, objective, total_active = self._loss_grad(X, L, dfG, k, + reg, target_neighbors, + label_inds) + + it = 1 # we already made one iteration + + if self.verbose: + print("iter | objective | objective difference | active constraints", + "| learning rate") # main loop - for it in xrange(1, self.params['max_iter']): - df_old = df.copy() - a1_old = [a.copy() for a in a1] - a2_old = [a.copy() for a in a2] - objective_old = objective - # Compute pairwise distances under current metric - Lx = L.dot(self.X.T).T - g0 = _inplace_paired_L2(*Lx[impostors]) - Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:,None,:]) - g1,g2 = Ni[impostors] - - # compute the gradient - total_active = 0 - for nn_idx in reversed(xrange(k)): - act1 = g0 < g1[:,nn_idx] - act2 = g0 < g2[:,nn_idx] - total_active += act1.sum() + act2.sum() - - if it > 1: - plus1 = act1 & ~a1[nn_idx] - minus1 = a1[nn_idx] & ~act1 - plus2 = act2 & ~a2[nn_idx] - minus2 = a2[nn_idx] & ~act2 + for it in range(2, self.max_iter): + # then at each iteration, we try to find a value of L that has better + # objective 
than the previous L, following the gradient: + while True: + # the next point next_L to try out is found by a gradient step + L_next = L - learn_rate * G + # we compute the objective at next point + # we copy variables that can be modified by _loss_grad, because if we + # retry we don t want to modify them several times + (G_next, objective_next, total_active_next) = ( + self._loss_grad(X, L_next, dfG, k, reg, target_neighbors, + label_inds)) + assert not np.isnan(objective) + delta_obj = objective_next - objective + if delta_obj > 0: + # if we did not find a better objective, we retry with an L closer to + # the starting point, by decreasing the learning rate (making the + # gradient step smaller) + learn_rate /= 2 else: - plus1 = act1 - plus2 = act2 - minus1 = np.zeros(0, dtype=int) - minus2 = np.zeros(0, dtype=int) - - targets = target_neighbors[:,nn_idx] - PLUS, pweight = _count_edges(plus1, plus2, impostors, targets) - df += _sum_outer_products(self.X, PLUS[:,0], PLUS[:,1], pweight) - MINUS, mweight = _count_edges(minus1, minus2, impostors, targets) - df -= _sum_outer_products(self.X, MINUS[:,0], MINUS[:,1], mweight) - - in_imp, out_imp = impostors - df += _sum_outer_products(self.X, in_imp[minus1], out_imp[minus1]) - df += _sum_outer_products(self.X, in_imp[minus2], out_imp[minus2]) - - df -= _sum_outer_products(self.X, in_imp[plus1], out_imp[plus1]) - df -= _sum_outer_products(self.X, in_imp[plus2], out_imp[plus2]) - - a1[nn_idx] = act1 - a2[nn_idx] = act2 - - # do the gradient update - assert not np.isnan(df).any() - G = dfG * reg + df * (1-reg) - - # compute the objective function - objective = total_active * (1-reg) - objective += G.flatten().dot(L.T.dot(L).flatten()) - assert not np.isnan(objective) - delta_obj = objective - objective_old - - if verbose: + # otherwise, if we indeed found a better obj, we get out of the loop + break + # when the better L is found (and the related variables), we set the + # old variables to these new ones before next 
iteration and we + # slightly increase the learning rate + L = L_next + G, objective, total_active = G_next, objective_next, total_active_next + learn_rate *= 1.01 + + if self.verbose: print(it, objective, delta_obj, total_active, learn_rate) - # update step size - if delta_obj > 0: - # we're getting worse... roll back! - learn_rate /= 2.0 - df = df_old - a1 = a1_old - a2 = a2_old - objective = objective_old - else: - # update L - L -= learn_rate * 2 * L.dot(G) - learn_rate *= 1.01 - # check for convergence - if it > min_iter and abs(delta_obj) < convergence_tol: - if verbose: + if it > self.min_iter and abs(delta_obj) < self.convergence_tol: + if self.verbose: print("LMNN converged with objective", objective) break else: - if verbose: - print("LMNN didn't converge in %(max_iter)d steps." % self.params) + if self.verbose: + print("LMNN didn't converge in %d steps." % self.max_iter) # store the last L - self.L = L + self.components_ = L + self.n_iter_ = it return self - def metric(self): - return self.L.T.dot(self.L) - - def transform(self, X=None): - if X is None: - X = self.X - return self.L.dot(X.T).T - - def _select_targets(self): - k = self.params['k'] - target_neighbors = np.empty((self.X.shape[0], k), dtype=int) - for label in self.labels: - inds, = np.nonzero(self.label_inds == label) - dd = pairwise_distances(self.X[inds]) + def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): + # Compute pairwise distances under current metric + Lx = L.dot(X.T).T + + # we need to find the furthest neighbor: + Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:, None, :]) + furthest_neighbors = np.take_along_axis(target_neighbors, + Ni.argmax(axis=1)[:, None], 1) + impostors = self._find_impostors(furthest_neighbors.ravel(), X, + label_inds, L) + + g0 = _inplace_paired_L2(*Lx[impostors]) + + # we reorder the target neighbors + g1, g2 = Ni[impostors] + # compute the gradient + total_active = 0 + df = np.zeros((X.shape[1], X.shape[1])) + for nn_idx in 
reversed(range(k)): # note: reverse not useful here + act1 = g0 < g1[:, nn_idx] + act2 = g0 < g2[:, nn_idx] + total_active += act1.sum() + act2.sum() + + targets = target_neighbors[:, nn_idx] + PLUS, pweight = _count_edges(act1, act2, impostors, targets) + df += _sum_outer_products(X, PLUS[:, 0], PLUS[:, 1], pweight) + + in_imp, out_imp = impostors + df -= _sum_outer_products(X, in_imp[act1], out_imp[act1]) + df -= _sum_outer_products(X, in_imp[act2], out_imp[act2]) + + # do the gradient update + assert not np.isnan(df).any() + G = dfG * reg + df * (1 - reg) + G = L.dot(G) + # compute the objective function + objective = total_active * (1 - reg) + objective += G.flatten().dot(L.flatten()) + return 2 * G, objective, total_active + + def _select_targets(self, X, label_inds): + target_neighbors = np.empty((X.shape[0], self.n_neighbors), dtype=int) + for label in self.labels_: + inds, = np.nonzero(label_inds == label) + dd = euclidean_distances(X[inds], squared=True) np.fill_diagonal(dd, np.inf) - nn = np.argsort(dd)[...,:k] + nn = np.argsort(dd)[..., :self.n_neighbors] target_neighbors[inds] = inds[nn] return target_neighbors - def _find_impostors(self, furthest_neighbors): - Lx = self.transform() + def _find_impostors(self, furthest_neighbors, X, label_inds, L): + Lx = X.dot(L.T) margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx) impostors = [] - for label in self.labels[:-1]: - in_inds, = np.nonzero(self.label_inds == label) - out_inds, = np.nonzero(self.label_inds > label) - dist = pairwise_distances(Lx[out_inds], Lx[in_inds]) - i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None]) - i2,j2 = np.nonzero(dist < margin_radii[in_inds]) - i = np.hstack((i1,i2)) - j = np.hstack((j1,j2)) + for label in self.labels_[:-1]: + in_inds, = np.nonzero(label_inds == label) + out_inds, = np.nonzero(label_inds > label) + dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True) + i1, j1 = np.nonzero(dist < margin_radii[out_inds][:, None]) + i2, j2 = 
np.nonzero(dist < margin_radii[in_inds]) + i = np.hstack((i1, i2)) + j = np.hstack((j1, j2)) if i.size > 0: # get unique (i,j) pairs using index trickery - shape = (i.max()+1, j.max()+1) - tmp = np.ravel_multi_index((i,j), shape) - i,j = np.unravel_index(np.unique(tmp), shape) + shape = (i.max() + 1, j.max() + 1) + tmp = np.ravel_multi_index((i, j), shape) + i, j = np.unravel_index(np.unique(tmp), shape) impostors.append(np.vstack((in_inds[j], out_inds[i]))) + if len(impostors) == 0: + # No impostors detected + return impostors return np.hstack(impostors) @@ -213,53 +325,19 @@ def _inplace_paired_L2(A, B): def _count_edges(act1, act2, impostors, targets): - imp = impostors[0,act1] + imp = impostors[0, act1] c = Counter(zip(imp, targets[imp])) - imp = impostors[1,act2] + imp = impostors[1, act2] c.update(zip(imp, targets[imp])) if c: active_pairs = np.array(list(c.keys())) else: - active_pairs = np.empty((0,2), dtype=int) + active_pairs = np.empty((0, 2), dtype=int) return active_pairs, np.array(list(c.values())) def _sum_outer_products(data, a_inds, b_inds, weights=None): Xab = data[a_inds] - data[b_inds] if weights is not None: - return np.dot(Xab.T, Xab * weights[:,None]) + return np.dot(Xab.T, Xab * weights[:, None]) return np.dot(Xab.T, Xab) - - -try: - # use the fast C++ version, if available - from modshogun import LMNN as shogun_LMNN - from modshogun import RealFeatures, MulticlassLabels - - class LMNN(_base_LMNN): - def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, - regularization=0.5, convergence_tol=0.001, use_pca=True, - verbose=False): - _base_LMNN.__init__(self, k=k, min_iter=min_iter, max_iter=max_iter, - learn_rate=learn_rate, regularization=regularization, - convergence_tol=convergence_tol, use_pca=use_pca, - verbose=verbose) - - def fit(self, X, labels): - self.X = X - self.L = np.eye(X.shape[1]) - labels = MulticlassLabels(labels.astype(np.float64)) - self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.params['k']) - 
self._lmnn.set_maxiter(self.params['max_iter']) - self._lmnn.set_obj_threshold(self.params['convergence_tol']) - self._lmnn.set_regularization(self.params['regularization']) - self._lmnn.set_stepsize(self.params['learn_rate']) - if self.params['use_pca']: - self._lmnn.train() - else: - self._lmnn.train(self.L) - self.L = self._lmnn.get_linear_transform() - return self - -except ImportError: - LMNN = python_LMNN diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 3a576ab8..af7fa95b 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -1,176 +1,348 @@ """ -Liu et al. -"Metric Learning from Relative Comparisons by Minimizing Squared Residual". -ICDM 2012. - -Adapted from https://gist.github.com/kcarnold/5439917 -Paper: http://www.cs.ucla.edu/~weiwang/paper/ICDM12.pdf +Metric Learning from Relative Comparisons by Minimizing Squared Residual (LSML) """ -from __future__ import print_function, absolute_import import numpy as np import scipy.linalg -from six.moves import xrange +from sklearn.base import TransformerMixin -from .base_metric import BaseMetricLearner +from .base_metric import _QuadrupletsClassifierMixin, MahalanobisMixin from .constraints import Constraints +from ._util import components_from_metric, _initialize_metric_mahalanobis +import warnings -class LSML(BaseMetricLearner): - def __init__(self, tol=1e-3, max_iter=1000, verbose=False): - """Initialize the learner. 
+class _BaseLSML(MahalanobisMixin): - Parameters - ---------- - tol : float, optional - max_iter : int, optional - verbose : bool, optional - if True, prints information while learning - """ - self.params = { - 'tol': tol, - 'max_iter': max_iter, - 'verbose': verbose, - } - - def _prepare_inputs(self, X, constraints, weights, prior): - self.X = X - a,b,c,d = constraints - self.vab = X[a] - X[b] - self.vcd = X[c] - X[d] - assert self.vab.shape == self.vcd.shape, 'Constraints must have same length' - if weights is None: - self.w = np.ones(self.vab.shape[0]) - else: - self.w = weights - self.w /= self.w.sum() # weights must sum to 1 - if prior is None: - self.M = np.cov(X.T) - else: - self.M = prior + _tuple_size = 4 # constraints are quadruplets - def metric(self): - return self.M + def __init__(self, tol=1e-3, max_iter=1000, prior='identity', + verbose=False, preprocessor=None, random_state=None): + self.prior = prior + self.tol = tol + self.max_iter = max_iter + self.verbose = verbose + self.random_state = random_state + super(_BaseLSML, self).__init__(preprocessor) - def fit(self, X, constraints, weights=None, prior=None): - """Learn the LSML model. 
+ def _fit(self, quadruplets, weights=None): + quadruplets = self._prepare_inputs(quadruplets, + type_of_inputs='tuples') + + # check to make sure that no two constrained vectors are identical + vab = quadruplets[:, 0, :] - quadruplets[:, 1, :] + vcd = quadruplets[:, 2, :] - quadruplets[:, 3, :] + if vab.shape != vcd.shape: + raise ValueError('Constraints must have same length') + if weights is None: + self.w_ = np.ones(vab.shape[0]) + else: + self.w_ = weights + self.w_ /= self.w_.sum() # weights must sum to 1 + M, prior_inv = _initialize_metric_mahalanobis( + quadruplets, self.prior, + return_inverse=True, strict_pd=True, matrix_name='prior', + random_state=self.random_state) - Parameters - ---------- - X : (n x d) data matrix - each row corresponds to a single instance - constraints : 4-tuple of arrays - (a,b,c,d) indices into X, such that d(X[a],X[b]) < d(X[c],X[d]) - weights : (m,) array of floats, optional - scale factor for each constraint - prior : (d x d) matrix, optional - guess at a metric [default: covariance(X)] - """ - verbose = self.params['verbose'] - self._prepare_inputs(X, constraints, weights, prior) - prior_inv = scipy.linalg.inv(self.M) - s_best = self._total_loss(self.M, prior_inv) step_sizes = np.logspace(-10, 0, 10) - if verbose: + # Keep track of the best step size and the loss at that step. 
+ l_best = 0 + s_best = self._total_loss(M, vab, vcd, prior_inv) + if self.verbose: print('initial loss', s_best) - tol = self.params['tol'] - for it in xrange(1, self.params['max_iter']+1): - grad = self._gradient(self.M, prior_inv) + for it in range(1, self.max_iter + 1): + grad = self._gradient(M, vab, vcd, prior_inv) grad_norm = scipy.linalg.norm(grad) - if grad_norm < tol: + if grad_norm < self.tol: break - if verbose: + if self.verbose: print('gradient norm', grad_norm) M_best = None for step_size in step_sizes: step_size /= grad_norm - new_metric = self.M - step_size * grad + new_metric = M - step_size * grad w, v = scipy.linalg.eigh(new_metric) new_metric = v.dot((np.maximum(w, 1e-8) * v).T) - cur_s = self._total_loss(new_metric, prior_inv) + cur_s = self._total_loss(new_metric, vab, vcd, prior_inv) if cur_s < s_best: l_best = step_size s_best = cur_s M_best = new_metric - if verbose: + if self.verbose: print('iter', it, 'cost', s_best, 'best step', l_best * grad_norm) if M_best is None: break - self.M = M_best + M = M_best else: - if verbose: + if self.verbose: print("Didn't converge after", it, "iterations. 
Final loss:", s_best) + self.n_iter_ = it + + self.components_ = components_from_metric(M) return self - def _comparison_loss(self, metric): - dab = np.sum(self.vab.dot(metric) * self.vab, axis=1) - dcd = np.sum(self.vcd.dot(metric) * self.vcd, axis=1) + def _comparison_loss(self, metric, vab, vcd): + dab = np.sum(vab.dot(metric) * vab, axis=1) + dcd = np.sum(vcd.dot(metric) * vcd, axis=1) violations = dab > dcd - return self.w[violations].dot((np.sqrt(dab[violations]) - - np.sqrt(dcd[violations]))**2) + return self.w_[violations].dot((np.sqrt(dab[violations]) - + np.sqrt(dcd[violations]))**2) - def _total_loss(self, metric, prior_inv): - return (self._comparison_loss(metric) + - _regularization_loss(metric, prior_inv)) + def _total_loss(self, metric, vab, vcd, prior_inv): + # Regularization loss + sign, logdet = np.linalg.slogdet(metric) + reg_loss = np.sum(metric * prior_inv) - sign * logdet + return self._comparison_loss(metric, vab, vcd) + reg_loss - def _gradient(self, metric, prior_inv): - dMetric = prior_inv - scipy.linalg.inv(metric) - dabs = np.sum(self.vab.dot(metric) * self.vab, axis=1) - dcds = np.sum(self.vcd.dot(metric) * self.vcd, axis=1) + def _gradient(self, metric, vab, vcd, prior_inv): + dMetric = prior_inv - np.linalg.inv(metric) + dabs = np.sum(vab.dot(metric) * vab, axis=1) + dcds = np.sum(vcd.dot(metric) * vcd, axis=1) violations = dabs > dcds # TODO: vectorize - for vab, dab, vcd, dcd in zip(self.vab[violations], dabs[violations], - self.vcd[violations], dcds[violations]): - dMetric += ((1-np.sqrt(dcd/dab))*np.outer(vab, vab) + - (1-np.sqrt(dab/dcd))*np.outer(vcd, vcd)) + for vab, dab, vcd, dcd in zip(vab[violations], dabs[violations], + vcd[violations], dcds[violations]): + dMetric += ((1 - np.sqrt(dcd / dab)) * np.outer(vab, vab) + + (1 - np.sqrt(dab / dcd)) * np.outer(vcd, vcd)) return dMetric -def _regularization_loss(metric, prior_inv): - sign, logdet = np.linalg.slogdet(metric) - return np.sum(metric * prior_inv) - sign * logdet +class 
LSML(_BaseLSML, _QuadrupletsClassifierMixin): + """Least Squared-residual Metric Learning (LSML) + + `LSML` proposes a simple, yet effective, algorithm that minimizes a convex + objective function corresponding to the sum of squared residuals of + constraints. This algorithm uses the constraints in the form of the + relative distance comparisons, such method is especially useful where + pairwise constraints are not natural to obtain, thus pairwise constraints + based algorithms become infeasible to be deployed. Furthermore, its sparsity + extension leads to more stable estimation when the dimension is high and + only a small amount of constraints is given. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For LSML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random positive definite + (PD) matrix of shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + tol : float, optional (default=1e-3) + Convergence tolerance of the optimization procedure. + + max_iter : int, optional (default=1000) + Maximum number of iteration of the optimization procedure. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
+ + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + Examples + -------- + >>> from metric_learn import LSML + >>> quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]], + >>> [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]], + >>> [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]] + >>> # we want to make closer points where the first feature is close, and + >>> # further if the second feature is close + >>> lsml = LSML() + >>> lsml.fit(quadruplets) + References + ---------- + .. [1] Liu et al. `Metric Learning from Relative Comparisons by Minimizing + Squared Residual + `_. ICDM 2012. -class LSML_Supervised(LSML): - def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf, - num_constraints=None, weights=None, verbose=False): - """Initialize the learner. + .. [2] Code adapted from https://gist.github.com/kcarnold/5439917 + + See Also + -------- + metric_learn.LSML : The original weakly-supervised algorithm + + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def fit(self, quadruplets, weights=None): + """Learn the LSML model. 
 Parameters ---------- - tol : float, optional - max_iter : int, optional - prior : (d x d) matrix, optional - guess at a metric [default: covariance(X)] - num_labeled : int, optional - number of labels to preserve for training - num_constraints: int, optional - number of constraints to generate - weights : (m,) array of floats, optional - scale factor for each constraint - verbose : bool, optional - if True, prints information while learning + quadruplets : array-like, shape=(n_constraints, 4, n_features) or \ + (n_constraints, 4) + 3D array-like of quadruplets of points or 2D array of quadruplets of + indicators. In order to supervise the algorithm in the right way, we + should have the four samples ordered in a way such that: + d(X[i, 0], X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < + n_constraints. + + weights : (n_constraints,) array of floats, optional + scale factor for each constraint + + Returns + ------- + self : object + Returns the instance. """ - LSML.__init__(self, tol=tol, max_iter=max_iter, verbose=verbose) - self.params.update(prior=prior, num_labeled=num_labeled, - num_constraints=num_constraints, weights=weights) + return self._fit(quadruplets, weights=weights) + + +class LSML_Supervised(_BaseLSML, TransformerMixin): + """Supervised version of Least Squared-residual Metric Learning (LSML) + + `LSML_Supervised` creates quadruplets from labeled samples by taking two + samples from the same class, and two samples from different classes. + This way it builds quadruplets where the two first points must be more + similar than the two last points. + + Parameters + ---------- + tol : float, optional (default=1e-3) + Convergence tolerance of the optimization procedure. + + max_iter : int, optional (default=1000) + Number of maximum iterations of the optimization procedure. + + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. 
Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For LSML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random positive definite + (PD) matrix of shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - def fit(self, X, labels): + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + n_constraints: int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. + + weights : (n_constraints,) array of floats, optional (default=None) + Relative weight given to each constraint. If None, defaults to uniform + weights. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. In any case, `random_state` is also used to randomly sample + constraints from labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + Examples + -------- + >>> from metric_learn import LSML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> lsml = LSML_Supervised(n_constraints=200) + >>> lsml.fit(X, Y) + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. 
+ + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + """ + + def __init__(self, tol=1e-3, max_iter=1000, prior='identity', + n_constraints=None, weights=None, + verbose=False, preprocessor=None, random_state=None, + num_constraints='deprecated'): + _BaseLSML.__init__(self, tol=tol, max_iter=max_iter, prior=prior, + verbose=verbose, preprocessor=preprocessor, + random_state=random_state) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed should be set) + self.num_constraints = 'deprecated' + self.weights = weights + + def fit(self, X, y): """Create constraints from labels and learn the LSML model. - Needs num_constraints specified in constructor. Parameters ---------- - X : (n x d) data matrix - each row corresponds to a single instance - labels : (n) data labels + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. 
""" - num_constraints = self.params['num_constraints'] - if num_constraints is None: - num_classes = np.unique(labels) - num_constraints = 20*(len(num_classes))**2 - - c = Constraints.random_subset(labels, self.params['num_labeled']) - pairs = c.positive_negative_pairs(num_constraints, same_length=True) - return LSML.fit(self, X, pairs, weights=self.params['weights'], - prior=self.params['prior']) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + n_constraints = self.n_constraints + if n_constraints is None: + num_classes = len(np.unique(y)) + n_constraints = 20 * num_classes**2 + + c = Constraints(y) + pos_neg = c.positive_negative_pairs(n_constraints, same_length=True, + random_state=self.random_state) + return _BaseLSML._fit(self, X[np.column_stack(pos_neg)], + weights=self.weights) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py new file mode 100644 index 00000000..01d185e7 --- /dev/null +++ b/metric_learn/mlkr.py @@ -0,0 +1,208 @@ +""" +Metric Learning for Kernel Regression (MLKR) +""" +import time +import sys +import warnings +import numpy as np +from scipy.optimize import minimize +from scipy.special import logsumexp +from sklearn.base import TransformerMixin +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances + +from .base_metric import MahalanobisMixin +from ._util import _initialize_components, _check_n_components + +EPS = np.finfo(float).eps + + +class MLKR(MahalanobisMixin, TransformerMixin): + """Metric Learning for Kernel Regression (MLKR) + + MLKR is an algorithm for supervised metric learning, which learns a + distance function by directly minimizing the leave-one-out regression error. + This algorithm can also be viewed as a supervised variation of PCA and can be + used for dimensionality reduction and high dimensional data visualization. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components < min(n_features, n_samples)``, + we use 'pca', as it projects data in meaningful directions (those + of higher variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + tol : float, optional (default=None) + Convergence tolerance for the optimization. + + max_iter : int, optional (default=1000) + Cap on number of conjugate gradient iterations. + + verbose : bool, optional (default=False) + Whether to print progress messages or not. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
+ + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + Examples + -------- + + >>> from metric_learn import MLKR + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> mlkr = MLKR() + >>> mlkr.fit(X, Y) + + References + ---------- + .. [1] K.Q. Weinberger and G. Tesauto. `Metric Learning for Kernel + Regression `_. AISTATS 2007. + """ + + def __init__(self, n_components=None, init='auto', + tol=None, max_iter=1000, verbose=False, + preprocessor=None, random_state=None): + self.n_components = n_components + self.init = init + self.tol = tol + self.max_iter = max_iter + self.verbose = verbose + self.random_state = random_state + super(MLKR, self).__init__(preprocessor) + + def fit(self, X, y): + """ + Fit MLKR model + + Parameters + ---------- + X : (n x d) array of samples + y : (n) data labels + """ + X, y = self._prepare_inputs(X, y, y_numeric=True, + ensure_min_samples=2) + n, d = X.shape + if y.shape[0] != n: + raise ValueError('Data and label lengths mismatch: %d != %d' + % (n, y.shape[0])) + + m = _check_n_components(d, self.n_components) + m = self.n_components + if m is None: + m = d + # if the init is the default (None), we raise a warning + A = _initialize_components(m, X, y, init=self.init, + random_state=self.random_state, + # MLKR works on regression targets: + has_classes=False) + + # Measure the total training time + train_time = time.time() + + self.n_iter_ = 0 + res = 
minimize(self._loss, A.ravel(), (X, y), method='L-BFGS-B', + jac=True, tol=self.tol, + options=dict(maxiter=self.max_iter)) + self.components_ = res.x.reshape(A.shape) + + # Stop timer + train_time = time.time() - train_time + if self.verbose: + cls_name = self.__class__.__name__ + # Warn the user if the algorithm did not converge + if not res.success: + warnings.warn('[{}] MLKR did not converge: {}' + .format(cls_name, res.message), ConvergenceWarning) + print('[{}] Training took {:8.2f}s.'.format(cls_name, train_time)) + + return self + + def _loss(self, flatA, X, y): + + if self.n_iter_ == 0 and self.verbose: + header_fields = ['Iteration', 'Objective Value', 'Time(s)'] + header_fmt = '{:>10} {:>20} {:>10}' + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print('[{cls}]'.format(cls=cls_name)) + print('[{cls}] {header}\n[{cls}] {sep}'.format(cls=cls_name, + header=header, + sep='-' * len(header))) + + start_time = time.time() + + A = flatA.reshape((-1, X.shape[1])) + X_embedded = np.dot(X, A.T) + dist = pairwise_distances(X_embedded, squared=True) + np.fill_diagonal(dist, np.inf) + softmax = np.exp(- dist - logsumexp(- dist, axis=1)[:, np.newaxis]) + yhat = softmax.dot(y) + ydiff = yhat - y + cost = (ydiff ** 2).sum() + + # also compute the gradient + W = softmax * ydiff[:, np.newaxis] * (y - yhat[:, np.newaxis]) + W_sym = W + W.T + np.fill_diagonal(W_sym, - W.sum(axis=0)) + grad = 4 * (X_embedded.T.dot(W_sym)).dot(X) + + if self.verbose: + start_time = time.time() - start_time + values_fmt = '[{cls}] {n_iter:>10} {loss:>20.6e} {start_time:>10.2f}' + print(values_fmt.format(cls=self.__class__.__name__, + n_iter=self.n_iter_, loss=cost, + start_time=start_time)) + sys.stdout.flush() + + self.n_iter_ += 1 + + return cost, grad.ravel() diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py new file mode 100644 index 00000000..5cf166fd --- /dev/null +++ b/metric_learn/mmc.py @@ -0,0 +1,601 @@ +"""Mahalanobis Metric for Clustering 
(MMC)""" +import numpy as np +from sklearn.base import TransformerMixin +from sklearn.utils.validation import assert_all_finite + +from .base_metric import _PairsClassifierMixin, MahalanobisMixin +from .constraints import Constraints, wrap_pairs +from ._util import components_from_metric, _initialize_metric_mahalanobis +import warnings + + +class _BaseMMC(MahalanobisMixin): + + _tuple_size = 2 # constraints are pairs + + def __init__(self, max_iter=100, max_proj=10000, tol=1e-3, + init='identity', diagonal=False, + diagonal_c=1.0, verbose=False, preprocessor=None, + random_state=None, + convergence_threshold='deprecated'): + if convergence_threshold != 'deprecated': + warnings.warn('"convergence_threshold" parameter has been ' + ' renamed to "tol". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + tol = convergence_threshold + self.convergence_threshold = 'deprecated' # Avoid errors + self.max_iter = max_iter + self.max_proj = max_proj + self.tol = tol + self.init = init + self.diagonal = diagonal + self.diagonal_c = diagonal_c + self.verbose = verbose + self.random_state = random_state + super(_BaseMMC, self).__init__(preprocessor) + + def _fit(self, pairs, y): + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') + + self.A_ = _initialize_metric_mahalanobis(pairs, self.init, + random_state=self.random_state, + matrix_name='init') + + if self.diagonal: + return self._fit_diag(pairs, y) + else: + return self._fit_full(pairs, y) + + def _fit_full(self, pairs, y): + """Learn full metric using MMC. + + Parameters + ---------- + X : (n x d) data matrix + Each row corresponds to a single instance. + constraints : 4-tuple of arrays + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs. 
+ """ + num_dim = pairs.shape[2] + + error2 = 1e10 + eps = 0.01 # error-bound of iterative projection on C1 and C2 + A = self.A_ + + pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] + + # Create weight vector from similar samples + pos_diff = pos_pairs[:, 0, :] - pos_pairs[:, 1, :] + w = np.einsum('ij,ik->jk', pos_diff, pos_diff).ravel() + # `w` is the sum of all outer products of the rows in `pos_diff`. + # The above `einsum` is equivalent to the much more inefficient: + # w = np.apply_along_axis( + # lambda x: np.outer(x,x).ravel(), + # 1, + # X[a] - X[b] + # ).sum(axis = 0) + t = w.dot(A.ravel()) / 100.0 + + w_norm = np.linalg.norm(w) + w1 = w / w_norm # make `w` a unit vector + t1 = t / w_norm # distance from origin to `w^T*x=t` plane + + cycle = 1 + alpha = 0.1 # initial step size along gradient + grad1 = self._fS1(pos_pairs, A) # gradient of similarity + # constraint function + grad2 = self._fD1(neg_pairs, A) # gradient of dissimilarity + # constraint function + # gradient of fD1 orthogonal to fS1: + M = self._grad_projection(grad1, grad2) + + A_old = A.copy() + + for cycle in range(self.max_iter): + + # projection of constraints C1 and C2 + satisfy = False + + for it in range(self.max_proj): + + # First constraint: + # f(A) = \sum_{i,j \in S} d_ij' A d_ij <= t (1) + # (1) can be rewritten as a linear constraint: w^T x = t, + # where x is the unrolled matrix of A, + # w is also an unrolled matrix of W where + # W_{kl}= \sum_{i,j \in S}d_ij^k * d_ij^l + x0 = A.ravel() + if w.dot(x0) <= t: + x = x0 + else: + x = x0 + (t1 - w1.dot(x0)) * w1 + A[:] = x.reshape(num_dim, num_dim) + + # Second constraint: + # PSD constraint A >= 0 + # project A onto domain A>0 + l, V = np.linalg.eigh((A + A.T) / 2) + A[:] = np.dot(V * np.maximum(0, l[None, :]), V.T) + + fDC2 = w.dot(A.ravel()) + error2 = (fDC2 - t) / t + if error2 < eps: + satisfy = True + break + + # third constraint: gradient ascent + # max: g(A) >= 1 + # here we suppose g(A) = fD(A) = \sum_{I,J \in D} 
sqrt(d_ij' A d_ij) + + obj_previous = self._fD(neg_pairs, A_old) # g(A_old) + obj = self._fD(neg_pairs, A) # g(A) + + if satisfy and (obj > obj_previous or cycle == 0): + + # If projection of 1 and 2 is successful, and such projection + # improves objective function, slightly increase learning rate + # and update from the current A. + alpha *= 1.05 + A_old[:] = A + grad2 = self._fS1(pos_pairs, A) + grad1 = self._fD1(neg_pairs, A) + M = self._grad_projection(grad1, grad2) + A += alpha * M + + else: + + # If projection of 1 and 2 failed, or obj <= obj_previous due + # to projection of 1 and 2, shrink learning rate and re-update + # from the previous A. + alpha /= 2 + A[:] = A_old + alpha * M + + delta = np.linalg.norm(alpha * M) / np.linalg.norm(A_old) + if delta < self.tol: + break + if self.verbose: + print('mmc iter: %d, conv = %f, projections = %d' % + (cycle, delta, it + 1)) + + if delta > self.tol: + self.converged_ = False + if self.verbose: + print('mmc did not converge, conv = %f' % (delta,)) + else: + self.converged_ = True + if self.verbose: + print('mmc converged at iter %d, conv = %f' % (cycle, delta)) + self.A_[:] = A_old + self.n_iter_ = cycle + + self.components_ = components_from_metric(self.A_) + return self + + def _fit_diag(self, pairs, y): + """Learn diagonal metric using MMC. + Parameters + ---------- + X : (n x d) data matrix + Each row corresponds to a single instance. + constraints : 4-tuple of arrays + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs. 
+ """ + num_dim = pairs.shape[2] + pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] + s_sum = np.sum((pos_pairs[:, 0, :] - pos_pairs[:, 1, :]) ** 2, axis=0) + + it = 0 + error = 1.0 + eps = 1e-6 + reduction = 2.0 + w = np.diag(self.A_).copy() + + while error > self.tol and it < self.max_iter: + + fD0, fD_1st_d, fD_2nd_d = self._D_constraint(neg_pairs, w) + obj_initial = np.dot(s_sum, w) + self.diagonal_c * fD0 + fS_1st_d = s_sum # first derivative of the similarity constraints + + # gradient of the objective: + gradient = fS_1st_d - self.diagonal_c * fD_1st_d + # Hessian of the objective: + hessian = -self.diagonal_c * fD_2nd_d + eps * np.eye(num_dim) + step = np.dot(np.linalg.inv(hessian), gradient) + + # Newton-Rapshon update + # search over optimal lambda + lambd = 1 # initial step-size + w_tmp = np.maximum(0, w - lambd * step) + obj = (np.dot(s_sum, w_tmp) + self.diagonal_c * + self._D_objective(neg_pairs, w_tmp)) + assert_all_finite(obj) + obj_previous = np.inf # just to get the while-loop started + + inner_it = 0 + while obj < obj_previous: + obj_previous = obj + w_previous = w_tmp.copy() + lambd /= reduction + w_tmp = np.maximum(0, w - lambd * step) + obj = (np.dot(s_sum, w_tmp) + self.diagonal_c * + self._D_objective(neg_pairs, w_tmp)) + inner_it += 1 + assert_all_finite(obj) + + w[:] = w_previous + error = np.abs((obj_previous - obj_initial) / obj_previous) + if self.verbose: + print('mmc iter: %d, conv = %f' % (it, error)) + it += 1 + + self.A_ = np.diag(w) + + self.components_ = components_from_metric(self.A_) + return self + + def _fD(self, neg_pairs, A): + r"""The value of the dissimilarity constraint function. + + f = f(\sum_{ij \in D} distance(x_i, x_j)) + i.e. distance can be L1: \sqrt{(x_i-x_j)A(x_i-x_j)'} + """ + diff = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] + return np.log(np.sum(np.sqrt(np.sum(np.dot(diff, A) * diff, axis=1))) + + 1e-6) + + def _fD1(self, neg_pairs, A): + r"""The gradient of the dissimilarity constraint function w.r.t. A. 
+ + For example, let distance by L1 norm: + f = f(\sum_{ij \in D} \sqrt{(x_i-x_j)A(x_i-x_j)'}) + df/dA_{kl} = f'* d(\sum_{ij \in D} \sqrt{(x_i-x_j)^k*(x_i-x_j)^l})/dA_{kl} + + Note that d_ij*A*d_ij' = tr(d_ij*A*d_ij') = tr(d_ij'*d_ij*A) + so, d(d_ij*A*d_ij')/dA = d_ij'*d_ij + df/dA = f'(\sum_{ij \in D} \sqrt{tr(d_ij'*d_ij*A)}) + * 0.5*(\sum_{ij \in D} (1/sqrt{tr(d_ij'*d_ij*A)})*(d_ij'*d_ij)) + """ + diff = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] + # outer products of all rows in `diff` + M = np.einsum('ij,ik->ijk', diff, diff) + # faster version of: dist = np.sqrt(np.sum(M * A[None,:,:], axis=(1,2))) + dist = np.sqrt(np.einsum('ijk,jk', M, A)) + # faster version of: sum_deri = np.sum(M / + # (2 * (dist[:,None,None] + 1e-6)), axis=0) + sum_deri = np.einsum('ijk,i->jk', M, 0.5 / (dist + 1e-6)) + sum_dist = dist.sum() + return sum_deri / (sum_dist + 1e-6) + + def _fS1(self, pos_pairs, A): + r"""The gradient of the similarity constraint function w.r.t. A. + + f = \sum_{ij}(x_i-x_j)A(x_i-x_j)' = \sum_{ij}d_ij*A*d_ij' + df/dA = d(d_ij*A*d_ij')/dA + + Note that d_ij*A*d_ij' = tr(d_ij*A*d_ij') = tr(d_ij'*d_ij*A) + so, d(d_ij*A*d_ij')/dA = d_ij'*d_ij + """ + diff = pos_pairs[:, 0, :] - pos_pairs[:, 1, :] + # sum of outer products of all rows in `diff`: + return np.einsum('ij,ik->jk', diff, diff) + + def _grad_projection(self, grad1, grad2): + grad2 = grad2 / np.linalg.norm(grad2) + gtemp = grad1 - np.sum(grad1 * grad2) * grad2 + gtemp /= np.linalg.norm(gtemp) + return gtemp + + def _D_objective(self, neg_pairs, w): + return np.log(np.sum(np.sqrt(np.sum(((neg_pairs[:, 0, :] - + neg_pairs[:, 1, :]) ** 2) * + w[None, :], axis=1) + 1e-6))) + + def _D_constraint(self, neg_pairs, w): + """Compute the value, 1st derivative, second derivative (Hessian) of + a dissimilarity constraint function gF(sum_ij distance(d_ij A d_ij)) + where A is a diagonal matrix (in the form of a column vector 'w'). 
+ """ + diff = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] + diff_sq = diff * diff + dist = np.sqrt(diff_sq.dot(w)) + sum_deri1 = np.einsum('ij,i', diff_sq, 0.5 / np.maximum(dist, 1e-6)) + sum_deri2 = np.einsum( + 'ij,ik->jk', + diff_sq, + diff_sq / (-4 * np.maximum(1e-6, dist**3))[:, None] + ) + sum_dist = dist.sum() + return ( + np.log(sum_dist), + sum_deri1 / sum_dist, + sum_deri2 / sum_dist - + np.outer(sum_deri1, sum_deri1) / (sum_dist * sum_dist) + ) + + +class MMC(_BaseMMC, _PairsClassifierMixin): + """Mahalanobis Metric for Clustering (MMC) + + MMC minimizes the sum of squared distances between similar points, while + enforcing the sum of distances between dissimilar ones to be greater than + one. This leads to a convex and, thus, local-minima-free optimization + problem that can be solved efficiently. + However, the algorithm involves the computation of eigenvalues, which is the + main speed-bottleneck. Since it has initially been designed for clustering + applications, one of the implicit assumptions of MMC is that all classes form + a compact set, i.e., follow a unimodal distribution, which restricts the + possible use-cases of this method. However, it is one of the earliest and a + still often cited technique. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + max_iter : int, optional (default=100) + Maximum number of iterations of the optimization procedure. + + max_proj : int, optional (default=10000) + Maximum number of projection steps. + + tol : float, optional (default=1e-3) + Convergence threshold for the optimization procedure. + + init : string or numpy array, optional (default='identity') + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The (pseudo-)inverse of the covariance matrix. 
+ + 'random' + The initial Mahalanobis matrix will be a random SPD matrix of + shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + An SPD matrix of shape (n_features, n_features), that will + be used as such to initialize the metric. + + diagonal : bool, optional (default=False) + If True, a diagonal metric will be learned, + i.e., a simple scaling of dimensions. The initialization will then + be the diagonal coefficients of the matrix given as 'init'. + + diagonal_c : float, optional (default=1.0) + Weight of the dissimilarity constraint for diagonal + metric learning. Ignored if ``diagonal=False``. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. 
+ + Examples + -------- + >>> from metric_learn import MMC + >>> pairs = [[[1.2, 7.5], [1.3, 1.5]], + >>> [[6.4, 2.6], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6]], + >>> [[6.2, 5.5], [5.4, 5.4]]] + >>> y = [1, 1, -1, -1] + >>> # in this task we want points where the first feature is close to be + >>> # closer to each other, no matter how close the second feature is + >>> mmc = MMC() + >>> mmc.fit(pairs, y) + + References + ---------- + .. [1] Xing, Jordan, Russell, Ng. `Distance metric learning with application + to clustering with side-information + `_. + NIPS 2002. + + See Also + -------- + metric_learn.MMC : The original weakly-supervised algorithm + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def fit(self, pairs, y, calibration_params=None): + """Learn the MMC model. + + The threshold will be calibrated on the trainset using the parameters + `calibration_params`. + + Parameters + ---------- + pairs : array-like, shape=(n_constraints, 2, n_features) or \ + (n_constraints, 2) + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. + + y : array-like, of shape (n_constraints,) + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + + calibration_params : `dict` or `None` + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. + + Returns + ------- + self : object + Returns the instance. 
+ """ + calibration_params = (calibration_params if calibration_params is not + None else dict()) + self._validate_calibration_params(**calibration_params) + self._fit(pairs, y) + self.calibrate_threshold(pairs, y, **calibration_params) + return self + + +class MMC_Supervised(_BaseMMC, TransformerMixin): + """Supervised version of Mahalanobis Metric for Clustering (MMC) + + `MMC_Supervised` creates pairs of similar sample by taking same class + samples, and pairs of dissimilar samples by taking different class + samples. It then passes these pairs to `MMC` for training. + + Parameters + ---------- + max_iter : int, optional (default=100) + Maximum number of iterations of the optimization procedure. + + max_proj : int, optional (default=10000) + Maximum number of projection steps. + + tol : float, optional (default=1e-3) + Convergence threshold for the optimization procedure. + + n_constraints: int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. + + init : string or numpy array, optional (default='identity') + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The (pseudo-)inverse of the covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random SPD matrix of + shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A numpy array of shape (n_features, n_features), that will + be used as such to initialize the metric. + + diagonal : bool, optional (default=False) + If True, a diagonal metric will be learned, + i.e., a simple scaling of dimensions. The initialization will then + be the diagonal coefficients of the matrix given as 'init'. 
+ + diagonal_c : float, optional (default=1.0) + Weight of the dissimilarity constraint for diagonal + metric learning. Ignored if ``diagonal=False``. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + Mahalanobis matrix. In any case, `random_state` is also used to + randomly sample constraints from labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 + + Examples + -------- + >>> from metric_learn import MMC_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> mmc = MMC_Supervised(n_constraints=200) + >>> mmc.fit(X, Y) + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) 
+ """ + + def __init__(self, max_iter=100, max_proj=10000, tol=1e-6, + n_constraints=None, init='identity', + diagonal=False, diagonal_c=1.0, verbose=False, + preprocessor=None, random_state=None, + num_constraints='deprecated', + convergence_threshold='deprecated'): + _BaseMMC.__init__(self, max_iter=max_iter, max_proj=max_proj, + tol=tol, + init=init, diagonal=diagonal, + diagonal_c=diagonal_c, verbose=verbose, + preprocessor=preprocessor, + random_state=random_state, + convergence_threshold=convergence_threshold) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed sholud be set) + self.num_constraints = 'deprecated' + + def fit(self, X, y): + """Create constraints from labels and learn the MMC model. + + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. 
+ """ + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + n_constraints = self.n_constraints + if n_constraints is None: + num_classes = len(np.unique(y)) + n_constraints = 20 * num_classes**2 + + c = Constraints(y) + pos_neg = c.positive_negative_pairs(n_constraints, + random_state=self.random_state) + pairs, y = wrap_pairs(X, pos_neg) + return _BaseMMC._fit(self, pairs, y) diff --git a/metric_learn/nca.py b/metric_learn/nca.py index c0616e2f..7b4423d3 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -1,54 +1,225 @@ """ Neighborhood Components Analysis (NCA) -Ported to Python from https://github.com/vomjom/nca """ -from __future__ import absolute_import +import warnings +import time +import sys import numpy as np -from six.moves import xrange +from scipy.optimize import minimize +from scipy.special import logsumexp +from sklearn.base import TransformerMixin +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances -from .base_metric import BaseMetricLearner +from ._util import _initialize_components, _check_n_components +from .base_metric import MahalanobisMixin +EPS = np.finfo(float).eps -class NCA(BaseMetricLearner): - def __init__(self, max_iter=100, learning_rate=0.01): - self.params = { - 'max_iter': max_iter, - 'learning_rate': learning_rate, - } - self.A = None - def transformer(self): - return self.A +class NCA(MahalanobisMixin, TransformerMixin): + """Neighborhood Components Analysis (NCA) - def fit(self, X, labels): + NCA is a distance metric learning algorithm which aims to improve the + accuracy of nearest neighbors classification compared to the standard + Euclidean distance. The algorithm directly maximizes a stochastic variant + of the leave-one-out k-nearest neighbors(KNN) score on the training set. + It can also learn a low-dimensional linear transformation of data that can + be used for data visualization and fast classification. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda', as + it uses labels information. If not, but + ``n_components < min(n_features, n_samples)``, we use 'pca', as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + max_iter : int, optional (default=100) + Maximum number of iterations done by the optimization algorithm. 
+ + tol : float, optional (default=None) + Convergence tolerance for the optimization. + + verbose : bool, optional (default=False) + Whether to print progress messages or not. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + Examples + -------- + + >>> import numpy as np + >>> from metric_learn import NCA + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> nca = NCA(max_iter=1000) + >>> nca.fit(X, Y) + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + References + ---------- + .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov. `Neighbourhood + Components Analysis + `_. + NIPS 2005. + + .. 
[2] Wikipedia entry on `Neighborhood Components Analysis + `_ + """ + + def __init__(self, init='auto', n_components=None, + max_iter=100, tol=None, verbose=False, preprocessor=None, + random_state=None): + self.n_components = n_components + self.init = init + self.max_iter = max_iter + self.tol = tol + self.verbose = verbose + self.random_state = random_state + super(NCA, self).__init__(preprocessor) + + def fit(self, X, y): """ X: data matrix, (n x d) - labels: scalar labels, (n) + y: scalar labels, (n) """ + X, labels = self._prepare_inputs(X, y, ensure_min_samples=2) n, d = X.shape - # Initialize A to a scaling matrix - A = np.zeros((d, d)) - np.fill_diagonal(A, 1./(X.max(axis=0)-X.min(axis=0))) + n_components = _check_n_components(d, self.n_components) + + # Measure the total training time + train_time = time.time() + + # Initialize A + A = _initialize_components(n_components, X, labels, self.init, + self.verbose, self.random_state) # Run NCA - dX = X[:,None] - X[None] # shape (n, n, d) - tmp = np.einsum('...i,...j->...ij', dX, dX) # shape (n, n, d, d) - masks = labels[:,None] == labels[None] - learning_rate = self.params['learning_rate'] - for it in xrange(self.params['max_iter']): - for i, label in enumerate(labels): - mask = masks[i] - Ax = A.dot(X.T).T # shape (n, d) - - softmax = np.exp(-((Ax[i] - Ax)**2).sum(axis=1)) # shape (n) - softmax[i] = 0 - softmax /= softmax.sum() - - t = softmax[:, None, None] * tmp[i] # shape (n, d, d) - d = softmax[mask].sum() * t.sum(axis=0) - t[mask].sum(axis=0) - A += learning_rate * A.dot(d) - - self.X = X - self.A = A + mask = labels[:, np.newaxis] == labels[np.newaxis, :] + optimizer_params = {'method': 'L-BFGS-B', + 'fun': self._loss_grad_lbfgs, + 'args': (X, mask, -1.0), + 'jac': True, + 'x0': A.ravel(), + 'options': dict(maxiter=self.max_iter), + 'tol': self.tol + } + + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) + + self.components_ = opt_result.x.reshape(-1, X.shape[1]) + 
self.n_iter_ = opt_result.nit + + # Stop timer + train_time = time.time() - train_time + if self.verbose: + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warnings.warn('[{}] NCA did not converge: {}'.format( + cls_name, opt_result.message), ConvergenceWarning) + + print('[{}] Training took {:8.2f}s.'.format(cls_name, train_time)) + return self + + def _loss_grad_lbfgs(self, A, X, mask, sign=1.0): + + if self.n_iter_ == 0 and self.verbose: + header_fields = ['Iteration', 'Objective Value', 'Time(s)'] + header_fmt = '{:>10} {:>20} {:>10}' + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print('[{cls}]'.format(cls=cls_name)) + print('[{cls}] {header}\n[{cls}] {sep}'.format(cls=cls_name, + header=header, + sep='-' * len(header))) + + start_time = time.time() + + A = A.reshape(-1, X.shape[1]) + X_embedded = np.dot(X, A.T) # (n_samples, n_components) + # Compute softmax distances + p_ij = pairwise_distances(X_embedded, squared=True) + np.fill_diagonal(p_ij, np.inf) + p_ij = np.exp(-p_ij - logsumexp(-p_ij, axis=1)[:, np.newaxis]) + # (n_samples, n_samples) + + # Compute loss + masked_p_ij = p_ij * mask + p = masked_p_ij.sum(axis=1, keepdims=True) # (n_samples, 1) + loss = p.sum() + + # Compute gradient of loss w.r.t. 
`transform` + weighted_p_ij = masked_p_ij - p_ij * p + weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T + np.fill_diagonal(weighted_p_ij_sym, - weighted_p_ij.sum(axis=0)) + gradient = 2 * (X_embedded.T.dot(weighted_p_ij_sym)).dot(X) + + if self.verbose: + start_time = time.time() - start_time + values_fmt = '[{cls}] {n_iter:>10} {loss:>20.6e} {start_time:>10.2f}' + print(values_fmt.format(cls=self.__class__.__name__, + n_iter=self.n_iter_, loss=loss, + start_time=start_time)) + sys.stdout.flush() + + self.n_iter_ += 1 + return sign * loss, sign * gradient.ravel() diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 9e91167e..253b9c92 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -1,93 +1,125 @@ -"""Relative Components Analysis (RCA) - -RCA learns a full rank Mahalanobis distance metric based on a -weighted sum of in-class covariance matrices. -It applies a global linear transformation to assign large weights to -relevant dimensions and low weights to irrelevant dimensions. -Those relevant dimensions are estimated using "chunklets", -subsets of points that are known to belong to the same class. - -'Learning distance functions using equivalence relations', ICML 2003 +""" +Relative Components Analysis (RCA) """ -from __future__ import absolute_import import numpy as np -from six.moves import xrange +import warnings +from sklearn.base import TransformerMixin -from .base_metric import BaseMetricLearner +from ._util import _check_n_components +from .base_metric import MahalanobisMixin from .constraints import Constraints -class RCA(BaseMetricLearner): - """Relevant Components Analysis (RCA)""" - def __init__(self, dim=None): - """Initialize the learner. 
- - Parameters - ---------- - dim : int, optional - embedding dimension (default: original dimension of data) - """ - self.params = { - 'dim': dim, - } - - def transformer(self): - return self._transformer - - def _process_inputs(self, X, Y): - X = np.asanyarray(X) - self.X = X - n, d = X.shape - - if self.params['dim'] is None: - self.params['dim'] = d - elif not 0 < self.params['dim'] <= d: - raise ValueError('Invalid embedding dimension, must be in [1,%d]' % d) - - Y = np.asanyarray(Y) - num_chunks = Y.max() + 1 - - return X, Y, num_chunks, d - - def fit(self, data, chunks): +# mean center each chunklet separately +def _chunk_mean_centering(data, chunks): + n_chunks = chunks.max() + 1 + chunk_mask = chunks != -1 + # We need to ensure the data is float so that we can substract the + # mean on it + chunk_data = data[chunk_mask].astype(float, copy=False) + chunk_labels = chunks[chunk_mask] + for c in range(n_chunks): + mask = chunk_labels == c + chunk_data[mask] -= chunk_data[mask].mean(axis=0) + + return chunk_mask, chunk_data + + +class RCA(MahalanobisMixin, TransformerMixin): + """Relevant Components Analysis (RCA) + + RCA learns a full rank Mahalanobis distance metric based on a weighted sum of + in-chunklets covariance matrices. It applies a global linear transformation + to assign large weights to relevant dimensions and low weights to irrelevant + dimensions. Those relevant dimensions are estimated using "chunklets", + subsets of points that are known to belong to the same class. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
+ + Examples + -------- + >>> from metric_learn import RCA + >>> X = [[-0.05, 3.0],[0.05, -3.0], + >>> [0.1, -3.55],[-0.1, 3.55], + >>> [-0.95, -0.05],[0.95, 0.05], + >>> [0.4, 0.05],[-0.4, -0.05]] + >>> chunks = [0, 0, 1, 1, 2, 2, 3, 3] + >>> rca = RCA() + >>> rca.fit(X, chunks) + + References + ---------- + .. [1] Noam Shental, et al. `Adjustment learning and relevant component + analysis `_ . + ECCV 2002. + + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + """ + + def __init__(self, n_components=None, preprocessor=None): + self.n_components = n_components + super(RCA, self).__init__(preprocessor) + + def _check_dimension(self, rank, X): + d = X.shape[1] + + if rank < d: + warnings.warn('The inner covariance matrix is not invertible, ' + 'so the transformation matrix may contain Nan values. ' + 'You should remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + + dim = _check_n_components(d, self.n_components) + return dim + + def fit(self, X, chunks): """Learn the RCA model. Parameters ---------- - X : (n x d) data matrix - each row corresponds to a single instance + data : (n x d) data matrix + Each row corresponds to a single instance + chunks : (n,) array of ints - when ``chunks[i] == -1``, point i doesn't belong to any chunklet, - when ``chunks[i] == j``, point i belongs to chunklet j. + When ``chunks[i] == -1``, point i doesn't belong to any chunklet. + When ``chunks[i] == j``, point i belongs to chunklet j. 
""" - data, chunks, num_chunks, d = self._process_inputs(data, chunks) + X, chunks = self._prepare_inputs(X, chunks, ensure_min_samples=2) - # mean center - data -= data.mean(axis=0) + chunks = np.asanyarray(chunks, dtype=int) + chunk_mask, chunked_data = _chunk_mean_centering(X, chunks) - # mean center each chunklet separately - chunk_mask = chunks != -1 - chunk_data = data[chunk_mask] - chunk_labels = chunks[chunk_mask] - for c in xrange(num_chunks): - mask = chunk_labels == c - chunk_data[mask] -= chunk_data[mask].mean(axis=0) - - # "inner" covariance of chunk deviations - inner_cov = np.cov(chunk_data, rowvar=0, bias=1) + inner_cov = np.atleast_2d(np.cov(chunked_data, rowvar=0, bias=1)) + dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X) # Fisher Linear Discriminant projection - if self.params['dim'] < d: - total_cov = np.cov(data[chunk_mask], rowvar=0) - tmp = np.linalg.lstsq(total_cov, inner_cov)[0] + if dim < X.shape[1]: + total_cov = np.cov(X[chunk_mask], rowvar=0) + tmp = np.linalg.lstsq(total_cov, inner_cov, rcond=None)[0] vals, vecs = np.linalg.eig(tmp) - inds = np.argsort(vals)[:self.params['dim']] - A = vecs[:,inds] - inner_cov = A.T.dot(inner_cov).dot(A) - self._transformer = _inv_sqrtm(inner_cov).dot(A.T) + inds = np.argsort(vals)[:dim] + A = vecs[:, inds] + inner_cov = np.atleast_2d(A.T.dot(inner_cov).dot(A)) + self.components_ = _inv_sqrtm(inner_cov).dot(A.T) else: - self._transformer = _inv_sqrtm(inner_cov).T + self.components_ = _inv_sqrtm(inner_cov).T return self @@ -99,29 +131,87 @@ def _inv_sqrtm(x): class RCA_Supervised(RCA): - def __init__(self, dim=None, num_chunks=100, chunk_size=2): - """Initialize the learner. 
- - Parameters - ---------- - dim : int, optional - embedding dimension (default: original dimension of data) - num_chunks: int, optional - chunk_size: int, optional - """ - RCA.__init__(self, dim=dim) - self.params.update(num_chunks=num_chunks, chunk_size=chunk_size) - - def fit(self, X, labels): - """Create constraints from labels and learn the LSML model. - Needs num_constraints specified in constructor. + """Supervised version of Relevant Components Analysis (RCA) + + `RCA_Supervised` creates chunks of similar points by first sampling a + class, taking `chunk_size` elements in it, and repeating the process + `n_chunks` times. + + Parameters + ---------- + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + n_chunks: int, optional (default=100) + Number of chunks to generate. + + chunk_size: int, optional (default=2) + Number of points per chunk. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + It is used to randomly sample constraints from labels. + + num_chunks : Renamed to n_chunks. Will be deprecated in 0.7.0 + + Examples + -------- + >>> from metric_learn import RCA_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> rca = RCA_Supervised(n_chunks=30, chunk_size=2) + >>> rca.fit(X, Y) + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. 
+ """ + + def __init__(self, n_components=None, n_chunks=100, chunk_size=2, + preprocessor=None, random_state=None, + num_chunks='deprecated'): + """Initialize the supervised version of `RCA`.""" + RCA.__init__(self, n_components=n_components, preprocessor=preprocessor) + if num_chunks != 'deprecated': + warnings.warn('"num_chunks" parameter has been renamed to' + ' "n_chunks". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_chunks = num_chunks + self.num_chunks = 'deprecated' # To avoid no_attribute error + self.n_chunks = n_chunks + self.chunk_size = chunk_size + self.random_state = random_state + + def fit(self, X, y): + """Create constraints from labels and learn the RCA model. + Needs n_constraints specified in constructor. (Not true?) Parameters ---------- X : (n x d) data matrix - each row corresponds to a single instance - labels : (n) data labels + each row corresponds to a single instance + + y : (n) data labels """ - chunks = Constraints(labels).chunks(num_chunks=self.params['num_chunks'], - chunk_size=self.params['chunk_size']) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + chunks = Constraints(y).chunks(n_chunks=self.n_chunks, + chunk_size=self.chunk_size, + random_state=self.random_state) + + if self.n_chunks * (self.chunk_size - 1) < X.shape[1]: + warnings.warn('Due to the parameters of RCA_Supervised, ' + 'the inner covariance matrix is not invertible, ' + 'so the transformation matrix will contain Nan values. ' + 'Increase the number or size of the chunks to correct ' + 'this problem.' 
+ ) + return RCA.fit(self, X, chunks) diff --git a/metric_learn/scml.py b/metric_learn/scml.py new file mode 100644 index 00000000..fedf393d --- /dev/null +++ b/metric_learn/scml.py @@ -0,0 +1,663 @@ +""" +Sparse Compositional Metric Learning (SCML) +""" + +from __future__ import print_function, absolute_import, division +import numpy as np +from .base_metric import _TripletsClassifierMixin, MahalanobisMixin +from ._util import components_from_metric +from sklearn.base import TransformerMixin +from .constraints import Constraints +from sklearn.preprocessing import normalize +from sklearn.neighbors import NearestNeighbors +from sklearn.cluster import KMeans +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.utils import check_array, check_random_state +import warnings + + +class _BaseSCML(MahalanobisMixin): + + _tuple_size = 3 # constraints are triplets + _authorized_basis = ['triplet_diffs'] + + def __init__(self, beta=1e-5, basis='triplet_diffs', n_basis=None, + gamma=5e-3, max_iter=10000, output_iter=500, batch_size=10, + verbose=False, preprocessor=None, random_state=None): + self.beta = beta + self.basis = basis + self.n_basis = n_basis + self.gamma = gamma + self.max_iter = max_iter + self.output_iter = output_iter + self.batch_size = batch_size + self.verbose = verbose + self.preprocessor = preprocessor + self.random_state = random_state + super(_BaseSCML, self).__init__(preprocessor) + + def _fit(self, triplets, basis=None, n_basis=None): + """ + Optimization procedure to find a sparse vector of weights to + construct the metric from the basis set. This is based on the + dual averaging method. 
+ """ + + if not isinstance(self.max_iter, int): + raise ValueError("max_iter should be an integer, instead it is of type" + " %s" % type(self.max_iter)) + if not isinstance(self.output_iter, int): + raise ValueError("output_iter should be an integer, instead it is of " + "type %s" % type(self.output_iter)) + if not isinstance(self.batch_size, int): + raise ValueError("batch_size should be an integer, instead it is of type" + " %s" % type(self.batch_size)) + + if self.output_iter > self.max_iter: + raise ValueError("The value of output_iter must be equal or smaller than" + " max_iter.") + + # Currently prepare_inputs makes triplets contain points and not indices + triplets = self._prepare_inputs(triplets, type_of_inputs='tuples') + + # TODO: + # This algorithm is built to work with indices, but in order to be + # compliant with the current handling of inputs it is converted + # back to indices by the following function. This should be improved + # in the future. + triplets, X = self._to_index_points(triplets) + + if basis is None: + basis, n_basis = self._initialize_basis(triplets, X) + + dist_diff = self._compute_dist_diff(triplets, X, basis) + + n_triplets = triplets.shape[0] + + # weight vector + w = np.zeros((1, n_basis)) + # avarage obj gradient wrt weights + avg_grad_w = np.zeros((1, n_basis)) + + # l2 norm in time of all obj gradients wrt weights + ada_grad_w = np.zeros((1, n_basis)) + # slack for not dividing by zero + delta = 0.001 + + best_obj = np.inf + + rng = check_random_state(self.random_state) + rand_int = rng.randint(low=0, high=n_triplets, + size=(self.max_iter, self.batch_size)) + for iter in range(self.max_iter): + + idx = rand_int[iter] + + slack_val = 1 + np.matmul(dist_diff[idx, :], w.T) + slack_mask = np.squeeze(slack_val > 0, axis=1) + + grad_w = np.sum(dist_diff[idx[slack_mask], :], + axis=0, keepdims=True)/self.batch_size + avg_grad_w = (iter * avg_grad_w + grad_w) / (iter+1) + + ada_grad_w = np.sqrt(np.square(ada_grad_w) + 
np.square(grad_w)) + + scale_f = -(iter+1) / (self.gamma * (delta + ada_grad_w)) + + # proximal operator with negative trimming equivalent + w = scale_f * np.minimum(avg_grad_w + self.beta, 0) + + if (iter + 1) % self.output_iter == 0: + # regularization part of obj function + obj1 = np.sum(w)*self.beta + + # Every triplet distance difference in the space given by L + # plus a slack of one + slack_val = 1 + np.matmul(dist_diff, w.T) + # Mask of places with positive slack + slack_mask = slack_val > 0 + + # loss function of learning task part of obj function + obj2 = np.sum(slack_val[slack_mask])/n_triplets + + obj = obj1 + obj2 + if self.verbose: + count = np.sum(slack_mask) + print("[%s] iter %d\t obj %.6f\t num_imp %d" % + (self.__class__.__name__, (iter+1), obj, count)) + + # update the best + if obj < best_obj: + best_obj = obj + best_w = w + + if self.verbose: + print("max iteration reached.") + + # return L matrix yielded from best weights + self.n_iter_ = iter + self.components_ = self._components_from_basis_weights(basis, best_w) + + return self + + def _compute_dist_diff(self, triplets, X, basis): + """ + Helper function to compute the distance difference of every triplet in the + space yielded by the basis set. 
+ """ + # Transformation of data by the basis set + XB = np.matmul(X, basis.T) + + n_triplets = triplets.shape[0] + # get all positive and negative pairs with lowest index first + # np.array (2*n_triplets,2) + triplets_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]], + triplets[:, [0, 2]])), + kind='stable') + # calculate all unique pairs and their indices + uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True, + axis=0) + # calculate L2 distance acording to bases only for unique pairs + dist = np.square(XB[uniqPairs[:, 0], :] - XB[uniqPairs[:, 1], :]) + + # return the diference of distances between all positive and negative + # pairs + return dist[indices[:n_triplets]] - dist[indices[n_triplets:]] + + def _components_from_basis_weights(self, basis, w): + """ + Get components matrix (L) from computed mahalanobis matrix. + """ + + # get rid of inactive bases + # TODO: Maybe have a tolerance over zero? + active_idx, = w > 0 + w = w[..., active_idx] + basis = basis[active_idx, :] + + n_basis, n_features = basis.shape + + if n_basis < n_features: # if metric is low-rank + warnings.warn("The number of bases with nonzero weight is less than the " + "number of features of the input, in consequence the " + "learned transformation reduces the dimension to %d." + % n_basis) + return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(basis) + + else: # if metric is full rank + return components_from_metric(np.matmul(basis.T, w.T*basis)) + + def _to_index_points(self, triplets): + shape = triplets.shape + X, triplets = np.unique(np.vstack(triplets), return_inverse=True, axis=0) + triplets = triplets.reshape(shape[:2]) + return triplets, X + + def _initialize_basis(self, triplets, X): + """ Checks if the basis array is well constructed or constructs it based + on one of the available options. + """ + n_features = X.shape[1] + + if isinstance(self.basis, np.ndarray): + # TODO: should copy? 
+ basis = check_array(self.basis, copy=True) + if basis.shape[1] != n_features: + raise ValueError('The dimensionality ({}) of the provided bases must' + ' match the dimensionality of the data ' + '({}).'.format(basis.shape[1], n_features)) + elif self.basis not in self._authorized_basis: + raise ValueError( + "`basis` must be one of the options '{}' " + "or an array of shape (n_basis, n_features)." + .format("', '".join(self._authorized_basis))) + if self.basis == 'triplet_diffs': + basis, n_basis = self._generate_bases_dist_diff(triplets, X) + + return basis, n_basis + + def _generate_bases_dist_diff(self, triplets, X): + """ Constructs the basis set from the differences of positive and negative + pairs from the triplets constraints. + + The basis set is constructed iteratively by taking n_features triplets, + then adding and substracting respectively all the outerproducts of the + positive and negative pairs, and finally selecting the eigenvectors + of this matrix with positive eigenvalue. This is done until n_basis are + selected. + """ + n_features = X.shape[1] + n_triplets = triplets.shape[0] + + if self.n_basis is None: + # TODO: Get a good default n_basis directive + n_basis = n_features*80 + warnings.warn('As no value for `n_basis` was selected, the number of ' + 'basis will be set to n_basis= %d' % n_basis) + elif isinstance(self.n_basis, int): + n_basis = self.n_basis + else: + raise ValueError("n_basis should be an integer, instead it is of type %s" + % type(self.n_basis)) + + if n_features > n_triplets: + raise ValueError( + "Number of features (%s) is greater than the number of triplets(%s).\n" + "Consider using dimensionality reduction or using another basis " + "generation scheme." 
% (n_features, n_triplets)) + + basis = np.zeros((n_basis, n_features)) + + # get all positive and negative pairs with lowest index first + # np.array (2*n_triplets,2) + triplets_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]], + triplets[:, [0, 2]])), + kind='stable') + # calculate all unique pairs and their indices + uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True, + axis=0) + # calculate differences only for unique pairs + diff = X[uniqPairs[:, 0], :] - X[uniqPairs[:, 1], :] + + diff_pos = diff[indices[:n_triplets], :] + diff_neg = diff[indices[n_triplets:], :] + + rng = check_random_state(self.random_state) + + start = 0 + finish = 0 + while finish != n_basis: + # Select triplets to yield diff + select_triplet = rng.choice(n_triplets, size=n_features, replace=False) + + # select n_features positive differences + d_pos = diff_pos[select_triplet, :] + + # select n_features negative differences + d_neg = diff_neg[select_triplet, :] + + # Yield matrix + diff_sum = d_pos.T.dot(d_pos) - d_neg.T.dot(d_neg) + + # Calculate eigenvalue and eigenvectors + w, v = np.linalg.eigh(diff_sum.T.dot(diff_sum)) + + # Add eigenvectors with positive eigenvalue to basis set + pos_eig_mask = w > 0 + start = finish + finish += pos_eig_mask.sum() + + try: + basis[start:finish, :] = v[pos_eig_mask] + except ValueError: + # if finish is greater than n_basis + basis[start:, :] = v[pos_eig_mask][:n_basis-start] + break + + # TODO: maybe add a warning in case there are no added bases, this could + # be caused by a bad triplet set. This would cause an infinite loop + + return basis, n_basis + + +class SCML(_BaseSCML, _TripletsClassifierMixin): + """Sparse Compositional Metric Learning (SCML) + + `SCML` learns an squared Mahalanobis distance from triplet constraints by + optimizing sparse positive weights assigned to a set of :math:`K` rank-one + PSD bases. 
This can be formulated as an optimization problem with only + :math:`K` parameters, that can be solved with an efficient stochastic + composite scheme. + + Read more in the :ref:`User Guide `. + + .. warning:: + SCML is still a bit experimental, don't hesitate to report if + something fails/doesn't work as expected. + + Parameters + ---------- + beta: float (default=1e-5) + L1 regularization parameter. + + basis : string or array-like, optional (default='triplet_diffs') + Set of bases to construct the metric. Possible options are + 'triplet_diffs', and an array-like of shape (n_basis, n_features). + + 'triplet_diffs' + The basis set is constructed iteratively from differences between points + of `n_features` positive or negative pairs randomly sampled from the + triplets constraints. Requires the number of training triplets to be + great or equal to `n_features`. + + array-like + A matrix of shape (n_basis, n_features), that will be used as + the basis set for the metric construction. + + n_basis : int, optional + Number of basis to be yielded. In case it is not set it will be set based + on `basis`. If no value is selected a default will be computed based on + the input. + + gamma: float (default = 5e-3) + Learning rate for the optimization algorithm. + + max_iter : int (default = 10000) + Number of iterations for the algorithm. + + output_iter : int (default = 5000) + Number of iterations to check current weights performance and output this + information in case verbose is True. + + verbose : bool, optional + If True, prints information while learning. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get triplets from indices. If array-like, + triplets will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. 
+ + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `_components_from_basis_weights`.) + + Examples + -------- + >>> from metric_learn import SCML + >>> triplets = [[[1.2, 7.5], [1.3, 1.5], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6], [5.4, 5.4]], + >>> [[3.2, 7.5], [3.3, 1.5], [8.2, 9.7]], + >>> [[3.3, 4.5], [5.2, 4.6], [7.4, 5.4]]] + >>> scml = SCML() + >>> scml.fit(triplets) + + References + ---------- + .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. + `_. \ + (AAAI), 2014. + + .. [2] Adapted from original `Matlab implementation. \ + `_. + + See Also + -------- + metric_learn.SCML_Supervised : The supervised version of the algorithm. + + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def fit(self, triplets): + """Learn the SCML model. + + Parameters + ---------- + triplets : array-like, shape=(n_constraints, 3, n_features) or \ + (n_constraints, 3) + 3D array-like of triplets of points or 2D array of triplets of + indicators. Triplets are assumed to be ordered such that: + d(triplets[i, 0],triplets[i, 1]) < d(triplets[i, 0], triplets[i, 2]). + + Returns + ------- + self : object + Returns the instance. + """ + + return self._fit(triplets) + + +class SCML_Supervised(_BaseSCML, TransformerMixin): + """Supervised version of Sparse Compositional Metric Learning (SCML) + + `SCML_Supervised` creates triplets by taking `k_genuine` neighbours + of the same class and `k_impostor` neighbours from different classes for each + point and then runs the SCML algorithm on these triplets. + + Read more in the :ref:`User Guide `. + + .. warning:: + SCML is still a bit experimental, don't hesitate to report if + something fails/doesn't work as expected. 
+ + Parameters + ---------- + beta: float (default=1e-5) + L1 regularization parameter. + + basis : string or an array-like, optional (default='lda') + Set of bases to construct the metric. Possible options are + 'lda', and an array-like of shape (n_basis, n_features). + + 'lda' + The `n_basis` basis set is constructed from the LDA of significant + local regions in the feature space via clustering, for each region + center k-nearest neighbors are used to obtain the LDA scalings, + which correspond to the locally discriminative basis. + + array-like + A matrix of shape (n_basis, n_features), that will be used as + the basis set for the metric construction. + + n_basis : int, optional + Number of basis to be yielded. In case it is not set it will be set based + on `basis`. If no value is selected a default will be computed based on + the input. + + gamma: float (default = 5e-3) + Learning rate for the optimization algorithm. + + max_iter : int (default = 100000) + Number of iterations for the algorithm. + + output_iter : int (default = 5000) + Number of iterations to check current weights performance and output this + information in case verbose is True. + + verbose : bool, optional + If True, prints information while learning. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get triplets from indices. If array-like, + triplets will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `_components_from_basis_weights`.) 
+ + Examples + -------- + >>> from metric_learn import SCML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> scml = SCML_Supervised(random_state=33) + >>> scml.fit(X, Y) + SCML_Supervised(random_state=33) + >>> scml.score_pairs([[X[0], X[1]], [X[0], X[2]]]) + array([1.84640733, 1.55984363]) + >>> scml.get_metric()(X[0], X[1]) + 1.8464073327922157 + + References + ---------- + .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. + `_. \ + (AAAI), 2014. + + .. [2] Adapted from original `Matlab implementation. \ + `_. + + See Also + -------- + metric_learn.SCML : The weakly supervised version of this + algorithm. + """ + # Add supervised authorized basis construction options + _authorized_basis = _BaseSCML._authorized_basis + ['lda'] + + def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='lda', + n_basis=None, gamma=5e-3, max_iter=10000, output_iter=500, + batch_size=10, verbose=False, preprocessor=None, + random_state=None): + self.k_genuine = k_genuine + self.k_impostor = k_impostor + _BaseSCML.__init__(self, beta=beta, basis=basis, n_basis=n_basis, + max_iter=max_iter, output_iter=output_iter, + batch_size=batch_size, verbose=verbose, + preprocessor=preprocessor, random_state=random_state) + + def fit(self, X, y): + """Create constraints from labels and learn the SCML model. + + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. + + Returns + ------- + self : object + Returns the instance. 
+ """ + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + + basis, n_basis = self._initialize_basis_supervised(X, y) + + if not isinstance(self.k_genuine, int): + raise ValueError("k_genuine should be an integer, instead it is of type" + " %s" % type(self.k_genuine)) + if not isinstance(self.k_impostor, int): + raise ValueError("k_impostor should be an integer, instead it is of " + "type %s" % type(self.k_impostor)) + + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, self.k_genuine, + self.k_impostor) + + triplets = X[triplets] + + return self._fit(triplets, basis, n_basis) + + def _initialize_basis_supervised(self, X, y): + """ Constructs the basis set following one of the supervised options in + case one is selected. + """ + + if isinstance(self.basis, str) and self.basis == 'lda': + basis, n_basis = self._generate_bases_LDA(X, y) + else: + basis, n_basis = None, None + + return basis, n_basis + + def _generate_bases_LDA(self, X, y): + """ Generates bases for the 'lda' option. + + The basis set is constructed using Linear Discriminant Analysis of + significant local regions in the feature space via clustering, for + each region center k-nearest neighbors are used to obtain the LDA scalings, + which correspond to the locally discriminative basis. Currently this is + done at two scales `k={10,20}` if `n_feature < 50` or else `k={20,50}`. 
+ """ + + labels, class_count = np.unique(y, return_counts=True) + n_class = len(labels) + + n_features = X.shape[1] + # Number of basis yielded from each LDA + num_eig = min(n_class-1, n_features) + + if self.n_basis is None: + # TODO: Get a good default n_basis directive + n_basis = min(20*n_features, X.shape[0]*2*num_eig - 1) + warnings.warn('As no value for `n_basis` was selected, the number of ' + 'basis will be set to n_basis= %d' % n_basis) + + elif isinstance(self.n_basis, int): + n_basis = self.n_basis + else: + raise ValueError("n_basis should be an integer, instead it is of type %s" + % type(self.n_basis)) + + # Number of clusters needed for 2 scales given the number of basis + # yielded by every LDA + n_clusters = int(np.ceil(n_basis/(2 * num_eig))) + + if n_basis < n_class: + warnings.warn("The number of basis is less than the number of classes, " + "which may lead to poor discriminative performance.") + elif n_basis >= X.shape[0]*2*num_eig: + raise ValueError("Not enough samples to generate %d LDA bases, n_basis" + "should be smaller than %d" % + (n_basis, X.shape[0]*2*num_eig)) + + kmeans = KMeans(n_clusters=n_clusters, n_init=10, + random_state=self.random_state, algorithm='elkan').fit(X) + cX = kmeans.cluster_centers_ + + n_scales = 2 + if n_features > 50: + scales = [20, 50] + else: + scales = [10, 20] + + k_class = np.vstack((np.minimum(class_count, scales[0]), + np.minimum(class_count, scales[1]))) + + idx_set = [np.zeros((n_clusters, sum(k_class[0, :])), dtype=np.int64), + np.zeros((n_clusters, sum(k_class[1, :])), dtype=np.int64)] + + start_finish_indices = np.hstack((np.zeros((2, 1), np.int64), + k_class)).cumsum(axis=1) + + neigh = NearestNeighbors() + + for c in range(n_class): + sel_c = np.where(y == labels[c]) + + # get k_class same class neighbors + neigh.fit(X=X[sel_c]) + # Only take the neighbors once for the biggest scale + neighbors = neigh.kneighbors(X=cX, n_neighbors=k_class[-1, c], + return_distance=False) + + # add index set of 
neighbors for every cluster center for both scales + for s, k in enumerate(k_class[:, c]): + start, finish = start_finish_indices[s, c:c+2] + idx_set[s][:, start:finish] = np.take(sel_c, neighbors[:, :k]) + + # Compute basis for every cluster in both scales + basis = np.zeros((n_basis, n_features)) + lda = LinearDiscriminantAnalysis() + start_finish_indices = np.hstack((np.vstack((0, n_clusters * num_eig)), + np.full((2, n_clusters), + num_eig))).cumsum(axis=1) + + for s in range(n_scales): + for c in range(n_clusters): + lda.fit(X[idx_set[s][c, :]], y[idx_set[s][c, :]]) + start, finish = start_finish_indices[s, c:c+2] + normalized_scalings = normalize(lda.scalings_.T) + try: + basis[start: finish, :] = normalized_scalings + except ValueError: + # handle tail + basis[start:, :] = normalized_scalings[:n_basis-start] + break + + return basis, n_basis diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index aba1b9be..c4c427b9 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -1,108 +1,351 @@ """ -Qi et al. -An efficient sparse metric learning in high-dimensional space via -L1-penalized log-determinant regularization. 
-ICML 2009 - -Adapted from https://gist.github.com/kcarnold/5439945 -Paper: http://lms.comp.nus.edu.sg/sites/default/files/publication-attachments/icml09-guojun.pdf +Sparse High-Dimensional Metric Learning (SDML) """ -from __future__ import absolute_import +import warnings import numpy as np -from scipy.sparse.csgraph import laplacian -from sklearn.covariance import graph_lasso -from sklearn.utils.extmath import pinvh - -from .base_metric import BaseMetricLearner -from .constraints import Constraints - - -class SDML(BaseMetricLearner): - def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, - verbose=False): - ''' - balance_param: float, optional - trade off between sparsity and M0 prior - sparsity_param: float, optional - trade off between optimizer and sparseness (see graph_lasso) - use_cov: bool, optional - controls prior matrix, will use the identity if use_cov=False - verbose : bool, optional - if True, prints information while learning - ''' - self.params = { - 'balance_param': balance_param, - 'sparsity_param': sparsity_param, - 'use_cov': use_cov, - 'verbose': verbose, - } - - def _prepare_inputs(self, X, W): - self.X = X - # set up prior M - if self.params['use_cov']: - self.M = np.cov(X.T) +from sklearn.base import TransformerMixin +from scipy.linalg import pinvh +try: + from sklearn.covariance._graph_lasso import ( + _graphical_lasso as graphical_lasso + ) +except ImportError: + from sklearn.covariance import graphical_lasso + +from sklearn.exceptions import ConvergenceWarning + +from .base_metric import MahalanobisMixin, _PairsClassifierMixin +from .constraints import Constraints, wrap_pairs +from ._util import components_from_metric, _initialize_metric_mahalanobis +try: + from inverse_covariance import quic +except ImportError: + HAS_SKGGM = False +else: + HAS_SKGGM = True + + +class _BaseSDML(MahalanobisMixin): + + _tuple_size = 2 # constraints are pairs + + def __init__(self, balance_param=0.5, sparsity_param=0.01, 
prior='identity', + verbose=False, preprocessor=None, + random_state=None): + self.balance_param = balance_param + self.sparsity_param = sparsity_param + self.prior = prior + self.verbose = verbose + self.random_state = random_state + super(_BaseSDML, self).__init__(preprocessor) + + def _fit(self, pairs, y): + if not HAS_SKGGM: + if self.verbose: + print("SDML will use scikit-learn's graphical lasso solver.") else: - self.M = np.identity(X.shape[1]) - L = laplacian(W, normed=False) - self.loss_matrix = self.X.T.dot(L.dot(self.X)) + if self.verbose: + print("SDML will use skggm's graphical lasso solver.") + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') + n_features = pairs.shape[2] + if n_features < 2: + raise ValueError(f"Cannot fit SDML with {n_features} feature(s)") - def metric(self): - return self.M + # set up (the inverse of) the prior M + # if the prior is the default (None), we raise a warning + _, prior_inv = _initialize_metric_mahalanobis( + pairs, self.prior, + return_inverse=True, strict_pd=True, matrix_name='prior', + random_state=self.random_state) + diff = pairs[:, 0] - pairs[:, 1] + loss_matrix = (diff.T * y).dot(diff) + emp_cov = prior_inv + self.balance_param * loss_matrix - def fit(self, X, W): - """ - X: data matrix, (n x d) - each row corresponds to a single instance - W: connectivity graph, (n x n) - +1 for positive pairs, -1 for negative. + # our initialization will be the matrix with emp_cov's eigenvalues, + # with a constant added so that they are all positive (plus an epsilon + # to ensure definiteness). This is empirical. + w, V = np.linalg.eigh(emp_cov) + min_eigval = np.min(w) + if min_eigval < 0.: + warnings.warn("Warning, the input matrix of graphical lasso is not " + "positive semi-definite (PSD). The algorithm may diverge, " + "and lead to degenerate solutions. 
" + "To prevent that, try to decrease the balance parameter " + "`balance_param` and/or to set prior='identity'.", + ConvergenceWarning) + w -= min_eigval # we translate the eigenvalues to make them all positive + w += 1e-10 # we add a small offset to avoid definiteness problems + sigma0 = (V * w).dot(V.T) + try: + if HAS_SKGGM: + theta0 = pinvh(sigma0) + M, _, _, _, _, _ = quic(emp_cov, lam=self.sparsity_param, + msg=self.verbose, + Theta0=theta0, Sigma0=sigma0) + else: + _, M, *_ = graphical_lasso(emp_cov, alpha=self.sparsity_param, + verbose=self.verbose, + cov_init=sigma0) + raised_error = None + w_mahalanobis, _ = np.linalg.eigh(M) + not_spd = any(w_mahalanobis < 0.) + not_finite = not np.isfinite(M).all() + # TODO: Narrow this to the specific exceptions we expect. + except Exception as e: + raised_error = e + not_spd = False # not_spd not applicable here so we set to False + not_finite = False # not_finite not applicable here so we set to False + if raised_error is not None or not_spd or not_finite: + msg = ("There was a problem in SDML when using {}'s graphical " + "lasso solver.").format("skggm" if HAS_SKGGM else "scikit-learn") + if not HAS_SKGGM: + skggm_advice = (" skggm's graphical lasso can sometimes converge " + "on non SPD cases where scikit-learn's graphical " + "lasso fails to converge. 
Try to install skggm and " + "rerun the algorithm (see the README.md for the " + "right version of skggm).") + msg += skggm_advice + if raised_error is not None: + msg += " The following error message was thrown: {}.".format( + raised_error) + raise RuntimeError(msg) + + self.components_ = components_from_metric(np.atleast_2d(M)) + return self + + +class SDML(_BaseSDML, _PairsClassifierMixin): + r"""Sparse Distance Metric Learning (SDML) + + SDML is an efficient sparse metric learning in high-dimensional space via + double regularization: an L1-penalization on the off-diagonal elements of the + Mahalanobis matrix :math:`\mathbf{M}`, and a log-determinant divergence + between :math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either + :math:`\mathbf{I}` or :math:`\mathbf{\Omega}^{-1}`, where + :math:`\mathbf{\Omega}` is the covariance matrix). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + balance_param : float, optional (default=0.5) + Trade off between sparsity and M0 prior. + + sparsity_param : float, optional (default=0.01) + Trade off between optimizer and sparseness (see graph_lasso). + + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For SDML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The prior will be a random positive definite (PD) matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + verbose : bool, optional (default=False) + If True, prints information while learning. 
+ + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. + + Examples + -------- + >>> from metric_learn import SDML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> sdml = SDML_Supervised(n_constraints=200) + >>> sdml.fit(X, Y) + + References + ---------- + .. [1] Qi et al. `An efficient sparse metric learning in high-dimensional + space via L1-penalized log-determinant regularization + `_. + ICML 2009. + + .. [2] Code adapted from https://gist.github.com/kcarnold/5439945 + """ + + def fit(self, pairs, y, calibration_params=None): + """Learn the SDML model. + + The threshold will be calibrated on the trainset using the parameters + `calibration_params`. + + Parameters + ---------- + pairs : array-like, shape=(n_constraints, 2, n_features) or \ + (n_constraints, 2) + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. + + y : array-like, of shape (n_constraints,) + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. 
+ + calibration_params : `dict` or `None` + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. + + Returns + ------- + self : object + Returns the instance. """ - self._prepare_inputs(X, W) - P = pinvh(self.M) + self.params['balance_param'] * self.loss_matrix - emp_cov = pinvh(P) - # hack: ensure positive semidefinite - emp_cov = emp_cov.T.dot(emp_cov) - self.M, _ = graph_lasso(emp_cov, self.params['sparsity_param'], - verbose=self.params['verbose']) + calibration_params = (calibration_params if calibration_params is not + None else dict()) + self._validate_calibration_params(**calibration_params) + self._fit(pairs, y) + self.calibrate_threshold(pairs, y, **calibration_params) return self -class SDML_Supervised(SDML): - def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, - num_labeled=np.inf, num_constraints=None, verbose=False): - SDML.__init__(self, balance_param=balance_param, - sparsity_param=sparsity_param, use_cov=use_cov, - verbose=verbose) - ''' - balance_param: float, optional - trade off between sparsity and M0 prior - sparsity_param: float, optional - trade off between optimizer and sparseness (see graph_lasso) - use_cov: bool, optional - controls prior matrix, will use the identity if use_cov=False - num_labeled : int, optional - number of labels to preserve for training - num_constraints: int, optional - number of constraints to generate - verbose : bool, optional - if True, prints information while learning - ''' - self.params.update(num_labeled=num_labeled, num_constraints=num_constraints) - - def fit(self, X, labels): +class SDML_Supervised(_BaseSDML, TransformerMixin): + """Supervised version of Sparse Distance Metric Learning (SDML) + + `SDML_Supervised` creates pairs of similar sample by taking same class + samples, and pairs of dissimilar samples by taking different class + 
samples. It then passes these pairs to `SDML` for training. + + Parameters + ---------- + balance_param : float, optional (default=0.5) + Trade off between sparsity and M0 prior. + + sparsity_param : float, optional (default=0.01) + Trade off between optimizer and sparseness (see graph_lasso). + + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For SDML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + n_constraints : int, optional (default=None) + Number of constraints to generate. If None, defaults to `20 * + num_classes**2`. + + verbose : bool, optional (default=False) + If True, prints information while learning. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. In any case, `random_state` is also used to randomly sample + constraints from labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) 
+ + See Also + -------- + metric_learn.SDML : The original weakly-supervised algorithm + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def __init__(self, balance_param=0.5, sparsity_param=0.01, prior='identity', + n_constraints=None, verbose=False, preprocessor=None, + random_state=None, num_constraints='deprecated'): + _BaseSDML.__init__(self, balance_param=balance_param, + sparsity_param=sparsity_param, prior=prior, + verbose=verbose, + preprocessor=preprocessor, random_state=random_state) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed sholud be set) + self.num_constraints = 'deprecated' + + def fit(self, X, y): """Create constraints from labels and learn the SDML model. Parameters ---------- - X: data matrix, (n x d) - each row corresponds to a single instance - labels: data labels, (n,) array-like + X : array-like, shape (n, d) + data matrix, where each row corresponds to a single instance + + y : array-like, shape (n,) + data labels, one for each instance + + Returns + ------- + self : object + Returns the instance. 
""" - num_constraints = self.params['num_constraints'] - if num_constraints is None: - num_classes = np.unique(labels) - num_constraints = 20*(len(num_classes))**2 + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + n_constraints = self.n_constraints + if n_constraints is None: + num_classes = len(np.unique(y)) + n_constraints = 20 * num_classes**2 - c = Constraints.random_subset(labels, self.params['num_labeled']) - return SDML.fit(self, X, c.adjacency_matrix(num_constraints)) + c = Constraints(y) + pos_neg = c.positive_negative_pairs(n_constraints, + random_state=self.random_state) + pairs, y = wrap_pairs(X, pos_neg) + return _BaseSDML._fit(self, pairs, y) diff --git a/metric_learn/sklearn_shims.py b/metric_learn/sklearn_shims.py new file mode 100644 index 00000000..8d746890 --- /dev/null +++ b/metric_learn/sklearn_shims.py @@ -0,0 +1,25 @@ +"""This file is for fixing imports due to different APIs +depending on the scikit-learn version""" +import sklearn +from packaging import version +SKLEARN_AT_LEAST_0_22 = (version.parse(sklearn.__version__) + >= version.parse('0.22.0')) +if SKLEARN_AT_LEAST_0_22: + from sklearn.utils._testing import (set_random_state, + ignore_warnings, + assert_allclose_dense_sparse, + _get_args) + from sklearn.utils.estimator_checks import (_is_public_parameter + as is_public_parameter) + from sklearn.metrics._scorer import get_scorer +else: + from sklearn.utils.testing import (set_random_state, + ignore_warnings, + assert_allclose_dense_sparse, + _get_args) + from sklearn.utils.estimator_checks import is_public_parameter + from sklearn.metrics.scorer import get_scorer + +__all__ = ['set_random_state', 'set_random_state', + 'ignore_warnings', 'assert_allclose_dense_sparse', '_get_args', + 'is_public_parameter', 'get_scorer'] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..ef3c8acb --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + integration: mark a test as integration + unit: mark a 
test as unit \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 8d95aa1e..bc7695e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,4 +2,6 @@ universal = 1 [metadata] -description-file = README.rst \ No newline at end of file +description-file = README.rst +license_files = + LICENSE.txt diff --git a/setup.py b/setup.py index 2031754a..23392077 100755 --- a/setup.py +++ b/setup.py @@ -1,33 +1,77 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- from setuptools import setup +import os +import io +import sys + + +CURRENT_PYTHON = sys.version_info[:2] +REQUIRED_PYTHON = (3, 6) + +# This check and everything above must remain compatible with Python 2.7. +if CURRENT_PYTHON < REQUIRED_PYTHON: + sys.stderr.write(""" +========================== +Unsupported Python version +========================== +This version of metric-learn requires Python {}.{}, but you're trying to +install it on Python {}.{}. +This may be because you are using a version of pip that doesn't +understand the python_requires classifier. Make sure you +have pip >= 9.0 and setuptools >= 24.2, then try again: + $ python -m pip install --upgrade pip setuptools + $ python -m pip install django +This will install the latest version of metric-learn which works on your +version of Python. 
If you can't upgrade your pip (or Python), request +an older version of metric-learn: + $ python -m pip install "metric-learn<0.6.0" +""".format(*(REQUIRED_PYTHON + CURRENT_PYTHON))) + sys.exit(1) + + +version = {} +with io.open(os.path.join('metric_learn', '_version.py')) as fp: + exec(fp.read(), version) + +# Get the long description from README.md +with io.open('README.rst', encoding='utf-8') as f: + long_description = f.read() -version = "0.3.0" setup(name='metric-learn', - version=version, + version=version['__version__'], description='Python implementations of metric learning algorithms', - author=['CJ Carey', 'Yuan Tang'], + long_description=long_description, + python_requires='>={}.{}'.format(*REQUIRED_PYTHON), + author=[ + 'CJ Carey', + 'Yuan Tang', + 'William de Vazelhes', + 'Aurélien Bellet', + 'Nathalie Vauquier' + ], author_email='ccarey@cs.umass.edu', - url='http://github.com/all-umass/metric-learn', + url='http://github.com/scikit-learn-contrib/metric-learn', license='MIT', classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', + 'Programming Language :: Python :: 3', 'Operating System :: OS Independent', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering' ], packages=['metric_learn'], install_requires=[ - 'numpy', - 'scipy', - 'scikit-learn', - 'six' + 'numpy>= 1.11.0', + 'scipy>= 0.17.0', + 'scikit-learn>=0.21.3', ], extras_require=dict( - docs=['sphinx', 'shinx_rtd_theme', 'numpydoc'], + docs=['sphinx', 'sphinx_rtd_theme', 'numpydoc', 'sphinx-gallery', + 'matplotlib'], demo=['matplotlib'], + sdml=['skggm>=0.2.9'] ), test_suite='test', keywords=[ @@ -36,5 +80,9 @@ 'Information Theoretic Metric Learning', 'Sparse Determinant Metric Learning', 'Least Squares Metric Learning', - 'Neighborhood Components Analysis' + 'Neighborhood Components Analysis', + 'Local Fisher Discriminant Analysis', + 'Relative Components Analysis', + 'Mahalanobis Metric for 
Clustering', + 'Metric Learning for Kernel Regression' ]) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 1a745596..d457b52d 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -1,24 +1,41 @@ +import warnings import unittest +import re +import pytest import numpy as np -from six.moves import xrange -from sklearn.metrics import pairwise_distances -from sklearn.datasets import load_iris -from numpy.testing import assert_array_almost_equal - -from metric_learn import ( - LMNN, NCA, LFDA, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised) +import scipy +from scipy.optimize import check_grad, approx_fprime +from sklearn.metrics import pairwise_distances, euclidean_distances +from sklearn.datasets import (load_iris, make_classification, make_regression, + make_spd_matrix) +from numpy.testing import (assert_array_almost_equal, assert_array_equal, + assert_allclose) +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils.validation import check_X_y +from sklearn.preprocessing import StandardScaler +try: + from inverse_covariance import quic + assert quic +except ImportError: + HAS_SKGGM = False +else: + HAS_SKGGM = True +from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, + SCML_Supervised, LSML_Supervised, + ITML_Supervised, SDML_Supervised, RCA_Supervised, + MMC_Supervised, SDML, RCA, ITML, SCML) # Import this specially for testing. 
-from metric_learn.lmnn import python_LMNN +from metric_learn.constraints import wrap_pairs, Constraints +from metric_learn.lmnn import _sum_outer_products def class_separation(X, labels): unique_labels, label_inds = np.unique(labels, return_inverse=True) ratio = 0 - for li in xrange(len(unique_labels)): - Xc = X[label_inds==li] - Xnc = X[label_inds!=li] - ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc,Xnc).mean() + for li in range(len(unique_labels)): + Xc = X[label_inds == li] + Xnc = X[label_inds != li] + ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean() return ratio / len(unique_labels) @@ -32,77 +49,1120 @@ def setUpClass(self): np.random.seed(1234) +class TestCovariance(MetricTestCase): + def test_iris(self): + cov = Covariance() + cov.fit(self.iris_points) + + csep = class_separation(cov.transform(self.iris_points), self.iris_labels) + # deterministic result + self.assertAlmostEqual(csep, 0.72981476) + + def test_singular_returns_pseudo_inverse(self): + """Checks that if the input covariance matrix is singular, we return + the pseudo inverse""" + X, y = load_iris(return_X_y=True) + # We add a virtual column that is a linear combination of the other + # columns so that the covariance matrix will be singular + X = np.concatenate([X, X[:, :2].dot([[2], [3]])], axis=1) + cov_matrix = np.cov(X, rowvar=False) + covariance = Covariance() + covariance.fit(X) + pseudo_inverse = covariance.get_mahalanobis_matrix() + # here is the definition of a pseudo inverse according to wikipedia: + assert_allclose(cov_matrix.dot(pseudo_inverse).dot(cov_matrix), + cov_matrix) + assert_allclose(pseudo_inverse.dot(cov_matrix).dot(pseudo_inverse), + pseudo_inverse) + + +class TestSCML(object): + @pytest.mark.parametrize('basis', ('lda', 'triplet_diffs')) + def test_iris(self, basis): + """ + SCML applied to Iris dataset should give better results when + computing class separation. 
+ """ + X, y = load_iris(return_X_y=True) + before = class_separation(X, y) + scml = SCML_Supervised(basis=basis, n_basis=85, k_genuine=7, k_impostor=5, + random_state=42) + scml.fit(X, y) + after = class_separation(scml.transform(X), y) + assert before > after + 0.03 # It's better by a margin of 0.03 + + def test_big_n_features(self): + X, y = make_classification(n_samples=100, n_classes=3, n_features=60, + n_informative=60, n_redundant=0, n_repeated=0, + random_state=42) + X = StandardScaler().fit_transform(X) + scml = SCML_Supervised(random_state=42, n_basis=399) + scml.fit(X, y) + csep = class_separation(scml.transform(X), y) + assert csep < 0.7 + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.ones((3, 3, 3)),)), + (SCML_Supervised, (np.array([[0, 0], [0, 1], + [2, 0], [2, 1]]), + np.array([1, 0, 1, 0])))]) + def test_bad_basis(self, estimator, data): + model = estimator(basis='bad_basis', n_basis=33) # n_basis doesn't matter + msg = ("`basis` must be one of the options '{}' or an array of shape " + "(n_basis, n_features)." 
+ .format("', '".join(model._authorized_basis))) + with pytest.raises(ValueError) as raised_error: + model.fit(*data) + assert msg == raised_error.value.args[0] + + def test_dimension_reduction_msg(self): + scml = SCML(n_basis=2) + triplets = np.array([[[0, 1], [2, 1], [0, 0]], + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]) + msg = ("The number of bases with nonzero weight is less than the " + "number of features of the input, in consequence the " + "learned transformation reduces the dimension to 1.") + with pytest.warns(UserWarning) as raised_warning: + scml.fit(triplets) + assert msg == raised_warning[0].message.args[0] + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.array([[[0, 1], [2, 1], [0, 0]], + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]),)), + (SCML_Supervised, (np.array([[0, 0], [1, 1], + [3, 3]]), + np.array([1, 2, 3])))]) + def test_n_basis_wrong_type(self, estimator, data): + n_basis = 4.0 + model = estimator(n_basis=n_basis) + msg = ("n_basis should be an integer, instead it is of type %s" + % type(n_basis)) + with pytest.raises(ValueError) as raised_error: + model.fit(*data) + assert msg == raised_error.value.args[0] + + def test_small_n_basis_lda(self): + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + y = np.array([0, 0, 1, 1]) + + n_class = 2 + scml = SCML_Supervised(n_basis=n_class-1) + msg = ("The number of basis is less than the number of classes, which may" + " lead to poor discriminative performance.") + with pytest.warns(UserWarning) as raised_warning: + scml.fit(X, y) + assert msg == raised_warning[0].message.args[0] + + def test_big_n_basis_lda(self): + X = np.array([[0, 0], [1, 1], [3, 3]]) + y = np.array([1, 2, 3]) + + n_class = 3 + num_eig = min(n_class - 1, X.shape[1]) + n_basis = X.shape[0] * 2 * num_eig + + scml = SCML_Supervised(n_basis=n_basis) + msg = ("Not enough samples to generate %d LDA bases, n_basis" + "should be smaller than %d" 
% + (n_basis, n_basis)) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.random.rand(3, 3, 2),)), + (SCML_Supervised, (np.array([[0, 0], [0, 1], + [2, 0], [2, 1]]), + np.array([1, 0, 1, 0])))]) + def test_array_basis(self, estimator, data): + """ Test that the proper error is raised when the shape of the input basis + array is not consistent with the input + """ + basis = np.eye(3) + scml = estimator(n_basis=3, basis=basis) + + msg = ('The dimensionality ({}) of the provided bases must match the ' + 'dimensionality of the data ({}).' + .format(basis.shape[1], data[0].shape[-1])) + with pytest.raises(ValueError) as raised_error: + scml.fit(*data) + assert msg == raised_error.value.args[0] + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.array([[0, 1, 2], [0, 1, 3], [1, 0, 2], + [1, 0, 3], [2, 3, 1], [2, 3, 0], + [3, 2, 1], [3, 2, 0]]),)), + (SCML_Supervised, (np.array([0, 1, 2, 3]), + np.array([0, 0, 1, 1])))]) + def test_verbose(self, estimator, data, capsys): + # assert there is proper output when verbose = True + model = estimator(preprocessor=np.array([[0, 0], [1, 1], [2, 2], [3, 3]]), + max_iter=1, output_iter=1, batch_size=1, + basis='triplet_diffs', random_state=42, verbose=True) + model.fit(*data) + out, _ = capsys.readouterr() + expected_out = ('[%s] iter 1\t obj 0.569946\t num_imp 2\n' + 'max iteration reached.\n' % estimator.__name__) + assert out == expected_out + + def test_triplet_diffs_toy(self): + expected_n_basis = 10 + model = SCML_Supervised(n_basis=expected_n_basis) + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + triplets = np.array([[0, 1, 2], [0, 1, 3], [1, 0, 2], [1, 0, 3], + [2, 3, 1], [2, 3, 0], [3, 2, 1], [3, 2, 0]]) + basis, n_basis = model._generate_bases_dist_diff(triplets, X) + # All points are along the same line, so the only possible basis will be + # the vector along that line normalized. 
+ expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2) + assert n_basis == expected_n_basis + np.testing.assert_allclose(basis, expected_basis) + + def test_lda_toy(self): + expected_n_basis = 7 + model = SCML_Supervised(n_basis=expected_n_basis) + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + y = np.array([0, 0, 1, 1]) + basis, n_basis = model._generate_bases_LDA(X, y) + # All points are along the same line, so the only possible basis will be + # the vector along that line normalized. In this case it is possible to + # obtain it with positive or negative orientations. + expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2) + assert n_basis == expected_n_basis + np.testing.assert_allclose(np.abs(basis), expected_basis) + + @pytest.mark.parametrize('n_samples', [100, 500]) + @pytest.mark.parametrize('n_features', [10, 50, 100]) + @pytest.mark.parametrize('n_classes', [5, 10, 15]) + def test_triplet_diffs(self, n_samples, n_features, n_classes): + """ + Test that the correct value of n_basis is being generated with + different triplet constraints. 
+ """ + X, y = make_classification(n_samples=n_samples, n_classes=n_classes, + n_features=n_features, n_informative=n_features, + n_redundant=0, n_repeated=0) + X = StandardScaler().fit_transform(X) + model = SCML_Supervised(n_basis=None) # Explicit n_basis=None + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, model.k_genuine, + model.k_impostor) + + msg = "As no value for `n_basis` was selected, " + with pytest.warns(UserWarning) as raised_warning: + basis, n_basis = model._generate_bases_dist_diff(triplets, X) + assert msg in str(raised_warning[0].message) + + expected_n_basis = n_features * 80 + assert n_basis == expected_n_basis + assert basis.shape == (expected_n_basis, n_features) + + @pytest.mark.parametrize('n_samples', [100, 500]) + @pytest.mark.parametrize('n_features', [10, 50, 100]) + @pytest.mark.parametrize('n_classes', [5, 10, 15]) + def test_lda(self, n_samples, n_features, n_classes): + """ + Test that when n_basis=None, the correct n_basis is generated, + for SCML_Supervised and different values of n_samples, n_features + and n_classes. 
+ """ + X, y = make_classification(n_samples=n_samples, n_classes=n_classes, + n_features=n_features, n_informative=n_features, + n_redundant=0, n_repeated=0) + X = StandardScaler().fit_transform(X) + + msg = "As no value for `n_basis` was selected, " + with pytest.warns(UserWarning) as raised_warning: + model = SCML_Supervised(n_basis=None) # Explicit n_basis=None + basis, n_basis = model._generate_bases_LDA(X, y) + assert msg in str(raised_warning[0].message) + + num_eig = min(n_classes - 1, n_features) + expected_n_basis = min(20 * n_features, n_samples * 2 * num_eig - 1) + assert n_basis == expected_n_basis + assert basis.shape == (expected_n_basis, n_features) + + @pytest.mark.parametrize('name', ['max_iter', 'output_iter', 'batch_size', + 'n_basis']) + def test_int_inputs(self, name): + value = 1.0 + d = {name: value} + scml = SCML(**d) + triplets = np.array([[[0, 1], [2, 1], [0, 0]]]) + + msg = ("%s should be an integer, instead it is of type" + " %s" % (name, type(value))) + with pytest.raises(ValueError) as raised_error: + scml.fit(triplets) + assert msg == raised_error.value.args[0] + + @pytest.mark.parametrize('name', ['max_iter', 'output_iter', 'batch_size', + 'k_genuine', 'k_impostor', 'n_basis']) + def test_int_inputs_supervised(self, name): + value = 1.0 + d = {name: value} + scml = SCML_Supervised(**d) + X = np.array([[0, 0], [1, 1], [3, 3], [4, 4]]) + y = np.array([1, 1, 0, 0]) + msg = ("%s should be an integer, instead it is of type" + " %s" % (name, type(value))) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + def test_large_output_iter(self): + scml = SCML(max_iter=1, output_iter=2, n_basis=33) # n_basis don't matter + triplets = np.array([[[0, 1], [2, 1], [0, 0]]]) + msg = ("The value of output_iter must be equal or smaller than" + " max_iter.") + + with pytest.raises(ValueError) as raised_error: + scml.fit(triplets) + assert msg == raised_error.value.args[0] + + class 
TestLSML(MetricTestCase): def test_iris(self): - lsml = LSML_Supervised(num_constraints=200) + lsml = LSML_Supervised(n_constraints=200) lsml.fit(self.iris_points, self.iris_labels) - csep = class_separation(lsml.transform(), self.iris_labels) + csep = class_separation(lsml.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.8) # it's pretty terrible class TestITML(MetricTestCase): def test_iris(self): - itml = ITML_Supervised(num_constraints=200) + itml = ITML_Supervised(n_constraints=200) itml.fit(self.iris_points, self.iris_labels) - csep = class_separation(itml.transform(), self.iris_labels) - self.assertLess(csep, 0.4) # it's not great + csep = class_separation(itml.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.2) + + +@pytest.mark.parametrize('bounds', [None, (20., 100.), [20., 100.], + np.array([20., 100.]), + np.array([[20., 100.]]), + np.array([[20], [100]])]) +def test_bounds_parameters_valid(bounds): + """Asserts that we can provide any array-like of two elements as bounds, + and that the attribute bound_ is a numpy array""" + + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y_pairs = [1, -1] + itml = ITML() + itml.fit(pairs, y_pairs, bounds=bounds) + + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + itml_supervised = ITML_Supervised() + itml_supervised.fit(X, y, bounds=bounds) + + +@pytest.mark.parametrize('bounds', ['weird', ['weird1', 'weird2'], + np.array([1, 2, 3])]) +def test_bounds_parameters_invalid(bounds): + """Assert that if a non array-like is put for bounds, or an array-like + of length different than 2, an error is returned""" + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y_pairs = [1, -1] + itml = ITML() + with pytest.raises(Exception): + itml.fit(pairs, y_pairs, bounds=bounds) + + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + itml_supervised = ITML_Supervised() + with 
pytest.raises(Exception): + itml_supervised.fit(X, y, bounds=bounds) class TestLMNN(MetricTestCase): def test_iris(self): - # Test both impls, if available. - for LMNN_cls in set((LMNN, python_LMNN)): - lmnn = LMNN_cls(k=5, learn_rate=1e-6, verbose=False) - lmnn.fit(self.iris_points, self.iris_labels) + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + lmnn.fit(self.iris_points, self.iris_labels) + + csep = class_separation(lmnn.transform(self.iris_points), + self.iris_labels) + self.assertLess(csep, 0.25) + + def test_loss_grad_lbfgs(self): + """Test gradient of loss function + Assert that the gradient is almost equal to its finite differences + approximation. + """ + rng = np.random.RandomState(42) + X, y = make_classification(random_state=rng) + L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) + lmnn = LMNN() + + k = lmnn.n_neighbors + reg = lmnn.regularization + + X, y = lmnn._prepare_inputs(X, y, dtype=float, + ensure_min_samples=2) + num_pts, n_components = X.shape + unique_labels, label_inds = np.unique(y, return_inverse=True) + lmnn.labels_ = np.arange(len(unique_labels)) + lmnn.components_ = np.eye(n_components) + + target_neighbors = lmnn._select_targets(X, label_inds) + + # sum outer products + dfG = _sum_outer_products(X, target_neighbors.flatten(), + np.repeat(np.arange(X.shape[0]), k)) + + # initialize L + def loss_grad(flat_L): + return lmnn._loss_grad(X, flat_L.reshape(-1, X.shape[1]), dfG, + k, reg, target_neighbors, label_inds) + + def fun(x): + return loss_grad(x)[1] + + def grad(x): + return loss_grad(x)[0].ravel() + + # compute relative error + epsilon = np.sqrt(np.finfo(float).eps) + rel_diff = (check_grad(fun, grad, L.ravel()) / + np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon))) + np.testing.assert_almost_equal(rel_diff, 0., decimal=5) + + +def test_loss_func(capsys): + """Test the loss function (and its gradient) on a simple example, + by comparing the results with the actual implementation of metric-learn, + with 
a very simple (but nonperformant) implementation""" + + # toy dataset to use + X, y = make_classification(n_samples=10, n_classes=2, + n_features=6, + n_redundant=0, shuffle=True, + scale=[1, 1, 20, 20, 20, 20], random_state=42) + + def hinge(a): + if a > 0: + return a, 1 + else: + return 0, 0 + + def loss_fn(L, X, y, target_neighbors, reg): + L = L.reshape(-1, X.shape[1]) + Lx = np.dot(X, L.T) + loss = 0 + total_active = 0 + grad = np.zeros_like(L) + for i in range(X.shape[0]): + for j in target_neighbors[i]: + loss += (1 - reg) * np.sum((Lx[i] - Lx[j]) ** 2) + grad += (1 - reg) * np.outer(Lx[i] - Lx[j], X[i] - X[j]) + for k in range(X.shape[0]): + if y[i] != y[k]: + hin, active = hinge(1 + np.sum((Lx[i] - Lx[j])**2) - + np.sum((Lx[i] - Lx[k])**2)) + total_active += active + if active: + loss += reg * hin + grad += (reg * (np.outer(Lx[i] - Lx[j], X[i] - X[j]) - + np.outer(Lx[i] - Lx[k], X[i] - X[k]))) + grad = 2 * grad + return grad, loss, total_active + + # we check that the gradient we have computed in the non-performant implem + # is indeed the true gradient on a toy example: + + def _select_targets(X, y, k): + target_neighbors = np.empty((X.shape[0], k), dtype=int) + for label in np.unique(y): + inds, = np.nonzero(y == label) + dd = euclidean_distances(X[inds], squared=True) + np.fill_diagonal(dd, np.inf) + nn = np.argsort(dd)[..., :k] + target_neighbors[inds] = inds[nn] + return target_neighbors + + target_neighbors = _select_targets(X, y, 2) + regularization = 0.5 + n_features = X.shape[1] + x0 = np.random.randn(1, n_features) + + def loss(x0): + return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, + regularization)[1] + + def grad(x0): + return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, + regularization)[0].ravel() + + scipy.optimize.check_grad(loss, grad, x0.ravel()) + + class LMNN_with_callback(LMNN): + """ We will use a callback to get the gradient (see later) + """ + + def __init__(self, callback, *args, **kwargs): + 
self.callback = callback + super(LMNN_with_callback, self).__init__(*args, **kwargs) + + def _loss_grad(self, *args, **kwargs): + grad, objective, total_active = ( + super(LMNN_with_callback, self)._loss_grad(*args, **kwargs)) + self.callback.append(grad) + return grad, objective, total_active + + class LMNN_nonperformant(LMNN_with_callback): + + def fit(self, X, y): + self.y = y + return super(LMNN_nonperformant, self).fit(X, y) + + def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): + grad, loss, total_active = loss_fn(L.ravel(), X, self.y, + target_neighbors, self.regularization) + self.callback.append(grad) + return grad, loss, total_active + + mem1, mem2 = [], [] + lmnn_perf = LMNN_with_callback(verbose=True, random_state=42, + init='identity', max_iter=30, callback=mem1) + lmnn_nonperf = LMNN_nonperformant(verbose=True, random_state=42, + init='identity', max_iter=30, + callback=mem2) + objectives, obj_diffs, learn_rate, total_active = (dict(), dict(), dict(), + dict()) + for algo, name in zip([lmnn_perf, lmnn_nonperf], ['perf', 'nonperf']): + algo.fit(X, y) + out, _ = capsys.readouterr() + lines = re.split("\n+", out) + # we get every variable that is printed from the algorithm in verbose + num = r'(-?\d+.?\d*(e[+|-]\d+)?)' + strings = [re.search(r"\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})" + .format(num, num, num), s) for s in lines] + objectives[name] = [float(match.group(1)) for match in strings if match is + not None] + obj_diffs[name] = [float(match.group(3)) for match in strings if match is + not None] + total_active[name] = [float(match.group(5)) for match in strings if + match is not + None] + learn_rate[name] = [float(match.group(6)) for match in strings if match is + not None] + assert len(strings) >= 10 # we ensure that we actually did more than 10 + # iterations + assert total_active[name][0] >= 2 # we ensure that we have some active + # constraints (that's the case we want to test) + # we remove the last element because it can be 
equal to the penultimate + # if the last gradient update is null + for i in range(len(mem1)): + np.testing.assert_allclose(lmnn_perf.callback[i], + lmnn_nonperf.callback[i], + err_msg='Gradient different at position ' + '{}'.format(i)) + np.testing.assert_allclose(objectives['perf'], objectives['nonperf']) + np.testing.assert_allclose(obj_diffs['perf'], obj_diffs['nonperf']) + np.testing.assert_allclose(total_active['perf'], total_active['nonperf']) + np.testing.assert_allclose(learn_rate['perf'], learn_rate['nonperf']) + + +@pytest.mark.parametrize('X, y, loss', [(np.array([[0], [1], [2], [3]]), + [1, 1, 0, 0], 3.0), + (np.array([[0], [1], [2], [3]]), + [1, 0, 0, 1], 26.)]) +def test_toy_ex_lmnn(X, y, loss): + """Test that the loss give the right result on a toy example""" + L = np.array([[1]]) + lmnn = LMNN(n_neighbors=1, regularization=0.5) + + k = lmnn.n_neighbors + reg = lmnn.regularization + + X, y = lmnn._prepare_inputs(X, y, dtype=float, + ensure_min_samples=2) + num_pts, n_components = X.shape + unique_labels, label_inds = np.unique(y, return_inverse=True) + lmnn.labels_ = np.arange(len(unique_labels)) + lmnn.components_ = np.eye(n_components) + + target_neighbors = lmnn._select_targets(X, label_inds) + + # sum outer products + dfG = _sum_outer_products(X, target_neighbors.flatten(), + np.repeat(np.arange(X.shape[0]), k)) + + # storage + a1 = [None] * k + a2 = [None] * k + for nn_idx in range(k): + a1[nn_idx] = np.array([]) + a2[nn_idx] = np.array([]) + + # assert that the loss equals the one computed by hand + assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, k, + reg, target_neighbors, label_inds)[1] == loss + - csep = class_separation(lmnn.transform(), self.iris_labels) - self.assertLess(csep, 0.25) +def test_convergence_simple_example(capsys): + # LMNN should converge on this simple example, which it did not with + # this issue: https://github.com/scikit-learn-contrib/metric-learn/issues/88 + X, y = make_classification(random_state=0) + lmnn = 
LMNN(verbose=True) + lmnn.fit(X, y) + out, _ = capsys.readouterr() + assert "LMNN converged with objective" in out + + +def test_no_twice_same_objective(capsys): + # test that the objective function never has twice the same value + # see https://github.com/scikit-learn-contrib/metric-learn/issues/88 + X, y = make_classification(random_state=0) + lmnn = LMNN(verbose=True) + lmnn.fit(X, y) + out, _ = capsys.readouterr() + lines = re.split("\n+", out) + # we get only objectives from each line: + # the regexp matches a float that follows an integer (the iteration + # number), and which is followed by a (signed) float (delta obj). It + # matches for instance: + # 3 **1113.7665747189938** -3.182774197440267 46431.0200999999999998e-06 + objectives = [re.search(r"\d* (?:(\d*.\d*))[ | -]\d*.\d*", s) + for s in lines] + objectives = [match.group(1) for match in objectives if match is not None] + # we remove the last element because it can be equal to the penultimate + # if the last gradient update is null + assert len(objectives[:-1]) == len(set(objectives[:-1])) class TestSDML(MetricTestCase): + + @pytest.mark.skipif(HAS_SKGGM, + reason="The warning can be thrown only if skggm is " + "not installed.") + def test_sdml_supervised_raises_warning_msg_not_installed_skggm(self): + """Tests that the right warning message is raised if someone tries to + use SDML_Supervised but has not installed skggm, and that the algorithm + fails to converge""" + # TODO: remove if we don't need skggm anymore + # load_iris: dataset where we know scikit-learn's graphical lasso fails + # with a Floating Point error + X, y = load_iris(return_X_y=True) + sdml_supervised = SDML_Supervised(balance_param=0.5, sparsity_param=0.01) + msg = ("There was a problem in SDML when using scikit-learn's graphical " + "lasso solver. skggm's graphical lasso can sometimes converge on " + "non SPD cases where scikit-learn's graphical lasso fails to " + "converge. 
Try to install skggm and rerun the algorithm (see " + "the README.md for the right version of skggm). The following " + "error message was thrown:") + with pytest.raises(RuntimeError) as raised_error: + sdml_supervised.fit(X, y) + assert str(raised_error.value).startswith(msg) + + @pytest.mark.skipif(HAS_SKGGM, + reason="The warning can be thrown only if skggm is " + "not installed.") + def test_sdml_raises_warning_msg_not_installed_skggm(self): + """Tests that the right warning message is raised if someone tries to + use SDML but has not installed skggm, and that the algorithm fails to + converge""" + # TODO: remove if we don't need skggm anymore + # case on which we know that scikit-learn's graphical lasso fails + # because it will return a non SPD matrix + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y_pairs = [1, -1] + sdml = SDML(prior='identity', balance_param=100, verbose=True) + + msg = ("There was a problem in SDML when using scikit-learn's graphical " + "lasso solver. skggm's graphical lasso can sometimes converge on " + "non SPD cases where scikit-learn's graphical lasso fails to " + "converge. 
Try to install skggm and rerun the algorithm (see " + "the README.md for the right version of skggm).") + with pytest.raises(RuntimeError) as raised_error: + sdml.fit(pairs, y_pairs) + assert msg == str(raised_error.value) + + @pytest.mark.skipif(not HAS_SKGGM, + reason="The warning can be thrown only if skggm is " + "installed.") + def test_sdml_raises_warning_msg_installed_skggm(self): + """Tests that the right warning message is raised if someone tries to + use SDML and has installed skggm, and that the algorithm fails to + converge""" + # TODO: remove if we don't need skggm anymore + # case on which we know that skggm's graphical lasso fails + # because it will return non finite values + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y_pairs = [1, -1] + sdml = SDML(prior='identity', balance_param=100, verbose=True) + + msg = ("There was a problem in SDML when using skggm's graphical " + "lasso solver.") + with pytest.raises(RuntimeError) as raised_error: + sdml.fit(pairs, y_pairs) + assert msg == str(raised_error.value) + + @pytest.mark.skipif(not HAS_SKGGM, + reason="The warning can be thrown only if skggm is " + "installed.") + def test_sdml_supervised_raises_warning_msg_installed_skggm(self): + """Tests that the right warning message is raised if someone tries to + use SDML_Supervised but has not installed skggm, and that the algorithm + fails to converge""" + # TODO: remove if we don't need skggm anymore + # case on which we know that skggm's graphical lasso fails + # because it will return non finite values + rng = np.random.RandomState(42) + # This example will create a diagonal em_cov with a negative coeff ( + # pathological case) + X = np.array([[-10., 0.], [10., 0.], [5., 0.], [3., 0.]]) + y = [0, 0, 1, 1] + sdml_supervised = SDML_Supervised(balance_param=0.5, prior='identity', + sparsity_param=0.01, random_state=rng) + msg = ("There was a problem in SDML when using skggm's graphical " + "lasso solver.") + with 
pytest.raises(RuntimeError) as raised_error: + sdml_supervised.fit(X, y) + assert msg == str(raised_error.value) + + @pytest.mark.skipif(not HAS_SKGGM, + reason="It's only in the case where skggm is installed" + "that no warning should be thrown.") + def test_raises_no_warning_installed_skggm(self): + # otherwise we should be able to instantiate and fit SDML and it + # should raise no error and no ConvergenceWarning + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]]) + y_pairs = [1, -1] + X, y = make_classification(random_state=42) + with warnings.catch_warnings(record=True) as records: + sdml = SDML(prior='covariance') + sdml.fit(pairs, y_pairs) + for record in records: + assert record.category is not ConvergenceWarning + with warnings.catch_warnings(record=True) as records: + sdml_supervised = SDML_Supervised(prior='identity', balance_param=1e-5) + sdml_supervised.fit(X, y) + for record in records: + assert record.category is not ConvergenceWarning + def test_iris(self): # Note: this is a flaky test, which fails for certain seeds. # TODO: un-flake it! - np.random.seed(5555) + rs = np.random.RandomState(5555) - sdml = SDML_Supervised(num_constraints=1500) + sdml = SDML_Supervised(n_constraints=1500, prior='identity', + balance_param=5e-5, random_state=rs) sdml.fit(self.iris_points, self.iris_labels) - csep = class_separation(sdml.transform(), self.iris_labels) - self.assertLess(csep, 0.25) + csep = class_separation(sdml.transform(self.iris_points), + self.iris_labels) + self.assertLess(csep, 0.22) + + def test_sdml_raises_warning_non_psd(self): + """Tests that SDML raises a warning on a toy example where we know the + pseudo-covariance matrix is not PSD""" + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y = [1, -1] + sdml = SDML(prior='covariance', sparsity_param=0.01, balance_param=0.5) + msg = ("Warning, the input matrix of graphical lasso is not " + "positive semi-definite (PSD). 
The algorithm may diverge, " + "and lead to degenerate solutions. " + "To prevent that, try to decrease the balance parameter " + "`balance_param` and/or to set prior='identity'.") + with pytest.warns(ConvergenceWarning) as raised_warning: + try: + sdml.fit(pairs, y) + except Exception: + pass + # we assert that this warning is in one of the warning raised by the + # estimator + assert msg in list(map(lambda w: str(w.message), raised_warning)) + + def test_sdml_converges_if_psd(self): + """Tests that sdml converges on a simple problem where we know the + pseudo-covariance matrix is PSD""" + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]]) + y = [1, -1] + sdml = SDML(prior='covariance', sparsity_param=0.01, balance_param=0.5) + sdml.fit(pairs, y) + assert np.isfinite(sdml.get_mahalanobis_matrix()).all() + + @pytest.mark.skipif(not HAS_SKGGM, + reason="sklearn's graphical_lasso can sometimes not " + "work on some non SPD problems. We test that " + "is works only if skggm is installed.") + def test_sdml_works_on_non_spd_pb_with_skggm(self): + """Test that SDML works on a certain non SPD problem on which we know + it should work, but scikit-learn's graphical_lasso does not work""" + X, y = load_iris(return_X_y=True) + sdml = SDML_Supervised(balance_param=0.5, sparsity_param=0.01, + prior='covariance', + random_state=np.random.RandomState(42)) + sdml.fit(X, y) + + +@pytest.mark.skipif(not HAS_SKGGM, + reason='The message should be printed only if skggm is ' + 'installed.') +def test_verbose_has_installed_skggm_sdml(capsys): + # Test that if users have installed skggm, a message is printed telling them + # skggm's solver is used (when they use SDML) + # TODO: remove if we don't need skggm anymore + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]]) + y_pairs = [1, -1] + sdml = SDML(verbose=True, prior='covariance') + sdml.fit(pairs, y_pairs) + out, _ = capsys.readouterr() + assert "SDML will use skggm's graphical lasso solver." 
in out + + +@pytest.mark.skipif(not HAS_SKGGM, + reason='The message should be printed only if skggm is ' + 'installed.') +def test_verbose_has_installed_skggm_sdml_supervised(capsys): + # Test that if users have installed skggm, a message is printed telling them + # skggm's solver is used (when they use SDML_Supervised) + # TODO: remove if we don't need skggm anymore + X, y = load_iris(return_X_y=True) + sdml = SDML_Supervised(verbose=True, prior='identity', balance_param=1e-5) + sdml.fit(X, y) + out, _ = capsys.readouterr() + assert "SDML will use skggm's graphical lasso solver." in out + + +@pytest.mark.skipif(HAS_SKGGM, + reason='The message should be printed only if skggm is ' + 'not installed.') +def test_verbose_has_not_installed_skggm_sdml(capsys): + # Test that if users have installed skggm, a message is printed telling them + # skggm's solver is used (when they use SDML) + # TODO: remove if we don't need skggm anymore + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]]) + y_pairs = [1, -1] + sdml = SDML(verbose=True, prior='covariance') + sdml.fit(pairs, y_pairs) + out, _ = capsys.readouterr() + assert "SDML will use scikit-learn's graphical lasso solver." in out + + +@pytest.mark.skipif(HAS_SKGGM, + reason='The message should be printed only if skggm is ' + 'not installed.') +def test_verbose_has_not_installed_skggm_sdml_supervised(capsys): + # Test that if users have installed skggm, a message is printed telling them + # skggm's solver is used (when they use SDML_Supervised) + # TODO: remove if we don't need skggm anymore + X, y = make_classification(random_state=42) + sdml = SDML_Supervised(verbose=True, balance_param=1e-5, prior='identity') + sdml.fit(X, y) + out, _ = capsys.readouterr() + assert "SDML will use scikit-learn's graphical lasso solver." 
in out class TestNCA(MetricTestCase): def test_iris(self): n = self.iris_points.shape[0] - nca = NCA(max_iter=(100000//n), learning_rate=0.01) + + # Without dimension reduction + nca = NCA(max_iter=(100000 // n)) + nca.fit(self.iris_points, self.iris_labels) + csep = class_separation(nca.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.15) + + # With dimension reduction + nca = NCA(max_iter=(100000 // n), n_components=2) nca.fit(self.iris_points, self.iris_labels) + csep = class_separation(nca.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.20) + + def test_finite_differences(self): + """Test gradient of loss function + + Assert that the gradient is almost equal to its finite differences + approximation. + """ + # Initialize the transformation `M`, as well as `X` and `y` and `NCA` + X, y = make_classification() + M = np.random.randn(np.random.randint(1, X.shape[1] + 1), X.shape[1]) + mask = y[:, np.newaxis] == y[np.newaxis, :] + nca = NCA() + nca.n_iter_ = 0 + + def fun(M): + return nca._loss_grad_lbfgs(M, X, mask)[0] + + def grad(M): + return nca._loss_grad_lbfgs(M, X, mask)[1].ravel() + + # compute relative error + epsilon = np.sqrt(np.finfo(float).eps) + rel_diff = (check_grad(fun, grad, M.ravel()) / + np.linalg.norm(approx_fprime(M.ravel(), fun, epsilon))) + np.testing.assert_almost_equal(rel_diff, 0., decimal=6) + + def test_simple_example(self): + """Test on a simple example. + + Puts four points in the input space where the opposite labels points are + next to each other. After transform the same labels points should be next + to each other. 
+ + """ + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + nca = NCA(n_components=2,) + nca.fit(X, y) + Xansformed = nca.transform(X) + np.testing.assert_equal(pairwise_distances(Xansformed).argsort()[:, 1], + np.array([2, 3, 0, 1])) + + def test_singleton_class(self): + X = self.iris_points + y = self.iris_labels + + # one singleton class: test fitting works + singleton_class = 1 + ind_singleton, = np.where(y == singleton_class) + y[ind_singleton] = 2 + y[ind_singleton[0]] = singleton_class + + nca = NCA(max_iter=30) + nca.fit(X, y) + + # One non-singleton class: test fitting works + ind_1, = np.where(y == 1) + ind_2, = np.where(y == 2) + y[ind_1] = 0 + y[ind_1[0]] = 1 + y[ind_2] = 0 + y[ind_2[0]] = 2 - # Result copied from Iris example at - # https://github.com/vomjom/nca/blob/master/README.mkd - expected = [[-0.09935, -0.2215, 0.3383, 0.443], - [+0.2532, 0.5835, -0.8461, -0.8915], - [-0.729, -0.6386, 1.767, 1.832], - [-0.9405, -0.8461, 2.281, 2.794]] - assert_array_almost_equal(expected, nca.transformer(), decimal=3) + nca = NCA(max_iter=30) + nca.fit(X, y) + + # Only singleton classes: test fitting does nothing (the gradient + # must be null in this case, so the final matrix must stay like + # the initialization) + ind_0, = np.where(y == 0) + ind_1, = np.where(y == 1) + ind_2, = np.where(y == 2) + X = X[[ind_0[0], ind_1[0], ind_2[0]]] + y = y[[ind_0[0], ind_1[0], ind_2[0]]] + + A = make_spd_matrix(n_dim=X.shape[1], random_state=X.shape[1]) + nca = NCA(init=A, max_iter=30, n_components=X.shape[1]) + nca.fit(X, y) + assert_array_equal(nca.components_, A) + + def test_one_class(self): + # if there is only one class the gradient is null, so the final matrix + # must stay like the initialization + X = self.iris_points[self.iris_labels == 0] + y = self.iris_labels[self.iris_labels == 0] + + A = make_spd_matrix(n_dim=X.shape[1], random_state=X.shape[1]) + nca = NCA(init=A, max_iter=30, n_components=X.shape[1]) + nca.fit(X, y) + 
assert_array_equal(nca.components_, A) class TestLFDA(MetricTestCase): def test_iris(self): - lfda = LFDA(k=2, dim=2) + lfda = LFDA(k=2, n_components=2) lfda.fit(self.iris_points, self.iris_labels) - csep = class_separation(lfda.transform(), self.iris_labels) + csep = class_separation(lfda.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.15) + # Sanity checks for learned matrices. + self.assertEqual(lfda.get_mahalanobis_matrix().shape, (4, 4)) + self.assertEqual(lfda.components_.shape, (2, 4)) + class TestRCA(MetricTestCase): def test_iris(self): - rca = RCA_Supervised(dim=2, num_chunks=30, chunk_size=2) + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2) rca.fit(self.iris_points, self.iris_labels) - csep = class_separation(rca.transform(), self.iris_labels) + csep = class_separation(rca.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.29) + + def test_rank_deficient_returns_warning(self): + """Checks that if the covariance matrix is not invertible, we raise a + warning message advising to use PCA""" + X, y = load_iris(return_X_y=True) + # we make the fourth column a linear combination of the two first, + # so that the covariance matrix will not be invertible: + X[:, 3] = X[:, 0] + 3 * X[:, 1] + rca = RCA() + msg = ('The inner covariance matrix is not invertible, ' + 'so the transformation matrix may contain Nan values. 
' + 'You should remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + + with warnings.catch_warnings(record=True) as raised_warnings: + rca.fit(X, y) + assert any(str(w.message) == msg for w in raised_warnings) + + def test_unknown_labels(self): + n = 200 + n_chunks = 50 + X, y = make_classification(random_state=42, n_samples=2 * n, + n_features=6, n_informative=6, n_redundant=0) + y2 = np.concatenate((y[:n], -np.ones(n))) + + rca = RCA_Supervised(n_chunks=n_chunks, random_state=42) + rca.fit(X[:n], y[:n]) + + rca2 = RCA_Supervised(n_chunks=n_chunks, random_state=42) + rca2.fit(X, y2) + + assert not np.any(np.isnan(rca.components_)) + assert not np.any(np.isnan(rca2.components_)) + + np.testing.assert_array_equal(rca.components_, rca2.components_) + + def test_bad_parameters(self): + n = 200 + n_chunks = 3 + X, y = make_classification(random_state=42, n_samples=n, + n_features=6, n_informative=6, n_redundant=0) + + rca = RCA_Supervised(n_chunks=n_chunks, random_state=42) + msg = ('Due to the parameters of RCA_Supervised, ' + 'the inner covariance matrix is not invertible, ' + 'so the transformation matrix will contain Nan values. ' + 'Increase the number or size of the chunks to correct ' + 'this problem.' + ) + with warnings.catch_warnings(record=True) as raised_warning: + rca.fit(X, y) + assert any(str(w.message) == msg for w in raised_warning) + + +class TestMLKR(MetricTestCase): + def test_iris(self): + mlkr = MLKR() + mlkr.fit(self.iris_points, self.iris_labels) + csep = class_separation(mlkr.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.25) + def test_finite_differences(self): + """Test gradient of loss function + + Assert that the gradient is almost equal to its finite differences + approximation. 
+ """ + # Initialize the transformation `M`, as well as `X`, and `y` and `MLKR` + X, y = make_regression(n_features=4, random_state=1, n_samples=20) + X, y = check_X_y(X, y) + M = np.random.randn(2, X.shape[1]) + mlkr = MLKR() + mlkr.n_iter_ = 0 + + def fun(M): + return mlkr._loss(M, X, y)[0] + + def grad_fn(M): + return mlkr._loss(M, X, y)[1].ravel() + + # compute relative error + rel_diff = check_grad(fun, grad_fn, M.ravel()) / np.linalg.norm(grad_fn(M)) + np.testing.assert_almost_equal(rel_diff, 0.) + + +class TestMMC(MetricTestCase): + def test_iris(self): + + # Generate full set of constraints for comparison with reference + # implementation + mask = self.iris_labels[None] == self.iris_labels[:, None] + a, b = np.nonzero(np.triu(mask, k=1)) + c, d = np.nonzero(np.triu(~mask, k=1)) + + # Full metric + n_features = self.iris_points.shape[1] + mmc = MMC(tol=0.01, init=np.eye(n_features) / 10) + mmc.fit(*wrap_pairs(self.iris_points, [a, b, c, d])) + expected = [[+0.000514, +0.000868, -0.001195, -0.001703], + [+0.000868, +0.001468, -0.002021, -0.002879], + [-0.001195, -0.002021, +0.002782, +0.003964], + [-0.001703, -0.002879, +0.003964, +0.005648]] + assert_array_almost_equal(expected, mmc.get_mahalanobis_matrix(), + decimal=6) + + # Diagonal metric + mmc = MMC(diagonal=True) + mmc.fit(*wrap_pairs(self.iris_points, [a, b, c, d])) + expected = [0, 0, 1.210220, 1.228596] + assert_array_almost_equal(np.diag(expected), mmc.get_mahalanobis_matrix(), + decimal=6) + + # Supervised Full + mmc = MMC_Supervised() + mmc.fit(self.iris_points, self.iris_labels) + csep = class_separation(mmc.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.15) + + # Supervised Diagonal + mmc = MMC_Supervised(diagonal=True) + mmc.fit(self.iris_points, self.iris_labels) + csep = class_separation(mmc.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.2) + + +@pytest.mark.parametrize(('algo_class', 'dataset'), + [(NCA, make_classification()), + (MLKR, 
make_regression())]) +def test_verbose(algo_class, dataset, capsys): + # assert there is proper output when verbose = True + X, y = dataset + model = algo_class(verbose=True) + model.fit(X, y) + out, _ = capsys.readouterr() + + # check output + lines = re.split('\n+', out) + header = '{:>10} {:>20} {:>10}'.format('Iteration', 'Objective Value', + 'Time(s)') + assert lines[0] == '[{}]'.format(algo_class.__name__) + assert lines[1] == '[{}] {}'.format(algo_class.__name__, header) + assert lines[2] == '[{}] {}'.format(algo_class.__name__, '-' * len(header)) + for line in lines[3:-2]: + # The following regex will match for instance: + # '[NCA] 0 6.988936e+01 0.01' + assert re.match(r"\[" + algo_class.__name__ + r"\]\ *\d+\ *\d\.\d{6}e[+|-]" + r"\d+\ *\d+\.\d{2}", line) + assert re.match(r"\[" + algo_class.__name__ + r"\] Training took\ *" + r"\d+\.\d{2}s\.", lines[-2]) + assert lines[-1] == '' + + +@pytest.mark.parametrize(('algo_class', 'dataset'), + [(NCA, make_classification()), + (MLKR, make_regression(n_features=10))]) +def test_no_verbose(dataset, algo_class, capsys): + # assert by default there is no output (verbose=False) + X, y = dataset + model = algo_class() + model.fit(X, y) + out, _ = capsys.readouterr() + # check output + assert (out == '') + + +@pytest.mark.parametrize(('algo_class', 'dataset'), + [(NCA, make_classification()), + (MLKR, make_regression(n_features=10))]) +def test_convergence_warning(dataset, algo_class): + X, y = dataset + model = algo_class(max_iter=2, verbose=True) + cls_name = model.__class__.__name__ + msg = '[{}] {} did not converge'.format(cls_name, cls_name) + with pytest.warns(Warning) as raised_warning: + model.fit(X, y) + assert any([msg in str(warn.message) for warn in raised_warning]) + if __name__ == '__main__': unittest.main() diff --git a/test/test_base_metric.py b/test/test_base_metric.py new file mode 100644 index 00000000..b1e71020 --- /dev/null +++ b/test/test_base_metric.py @@ -0,0 +1,304 @@ +from numpy.core.numeric 
def remove_spaces(s):
    """Return *s* with every whitespace character (spaces, tabs,
    newlines) removed, so string comparisons ignore formatting."""
    # str.split() with no separator splits on any whitespace run and
    # drops it, so joining the pieces strips all whitespace.
    return ''.join(s.split())
remove_spaces(f"NCA({merged_kwargs})")) + + def test_lfda(self): + def_kwargs = {'embedding_type': 'weighted', 'k': None, + 'n_components': None, 'preprocessor': None} + nndef_kwargs = {'k': 2} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.LFDA(k=2))), + remove_spaces(f"LFDA({merged_kwargs})")) + + def test_itml(self): + def_kwargs = {'tol': 0.001, 'gamma': 1.0, + 'max_iter': 1000, 'preprocessor': None, + 'prior': 'identity', 'random_state': None, 'verbose': False} + nndef_kwargs = {'gamma': 0.5} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.ITML(gamma=0.5))), + remove_spaces(f"ITML({merged_kwargs})")) + def_kwargs = {'tol': 0.001, 'gamma': 1.0, + 'max_iter': 1000, 'n_constraints': None, + 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'verbose': False} + nndef_kwargs = {'n_constraints': 7} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.ITML_Supervised(n_constraints=7))), + remove_spaces(f"ITML_Supervised({merged_kwargs})")) + + def test_lsml(self): + def_kwargs = {'max_iter': 1000, 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'tol': 0.001, 'verbose': False} + nndef_kwargs = {'tol': 0.1} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.LSML(tol=0.1))), + remove_spaces(f"LSML({merged_kwargs})")) + def_kwargs = {'max_iter': 1000, 'n_constraints': None, + 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'tol': 0.001, 'verbose': False, + 'weights': None} + nndef_kwargs = {'verbose': True} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.LSML_Supervised(verbose=True))), + remove_spaces(f"LSML_Supervised({merged_kwargs})")) + + def test_sdml(self): + def_kwargs = {'balance_param': 0.5, 'preprocessor': 
None, + 'prior': 'identity', 'random_state': None, + 'sparsity_param': 0.01, 'verbose': False} + nndef_kwargs = {'verbose': True} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.SDML(verbose=True))), + remove_spaces(f"SDML({merged_kwargs})")) + def_kwargs = {'balance_param': 0.5, 'n_constraints': None, + 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'sparsity_param': 0.01, + 'verbose': False} + nndef_kwargs = {'sparsity_param': 0.5} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.SDML_Supervised(sparsity_param=0.5))), + remove_spaces(f"SDML_Supervised({merged_kwargs})")) + + def test_rca(self): + def_kwargs = {'n_components': None, 'preprocessor': None} + nndef_kwargs = {'n_components': 3} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.RCA(n_components=3))), + remove_spaces(f"RCA({merged_kwargs})")) + def_kwargs = {'chunk_size': 2, 'n_components': None, 'n_chunks': 100, + 'preprocessor': None, 'random_state': None} + nndef_kwargs = {'n_chunks': 5} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.RCA_Supervised(n_chunks=5))), + remove_spaces(f"RCA_Supervised({merged_kwargs})")) + + def test_mlkr(self): + def_kwargs = {'init': 'auto', 'max_iter': 1000, + 'n_components': None, 'preprocessor': None, + 'random_state': None, 'tol': None, 'verbose': False} + nndef_kwargs = {'max_iter': 777} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.MLKR(max_iter=777))), + remove_spaces(f"MLKR({merged_kwargs})")) + + def test_mmc(self): + def_kwargs = {'tol': 0.001, 'diagonal': False, + 'diagonal_c': 1.0, 'init': 'identity', 'max_iter': 100, + 'max_proj': 10000, 'preprocessor': None, + 'random_state': None, 'verbose': False} + nndef_kwargs = {'diagonal': 
@pytest.mark.parametrize('estimator, build_dataset', metric_learners,
                         ids=ids_metric_learners)
def test_get_metric_is_independent_from_metric_learner(estimator,
                                                       build_dataset):
    """The callable returned by get_metric must be a snapshot: refitting the
    learner afterwards must not change distances it computes."""
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)

    # Fit once and capture the metric and a reference distance.
    model.fit(*remove_y(model, input_data, labels))
    metric = model.get_metric()
    first_score = metric(X[0], X[1])

    # Refit on different data; the captured metric must be unaffected.
    model.fit(*remove_y(model, np.sin(input_data), labels))
    assert metric(X[0], X[1]) == first_score
@pytest.mark.parametrize('estimator, build_dataset', metric_learners,
                         ids=ids_metric_learners)
def test_get_metric_works_does_not_raise(estimator, build_dataset):
    """Tests that the metric returned by get_metric does not raise errors (or
    warnings) similarly to the distance functions in scipy.spatial.distance"""
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y(model, input_data, labels))
    metric = model.get_metric()

    def assert_silent(u, v):
        # calling the metric must emit no warning at all
        with warnings.catch_warnings(record=True) as record:
            metric(u, v)
        assert len(record) == 0

    # arrays, plain lists, and 2D one-row inputs must all be accepted
    for u, v in [(X[0], X[1]),
                 (X[0].tolist(), X[1].tolist()),
                 (X[0][None], X[1][None])]:
        assert_silent(u, v)

    # Test that the scalar case works
    model.components_ = np.array([3.1])
    metric = model.get_metric()
    for u, v in [(5, 6.7), ([5], [6.7]), ([[5]], [[6.7]])]:
        assert_silent(u, v)
model.fit(*remove_y(model, input_data, labels)) + assert model.components_.shape == (X.shape[1] - 1, X.shape[1]) + + model = clone(estimator) + set_random_state(model) + model.set_params(n_components=X.shape[1] + 1) + with pytest.raises(ValueError) as expected_err: + model.fit(*remove_y(model, input_data, labels)) + assert (str(expected_err.value) == + 'Invalid n_components, must be in [1, {}]'.format(X.shape[1])) + + model = clone(estimator) + set_random_state(model) + model.set_params(n_components=0) + with pytest.raises(ValueError) as expected_err: + model.fit(*remove_y(model, input_data, labels)) + assert (str(expected_err.value) == + 'Invalid n_components, must be in [1, {}]'.format(X.shape[1])) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_score_pairs_warning(estimator, build_dataset): + """Tests that score_pairs returns a FutureWarning regarding deprecation. + Also that score_pairs and pair_distance have the same behaviour""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + + # We fit the metric learner on it and then we call score_pairs on some + # points + model.fit(*remove_y(model, input_data, labels)) + + msg = ("score_pairs will be deprecated in release 0.7.0. 
" + "Use pair_score to compute similarity scores, or " + "pair_distances to compute distances.") + with pytest.warns(FutureWarning) as raised_warning: + score = model.score_pairs([[X[0], X[1]], ]) + dist = model.pair_distance([[X[0], X[1]], ]) + assert array_equal(score, dist) + assert any([str(warning.message) == msg for warning in raised_warning]) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_components_metric_conversion.py b/test/test_components_metric_conversion.py new file mode 100644 index 00000000..c6113957 --- /dev/null +++ b/test/test_components_metric_conversion.py @@ -0,0 +1,181 @@ +import unittest +import numpy as np +import pytest +from scipy.stats import ortho_group +from sklearn.datasets import load_iris +from numpy.testing import assert_array_almost_equal, assert_allclose +from metric_learn.sklearn_shims import ignore_warnings + +from metric_learn import ( + LMNN, NCA, LFDA, Covariance, MLKR, + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised) +from metric_learn._util import components_from_metric +from metric_learn.exceptions import NonPSDError + + +class TestTransformerMetricConversion(unittest.TestCase): + @classmethod + def setUpClass(self): + # runs once per test class + iris_data = load_iris() + self.X = iris_data['data'] + self.y = iris_data['target'] + + def test_cov(self): + cov = Covariance() + cov.fit(self.X) + L = cov.components_ + assert_array_almost_equal(L.T.dot(L), cov.get_mahalanobis_matrix()) + + def test_lsml_supervised(self): + seed = np.random.RandomState(1234) + lsml = LSML_Supervised(n_constraints=200, random_state=seed) + lsml.fit(self.X, self.y) + L = lsml.components_ + assert_array_almost_equal(L.T.dot(L), lsml.get_mahalanobis_matrix()) + + def test_itml_supervised(self): + seed = np.random.RandomState(1234) + itml = ITML_Supervised(n_constraints=200, random_state=seed) + itml.fit(self.X, self.y) + L = itml.components_ + assert_array_almost_equal(L.T.dot(L), 
itml.get_mahalanobis_matrix()) + + def test_lmnn(self): + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + lmnn.fit(self.X, self.y) + L = lmnn.components_ + assert_array_almost_equal(L.T.dot(L), lmnn.get_mahalanobis_matrix()) + + def test_sdml_supervised(self): + seed = np.random.RandomState(1234) + sdml = SDML_Supervised(n_constraints=1500, prior='identity', + balance_param=1e-5, random_state=seed) + sdml.fit(self.X, self.y) + L = sdml.components_ + assert_array_almost_equal(L.T.dot(L), sdml.get_mahalanobis_matrix()) + + def test_nca(self): + n = self.X.shape[0] + nca = NCA(max_iter=(100000 // n)) + nca.fit(self.X, self.y) + L = nca.components_ + assert_array_almost_equal(L.T.dot(L), nca.get_mahalanobis_matrix()) + + def test_lfda(self): + lfda = LFDA(k=2, n_components=2) + lfda.fit(self.X, self.y) + L = lfda.components_ + assert_array_almost_equal(L.T.dot(L), lfda.get_mahalanobis_matrix()) + + def test_rca_supervised(self): + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2) + rca.fit(self.X, self.y) + L = rca.components_ + assert_array_almost_equal(L.T.dot(L), rca.get_mahalanobis_matrix()) + + def test_mlkr(self): + mlkr = MLKR(n_components=2) + mlkr.fit(self.X, self.y) + L = mlkr.components_ + assert_array_almost_equal(L.T.dot(L), mlkr.get_mahalanobis_matrix()) + + @ignore_warnings + def test_components_from_metric_edge_cases(self): + """Test that components_from_metric returns the right result in various + edge cases""" + rng = np.random.RandomState(42) + + # an orthonormal matrix useful for creating matrices with given + # eigenvalues: + P = ortho_group.rvs(7, random_state=rng) + + # matrix with all its coefficients very low (to check that the algorithm + # does not consider it as a diagonal matrix)(non regression test for + # https://github.com/scikit-learn-contrib/metric-learn/issues/175) + M = np.diag([1e-15, 2e-16, 3e-15, 4e-16, 5e-15, 6e-16, 7e-15]) + M = P.dot(M).dot(P.T) + L = components_from_metric(M) + 
assert_allclose(L.T.dot(L), M) + + # diagonal matrix + M = np.diag(np.abs(rng.randn(5))) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # low-rank matrix (with zeros) + M = np.zeros((7, 7)) + small_random = rng.randn(3, 3) + M[:3, :3] = small_random.T.dot(small_random) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # low-rank matrix (without necessarily zeros) + R = np.abs(rng.randn(7, 7)) + M = R.dot(np.diag([1, 5, 3, 2, 0, 0, 0])).dot(R.T) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # matrix with a determinant still high but which is + # undefinite w.r.t to numpy standards + M = np.diag([1e5, 1e5, 1e5, 1e5, 1e5, 1e5, 1e-20]) + M = P.dot(M).dot(P.T) + assert np.abs(np.linalg.det(M)) > 10 + assert np.linalg.slogdet(M)[1] > 1 # (just to show that the computed + # determinant is far from null) + assert np.linalg.matrix_rank(M) < M.shape[0] + # (just to show that this case is indeed considered by numpy as an + # indefinite case) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # matrix with lots of small nonzeros that make a big zero when multiplied + M = np.diag([1e-3, 1e-3, 1e-3, 1e-3, 1e-3, 1e-3, 1e-3]) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # full rank matrix + M = rng.randn(10, 10) + M = M.T.dot(M) + assert np.linalg.matrix_rank(M) == 10 + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + def test_non_symmetric_matrix_raises(self): + """Checks that if a non symmetric matrix is given to + components_from_metric, an error is thrown""" + rng = np.random.RandomState(42) + M = rng.randn(10, 10) + with pytest.raises(ValueError) as raised_error: + components_from_metric(M) + assert str(raised_error.value) == "The input metric should be symmetric." + + def test_non_psd_raises(self): + """Checks that a non PSD matrix (i.e. 
with negative eigenvalues) will + raise an error when passed to components_from_metric""" + rng = np.random.RandomState(42) + D = np.diag([1, 5, 3, 4.2, -4, -2, 1]) + P = ortho_group.rvs(7, random_state=rng) + M = P.dot(D).dot(P.T) + msg = ("Matrix is not positive semidefinite (PSD).") + with pytest.raises(NonPSDError) as raised_error: + components_from_metric(M) + assert str(raised_error.value) == msg + with pytest.raises(NonPSDError) as raised_error: + components_from_metric(D) + assert str(raised_error.value) == msg + + def test_almost_psd_dont_raise(self): + """Checks that if the metric is almost PSD (i.e. it has some negative + eigenvalues very close to zero), then components_from_metric will still + work""" + rng = np.random.RandomState(42) + D = np.diag([1, 5, 3, 4.2, -1e-20, -2e-20, -1e-20]) + P = ortho_group.rvs(7, random_state=rng) + M = P.dot(D).dot(P.T) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_constraints.py b/test/test_constraints.py new file mode 100644 index 00000000..3429d9cc --- /dev/null +++ b/test/test_constraints.py @@ -0,0 +1,188 @@ +import pytest +import numpy as np +from sklearn.utils import shuffle +from metric_learn.constraints import Constraints +from sklearn.datasets import make_blobs + +SEED = 42 + + +def gen_labels_for_chunks(n_chunks, chunk_size, + n_classes=10, n_unknown_labels=5): + """Generates n_chunks*chunk_size labels that split in n_chunks chunks, + that are homogeneous in the label.""" + assert min(n_chunks, chunk_size) > 0 + classes = shuffle(np.arange(n_classes), random_state=SEED) + n_per_class = chunk_size * (n_chunks // n_classes) + n_maj_class = chunk_size * n_chunks - n_per_class * (n_classes - 1) + + first_labels = classes[0] * np.ones(n_maj_class, dtype=int) + remaining_labels = np.concatenate([k * np.ones(n_per_class, dtype=int) + for k in classes[1:]]) + unknown_labels = -1 * np.ones(n_unknown_labels, dtype=int) + + 
@pytest.mark.parametrize("n_chunks, chunk_size", [(5, 10), (10, 50)])
def test_unknown_labels_not_in_chunks(n_chunks, chunk_size):
    """Checks that unknown labels are not assigned to any chunk."""
    labels = gen_labels_for_chunks(n_chunks, chunk_size)
    chunks = Constraints(labels).chunks(n_chunks=n_chunks,
                                        chunk_size=chunk_size,
                                        random_state=SEED)
    # Points labelled -1 (unknown) must keep a negative chunk id.
    assert np.all(chunks[labels < 0] < 0)
@pytest.mark.parametrize("k_genuine, k_impostor,",
                         [(3, 3), (2, 4), (3, 4), (10, 9), (144, 33)])
def test_generate_knntriplets(k_genuine, k_impostor):
    """Checks edge and over the edge cases of knn triplet construction with not
    enough neighbors"""

    # Expected triplets once both k's are clipped to the available neighbors.
    T_test = [[0, 1, 3], [0, 1, 4], [0, 1, 5], [0, 2, 3], [0, 2, 4], [0, 2, 5],
              [1, 0, 3], [1, 0, 4], [1, 0, 5], [1, 2, 3], [1, 2, 4], [1, 2, 5],
              [2, 0, 3], [2, 0, 4], [2, 0, 5], [2, 1, 3], [2, 1, 4], [2, 1, 5],
              [3, 4, 0], [3, 4, 1], [3, 4, 2], [3, 5, 0], [3, 5, 1], [3, 5, 2],
              [4, 3, 0], [4, 3, 1], [4, 3, 2], [4, 5, 0], [4, 5, 1], [4, 5, 2],
              [5, 3, 0], [5, 3, 1], [5, 3, 2], [5, 4, 0], [5, 4, 1], [5, 4, 2]]

    X = np.array([[0, 0], [2, 2], [4, 4], [8, 8], [16, 16], [32, 32],
                  [33, 33]])
    y = np.array([1, 1, 1, 2, 2, 2, -1])

    msg1 = ("The class 1 has 3 elements, which is not sufficient to "
            f"generate {k_genuine+1} genuine neighbors "
            "as specified by k_genuine")
    msg2 = ("The class 2 has 3 elements, which is not sufficient to "
            f"generate {k_genuine+1} genuine neighbors "
            "as specified by k_genuine")
    msg3 = ("The class 1 has 3 elements of other classes, which is "
            f"not sufficient to generate {k_impostor} impostor "
            "neighbors as specified by k_impostor")
    msg4 = ("The class 2 has 3 elements of other classes, which is "
            f"not sufficient to generate {k_impostor} impostor "
            "neighbors as specified by k_impostor")
    msgs = [msg1, msg2, msg3, msg4]

    with pytest.warns(UserWarning) as user_warning:
        T = Constraints(y).generate_knntriplets(X, k_genuine, k_impostor)
    # BUG FIX: the original asserted `any([[msg in ... for msg in msgs]
    # for warn in user_warning])`, i.e. the truthiness of non-empty inner
    # *lists* — vacuously True whenever any warning was raised. Flatten the
    # generator so we really check that some warning carries an expected
    # message.
    assert any(msg in str(warn.message)
               for warn in user_warning for msg in msgs)
    assert np.array_equal(sorted(T.tolist()), T_test)
+ "\n" + .format(label[idx], k_genuine, k_genuine+1, k_genuine-1)) + + with pytest.warns(UserWarning) as raised_warning: + Constraints(y).generate_knntriplets(X, k_genuine, 1) + for warn in raised_warning: + assert str(warn.message) in warn_msgs + + +def test_generate_knntriplets_k_impostor(): + """Checks the correct error raised when k_impostor is too big """ + X, y = shuffle(*make_blobs(random_state=SEED), + random_state=SEED) + + length = len(y) + label, labels_count = np.unique(y, return_counts=True) + labels_count_max = np.max(labels_count) + idx_biggest_label, = np.where(labels_count == labels_count_max) + k_impostor = length - labels_count_max + 1 + + warn_msgs = [] + for idx in idx_biggest_label: + warn_msgs.append("The class {} has {} elements of other classes, which is" + " not sufficient to generate {} impostor neighbors as " + "specified by k_impostor. Will generate {} impostor " + "neighbors instead.\n" + .format(label[idx], k_impostor-1, k_impostor, + k_impostor-1)) + + with pytest.warns(UserWarning) as raised_warning: + Constraints(y).generate_knntriplets(X, 1, k_impostor) + for warn in raised_warning: + assert str(warn.message) in warn_msgs diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py new file mode 100644 index 00000000..246223b0 --- /dev/null +++ b/test/test_fit_transform.py @@ -0,0 +1,138 @@ +import unittest +import numpy as np +from sklearn.datasets import load_iris +from numpy.testing import assert_array_almost_equal + +from metric_learn import ( + LMNN, NCA, LFDA, Covariance, MLKR, + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, + MMC_Supervised) + + +class TestFitTransform(unittest.TestCase): + @classmethod + def setUpClass(self): + # runs once per test class + iris_data = load_iris() + self.X = iris_data['data'] + self.y = iris_data['target'] + + def test_cov(self): + cov = Covariance() + cov.fit(self.X) + res_1 = cov.transform(self.X) + + cov = Covariance() + res_2 = cov.fit_transform(self.X) + # 
deterministic result + assert_array_almost_equal(res_1, res_2) + + def test_lsml_supervised(self): + seed = np.random.RandomState(1234) + lsml = LSML_Supervised(n_constraints=200, random_state=seed) + lsml.fit(self.X, self.y) + res_1 = lsml.transform(self.X) + + seed = np.random.RandomState(1234) + lsml = LSML_Supervised(n_constraints=200, random_state=seed) + res_2 = lsml.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_itml_supervised(self): + seed = np.random.RandomState(1234) + itml = ITML_Supervised(n_constraints=200, random_state=seed) + itml.fit(self.X, self.y) + res_1 = itml.transform(self.X) + + seed = np.random.RandomState(1234) + itml = ITML_Supervised(n_constraints=200, random_state=seed) + res_2 = itml.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_lmnn(self): + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + lmnn.fit(self.X, self.y) + res_1 = lmnn.transform(self.X) + + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + res_2 = lmnn.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_sdml_supervised(self): + seed = np.random.RandomState(1234) + sdml = SDML_Supervised(n_constraints=1500, balance_param=1e-5, + prior='identity', random_state=seed) + sdml.fit(self.X, self.y) + res_1 = sdml.transform(self.X) + + seed = np.random.RandomState(1234) + sdml = SDML_Supervised(n_constraints=1500, balance_param=1e-5, + prior='identity', random_state=seed) + res_2 = sdml.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_nca(self): + n = self.X.shape[0] + nca = NCA(max_iter=(100000 // n)) + nca.fit(self.X, self.y) + res_1 = nca.transform(self.X) + + nca = NCA(max_iter=(100000 // n)) + res_2 = nca.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_lfda(self): + lfda = LFDA(k=2, n_components=2) + lfda.fit(self.X, self.y) + res_1 = lfda.transform(self.X) + + lfda 
= LFDA(k=2, n_components=2) + res_2 = lfda.fit_transform(self.X, self.y) + + # signs may be flipped, that's okay + assert_array_almost_equal(abs(res_1), abs(res_2)) + + def test_rca_supervised(self): + seed = np.random.RandomState(1234) + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2, + random_state=seed) + rca.fit(self.X, self.y) + res_1 = rca.transform(self.X) + + seed = np.random.RandomState(1234) + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2, + random_state=seed) + res_2 = rca.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_mlkr(self): + mlkr = MLKR(n_components=2) + mlkr.fit(self.X, self.y) + res_1 = mlkr.transform(self.X) + + mlkr = MLKR(n_components=2) + res_2 = mlkr.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_mmc_supervised(self): + seed = np.random.RandomState(1234) + mmc = MMC_Supervised(n_constraints=200, random_state=seed) + mmc.fit(self.X, self.y) + res_1 = mmc.transform(self.X) + + seed = np.random.RandomState(1234) + mmc = MMC_Supervised(n_constraints=200, random_state=seed) + res_2 = mmc.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py new file mode 100644 index 00000000..9378ac60 --- /dev/null +++ b/test/test_mahalanobis_mixin.py @@ -0,0 +1,756 @@ +from itertools import product + +import pytest +import numpy as np +from numpy.linalg import LinAlgError +from numpy.testing import assert_array_almost_equal, assert_allclose, \ + assert_array_equal +from scipy.spatial.distance import pdist, squareform, mahalanobis +from scipy.stats import ortho_group +from sklearn import clone +from sklearn.cluster import DBSCAN +from sklearn.datasets import make_spd_matrix, make_blobs +from sklearn.utils import check_random_state, shuffle +from sklearn.utils.multiclass import type_of_target +from 
@pytest.mark.parametrize('estimator, build_dataset', metric_learners,
                         ids=ids_metric_learners)
def test_pair_distance_pair_score_equivalent(estimator, build_dataset):
    """
    For Mahalanobis learners, pair_score should be equivalent to the
    opposite of the pair_distance result.
    """
    input_data, labels, _, X = build_dataset()
    X = X[:20]
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y(estimator, input_data, labels))

    # score every ordered pair once and compare both APIs on it
    all_pairs = np.array(list(product(X, X)))
    assert_array_equal(model.pair_distance(all_pairs),
                       -1 * model.pair_score(all_pairs))
@pytest.mark.parametrize('estimator, build_dataset', metric_learners,
                         ids=ids_metric_learners)
def test_pair_distance_toy_example(estimator, build_dataset):
    """pair_distance must match Euclidean distances computed by hand in the
    learned embedding space."""
    input_data, labels, _, X = build_dataset()
    X = X[:20]
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y(estimator, input_data, labels))

    # pair the first ten points with the next ten and embed both sides
    pairs = np.stack([X[:10], X[10:20]], axis=1)
    embedded = pairs.dot(model.components_.T)
    expected = np.sqrt(np.sum((embedded[:, 1] - embedded[:, 0]) ** 2,
                              axis=-1))
    assert_array_almost_equal(model.pair_distance(pairs), expected)
and scoring of 2D arrays (one tuple) should return an error (like + # scikit-learn's error when scoring 1D arrays) + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + tuples = np.array(list(product(X, X))) + assert model.pair_distance(tuples).shape == (tuples.shape[0],) + context = make_context(estimator) + msg = ("3D array of formed tuples expected{}. Found 2D array " + "instead:\ninput={}. Reshape your data and/or use a preprocessor.\n" + .format(context, tuples[1])) + with pytest.raises(ValueError) as raised_error: + model.pair_distance(tuples[1]) + assert str(raised_error.value) == msg + + +def check_is_distance_matrix(pairwise): + assert (pairwise >= 0).all() # positivity + assert np.array_equal(pairwise, pairwise.T) # symmetry + assert (pairwise.diagonal() == 0).all() # identity + # triangular inequality + tol = 1e-12 + assert (pairwise <= pairwise[:, :, np.newaxis] + + pairwise[:, np.newaxis, :] + tol).all() + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_embed_toy_example(estimator, build_dataset): + # Checks that embed works on a toy example + input_data, labels, _, X = build_dataset() + n_samples = 20 + X = X[:n_samples] + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + embedded_points = X.dot(model.components_.T) + assert_array_almost_equal(model.transform(X), embedded_points) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_embed_dim(estimator, build_dataset): + # Checks that the the dimension of the output space is as expected + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + assert model.transform(X).shape == X.shape + + # assert that ValueError is thrown if input 
shape is 1D + context = make_context(estimator) + err_msg = ("2D array of formed points expected{}. Found 1D array " + "instead:\ninput={}. Reshape your data and/or use a " + "preprocessor.\n".format(context, X[0])) + with pytest.raises(ValueError) as raised_error: + model.pair_distance(model.transform(X[0, :])) + assert str(raised_error.value) == err_msg + # we test that the shape is also OK when doing dimensionality reduction + if hasattr(model, 'n_components'): + model.set_params(n_components=2) + model.fit(*remove_y(estimator, input_data, labels)) + assert model.transform(X).shape == (X.shape[0], 2) + # assert that ValueError is thrown if input shape is 1D + with pytest.raises(ValueError) as raised_error: + model.transform(model.transform(X[0, :])) + assert str(raised_error.value) == err_msg + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_embed_finite(estimator, build_dataset): + # Checks that embed returns vectors with finite values + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + assert np.isfinite(model.transform(X)).all() + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_embed_is_linear(estimator, build_dataset): + # Checks that the embedding is linear + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + assert_array_almost_equal(model.transform(X[:10] + X[10:20]), + model.transform(X[:10]) + + model.transform(X[10:20])) + assert_array_almost_equal(model.transform(5 * X[:10]), + 5 * model.transform(X[:10])) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_equivalent_to_explicit_mahalanobis(estimator, + build_dataset): + """Tests that using the get_metric 
method of mahalanobis metric learners is + equivalent to explicitely calling scipy's mahalanobis metric + """ + rng = np.random.RandomState(42) + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + metric = model.get_metric() + n_features = X.shape[1] + a, b = (rng.randn(n_features), rng.randn(n_features)) + expected_dist = mahalanobis(a, b, VI=model.get_mahalanobis_matrix()) + assert_allclose(metric(a, b), expected_dist, rtol=1e-13) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_is_pseudo_metric(estimator, build_dataset): + """Tests that the get_metric method of mahalanobis metric learners returns a + pseudo-metric (metric but without one side of the equivalence of + the identity of indiscernables property) + """ + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + metric = model.get_metric() + + n_features = X.shape[1] + for seed in range(10): + rng = np.random.RandomState(seed) + a, b, c = (rng.randn(n_features) for _ in range(3)) + assert metric(a, b) >= 0 # positivity + assert metric(a, b) == metric(b, a) # symmetry + # one side of identity indiscernables: x == y => d(x, y) == 0. The other + # side of the equivalence is not always true for Mahalanobis distances. 
+ assert metric(a, a) == 0 + # triangular inequality + assert (metric(a, c) < metric(a, b) + metric(b, c) or + np.isclose(metric(a, c), metric(a, b) + metric(b, c), rtol=1e-20)) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_compatible_with_scikit_learn(estimator, build_dataset): + """Check that the metric returned by get_metric is compatible with + scikit-learn's algorithms using a custom metric, DBSCAN for instance""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + clustering = DBSCAN(metric=model.get_metric()) + clustering.fit(X) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_squared_metric(estimator, build_dataset): + """Test that the squared metric returned is indeed the square of the + metric""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + metric = model.get_metric() + + n_features = X.shape[1] + for seed in range(10): + rng = np.random.RandomState(seed) + a, b = (rng.randn(n_features) for _ in range(2)) + assert_allclose(metric(a, b, squared=True), + metric(a, b, squared=False)**2, + rtol=1e-15) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_components_is_2D(estimator, build_dataset): + """Tests that the transformation matrix of metric learners is 2D""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + # test that it works for X.shape[1] features + model.fit(*remove_y(estimator, input_data, labels)) + assert model.components_.shape == (X.shape[1], X.shape[1]) + + if isinstance(estimator, _BaseSDML): + # SDML doesn't support running on a single feature. 
+ return + + # test that it works for 1 feature. Use 2nd dimension, to avoid border cases + trunc_data = input_data[..., 1:2] + # we drop duplicates that might have been formed, i.e. of the form + # aabc or abcc or aabb for quadruplets, and aa for pairs. + + if isinstance(estimator, _QuadrupletsClassifierMixin): + pairs_idx = [[0, 1], [2, 3]] + elif isinstance(estimator, _TripletsClassifierMixin): + pairs_idx = [[0, 1], [0, 2]] + elif isinstance(estimator, _PairsClassifierMixin): + pairs_idx = [[0, 1]] + else: + pairs_idx = [] + + for pair_idx in pairs_idx: + pairs = trunc_data[:, pair_idx, :] + diffs = pairs[:, 1, :] - pairs[:, 0, :] + to_keep = np.abs(diffs.ravel()) > 1e-9 + trunc_data = trunc_data[to_keep] + labels = labels[to_keep] + + model.fit(*remove_y(estimator, trunc_data, labels)) + assert model.components_.shape == (1, 1) # the components must be 2D + + +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if hasattr(ml, 'n_components') and + hasattr(ml, 'init')], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if hasattr(ml, 'n_components') and + hasattr(ml, 'init')]) +def test_init_transformation(estimator, build_dataset): + input_data, labels, _, X = build_dataset() + is_classification = (type_of_target(labels) in ['multiclass', 'binary']) + model = clone(estimator) + rng = np.random.RandomState(42) + + # Start learning from scratch + model.set_params(init='identity') + model.fit(input_data, labels) + + # Initialize with random + model.set_params(init='random') + model.fit(input_data, labels) + + # Initialize with auto + model.set_params(init='auto') + model.fit(input_data, labels) + + # Initialize with PCA + model.set_params(init='pca') + model.fit(input_data, labels) + + # Initialize with LDA + if is_classification: + model.set_params(init='lda') + model.fit(input_data, labels) + + # Initialize with a numpy array + init = 
rng.rand(X.shape[1], X.shape[1]) + model.set_params(init=init) + model.fit(input_data, labels) + + # init.shape[1] must match X.shape[1] + init = rng.rand(X.shape[1], X.shape[1] + 1) + model.set_params(init=init) + msg = ('The input dimensionality ({}) of the given ' + 'linear transformation `init` must match the ' + 'dimensionality of the given inputs `X` ({}).' + .format(init.shape[1], X.shape[1])) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # init.shape[0] must be <= init.shape[1] + init = rng.rand(X.shape[1] + 1, X.shape[1]) + model.set_params(init=init) + msg = ('The output dimensionality ({}) of the given ' + 'linear transformation `init` cannot be ' + 'greater than its input dimensionality ({}).' + .format(init.shape[0], init.shape[1])) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # init.shape[0] must match n_components + init = rng.rand(X.shape[1], X.shape[1]) + n_components = X.shape[1] - 1 + model.set_params(init=init, n_components=n_components) + msg = ('The preferred dimensionality of the ' + 'projected space `n_components` ({}) does not match ' + 'the output dimensionality of the given ' + 'linear transformation `init` ({})!' + .format(n_components, init.shape[0])) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # init must be as specified in the docstring + model.set_params(init=1) + msg = ("`init` must be 'auto', 'pca', 'identity', " + "'random'{} or a numpy array of shape " + "(n_components, n_features)." 
+ .format(", 'lda'" if is_classification else '')) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('n_samples', [3, 5, 7, 11]) +@pytest.mark.parametrize('n_features', [3, 5, 7, 11]) +@pytest.mark.parametrize('n_classes', [5, 7, 11]) +@pytest.mark.parametrize('n_components', [3, 5, 7, 11]) +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if hasattr(ml, 'n_components') and + hasattr(ml, 'init')], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if hasattr(ml, 'n_components') and + hasattr(ml, 'init')]) +def test_auto_init_transformation(n_samples, n_features, n_classes, + n_components, estimator, build_dataset): + # Test that auto choose the init transformation as expected with every + # configuration of order of n_samples, n_features, n_classes and + # n_components, for all metric learners that learn a transformation. 
+ if n_classes >= n_samples: + pass + # n_classes > n_samples is impossible, and n_classes == n_samples + # throws an error from lda but is an absurd case + else: + input_data, labels, _, X = build_dataset() + model_base = clone(estimator) + rng = np.random.RandomState(42) + model_base.set_params(init='auto', + n_components=n_components, + random_state=rng) + # To make the test work for LMNN: + if 'LMNN' in model_base.__class__.__name__: + model_base.set_params(n_neighbors=1) + # To make the test faster for estimators that have a max_iter: + if hasattr(model_base, 'max_iter'): + model_base.set_params(max_iter=1) + if n_components > n_features: + # this would return a ValueError, which is tested in + # test_init_transformation + pass + else: + # We need to build a dataset of the right shape: + num_to_pad_n_samples = ((n_samples // input_data.shape[0] + 1)) + num_to_pad_n_features = ((n_features // input_data.shape[-1] + 1)) + if input_data.ndim == 3: + input_data = np.tile(input_data, + (num_to_pad_n_samples, input_data.shape[1], + num_to_pad_n_features)) + else: + input_data = np.tile(input_data, + (num_to_pad_n_samples, num_to_pad_n_features)) + input_data = input_data[:n_samples, ..., :n_features] + assert input_data.shape[0] == n_samples + assert input_data.shape[-1] == n_features + has_classes = model_base.__class__.__name__ in ids_classifiers + if has_classes: + labels = np.tile(range(n_classes), n_samples // + n_classes + 1)[:n_samples] + else: + labels = np.tile(labels, n_samples // labels.shape[0] + 1)[:n_samples] + model = clone(model_base) + model.fit(input_data, labels) + if n_components <= min(n_classes - 1, n_features) and has_classes: + model_other = clone(model_base).set_params(init='lda') + elif n_components < min(n_features, n_samples): + model_other = clone(model_base).set_params(init='pca') + else: + model_other = clone(model_base).set_params(init='identity') + model_other.fit(input_data, labels) + assert_array_almost_equal(model.components_, + 
model_other.components_) + + +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if not hasattr(ml, 'n_components') and + hasattr(ml, 'init')], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if not hasattr(ml, 'n_components') and + hasattr(ml, 'init')]) +def test_init_mahalanobis(estimator, build_dataset): + """Tests that for estimators that learn a mahalanobis matrix + instead of a linear transformation, i.e. those that are mahalanobis metric + learners + where we can change the init, but not choose the n_components, + (TODO: be more explicit on this characterization, for instance with + safe_flags like in scikit-learn) that the init has an expected behaviour. + """ + input_data, labels, _, X = build_dataset() + + matrices_to_set = [] + if hasattr(estimator, 'init'): + matrices_to_set.append('init') + if hasattr(estimator, 'prior'): + matrices_to_set.append('prior') + + for param in matrices_to_set: + model = clone(estimator) + set_random_state(model) + rng = np.random.RandomState(42) + + # Start learning from scratch + model.set_params(**{param: 'identity'}) + model.fit(input_data, labels) + + # Initialize with random + model.set_params(**{param: 'random'}) + model.fit(input_data, labels) + + # Initialize with covariance + model.set_params(**{param: 'covariance'}) + model.fit(input_data, labels) + + # Initialize with a random spd matrix + init = make_spd_matrix(n_dim=X.shape[1], random_state=rng) + model.set_params(**{param: init}) + model.fit(input_data, labels) + + # init.shape[1] must match X.shape[1] + init = make_spd_matrix(n_dim=X.shape[1] + 1, random_state=rng) + model.set_params(**{param: init}) + msg = ('The input dimensionality {} of the given ' + 'mahalanobis matrix `{}` must match the ' + 'dimensionality of the given inputs ({}).' 
+ .format(init.shape, param, input_data.shape[-1])) + + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # The input matrix must be symmetric + init = rng.rand(X.shape[1], X.shape[1]) + model.set_params(**{param: init}) + msg = ("`{}` is not symmetric.".format(param)) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # The input matrix must be SPD + P = ortho_group.rvs(X.shape[1], random_state=rng) + w = np.abs(rng.randn(X.shape[1])) + w[0] = -10. + M = P.dot(np.diag(w)).dot(P.T) + model.set_params(**{param: M}) + msg = ("Matrix is not positive semidefinite (PSD).") + with pytest.raises(NonPSDError) as raised_err: + model.fit(input_data, labels) + assert str(raised_err.value) == msg + + # init must be as specified in the docstring + model.set_params(**{param: 1}) + msg = ("`{}` must be 'identity', 'covariance', " + "'random' or a numpy array of shape " + "(n_features, n_features).".format(param)) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if idml[:4] in ['ITML', 'SDML', 'LSML']], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if idml[:4] in ['ITML', 'SDML', 'LSML']]) +def test_singular_covariance_init_or_prior_strictpd(estimator, build_dataset): + """Tests that when using the 'covariance' init or prior, it returns the + appropriate error if the covariance matrix is singular, for algorithms + that need a strictly PD prior or init (see + https://github.com/scikit-learn-contrib/metric-learn/issues/202 and + https://github.com/scikit-learn-contrib/metric-learn/pull/195#issuecomment + -492332451) + """ + matrices_to_set = [] + if hasattr(estimator, 'init'): + 
matrices_to_set.append('init') + if hasattr(estimator, 'prior'): + matrices_to_set.append('prior') + + input_data, labels, _, X = build_dataset() + for param in matrices_to_set: + model = clone(estimator) + set_random_state(model) + # We create a feature that is a linear combination of the first two + # features: + input_data = np.concatenate([input_data, input_data[:, ..., :2] + .dot([[2], [3]])], + axis=-1) + model.set_params(**{param: 'covariance'}) + msg = ("Unable to get a true inverse of the covariance " + "matrix since it is not definite. Try another " + "`{}`, or an algorithm that does not " + "require the `{}` to be strictly positive definite." + .format(param, param)) + with pytest.raises(LinAlgError) as raised_err: + model.fit(input_data, labels) + assert str(raised_err.value) == msg + + +@pytest.mark.integration +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if idml[:3] in ['MMC']], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if idml[:3] in ['MMC']]) +def test_singular_covariance_init_of_non_strict_pd(estimator, build_dataset): + """Tests that when using the 'covariance' init or prior, it returns the + appropriate warning if the covariance matrix is singular, for algorithms + that don't need a strictly PD init. Also checks that the returned + inverse matrix has finite values + """ + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + # We create a feature that is a linear combination of the first two + # features: + input_data = np.concatenate([input_data, input_data[:, ..., :2].dot([[2], + [3]])], + axis=-1) + model.set_params(init='covariance') + msg = ('The covariance matrix is not invertible: ' + 'using the pseudo-inverse instead.' 
+ 'To make the covariance matrix invertible' + ' you can remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + with pytest.warns(UserWarning) as raised_warning: + model.fit(input_data, labels) + assert any([str(warning.message) == msg for warning in raised_warning]) + M, _ = _initialize_metric_mahalanobis(X, init='covariance', + random_state=RNG, + return_inverse=True, + strict_pd=False) + assert np.isfinite(M).all() + + +@pytest.mark.integration +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if idml[:4] in ['ITML', 'SDML', 'LSML']], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if idml[:4] in ['ITML', 'SDML', 'LSML']]) +@pytest.mark.parametrize('w0', [1e-20, 0., -1e-20]) +def test_singular_array_init_or_prior_strictpd(estimator, build_dataset, w0): + """Tests that when using a custom array init (or prior), it returns the + appropriate error if it is singular, for algorithms + that need a strictly PD prior or init (see + https://github.com/scikit-learn-contrib/metric-learn/issues/202 and + https://github.com/scikit-learn-contrib/metric-learn/pull/195#issuecomment + -492332451) + """ + matrices_to_set = [] + if hasattr(estimator, 'init'): + matrices_to_set.append('init') + if hasattr(estimator, 'prior'): + matrices_to_set.append('prior') + + rng = np.random.RandomState(42) + input_data, labels, _, X = build_dataset() + for param in matrices_to_set: + model = clone(estimator) + set_random_state(model) + + P = ortho_group.rvs(X.shape[1], random_state=rng) + w = np.abs(rng.randn(X.shape[1])) + w[0] = w0 + M = P.dot(np.diag(w)).dot(P.T) + if hasattr(model, 'init'): + model.set_params(init=M) + if hasattr(model, 'prior'): + model.set_params(prior=M) + if not hasattr(model, 'prior') and not hasattr(model, 'init'): + raise 
RuntimeError("Neither prior or init could be set in the model.") + msg = ("You should provide a strictly positive definite " + "matrix as `{}`. This one is not definite. Try another" + " {}, or an algorithm that does not " + "require the {} to be strictly positive definite." + .format(*(param,) * 3)) + with pytest.raises(LinAlgError) as raised_err: + model.fit(input_data, labels) + assert str(raised_err.value) == msg + + +@pytest.mark.parametrize('w0', [1e-20, 0., -1e-20]) +def test_singular_array_init_of_non_strict_pd(w0): + """Tests that when using a custom array init, it returns the + appropriate warning if it is singular. Also checks if the returned + inverse matrix is finite. This isn't checked for model fitting as no + model curently uses this setting. + """ + rng = np.random.RandomState(42) + X, y = shuffle(*make_blobs(random_state=rng), + random_state=rng) + P = ortho_group.rvs(X.shape[1], random_state=rng) + w = np.abs(rng.randn(X.shape[1])) + w[0] = w0 + M = P.dot(np.diag(w)).dot(P.T) + msg = ('The initialization matrix is not invertible: ' + 'using the pseudo-inverse instead.') + with pytest.warns(UserWarning) as raised_warning: + _, M_inv = _initialize_metric_mahalanobis(X, init=M, + random_state=rng, + return_inverse=True, + strict_pd=False) + assert str(raised_warning[0].message) == msg + assert np.isfinite(M_inv).all() + + +@pytest.mark.integration +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_deterministic_initialization(estimator, build_dataset): + """Test that estimators that have a prior or an init are deterministic + when it is set to to random and when the random_state is fixed.""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + if hasattr(estimator, 'init'): + model.set_params(init='random') + if hasattr(estimator, 'prior'): + model.set_params(prior='random') + model1 = clone(model) + set_random_state(model1, 42) + model1 = model1.fit(*remove_y(model, 
input_data, labels)) + model2 = clone(model) + set_random_state(model2, 42) + model2 = model2.fit(*remove_y(model, input_data, labels)) + np.testing.assert_allclose(model1.get_mahalanobis_matrix(), + model2.get_mahalanobis_matrix()) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py new file mode 100644 index 00000000..bfedefea --- /dev/null +++ b/test/test_pairs_classifiers.py @@ -0,0 +1,574 @@ +from functools import partial + +import warnings +import pytest +from numpy.testing import assert_array_equal +from scipy.spatial.distance import euclidean + +from metric_learn.base_metric import _PairsClassifierMixin, MahalanobisMixin +from sklearn.exceptions import NotFittedError +from sklearn.metrics import (f1_score, accuracy_score, fbeta_score, + precision_score) +from sklearn.model_selection import train_test_split + +from test.test_utils import pairs_learners, ids_pairs_learners +from metric_learn.sklearn_shims import set_random_state +from sklearn import clone +import numpy as np +from itertools import product + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_predict_only_one_or_minus_one(estimator, build_dataset, + with_preprocessor): + """Test that all predicted values are either +1 or -1""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, + labels) + estimator.fit(pairs_train, y_train) + predictions = estimator.predict(pairs_test) + not_valid = [e for e in predictions if e not in [-1, 1]] + assert len(not_valid) == 0 + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def 
test_predict_monotonous(estimator, build_dataset, + with_preprocessor): + """Test that there is a threshold distance separating points labeled as + similar and points labeled as dissimilar """ + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, + labels) + estimator.fit(pairs_train, y_train) + scores = estimator.pair_score(pairs_test) + predictions = estimator.predict(pairs_test) + max_dissimilar = np.max(scores[predictions == -1]) + min_similar = np.min(scores[predictions == 1]) + assert max_dissimilar <= min_similar + separator = np.mean([max_dissimilar, min_similar]) + assert (predictions[scores < separator] == -1).all() + assert (predictions[scores > separator] == 1).all() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, + with_preprocessor): + """Test that a NotFittedError is raised if someone tries to use + pair_score, score_pairs, decision_function, get_metric, transform or + get_mahalanobis_matrix on input data and the metric learner + has not been fitted.""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + with pytest.raises(NotFittedError): # Remove in 0.8.0 + estimator.score_pairs(input_data) + with pytest.raises(NotFittedError): + estimator.pair_score(input_data) + with pytest.raises(NotFittedError): + estimator.decision_function(input_data) + with pytest.raises(NotFittedError): + estimator.get_metric() + with pytest.raises(NotFittedError): + estimator.transform(input_data) + with pytest.raises(NotFittedError): + 
estimator.get_mahalanobis_matrix() + with pytest.raises(NotFittedError): + estimator.calibrate_threshold(input_data, labels) + + with pytest.raises(NotFittedError): + estimator.set_threshold(0.5) + with pytest.raises(NotFittedError): + estimator.predict(input_data) + + +@pytest.mark.parametrize('calibration_params', + [None, {}, dict(), {'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [0., 0.1, 0.2, 1., 5.]] + ) +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_fit_with_valid_threshold_params(estimator, build_dataset, + with_preprocessor, + calibration_params): + """Tests that fitting `calibration_params` with appropriate parameters works + as expected""" + pairs, y, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + estimator.fit(pairs, y, calibration_params=calibration_params) + estimator.predict(pairs) + + +@pytest.mark.parametrize('kwargs', + [{'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [0., 0.1, 0.2, 1., 5.]] + ) +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_threshold_different_scores_is_finite(estimator, build_dataset, + with_preprocessor, kwargs): + # test that calibrating the threshold works for every metric learner + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + 
set_random_state(estimator) + estimator.fit(input_data, labels) + with warnings.catch_warnings(record=True) as record: + estimator.calibrate_threshold(input_data, labels, **kwargs) + assert len(record) == 0 + + +class IdentityPairsClassifier(MahalanobisMixin, _PairsClassifierMixin): + """A simple pairs classifier for testing purposes, that will just have + identity as components_, and a string threshold so that it returns an + error if not explicitely set. + """ + def fit(self, pairs, y): + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') + self.components_ = np.atleast_2d(np.identity(pairs.shape[2])) + # self.threshold_ is not set. + return self + + +def test_unset_threshold(): + """Tests that the "threshold is unset" error is raised when using predict + (performs binary classification on pairs) with an unset threshold.""" + identity_pairs_classifier = IdentityPairsClassifier() + pairs = np.array([[[0.], [1.]], [[1.], [3.]], [[2.], [5.]], [[3.], [7.]]]) + y = np.array([1, 1, -1, -1]) + identity_pairs_classifier.fit(pairs, y) + with pytest.raises(AttributeError) as e: + identity_pairs_classifier.predict(pairs) + + expected_msg = ("A threshold for this estimator has not been set, " + "call its set_threshold or calibrate_threshold method.") + + assert str(e.value) == expected_msg + + +def test_set_threshold(): + # test that set_threshold indeed sets the threshold + identity_pairs_classifier = IdentityPairsClassifier() + pairs = np.array([[[0.], [1.]], [[1.], [3.]], [[2.], [5.]], [[3.], [7.]]]) + y = np.array([1, 1, -1, -1]) + identity_pairs_classifier.fit(pairs, y) + identity_pairs_classifier.set_threshold(0.5) + assert identity_pairs_classifier.threshold_ == 0.5 + + +@pytest.mark.parametrize('value', ["ABC", None, [1, 2, 3], {'key': None}, + (1, 2), set(), + np.array([[[0.], [1.]], [[1.], [3.]]])]) +def test_set_wrong_type_threshold(value): + """ + Test that `set_threshold` indeed sets the threshold + and cannot accept nothing but float or 
integers, but + being permissive with boolean True=1.0 and False=0.0 + """ + model = IdentityPairsClassifier() + model.fit(np.array([[[0.], [1.]]]), np.array([1])) + msg = ('Parameter threshold must be a real number. ' + 'Got {} instead.'.format(type(value))) + + with pytest.raises(ValueError) as e: # String + model.set_threshold(value) + assert str(e.value).startswith(msg) + + +def test_f_beta_1_is_f_1(): + # test that putting beta to 1 indeed finds the best threshold to optimize + # the f1_score + rng = np.random.RandomState(42) + n_samples = 100 + pairs, y = rng.randn(n_samples, 2, 5), rng.choice([-1, 1], size=n_samples) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + pairs_learner.calibrate_threshold(pairs, y, strategy='f_beta', beta=1) + best_f1_score = f1_score(y, pairs_learner.predict(pairs)) + for threshold in - pairs_learner.decision_function(pairs): + pairs_learner.set_threshold(threshold) + assert f1_score(y, pairs_learner.predict(pairs)) <= best_f1_score + + +def true_pos_true_neg_rates(y_true, y_pred): + """A function that returns the true positive rates and the true negatives + rate. For testing purposes (optimized for readability not performance).""" + assert y_pred.shape[0] == y_true.shape[0] + tp = np.sum((y_pred == 1) * (y_true == 1)) + tn = np.sum((y_pred == -1) * (y_true == -1)) + fn = np.sum((y_pred == -1) * (y_true == 1)) + fp = np.sum((y_pred == 1) * (y_true == -1)) + tpr = tp / (tp + fn) + tnr = tn / (tn + fp) + tpr = tpr if not np.isnan(tpr) else 0. + tnr = tnr if not np.isnan(tnr) else 0. + return tpr, tnr + + +def tpr_threshold(y_true, y_pred, tnr_threshold=0.): + """A function that returns the true positive rate if the true negative + rate is higher or equal than `threshold`, and -1 otherwise. 
For testing + purposes""" + tpr, tnr = true_pos_true_neg_rates(y_true, y_pred) + if tnr < tnr_threshold: + return -1 + else: + return tpr + + +def tnr_threshold(y_true, y_pred, tpr_threshold=0.): + """A function that returns the true negative rate if the true positive + rate is higher or equal than `threshold`, and -1 otherwise. For testing + purposes""" + tpr, tnr = true_pos_true_neg_rates(y_true, y_pred) + if tpr < tpr_threshold: + return -1 + else: + return tnr + + +@pytest.mark.parametrize('kwargs, scoring', + [({'strategy': 'accuracy'}, accuracy_score)] + + [({'strategy': 'f_beta', 'beta': b}, + partial(fbeta_score, beta=b)) + for b in [0.1, 0.5, 1.]] + + [({'strategy': 'f_beta', 'beta': 0}, + precision_score)] + + [({'strategy': 'max_tpr', 'min_rate': t}, + partial(tpr_threshold, tnr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]] + + [({'strategy': 'max_tnr', 'min_rate': t}, + partial(tnr_threshold, tpr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]], + ) +def test_found_score_is_best_score(kwargs, scoring): + # test that when we use calibrate threshold, it will indeed be the + # threshold that have the best score + rng = np.random.RandomState(42) + n_samples = 50 + pairs, y = rng.randn(n_samples, 2, 5), rng.choice([-1, 1], size=n_samples) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + pairs_learner.calibrate_threshold(pairs, y, **kwargs) + best_score = scoring(y, pairs_learner.predict(pairs)) + scores = [] + predicted_scores = pairs_learner.decision_function(pairs) + predicted_scores = np.hstack([[np.min(predicted_scores) - 1], + predicted_scores, + [np.max(predicted_scores) + 1]]) + for threshold in - predicted_scores: + pairs_learner.set_threshold(threshold) + score = scoring(y, pairs_learner.predict(pairs)) + assert score <= best_score + scores.append(score) + assert len(set(scores)) > 1 # assert that we didn't always have the same + # value for the score (which could be a hint for some bug, but would still + # silently 
pass the test)) + + +@pytest.mark.parametrize('kwargs, scoring', + [({'strategy': 'accuracy'}, accuracy_score)] + + [({'strategy': 'f_beta', 'beta': b}, + partial(fbeta_score, beta=b)) + for b in [0.1, 0.5, 1.]] + + [({'strategy': 'f_beta', 'beta': 0}, + precision_score)] + + [({'strategy': 'max_tpr', 'min_rate': t}, + partial(tpr_threshold, tnr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]] + + [({'strategy': 'max_tnr', 'min_rate': t}, + partial(tnr_threshold, tpr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]] + ) +def test_found_score_is_best_score_duplicates(kwargs, scoring): + # test that when we use calibrate threshold, it will indeed be the + # threshold that have the best score. It's the same as the previous test + # except this time we test that the scores are coherent even if there are + # duplicates (i.e. points that have the same score returned by + # `decision_function`). + rng = np.random.RandomState(42) + n_samples = 50 + pairs, y = rng.randn(n_samples, 2, 5), rng.choice([-1, 1], size=n_samples) + # we create some duplicates points, which will also have the same score + # predicted + pairs[6:10] = pairs[10:14] + y[6:10] = y[10:14] + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + pairs_learner.calibrate_threshold(pairs, y, **kwargs) + best_score = scoring(y, pairs_learner.predict(pairs)) + scores = [] + predicted_scores = pairs_learner.decision_function(pairs) + predicted_scores = np.hstack([[np.min(predicted_scores) - 1], + predicted_scores, + [np.max(predicted_scores) + 1]]) + for threshold in - predicted_scores: + pairs_learner.set_threshold(threshold) + score = scoring(y, pairs_learner.predict(pairs)) + assert score <= best_score + scores.append(score) + assert len(set(scores)) > 1 # assert that we didn't always have the same + # value for the score (which could be a hint for some bug, but would still + # silently pass the test)) + + +@pytest.mark.parametrize('invalid_args, expected_msg', + [({'strategy': 'weird'}, 
+ ('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "weird" instead.'))] + + [({'strategy': strategy, 'min_rate': min_rate}, + 'Parameter min_rate must be a number in' + '[0, 1]. Got {} instead.'.format(min_rate)) + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [None, 'weird', -0.2, 1.2, 3 + 2j])] + + [({'strategy': 'f_beta', 'beta': beta}, + 'Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + for beta in [None, 'weird', 3 + 2j]] + ) +def test_calibrate_threshold_invalid_parameters_right_error(invalid_args, + expected_msg): + # test that the right error message is returned if invalid arguments are + # given to calibrate_threshold + rng = np.random.RandomState(42) + pairs, y = rng.randn(20, 2, 5), rng.choice([-1, 1], size=20) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + with pytest.raises(ValueError) as raised_error: + pairs_learner.calibrate_threshold(pairs, y, **invalid_args) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('valid_args', + [{'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [-5., -1., 0., 0.1, 0.2, 1., 5.]] + # Note that we authorize beta < 0 (even if + # in fact it will be squared, so it would be useless + # to do that) + ) +def test_calibrate_threshold_valid_parameters(valid_args): + # test that no warning message is returned if valid arguments are given to + # calibrate threshold + rng = np.random.RandomState(42) + pairs, y = rng.randn(20, 2, 5), rng.choice([-1, 1], size=20) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + with warnings.catch_warnings(record=True) as record: + pairs_learner.calibrate_threshold(pairs, y, **valid_args) + assert len(record) == 0 + + +def test_calibrate_threshold_extreme(): + 
"""Test that in the (rare) case where we should accept all points or + reject all points, this is effectively what + is done""" + + class MockBadPairsClassifier(MahalanobisMixin, _PairsClassifierMixin): + """A pairs classifier that returns bad scores (i.e. in the inverse order + of what we would expect from a good pairs classifier + """ + + def fit(self, pairs, y, calibration_params=None): + self.preprocessor_ = 'not used' + self.components_ = 'not used' + self.calibrate_threshold(pairs, y, **(calibration_params if + calibration_params is not None else + dict())) + return self + + def decision_function(self, pairs): + return np.arange(pairs.shape[0], dtype=float) + + rng = np.random.RandomState(42) + pairs = rng.randn(7, 2, 5) # the info in X is not used, it's just for the + # API + + y = [1., 1., 1., -1., -1., -1., -1.] + mock_clf = MockBadPairsClassifier() + # case of bad scoring with more negative than positives. In + # this case, when: + # optimizing for accuracy we should reject all points + mock_clf.fit(pairs, y, calibration_params={'strategy': 'accuracy'}) + assert_array_equal(mock_clf.predict(pairs), - np.ones(7)) + + # optimizing for max_tpr we should accept all points if min_rate == 0. ( + # because by convention then tnr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tpr', + 'min_rate': 0.}) + assert_array_equal(mock_clf.predict(pairs), np.ones(7)) + # optimizing for max_tnr we should reject all points if min_rate = 0. ( + # because by convention then tpr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tnr', + 'min_rate': 0.}) + assert_array_equal(mock_clf.predict(pairs), - np.ones(7)) + + y = [1., 1., 1., 1., -1., -1., -1.] + # case of bad scoring with more positives than negatives. 
In + # this case, when: + # optimizing for accuracy we should accept all points + mock_clf.fit(pairs, y, calibration_params={'strategy': 'accuracy'}) + assert_array_equal(mock_clf.predict(pairs), np.ones(7)) + # optimizing for max_tpr we should accept all points if min_rate == 0. ( + # because by convention then tnr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tpr', + 'min_rate': 0.}) + assert_array_equal(mock_clf.predict(pairs), np.ones(7)) + # optimizing for max_tnr we should reject all points if min_rate = 0. ( + # because by convention then tpr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tnr', + 'min_rate': 0.}) + assert_array_equal(mock_clf.predict(pairs), - np.ones(7)) + + # Note: we'll never find a case where we would reject all points for + # maximizing tpr (we can always accept more points), and accept all + # points for maximizing tnr (we can always reject more points) + + # case of alternated scores: for optimizing the f_1 score we should accept + # all points (because this way we have max recall (1) and max precision ( + # here: 0.5)) + y = [1., -1., 1., -1., 1., -1.] + mock_clf.fit(pairs[:6], y, calibration_params={'strategy': 'f_beta', + 'beta': 1.}) + assert_array_equal(mock_clf.predict(pairs[:6]), np.ones(6)) + + # Note: for optimizing f_1 score, we will never find an optimal case where we + # reject all points because in this case we would have 0 precision (by + # convention, because it's 0/0), and 0 recall (and we could always decrease + # the threshold to increase the recall, and we couldn't do worse for + # precision so it would be better) + + +@pytest.mark.parametrize('estimator, _', + pairs_learners + [(IdentityPairsClassifier(), None), + (_PairsClassifierMixin, None)], + ids=ids_pairs_learners + ['mock', 'class']) +@pytest.mark.parametrize('invalid_args, expected_msg', + [({'strategy': 'weird'}, + ('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". 
Got "weird" instead.'))] + + [({'strategy': strategy, 'min_rate': min_rate}, + 'Parameter min_rate must be a number in' + '[0, 1]. Got {} instead.'.format(min_rate)) + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [None, 'weird', -0.2, 1.2, 3 + 2j])] + + [({'strategy': 'f_beta', 'beta': beta}, + 'Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + for beta in [None, 'weird', 3 + 2j]] + ) +def test_validate_calibration_params_invalid_parameters_right_error( + estimator, _, invalid_args, expected_msg): + # test that the right error message is returned if invalid arguments are + # given to _validate_calibration_params, for all pairs metric learners as + # well as a mocking general identity pairs classifier and the class itself + with pytest.raises(ValueError) as raised_error: + estimator._validate_calibration_params(**invalid_args) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator, _', + pairs_learners + [(IdentityPairsClassifier(), None), + (_PairsClassifierMixin, None)], + ids=ids_pairs_learners + ['mock', 'class']) +@pytest.mark.parametrize('valid_args', + [{}, {'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [-5., -1., 0., 0.1, 0.2, 1., 5.]] + # Note that we authorize beta < 0 (even if + # in fact it will be squared, so it would be useless + # to do that) + ) +def test_validate_calibration_params_valid_parameters( + estimator, _, valid_args): + # test that no warning message is returned if valid arguments are given to + # _validate_calibration_params for all pairs metric learners, as well as + # a mocking example, and the class itself + with warnings.catch_warnings(record=True) as record: + estimator._validate_calibration_params(**valid_args) + assert len(record) == 0 + + +@pytest.mark.parametrize('estimator, 
build_dataset', + pairs_learners, + ids=ids_pairs_learners) +def test_validate_calibration_params_invalid_parameters_error_before__fit( + estimator, build_dataset): + """For all pairs metric learners (which currently all have a _fit method), + make sure that calibration parameters are validated before fitting""" + estimator = clone(estimator) + input_data, labels, _, _ = build_dataset() + + def breaking_fun(**args): # a function that fails so that we will miss + # the calibration at the end and therefore the right error message from + # validating params should be thrown before + raise RuntimeError('Game over.') + estimator._fit = breaking_fun + expected_msg = ('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "weird" instead.') + with pytest.raises(ValueError) as raised_error: + estimator.fit(input_data, labels, calibration_params={'strategy': 'weird'}) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_accuracy_toy_example(estimator, build_dataset): + """Test that the accuracy works on some toy example (hence that the + prediction is OK)""" + input_data, labels, preprocessor, X = build_dataset(with_preprocessor=False) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + estimator.fit(input_data, labels) + # we force the transformation to be identity so that we control what it does + estimator.components_ = np.eye(X.shape[1]) + # the threshold for similar or dissimilar pairs is half of the distance + # between X[0] and X[1] + estimator.set_threshold(euclidean(X[0], X[1]) / 2) + # We take the two first points and we build 4 regularly spaced points on the + # line they define, so that it's easy to build quadruplets of different + # similarities. 
+ X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4 + pairs_test = np.array( + [[X_test[0], X_test[1]], # similar + [X_test[0], X_test[3]], # dissimilar + [X_test[1], X_test[2]], # similar + [X_test[2], X_test[3]]]) # similar + y = np.array([-1, 1, 1, -1]) # [F, F, T, F] + assert accuracy_score(estimator.predict(pairs_test), y) == 0.25 diff --git a/test/test_quadruplets_classifiers.py b/test/test_quadruplets_classifiers.py new file mode 100644 index 00000000..a8319961 --- /dev/null +++ b/test/test_quadruplets_classifiers.py @@ -0,0 +1,65 @@ +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split + +from test.test_utils import quadruplets_learners, ids_quadruplets_learners +from metric_learn.sklearn_shims import set_random_state +from sklearn import clone +import numpy as np + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, + ids=ids_quadruplets_learners) +def test_predict_only_one_or_minus_one(estimator, build_dataset, + with_preprocessor): + """Test that all predicted values are either +1 or -1""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + (quadruplets_train, + quadruplets_test, y_train, y_test) = train_test_split(input_data, labels) + estimator.fit(quadruplets_train) + predictions = estimator.predict(quadruplets_test) + not_valid = [e for e in predictions if e not in [-1, 1]] + assert len(not_valid) == 0 + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, + ids=ids_quadruplets_learners) +def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, + with_preprocessor): + """Test that a NotFittedError is raised if someone tries to predict and + the metric 
learner has not been fitted.""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + with pytest.raises(NotFittedError): + estimator.predict(input_data) + + +@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, + ids=ids_quadruplets_learners) +def test_accuracy_toy_example(estimator, build_dataset): + """Test that the default scoring for quadruplets (accuracy) works on some + toy example""" + input_data, labels, preprocessor, X = build_dataset(with_preprocessor=False) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + estimator.fit(input_data) + # We take the two first points and we build 4 regularly spaced points on the + # line they define, so that it's easy to build quadruplets of different + # similarities. + X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4 + quadruplets_test = np.array( + [[X_test[0], X_test[2], X_test[0], X_test[1]], + [X_test[1], X_test[3], X_test[1], X_test[0]], + [X_test[1], X_test[2], X_test[0], X_test[3]], + [X_test[3], X_test[0], X_test[2], X_test[1]]]) + # we force the transformation to be identity so that we control what it does + estimator.components_ = np.eye(X.shape[1]) + assert estimator.score(quadruplets_test) == 0.25 diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py new file mode 100644 index 00000000..798d9036 --- /dev/null +++ b/test/test_sklearn_compat.py @@ -0,0 +1,465 @@ +import pytest +import unittest +from sklearn.utils.estimator_checks import check_estimator +from sklearn.base import TransformerMixin +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from metric_learn.sklearn_shims import (assert_allclose_dense_sparse, + set_random_state, _get_args, + is_public_parameter, get_scorer) +from metric_learn import (Covariance, LFDA, LMNN, 
MLKR, NCA, + ITML_Supervised, LSML_Supervised, + MMC_Supervised, RCA_Supervised, SDML_Supervised, + SCML_Supervised) +from sklearn import clone +import numpy as np +from sklearn.model_selection import (cross_val_score, cross_val_predict, + train_test_split, KFold) +from test.test_utils import (metric_learners, ids_metric_learners, + mock_preprocessor, tuples_learners, + ids_tuples_learners, pairs_learners, + ids_pairs_learners, remove_y, + metric_learners_pipeline, + ids_metric_learners_pipeline) + + +class Stable_RCA_Supervised(RCA_Supervised): + + def __init__(self, n_components=None, + chunk_size=2, preprocessor=None, random_state=None): + # this init makes RCA stable for scikit-learn examples. + super(Stable_RCA_Supervised, self).__init__( + n_chunks=2, n_components=n_components, + chunk_size=chunk_size, preprocessor=preprocessor, + random_state=random_state) + + +class Stable_SDML_Supervised(SDML_Supervised): + + def __init__(self, sparsity_param=0.01, + n_constraints=None, verbose=False, preprocessor=None, + random_state=None): + # this init makes SDML stable for scikit-learn examples. 
+ super(Stable_SDML_Supervised, self).__init__( + sparsity_param=sparsity_param, + n_constraints=n_constraints, verbose=verbose, + preprocessor=preprocessor, balance_param=1e-5, prior='identity', + random_state=random_state) + + +class TestSklearnCompat(unittest.TestCase): + def test_covariance(self): + check_estimator(Covariance()) + + def test_lmnn(self): + check_estimator(LMNN()) + + def test_lfda(self): + check_estimator(LFDA()) + + def test_mlkr(self): + check_estimator(MLKR()) + + def test_nca(self): + check_estimator(NCA()) + + def test_lsml(self): + check_estimator(LSML_Supervised()) + + def test_itml(self): + check_estimator(ITML_Supervised()) + + def test_mmc(self): + check_estimator(MMC_Supervised()) + + def test_sdml(self): + check_estimator(Stable_SDML_Supervised()) + + def test_rca(self): + check_estimator(Stable_RCA_Supervised()) + + def test_scml(self): + msg = "As no value for `n_basis` was selected, " + with pytest.warns(UserWarning) as raised_warning: + check_estimator(SCML_Supervised()) + assert msg in str(raised_warning[0].message) + + +RNG = check_random_state(0) + + +# ---------------------- Test scikit-learn compatibility ---------------------- + +def generate_array_like(input_data, labels=None): + """Helper function to generate array-like variants of numpy datasets, + for testing purposes.""" + list_data = input_data.tolist() + input_data_changed = [input_data, list_data, tuple(list_data)] + if input_data.ndim >= 2: + input_data_changed.append(tuple(tuple(x) for x in list_data)) + if input_data.ndim >= 3: + input_data_changed.append(tuple(tuple(tuple(x) for x in y) for y in + list_data)) + if input_data.ndim == 2: + pd = pytest.importorskip('pandas') + input_data_changed.append(pd.DataFrame(input_data)) + if labels is not None: + labels_changed = [labels, list(labels), tuple(labels)] + else: + labels_changed = [labels] + return input_data_changed, labels_changed + + +@pytest.mark.integration +@pytest.mark.parametrize('with_preprocessor', 
[True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_array_like_inputs(estimator, build_dataset, with_preprocessor): + """Test that metric-learners can have as input (of all functions that are + applied on data) any array-like object.""" + input_data, labels, preprocessor, X = build_dataset(with_preprocessor) + + # we subsample the data for the test to be more efficient + input_data, _, labels, _ = train_test_split(input_data, labels, + train_size=40, + random_state=42) + X = X[:10] + + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + input_variants, label_variants = generate_array_like(input_data, labels) + for input_variant in input_variants: + for label_variant in label_variants: + estimator.fit(*remove_y(estimator, input_variant, label_variant)) + if hasattr(estimator, "predict"): + estimator.predict(input_variant) + if hasattr(estimator, "predict_proba"): + estimator.predict_proba(input_variant) # anticipation in case some + # time we have that, or if ppl want to contribute with new algorithms + # it will be checked automatically + if hasattr(estimator, "decision_function"): + estimator.decision_function(input_variant) + if hasattr(estimator, "score"): + for label_variant in label_variants: + estimator.score(*remove_y(estimator, input_variant, label_variant)) + + X_variants, _ = generate_array_like(X) + for X_variant in X_variants: + estimator.transform(X_variant) + + pairs = np.array([[X[0], X[1]], [X[0], X[2]]]) + pairs_variants, _ = generate_array_like(pairs) + + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have pair_distance" + + for pairs_variant in pairs_variants: + estimator.pair_score(pairs_variant) # All learners have pair_score + + # But not all of them will have pair_distance + try: + estimator.pair_distance(pairs_variant) + except 
Exception as raised_exception: + assert raised_exception.value.args[0] == not_implemented_msg + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_various_scoring_on_tuples_learners(estimator, build_dataset, + with_preprocessor): + """Tests that scikit-learn's scoring returns something finite, + for other scoring than default scoring. (List of scikit-learn's scores can be + found in sklearn.metrics._scorer). For each type of output (predict, + predict_proba, decision_function), we test a bunch of scores. + We only test on pairs learners because quadruplets don't have a y argument. + """ + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + + # scores that need a predict function: every tuples learner should have a + # predict function (whether the pair is of positive samples or negative + # samples) + for scoring in ['accuracy', 'f1']: + check_score_is_finite(scoring, estimator, input_data, labels) + # scores that need a predict_proba: + if hasattr(estimator, "predict_proba"): + for scoring in ['neg_log_loss', 'brier_score']: + check_score_is_finite(scoring, estimator, input_data, labels) + # scores that need a decision_function: every tuples learner should have a + # decision function (the metric between points) + for scoring in ['roc_auc', 'average_precision', 'precision', 'recall']: + check_score_is_finite(scoring, estimator, input_data, labels) + + +def check_score_is_finite(scoring, estimator, input_data, labels): + estimator = clone(estimator) + assert np.isfinite(cross_val_score(estimator, input_data, labels, + scoring=scoring)).all() + estimator.fit(input_data, labels) + assert np.isfinite(get_scorer(scoring)(estimator, input_data, labels)) + + +@pytest.mark.parametrize('estimator, build_dataset', 
tuples_learners, + ids=ids_tuples_learners) +def test_cross_validation_is_finite(estimator, build_dataset): + """Tests that validation on metric-learn estimators returns something finite + """ + input_data, labels, preprocessor, _ = build_dataset() + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + assert np.isfinite(cross_val_score(estimator, + *remove_y(estimator, input_data, labels) + )).all() + assert np.isfinite(cross_val_predict(estimator, + *remove_y(estimator, input_data, labels) + )).all() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_cross_validation_manual_vs_scikit(estimator, build_dataset, + with_preprocessor): + """Tests that if we make a manual cross-validation, the result will be the + same as scikit-learn's cross-validation (some code for generating the + folds is taken from scikit-learn). 
+ """ + if any(hasattr(estimator, method) for method in ["predict", "score"]): + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + n_splits = 3 + kfold = KFold(shuffle=False, n_splits=n_splits) + n_samples = input_data.shape[0] + fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int64) + fold_sizes[:n_samples % n_splits] += 1 + current = 0 + scores, predictions = [], np.zeros(input_data.shape[0]) + for fold_size in fold_sizes: + start, stop = current, current + fold_size + current = stop + test_slice = slice(start, stop) + train_mask = np.ones(input_data.shape[0], bool) + train_mask[test_slice] = False + y_train, y_test = labels[train_mask], labels[test_slice] + estimator.fit(*remove_y(estimator, input_data[train_mask], y_train)) + if hasattr(estimator, "score"): + scores.append(estimator.score(*remove_y( + estimator, input_data[test_slice], y_test))) + if hasattr(estimator, "predict"): + predictions[test_slice] = estimator.predict(input_data[test_slice]) + if hasattr(estimator, "score"): + assert all(scores == cross_val_score( + estimator, *remove_y(estimator, input_data, labels), + cv=kfold)) + if hasattr(estimator, "predict"): + assert all(predictions == cross_val_predict( + estimator, + *remove_y(estimator, input_data, labels), + cv=kfold)) + + +def check_score(estimator, tuples, y): + if hasattr(estimator, "score"): + score = estimator.score(*remove_y(estimator, tuples, y)) + assert np.isfinite(score) + + +def check_predict(estimator, tuples): + if hasattr(estimator, "predict"): + y_predicted = estimator.predict(tuples) + assert len(y_predicted), len(tuples) + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_simple_estimator(estimator, build_dataset, with_preprocessor): + """Tests that fit, 
predict and scoring works. + """ + if any(hasattr(estimator, method) for method in ["predict", "score"]): + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + (tuples_train, tuples_test, y_train, + y_test) = train_test_split(input_data, labels, random_state=RNG) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + + estimator.fit(*remove_y(estimator, tuples_train, y_train)) + check_score(estimator, tuples_test, y_test) + check_predict(estimator, tuples_test) + + +@pytest.mark.parametrize('estimator', [est[0] for est in metric_learners], + ids=ids_metric_learners) +@pytest.mark.parametrize('preprocessor', [None, mock_preprocessor]) +def test_no_attributes_set_in_init(estimator, preprocessor): + """Check setting during init. Adapted from scikit-learn.""" + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(type(estimator).__init__, "deprecated_original"): + return + + init_params = _get_args(type(estimator).__init__) + parents_init_params = [param for params_parent in + (_get_args(parent) for parent in + type(estimator).__mro__) + for param in params_parent] + + # Test for no setting apart from parameters during init + invalid_attr = (set(vars(estimator)) - set(init_params) - + set(parents_init_params)) + assert not invalid_attr, \ + ("Estimator %s should not set any attribute apart" + " from parameters during init. Found attributes %s." + % (type(estimator).__name__, sorted(invalid_attr))) + # Ensure that each parameter is set in init + invalid_attr = (set(init_params) - set(vars(estimator)) - + set(["self"])) + assert not invalid_attr, \ + ("Estimator %s should store all parameters" + " as an attribute during init. Did not find " + "attributes %s." 
% (type(estimator).__name__, sorted(invalid_attr))) + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_estimators_fit_returns_self(estimator, build_dataset, + with_preprocessor): + """Check if self is returned when calling fit""" + # Adapted from scikit-learn + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + assert estimator.fit(*remove_y(estimator, input_data, labels)) is estimator + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners_pipeline, + ids=ids_metric_learners_pipeline) +def test_pipeline_consistency(estimator, build_dataset, + with_preprocessor): + # Adapted from scikit learn + # check that make_pipeline(est) gives same score as est + + input_data, y, preprocessor, _ = build_dataset(with_preprocessor) + + def make_random_state(estimator, in_pipeline): + rs = {} + name_estimator = estimator.__class__.__name__ + if name_estimator[-11:] == '_Supervised': + name_param = 'random_state' + if in_pipeline: + name_param = name_estimator.lower() + '__' + name_param + rs[name_param] = check_random_state(0) + return rs + + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor, + **make_random_state(estimator, False)) + pipeline = make_pipeline(estimator) + estimator.fit(input_data, y) + estimator.set_params(preprocessor=preprocessor) + pipeline.set_params(**make_random_state(estimator, True)) + pipeline.fit(input_data, y) + + if hasattr(estimator, 'score'): + result = estimator.score(input_data, y) + result_pipe = pipeline.score(input_data, y) + assert_allclose_dense_sparse(result, result_pipe) + + if hasattr(estimator, 'predict'): + result = estimator.predict(input_data) + result_pipe = pipeline.predict(input_data) + 
assert_allclose_dense_sparse(result, result_pipe) + + if issubclass(estimator.__class__, TransformerMixin): + if hasattr(estimator, 'transform'): + result = estimator.transform(input_data) + result_pipe = pipeline.transform(input_data) + assert_allclose_dense_sparse(result, result_pipe) + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_dict_unchanged(estimator, build_dataset, with_preprocessor): + # Adapted from scikit-learn + (input_data, labels, preprocessor, + to_transform) = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + estimator.fit(*remove_y(estimator, input_data, labels)) + + def check_dict(): + assert estimator.__dict__ == dict_before, ( + "Estimator changes __dict__ during %s" % method) + for method in ["predict", "decision_function", "predict_proba"]: + if hasattr(estimator, method): + dict_before = estimator.__dict__.copy() + getattr(estimator, method)(input_data) + check_dict() + if hasattr(estimator, "transform"): + dict_before = estimator.__dict__.copy() + # we transform only dataset of points + estimator.transform(to_transform) + check_dict() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_dont_overwrite_parameters(estimator, build_dataset, + with_preprocessor): + # Adapted from scikit-learn + # check that fit method only changes or sets private attributes + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + dict_before_fit = estimator.__dict__.copy() + + estimator.fit(*remove_y(estimator, 
input_data, labels)) + dict_after_fit = estimator.__dict__ + + public_keys_after_fit = [key for key in dict_after_fit.keys() + if is_public_parameter(key)] + + attrs_added_by_fit = [key for key in public_keys_after_fit + if key not in dict_before_fit.keys()] + + # check that fit doesn't add any public attribute + assert not attrs_added_by_fit, ( + "Estimator adds public attribute(s) during" + " the fit method." + " Estimators are only allowed to add private " + "attributes" + " either started with _ or ended" + " with _ but %s added" % ', '.join(attrs_added_by_fit)) + + # check that fit doesn't change any public attribute + attrs_changed_by_fit = [key for key in public_keys_after_fit + if (dict_before_fit[key] + is not dict_after_fit[key])] + + assert not attrs_changed_by_fit, ( + "Estimator changes public attribute(s) during" + " the fit method. Estimators are only allowed" + " to change attributes started" + " or ended with _, but" + " %s changed" % ', '.join(attrs_changed_by_fit)) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_triplets_classifiers.py b/test/test_triplets_classifiers.py new file mode 100644 index 00000000..515a0a33 --- /dev/null +++ b/test/test_triplets_classifiers.py @@ -0,0 +1,127 @@ +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split + +from metric_learn import SCML +from test.test_utils import ( + triplets_learners, + ids_triplets_learners, + build_triplets +) +from metric_learn.sklearn_shims import set_random_state +from sklearn import clone +import numpy as np +from numpy.testing import assert_array_equal + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_predict_only_one_or_minus_one(estimator, build_dataset, + with_preprocessor): + """Test that all predicted values are either +1 or -1""" + input_data, _, preprocessor, _ = 
build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + triplets_train, triplets_test = train_test_split(input_data) + estimator.fit(triplets_train) + predictions = estimator.predict(triplets_test) + + not_valid = [e for e in predictions if e not in [-1, 1]] + assert len(not_valid) == 0 + + +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_no_zero_prediction(estimator, build_dataset): + """ + Test that all predicted values are not zero, even when the + distance d(x,y) and d(x,z) is the same for a triplet of the + form (x, y, z). i.e border cases. + """ + triplets, _, _, X = build_dataset(with_preprocessor=False) + # Force 3 dimentions only, to use cross product and get easy orthogonal vec. + triplets = np.array([[t[0][:3], t[1][:3], t[2][:3]] for t in triplets]) + X = X[:, :3] + # Dummy fit + estimator = clone(estimator) + set_random_state(estimator) + estimator.fit(triplets) + # We force the transformation to be identity, to force euclidean distance + estimator.components_ = np.eye(X.shape[1]) + + # Get two orthogonal vectors in respect to X[1] + k = X[1] / np.linalg.norm(X[1]) # Normalize first vector + x = X[2] - X[2].dot(k) * k # Get random orthogonal vector + x /= np.linalg.norm(x) # Normalize + y = np.cross(k, x) # Get orthogonal vector to x + # Assert these orthogonal vectors are different + with pytest.raises(AssertionError): + assert_array_equal(X[1], x) + with pytest.raises(AssertionError): + assert_array_equal(X[1], y) + # Assert the distance is the same for both + assert estimator.get_metric()(X[1], x) == estimator.get_metric()(X[1], y) + + # Form the three scenarios where predict() gives 0 with numpy.sign + triplets_test = np.array( # Critical examples + [[X[0], X[2], X[2]], + [X[1], X[1], X[1]], + [X[1], x, y]]) + # Predict + predictions = estimator.predict(triplets_test) + # Check there are no zero 
values + assert np.sum(predictions == 0) == 0 + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, + with_preprocessor): + """Test that a NotFittedError is raised if someone tries to predict and + the metric learner has not been fitted.""" + input_data, _, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + with pytest.raises(NotFittedError): + estimator.predict(input_data) + + +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_accuracy_toy_example(estimator, build_dataset): + """Test that the default scoring for triplets (accuracy) works on some + toy example""" + triplets, _, _, X = build_dataset(with_preprocessor=False) + estimator = clone(estimator) + set_random_state(estimator) + estimator.fit(triplets) + # We take the two first points and we build 4 regularly spaced points on the + # line they define, so that it's easy to build triplets of different + # similarities. 
+ X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4 + + triplets_test = np.array( + [[X_test[0], X_test[2], X_test[1]], + [X_test[1], X_test[3], X_test[0]], + [X_test[1], X_test[2], X_test[3]], + [X_test[3], X_test[0], X_test[2]]]) + # we force the transformation to be identity so that we control what it does + estimator.components_ = np.eye(X.shape[1]) + assert estimator.score(triplets_test) == 0.25 + + +def test_raise_big_number_of_features(): + triplets, _, _, X = build_triplets(with_preprocessor=False) + triplets = triplets[:3, :, :] + estimator = SCML(n_basis=320) + set_random_state(estimator) + with pytest.raises(ValueError) as exc_info: + estimator.fit(triplets) + assert exc_info.value.args[0] == \ + "Number of features (4) is greater than the number of triplets(3)." \ + "\nConsider using dimensionality reduction or using another basis " \ + "generation scheme." diff --git a/test/test_utils.py b/test/test_utils.py new file mode 100644 index 00000000..c0383792 --- /dev/null +++ b/test/test_utils.py @@ -0,0 +1,1273 @@ +import warnings +import pytest +from scipy.linalg import eigh, pinvh +from collections import namedtuple +import numpy as np +from numpy.testing import assert_array_equal, assert_equal +from sklearn.model_selection import train_test_split +from sklearn.utils import check_random_state, shuffle +from metric_learn.sklearn_shims import set_random_state +from sklearn.base import clone +from metric_learn._util import (check_input, make_context, preprocess_tuples, + make_name, preprocess_points, + check_collapsed_pairs, validate_vector, + _check_sdp_from_eigen, _check_n_components, + check_y_valid_values_for_pairs, + _auto_select_init, _pseudo_inverse_from_eig) +from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, + LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, + MMC_Supervised, RCA_Supervised, SDML_Supervised, + SCML, SCML_Supervised, Constraints) +from metric_learn.base_metric import (ArrayIndexer, 
MahalanobisMixin, + _PairsClassifierMixin, + _TripletsClassifierMixin, + _QuadrupletsClassifierMixin) +from metric_learn.exceptions import PreprocessorError, NonPSDError +from sklearn.datasets import make_regression, make_blobs, load_iris + + +SEED = 42 +RNG = check_random_state(SEED) + +Dataset = namedtuple('Dataset', ('data target preprocessor to_transform')) +# Data and target are what we will fit on. Preprocessor is the additional +# data if we use a preprocessor (which should be the default ArrayIndexer), +# and to_transform is some additional data that we would want to transform + + +def build_classification(with_preprocessor=False): + """Basic array for testing when using a preprocessor""" + X, y = shuffle(*make_blobs(random_state=SEED), + random_state=SEED) + indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int) + if with_preprocessor: + return Dataset(indices, y[indices], X, indices) + else: + return Dataset(X[indices], y[indices], None, X[indices]) + + +def build_regression(with_preprocessor=False): + """Basic array for testing when using a preprocessor""" + X, y = shuffle(*make_regression(n_samples=100, n_features=5, + random_state=SEED), + random_state=SEED) + indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int) + if with_preprocessor: + return Dataset(indices, y[indices], X, indices) + else: + return Dataset(X[indices], y[indices], None, X[indices]) + + +def build_data(): + input_data, labels = load_iris(return_X_y=True) + X, y = shuffle(input_data, labels, random_state=SEED) + n_constraints = 50 + constraints = Constraints(y) + pairs = ( + constraints + .positive_negative_pairs(n_constraints, same_length=True, + random_state=check_random_state(SEED))) + return X, pairs + + +def build_pairs(with_preprocessor=False): + # builds a toy pairs problem + X, indices = build_data() + c = np.vstack([np.column_stack(indices[:2]), np.column_stack(indices[2:])]) + target = np.concatenate([np.ones(indices[0].shape[0]), + - 
np.ones(indices[0].shape[0])]) + c, target = shuffle(c, target, random_state=SEED) + if with_preprocessor: + # if preprocessor, we build a 2D array of pairs of indices + return Dataset(c, target, X, c[:, 0]) + else: + # if not, we build a 3D array of pairs of samples + return Dataset(X[c], target, None, X[c[:, 0]]) + + +def build_triplets(with_preprocessor=False): + input_data, labels = load_iris(return_X_y=True) + X, y = shuffle(input_data, labels, random_state=SEED) + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, k_genuine=3, k_impostor=4) + if with_preprocessor: + # if preprocessor, we build a 2D array of triplets of indices + return Dataset(triplets, np.ones(len(triplets)), X, np.arange(len(X))) + else: + # if not, we build a 3D array of triplets of samples + return Dataset(X[triplets], np.ones(len(triplets)), None, X) + + +def build_quadruplets(with_preprocessor=False): + # builds a toy quadruplets problem + X, indices = build_data() + c = np.column_stack(indices) + target = np.ones(c.shape[0]) # quadruplets targets are not used + # anyways + c, target = shuffle(c, target, random_state=SEED) + if with_preprocessor: + # if preprocessor, we build a 2D array of quadruplets of indices + return Dataset(c, target, X, c[:, 0]) + else: + # if not, we build a 3D array of quadruplets of samples + return Dataset(X[c], target, None, X[c[:, 0]]) + + +quadruplets_learners = [(LSML(), build_quadruplets)] +ids_quadruplets_learners = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + quadruplets_learners])) + +triplets_learners = [(SCML(n_basis=320), build_triplets)] +ids_triplets_learners = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + triplets_learners])) + +pairs_learners = [(ITML(max_iter=2), build_pairs), # max_iter=2 to be faster + (MMC(max_iter=2), build_pairs), # max_iter=2 to be faster + (SDML(prior='identity', balance_param=1e-5), build_pairs)] +ids_pairs_learners = list(map(lambda x: 
x.__class__.__name__, + [learner for (learner, _) in + pairs_learners])) + +classifiers = [(Covariance(), build_classification), + (LFDA(), build_classification), + (LMNN(), build_classification), + (NCA(), build_classification), + (RCA(), build_classification), + (ITML_Supervised(max_iter=5), build_classification), + (LSML_Supervised(), build_classification), + (MMC_Supervised(max_iter=5), build_classification), + (RCA_Supervised(n_chunks=5), build_classification), + (SDML_Supervised(prior='identity', balance_param=1e-5), + build_classification), + (SCML_Supervised(n_basis=80), build_classification)] +ids_classifiers = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + classifiers])) + +regressors = [(MLKR(init='pca'), build_regression)] +ids_regressors = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in regressors])) + +WeaklySupervisedClasses = (_PairsClassifierMixin, + _TripletsClassifierMixin, + _QuadrupletsClassifierMixin) + +tuples_learners = pairs_learners + triplets_learners + quadruplets_learners +ids_tuples_learners = ids_pairs_learners + ids_triplets_learners \ + + ids_quadruplets_learners + +supervised_learners = classifiers + regressors +ids_supervised_learners = ids_classifiers + ids_regressors + +metric_learners = tuples_learners + supervised_learners +ids_metric_learners = ids_tuples_learners + ids_supervised_learners + +metric_learners_pipeline = pairs_learners + supervised_learners +ids_metric_learners_pipeline = ids_pairs_learners + ids_supervised_learners + + +def remove_y(estimator, X, y): + """Quadruplets and triplets learners have no y in fit, but to write test for + all estimators, it is convenient to have this function, that will return X + and y if the estimator needs a y to fit on, and just X otherwise.""" + no_y_fit = quadruplets_learners + triplets_learners + if estimator.__class__.__name__ in [e.__class__.__name__ + for (e, _) in no_y_fit]: + return (X,) + else: + return (X, y) + + +def 
mock_preprocessor(indices): + """A preprocessor for testing purposes that returns an all ones 3D array + """ + return np.ones((indices.shape[0], 3)) + + +@pytest.mark.parametrize('type_of_inputs', ['other', 'tuple', 'classics', 2, + int, NCA()]) +def test_check_input_invalid_type_of_inputs(type_of_inputs): + """Tests that an invalid type of inputs in check_inputs raises an error.""" + with pytest.raises(ValueError) as e: + check_input([[0.2, 2.1], [0.2, .8]], type_of_inputs=type_of_inputs) + msg = ("Unknown value {} for type_of_inputs. Valid values are " + "'classic' or 'tuples'.".format(type_of_inputs)) + assert str(e.value) == msg + + +# ---------------- test check_input with 'tuples' type_of_input' ------------ + + +def tuples_prep(): + """Basic array for testing when using a preprocessor""" + tuples = np.array([[1, 2], + [2, 3]]) + return tuples + + +def tuples_no_prep(): + """Basic array for testing when using no preprocessor""" + tuples = np.array([[[1., 2.3], [2.3, 5.3]], + [[2.3, 4.3], [0.2, 0.4]]]) + return tuples + + +@pytest.mark.parametrize('estimator, expected', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +def test_make_context(estimator, expected): + """test the make_name function""" + assert make_context(estimator) == expected + + +@pytest.mark.parametrize('estimator, expected', + [(NCA(), "NCA"), ('NCA', "NCA"), (None, None)]) +def test_make_name(estimator, expected): + """test the make_name function""" + assert make_name(estimator) == expected + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('load_tuples, preprocessor', + [(tuples_prep, mock_preprocessor), + (tuples_no_prep, None), + (tuples_no_prep, mock_preprocessor)]) +def test_check_tuples_invalid_tuple_size(estimator, context, load_tuples, + preprocessor): + """Checks that the exception are raised if tuple_size is not the one + expected""" + tuples = load_tuples() + preprocessed_tuples = 
(preprocess_tuples(tuples, preprocessor) + if (preprocessor is not None and + tuples.ndim == 2) else tuples) + expected_msg = ("Tuples of 3 element(s) expected{}. Got tuples of 2 " + "element(s) instead (shape={}):\ninput={}.\n" + .format(context, preprocessed_tuples.shape, + preprocessed_tuples)) + with pytest.raises(ValueError) as raised_error: + check_input(tuples, type_of_inputs='tuples', tuple_size=3, + preprocessor=preprocessor, estimator=estimator) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('tuples, found, expected, preprocessor', + [(5, '0', '2D array of indicators or 3D array of ' + 'formed tuples', mock_preprocessor), + (5, '0', '3D array of formed tuples', None), + ([1, 2], '1', '2D array of indicators or 3D array ' + 'of formed tuples', mock_preprocessor), + ([1, 2], '1', '3D array of formed tuples', None), + ([[[[5]]]], '4', '2D array of indicators or 3D array' + ' of formed tuples', + mock_preprocessor), + ([[[[5]]]], '4', '3D array of formed tuples', None), + ([[1], [3]], '2', '3D array of formed ' + 'tuples', None)]) +def test_check_tuples_invalid_shape(estimator, context, tuples, found, + expected, preprocessor): + """Checks that a value error with the appropriate message is raised if + shape is invalid (not 2D with preprocessor or 3D with no preprocessor) + """ + tuples = np.array(tuples) + msg = ("{} expected{}{}. Found {}D array instead:\ninput={}. 
Reshape your " + "data{}.\n" + .format(expected, context, ' when using a preprocessor' + if preprocessor else '', found, tuples, + ' and/or use a preprocessor' if + (not preprocessor and tuples.ndim == 2) else '')) + with pytest.raises(ValueError) as raised_error: + check_input(tuples, type_of_inputs='tuples', + preprocessor=preprocessor, ensure_min_samples=0, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +def test_check_tuples_invalid_n_features(estimator, context): + """Checks that the right warning is printed if not enough features + Here we only test if no preprocessor (otherwise we don't ensure this) + """ + msg = ("Found array with 2 feature(s) (shape={}) while" + " a minimum of 3 is required{}.".format(tuples_no_prep().shape, + context)) + with pytest.raises(ValueError) as raised_error: + check_input(tuples_no_prep(), type_of_inputs='tuples', + preprocessor=None, ensure_min_features=3, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('load_tuples, preprocessor', + [(tuples_prep, mock_preprocessor), + (tuples_no_prep, None), + (tuples_no_prep, mock_preprocessor)]) +def test_check_tuples_invalid_n_samples(estimator, context, load_tuples, + preprocessor): + """Checks that the right warning is printed if n_samples is too small""" + tuples = load_tuples() + msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " + "is required{}.".format((preprocess_tuples(tuples, preprocessor) + if (preprocessor is not None and + tuples.ndim == 2) else tuples).shape, + context)) + with pytest.raises(ValueError) as raised_error: + check_input(tuples, type_of_inputs='tuples', + preprocessor=preprocessor, + ensure_min_samples=3, estimator=estimator) + assert str(raised_error.value) == msg + + +def 
test_check_tuples_invalid_dtype_not_convertible_with_preprocessor(): + """Checks that a value error is thrown if attempting to convert an + input not convertible to float, when using a preprocessor + """ + + def preprocessor(indices): + # preprocessor that returns objects + return np.full((indices.shape[0], 3), 'a') + + with pytest.raises(ValueError): + check_input(tuples_prep(), type_of_inputs='tuples', + preprocessor=preprocessor, dtype=np.float64) + + +def test_check_tuples_invalid_dtype_not_convertible_without_preprocessor(): + """Checks that a value error is thrown if attempting to convert an + input not convertible to float, when using no preprocessor + """ + tuples = np.full_like(tuples_no_prep(), 'a', dtype=object) + with pytest.raises(ValueError): + check_input(tuples, type_of_inputs='tuples', + preprocessor=None, dtype=np.float64) + + +@pytest.mark.parametrize('tuple_size', [2, None]) +def test_check_tuples_valid_tuple_size(tuple_size): + """For inputs that have the right matrix dimension (2D or 3D for instance), + checks that checking the number of tuples (pairs, quadruplets, etc) raises + no warning if there is the right number of points in a tuple. 
+ """ + with warnings.catch_warnings(record=True) as record: + check_input(tuples_prep(), type_of_inputs='tuples', + preprocessor=mock_preprocessor, tuple_size=tuple_size) + check_input(tuples_no_prep(), type_of_inputs='tuples', preprocessor=None, + tuple_size=tuple_size) + assert len(record) == 0 + + +@pytest.mark.parametrize('tuples', + [np.array([[2.5, 0.1, 2.6], + [1.6, 4.8, 9.1]]), + np.array([[2, 0, 2], + [1, 4, 9]]), + np.array([["img1.png", "img3.png"], + ["img2.png", "img4.png"]]), + [[2, 0, 2], + [1, 4, 9]], + [np.array([2, 0, 2]), + np.array([1, 4, 9])], + ((2, 0, 2), + (1, 4, 9)), + np.array([[[1.2, 2.2], [1.4, 3.3]], + [[2.6, 2.3], [3.4, 5.0]]])]) +def test_check_tuples_valid_with_preprocessor(tuples): + """Test that valid inputs when using a preprocessor raises no warning""" + with warnings.catch_warnings(record=True) as record: + check_input(tuples, type_of_inputs='tuples', + preprocessor=mock_preprocessor) + assert len(record) == 0 + + +@pytest.mark.parametrize('tuples', + [np.array([[[2.5], [0.1], [2.6]], + [[1.6], [4.8], [9.1]], + [[5.6], [2.8], [6.1]]]), + np.array([[[2], [0], [2]], + [[1], [4], [9]], + [[1], [5], [3]]]), + [[[2], [0], [2]], + [[1], [4], [9]], + [[3], [4], [29]]], + (((2, 1), (0, 2), (2, 3)), + ((1, 2), (4, 4), (9, 3)), + ((3, 1), (4, 4), (29, 4)))]) +def test_check_tuples_valid_without_preprocessor(tuples): + """Test that valid inputs when using no preprocessor raises no warning""" + with warnings.catch_warnings(record=True) as record: + check_input(tuples, type_of_inputs='tuples', preprocessor=None) + assert len(record) == 0 + + +def test_check_tuples_behaviour_auto_dtype(): + """Checks that check_tuples allows by default every type if using a + preprocessor, and numeric types if using no preprocessor""" + tuples_prep = [['img1.png', 'img2.png'], ['img3.png', 'img5.png']] + with warnings.catch_warnings(record=True) as record: + check_input(tuples_prep, type_of_inputs='tuples', + preprocessor=mock_preprocessor) + assert 
len(record) == 0 + + with warnings.catch_warnings(record=True) as record: + check_input(tuples_no_prep(), type_of_inputs='tuples') # numeric type + assert len(record) == 0 + + # not numeric type + tuples_no_prep_bis = np.array([[['img1.png'], ['img2.png']], + [['img3.png'], ['img5.png']]]) + tuples_no_prep_bis = tuples_no_prep_bis.astype(object) + with pytest.raises(ValueError): + check_input(tuples_no_prep_bis, type_of_inputs='tuples') + + +def test_check_tuples_invalid_complex_data(): + """Checks that the right error message is thrown if given complex data ( + this comes from sklearn's check_array's message)""" + tuples = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], + [[1 + 3j, 2 + 4j], [5 + 8j, 1 + 7j]]]) + msg = ("Complex data not supported\n" + "{}\n".format(tuples)) + with pytest.raises(ValueError) as raised_error: + check_input(tuples, type_of_inputs='tuples') + assert str(raised_error.value) == msg + + +# ------------- test check_input with 'classic' type_of_inputs ---------------- + + +def points_prep(): + """Basic array for testing when using a preprocessor""" + points = np.array([1, 2]) + return points + + +def points_no_prep(): + """Basic array for testing when using no preprocessor""" + points = np.array([[1., 2.3], + [2.3, 4.3]]) + return points + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('points, found, expected, preprocessor', + [(5, '0', '1D array of indicators or 2D array of ' + 'formed points', mock_preprocessor), + (5, '0', '2D array of formed points', None), + ([1, 2], '1', '2D array of formed points', None), + ([[[5]]], '3', '1D array of indicators or 2D ' + 'array of formed points', + mock_preprocessor), + ([[[5]]], '3', '2D array of formed points', None)]) +def test_check_classic_invalid_shape(estimator, context, points, found, + expected, preprocessor): + """Checks that a value error with the appropriate message is raised if + shape is invalid 
(valid being 1D or 2D with preprocessor or 2D with no + preprocessor) + """ + points = np.array(points) + msg = ("{} expected{}{}. Found {}D array instead:\ninput={}. Reshape your " + "data{}.\n" + .format(expected, context, ' when using a preprocessor' + if preprocessor else '', found, points, + ' and/or use a preprocessor' if + (not preprocessor and points.ndim == 1) else '')) + with pytest.raises(ValueError) as raised_error: + check_input(points, type_of_inputs='classic', preprocessor=preprocessor, + ensure_min_samples=0, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +def test_check_classic_invalid_n_features(estimator, context): + """Checks that the right warning is printed if not enough features + Here we only test if no preprocessor (otherwise we don't ensure this) + """ + msg = ("Found array with 2 feature(s) (shape={}) while" + " a minimum of 3 is required{}.".format(points_no_prep().shape, + context)) + with pytest.raises(ValueError) as raised_error: + check_input(points_no_prep(), type_of_inputs='classic', + preprocessor=None, ensure_min_features=3, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('load_points, preprocessor', + [(points_prep, mock_preprocessor), + (points_no_prep, None), + (points_no_prep, mock_preprocessor)]) +def test_check_classic_invalid_n_samples(estimator, context, load_points, + preprocessor): + """Checks that the right warning is printed if n_samples is too small""" + points = load_points() + msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " + "is required{}.".format((preprocess_points(points, + preprocessor) + if preprocessor is not None and + points.ndim == 1 else + points).shape, + context)) + with pytest.raises(ValueError) as 
raised_error: + check_input(points, type_of_inputs='classic', preprocessor=preprocessor, + ensure_min_samples=3, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('preprocessor, points', + [(mock_preprocessor, np.array([['a', 'b'], + ['e', 'b']])), + (None, np.array([[['b', 'v'], ['a', 'd']], + [['x', 'u'], ['c', 'a']]]))]) +def test_check_classic_invalid_dtype_not_convertible(preprocessor, points): + """Checks that a value error is thrown if attempting to convert an + input not convertible to float + """ + with pytest.raises(ValueError): + check_input(points, type_of_inputs='classic', + preprocessor=preprocessor, dtype=np.float64) + + +@pytest.mark.parametrize('points', + [["img1.png", "img3.png", "img2.png"], + np.array(["img1.png", "img3.png", "img2.png"]), + [2, 0, 2, 1, 4, 9], + range(10), + np.array([2, 0, 2]), + (2, 0, 2), + np.array([[1.2, 2.2], + [2.6, 2.3]])]) +def test_check_classic_valid_with_preprocessor(points): + """Test that valid inputs when using a preprocessor raises no warning""" + with warnings.catch_warnings(record=True) as record: + check_input(points, type_of_inputs='classic', + preprocessor=mock_preprocessor) + assert len(record) == 0 + + +@pytest.mark.parametrize('points', + [np.array([[2.5, 0.1, 2.6], + [1.6, 4.8, 9.1], + [5.6, 2.8, 6.1]]), + np.array([[2, 0, 2], + [1, 4, 9], + [1, 5, 3]]), + [[2, 0, 2], + [1, 4, 9], + [3, 4, 29]], + ((2, 1, 0, 2, 2, 3), + (1, 2, 4, 4, 9, 3), + (3, 1, 4, 4, 29, 4))]) +def test_check_classic_valid_without_preprocessor(points): + """Test that valid inputs when using no preprocessor raises no warning""" + with warnings.catch_warnings(record=True) as record: + check_input(points, type_of_inputs='classic', preprocessor=None) + assert len(record) == 0 + + +def test_check_classic_by_default(): + """Checks that 'classic' is the default behaviour of check_input""" + assert (check_input([[2, 3], [3, 2]]) == + check_input([[2, 3], [3, 2]], type_of_inputs='classic')).all() + 
+ +def test_check_classic_behaviour_auto_dtype(): + """Checks that check_input (for points) allows by default every type if + using a preprocessor, and numeric types if using no preprocessor""" + points_prep = ['img1.png', 'img2.png', 'img3.png', 'img5.png'] + with warnings.catch_warnings(record=True) as record: + check_input(points_prep, type_of_inputs='classic', + preprocessor=mock_preprocessor) + assert len(record) == 0 + + with warnings.catch_warnings(record=True) as record: + check_input(points_no_prep(), type_of_inputs='classic') # numeric type + assert len(record) == 0 + + # not numeric type + points_no_prep_bis = np.array(['img1.png', 'img2.png', 'img3.png', + 'img5.png']) + points_no_prep_bis = points_no_prep_bis.astype(object) + with pytest.raises(ValueError): + check_input(points_no_prep_bis, type_of_inputs='classic') + + +def test_check_classic_invalid_complex_data(): + """Checks that the right error message is thrown if given complex data ( + this comes from sklearn's check_array's message)""" + points = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], + [[1 + 3j, 2 + 4j], [5 + 8j, 1 + 7j]]]) + msg = ("Complex data not supported\n" + "{}\n".format(points)) + with pytest.raises(ValueError) as raised_error: + check_input(points, type_of_inputs='classic') + assert str(raised_error.value) == msg + + +# ----------------------------- Test preprocessor ----------------------------- + + +X = np.array([[0.89, 0.11, 1.48, 0.12], + [2.63, 1.08, 1.68, 0.46], + [1.00, 0.59, 0.62, 1.15]]) + + +class MockFileLoader: + """Preprocessor that takes a root file path at construction and simulates + fetching the file in the specific root folder when given the name of the + file""" + + def __init__(self, root): + self.root = root + self.folders = {'fake_root': {'img0.png': X[0], + 'img1.png': X[1], + 'img2.png': X[2] + }, + 'other_folder': {} # empty folder + } + + def __call__(self, path_list): + images = list() + for path in path_list: + 
images.append(self.folders[self.root][path]) + return np.array(images) + + +def mock_id_loader(list_of_indicators): + """A preprocessor as a function that takes indicators (strings) and + returns the corresponding samples""" + points = [] + for indicator in list_of_indicators: + points.append(X[int(indicator[2:])]) + return np.array(points) + + +tuples_list = [np.array([[0, 1], + [2, 1]]), + + np.array([['img0.png', 'img1.png'], + ['img2.png', 'img1.png']]), + + np.array([['id0', 'id1'], + ['id2', 'id1']]) + ] + +points_list = [np.array([0, 1, 2, 1]), + + np.array(['img0.png', 'img1.png', 'img2.png', 'img1.png']), + + np.array(['id0', 'id1', 'id2', 'id1']) + ] + +preprocessors = [X, MockFileLoader('fake_root'), mock_id_loader] + + +@pytest.fixture +def y_tuples(): + y = [-1, 1] + return y + + +@pytest.fixture +def y_points(): + y = [0, 1, 0, 0] + return y + + +@pytest.mark.parametrize('preprocessor, tuples', zip(preprocessors, + tuples_list)) +def test_preprocessor_weakly_supervised(preprocessor, tuples, y_tuples): + """Tests different ways to use the preprocessor argument: an array, + a class callable, and a function callable, with a weakly supervised + algorithm + """ + nca = ITML(preprocessor=preprocessor) + nca.fit(tuples, y_tuples) + + +@pytest.mark.parametrize('preprocessor, points', zip(preprocessors, + points_list)) +def test_preprocessor_supervised(preprocessor, points, y_points): + """Tests different ways to use the preprocessor argument: an array, + a class callable, and a function callable, with a supervised algorithm + """ + lfda = LFDA(preprocessor=preprocessor) + lfda.fit(points, y_points) + + +@pytest.mark.parametrize('estimator', ['NCA', NCA(), None]) +def test_preprocess_tuples_invalid_message(estimator): + """Checks that if the preprocessor does some weird stuff, the preprocessed + input is detected as weird. 
Checks this for preprocess_tuples.""" + + context = make_context(estimator) + (' after the preprocessor ' + 'has been applied') + + def preprocessor(sequence): + return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D + + with pytest.raises(ValueError) as raised_error: + check_input(np.ones((3, 2)), type_of_inputs='tuples', + preprocessor=preprocessor, estimator=estimator) + expected_msg = ("3D array of formed tuples expected{}. Found 4D " + "array instead:\ninput={}. Reshape your data{}.\n" + .format(context, np.ones((3, 2, 2, 2)), + ' and/or use a preprocessor' if preprocessor + is not None else '')) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator', ['NCA', NCA(), None]) +def test_preprocess_points_invalid_message(estimator): + """Checks that if the preprocessor does some weird stuff, the preprocessed + input is detected as weird.""" + + context = make_context(estimator) + (' after the preprocessor ' + 'has been applied') + + def preprocessor(sequence): + return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D + + with pytest.raises(ValueError) as raised_error: + check_input(np.ones((3,)), type_of_inputs='classic', + preprocessor=preprocessor, estimator=estimator) + expected_msg = ("2D array of formed points expected{}. " + "Found 3D array instead:\ninput={}. 
Reshape your data{}.\n" + .format(context, np.ones((3, 2, 2)), + ' and/or use a preprocessor' if preprocessor + is not None else '')) + assert str(raised_error.value) == expected_msg + + +def test_preprocessor_error_message(): + """Tests whether the preprocessor returns a preprocessor error when there + is a problem using the preprocessor + """ + preprocessor = ArrayIndexer(np.array([[1.2, 3.3], [3.1, 3.2]])) + + # with tuples + X = np.array([[[2, 3], [3, 3]], [[2, 3], [3, 2]]]) + # There are less samples than the max index we want to preprocess + with pytest.raises(PreprocessorError): + preprocess_tuples(X, preprocessor) + + # with points + X = np.array([[1], [2], [3], [3]]) + with pytest.raises(PreprocessorError): + preprocess_points(X, preprocessor) + + +@pytest.mark.parametrize('input_data', [[[5, 3], [3, 2]], + ((5, 3), (3, 2)) + ]) +@pytest.mark.parametrize('indices', [[0, 1], (1, 0)]) +def test_array_like_indexer_array_like_valid_classic(input_data, indices): + """Checks that any array-like is valid in the 'preprocessor' argument, + and in the indices, for a classic input""" + class MockMetricLearner(MahalanobisMixin): + def fit(self): + pass + pass + + mock_algo = MockMetricLearner(preprocessor=input_data) + mock_algo._prepare_inputs(indices, type_of_inputs='classic') + + +@pytest.mark.parametrize('input_data', [[[5, 3], [3, 2]], + ((5, 3), (3, 2)) + ]) +@pytest.mark.parametrize('indices', [[[0, 1], [1, 0]], ((1, 0), (1, 0))]) +def test_array_like_indexer_array_like_valid_tuples(input_data, indices): + """Checks that any array-like is valid in the 'preprocessor' argument, + and in the indices, for a classic input""" + class MockMetricLearner(MahalanobisMixin): + def fit(self): + pass + pass + + mock_algo = MockMetricLearner(preprocessor=input_data) + mock_algo._prepare_inputs(indices, type_of_inputs='tuples') + + +@pytest.mark.parametrize('preprocessor', [4, NCA()]) +def test_error_message_check_preprocessor(preprocessor): + """Checks that if the 
preprocessor given is not an array-like or a + callable, the right error message is returned""" + class MockMetricLearner(MahalanobisMixin): + pass + + mock_algo = MockMetricLearner(preprocessor=preprocessor) + with pytest.raises(ValueError) as e: + mock_algo._check_preprocessor() + assert str(e.value) == ("Invalid type for the preprocessor: {}. You should " + "provide either None, an array-like object, " + "or a callable.".format(type(preprocessor))) + + +@pytest.mark.parametrize('estimator, _', tuples_learners, + ids=ids_tuples_learners) +def test_error_message_tuple_size(estimator, _): + """Tests that if a tuples learner is not given the good number of points + per tuple, it throws an error message""" + estimator = clone(estimator) + set_random_state(estimator) + invalid_pairs = np.ones((2, 5, 2)) + y = [1, 1] + with pytest.raises(ValueError) as raised_err: + estimator.fit(*remove_y(estimator, invalid_pairs, y)) + expected_msg = ("Tuples of {} element(s) expected{}. Got tuples of 5 " + "element(s) instead (shape=(2, 5, 2)):\ninput={}.\n" + .format(estimator._tuple_size, make_context(estimator), + invalid_pairs)) + assert str(raised_err.value) == expected_msg + + +@pytest.mark.parametrize('estimator, _', metric_learners, + ids=ids_metric_learners) +def test_error_message_t_pair_distance_or_score(estimator, _): + """Tests that if you want to pair_distance or pair_score on triplets + for instance, it returns the right error message + """ + estimator = clone(estimator) + set_random_state(estimator) + estimator._check_preprocessor() + triplets = np.array([[[1.3, 6.3], [3., 6.8], [6.5, 4.4]], + [[1.9, 5.3], [1., 7.8], [3.2, 1.2]]]) + with pytest.raises(ValueError) as raised_err: + estimator.pair_score(triplets) + expected_msg = ("Tuples of 2 element(s) expected{}. 
Got tuples of 3 " + "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" + .format(make_context(estimator), triplets)) + assert str(raised_err.value) == expected_msg + + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have pair_distance" + + # One exception will trigger for sure + with pytest.raises(Exception) as raised_exception: + estimator.pair_distance(triplets) + err_value = raised_exception.value.args[0] + assert err_value == expected_msg or err_value == not_implemented_msg + + +def test_preprocess_tuples_simple_example(): + """Test the preprocessor on a very simple example of tuples to ensure the + result is as expected""" + array = np.array([[1, 2], + [2, 3], + [4, 5]]) + + def fun(row): + return np.array([[1, 1], [3, 3], [4, 4]]) + + expected_result = np.array([[[1, 1], [1, 1]], + [[3, 3], [3, 3]], + [[4, 4], [4, 4]]]) + + assert (preprocess_tuples(array, fun) == expected_result).all() + + +def test_preprocess_points_simple_example(): + """Test the preprocessor on very simple examples of points to ensure the + result is as expected""" + array = np.array([1, 2, 4]) + + def fun(row): + return [[1, 1], [3, 3], [4, 4]] + + expected_result = np.array([[1, 1], + [3, 3], + [4, 4]]) + + assert (preprocess_points(array, fun) == expected_result).all() + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_same_with_or_without_preprocessor(estimator, build_dataset): + """Test that algorithms using a preprocessor behave consistently +# with their no-preprocessor equivalent + """ + dataset_indices = build_dataset(with_preprocessor=True) + dataset_formed = build_dataset(with_preprocessor=False) + X = dataset_indices.preprocessor + indicators_to_transform = dataset_indices.to_transform + formed_points_to_transform = dataset_formed.to_transform + (indices_train, indices_test, y_train, y_test, formed_train, + formed_test) = 
train_test_split(dataset_indices.data, + dataset_indices.target, + dataset_formed.data, + random_state=SEED) + + estimator_with_preprocessor = clone(estimator) + set_random_state(estimator_with_preprocessor) + estimator_with_preprocessor.set_params(preprocessor=X) + estimator_with_preprocessor.fit(*remove_y(estimator, indices_train, y_train)) + + estimator_without_preprocessor = clone(estimator) + set_random_state(estimator_without_preprocessor) + estimator_without_preprocessor.set_params(preprocessor=None) + estimator_without_preprocessor.fit(*remove_y(estimator, formed_train, + y_train)) + + estimator_with_prep_formed = clone(estimator) + set_random_state(estimator_with_prep_formed) + estimator_with_prep_formed.set_params(preprocessor=X) + estimator_with_prep_formed.fit(*remove_y(estimator, indices_train, y_train)) + + # test prediction methods + for method in ["predict", "decision_function"]: + if hasattr(estimator, method): + output_with_prep = getattr(estimator_with_preprocessor, + method)(indices_test) + output_without_prep = getattr(estimator_without_preprocessor, + method)(formed_test) + assert np.array(output_with_prep == output_without_prep).all() + output_with_prep = getattr(estimator_with_preprocessor, + method)(indices_test) + output_with_prep_formed = getattr(estimator_with_prep_formed, + method)(formed_test) + assert np.array(output_with_prep == output_with_prep_formed).all() + + # Test pair_score, all learners have it. 
+ idx1 = np.array([[0, 2], [5, 3]], dtype=int) + output_with_prep = estimator_with_preprocessor.pair_score( + indicators_to_transform[idx1]) + output_without_prep = estimator_without_preprocessor.pair_score( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.pair_score( + indicators_to_transform[idx1]) + output_without_prep = estimator_with_prep_formed.pair_score( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + # Test pair_distance + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have pair_distance" + try: + output_with_prep = estimator_with_preprocessor.pair_distance( + indicators_to_transform[idx1]) + output_without_prep = estimator_without_preprocessor.pair_distance( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.pair_distance( + indicators_to_transform[idx1]) + output_without_prep = estimator_with_prep_formed.pair_distance( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + except Exception as raised_exception: + assert raised_exception.value.args[0] == not_implemented_msg + + # Test transform + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have transform" + try: + output_with_prep = estimator_with_preprocessor.transform( + indicators_to_transform) + output_without_prep = estimator_without_preprocessor.transform( + formed_points_to_transform) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.transform( + indicators_to_transform) + output_without_prep = estimator_with_prep_formed.transform( + formed_points_to_transform) + assert 
np.array(output_with_prep == output_without_prep).all() + + except Exception as raised_exception: + assert raised_exception.value.args[0] == not_implemented_msg + + +def test_check_collapsed_pairs_raises_no_error(): + """Checks that check_collapsed_pairs raises no error if no collapsed pairs + is present""" + pairs_ok = np.array([[[0.1, 3.3], [3.3, 0.1]], + [[0.1, 3.3], [3.3, 0.1]], + [[2.5, 8.1], [0.1, 3.3]]]) + check_collapsed_pairs(pairs_ok) + + +def test_check_collapsed_pairs_raises_error(): + """Checks that check_collapsed_pairs raises no error if no collapsed pairs + is present""" + pairs_not_ok = np.array([[[0.1, 3.3], [0.1, 3.3]], + [[0.1, 3.3], [3.3, 0.1]], + [[2.5, 8.1], [2.5, 8.1]]]) + with pytest.raises(ValueError) as e: + check_collapsed_pairs(pairs_not_ok) + assert str(e.value) == ("2 collapsed pairs found (where the left element is " + "the same as the right element), out of 3 pairs in" + " total.") + + +def test__validate_vector(): + """Replica of scipy.spatial.tests.test_distance.test__validate_vector""" + x = [1, 2, 3] + y = validate_vector(x) + assert_array_equal(y, x) + + y = validate_vector(x, dtype=np.float64) + assert_array_equal(y, x) + assert_equal(y.dtype, np.float64) + + x = [1] + y = validate_vector(x) + assert_equal(y.ndim, 1) + assert_equal(y, x) + + x = 1 + y = validate_vector(x) + assert_equal(y.ndim, 1) + assert_equal(y, [x]) + + x = np.arange(5).reshape(1, -1, 1) + y = validate_vector(x) + assert_equal(y.ndim, 1) + assert_array_equal(y, x[0, :, 0]) + + x = [[1, 2], [3, 4]] + with pytest.raises(ValueError): + validate_vector(x) + + +def test__check_sdp_from_eigen_positive_err_messages(): + """Tests that if _check_sdp_from_eigen is given a negative tol it returns + an error, and if positive (or None) it does not""" + w = np.abs(np.random.RandomState(42).randn(10)) + 1 + with pytest.raises(ValueError) as raised_error: + _check_sdp_from_eigen(w, -5.) + assert str(raised_error.value) == "tol should be positive." 
+ with pytest.raises(ValueError) as raised_error: + _check_sdp_from_eigen(w, -1e-10) + assert str(raised_error.value) == "tol should be positive." + _check_sdp_from_eigen(w, 1.) + _check_sdp_from_eigen(w, 0.) + _check_sdp_from_eigen(w, None) + + +@pytest.mark.unit +@pytest.mark.parametrize('w', [np.array([-1.2, 5.5, 6.6]), + np.array([-1.2, -5.6])]) +def test__check_sdp_from_eigen_positive_eigenvalues(w): + """Tests that _check_sdp_from_eigen, returns a NonPSDError when + the eigenvalues are negatives or null.""" + with pytest.raises(NonPSDError): + _check_sdp_from_eigen(w) + + +@pytest.mark.unit +@pytest.mark.parametrize('w', [np.array([0., 2.3, 5.3]), + np.array([1e-20, 3.5]), + np.array([1.5, 2.4, 4.6])]) +def test__check_sdp_from_eigen_negative_eigenvalues(w): + """Tests that _check_sdp_from_eigen, returns no error when the + eigenvalues are positive.""" + _check_sdp_from_eigen(w) + + +@pytest.mark.unit +@pytest.mark.parametrize('w, is_definite', [(np.array([1e-15, 5.6]), False), + (np.array([-1e-15, 5.6]), False), + (np.array([3.2, 5.6, 0.01]), True), + ]) +def test__check_sdp_from_eigen_returns_definiteness(w, is_definite): + """Tests that _check_sdp_from_eigen returns the definiteness of the + matrix (when it is PSD), based on the given eigenvalues""" + assert _check_sdp_from_eigen(w) == is_definite + + +@pytest.mark.unit +@pytest.mark.parametrize('w, tol, is_definite', + [(np.array([5., 3.]), 2, True), + (np.array([5., 1.]), 2, False), + (np.array([5., -1.]), 2, False)]) +def test__check_sdp_from_eigen_tol_psd(w, tol, is_definite): + """Tests that _check_sdp_from_eigen, for PSD matrices, returns + False if an eigenvalue is lower than tol""" + assert _check_sdp_from_eigen(w, tol=tol) == is_definite + + +@pytest.mark.unit +@pytest.mark.parametrize('w, tol', + [(np.array([5., -3.]), 2), + (np.array([1., -3.]), 2)]) +def test__check_sdp_from_eigen_tol_non_psd(w, tol): + """Tests that _check_sdp_from_eigen raises a NonPSDError + when there is a negative value 
with abs value higher than tol""" + with pytest.raises(NonPSDError): + _check_sdp_from_eigen(w, tol=tol) + + +@pytest.mark.unit +@pytest.mark.parametrize('w, is_definite', + [(np.array([1e5, 1e5, 1e5, 1e5, + 1e5, 1e5, 1e-20]), False), + (np.array([1e-10, 1e-10]), True)]) +def test__check_sdp_from_eigen_tol_default_psd(w, is_definite): + """Tests that the default tol argument gives good results for edge cases + like even if the determinant is high but clearly one eigenvalue is low, + (undefinite so returns False) or when all eigenvalues are low (definite so + returns True)""" + assert _check_sdp_from_eigen(w, tol=None) == is_definite + + +@pytest.mark.unit +@pytest.mark.parametrize('w', + [np.array([1., -1.]), + np.array([-1e-10, 1e-10])]) +def test__check_sdp_from_eigen_tol_default_non_psd(w): + """Tests that the default tol argument is good for raising + NonPSDError, e.g. that when a value is clearly relatively + negative it raises such an error""" + with pytest.raises(NonPSDError): + _check_sdp_from_eigen(w, tol=None) + + +def test__check_n_components(): + """Checks that n_components returns what is expected + (including the errors)""" + dim = _check_n_components(5, None) + assert dim == 5 + + dim = _check_n_components(5, 3) + assert dim == 3 + + with pytest.raises(ValueError) as expected_err: + _check_n_components(5, 10) + assert str(expected_err.value) == 'Invalid n_components, must be in [1, 5]' + + with pytest.raises(ValueError) as expected_err: + _check_n_components(5, 0) + assert str(expected_err.value) == 'Invalid n_components, must be in [1, 5]' + + +@pytest.mark.unit +@pytest.mark.parametrize('wrong_labels', + [[0.5, 0.6, 0.7, 0.8, 0.9], + np.random.RandomState(42).randn(5), + np.random.RandomState(42).choice([0, 1], size=5)]) +def test_check_y_valid_values_for_pairs(wrong_labels): + expected_msg = ("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. 
Found an incorrect value.") + with pytest.raises(ValueError) as raised_error: + check_y_valid_values_for_pairs(wrong_labels) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.integration +@pytest.mark.parametrize('wrong_labels', + [[0.5, 0.6, 0.7, 0.8, 0.9], + np.random.RandomState(42).randn(5), + np.random.RandomState(42).choice([0, 1], size=5)]) +def test_check_input_invalid_tuples_without_preprocessor(wrong_labels): + pairs = np.random.RandomState(42).randn(5, 2, 3) + expected_msg = ("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. Found an incorrect value.") + with pytest.raises(ValueError) as raised_error: + check_input(pairs, wrong_labels, preprocessor=None, + type_of_inputs='tuples') + assert str(raised_error.value) == expected_msg + + +@pytest.mark.integration +@pytest.mark.parametrize('wrong_labels', + [[0.5, 0.6, 0.7, 0.8, 0.9], + np.random.RandomState(42).randn(5), + np.random.RandomState(42).choice([0, 1], size=5)]) +def test_check_input_invalid_tuples_with_preprocessor(wrong_labels): + n_samples, n_features, n_pairs = 10, 4, 5 + rng = np.random.RandomState(42) + pairs = rng.randint(10, size=(n_pairs, 2)) + preprocessor = rng.randn(n_samples, n_features) + expected_msg = ("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. 
Found an incorrect value.") + with pytest.raises(ValueError) as raised_error: + check_input(pairs, wrong_labels, preprocessor=ArrayIndexer(preprocessor), + type_of_inputs='tuples') + assert str(raised_error.value) == expected_msg + + +@pytest.mark.integration +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_check_input_pairs_learners_invalid_y(estimator, build_dataset, + with_preprocessor): + """checks that the only allowed labels for learning pairs are +1 and -1""" + input_data, labels, _, X = build_dataset() + wrong_labels_list = [labels + 0.5, + np.random.RandomState(42).randn(len(labels)), + np.random.RandomState(42).choice([0, 1], + size=len(labels))] + model = clone(estimator) + set_random_state(model) + + expected_msg = ("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. Found an incorrect value.") + + for wrong_labels in wrong_labels_list: + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, wrong_labels) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('has_classes, n_features, n_samples, n_components, ' + 'n_classes, result', + [(False, 3, 20, 3, 0, 'identity'), + (False, 3, 2, 3, 0, 'identity'), + (False, 5, 3, 4, 0, 'identity'), + (False, 4, 5, 3, 0, 'pca'), + (True, 5, 6, 3, 4, 'lda'), + (True, 6, 3, 3, 3, 'identity'), + (True, 5, 6, 4, 2, 'pca'), + (True, 2, 6, 2, 10, 'lda'), + (True, 4, 6, 2, 3, 'lda') + ]) +def test__auto_select_init(has_classes, n_features, n_samples, n_components, + n_classes, + result): + """Checks that the auto selection of the init works as expected""" + assert (_auto_select_init(has_classes, n_features, + n_samples, n_components, n_classes) == result) + + +@pytest.mark.parametrize('w0', [1e-20, 0., -1e-20]) +def test_pseudo_inverse_from_eig_and_pinvh_singular(w0): + """Checks that _pseudo_inverse_from_eig returns the same 
result as + scipy.linalg.pinvh for a singular matrix""" + rng = np.random.RandomState(SEED) + A = rng.rand(100, 100) + A = A + A.T + w, V = eigh(A) + w[0] = w0 + A = V.dot(np.diag(w)).dot(V.T) + np.testing.assert_allclose(_pseudo_inverse_from_eig(w, V), pinvh(A), + rtol=1e-05) + + +def test_pseudo_inverse_from_eig_and_pinvh_nonsingular(): + """Checks that _pseudo_inverse_from_eig returns the same result as + scipy.linalg.pinvh for a non singular matrix""" + rng = np.random.RandomState(SEED) + A = rng.rand(100, 100) + A = A + A.T + w, V = eigh(A, check_finite=False) + np.testing.assert_allclose(_pseudo_inverse_from_eig(w, V), pinvh(A))