diff --git a/.github/issue_template.md b/.github/ISSUE_TEMPLATE/bug_report.md similarity index 82% rename from .github/issue_template.md rename to .github/ISSUE_TEMPLATE/bug_report.md index d4fb0abe..ae757838 100644 --- a/.github/issue_template.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,3 +1,9 @@ +--- +name: Reproducible bug report +about: Create a reproducible bug report. Not for support requests. +labels: 'bug' +--- + #### Description @@ -42,3 +48,9 @@ $ pip show metric_learn | grep Version ) --> + +--- + +**Message from the maintainers**: + +Impacted by this bug? Give it a 👍. We prioritise the issues with the most 👍. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..415acfcd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,18 @@ +blank_issues_enabled: false + +contact_links: + - name: Have you read the docs? + url: http://contrib.scikit-learn.org/metric-learn/ + about: Much help can be found in the docs + - name: Ask a question + url: https://github.com/scikit-learn-contrib/metric-learn/discussions/new + about: Ask a question or start a discussion about metric-learn + - name: Stack Overflow + url: https://stackoverflow.com + about: Please ask and answer metric-learn usage questions (API, installation...) on Stack Overflow + - name: Cross Validated + url: https://stats.stackexchange.com + about: Please ask and answer metric learning questions (use cases, algorithms & theory...) on Cross Validated + - name: Blank issue + url: https://github.com/scikit-learn-contrib/metric-learn/issues/new + about: Please note that Github Discussions should be used in most cases instead diff --git a/.github/ISSUE_TEMPLATE/doc_improvement.md b/.github/ISSUE_TEMPLATE/doc_improvement.md new file mode 100644 index 00000000..753cf2f7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/doc_improvement.md @@ -0,0 +1,23 @@ +--- +name: Documentation improvement +about: Create a report to help us improve the documentation. Alternatively you can just open a pull request with the suggested change. +labels: Documentation +--- + +#### Describe the issue linked to the documentation + + + +#### Suggest a potential alternative/fix + + + +--- + +**Message from the maintainers**: + +Confused by this part of the doc too? Give it a 👍. We prioritise the issues with the most 👍. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/enhancement_proposal.md b/.github/ISSUE_TEMPLATE/enhancement_proposal.md new file mode 100644 index 00000000..01dfb1d7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/enhancement_proposal.md @@ -0,0 +1,18 @@ +--- +name: Enhancement proposal +about: Propose an enhancement for metric-learn +labels: 'enhancement' +--- +# Summary + +What change needs making? + +# Use Cases + +When would you use this? + +--- + +**Message from the maintainers**: + +Want to see this feature happen? Give it a 👍. We prioritise the issues with the most 👍. 
\ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 00000000..0935a109 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,42 @@ +name: CI + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the master branch + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + # Run normal testing with the latest versions of all dependencies + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + python-version: ['3.8', '3.9', '3.10', '3.11'] + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Run Tests without skggm + run: | + sudo apt-get install liblapack-dev + pip install --upgrade pip pytest + pip install wheel cython numpy scipy codecov pytest-cov scikit-learn + pytest test --cov + bash <(curl -s https://codecov.io/bash) + - name: Run Tests with skggm + env: + SKGGM_VERSION: a0ed406586c4364ea3297a658f415e13b5cbdaf8 + run: | + pip install git+https://github.com/skggm/skggm.git@${SKGGM_VERSION} + pytest test --cov + bash <(curl -s https://codecov.io/bash) + - name: Syntax checking with flake8 + run: | + pip install flake8 + flake8 --extend-ignore=E111,E114 --show-source; diff --git a/.gitignore b/.gitignore index 8321c7d2..66eb3551 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ htmlcov/ .cache/ .pytest_cache/ doc/auto_examples/* -doc/generated/* \ No newline at end of file +doc/generated/* +venv/ +.vscode/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 0e510a9f..00000000 --- a/.travis.yml +++ /dev/null @@ -1,27 +0,0 @@ -language: python -sudo: false -cache: pip -python: - - "2.7" - - "3.4" - - "3.6" -before_install: - - sudo apt-get install liblapack-dev - - pip install --upgrade pip pytest - - pip install wheel cython numpy scipy codecov pytest-cov - - if $TRAVIS_PYTHON_VERSION == "3.6"; then - pip install scikit-learn; - else - pip install scikit-learn==0.20.3; - fi - - if [[ ($TRAVIS_PYTHON_VERSION == "3.6") || - ($TRAVIS_PYTHON_VERSION == "2.7")]]; then - pip install git+https://github.com/skggm/skggm.git@a0ed406586c4364ea3297a658f415e13b5cbdaf8; - fi -script: - # we do coverage for all versions so that codecov will merge them: this - # way we will see that both paths (with or without skggm) are tested - - pytest test --cov; -after_success: - - bash <(curl -s https://codecov.io/bash) - diff --git a/README.rst b/README.rst index 027e5498..b2f6e6d4 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,9 @@ -|Travis-CI Build Status| |License| |PyPI version| |Code coverage| +|GitHub Actions Build Status| |License| |PyPI version| |Code coverage| -metric-learn -============= +metric-learn: Metric Learning in Python +======================================= -Metric Learning algorithms in Python. +metric-learn contains efficient Python implementations of several popular supervised and weakly-supervised metric learning algorithms. As part of `scikit-learn-contrib `_, the API of metric-learn is compatible with `scikit-learn `_, the leading library for machine learning in Python. This allows to use all the scikit-learn routines (for pipelining, model selection, etc) with metric learning algorithms through a unified interface. **Algorithms** @@ -11,6 +11,7 @@ Metric Learning algorithms in Python. 
- Information Theoretic Metric Learning (ITML) - Sparse Determinant Metric Learning (SDML) - Least Squares Metric Learning (LSML) +- Sparse Compositional Metric Learning (SCML) - Neighborhood Components Analysis (NCA) - Local Fisher Discriminant Analysis (LFDA) - Relative Components Analysis (RCA) @@ -19,36 +20,58 @@ Metric Learning algorithms in Python. **Dependencies** -- Python 2.7+, 3.4+ -- numpy, scipy, scikit-learn>=0.20.3 +- Python 3.6+ (the last version supporting Python 2 and Python 3.5 was + `v0.5.0 `_) +- numpy>= 1.11.0, scipy>= 0.17.0, scikit-learn>=0.21.3 **Optional dependencies** - For SDML, using skggm will allow the algorithm to solve problematic cases (install from commit `a0ed406 `_). + ``pip install 'git+https://github.com/skggm/skggm.git@a0ed406586c4364ea3297a658f415e13b5cbdaf8'`` to install the required version of skggm from GitHub. - For running the examples only: matplotlib **Installation/Setup** -Run ``pip install metric-learn`` to download and install from PyPI. +- If you use Anaconda: ``conda install -c conda-forge metric-learn``. See more options `here `_. -Run ``python setup.py install`` for default installation. +- To install from PyPI: ``pip install metric-learn``. -Run ``pytest test`` to run all tests (you will need to have the ``pytest`` -package installed). +- For a manual install of the latest code, download the source repository and run ``python setup.py install``. You may then run ``pytest test`` to run all tests (you will need to have the ``pytest`` package installed). **Usage** See the `sphinx documentation`_ for full documentation about installation, API, usage, and examples. +**Citation** -.. _sphinx documentation: http://metric-learn.github.io/metric-learn/ +If you use metric-learn in a scientific publication, we would appreciate +citations to the following paper: -.. |Travis-CI Build Status| image:: https://api.travis-ci.org/metric-learn/metric-learn.svg?branch=master - :target: https://travis-ci.org/metric-learn/metric-learn +`metric-learn: Metric Learning Algorithms in Python +`_, de Vazelhes +*et al.*, Journal of Machine Learning Research, 21(138):1-6, 2020. + +Bibtex entry:: + + @article{metric-learn, + title = {metric-learn: {M}etric {L}earning {A}lgorithms in {P}ython}, + author = {{de Vazelhes}, William and {Carey}, CJ and {Tang}, Yuan and + {Vauquier}, Nathalie and {Bellet}, Aur{\'e}lien}, + journal = {Journal of Machine Learning Research}, + year = {2020}, + volume = {21}, + number = {138}, + pages = {1--6} + } + +.. _sphinx documentation: http://contrib.scikit-learn.org/metric-learn/ + +.. |GitHub Actions Build Status| image:: https://github.com/scikit-learn-contrib/metric-learn/workflows/CI/badge.svg + :target: https://github.com/scikit-learn-contrib/metric-learn/actions?query=event%3Apush+branch%3Amaster .. |License| image:: http://img.shields.io/:license-mit-blue.svg?style=flat :target: http://badges.mit-license.org .. |PyPI version| image:: https://badge.fury.io/py/metric-learn.svg :target: http://badge.fury.io/py/metric-learn -.. |Code coverage| image:: https://codecov.io/gh/metric-learn/metric-learn/branch/master/graph/badge.svg - :target: https://codecov.io/gh/metric-learn/metric-learn +.. 
|Code coverage| image:: https://codecov.io/gh/scikit-learn-contrib/metric-learn/branch/master/graph/badge.svg + :target: https://codecov.io/gh/scikit-learn-contrib/metric-learn diff --git a/bench/benchmarks/iris.py b/bench/benchmarks/iris.py index 5973f7b8..05035085 100644 --- a/bench/benchmarks/iris.py +++ b/bench/benchmarks/iris.py @@ -5,15 +5,15 @@ CLASSES = { 'Covariance': metric_learn.Covariance(), - 'ITML_Supervised': metric_learn.ITML_Supervised(num_constraints=200), + 'ITML_Supervised': metric_learn.ITML_Supervised(n_constraints=200), 'LFDA': metric_learn.LFDA(k=2, dim=2), - 'LMNN': metric_learn.LMNN(k=5, learn_rate=1e-6, verbose=False), - 'LSML_Supervised': metric_learn.LSML_Supervised(num_constraints=200), + 'LMNN': metric_learn.LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False), + 'LSML_Supervised': metric_learn.LSML_Supervised(n_constraints=200), 'MLKR': metric_learn.MLKR(), 'NCA': metric_learn.NCA(max_iter=700, n_components=2), - 'RCA_Supervised': metric_learn.RCA_Supervised(dim=2, num_chunks=30, + 'RCA_Supervised': metric_learn.RCA_Supervised(dim=2, n_chunks=30, chunk_size=2), - 'SDML_Supervised': metric_learn.SDML_Supervised(num_constraints=1500) + 'SDML_Supervised': metric_learn.SDML_Supervised(n_constraints=1500) } diff --git a/doc/_static/css/styles.css b/doc/_static/css/styles.css new file mode 100644 index 00000000..6d350ae4 --- /dev/null +++ b/doc/_static/css/styles.css @@ -0,0 +1,36 @@ +.hatnote { + border-color: #e1e4e5 ; + border-style: solid ; + border-width: 1px ; + font-size: x-small ; + font-style: italic ; + margin-left: auto ; + margin-right: auto ; + margin-bottom: 24px; + padding: 12px; +} +.hatnote-gray { + background-color: #f5f5f5 +} +.hatnote li { + list-style-type: square; + margin-left: 12px !important; +} +.hatnote ul { + list-style-type: square; + margin-left: 0px !important; + margin-bottom: 0px !important; +} +.deprecated { + color: #b94a48; + background-color: #F3E5E5; + border-color: #eed3d7; + margin-top: 0.5rem; + padding: 0.5rem; + border-radius: 0.5rem; + margin-bottom: 0.5rem; +} + +.deprecated p { + margin-bottom: 0 !important; +} \ No newline at end of file diff --git a/doc/conf.py b/doc/conf.py index 5d1baeda..c472cc21 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import sys import os +import warnings extensions = [ 'sphinx.ext.autodoc', @@ -20,10 +21,12 @@ # General information about the project. project = u'metric-learn' -copyright = u'2015-2019, CJ Carey, Yuan Tang, William de Vazelhes, Aurélien Bellet, and Nathalie Vauquier' -author = u'CJ Carey, Yuan Tang, William de Vazelhes, Aurélien Bellet, and Nathalie Vauquier' -version = '0.5.0' -release = '0.5.0' +copyright = (u'2015-2023, CJ Carey, Yuan Tang, William de Vazelhes, Aurélien ' + u'Bellet and Nathalie Vauquier') +author = (u'CJ Carey, Yuan Tang, William de Vazelhes, Aurélien Bellet and ' + u'Nathalie Vauquier') +version = '0.7.0' +release = '0.7.0' language = 'en' exclude_patterns = ['_build'] @@ -35,9 +38,6 @@ html_static_path = ['_static'] htmlhelp_basename = 'metric-learndoc' -# Option to only need single backticks to refer to symbols -default_role = 'any' - # Option to hide doctests comments in the documentation (like # doctest: # +NORMALIZE_WHITESPACE for instance) trim_doctest_flags = True @@ -63,3 +63,20 @@ # generate autosummary even if no references autosummary_generate = True + + +# Temporary work-around for spacing problem between parameter and parameter +# type in the doc, see https://github.com/numpy/numpydoc/issues/215. 
The bug +# has been fixed in sphinx (https://github.com/sphinx-doc/sphinx/pull/5976) but +# through a change in sphinx basic.css except rtd_theme does not use basic.css. +# In an ideal world, this would get fixed in this PR: +# https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files +def setup(app): + app.add_js_file('js/copybutton.js') + app.add_css_file('css/styles.css') + + +# Remove matplotlib agg warnings from generated doc when using plt.show +warnings.filterwarnings("ignore", category=UserWarning, + message='Matplotlib is currently using agg, which is a' + ' non-GUI backend, so cannot show the figure.') diff --git a/doc/getting_started.rst b/doc/getting_started.rst index 5a671d86..90b7c7ee 100644 --- a/doc/getting_started.rst +++ b/doc/getting_started.rst @@ -5,23 +5,28 @@ Getting started Installation and Setup ====================== -Run ``pip install metric-learn`` to download and install from PyPI. +**Installation** -Alternately, download the source repository and run: +metric-learn can be installed in either of the following ways: -- ``python setup.py install`` for default installation. -- ``python setup.py test`` to run all tests. +- If you use Anaconda: ``conda install -c conda-forge metric-learn``. See more options `here `_. + +- To install from PyPI: ``pip install metric-learn``. + +- For a manual install of the latest code, download the source repository and run ``python setup.py install``. You may then run ``pytest test`` to run all tests (you will need to have the ``pytest`` package installed). **Dependencies** -- Python 2.7+, 3.4+ -- numpy, scipy, scikit-learn>=0.20.3 +- Python 3.6+ (the last version supporting Python 2 and Python 3.5 was + `v0.5.0 `_) +- numpy>= 1.11.0, scipy>= 0.17.0, scikit-learn>=0.21.3 **Optional dependencies** - For SDML, using skggm will allow the algorithm to solve problematic cases (install from commit `a0ed406 `_). -- For running the examples only: matplotlib + ``pip install 'git+https://github.com/skggm/skggm.git@a0ed406586c4364ea3297a658f415e13b5cbdaf8'`` to install the required version of skggm from GitHub. +- For running the examples only: matplotlib Quick start =========== @@ -29,11 +34,14 @@ Quick start This example loads the iris dataset, and evaluates a k-nearest neighbors algorithm on an embedding space learned with `NCA`. ->>> from metric_learn import NCA ->>> from sklearn.datasets import load_iris ->>> from sklearn.model_selection import cross_val_score ->>> from sklearn.pipeline import make_pipeline ->>> ->>> X, y = load_iris(return_X_y=True) ->>> clf = make_pipeline(NCA(), KNeighborsClassifier()) ->>> cross_val_score(clf, X, y) +:: + + from metric_learn import NCA + from sklearn.datasets import load_iris + from sklearn.model_selection import cross_val_score + from sklearn.pipeline import make_pipeline + from sklearn.neighbors import KNeighborsClassifier + + X, y = load_iris(return_X_y=True) + clf = make_pipeline(NCA(), KNeighborsClassifier()) + cross_val_score(clf, X, y) diff --git a/doc/index.rst b/doc/index.rst index 9d303bee..f9dfd83d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,13 +1,36 @@ metric-learn: Metric Learning in Python ======================================= -|Travis-CI Build Status| |License| |PyPI version| |Code coverage| +|GitHub Actions Build Status| |License| |PyPI version| |Code coverage| -Metric-learn contains efficient Python implementations of several -popular supervised and weakly-supervised metric learning algorithms. 
The API -of metric-learn is compatible with `scikit-learn +`metric-learn `_ +contains efficient Python implementations of several popular supervised and +weakly-supervised metric learning algorithms. As part of `scikit-learn-contrib +`_, the API of metric-learn is compatible with `scikit-learn `_, the leading library for machine learning in -Python. This allows to use of all the scikit-learn routines (for pipelining, -model selection, etc) with metric learning algorithms. +Python. This allows to use all the scikit-learn routines (for pipelining, +model selection, etc) with metric learning algorithms through a unified +interface. + +If you use metric-learn in a scientific publication, we would appreciate +citations to the following paper: + +`metric-learn: Metric Learning Algorithms in Python +`_, de Vazelhes +*et al.*, Journal of Machine Learning Research, 21(138):1-6, 2020. + +Bibtex entry:: + + @article{metric-learn, + title = {metric-learn: {M}etric {L}earning {A}lgorithms in {P}ython}, + author = {{de Vazelhes}, William and {Carey}, CJ and {Tang}, Yuan and + {Vauquier}, Nathalie and {Bellet}, Aur{\'e}lien}, + journal = {Journal of Machine Learning Research}, + year = {2020}, + volume = {21}, + number = {138}, + pages = {1--6} + } + Documentation outline --------------------- @@ -32,13 +55,13 @@ Documentation outline auto_examples/index -:ref:`genindex` | :ref:`modindex` | :ref:`search` +:ref:`genindex` | :ref:`search` -.. |Travis-CI Build Status| image:: https://api.travis-ci.org/metric-learn/metric-learn.svg?branch=master - :target: https://travis-ci.org/metric-learn/metric-learn +.. |GitHub Actions Build Status| image:: https://github.com/scikit-learn-contrib/metric-learn/workflows/CI/badge.svg + :target: https://github.com/scikit-learn-contrib/metric-learn/actions?query=event%3Apush+branch%3Amaster .. |PyPI version| image:: https://badge.fury.io/py/metric-learn.svg :target: http://badge.fury.io/py/metric-learn .. |License| image:: http://img.shields.io/:license-mit-blue.svg?style=flat :target: http://badges.mit-license.org -.. |Code coverage| image:: https://codecov.io/gh/metric-learn/metric-learn/branch/master/graph/badge.svg - :target: https://codecov.io/gh/metric-learn/metric-learn +.. |Code coverage| image:: https://codecov.io/gh/scikit-learn-contrib/metric-learn/branch/master/graph/badge.svg + :target: https://codecov.io/gh/scikit-learn-contrib/metric-learn diff --git a/doc/introduction.rst b/doc/introduction.rst index 04ae1a18..e9ff0015 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -96,7 +96,7 @@ examples (for code illustrating some of these use-cases, see the metric learning provides a way to bias the clusters found by algorithms like K-Means towards the intended semantics. - Information retrieval: the learned metric can be used to retrieve the - elements of a database that are semantically closer to a query element. + elements of a database that are semantically closest to a query element. - Dimensionality reduction: metric learning may be seen as a way to reduce the data dimension in a (weakly) supervised setting. - More generally, the learned transformation :math:`L` can be used to project @@ -123,26 +123,3 @@ to the following resources: Survey `_ (2012) - **Book:** `Metric Learning `_ (2015) - -.. Methods [TO MOVE TO SUPERVISED/WEAK SECTIONS] -.. ============================================= - -.. Currently, each metric learning algorithm supports the following methods: - -.. - ``fit(...)``, which learns the model. -.. 
- ``get_mahalanobis_matrix()``, which returns a Mahalanobis matrix -.. - ``get_metric()``, which returns a function that takes as input two 1D - arrays and outputs the learned metric score on these two points -.. :math:`M = L^{\top}L` such that distance between vectors ``x`` and -.. ``y`` can be computed as :math:`\sqrt{\left(x-y\right)M\left(x-y\right)}`. -.. - ``components_from_metric(metric)``, which returns a transformation matrix -.. :math:`L \in \mathbb{R}^{D \times d}`, which can be used to convert a -.. data matrix :math:`X \in \mathbb{R}^{n \times d}` to the -.. :math:`D`-dimensional learned metric space :math:`X L^{\top}`, -.. in which standard Euclidean distances may be used. -.. - ``transform(X)``, which applies the aforementioned transformation. -.. - ``score_pairs(pairs)`` which returns the distance between pairs of -.. points. ``pairs`` should be a 3D array-like of pairs of shape ``(n_pairs, -.. 2, n_features)``, or it can be a 2D array-like of pairs indicators of -.. shape ``(n_pairs, 2)`` (see section :ref:`preprocessor_section` for more -.. details). \ No newline at end of file diff --git a/doc/metric_learn.rst b/doc/metric_learn.rst index 930404d0..4d0676b9 100644 --- a/doc/metric_learn.rst +++ b/doc/metric_learn.rst @@ -13,7 +13,10 @@ Base Classes metric_learn.Constraints metric_learn.base_metric.BaseMetricLearner + metric_learn.base_metric.MetricTransformer + metric_learn.base_metric.MahalanobisMixin metric_learn.base_metric._PairsClassifierMixin + metric_learn.base_metric._TripletsClassifierMixin metric_learn.base_metric._QuadrupletsClassifierMixin Supervised Learning Algorithms @@ -32,6 +35,7 @@ Supervised Learning Algorithms metric_learn.MMC_Supervised metric_learn.SDML_Supervised metric_learn.RCA_Supervised + metric_learn.SCML_Supervised Weakly Supervised Learning Algorithms ------------------------------------- @@ -44,6 +48,7 @@ Weakly Supervised Learning Algorithms metric_learn.LSML metric_learn.MMC metric_learn.SDML + metric_learn.SCML Unsupervised Learning Algorithms -------------------------------- diff --git a/doc/supervised.rst b/doc/supervised.rst index 3c941b20..49548b83 100644 --- a/doc/supervised.rst +++ b/doc/supervised.rst @@ -50,7 +50,7 @@ classes will be large. To do so, we fit the metric learner (example: >>> from metric_learn import NCA >>> nca = NCA(random_state=42) >>> nca.fit(X, y) -NCA(init=None, max_iter=100, n_components=None, num_dims='deprecated', +NCA(init='auto', max_iter=100, n_components=None, preprocessor=None, random_state=42, tol=None, verbose=False) @@ -69,10 +69,10 @@ Also, as explained before, our metric learners has learn a distance between points. You can use this distance in two main ways: - You can either return the distance between pairs of points using the - `score_pairs` function: + `pair_distance` function: ->>> nca.score_pairs([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]]]) -array([0.49627072, 3.65287282]) +>>> nca.pair_distance([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]], [[3.3, 7.8], [10.9, 0.1]]]) +array([0.49627072, 3.65287282, 6.06079877]) - Or you can return a function that will return the distance (in the new space) between two 1D arrays (the coordinates of the points in the original @@ -82,6 +82,18 @@ array([0.49627072, 3.65287282]) >>> metric_fun([3.5, 3.6], [5.6, 2.4]) 0.4962707194621285 +- Alternatively, you can use `pair_score` to return the **score** between + pairs of points (the larger the score, the more similar the pair). 
+ For Mahalanobis learners, it is equal to the opposite of the distance. + +>>> score = nca.pair_score([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]], [[3.3, 7.8], [10.9, 0.1]]]) +>>> score +array([-0.49627072, -3.65287282, -6.06079877]) + +This is useful because `pair_score` matches the **score** semantic of +scikit-learn's `Classification metrics +`_. + .. note:: If the metric learner that you use learns a :ref:`Mahalanobis distance @@ -93,7 +105,6 @@ array([0.49627072, 3.65287282]) array([[0.43680409, 0.89169412], [0.89169412, 1.9542479 ]]) -.. TODO: remove the "like it is the case etc..." if it's not the case anymore Scikit-learn compatibility -------------------------- @@ -105,6 +116,7 @@ All supervised algorithms are scikit-learn estimators scikit-learn model selection routines (`sklearn.model_selection.cross_val_score`, `sklearn.model_selection.GridSearchCV`, etc). +You can also use some of the scoring functions from `sklearn.metrics`. Algorithms ========== @@ -131,16 +143,16 @@ The distance is learned by solving the following optimization problem: c\sum_{i, j, l}\eta_{ij}(1-y_{ij})[1+||\mathbf{L(x_i-x_j)}||^2-|| \mathbf{L(x_i-x_l)}||^2]_+) -where :math:`\mathbf{x}_i` is an data point, :math:`\mathbf{x}_j` is one -of its k nearest neighbors sharing the same label, and :math:`\mathbf{x}_l` +where :math:`\mathbf{x}_i` is a data point, :math:`\mathbf{x}_j` is one +of its k-nearest neighbors sharing the same label, and :math:`\mathbf{x}_l` are all the other instances within that region with different labels, :math:`\eta_{ij}, y_{ij} \in \{0, 1\}` are both the indicators, -:math:`\eta_{ij}` represents :math:`\mathbf{x}_{j}` is the k nearest -neighbors(with same labels) of :math:`\mathbf{x}_{i}`, :math:`y_{ij}=0` -indicates :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong to different class, +:math:`\eta_{ij}` represents :math:`\mathbf{x}_{j}` is the k-nearest +neighbors (with same labels) of :math:`\mathbf{x}_{i}`, :math:`y_{ij}=0` +indicates :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong to different classes, :math:`[\cdot]_+=\max(0, \cdot)` is the Hinge loss. -.. topic:: Example Code: +.. rubric:: Example Code :: @@ -152,18 +164,18 @@ indicates :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong to different class, X = iris_data['data'] Y = iris_data['target'] - lmnn = LMNN(k=5, learn_rate=1e-6) - lmnn.fit(X, Y, verbose=False) + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + lmnn.fit(X, Y) -.. topic:: References: +.. rubric:: References - .. [1] Weinberger et al. `Distance Metric Learning for Large Margin - Nearest Neighbor Classification - `_. - JMLR 2009 - .. [2] `Wikipedia entry on Large Margin Nearest Neighbor `_ - +.. container:: hatnote hatnote-gray + + [1]. Weinberger et al. `Distance Metric Learning for Large Margin Nearest Neighbor Classification `_. JMLR 2009. + + [2]. `Wikipedia entry on Large Margin Nearest Neighbor `_. + .. _nca: @@ -204,7 +216,7 @@ the sum of probability of being correctly classified: \mathbf{L} = \text{argmax}\sum_i p_i -.. topic:: Example Code: +.. rubric:: Example Code :: @@ -219,13 +231,14 @@ the sum of probability of being correctly classified: nca = NCA(max_iter=1000) nca.fit(X, Y) -.. topic:: References: +.. rubric:: References + + +.. container:: hatnote hatnote-gray - .. [1] Goldberger et al. - `Neighbourhood Components Analysis `_. - NIPS 2005 + [1]. Goldberger et al. `Neighbourhood Components Analysis `_. NIPS 2005. - .. [2] `Wikipedia entry on Neighborhood Components Analysis `_ + [2]. 
`Wikipedia entry on Neighborhood Components Analysis `_. .. _lfda: @@ -235,7 +248,7 @@ the sum of probability of being correctly classified: Local Fisher Discriminant Analysis (:py:class:`LFDA `) -`LFDA` is a linear supervised dimensionality reduction method. It is +`LFDA` is a linear supervised dimensionality reduction method which effectively combines the ideas of `Linear Discriminant Analysis ` and Locality-Preserving Projection . It is particularly useful when dealing with multi-modality, where one ore more classes consist of separate clusters in input space. The core optimization problem of LFDA is solved as a generalized eigenvalue problem. @@ -261,23 +274,23 @@ where \,\,\mathbf{A}_{i,j}(1/n-1/n_l) \qquad y_i = y_j\end{aligned}\right.\\ here :math:`\mathbf{A}_{i,j}` is the :math:`(i,j)`-th entry of the affinity -matrix :math:`\mathbf{A}`:, which can be calculated with local scaling methods. +matrix :math:`\mathbf{A}`:, which can be calculated with local scaling methods, `n` and `n_l` are the total number of points and the number of points per cluster `l` respectively. Then the learning problem becomes derive the LFDA transformation matrix -:math:`\mathbf{T}_{LFDA}`: +:math:`\mathbf{L}_{LFDA}`: .. math:: - \mathbf{T}_{LFDA} = \arg\max_\mathbf{T} - [\text{tr}((\mathbf{T}^T\mathbf{S}^{(w)} - \mathbf{T})^{-1}\mathbf{T}^T\mathbf{S}^{(b)}\mathbf{T})] + \mathbf{L}_{LFDA} = \arg\max_\mathbf{L} + [\text{tr}((\mathbf{L}^T\mathbf{S}^{(w)} + \mathbf{L})^{-1}\mathbf{L}^T\mathbf{S}^{(b)}\mathbf{L})] -That is, it is looking for a transformation matrix :math:`\mathbf{T}` such that +That is, it is looking for a transformation matrix :math:`\mathbf{L}` such that nearby data pairs in the same class are made close and the data pairs in different classes are separated from each other; far apart data pairs in the same class are not imposed to be close. -.. topic:: Example Code: +.. rubric:: Example Code :: @@ -292,15 +305,19 @@ same class are not imposed to be close. lfda = LFDA(k=2, dim=2) lfda.fit(X, Y) -.. topic:: References: +.. note:: + LDFA suffers from a problem called “sign indeterminacy”, which means the sign of the ``components`` and the output from transform depend on a random state. This is directly related to the calculation of eigenvectors in the algorithm. The same input ran in different times might lead to different transforms, but both valid. + + To work around this, fit instances of this class to data once, then keep the instance around to do transformations. + +.. rubric:: References + + +.. container:: hatnote hatnote-gray - .. [1] Sugiyama. `Dimensionality Reduction of Multimodal Labeled Data by Local - Fisher Discriminant Analysis `_. - JMLR 2007 + [1]. Sugiyama. `Dimensionality Reduction of Multimodal Labeled Data by Local Fisher Discriminant Analysis `_. JMLR 2007. - .. [2] Tang. `Local Fisher Discriminant Analysis on Beer Style Clustering - `_. + [2]. Tang. `Local Fisher Discriminant Analysis on Beer Style Clustering `_. .. _mlkr: @@ -326,9 +343,9 @@ empirical development. The Gaussian kernel is denoted as: where :math:`d(\cdot, \cdot)` is the squared distance under some metrics, here in the fashion of Mahalanobis, it should be :math:`d(\mathbf{x}_i, -\mathbf{x}_j) = ||\mathbf{A}(\mathbf{x}_i - \mathbf{x}_j)||`, the transition -matrix :math:`\mathbf{A}` is derived from the decomposition of Mahalanobis -matrix :math:`\mathbf{M=A^TA}`. 
+\mathbf{x}_j) = ||\mathbf{L}(\mathbf{x}_i - \mathbf{x}_j)||`, the transition +matrix :math:`\mathbf{L}` is derived from the decomposition of Mahalanobis +matrix :math:`\mathbf{M=L^TL}`. Since :math:`\sigma^2` can be integrated into :math:`d(\cdot)`, we can set :math:`\sigma^2=1` for the sake of simplicity. Here we use the cumulative @@ -346,7 +363,7 @@ calculating a weighted average of all the training samples: \hat{y}_i = \frac{\sum_{j\neq i}y_jk_{ij}}{\sum_{j\neq i}k_{ij}} -.. topic:: Example Code: +.. rubric:: Example Code :: @@ -360,10 +377,12 @@ calculating a weighted average of all the training samples: mlkr = MLKR() mlkr.fit(X, Y) -.. topic:: References: +.. rubric:: References - .. [1] Weinberger et al. `Metric Learning for Kernel Regression `_. AISTATS 2007 + +.. container:: hatnote hatnote-gray + + [1]. Weinberger et al. `Metric Learning for Kernel Regression `_. AISTATS 2007. .. _supervised_version: @@ -374,7 +393,12 @@ Supervised versions of weakly-supervised algorithms Each :ref:`weakly-supervised algorithm ` has a supervised version of the form `*_Supervised` where similarity tuples are randomly generated from the labels information and passed to the underlying -algorithm. +algorithm. + +.. warning:: + Supervised versions of weakly-supervised algorithms interpret label -1 + (or any negative label) as a point with unknown label. + Those points are discarded in the learning process. For pairs learners (see :ref:`learning_on_pairs`), pairs (tuple of two points from the dataset), and pair labels (`int` indicating whether the two points @@ -383,8 +407,8 @@ are similar (+1) or dissimilar (-1)), are sampled with the function (of label +1), this method will look at all the samples from the same label and sample randomly a pair among them. To sample negative pairs (of label -1), this method will look at all the samples from a different class and sample randomly -a pair among them. The method will try to build `num_constraints` positive -pairs and `num_constraints` negative pairs, but sometimes it cannot find enough +a pair among them. The method will try to build `n_constraints` positive +pairs and `n_constraints` negative pairs, but sometimes it cannot find enough of one of those, so forcing `same_length=True` will return both times the minimum of the two lenghts. @@ -395,7 +419,7 @@ quadruplets, where for each quadruplet the two first points are from the same class, and the two last points are from a different class (so indeed the two last points should be less similar than the two first points). -.. topic:: Example Code: +.. rubric:: Example Code :: @@ -406,5 +430,5 @@ last points should be less similar than the two first points). X = iris_data['data'] Y = iris_data['target'] - mmc = MMC_Supervised(num_constraints=200) + mmc = MMC_Supervised(n_constraints=200) mmc.fit(X, Y) diff --git a/doc/unsupervised.rst b/doc/unsupervised.rst index 1191e805..110b07f9 100644 --- a/doc/unsupervised.rst +++ b/doc/unsupervised.rst @@ -20,7 +20,7 @@ It can be used for ZCA whitening of the data (see the Wikipedia page of `whitening transformation `_). -.. topic:: Example Code: +.. rubric:: Example Code :: @@ -32,6 +32,9 @@ Whitening_transformation>`_). cov = Covariance().fit(iris) x = cov.transform(iris) -.. topic:: References: +.. rubric:: References - .. [1] On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936 \ No newline at end of file + +.. container:: hatnote hatnote-gray + + [1]. On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936. 
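As a quick complement to the `Covariance` section above, one can inspect the matrix it learns and compare it with the inverse sample covariance of the data. This is a sketch only: it assumes `Covariance` uses the empirical covariance of `X`, and `get_mahalanobis_matrix` is the standard accessor available on Mahalanobis learners::

    import numpy as np
    from sklearn.datasets import load_iris
    from metric_learn import Covariance

    X = load_iris()['data']
    cov = Covariance().fit(X)

    # The learned Mahalanobis matrix M, used as (x - y)^T M (x - y).
    M = cov.get_mahalanobis_matrix()

    # Assumption: M is (close to) the pseudo-inverse of the sample covariance
    # of X, so the check below is only indicative of that convention.
    print(np.allclose(M, np.linalg.pinv(np.cov(X, rowvar=False))))

    # transform() maps the data so that Euclidean distances in the new space
    # match the learned Mahalanobis distance (ZCA-like whitening).
    X_whitened = cov.transform(X)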
\ No newline at end of file diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst index 38f08fbe..76f7c14e 100644 --- a/doc/weakly_supervised.rst +++ b/doc/weakly_supervised.rst @@ -57,13 +57,14 @@ learn: ^^^^^^^^^^^^^^^^^^ The most intuitive way to represent tuples is to provide the algorithm with a -3D array-like of tuples of shape `(n_tuples, t, n_features)`, where +3D array-like of tuples of shape `(n_tuples, tuple_size, n_features)`, where `n_tuples` is the number of tuples, `tuple_size` is the number of elements in a tuple (2 for pairs, 3 for triplets for instance), and `n_features` is the number of features of each point. -.. topic:: Example: - Here is an artificial dataset of 4 pairs of 2 points of 3 features each: +.. rubric:: Example Code + +Here is an artificial dataset of 4 pairs of 2 points of 3 features each: >>> import numpy as np >>> tuples = np.array([[[-0.12, -1.21, -0.20], @@ -94,7 +95,9 @@ would be to keep the dataset of points `X` aside, and just represent tuples as a collection of tuples of *indices* from the points in `X`. Since we loose the feature dimension there, the resulting array is 2D. -.. topic:: Example: An equivalent representation of the above pairs would be: +.. rubric:: Example Code + +An equivalent representation of the above pairs would be: >>> X = np.array([[-0.12, -1.21, -0.20], >>> [+0.05, -0.19, -0.05], @@ -134,8 +137,8 @@ are respected. >>> from metric_learn import MMC >>> mmc = MMC(random_state=42) >>> mmc.fit(tuples, y) -MMC(A0='deprecated', convergence_threshold=0.001, diagonal=False, - diagonal_c=1.0, init=None, max_iter=100, max_proj=10000, +MMC(A0='deprecated', tol=0.001, diagonal=False, + diagonal_c=1.0, init='auto', max_iter=100, max_proj=10000, preprocessor=None, random_state=42, verbose=False) Or alternatively (using a preprocessor): @@ -160,9 +163,9 @@ Also, as explained before, our metric learner has learned a distance between points. You can use this distance in two main ways: - You can either return the distance between pairs of points using the - `score_pairs` function: + `pair_distance` function: ->>> mmc.score_pairs([[[3.5, 3.6, 5.2], [5.6, 2.4, 6.7]], +>>> mmc.pair_distance([[[3.5, 3.6, 5.2], [5.6, 2.4, 6.7]], ... [[1.2, 4.2, 7.7], [2.1, 6.4, 0.9]]]) array([7.27607365, 0.88853014]) @@ -175,6 +178,18 @@ array([7.27607365, 0.88853014]) >>> metric_fun([3.5, 3.6, 5.2], [5.6, 2.4, 6.7]) 7.276073646278203 +- Alternatively, you can use `pair_score` to return the **score** between + pairs of points (the larger the score, the more similar the pair). + For Mahalanobis learners, it is equal to the opposite of the distance. + +>>> score = mmc.pair_score([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]], [[3.3, 7.8], [10.9, 0.1]]]) +>>> score +array([-0.49627072, -3.65287282, -6.06079877]) + + This is useful because `pair_score` matches the **score** semantic of + scikit-learn's `Classification metrics + `_. + .. note:: If the metric learner that you use learns a :ref:`Mahalanobis distance @@ -187,8 +202,6 @@ array([[ 0.58603894, -5.69883982, -1.66614919], [-5.69883982, 55.41743549, 16.20219519], [-1.66614919, 16.20219519, 4.73697721]]) -.. TODO: remove the "like it is the case etc..." if it's not the case anymore - .. _sklearn_compat_ws: Prediction and scoring @@ -250,8 +263,8 @@ tuples). 
>>> y_pairs = np.array([1, -1]) >>> mmc = MMC(random_state=42) >>> mmc.fit(pairs, y_pairs) -MMC(A0='deprecated', convergence_threshold=0.001, diagonal=False, - diagonal_c=1.0, init=None, max_iter=100, max_proj=10000, preprocessor=None, +MMC(tol=0.001, diagonal=False, + diagonal_c=1.0, init='auto', max_iter=100, max_proj=10000, preprocessor=None, random_state=42, verbose=False) Here, we learned a metric that puts the two first points closer @@ -344,8 +357,8 @@ returns the `sklearn.metrics.roc_auc_score` (which is threshold-independent). .. note:: See :ref:`fit_ws` for more details on metric learners functions that are - not specific to learning on pairs, like `transform`, `score_pairs`, - `get_metric` and `get_mahalanobis_matrix`. + not specific to learning on pairs, like `transform`, `pair_distance`, + `pair_score`, `get_metric` and `get_mahalanobis_matrix`. Algorithms ---------- @@ -367,40 +380,40 @@ other methods, `ITML` does not rely on an eigenvalue computation or semi-definite programming. -Given a Mahalanobis distance parameterized by :math:`A`, its corresponding +Given a Mahalanobis distance parameterized by :math:`M`, its corresponding multivariate Gaussian is denoted as: .. math:: - p(\mathbf{x}; \mathbf{A}) = \frac{1}{Z}\exp(-\frac{1}{2}d_\mathbf{A} + p(\mathbf{x}; \mathbf{M}) = \frac{1}{Z}\exp(-\frac{1}{2}d_\mathbf{M} (\mathbf{x}, \mu)) - = \frac{1}{Z}\exp(-\frac{1}{2}((\mathbf{x} - \mu)^T\mathbf{A} + = \frac{1}{Z}\exp(-\frac{1}{2}((\mathbf{x} - \mu)^T\mathbf{M} (\mathbf{x} - \mu)) where :math:`Z` is the normalization constant, the inverse of Mahalanobis -matrix :math:`\mathbf{A}^{-1}` is the covariance of the Gaussian. +matrix :math:`\mathbf{M}^{-1}` is the covariance of the Gaussian. Given pairs of similar points :math:`S` and pairs of dissimilar points :math:`D`, the distance metric learning problem is to minimize the LogDet divergence, which is equivalent as minimizing :math:`\textbf{KL}(p(\mathbf{x}; -\mathbf{A}_0) || p(\mathbf{x}; \mathbf{A}))`: +\mathbf{M}_0) || p(\mathbf{x}; \mathbf{M}))`: .. math:: - \min_\mathbf{A} D_{\ell \mathrm{d}}\left(A, A_{0}\right) = - \operatorname{tr}\left(A A_{0}^{-1}\right)-\log \operatorname{det} - \left(A A_{0}^{-1}\right)-n\\ - \text{subject to } \quad d_\mathbf{A}(\mathbf{x}_i, \mathbf{x}_j) + \min_\mathbf{A} D_{\ell \mathrm{d}}\left(M, M_{0}\right) = + \operatorname{tr}\left(M M_{0}^{-1}\right)-\log \operatorname{det} + \left(M M_{0}^{-1}\right)-n\\ + \text{subject to } \quad d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j) \leq u \qquad (\mathbf{x}_i, \mathbf{x}_j)\in S \\ - d_\mathbf{A}(\mathbf{x}_i, \mathbf{x}_j) \geq l \qquad (\mathbf{x}_i, + d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j) \geq l \qquad (\mathbf{x}_i, \mathbf{x}_j)\in D where :math:`u` and :math:`l` is the upper and the lower bound of distance -for similar and dissimilar pairs respectively, and :math:`\mathbf{A}_0` +for similar and dissimilar pairs respectively, and :math:`\mathbf{M}_0` is the prior distance metric, set to identity matrix by default, :math:`D_{\ell \mathrm{d}}(\cdot)` is the log determinant. -.. topic:: Example Code: +.. rubric:: Example Code :: @@ -419,11 +432,14 @@ is the prior distance metric, set to identity matrix by default, itml = ITML() itml.fit(pairs, y) -.. topic:: References: +.. rubric:: References + + +.. container:: hatnote hatnote-gray - .. [1] Jason V. Davis, et al. `Information-theoretic Metric Learning `_. ICML 2007 + [1]. Jason V. Davis, et al. `Information-theoretic Metric Learning `_. ICML 2007. - .. 
[2] Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/itml/ + [2]. Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/itml/ . .. _sdml: @@ -458,7 +474,7 @@ the sums of the row elements of :math:`\mathbf{K}`., :math:`||\cdot||_{1, off}` is the off-diagonal L1 norm. -.. topic:: Example Code: +.. rubric:: Example Code :: @@ -476,19 +492,19 @@ is the off-diagonal L1 norm. sdml = SDML() sdml.fit(pairs, y) -.. topic:: References: +.. rubric:: References - .. [1] Qi et al. - `An efficient sparse metric learning in high-dimensional space via - L1-penalized log-determinant regularization `_. - ICML 2009. - .. [2] Adapted from https://gist.github.com/kcarnold/5439945 +.. container:: hatnote hatnote-gray + + [1]. Qi et al. `An efficient sparse metric learning in high-dimensional space via L1-penalized log-determinant regularization `_. ICML 2009. + + [2]. Code adapted from https://gist.github.com/kcarnold/5439945 . .. _rca: :py:class:`RCA ` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Relative Components Analysis (:py:class:`RCA `) @@ -512,33 +528,31 @@ where chunklet :math:`j` consists of :math:`\{\mathbf{x}_{ji}\}_{i=1}^{n_j}` with a mean :math:`\hat{m}_j`. The inverse of :math:`\mathbf{C}^{-1}` is used as the Mahalanobis matrix. -.. topic:: Example Code: +.. rubric:: Example Code :: from metric_learn import RCA - pairs = [[[1.2, 7.5], [1.3, 1.5]], - [[6.4, 2.6], [6.2, 9.7]], - [[1.3, 4.5], [3.2, 4.6]], - [[6.2, 5.5], [5.4, 5.4]]] - y = [1, 1, -1, -1] - - # in this task we want points where the first feature is close to be closer - # to each other, no matter how close the second feature is + X = [[-0.05, 3.0],[0.05, -3.0], + [0.1, -3.55],[-0.1, 3.55], + [-0.95, -0.05],[0.95, 0.05], + [0.4, 0.05],[-0.4, -0.05]] + chunks = [0, 0, 1, 1, 2, 2, 3, 3] rca = RCA() - rca.fit(pairs, y) + rca.fit(X, chunks) + +.. rubric:: References + -.. topic:: References: +.. container:: hatnote hatnote-gray - .. [1] Shental et al. `Adjustment learning and relevant component analysis - `_. ECCV 2002 + [1]. Shental et al. `Adjustment learning and relevant component analysis `_. ECCV 2002. - .. [2] Bar-Hillel et al. `Learning distance functions using equivalence relations `_. ICML 2003 + [2]. Bar-Hillel et al. `Learning distance functions using equivalence relations `_. ICML 2003. - .. [3] Bar-Hillel et al. `Learning a Mahalanobis metric from equivalence constraints `_. JMLR 2005 + [3]. Bar-Hillel et al. `Learning a Mahalanobis metric from equivalence constraints `_. JMLR 2005. .. _mmc: @@ -569,7 +583,7 @@ points, while constrains the sum of distances between dissimilar points: \qquad \qquad \text{s.t.} \qquad \sum_{(\mathbf{x}_i, \mathbf{x}_j) \in D} d^2_{\mathbf{M}}(\mathbf{x}_i, \mathbf{x}_j) \geq 1 -.. topic:: Example Code: +.. rubric:: Example Code :: @@ -587,13 +601,179 @@ points, while constrains the sum of distances between dissimilar points: mmc = MMC() mmc.fit(pairs, y) -.. topic:: References: +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Xing et al. `Distance metric learning with application to clustering with side-information `_. NIPS 2002. + + [2]. Adapted from Matlab code http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz . + +.. _learning_on_triplets: + +Learning on triplets +==================== + +Some metric learning algorithms learn on triplets of samples. In this case, +one should provide the algorithm with `n_samples` triplets of points. 
The + semantic of each triplet is that the first point should be closer to the second point than to the third one. +Fitting +------- + +Here is an example for fitting on triplets (see :ref:`fit_ws` for more +details on the input data format and how to fit, in the general case of +learning on tuples). + +>>> from metric_learn import SCML +>>> triplets = np.array([[[1.2, 3.2], [2.3, 5.5], [2.1, 0.6]], +>>> [[4.5, 2.3], [2.1, 2.3], [7.3, 3.4]]]) +>>> scml = SCML(random_state=42) +>>> scml.fit(triplets) +SCML(beta=1e-5, B=None, max_iter=100000, verbose=False, + preprocessor=None, random_state=None) + +Or alternatively (using a preprocessor): + +>>> X = np.array([[1.2, 3.2], +>>> [2.3, 5.5], +>>> [2.4, 6.7], +>>> [2.1, 0.6], +>>> [4.5, 2.3], +>>> [2.1, 2.3], +>>> [0.6, 1.2], +>>> [7.3, 3.4]]) +>>> triplets_indices = np.array([[0, 1, 2], [3, 4, 5]]) +>>> scml = SCML(preprocessor=X, random_state=42) +>>> scml.fit(triplets_indices) +SCML(beta=1e-5, B=None, max_iter=100000, verbose=False, + preprocessor=array([[1.2, 3.2], + [2.3, 5.5], + [2.4, 6.7], + [2.1, 0.6], + [4.5, 2.3], + [2.1, 2.3], + [0.6, 1.2], + [7.3, 3.4]]), + random_state=None) + + +Here, we want to learn a metric that, for each of the two +`triplets`, will make the first point closer to the +second point than to the third one. + +.. _triplets_predicting: + +Prediction +---------- + +When a triplets learner is fitted, it is also able to predict, for an +upcoming triplet, whether the first point is closer to the second point +than to the third one (+1), or not (-1). + +>>> triplets_test = np.array( +... [[[5.6, 5.3], [2.2, 2.1], [1.2, 3.4]], +... [[6.0, 4.2], [4.3, 1.2], [0.1, 7.8]]]) +>>> scml.predict(triplets_test) +array([-1., 1.]) + +.. _triplets_scoring: + +Scoring +------- + +Triplet metric learners can also return a `decision_function` for a set of triplets, +which corresponds to the distance between the first two points minus the distance +between the first and last points of the triplet (the higher the value, the more +similar the first point to the second point compared to the last one). This "score" +can be interpreted as a measure of likeliness of having a +1 prediction for this +triplet. + +>>> scml.decision_function(triplets_test) +array([-1.75700306, 4.98982131]) + +In the above example, for the first triplet in `triplets_test`, the first +point is predicted less similar to the second point than to the last point +(they are further away in the transformed space). + +Unlike pairs learners, triplets learners do not allow to give a `y` when fitting: we +assume that the ordering of points within triplets is such that the training triplets +are all positive. Therefore, it is not possible to use scikit-learn scoring functions +(such as 'f1_score') for triplets learners. + +However, triplets learners do have a default scoring function, which will +basically return the accuracy score on a given test set, i.e. the proportion +of triplets that have the right predicted ordering. + +>>> scml.score(triplets_test) +0.5 + +.. note:: + See :ref:`fit_ws` for more details on metric learners functions that are + not specific to learning on pairs, like `transform`, `pair_distance`, + `pair_score`, `get_metric` and `get_mahalanobis_matrix`. + + + + +Algorithms +---------- + +.. 
_scml: + +:py:class:`SCML ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sparse Compositional Metric Learning +(:py:class:`SCML `) + +`SCML` learns a squared Mahalanobis distance from triplet constraints by +optimizing sparse positive weights assigned to a set of :math:`K` rank-one +PSD bases. This can be formulated as an optimization problem with only +:math:`K` parameters, that can be solved with an efficient stochastic +composite scheme. + +The Mahalanobis matrix :math:`M` is built from a basis set :math:`B = \{b_i\}_{i=\{1,...,K\}}` +weighted by a :math:`K` dimensional vector :math:`w = \{w_i\}_{i=\{1,...,K\}}` as: + +.. math:: + + M = \sum_{i=1}^K w_i b_i b_i^T = B \cdot diag(w) \cdot B^T \quad w_i \geq 0 + +Learning :math:`M` in this form makes it PSD by design, as it is a +nonnegative sum of PSD matrices. The basis set :math:`B` is fixed in advance +and it is possible to construct it from the data. The optimization problem +over :math:`w` is formulated as a classic margin-based hinge loss function +involving the set :math:`C` of triplets. A regularization :math:`\ell_1` +is added to yield a sparse combination. The formulation is the following: + +.. math:: + + \min_{w\geq 0} \sum_{(x_i,x_j,x_k)\in C} [1 + d_w(x_i,x_j)-d_w(x_i,x_k)]_+ + \beta||w||_1 + +where :math:`[\cdot]_+` is the hinge loss. + +.. rubric:: Example Code + +:: + + from metric_learn import SCML + + triplets = [[[1.2, 7.5], [1.3, 1.5], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6], [5.4, 5.4]], + [[3.2, 7.5], [3.3, 1.5], [8.2, 9.7]], + [[3.3, 4.5], [5.2, 4.6], [7.4, 5.4]]] + + scml = SCML() + scml.fit(triplets) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. `_. (AAAI), 2014. + + [2]. Adapted from original `Matlab implementation. `_. .. _learning_on_quadruplets: @@ -602,7 +782,7 @@ Learning on quadruplets ======================= Some metric learning algorithms learn on quadruplets of samples. In this case, -one should provide the algorithm with `n_samples` quadruplets of points. Th +one should provide the algorithm with `n_samples` quadruplets of points. The semantic of each quadruplet is that the first two points should be closer together than the last two points. @@ -669,14 +849,12 @@ array([-1., 1.]) Scoring ------- -Quadruplet metric learners can also -return a `decision_function` for a set of pairs. This is basically the "score" -which sign will be taken to find the prediction for the pair, which -corresponds to the difference between the distance between the two last points, -and the distance between the two last points of the quadruplet (higher -score means the two last points are more likely to be more dissimilar than -the two first points (i.e. more likely to have a +1 prediction since it's -the right ordering)). +Quadruplet metric learners can also return a `decision_function` for a set of +quadruplets, which corresponds to the distance between the first pair of points minus +the distance between the second pair of points of the triplet (the higher the value, +the more similar the first pair is than the last pair). +This "score" can be interpreted as a measure of likeliness of having a +1 prediction +for this quadruplet. >>> lsml.decision_function(quadruplets_test) array([-1.75700306, 4.98982131]) @@ -685,17 +863,10 @@ In the above example, for the first quadruplet in `quadruplets_test`, the two first points are predicted less similar than the two last points (they are further away in the transformed space). 
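To make the link between `predict`, `decision_function` and the default `score` of quadruplets learners concrete, here is a small sketch. It assumes the `lsml` estimator and the `quadruplets_test` array from the surrounding examples, and relies on the stated behaviour that the prediction is the sign of the decision function and that the default score is the accuracy of those predictions::

    import numpy as np

    # predict() returns +1 when the first two points of a quadruplet are
    # predicted closer than the last two, and -1 otherwise; as described
    # above, the prediction follows the sign of decision_function().
    predictions = lsml.predict(quadruplets_test)
    scores = lsml.decision_function(quadruplets_test)
    print(np.array_equal(predictions, np.sign(scores)))

    # The default score() is the accuracy of those predictions, i.e. the
    # fraction of quadruplets predicted in the right ordering (+1).
    print(np.isclose(lsml.score(quadruplets_test), np.mean(predictions == 1)))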
-Unlike for pairs learners, quadruplets learners don't allow to give a `y` -when fitting, which does not allow to use scikit-learn scoring functions -like: - ->>> from sklearn.model_selection import cross_val_score ->>> cross_val_score(lsml, quadruplets, scoring='f1_score') # this won't work - -(This is actually intentional, for more details -about that, see -`this comment `_ -on github.) +Like triplet learners, quadruplets learners do not allow to give a `y` when fitting: we +assume that the ordering of points within quadruplets is such that the training quadruplets +are all positive. Therefore, it is not possible to use scikit-learn scoring functions +(such as 'f1_score') for quadruplets learners. However, quadruplets learners do have a default scoring function, which will basically return the accuracy score on a given test set, i.e. the proportion @@ -706,8 +877,8 @@ of quadruplets have the right predicted ordering. .. note:: See :ref:`fit_ws` for more details on metric learners functions that are - not specific to learning on pairs, like `transform`, `score_pairs`, - `get_metric` and `get_mahalanobis_matrix`. + not specific to learning on pairs, like `transform`, `pair_distance`, + `pair_score`, `get_metric` and `get_mahalanobis_matrix`. @@ -733,13 +904,13 @@ extension leads to more stable estimation when the dimension is high and only a small amount of constraints is given. The loss function of each constraint -:math:`d(\mathbf{x}_a, \mathbf{x}_b) < d(\mathbf{x}_c, \mathbf{x}_d)` is +:math:`d(\mathbf{x}_i, \mathbf{x}_j) < d(\mathbf{x}_k, \mathbf{x}_l)` is denoted as: .. math:: - H(d_\mathbf{M}(\mathbf{x}_a, \mathbf{x}_b) - - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_d)) + H(d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j) + - d_\mathbf{M}(\mathbf{x}_k, \mathbf{x}_l)) where :math:`H(\cdot)` is the squared Hinge loss function defined as: .. math:: H(x) = \left\{\begin{aligned}0 \qquad x\leq 0 \\ \,\,x^2 \qquad x>0\end{aligned}\right.\\ The summed loss function :math:`L(C)` is the simple sum over all constraints -:math:`C = \{(\mathbf{x}_a , \mathbf{x}_b , \mathbf{x}_c , \mathbf{x}_d) -: d(\mathbf{x}_a , \mathbf{x}_b) < d(\mathbf{x}_c , \mathbf{x}_d)\}`. The +:math:`C = \{(\mathbf{x}_i , \mathbf{x}_j , \mathbf{x}_k , \mathbf{x}_l) -: d(\mathbf{x}_i , \mathbf{x}_j) < d(\mathbf{x}_k , \mathbf{x}_l)\}`. The original paper suggested here should be a weighted sum since the confidence or probability of each constraint might differ. However, for the sake of simplicity and assumption of no extra knowledge provided, we just deploy @@ -762,9 +933,9 @@ knowledge: .. math:: - \min_\mathbf{M}(D_{ld}(\mathbf{M, M_0}) + \sum_{(\mathbf{x}_a, - \mathbf{x}_b, \mathbf{x}_c, \mathbf{x}_d)\in C}H(d_\mathbf{M}( - \mathbf{x}_a, \mathbf{x}_b) - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_c))\\ + \min_\mathbf{M}(D_{ld}(\mathbf{M, M_0}) + \sum_{(\mathbf{x}_i, + \mathbf{x}_j, \mathbf{x}_k, \mathbf{x}_l)\in C}H(d_\mathbf{M}( + \mathbf{x}_i, \mathbf{x}_j) - d_\mathbf{M}(\mathbf{x}_k, \mathbf{x}_l))\\ where :math:`\mathbf{M}_0` is the prior metric matrix, set as identity by default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence: .. math:: D_{ld}(\mathbf{M, M_0}) = \text{tr}(\mathbf{MM_0}) − \text{logdet} (\mathbf{M}) -.. topic:: Example Code: +.. rubric:: Example Code :: from metric_learn import LSML quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]], [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]], [[3.2, 7.5], [3.3, 1.5], [8.6, 2.6], [8.2, 9.7]], [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]] # we want to make closer points where the first feature is close, and # further if the second feature is close lsml = LSML() lsml.fit(quadruplets) -.. 
topic:: References: +.. rubric:: References + + +.. container:: hatnote hatnote-gray - .. [1] Liu et al. - `Metric Learning from Relative Comparisons by Minimizing Squared - Residual `_. ICDM 2012 + [1]. Liu et al. `Metric Learning from Relative Comparisons by Minimizing Squared Residual `_. ICDM 2012. - .. [2] Adapted from https://gist.github.com/kcarnold/5439917 + [2]. Code adapted from https://gist.github.com/kcarnold/5439917 . diff --git a/examples/plot_metric_learning_examples.py b/examples/plot_metric_learning_examples.py index 0d602cbb..32759636 100644 --- a/examples/plot_metric_learning_examples.py +++ b/examples/plot_metric_learning_examples.py @@ -15,7 +15,11 @@ ###################################################################### # Imports # ^^^^^^^ +# .. note:: # +# In order to show the charts of the examples you need a graphical +# ``matplotlib`` backend installed. For intance, use ``pip install pyqt5`` +# to get Qt graphical interface or use your favorite one. from sklearn.manifold import TSNE @@ -35,9 +39,9 @@ # We will be using a synthetic dataset to illustrate the plotting, # using the function `sklearn.datasets.make_classification` from # scikit-learn. The dataset will contain: -# - 100 points in 3 classes with 2 clusters per class -# - 5 features, among which 3 are informative (correlated with the class -# labels) and two are random noise with large magnitude +# - 100 points in 3 classes with 2 clusters per class +# - 5 features, among which 3 are informative (correlated with the class +# labels) and two are random noise with large magnitude X, y = make_classification(n_samples=100, n_classes=3, n_clusters_per_class=2, n_informative=3, class_sep=4., n_features=5, @@ -88,7 +92,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): # distances between points for the task at hand. Especially in higher # dimensions when Euclidean distances are a poor way to measure distance, this # becomes very useful. -# +# # Basically, we learn this distance: # :math:`D(x, x') = \sqrt{(x-x')^\top M(x-x')}`. And we learn the parameters # :math:`M` of this distance to satisfy certain constraints on the distance @@ -113,12 +117,12 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): ###################################################################### # Large Margin Nearest Neighbour # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # LMNN is a metric learning algorithm primarily designed for k-nearest # neighbor classification. The algorithm is based on semidefinite # programming, a sub-class of convex programming (as most Metric Learning # algorithms are). -# +# # The main intuition behind LMNN is to learn a pseudometric under which # all data instances in the training set are surrounded by at least k # instances that share the same class label. If this is achieved, the @@ -136,10 +140,10 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): ###################################################################### # Fit and then transform! # ----------------------- -# +# # setting up LMNN -lmnn = metric_learn.LMNN(k=5, learn_rate=1e-6) +lmnn = metric_learn.LMNN(n_neighbors=5, learn_rate=1e-6) # fit the data! lmnn.fit(X, y) @@ -162,7 +166,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): ###################################################################### # Pretty neat, huh? -# +# # The rest of this notebook will briefly explain the other Metric Learning # algorithms before plotting them. 
Also, while we have first run ``fit`` # and then ``transform`` to see our data transformed, we can also use @@ -172,10 +176,10 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): ###################################################################### # Information Theoretic Metric Learning # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # ITML uses a regularizer that automatically enforces a Semi-Definite # Positive Matrix condition - the LogDet divergence. It uses soft -# must-link or cannot like constraints, and a simple algorithm based on +# must-link or cannot-link constraints, and a simple algorithm based on # Bregman projections. Unlike LMNN, ITML will implicitly enforce points from # the same class to belong to the same cluster, as you can see below. # @@ -231,7 +235,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): ###################################################################### # Least Squares Metric Learning # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # LSML is a simple, yet effective, algorithm that learns a Mahalanobis # metric from a given set of relative comparisons. This is done by # formulating and minimizing a convex loss function that corresponds to @@ -277,7 +281,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): ###################################################################### # Local Fisher Discriminant Analysis # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # LFDA is a linear supervised dimensionality reduction method. It is # particularly useful when dealing with multimodality, where one ore more # classes consist of separate clusters in input space. The core @@ -289,7 +293,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): # - See more in the documentation of the class :py:class:`LFDA # ` -lfda = metric_learn.LFDA(k=2, num_dims=2) +lfda = metric_learn.LFDA(k=2, n_components=2) X_lfda = lfda.fit_transform(X, y) plot_tsne(X_lfda, y) @@ -298,7 +302,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): ###################################################################### # Relative Components Analysis # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # RCA is another one of the older algorithms. It learns a full rank # Mahalanobis distance metric based on a weighted sum of in-class # covariance matrices. It applies a global linear transformation to assign @@ -310,7 +314,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): # - See more in the documentation of the class :py:class:`RCA # ` -rca = metric_learn.RCA_Supervised(num_chunks=30, chunk_size=2) +rca = metric_learn.RCA_Supervised(n_chunks=30, chunk_size=2) X_rca = rca.fit_transform(X, y) plot_tsne(X_rca, y) @@ -402,7 +406,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired): def create_constraints(labels): import itertools import random - + # aggregate indices of same class zeros = np.where(y == 0)[0] ones = np.where(y == 1)[0] @@ -413,7 +417,7 @@ def create_constraints(labels): twos_ = list(itertools.combinations(twos, 2)) # put them together! 
sim = np.array(zeros_ + ones_ + twos_) - + # similarily, put together indices in different classes dis = [] for zero in zeros: @@ -424,21 +428,25 @@ def create_constraints(labels): for one in ones: for two in twos: dis.append((one, two)) - + # pick up just enough dissimilar examples as we have similar examples dis = np.array(random.sample(dis, len(sim))) - - # return an array of pairs of indices of shape=(2*len(sim), 2), and the corresponding labels, array of shape=(2*len(sim)) - # Each pair of similar points have a label of +1 and each pair of dissimilar points have a label of -1 - return (np.vstack([np.column_stack([sim[:, 0], sim[:, 1]]), np.column_stack([dis[:, 0], dis[:, 1]])]), + + # return an array of pairs of indices of shape=(2*len(sim), 2), and the + # corresponding labels, array of shape=(2*len(sim)) + # Each pair of similar points have a label of +1 and each pair of + # dissimilar points have a label of -1 + return (np.vstack([np.column_stack([sim[:, 0], sim[:, 1]]), + np.column_stack([dis[:, 0], dis[:, 1]])]), np.concatenate([np.ones(len(sim)), -np.ones(len(sim))])) + pairs, pairs_labels = create_constraints(y) ###################################################################### # Now that we've created our constraints, let's see what it looks like! -# +# print(pairs) print(pairs_labels) diff --git a/examples/plot_sandwich.py b/examples/plot_sandwich.py index 84e53d07..740852be 100644 --- a/examples/plot_sandwich.py +++ b/examples/plot_sandwich.py @@ -6,12 +6,20 @@ Sandwich demo based on code from http://nbviewer.ipython.org/6576096 """ +###################################################################### +# .. note:: +# +# In order to show the charts of the examples you need a graphical +# ``matplotlib`` backend installed. For intance, use ``pip install pyqt5`` +# to get Qt graphical interface or use your favorite one. 
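+#
+# A minimal sketch (assuming ``pyqt5`` has been installed as suggested above)
+# of selecting the Qt backend explicitly, ideally before ``pyplot`` is
+# imported:
+#
+#   import matplotlib
+#   matplotlib.use("Qt5Agg")         # switch to the Qt backend
+#   print(matplotlib.get_backend())  # sanity check, should report "Qt5Agg"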
+ import numpy as np from matplotlib import pyplot as plt from sklearn.metrics import pairwise_distances from sklearn.neighbors import NearestNeighbors -from metric_learn import LMNN, ITML_Supervised, LSML_Supervised, SDML_Supervised +from metric_learn import (LMNN, ITML_Supervised, LSML_Supervised, + SDML_Supervised) def sandwich_demo(): @@ -27,9 +35,9 @@ def sandwich_demo(): mls = [ LMNN(), - ITML_Supervised(num_constraints=200), - SDML_Supervised(num_constraints=200, balance_param=0.001), - LSML_Supervised(num_constraints=200), + ITML_Supervised(n_constraints=200), + SDML_Supervised(n_constraints=200, balance_param=0.001), + LSML_Supervised(n_constraints=200), ] for ax_num, ml in enumerate(mls, start=3): @@ -47,10 +55,10 @@ def sandwich_demo(): # TODO: use this somewhere def visualize_class_separation(X, labels): - _, (ax1,ax2) = plt.subplots(ncols=2) + _, (ax1, ax2) = plt.subplots(ncols=2) label_order = np.argsort(labels) ax1.imshow(pairwise_distances(X[label_order]), interpolation='nearest') - ax2.imshow(pairwise_distances(labels[label_order,None]), + ax2.imshow(pairwise_distances(labels[label_order, None]), interpolation='nearest') @@ -77,19 +85,19 @@ def sandwich_data(): for k, xc in enumerate(x_centers): data[i, k, 0] = np.random.normal(xc, 0.1) data[i, k, 1] = np.random.normal(yc, 0.1) - labels[i,:] = i + labels[i, :] = i return data.reshape((-1, 2)), labels.ravel() def plot_sandwich_data(x, y, axis=plt, colors='rbgmky'): for idx, val in enumerate(np.unique(y)): - xi = x[y==val] + xi = x[y == val] axis.scatter(*xi.T, s=50, facecolors='none', edgecolors=colors[idx]) def plot_neighborhood_graph(x, nn, y, axis=plt, colors='rbgmky'): for i, a in enumerate(x): - b = x[nn[i,1]] + b = x[nn[i, 1]] axis.plot((a[0], b[0]), (a[1], b[1]), colors[y[i]]) diff --git a/metric_learn/__init__.py b/metric_learn/__init__.py index b2b84559..92823fb1 100644 --- a/metric_learn/__init__.py +++ b/metric_learn/__init__.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from .constraints import Constraints from .covariance import Covariance from .itml import ITML, ITML_Supervised @@ -11,5 +9,12 @@ from .rca import RCA, RCA_Supervised from .mlkr import MLKR from .mmc import MMC, MMC_Supervised +from .scml import SCML, SCML_Supervised from ._version import __version__ + +__all__ = ['Constraints', 'Covariance', 'ITML', 'ITML_Supervised', + 'LMNN', 'LSML', 'LSML_Supervised', 'SDML', + 'SDML_Supervised', 'NCA', 'LFDA', 'RCA', 'RCA_Supervised', + 'MLKR', 'MMC', 'MMC_Supervised', 'SCML', + 'SCML_Supervised', '__version__'] diff --git a/metric_learn/_util.py b/metric_learn/_util.py index b476e70b..868ececa 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -1,6 +1,4 @@ import numpy as np -import scipy -import six from numpy.linalg import LinAlgError from sklearn.datasets import make_spd_matrix from sklearn.decomposition import PCA @@ -8,9 +6,10 @@ from sklearn.utils.validation import check_X_y, check_random_state from .exceptions import PreprocessorError, NonPSDError from sklearn.discriminant_analysis import LinearDiscriminantAnalysis -from scipy.linalg import pinvh +from scipy.linalg import pinvh, eigh import sys import time +import warnings # hack around lack of axis kwarg in older numpy versions try: @@ -283,7 +282,7 @@ def make_name(estimator): if a string is given """ if estimator is not None: - if isinstance(estimator, six.string_types): + if isinstance(estimator, str): estimator_name = estimator else: estimator_name = estimator.__class__.__name__ @@ -448,45 +447,45 @@ def 
_initialize_components(n_components, input, y=None, init='auto', The input labels (or not if there are no labels). init : string or numpy array, optional (default='auto') - Initialization of the linear transformation. Possible options are - 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape - (n_features_a, n_features_b). - - 'auto' - Depending on ``n_components``, the most reasonable initialization - will be chosen. If ``n_components <= n_classes`` we use 'lda' (see - the description of 'lda' init), as it uses labels information. If - not, but ``n_components < min(n_features, n_samples)``, we use 'pca', - as it projects data onto meaningful directions (those of higher - variance). Otherwise, we just use 'identity'. - - 'pca' - ``n_components`` principal components of the inputs passed - to :meth:`fit` will be used to initialize the transformation. - (See `sklearn.decomposition.PCA`) - - 'lda' - ``min(n_components, n_classes)`` most discriminative - components of the inputs passed to :meth:`fit` will be used to - initialize the transformation. (If ``n_components > n_classes``, - the rest of the components will be zero.) (See - `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`). - This initialization is possible only if `has_classes == True`. - - 'identity' - The identity matrix. If ``n_components`` is strictly smaller than the - dimensionality of the inputs passed to :meth:`fit`, the identity - matrix will be truncated to the first ``n_components`` rows. - - 'random' - The initial transformation will be a random array of shape - `(n_components, n_features)`. Each value is sampled from the - standard normal distribution. - - numpy array - n_features_b must match the dimensionality of the inputs passed to - :meth:`fit` and n_features_a must be less than or equal to that. - If ``n_components`` is not None, n_features_a must match it. + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda' (see + the description of 'lda' init), as it uses labels information. If + not, but ``n_components < min(n_features, n_samples)``, we use 'pca', + as it projects data onto meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`). + This initialization is possible only if `has_classes == True`. + + 'identity' + The identity matrix. If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. 
+ If ``n_components`` is not None, n_features_a must match it. verbose : bool Whether to print the details of the initialization or not. @@ -606,26 +605,26 @@ def _initialize_metric_mahalanobis(input, init='identity', random_state=None, The input samples (can be tuples or regular samples). init : string or numpy array, optional (default='identity') - Specification for the matrix to initialize. Possible options are - 'identity', 'covariance', 'random', and a numpy array of shape - (n_features, n_features). + Specification for the matrix to initialize. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). - 'identity' - An identity matrix of shape (n_features, n_features). + 'identity' + An identity matrix of shape (n_features, n_features). - 'covariance' - The (pseudo-)inverse covariance matrix (raises an error if the - covariance matrix is not definite and `strict_pd == True`) + 'covariance' + The (pseudo-)inverse covariance matrix (raises an error if the + covariance matrix is not definite and `strict_pd == True`) - 'random' - A random positive definite (PD) matrix of shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. + 'random' + A random positive definite (PD) matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - numpy array - A PSD matrix (or strictly PD if strict_pd==True) of - shape (n_features, n_features), that will be used as such to - initialize the metric, or set the prior. + numpy array + A PSD matrix (or strictly PD if strict_pd==True) of + shape (n_features, n_features), that will be used as such to + initialize the metric, or set the prior. random_state : int or `numpy.RandomState` or None, optional (default=None) A pseudo random number generator object or a seed for it if int. If @@ -678,17 +677,20 @@ def _initialize_metric_mahalanobis(input, init='identity', random_state=None, random_state = check_random_state(random_state) M = init - if isinstance(init, np.ndarray): - s, u = scipy.linalg.eigh(init) - init_is_definite = _check_sdp_from_eigen(s) + if isinstance(M, np.ndarray): + w, V = eigh(M, check_finite=False) + init_is_definite = _check_sdp_from_eigen(w) if strict_pd and not init_is_definite: raise LinAlgError("You should provide a strictly positive definite " "matrix as `{}`. This one is not definite. Try another" " {}, or an algorithm that does not " "require the {} to be strictly positive definite." 
.format(*((matrix_name,) * 3))) + elif return_inverse and not init_is_definite: + warnings.warn('The initialization matrix is not invertible: ' + 'using the pseudo-inverse instead.') if return_inverse: - M_inv = np.dot(u / s, u.T) + M_inv = _pseudo_inverse_from_eig(w, V) return M, M_inv else: return M @@ -702,20 +704,28 @@ def _initialize_metric_mahalanobis(input, init='identity', random_state=None, elif init == 'covariance': if input.ndim == 3: # if the input are tuples, we need to form an X by deduplication - X = np.vstack({tuple(row) for row in input.reshape(-1, n_features)}) + X = np.unique(np.vstack(input), axis=0) else: X = input # atleast2d is necessary to deal with scalar covariance matrices M_inv = np.atleast_2d(np.cov(X, rowvar=False)) - s, u = scipy.linalg.eigh(M_inv) - cov_is_definite = _check_sdp_from_eigen(s) + w, V = eigh(M_inv, check_finite=False) + cov_is_definite = _check_sdp_from_eigen(w) if strict_pd and not cov_is_definite: raise LinAlgError("Unable to get a true inverse of the covariance " "matrix since it is not definite. Try another " "`{}`, or an algorithm that does not " "require the `{}` to be strictly positive definite." .format(*((matrix_name,) * 2))) - M = np.dot(u / s, u.T) + elif not cov_is_definite: + warnings.warn('The covariance matrix is not invertible: ' + 'using the pseudo-inverse instead.' + 'To make the covariance matrix invertible' + ' you can remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + M = _pseudo_inverse_from_eig(w, V) if return_inverse: return M, M_inv else: @@ -742,3 +752,36 @@ def _check_n_components(n_features, n_components): if 0 < n_components <= n_features: return n_components raise ValueError('Invalid n_components, must be in [1, %d]' % n_features) + + +def _pseudo_inverse_from_eig(w, V, tol=None): + """Compute the (Moore-Penrose) pseudo-inverse of the EVD of a symetric + matrix. + + Parameters + ---------- + w : (..., M) ndarray + The eigenvalues in ascending order, each repeated according to + its multiplicity. + + v : {(..., M, M) ndarray, (..., M, M) matrix} + The column ``v[:, i]`` is the normalized eigenvector corresponding + to the eigenvalue ``w[i]``. Will return a matrix object if `a` is + a matrix object. + + tol : positive `float`, optional + Absolute eigenvalues below tol are considered zero. + + Returns + ------- + output : (..., M, N) array_like + The pseudo-inverse given by the EVD. + """ + if tol is None: + tol = np.amax(w) * np.max(w.shape) * np.finfo(w.dtype).eps + # discard small eigenvalues and invert the rest + large = np.abs(w) > tol + w = np.divide(1, w, where=large, out=w) + w[~large] = 0 + + return np.dot(V * w, np.conjugate(V).T) diff --git a/metric_learn/_version.py b/metric_learn/_version.py index 2b8877c5..a71c5c7f 100644 --- a/metric_learn/_version.py +++ b/metric_learn/_version.py @@ -1 +1 @@ -__version__ = '0.5.0' +__version__ = '0.7.0' diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 570172a9..47efe4b7 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -2,18 +2,17 @@ Base module. 
""" -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.extmath import stable_cumsum from sklearn.utils.validation import _is_arraylike, check_is_fitted from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve import numpy as np from abc import ABCMeta, abstractmethod -import six from ._util import ArrayIndexer, check_input, validate_vector import warnings -class BaseMetricLearner(six.with_metaclass(ABCMeta, BaseEstimator)): +class BaseMetricLearner(BaseEstimator, metaclass=ABCMeta): """ Base class for all metric-learners. @@ -29,26 +28,98 @@ def __init__(self, preprocessor=None): @abstractmethod def score_pairs(self, pairs): - """Returns the score between pairs + """ + Returns the score between pairs (can be a similarity, or a distance/metric depending on the algorithm) + .. deprecated:: 0.7.0 + Refer to `pair_distance` and `pair_score`. + + .. warning:: + This method will be removed in 0.8.0. Please refer to `pair_distance` + or `pair_score`. This change will occur in order to add learners + that don't necessarily learn a Mahalanobis distance. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The score of every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference between `score_pairs` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + """ + + @abstractmethod + def pair_score(self, pairs): + """ + .. versionadded:: 0.7.0 Compute the similarity score between pairs + + Returns the similarity score between pairs of points (the larger the score, + the more similar the pair). For metric learners that learn a distance, + the score is simply the opposite of the distance between pairs. All + learners have access to this method. + Parameters ---------- - pairs : `numpy.ndarray`, shape=(n_samples, 2, n_features) - 3D array of pairs. + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- - scores: `numpy.ndarray` of shape=(n_pairs,) + scores : `numpy.ndarray` of shape=(n_pairs,) The score of every pair. See Also -------- get_metric : a method that returns a function to compute the metric between - two points. The difference with `score_pairs` is that it works on two 1D - arrays and cannot use a preprocessor. Besides, the returned function is - independent of the metric learner and hence is not modified if the metric - learner is. + two points. The difference with `pair_score` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + """ + + @abstractmethod + def pair_distance(self, pairs): + """ + .. versionadded:: 0.7.0 Compute the distance between pairs + + Returns the (pseudo) distance between pairs, when available. For metric + learners that do not learn a (pseudo) distance, an error is thrown + instead. 
+ + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs for which to compute the distance, with each + row corresponding to two points, for 2D array of indices of pairs + if the metric learner uses a preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The distance between every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `pair_distance` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. """ def _check_preprocessor(self): @@ -69,19 +140,19 @@ def _prepare_inputs(self, X, y=None, type_of_inputs='classic', Parameters ---------- - input: array-like + X : array-like The input data array to check. y : array-like The input labels array to check. - type_of_inputs: `str` {'classic', 'tuples'} + type_of_inputs : `str` {'classic', 'tuples'} The type of inputs to check. If 'classic', the input should be a 2D array-like of points or a 1D array like of indicators of points. If 'tuples', the input should be a 3D array-like of tuples or a 2D array-like of indicators of tuples. - **kwargs: dict + **kwargs : dict Arguments to pass to check_input. Returns @@ -89,21 +160,29 @@ def _prepare_inputs(self, X, y=None, type_of_inputs='classic', X : `numpy.ndarray` The checked input data array. - y: `numpy.ndarray` (optional) + y : `numpy.ndarray` (optional) The checked input labels array. """ self._check_preprocessor() - return check_input(X, y, + + check_is_fitted(self, ['preprocessor_']) + outs = check_input(X, y, type_of_inputs=type_of_inputs, preprocessor=self.preprocessor_, estimator=self, tuple_size=getattr(self, '_tuple_size', None), **kwargs) + # Conform to SLEP010 + if not hasattr(self, 'n_features_in_'): + self.n_features_in_ = (outs if y is None else outs[0]).shape[1] + return outs @abstractmethod def get_metric(self): - """Returns a function that takes as input two 1D arrays and outputs the - learned metric score on these two points. + """Returns a function that takes as input two 1D arrays and outputs + the value of the learned metric on these two points. Depending on the + algorithm, it can return a distance or a similarity function between + pairs. This function will be independent from the metric learner that learned it (it will not be modified if the initial metric learner is modified), @@ -136,15 +215,25 @@ def get_metric(self): See Also -------- - score_pairs : a method that returns the metric score between several pairs - of points. Unlike `get_metric`, this is a method of the metric learner - and therefore can change if the metric learner changes. Besides, it can - use the metric learner's preprocessor, and works on concatenated arrays. + pair_distance : a method that returns the distance between several + pairs of points. Unlike `get_metric`, this is a method of the metric + learner and therefore can change if the metric learner changes. Besides, + it can use the metric learner's preprocessor, and works on concatenated + arrays. + + pair_score : a method that returns the similarity score between + several pairs of points. Unlike `get_metric`, this is a method of the + metric learner and therefore can change if the metric learner changes. + Besides, it can use the metric learner's preprocessor, and works on + concatenated arrays. 
""" -class MetricTransformer(six.with_metaclass(ABCMeta)): - +class MetricTransformer(metaclass=ABCMeta): + """ + Base class for all learners that can transform data into a new space + with the metric learned. + """ @abstractmethod def transform(self, X): """Applies the metric transformation. @@ -152,18 +241,18 @@ def transform(self, X): Parameters ---------- X : (n x d) matrix - Data to transform. + Data to transform. Returns ------- transformed : (n x d) matrix - Input data transformed to the metric space by :math:`XL^{\\top}` + Input data transformed to the metric space by :math:`XL^{\\top}` """ -class MahalanobisMixin(six.with_metaclass(ABCMeta, BaseMetricLearner, - MetricTransformer)): - """Mahalanobis metric learning algorithms. +class MahalanobisMixin(BaseMetricLearner, MetricTransformer, + metaclass=ABCMeta): + r"""Mahalanobis metric learning algorithms. Algorithm that learns a Mahalanobis (pseudo) distance :math:`d_M(x, x')`, defined between two column vectors :math:`x` and :math:`x'` by: :math:`d_M(x, @@ -178,20 +267,29 @@ class MahalanobisMixin(six.with_metaclass(ABCMeta, BaseMetricLearner, Attributes ---------- components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. """ def score_pairs(self, pairs): - """Returns the learned Mahalanobis distance between pairs. + r""" + Returns the learned Mahalanobis distance between pairs. - This distance is defined as: :math:`d_M(x, x') = \sqrt{(x-x')^T M (x-x')}` + This distance is defined as: :math:`d_M(x, x') = \\sqrt{(x-x')^T M (x-x')}` where ``M`` is the learned Mahalanobis matrix, for every pair of points ``x`` and ``x'``. This corresponds to the euclidean distance between embeddings of the points in a new space, obtained through a linear - transformation. Indeed, we have also: :math:`d_M(x, x') = \sqrt{(x_e - + transformation. Indeed, we have also: :math:`d_M(x, x') = \\sqrt{(x_e - x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See :class:`MahalanobisMixin`). + .. deprecated:: 0.7.0 + Please use `pair_distance` instead. + + .. warning:: + This method will be removed in 0.8.0. Please refer to `pair_distance` + or `pair_score`. This change will occur in order to add learners + that don't necessarily learn a Mahalanobis distance. + Parameters ---------- pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) @@ -201,20 +299,91 @@ def score_pairs(self, pairs): Returns ------- - scores: `numpy.ndarray` of shape=(n_pairs,) + scores : `numpy.ndarray` of shape=(n_pairs,) The learned Mahalanobis distance for every pair. See Also -------- get_metric : a method that returns a function to compute the metric between - two points. The difference with `score_pairs` is that it works on two 1D - arrays and cannot use a preprocessor. Besides, the returned function is - independent of the metric learner and hence is not modified if the metric - learner is. + two points. The difference with `score_pairs` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. :ref:`mahalanobis_distances` : The section of the project documentation that describes Mahalanobis Distances. """ + dpr_msg = ("score_pairs will be deprecated in release 0.7.0. 
" + "Use pair_score to compute similarity scores, or " + "pair_distances to compute distances.") + warnings.warn(dpr_msg, category=FutureWarning) + return self.pair_distance(pairs) + + def pair_score(self, pairs): + """ + Returns the opposite of the learned Mahalanobis distance between pairs. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The opposite of the learned Mahalanobis distance for every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `pair_score` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + + :ref:`mahalanobis_distances` : The section of the project documentation + that describes Mahalanobis Distances. + """ + return -1 * self.pair_distance(pairs) + + def pair_distance(self, pairs): + """ + Returns the learned Mahalanobis distance between pairs. + + This distance is defined as: :math:`d_M(x, x') = \\sqrt{(x-x')^T M (x-x')}` + where ``M`` is the learned Mahalanobis matrix, for every pair of points + ``x`` and ``x'``. This corresponds to the euclidean distance between + embeddings of the points in a new space, obtained through a linear + transformation. Indeed, we have also: :math:`d_M(x, x') = \\sqrt{(x_e - + x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See + :class:`MahalanobisMixin`). + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The learned Mahalanobis distance for every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `pair_distance` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + + :ref:`mahalanobis_distances` : The section of the project documentation + that describes Mahalanobis Distances. + """ + check_is_fitted(self, ['preprocessor_']) pairs = check_input(pairs, type_of_inputs='tuples', preprocessor=self.preprocessor_, estimator=self, tuple_size=2) @@ -240,12 +409,14 @@ def transform(self, X): X_embedded : `numpy.ndarray`, shape=(n_samples, n_components) The embedded data points. """ + check_is_fitted(self, ['preprocessor_', 'components_']) X_checked = check_input(X, type_of_inputs='classic', estimator=self, - preprocessor=self.preprocessor_, - accept_sparse=True) + preprocessor=self.preprocessor_, + accept_sparse=True) return X_checked.dot(self.components_.T) def get_metric(self): + check_is_fitted(self, 'components_') components_T = self.components_.T.copy() def metric_fun(u, v, squared=False): @@ -266,7 +437,7 @@ def metric_fun(u, v, squared=False): Returns ------- - distance: float + distance : float The distance between u and v according to the new metric. 
""" u = validate_vector(u) @@ -281,15 +452,6 @@ def metric_fun(u, v, squared=False): get_metric.__doc__ = BaseMetricLearner.get_metric.__doc__ - def metric(self): - """Deprecated. Will be removed in v0.6.0. Use `get_mahalanobis_matrix` - instead""" - # TODO: remove this method in version 0.6.0 - warnings.warn(("`metric` is deprecated since version 0.5.0 and will be " - "removed in 0.6.0. Use `get_mahalanobis_matrix` instead."), - DeprecationWarning) - return self.get_mahalanobis_matrix() - def get_mahalanobis_matrix(self): """Returns a copy of the Mahalanobis matrix learned by the metric learner. @@ -298,20 +460,22 @@ def get_mahalanobis_matrix(self): M : `numpy.ndarray`, shape=(n_features, n_features) The copy of the learned Mahalanobis matrix. """ + check_is_fitted(self, 'components_') return self.components_.T.dot(self.components_) -class _PairsClassifierMixin(BaseMetricLearner): +class _PairsClassifierMixin(BaseMetricLearner, ClassifierMixin): """Base class for pairs learners. Attributes ---------- threshold_ : `float` - If the distance metric between two points is lower than this threshold, - points will be classified as similar, otherwise they will be - classified as dissimilar. + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. """ + classes_ = np.array([0, 1]) _tuple_size = 2 # number of points in a tuple, 2 for pairs def predict(self, pairs): @@ -333,7 +497,12 @@ def predict(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted learned metric value between samples in every pair. """ - check_is_fitted(self, ['threshold_', 'components_']) + check_is_fitted(self, 'preprocessor_') + + if "threshold_" not in vars(self): + msg = ("A threshold for this estimator has not been set, " + "call its set_threshold or calibrate_threshold method.") + raise AttributeError(msg) return 2 * (- self.decision_function(pairs) <= self.threshold_) - 1 def decision_function(self, pairs): @@ -357,10 +526,11 @@ def decision_function(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted decision function value for each pair. """ + check_is_fitted(self, 'preprocessor_') pairs = check_input(pairs, type_of_inputs='tuples', preprocessor=self.preprocessor_, estimator=self, tuple_size=self._tuple_size) - return - self.score_pairs(pairs) + return self.pair_score(pairs) def score(self, pairs, y): """Computes score of pairs similarity prediction. @@ -407,7 +577,15 @@ def set_threshold(self, threshold): self : `_PairsClassifier` The pairs classifier with the new threshold set. """ - self.threshold_ = threshold + check_is_fitted(self, 'preprocessor_') + try: + self.threshold_ = float(threshold) + except TypeError: + raise ValueError('Parameter threshold must be a real number. ' + 'Got {} instead.'.format(type(threshold))) + except ValueError: + raise ValueError('Parameter threshold must be a real number. ' + 'Got {} instead.'.format(type(threshold))) return self def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', @@ -463,12 +641,13 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', evaluation tool in clinical medicine, MH Zweig, G Campbell - Clinical chemistry, 1993 - .. [2] most of the code of this function is from scikit-learn's PR #10117 + .. 
[2] Most of the code of this function is from scikit-learn's PR #10117 See Also -------- sklearn.calibration : scikit-learn's module for calibrating classifiers """ + check_is_fitted(self, 'preprocessor_') self._validate_calibration_params(strategy, min_rate, beta) @@ -574,10 +753,98 @@ def _validate_calibration_params(strategy='accuracy', min_rate=None, 'Got {} instead.'.format(type(beta))) -class _QuadrupletsClassifierMixin(BaseMetricLearner): - """Base class for quadruplets learners. +class _TripletsClassifierMixin(BaseMetricLearner, ClassifierMixin): + """ + Base class for triplets learners. + """ + + classes_ = np.array([0, 1]) + _tuple_size = 3 # number of points in a tuple, 3 for triplets + + def predict(self, triplets): + """Predicts the ordering between sample distances in input triplets. + + For each triplets, returns 1 if the first element is closer to the second + than to the last and -1 if not. + + Parameters + ---------- + triplets : array-like, shape=(n_triplets, 3, n_features) or (n_triplets, 3) + 3D array of triplets to predict, with each row corresponding to three + points, or 2D array of indices of triplets if the metric learner + uses a preprocessor. + + Returns + ------- + prediction : `numpy.ndarray` of floats, shape=(n_constraints,) + Predictions of the ordering of pairs, for each triplet. + """ + return 2 * (self.decision_function(triplets) > 0) - 1 + + def decision_function(self, triplets): + """Predicts differences between sample distances in input triplets. + + For each triplet (X_a, X_b, X_c) in the samples, computes the difference + between the learned distance of the second pair (X_a, X_c) minus the + learned distance of the first pair (X_a, X_b). The higher it is, the more + probable it is that the pairs in the triplets are presented in the right + order, i.e. that the label of the triplet is 1. The lower it is, the more + probable it is that the label of the triplet is -1. + + Parameters + ---------- + triplet : array-like, shape=(n_triplets, 3, n_features) or \ + (n_triplets, 3) + 3D array of triplets to predict, with each row corresponding to three + points, or 2D array of indices of triplets if the metric learner + uses a preprocessor. + + Returns + ------- + decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) + Metric differences. + """ + check_is_fitted(self, 'preprocessor_') + triplets = check_input(triplets, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=self._tuple_size) + return (self.pair_score(triplets[:, :2]) - + self.pair_score(triplets[:, [0, 2]])) + + def score(self, triplets): + """Computes score on input triplets. + + Returns the accuracy score of the following classification task: a triplet + (X_a, X_b, X_c) is correctly classified if the predicted similarity between + the first pair (X_a, X_b) is higher than that of the second pair (X_a, X_c) + + Parameters + ---------- + triplets : array-like, shape=(n_triplets, 3, n_features) or \ + (n_triplets, 3) + 3D array of triplets to score, with each row corresponding to three + points, or 2D array of indices of triplets if the metric learner + uses a preprocessor. + + Returns + ------- + score : float + The triplets score. + """ + # Since the prediction is a vector of values in {-1, +1}, we need to + # rescale them to {0, 1} to compute the accuracy using the mean (because + # then 1 means a correctly classified result (pairs are in the right + # order), and a 0 an incorrectly classified result (pairs are in the + # wrong order). 
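+    # For example, if 9 out of 10 triplets are predicted +1 (right order)
+    # and 1 is predicted -1, the mean is 0.8 and the returned accuracy is
+    # 0.8 / 2 + 0.5 = 0.9, i.e. 9 correctly ordered triplets out of 10.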
+ return self.predict(triplets).mean() / 2 + 0.5 + + +class _QuadrupletsClassifierMixin(BaseMetricLearner, ClassifierMixin): + """ + Base class for quadruplets learners. """ + classes_ = np.array([0, 1]) _tuple_size = 4 # number of points in a tuple, 4 for quadruplets def predict(self, quadruplets): @@ -599,10 +866,6 @@ def predict(self, quadruplets): prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Predictions of the ordering of pairs, for each quadruplet. """ - check_is_fitted(self, 'components_') - quadruplets = check_input(quadruplets, type_of_inputs='tuples', - preprocessor=self.preprocessor_, - estimator=self, tuple_size=self._tuple_size) return np.sign(self.decision_function(quadruplets)) def decision_function(self, quadruplets): @@ -628,11 +891,12 @@ def decision_function(self, quadruplets): decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) Metric differences. """ + check_is_fitted(self, 'preprocessor_') quadruplets = check_input(quadruplets, type_of_inputs='tuples', preprocessor=self.preprocessor_, estimator=self, tuple_size=self._tuple_size) - return (self.score_pairs(quadruplets[:, 2:]) - - self.score_pairs(quadruplets[:, :2])) + return (self.pair_score(quadruplets[:, :2]) - + self.pair_score(quadruplets[:, 2:])) def score(self, quadruplets): """Computes score on input quadruplets diff --git a/metric_learn/constraints.py b/metric_learn/constraints.py index e42ef4b8..4993e9ef 100644 --- a/metric_learn/constraints.py +++ b/metric_learn/constraints.py @@ -4,87 +4,282 @@ """ import numpy as np import warnings -from six.moves import xrange -from scipy.sparse import coo_matrix from sklearn.utils import check_random_state +from sklearn.neighbors import NearestNeighbors + __all__ = ['Constraints'] class Constraints(object): """ - Class to build constraints from labels. + Class to build constraints from labeled data. + + See more in the :ref:`User Guide `. - See more in the :ref:`User Guide ` + Parameters + ---------- + partial_labels : `numpy.ndarray` of ints, shape=(n_samples,) + Array of labels, with -1 indicating unknown label. + + Attributes + ---------- + partial_labels : `numpy.ndarray` of ints, shape=(n_samples,) + Array of labels, with -1 indicating unknown label. """ + def __init__(self, partial_labels): - '''partial_labels : int arraylike, -1 indicating unknown label''' partial_labels = np.asanyarray(partial_labels, dtype=int) - self.num_points, = partial_labels.shape - self.known_label_idx, = np.where(partial_labels >= 0) - self.known_labels = partial_labels[self.known_label_idx] + self.partial_labels = partial_labels - def adjacency_matrix(self, num_constraints, random_state=None): - random_state = check_random_state(random_state) - a, b, c, d = self.positive_negative_pairs(num_constraints, - random_state=random_state) - row = np.concatenate((a, c)) - col = np.concatenate((b, d)) - data = np.ones_like(row, dtype=int) - data[len(a):] = -1 - adj = coo_matrix((data, (row, col)), shape=(self.num_points,)*2) - # symmetrize - return adj + adj.T - - def positive_negative_pairs(self, num_constraints, same_length=False, - random_state=None): + def positive_negative_pairs(self, n_constraints, same_length=False, + random_state=None, num_constraints='deprecated'): + """ + Generates positive pairs and negative pairs from labeled data. + + Positive pairs are formed by randomly drawing ``n_constraints`` pairs of + points with the same label. Negative pairs are formed by randomly drawing + ``n_constraints`` pairs of points with different label. 
+ + In the case where it is not possible to generate enough positive or + negative pairs, a smaller number of pairs will be returned with a warning. + + Parameters + ---------- + n_constraints : int + Number of positive and negative constraints to generate. + + same_length : bool, optional (default=False) + If True, forces the number of positive and negative pairs to be + equal by ignoring some pairs from the larger set. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + Returns + ------- + a : array-like, shape=(n_constraints,) + 1D array of indicators for the left elements of positive pairs. + + b : array-like, shape=(n_constraints,) + 1D array of indicators for the right elements of positive pairs. + + c : array-like, shape=(n_constraints,) + 1D array of indicators for the left elements of negative pairs. + + d : array-like, shape=(n_constraints,) + 1D array of indicators for the right elements of negative pairs. + """ + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints random_state = check_random_state(random_state) - a, b = self._pairs(num_constraints, same_label=True, + a, b = self._pairs(n_constraints, same_label=True, random_state=random_state) - c, d = self._pairs(num_constraints, same_label=False, + c, d = self._pairs(n_constraints, same_label=False, random_state=random_state) if same_length and len(a) != len(c): n = min(len(a), len(c)) return a[:n], b[:n], c[:n], d[:n] return a, b, c, d - def _pairs(self, num_constraints, same_label=True, max_iter=10, + def generate_knntriplets(self, X, k_genuine, k_impostor): + """ + Generates triplets from labeled data. + + For every point (X_a) the triplets (X_a, X_b, X_c) are constructed from all + the combinations of taking one of its `k_genuine`-nearest neighbors of the + same class (X_b) and taking one of its `k_impostor`-nearest neighbors of + other classes (X_c). + + In the case a class doesn't have enough points in the same class (other + classes) to yield `k_genuine` (`k_impostor`) neighbors a warning will be + raised and the maximum value of genuine (impostor) neighbors will be used + for that class. + + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + k_genuine : int + Number of neighbors of the same class to be taken into account. + + k_impostor : int + Number of neighbors of different classes to be taken into account. + + Returns + ------- + triplets : array-like, shape=(n_constraints, 3) + 2D array of triplets of indicators. + """ + # Ignore unlabeled samples + known_labels_mask = self.partial_labels >= 0 + known_labels = self.partial_labels[known_labels_mask] + X = X[known_labels_mask] + + labels, labels_count = np.unique(known_labels, return_counts=True) + len_input = known_labels.shape[0] + + # Handle the case where there are too few elements to yield k_genuine or + # k_impostor neighbors for every class. 
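+    # For instance, a class with only 3 labeled points can provide at most
+    # 2 same-class neighbors, so k_genuine is capped at (count - 1) for that
+    # class; likewise k_impostor is capped at the number of labeled points
+    # belonging to the other classes.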
+ + k_genuine_vec = np.full_like(labels, k_genuine) + k_impostor_vec = np.full_like(labels, k_impostor) + + for i, count in enumerate(labels_count): + if k_genuine + 1 > count: + k_genuine_vec[i] = count-1 + warnings.warn("The class {} has {} elements, which is not sufficient " + "to generate {} genuine neighbors as specified by " + "k_genuine. Will generate {} genuine neighbors instead." + "\n" + .format(labels[i], count, k_genuine+1, + k_genuine_vec[i])) + if k_impostor > len_input - count: + k_impostor_vec[i] = len_input - count + warnings.warn("The class {} has {} elements of other classes, which is" + " not sufficient to generate {} impostor neighbors as " + "specified by k_impostor. Will generate {} impostor " + "neighbors instead.\n" + .format(labels[i], k_impostor_vec[i], k_impostor, + k_impostor_vec[i])) + + # The total number of possible triplets combinations per label comes from + # taking one of the k_genuine_vec[i] genuine neighbors and one of the + # k_impostor_vec[i] impostor neighbors for the labels_count[i] elements + comb_per_label = labels_count * k_genuine_vec * k_impostor_vec + + # Get start and finish for later triplet assigning + # append zero at the begining for start and get cumulative sum + start_finish_indices = np.hstack((0, comb_per_label)).cumsum() + + # Total number of triplets is the sum of all possible combinations per + # label + num_triplets = start_finish_indices[-1] + triplets = np.empty((num_triplets, 3), dtype=np.intp) + + neigh = NearestNeighbors() + + for i, label in enumerate(labels): + + # generate mask for current label + gen_mask = known_labels == label + gen_indx = np.where(gen_mask) + + # get k_genuine genuine neighbors + neigh.fit(X=X[gen_indx]) + # Take elements of gen_indx according to the yielded k-neighbors + gen_relative_indx = neigh.kneighbors(n_neighbors=k_genuine_vec[i], + return_distance=False) + gen_neigh = np.take(gen_indx, gen_relative_indx) + + # generate mask for impostors of current label + imp_indx = np.where(~gen_mask) + + # get k_impostor impostor neighbors + neigh.fit(X=X[imp_indx]) + # Take elements of imp_indx according to the yielded k-neighbors + imp_relative_indx = neigh.kneighbors(n_neighbors=k_impostor_vec[i], + X=X[gen_mask], + return_distance=False) + imp_neigh = np.take(imp_indx, imp_relative_indx) + + # length = len_label*k_genuine*k_impostor + start, finish = start_finish_indices[i:i+2] + + triplets[start:finish, :] = comb(gen_indx, gen_neigh, imp_neigh, + k_genuine_vec[i], + k_impostor_vec[i]) + + return triplets + + def _pairs(self, n_constraints, same_label=True, max_iter=10, random_state=np.random): - num_labels = len(self.known_labels) + known_label_idx, = np.where(self.partial_labels >= 0) + known_labels = self.partial_labels[known_label_idx] + num_labels = len(known_labels) ab = set() it = 0 - while it < max_iter and len(ab) < num_constraints: - nc = num_constraints - len(ab) + while it < max_iter and len(ab) < n_constraints: + nc = n_constraints - len(ab) for aidx in random_state.randint(num_labels, size=nc): if same_label: - mask = self.known_labels[aidx] == self.known_labels + mask = known_labels[aidx] == known_labels mask[aidx] = False # avoid identity pairs else: - mask = self.known_labels[aidx] != self.known_labels + mask = known_labels[aidx] != known_labels b_choices, = np.where(mask) if len(b_choices) > 0: ab.add((aidx, random_state.choice(b_choices))) it += 1 - if len(ab) < num_constraints: + if len(ab) < n_constraints: warnings.warn("Only generated %d %s constraints (requested %d)" % ( - 
len(ab), 'positive' if same_label else 'negative', num_constraints)) - ab = np.array(list(ab)[:num_constraints], dtype=int) - return self.known_label_idx[ab.T] + len(ab), 'positive' if same_label else 'negative', n_constraints)) + ab = np.array(list(ab)[:n_constraints], dtype=int) + return known_label_idx[ab.T] - def chunks(self, num_chunks=100, chunk_size=2, random_state=None): + def chunks(self, n_chunks=100, chunk_size=2, random_state=None, + num_chunks='deprecated'): """ - the random state object to be passed must be a numpy random seed + Generates chunks from labeled data. + + Each of ``n_chunks`` chunks is composed of ``chunk_size`` points from + the same class drawn at random. Each point can belong to at most 1 chunk. + + In the case where there is not enough points to generate ``n_chunks`` + chunks of size ``chunk_size``, a ValueError will be raised. + + Parameters + ---------- + n_chunks : int, optional (default=100) + Number of chunks to generate. + + chunk_size : int, optional (default=2) + Number of points in each chunk. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + num_chunks : Renamed to n_chunks. Will be deprecated in 0.7.0 + + Returns + ------- + chunks : array-like, shape=(n_samples,) + 1D array of chunk indicators, where -1 indicates that the point does not + belong to any chunk. """ + if num_chunks != 'deprecated': + warnings.warn('"num_chunks" parameter has been renamed to' + ' "n_chunks". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_chunks = num_chunks random_state = check_random_state(random_state) - chunks = -np.ones_like(self.known_label_idx, dtype=int) - uniq, lookup = np.unique(self.known_labels, return_inverse=True) - all_inds = [set(np.where(lookup == c)[0]) for c in xrange(len(uniq))] + chunks = -np.ones_like(self.partial_labels, dtype=int) + uniq, lookup = np.unique(self.partial_labels, return_inverse=True) + unknown_uniq = np.where(uniq < 0)[0] + all_inds = [set(np.where(lookup == c)[0]) for c in range(len(uniq)) + if c not in unknown_uniq] + max_chunks = int(np.sum([len(s) // chunk_size for s in all_inds])) + if max_chunks < n_chunks: + raise ValueError(('Not enough possible chunks of %d elements in each' + ' class to form expected %d chunks - maximum number' + ' of chunks is %d' + ) % (chunk_size, n_chunks, max_chunks)) idx = 0 - while idx < num_chunks and all_inds: + while idx < n_chunks and all_inds: if len(all_inds) == 1: c = 0 else: - c = random_state.randint(0, high=len(all_inds)-1) + c = random_state.randint(0, high=len(all_inds) - 1) inds = all_inds[c] if len(inds) < chunk_size: del all_inds[c] @@ -93,12 +288,18 @@ def chunks(self, num_chunks=100, chunk_size=2, random_state=None): inds.difference_update(ii) chunks[ii] = idx idx += 1 - if idx < num_chunks: - raise ValueError('Unable to make %d chunks of %d examples each' % - (num_chunks, chunk_size)) return chunks +def comb(A, B, C, sizeB, sizeC): + # generate_knntriplets helper function + # generate an array with all combinations of choosing + # an element from A, B and C + return np.vstack((np.tile(A, (sizeB*sizeC, 1)).ravel(order='F'), + np.tile(np.hstack(B), (sizeC, 1)).ravel(order='F'), + np.tile(C, (1, sizeB)).ravel())).T + + def wrap_pairs(X, constraints): a = np.array(constraints[0]) b = np.array(constraints[1]) diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 7214dd62..2c05b28d 100644 --- 
a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -2,7 +2,6 @@ Covariance metric (baseline method) """ -from __future__ import absolute_import import numpy as np import scipy from sklearn.base import TransformerMixin @@ -43,6 +42,10 @@ def __init__(self, preprocessor=None): def fit(self, X, y=None): """ + Calculates the covariance matrix of the input data. + + Parameters + ---------- X : data matrix, (n x d) y : unused """ diff --git a/metric_learn/itml.py b/metric_learn/itml.py index c3b91fc4..9537eec2 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -2,17 +2,14 @@ Information Theoretic Metric Learning (ITML) """ -from __future__ import print_function, absolute_import -import warnings import numpy as np -from six.moves import xrange -from sklearn.exceptions import ChangedBehaviorWarning from sklearn.metrics import pairwise_distances from sklearn.utils.validation import check_array from sklearn.base import TransformerMixin from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs from ._util import components_from_metric, _initialize_metric_mahalanobis +import warnings class _BaseITML(MahalanobisMixin): @@ -20,29 +17,31 @@ class _BaseITML(MahalanobisMixin): _tuple_size = 2 # constraints are pairs - def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, - prior='identity', A0='deprecated', verbose=False, - preprocessor=None, random_state=None): + def __init__(self, gamma=1., max_iter=1000, tol=1e-3, + prior='identity', verbose=False, + preprocessor=None, random_state=None, + convergence_threshold='deprecated'): + if convergence_threshold != 'deprecated': + warnings.warn('"convergence_threshold" parameter has been ' + ' renamed to "tol". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + tol = convergence_threshold + self.convergence_threshold = 'deprecated' # Avoid errors self.gamma = gamma self.max_iter = max_iter - self.convergence_threshold = convergence_threshold + self.tol = tol self.prior = prior - self.A0 = A0 self.verbose = verbose self.random_state = random_state super(_BaseITML, self).__init__(preprocessor) def _fit(self, pairs, y, bounds=None): - if self.A0 != 'deprecated': - warnings.warn('"A0" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. Use "prior" instead.', - DeprecationWarning) pairs, y = self._prepare_inputs(pairs, y, type_of_inputs='tuples') # init bounds if bounds is None: - X = np.vstack({tuple(row) for row in pairs.reshape(-1, pairs.shape[2])}) + X = np.unique(np.vstack(pairs), axis=0) self.bounds_ = np.percentile(pairwise_distances(X), (5, 95)) else: bounds = check_array(bounds, allow_nd=False, ensure_min_samples=0, @@ -63,30 +62,31 @@ def _fit(self, pairs, y, bounds=None): num_neg = len(neg_pairs) _lambda = np.zeros(num_pos + num_neg) lambdaold = np.zeros_like(_lambda) - gamma_proj = 1. if gamma is np.inf else gamma/(gamma+1.) + gamma_proj = 1. if gamma is np.inf else gamma / (gamma + 1.) pos_bhat = np.zeros(num_pos) + self.bounds_[0] neg_bhat = np.zeros(num_neg) + self.bounds_[1] pos_vv = pos_pairs[:, 0, :] - pos_pairs[:, 1, :] neg_vv = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] - for it in xrange(self.max_iter): + for it in range(self.max_iter): # update positives - for i,v in enumerate(pos_vv): + for i, v in enumerate(pos_vv): wtw = v.dot(A).dot(v) # scalar - alpha = min(_lambda[i], gamma_proj*(1./wtw - 1./pos_bhat[i])) + alpha = min(_lambda[i], gamma_proj * (1. 
/ wtw - 1. / pos_bhat[i])) _lambda[i] -= alpha - beta = alpha/(1 - alpha*wtw) - pos_bhat[i] = 1./((1 / pos_bhat[i]) + (alpha / gamma)) + beta = alpha / (1 - alpha * wtw) + pos_bhat[i] = 1. / ((1 / pos_bhat[i]) + (alpha / gamma)) Av = A.dot(v) A += np.outer(Av, Av * beta) # update negatives - for i,v in enumerate(neg_vv): + for i, v in enumerate(neg_vv): wtw = v.dot(A).dot(v) # scalar - alpha = min(_lambda[i+num_pos], gamma_proj*(1./neg_bhat[i] - 1./wtw)) - _lambda[i+num_pos] -= alpha - beta = -alpha/(1 + alpha*wtw) - neg_bhat[i] = 1./((1 / neg_bhat[i]) - (alpha / gamma)) + alpha = min(_lambda[i + num_pos], + gamma_proj * (1. / neg_bhat[i] - 1. / wtw)) + _lambda[i + num_pos] -= alpha + beta = -alpha / (1 + alpha * wtw) + neg_bhat[i] = 1. / ((1 / neg_bhat[i]) - (alpha / gamma)) Av = A.dot(v) A += np.outer(Av, Av * beta) @@ -95,7 +95,7 @@ def _fit(self, pairs, y, bounds=None): conv = np.inf break conv = np.abs(lambdaold - _lambda).sum() / normsum - if conv < self.convergence_threshold: + if conv < self.tol: break lambdaold = _lambda.copy() if self.verbose: @@ -125,91 +125,91 @@ class ITML(_BaseITML, _PairsClassifierMixin): Parameters ---------- - gamma : float, optional (default=1.) - Value for slack variables + gamma : float, optional (default=1.0) + Value for slack variables max_iter : int, optional (default=1000) - Maximum number of iteration of the optimization procedure. + Maximum number of iteration of the optimization procedure. - convergence_threshold : float, optional (default=1e-3) - Convergence tolerance. + tol : float, optional (default=1e-3) + Convergence tolerance. prior : string or numpy array, optional (default='identity') - The Mahalanobis matrix to use as a prior. Possible options are - 'identity', 'covariance', 'random', and a numpy array of shape - (n_features, n_features). For ITML, the prior should be strictly - positive definite (PD). + The Mahalanobis matrix to use as a prior. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). For ITML, the prior should be strictly + positive definite (PD). - 'identity' - An identity matrix of shape (n_features, n_features). + 'identity' + An identity matrix of shape (n_features, n_features). - 'covariance' - The inverse covariance matrix. + 'covariance' + The inverse covariance matrix. - 'random' - The prior will be a random SPD matrix of shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. - - A0 : Not used - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'prior' instead. + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. verbose : bool, optional (default=False) - If True, prints information while learning + If True, prints information while learning preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``prior='random'``, ``random_state`` is used to set the prior. + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 Attributes ---------- bounds_ : `numpy.ndarray`, shape=(2,) - Bounds on similarity, aside slack variables, s.t. - ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` - and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of - dissimilar points ``c`` and ``d``, with ``d`` the learned distance. If - not provided at initialization, bounds_[0] and bounds_[1] are set at - train time to the 5th and 95th percentile of the pairwise distances among - all points present in the input `pairs`. + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. If + not provided at initialization, bounds_[0] and bounds_[1] are set at + train time to the 5th and 95th percentile of the pairwise distances among + all points present in the input `pairs`. n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) threshold_ : `float` - If the distance metric between two points is lower than this threshold, - points will be classified as similar, otherwise they will be - classified as dissimilar. + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. Examples -------- - >>> from metric_learn import ITML_Supervised - >>> from sklearn.datasets import load_iris - >>> iris_data = load_iris() - >>> X = iris_data['data'] - >>> Y = iris_data['target'] - >>> itml = ITML_Supervised(num_constraints=200) - >>> itml.fit(X, Y) + >>> from metric_learn import ITML + >>> pairs = [[[1.2, 7.5], [1.3, 1.5]], + >>> [[6.4, 2.6], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6]], + >>> [[6.2, 5.5], [5.4, 5.4]]] + >>> y = [1, 1, -1, -1] + >>> # in this task we want points where the first feature is close to be + >>> # closer to each other, no matter how close the second feature is + >>> itml = ITML() + >>> itml.fit(pairs, y) References ---------- - .. [1] `Information-theoretic Metric Learning - `_ Jason V. Davis, et al. + .. [1] Jason V. Davis, et al. `Information-theoretic Metric Learning + `_. ICML 2007. """ def fit(self, pairs, y, bounds=None, calibration_params=None): @@ -222,28 +222,31 @@ def fit(self, pairs, y, bounds=None, calibration_params=None): ---------- pairs: array-like, shape=(n_constraints, 2, n_features) or \ (n_constraints, 2) - 3D Array of pairs with each row corresponding to two points, - or 2D array of indices of pairs if the metric learner uses a - preprocessor. + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. 
+ y: array-like, of shape (n_constraints,) - Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + bounds : array-like of two numbers - Bounds on similarity, aside slack variables, s.t. - ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` - and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of - dissimilar points ``c`` and ``d``, with ``d`` the learned distance. - If not provided at initialization, bounds_[0] and bounds_[1] will be - set to the 5th and 95th percentile of the pairwise distances among all - points present in the input `pairs`. + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] will be + set to the 5th and 95th percentile of the pairwise distances among all + points present in the input `pairs`. + calibration_params : `dict` or `None` - Dictionary of parameters to give to `calibrate_threshold` for the - threshold calibration step done at the end of `fit`. If `None` is - given, `calibrate_threshold` will use the default parameters. + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. Returns ------- self : object - Returns the instance. + Returns the instance. """ calibration_params = (calibration_params if calibration_params is not None else dict()) @@ -262,76 +265,85 @@ class ITML_Supervised(_BaseITML, TransformerMixin): Parameters ---------- - gamma : float, optional - value for slack variables - max_iter : int, optional - convergence_threshold : float, optional - num_labeled : Not used - .. deprecated:: 0.5.0 - `num_labeled` was deprecated in version 0.5.0 and will - be removed in 0.6.0. - num_constraints: int, optional - number of constraints to generate - bounds : Not used - .. deprecated:: 0.5.0 - `bounds` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Set `bounds` at fit time instead : - `itml_supervised.fit(X, y, bounds=...)` + gamma : float, optional (default=1.0) + Value for slack variables + + max_iter : int, optional (default=1000) + Maximum number of iterations of the optimization procedure. + + tol : float, optional (default=1e-3) + Tolerance of the optimization procedure. + + n_constraints : int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. prior : string or numpy array, optional (default='identity') - Initialization of the Mahalanobis matrix. Possible options are - 'identity', 'covariance', 'random', and a numpy array of shape - (n_features, n_features). For ITML, the prior should be strictly - positive definite (PD). - - 'identity' - An identity matrix of shape (n_features, n_features). - - 'covariance' - The inverse covariance matrix. - - 'random' - The prior will be a random SPD matrix of shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. - - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. - A0 : Not used - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'prior' instead. 
- verbose : bool, optional - if True, prints information while learning + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). For ITML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + verbose : bool, optional (default=False) + If True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``prior='random'``, ``random_state`` is used to set the prior. In any - case, `random_state` is also used to randomly sample constraints from - labels. + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. In any + case, `random_state` is also used to randomly sample constraints from + labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 Attributes ---------- bounds_ : `numpy.ndarray`, shape=(2,) - Bounds on similarity, aside slack variables, s.t. - ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` - and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of - dissimilar points ``c`` and ``d``, with ``d`` the learned distance. - If not provided at initialization, bounds_[0] and bounds_[1] are set at - train time to the 5th and 95th percentile of the pairwise distances - among all points in the training data `X`. + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] are set at + train time to the 5th and 95th percentile of the pairwise distances + among all points in the training data `X`. n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) 
+ + Examples + -------- + >>> from metric_learn import ITML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> itml = ITML_Supervised(n_constraints=200) + >>> itml.fit(X, Y) See Also -------- @@ -340,75 +352,56 @@ class ITML_Supervised(_BaseITML, TransformerMixin): that describes the supervised version of weakly supervised estimators. """ - def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, - num_labeled='deprecated', num_constraints=None, - bounds='deprecated', prior='identity', A0='deprecated', - verbose=False, preprocessor=None, random_state=None): + def __init__(self, gamma=1.0, max_iter=1000, tol=1e-3, + n_constraints=None, prior='identity', + verbose=False, preprocessor=None, random_state=None, + num_constraints='deprecated', + convergence_threshold='deprecated'): _BaseITML.__init__(self, gamma=gamma, max_iter=max_iter, - convergence_threshold=convergence_threshold, - A0=A0, prior=prior, verbose=verbose, - preprocessor=preprocessor, random_state=random_state) - self.num_labeled = num_labeled - self.num_constraints = num_constraints - self.bounds = bounds - - def fit(self, X, y, random_state='deprecated', bounds=None): + tol=tol, + prior=prior, verbose=verbose, + preprocessor=preprocessor, + random_state=random_state, + convergence_threshold=convergence_threshold) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_constraints = num_constraints + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed should be set) + self.num_constraints = 'deprecated' + + def fit(self, X, y, bounds=None): """Create constraints from labels and learn the ITML model. Parameters ---------- X : (n x d) matrix - Input data, where each row corresponds to a single instance. + Input data, where each row corresponds to a single instance. y : (n) array-like - Data labels. - - random_state : Not used - .. deprecated:: 0.5.0 - `random_state` in the `fit` function was deprecated in version 0.5.0 - and will be removed in 0.6.0. Set `random_state` at initialization - instead (when instantiating a new `ITML_Supervised` object). + Data labels. bounds : array-like of two numbers - Bounds on similarity, aside slack variables, s.t. - ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` - and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of - dissimilar points ``c`` and ``d``, with ``d`` the learned distance. - If not provided at initialization, bounds_[0] and bounds_[1] will be - set to the 5th and 95th percentile of the pairwise distances among all - points in the training data `X`. + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] will be + set to the 5th and 95th percentile of the pairwise distances among all + points in the training data `X`. """ - # TODO: remove these in v0.6.0 - if self.num_labeled != 'deprecated': - warnings.warn('"num_labeled" parameter is not used.' 
- ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0', DeprecationWarning) - if self.bounds != 'deprecated': - warnings.warn('"bounds" parameter from initialization is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. Use the "bounds" parameter of this ' - 'fit method instead.', DeprecationWarning) - if random_state != 'deprecated': - warnings.warn('"random_state" parameter in the `fit` function is ' - 'deprecated. Set `random_state` at initialization ' - 'instead (when instantiating a new `ITML_Supervised` ' - 'object).', DeprecationWarning) - else: - warnings.warn('As of v0.5.0, `ITML_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. ' - 'This warning will disappear in v0.6.0.', - ChangedBehaviorWarning) X, y = self._prepare_inputs(X, y, ensure_min_samples=2) - num_constraints = self.num_constraints - if num_constraints is None: + n_constraints = self.n_constraints + if n_constraints is None: num_classes = len(np.unique(y)) - num_constraints = 20 * num_classes**2 + n_constraints = 20 * num_classes**2 c = Constraints(y) - pos_neg = c.positive_negative_pairs(num_constraints, + pos_neg = c.positive_negative_pairs(n_constraints, random_state=self.random_state) pairs, y = wrap_pairs(X, pos_neg) return _BaseITML._fit(self, pairs, y, bounds=bounds) diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index ffc4c885..82ae20eb 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -1,11 +1,9 @@ """ Local Fisher Discriminant Analysis (LFDA) """ -from __future__ import division, absolute_import import numpy as np import scipy import warnings -from six.moves import xrange from sklearn.metrics import pairwise_distances from sklearn.base import TransformerMixin @@ -27,27 +25,27 @@ class LFDA(MahalanobisMixin, TransformerMixin): Parameters ---------- n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). + Dimensionality of reduced space (if None, defaults to dimension of X). - num_dims : Not used + k : int, optional (default=None) + Number of nearest neighbors used in local scaling method. If None, + defaults to min(7, n_features - 1). - .. deprecated:: 0.5.0 - `num_dims` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use `n_components` instead. + embedding_type : str, optional (default: 'weighted') + Type of metric in the embedding space. - k : int, optional - Number of nearest neighbors used in local scaling method. - Defaults to min(7, n_components - 1). + 'weighted' + weighted eigenvectors - embedding_type : str, optional - Type of metric in the embedding space (default: 'weighted') - 'weighted' - weighted eigenvectors - 'orthonormalized' - orthonormalized - 'plain' - raw eigenvectors + 'orthonormalized' + orthonormalized + + 'plain' + raw eigenvectors preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. Attributes ---------- @@ -67,22 +65,22 @@ class LFDA(MahalanobisMixin, TransformerMixin): >>> lfda.fit(X, Y) References - ------------------ - .. 
[1] `Dimensionality Reduction of Multimodal Labeled Data by Local Fisher - Discriminant Analysis `_ - Masashi Sugiyama. - - .. [2] `Local Fisher Discriminant Analysis on Beer Style Clustering - `_ Yuan Tang. + ---------- + .. [1] Masashi Sugiyama. `Dimensionality Reduction of Multimodal Labeled + Data by Local Fisher Discriminant Analysis + `_. JMLR 2007. + + .. [2] Yuan Tang. `Local Fisher Discriminant Analysis on Beer Style + Clustering + `_. ''' - def __init__(self, n_components=None, num_dims='deprecated', + def __init__(self, n_components=None, k=None, embedding_type='weighted', preprocessor=None): if embedding_type not in ('weighted', 'orthonormalized', 'plain'): raise ValueError('Invalid embedding_type: %r' % embedding_type) self.n_components = n_components - self.num_dims = num_dims self.embedding_type = embedding_type self.k = k super(LFDA, self).__init__(preprocessor) @@ -98,11 +96,6 @@ def fit(self, X, y): y : (n,) array-like Class labels, one per point of data. ''' - if self.num_dims != 'deprecated': - warnings.warn('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. Use "n_components" instead', - DeprecationWarning) X, y = self._prepare_inputs(X, y, ensure_min_samples=2) unique_classes, y = np.unique(y, return_inverse=True) n, d = X.shape @@ -113,15 +106,16 @@ def fit(self, X, y): if self.k is None: k = min(7, d - 1) elif self.k >= d: - warnings.warn('Chosen k (%d) too large, using %d instead.' % (self.k,d-1)) + warnings.warn('Chosen k (%d) too large, using %d instead.' + % (self.k, d - 1)) k = d - 1 else: k = int(self.k) - tSb = np.zeros((d,d)) - tSw = np.zeros((d,d)) + tSb = np.zeros((d, d)) + tSw = np.zeros((d, d)) - for c in xrange(num_classes): - Xc = X[y==c] + for c in range(num_classes): + Xc = X[y == c] nc = Xc.shape[0] # classwise affinity matrix @@ -132,14 +126,14 @@ def fit(self, X, y): local_scale = np.outer(sigma, sigma) with np.errstate(divide='ignore', invalid='ignore'): - A = np.exp(-dist/local_scale) - A[local_scale==0] = 0 + A = np.exp(-dist / local_scale) + A[local_scale == 0] = 0 - G = Xc.T.dot(A.sum(axis=0)[:,None] * Xc) - Xc.T.dot(A).dot(Xc) - tSb += G/n + (1-nc/n)*Xc.T.dot(Xc) + _sum_outer(Xc)/n - tSw += G/nc + G = Xc.T.dot(A.sum(axis=0)[:, None] * Xc) - Xc.T.dot(A).dot(Xc) + tSb += G / n + (1 - nc / n) * Xc.T.dot(Xc) + _sum_outer(Xc) / n + tSw += G / nc - tSb -= _sum_outer(X)/n - tSw + tSb -= _sum_outer(X) / n - tSw # symmetrize tSb = (tSb + tSb.T) / 2 @@ -148,7 +142,7 @@ def fit(self, X, y): vals, vecs = _eigh(tSb, tSw, dim) order = np.argsort(-vals)[:dim] vals = vals[order].real - vecs = vecs[:,order] + vecs = vecs[:, order] if self.embedding_type == 'weighted': vecs *= np.sqrt(vals) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 2035588f..47bb065f 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,14 +1,11 @@ """ Large Margin Nearest Neighbor Metric learning (LMNN) """ -from __future__ import print_function, absolute_import import numpy as np -import warnings from collections import Counter -from six.moves import xrange -from sklearn.exceptions import ChangedBehaviorWarning from sklearn.metrics import euclidean_distances from sklearn.base import TransformerMixin +import warnings from ._util import _initialize_components, _check_n_components from .base_metric import MahalanobisMixin @@ -27,102 +24,91 @@ class LMNN(MahalanobisMixin, TransformerMixin): Parameters ---------- - init : None, string or numpy array, optional (default=None) - Initialization of the 
linear transformation. Possible options are - 'auto', 'pca', 'identity', 'random', and a numpy array of shape - (n_features_a, n_features_b). If None, will be set automatically to - 'auto' (this option is to raise a warning if 'init' is not set, - and stays to its default value None, in v0.5.0). - - 'auto' - Depending on ``n_components``, the most reasonable initialization - will be chosen. If ``n_components <= n_classes`` we use 'lda', as - it uses labels information. If not, but - ``n_components < min(n_features, n_samples)``, we use 'pca', as - it projects data in meaningful directions (those of higher - variance). Otherwise, we just use 'identity'. - - 'pca' - ``n_components`` principal components of the inputs passed - to :meth:`fit` will be used to initialize the transformation. - (See `sklearn.decomposition.PCA`) - - 'lda' - ``min(n_components, n_classes)`` most discriminative - components of the inputs passed to :meth:`fit` will be used to - initialize the transformation. (If ``n_components > n_classes``, - the rest of the components will be zero.) (See - `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) - - 'identity' - If ``n_components`` is strictly smaller than the - dimensionality of the inputs passed to :meth:`fit`, the identity - matrix will be truncated to the first ``n_components`` rows. - - 'random' - The initial transformation will be a random array of shape - `(n_components, n_features)`. Each value is sampled from the - standard normal distribution. - - numpy array - n_features_b must match the dimensionality of the inputs passed to - :meth:`fit` and n_features_a must be less than or equal to that. - If ``n_components`` is not None, n_features_a must match it. - - k : int, optional - Number of neighbors to consider, not including self-edges. + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda', as + it uses labels information. If not, but + ``n_components < min(n_features, n_samples)``, we use 'pca', as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + n_neighbors : int, optional (default=3) + Number of neighbors to consider, not including self-edges. 
min_iter : int, optional (default=50) - Minimum number of iterations of the optimization procedure. + Minimum number of iterations of the optimization procedure. max_iter : int, optional (default=1000) - Maximum number of iterations of the optimization procedure. + Maximum number of iterations of the optimization procedure. learn_rate : float, optional (default=1e-7) - Learning rate of the optimization procedure + Learning rate of the optimization procedure tol : float, optional (default=0.001) - Tolerance of the optimization procedure. If the objective value varies - less than `tol`, we consider the algorithm has converged and stop it. - - use_pca : Not used - - .. deprecated:: 0.5.0 - `use_pca` was deprecated in version 0.5.0 and will - be removed in 0.6.0. + Tolerance of the optimization procedure. If the objective value varies + less than `tol`, we consider the algorithm has converged and stop it. verbose : bool, optional (default=False) - Whether to print the progress of the optimization procedure. + Whether to print the progress of the optimization procedure. - regularization: float, optional - Weighting of pull and push terms, with 0.5 meaning equal weight. + regularization: float, optional (default=0.5) + Relative weight between pull and push terms, with 0.5 meaning equal + weight. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). - - num_dims : Not used - - .. deprecated:: 0.5.0 - `num_dims` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use `n_components` instead. + Dimensionality of reduced space (if None, defaults to dimension of X). random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to initialize the random - transformation. If ``init='pca'``, ``random_state`` is passed as an - argument to PCA when initializing the transformation. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + k : Renamed to n_neighbors. Will be deprecated in 0.7.0 Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. Examples -------- @@ -133,56 +119,43 @@ class LMNN(MahalanobisMixin, TransformerMixin): >>> iris_data = load_iris() >>> X = iris_data['data'] >>> Y = iris_data['target'] - >>> lmnn = LMNN(k=5, learn_rate=1e-6) + >>> lmnn = LMNN(n_neighbors=5, learn_rate=1e-6) >>> lmnn.fit(X, Y, verbose=False) - Notes - ----- - - If a recent version of the Shogun Python modular (``modshogun``) library - is available, the LMNN implementation will use the fast C++ version from - there. Otherwise, the included pure-Python version will be used. - The two implementations differ slightly, and the C++ version is more - complete. 
- References ---------- - .. [1] `Distance Metric Learning for Large Margin Nearest Neighbor - Classification `_ - Kilian Q. Weinberger, John Blitzer, Lawrence K. Saul + .. [1] K. Q. Weinberger, J. Blitzer, L. K. Saul. `Distance Metric + Learning for Large Margin Nearest Neighbor Classification + `_. NIPS + 2005. """ - def __init__(self, init=None, k=3, min_iter=50, max_iter=1000, + def __init__(self, init='auto', n_neighbors=3, min_iter=50, max_iter=1000, learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, - use_pca='deprecated', verbose=False, preprocessor=None, - n_components=None, num_dims='deprecated', random_state=None): + verbose=False, preprocessor=None, + n_components=None, random_state=None, k='deprecated'): self.init = init - self.k = k + if k != 'deprecated': + warnings.warn('"k" parameter has been renamed to' + ' "n_neighbors". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_neighbors = k + self.k = 'deprecated' # To avoid no_attribute error + self.n_neighbors = n_neighbors self.min_iter = min_iter self.max_iter = max_iter self.learn_rate = learn_rate self.regularization = regularization self.convergence_tol = convergence_tol - self.use_pca = use_pca self.verbose = verbose self.n_components = n_components - self.num_dims = num_dims self.random_state = random_state super(LMNN, self).__init__(preprocessor) def fit(self, X, y): - if self.num_dims != 'deprecated': - warnings.warn('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. Use "n_components" instead', - DeprecationWarning) - if self.use_pca != 'deprecated': - warnings.warn('"use_pca" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0.', - DeprecationWarning) - k = self.k + k = self.n_neighbors reg = self.regularization learn_rate = self.learn_rate @@ -195,24 +168,11 @@ def fit(self, X, y): raise ValueError('Must have one label per point.') self.labels_ = np.arange(len(unique_labels)) - # if the init is the default (None), we raise a warning - if self.init is None: - # TODO: replace init=None by init='auto' in v0.6.0 and remove the warning - msg = ("Warning, no init was set (`init=None`). As of version 0.5.0, " - "the default init will now be set to 'auto', instead of the " - "previous identity matrix. If you still want to use the identity " - "matrix as before, set init='identity'. 
This warning " - "will disappear in v0.6.0, and `init` parameter's default value " - "will be set to 'auto'.") - warnings.warn(msg, ChangedBehaviorWarning) - init = 'auto' - else: - init = self.init - self.components_ = _initialize_components(output_dim, X, y, init, + self.components_ = _initialize_components(output_dim, X, y, self.init, self.verbose, random_state=self.random_state) required_k = np.bincount(label_inds).min() - if self.k > required_k: + if self.n_neighbors > required_k: raise ValueError('not enough class labels for specified k' ' (smallest class has %d)' % required_k) @@ -233,8 +193,12 @@ def fit(self, X, y): it = 1 # we already made one iteration + if self.verbose: + print("iter | objective | objective difference | active constraints", + "| learning rate") + # main loop - for it in xrange(2, self.max_iter): + for it in range(2, self.max_iter): # then at each iteration, we try to find a value of L that has better # objective than the previous L, following the gradient: while True: @@ -244,8 +208,8 @@ def fit(self, X, y): # we copy variables that can be modified by _loss_grad, because if we # retry we don t want to modify them several times (G_next, objective_next, total_active_next) = ( - self._loss_grad(X, L_next, dfG, k, reg, target_neighbors, - label_inds)) + self._loss_grad(X, L_next, dfG, k, reg, target_neighbors, + label_inds)) assert not np.isnan(objective) delta_obj = objective_next - objective if delta_obj > 0: @@ -298,7 +262,7 @@ def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): # compute the gradient total_active = 0 df = np.zeros((X.shape[1], X.shape[1])) - for nn_idx in reversed(xrange(k)): # note: reverse not useful here + for nn_idx in reversed(range(k)): # note: reverse not useful here act1 = g0 < g1[:, nn_idx] act2 = g0 < g2[:, nn_idx] total_active += act1.sum() + act2.sum() @@ -321,12 +285,12 @@ def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): return 2 * G, objective, total_active def _select_targets(self, X, label_inds): - target_neighbors = np.empty((X.shape[0], self.k), dtype=int) + target_neighbors = np.empty((X.shape[0], self.n_neighbors), dtype=int) for label in self.labels_: inds, = np.nonzero(label_inds == label) dd = euclidean_distances(X[inds], squared=True) np.fill_diagonal(dd, np.inf) - nn = np.argsort(dd)[..., :self.k] + nn = np.argsort(dd)[..., :self.n_neighbors] target_neighbors[inds] = inds[nn] return target_neighbors @@ -338,15 +302,15 @@ def _find_impostors(self, furthest_neighbors, X, label_inds, L): in_inds, = np.nonzero(label_inds == label) out_inds, = np.nonzero(label_inds > label) dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True) - i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None]) - i2,j2 = np.nonzero(dist < margin_radii[in_inds]) - i = np.hstack((i1,i2)) - j = np.hstack((j1,j2)) + i1, j1 = np.nonzero(dist < margin_radii[out_inds][:, None]) + i2, j2 = np.nonzero(dist < margin_radii[in_inds]) + i = np.hstack((i1, i2)) + j = np.hstack((j1, j2)) if i.size > 0: # get unique (i,j) pairs using index trickery - shape = (i.max()+1, j.max()+1) - tmp = np.ravel_multi_index((i,j), shape) - i,j = np.unravel_index(np.unique(tmp), shape) + shape = (i.max() + 1, j.max() + 1) + tmp = np.ravel_multi_index((i, j), shape) + i, j = np.unravel_index(np.unique(tmp), shape) impostors.append(np.vstack((in_inds[j], out_inds[i]))) if len(impostors) == 0: # No impostors detected @@ -361,19 +325,19 @@ def _inplace_paired_L2(A, B): def _count_edges(act1, act2, impostors, targets): - imp = 
impostors[0,act1] + imp = impostors[0, act1] c = Counter(zip(imp, targets[imp])) - imp = impostors[1,act2] + imp = impostors[1, act2] c.update(zip(imp, targets[imp])) if c: active_pairs = np.array(list(c.keys())) else: - active_pairs = np.empty((0,2), dtype=int) + active_pairs = np.empty((0, 2), dtype=int) return active_pairs, np.array(list(c.values())) def _sum_outer_products(data, a_inds, b_inds, weights=None): Xab = data[a_inds] - data[b_inds] if weights is not None: - return np.dot(Xab.T, Xab * weights[:,None]) + return np.dot(Xab.T, Xab * weights[:, None]) return np.dot(Xab.T, Xab) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 340e6bf2..af7fa95b 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -2,24 +2,21 @@ Metric Learning from Relative Comparisons by Minimizing Squared Residual (LSML) """ -from __future__ import print_function, absolute_import, division -import warnings import numpy as np import scipy.linalg -from six.moves import xrange from sklearn.base import TransformerMixin -from sklearn.exceptions import ChangedBehaviorWarning from .base_metric import _QuadrupletsClassifierMixin, MahalanobisMixin from .constraints import Constraints from ._util import components_from_metric, _initialize_metric_mahalanobis +import warnings class _BaseLSML(MahalanobisMixin): _tuple_size = 4 # constraints are quadruplets - def __init__(self, tol=1e-3, max_iter=1000, prior=None, + def __init__(self, tol=1e-3, max_iter=1000, prior='identity', verbose=False, preprocessor=None, random_state=None): self.prior = prior self.tol = tol @@ -42,22 +39,10 @@ def _fit(self, quadruplets, weights=None): else: self.w_ = weights self.w_ /= self.w_.sum() # weights must sum to 1 - # if the prior is the default (None), we raise a warning - if self.prior is None: - msg = ("Warning, no prior was set (`prior=None`). As of version 0.5.0, " - "the default prior will now be set to " - "'identity', instead of 'covariance'. If you still want to use " - "the inverse of the covariance matrix as a prior, " - "set prior='covariance'. This warning will disappear in " - "v0.6.0, and `prior` parameter's default value will be set to " - "'identity'.") - warnings.warn(msg, ChangedBehaviorWarning) - prior = 'identity' - else: - prior = self.prior - M, prior_inv = _initialize_metric_mahalanobis(quadruplets, prior, - return_inverse=True, strict_pd=True, matrix_name='prior', - random_state=self.random_state) + M, prior_inv = _initialize_metric_mahalanobis( + quadruplets, self.prior, + return_inverse=True, strict_pd=True, matrix_name='prior', + random_state=self.random_state) step_sizes = np.logspace(-10, 0, 10) # Keep track of the best step size and the loss at that step. 
@@ -65,7 +50,7 @@ def _fit(self, quadruplets, weights=None): s_best = self._total_loss(M, vab, vcd, prior_inv) if self.verbose: print('initial loss', s_best) - for it in xrange(1, self.max_iter+1): + for it in range(1, self.max_iter + 1): grad = self._gradient(M, vab, vcd, prior_inv) grad_norm = scipy.linalg.norm(grad) if grad_norm < self.tol: @@ -117,8 +102,8 @@ def _gradient(self, metric, vab, vcd, prior_inv): # TODO: vectorize for vab, dab, vcd, dcd in zip(vab[violations], dabs[violations], vcd[violations], dcds[violations]): - dMetric += ((1-np.sqrt(dcd/dab))*np.outer(vab, vab) + - (1-np.sqrt(dab/dcd))*np.outer(vcd, vcd)) + dMetric += ((1 - np.sqrt(dcd / dab)) * np.outer(vab, vab) + + (1 - np.sqrt(dab / dcd)) * np.outer(vcd, vcd)) return dMetric @@ -138,60 +123,66 @@ class LSML(_BaseLSML, _QuadrupletsClassifierMixin): Parameters ---------- - prior : None, string or numpy array, optional (default=None) - Prior to set for the metric. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). For LSML, the prior should be strictly - positive definite (PD). If `None`, will be set - automatically to 'identity' (this is to raise a warning if - `prior` is not set, and stays to its default value (None), in v0.5.0). - - 'identity' - An identity matrix of shape (n_features, n_features). - - 'covariance' - The inverse covariance matrix. - - 'random' - The initial Mahalanobis matrix will be a random positive definite - (PD) matrix of shape `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. - - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. - - tol : float, optional - max_iter : int, optional - verbose : bool, optional - if True, prints information while learning + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For LSML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random positive definite + (PD) matrix of shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + tol : float, optional (default=1e-3) + Convergence tolerance of the optimization procedure. + + max_iter : int, optional (default=1000) + Maximum number of iteration of the optimization procedure. + + verbose : bool, optional (default=False) + If True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to set the random - prior. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. 
Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) Examples -------- - >>> from metric_learn import LSML_Supervised - >>> from sklearn.datasets import load_iris - >>> iris_data = load_iris() - >>> X = iris_data['data'] - >>> Y = iris_data['target'] - >>> lsml = LSML_Supervised(num_constraints=200) - >>> lsml.fit(X, Y) + >>> from metric_learn import LSML + >>> quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]], + >>> [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]], + >>> [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]] + >>> # we want to make closer points where the first feature is close, and + >>> # further if the second feature is close + >>> lsml = LSML() + >>> lsml.fit(quadruplets) References ---------- @@ -199,7 +190,7 @@ class LSML(_BaseLSML, _QuadrupletsClassifierMixin): Squared Residual `_. ICDM 2012. - .. [2] Adapted from https://gist.github.com/kcarnold/5439917 + .. [2] Code adapted from https://gist.github.com/kcarnold/5439917 See Also -------- @@ -216,18 +207,19 @@ def fit(self, quadruplets, weights=None): ---------- quadruplets : array-like, shape=(n_constraints, 4, n_features) or \ (n_constraints, 4) - 3D array-like of quadruplets of points or 2D array of quadruplets of - indicators. In order to supervise the algorithm in the right way, we - should have the four samples ordered in a way such that: - d(pairs[i, 0],X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < - n_constraints. + 3D array-like of quadruplets of points or 2D array of quadruplets of + indicators. In order to supervise the algorithm in the right way, we + should have the four samples ordered in a way such that: + d(pairs[i, 0],X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < + n_constraints. + weights : (n_constraints,) array of floats, optional - scale factor for each constraint + scale factor for each constraint Returns ------- self : object - Returns the instance. + Returns the instance. """ return self._fit(quadruplets, weights=weights) @@ -243,112 +235,114 @@ class LSML_Supervised(_BaseLSML, TransformerMixin): Parameters ---------- tol : float, optional (default=1e-3) - Tolerance for the convergence procedure. + Convergence tolerance of the optimization procedure. + max_iter : int, optional (default=1000) - Number of maximum iterations of the convergence procedure. - prior : None, string or numpy array, optional (default=None) - Prior to set for the metric. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). For LSML, the prior should be strictly - positive definite (PD). If `None`, will be set - automatically to 'identity' (this is to raise a warning if - `prior` is not set, and stays to its default value (None), in v0.5.0). - - 'identity' - An identity matrix of shape (n_features, n_features). - - 'covariance' - The inverse covariance matrix. - - 'random' - The initial Mahalanobis matrix will be a random positive definite - (PD) matrix of shape `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. 
- - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. - num_labeled : Not used - .. deprecated:: 0.5.0 - `num_labeled` was deprecated in version 0.5.0 and will - be removed in 0.6.0. - num_constraints: int, optional - number of constraints to generate - weights : (m,) array of floats, optional - scale factor for each constraint - verbose : bool, optional - if True, prints information while learning + Number of maximum iterations of the optimization procedure. + + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For LSML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random positive definite + (PD) matrix of shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + n_constraints: int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. + + weights : (n_constraints,) array of floats, optional (default=None) + Relative weight given to each constraint. If None, defaults to uniform + weights. + + verbose : bool, optional (default=False) + If True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to set the random - prior. In any case, `random_state` is also used to randomly sample - constraints from labels. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. In any case, `random_state` is also used to randomly sample + constraints from labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + Examples + -------- + >>> from metric_learn import LSML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> lsml = LSML_Supervised(n_constraints=200) + >>> lsml.fit(X, Y) Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) 
""" - def __init__(self, tol=1e-3, max_iter=1000, prior=None, - num_labeled='deprecated', num_constraints=None, weights=None, - verbose=False, preprocessor=None, random_state=None): + def __init__(self, tol=1e-3, max_iter=1000, prior='identity', + n_constraints=None, weights=None, + verbose=False, preprocessor=None, random_state=None, + num_constraints='deprecated'): _BaseLSML.__init__(self, tol=tol, max_iter=max_iter, prior=prior, verbose=verbose, preprocessor=preprocessor, random_state=random_state) - self.num_labeled = num_labeled - self.num_constraints = num_constraints + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed sholud be set) + self.num_constraints = 'deprecated' self.weights = weights - def fit(self, X, y, random_state='deprecated'): + def fit(self, X, y): """Create constraints from labels and learn the LSML model. Parameters ---------- X : (n x d) matrix - Input data, where each row corresponds to a single instance. + Input data, where each row corresponds to a single instance. y : (n) array-like - Data labels. - - random_state : Not used - .. deprecated:: 0.5.0 - `random_state` in the `fit` function was deprecated in version 0.5.0 - and will be removed in 0.6.0. Set `random_state` at initialization - instead (when instantiating a new `LSML_Supervised` object). + Data labels. """ - if self.num_labeled != 'deprecated': - warnings.warn('"num_labeled" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0', DeprecationWarning) - if random_state != 'deprecated': - warnings.warn('"random_state" parameter in the `fit` function is ' - 'deprecated. Set `random_state` at initialization ' - 'instead (when instantiating a new `LSML_Supervised` ' - 'object).', DeprecationWarning) - else: - warnings.warn('As of v0.5.0, `LSML_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. 
' - 'This warning will disappear in v0.6.0.', - ChangedBehaviorWarning) X, y = self._prepare_inputs(X, y, ensure_min_samples=2) - num_constraints = self.num_constraints - if num_constraints is None: + n_constraints = self.n_constraints + if n_constraints is None: num_classes = len(np.unique(y)) - num_constraints = 20 * num_classes**2 + n_constraints = 20 * num_classes**2 c = Constraints(y) - pos_neg = c.positive_negative_pairs(num_constraints, same_length=True, + pos_neg = c.positive_negative_pairs(n_constraints, same_length=True, random_state=self.random_state) return _BaseLSML._fit(self, X[np.column_stack(pos_neg)], weights=self.weights) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 471694b6..01d185e7 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -1,21 +1,18 @@ """ Metric Learning for Kernel Regression (MLKR) """ -from __future__ import division, print_function import time import sys import warnings import numpy as np -from sklearn.exceptions import ConvergenceWarning, ChangedBehaviorWarning -from sklearn.utils.fixes import logsumexp from scipy.optimize import minimize +from scipy.special import logsumexp from sklearn.base import TransformerMixin - +from sklearn.exceptions import ConvergenceWarning from sklearn.metrics import pairwise_distances -from metric_learn._util import _check_n_components from .base_metric import MahalanobisMixin -from ._util import _initialize_components +from ._util import _initialize_components, _check_n_components EPS = np.finfo(float).eps @@ -33,78 +30,65 @@ class MLKR(MahalanobisMixin, TransformerMixin): Parameters ---------- n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). - - num_dims : Not used - - .. deprecated:: 0.5.0 - `num_dims` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use `n_components` instead. - - init : None, string or numpy array, optional (default=None) - Initialization of the linear transformation. Possible options are - 'auto', 'pca', 'identity', 'random', and a numpy array of shape - (n_features_a, n_features_b). If None, will be set automatically to - 'auto' (this option is to raise a warning if 'init' is not set, - and stays to its default value None, in v0.5.0). - - 'auto' - Depending on ``n_components``, the most reasonable initialization - will be chosen. If ``n_components < min(n_features, n_samples)``, - we use 'pca', as it projects data in meaningful directions (those - of higher variance). Otherwise, we just use 'identity'. - - 'pca' - ``n_components`` principal components of the inputs passed - to :meth:`fit` will be used to initialize the transformation. - (See `sklearn.decomposition.PCA`) - - 'identity' - If ``n_components`` is strictly smaller than the - dimensionality of the inputs passed to :meth:`fit`, the identity - matrix will be truncated to the first ``n_components`` rows. - - 'random' - The initial transformation will be a random array of shape - `(n_components, n_features)`. Each value is sampled from the - standard normal distribution. - - numpy array - n_features_b must match the dimensionality of the inputs passed to - :meth:`fit` and n_features_a must be less than or equal to that. - If ``n_components`` is not None, n_features_a must match it. - - A0: Not used. - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'init' instead. - - tol: float, optional (default=None) - Convergence tolerance for the optimization. 
- - max_iter: int, optional - Cap on number of conjugate gradient iterations. + Dimensionality of reduced space (if None, defaults to dimension of X). + + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components < min(n_features, n_samples)``, + we use 'pca', as it projects data in meaningful directions (those + of higher variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + tol : float, optional (default=None) + Convergence tolerance for the optimization. + + max_iter : int, optional (default=1000) + Cap on number of conjugate gradient iterations. verbose : bool, optional (default=False) - Whether to print progress messages or not. + Whether to print progress messages or not. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to initialize the random - transformation. If ``init='pca'``, ``random_state`` is passed as an - argument to PCA when initializing the transformation. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. Examples -------- @@ -119,18 +103,16 @@ class MLKR(MahalanobisMixin, TransformerMixin): References ---------- - .. [1] `Information-theoretic Metric Learning - `_ Jason V. Davis, et al. + .. [1] K.Q. Weinberger and G. Tesauro. `Metric Learning for Kernel + Regression `_. AISTATS 2007. 
""" - def __init__(self, n_components=None, num_dims='deprecated', init=None, - A0='deprecated', tol=None, max_iter=1000, verbose=False, + def __init__(self, n_components=None, init='auto', + tol=None, max_iter=1000, verbose=False, preprocessor=None, random_state=None): self.n_components = n_components - self.num_dims = num_dims self.init = init - self.A0 = A0 self.tol = tol self.max_iter = max_iter self.verbose = verbose @@ -146,18 +128,6 @@ def fit(self, X, y): X : (n x d) array of samples y : (n) data labels """ - if self.A0 != 'deprecated': - warnings.warn('"A0" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. Use "init" instead.', - DeprecationWarning) - - if self.num_dims != 'deprecated': - warnings.warn('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. Use "n_components" instead', - DeprecationWarning) - X, y = self._prepare_inputs(X, y, y_numeric=True, ensure_min_samples=2) n, d = X.shape @@ -170,19 +140,7 @@ def fit(self, X, y): if m is None: m = d # if the init is the default (None), we raise a warning - if self.init is None: - # TODO: - # replace init=None by init='auto' in v0.6.0 and remove the warning - msg = ("Warning, no init was set (`init=None`). As of version 0.5.0, " - "the default init will now be set to 'auto', instead of 'pca'. " - "If you still want to use PCA as an init, set init='pca'. " - "This warning will disappear in v0.6.0, and `init` parameter's" - " default value will be set to 'auto'.") - warnings.warn(msg, ChangedBehaviorWarning) - init = 'auto' - else: - init = self.init - A = _initialize_components(m, X, y, init=init, + A = _initialize_components(m, X, y, init=self.init, random_state=self.random_state, # MLKR works on regression targets: has_classes=False) diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index c8c52b24..5cf166fd 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -1,30 +1,34 @@ """Mahalanobis Metric for Clustering (MMC)""" -from __future__ import print_function, absolute_import, division -import warnings import numpy as np -from six.moves import xrange from sklearn.base import TransformerMixin from sklearn.utils.validation import assert_all_finite -from sklearn.exceptions import ChangedBehaviorWarning from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs from ._util import components_from_metric, _initialize_metric_mahalanobis +import warnings class _BaseMMC(MahalanobisMixin): _tuple_size = 2 # constraints are pairs - def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, - init=None, A0='deprecated', diagonal=False, + def __init__(self, max_iter=100, max_proj=10000, tol=1e-3, + init='identity', diagonal=False, diagonal_c=1.0, verbose=False, preprocessor=None, - random_state=None): + random_state=None, + convergence_threshold='deprecated'): + if convergence_threshold != 'deprecated': + warnings.warn('"convergence_threshold" parameter has been ' + ' renamed to "tol". 
It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + tol = convergence_threshold + self.convergence_threshold = 'deprecated' # Avoid errors self.max_iter = max_iter self.max_proj = max_proj - self.convergence_threshold = convergence_threshold + self.tol = tol self.init = init - self.A0 = A0 self.diagonal = diagonal self.diagonal_c = diagonal_c self.verbose = verbose @@ -32,30 +36,10 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, super(_BaseMMC, self).__init__(preprocessor) def _fit(self, pairs, y): - if self.A0 != 'deprecated': - warnings.warn('"A0" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. Use "init" instead.', - DeprecationWarning) pairs, y = self._prepare_inputs(pairs, y, type_of_inputs='tuples') - if self.init is None: - # TODO: replace init=None by init='auto' in v0.6.0 and remove the warning - msg = ("Warning, no init was set (`init=None`). As of version 0.5.0, " - "the default init will now be set to 'identity', instead of the " - "identity divided by a scaling factor of 10. " - "If you still want to use the same init as in previous " - "versions, set init=np.eye(d)/10, where d is the dimension " - "of your input space (d=pairs.shape[1]). " - "This warning will disappear in v0.6.0, and `init` parameter's" - " default value will be set to 'auto'.") - warnings.warn(msg, ChangedBehaviorWarning) - init = 'identity' - else: - init = self.init - - self.A_ = _initialize_metric_mahalanobis(pairs, init, + self.A_ = _initialize_metric_mahalanobis(pairs, self.init, random_state=self.random_state, matrix_name='init') @@ -70,14 +54,14 @@ def _fit_full(self, pairs, y): Parameters ---------- X : (n x d) data matrix - each row corresponds to a single instance + Each row corresponds to a single instance. constraints : 4-tuple of arrays - (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) - dissimilar pairs + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs. 
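(Editorial note, not part of the diff: a short sketch of what the backward-compatible `convergence_threshold` -> `tol` rename above means for callers. Both constructions below are assumed to configure the same tolerance; the second additionally emits a FutureWarning.)

from metric_learn import MMC

mmc_new = MMC(tol=1e-3)                      # new parameter name
mmc_old = MMC(convergence_threshold=1e-3)    # old name still accepted, warns FutureWarning
assert mmc_old.tol == mmc_new.tol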
""" num_dim = pairs.shape[2] - error1 = error2 = 1e10 + error2 = 1e10 eps = 0.01 # error-bound of iterative projection on C1 and C2 A = self.A_ @@ -105,16 +89,17 @@ def _fit_full(self, pairs, y): # constraint function grad2 = self._fD1(neg_pairs, A) # gradient of dissimilarity # constraint function - M = self._grad_projection(grad1, grad2) # gradient of fD1 orthogonal to fS1 + # gradient of fD1 orthogonal to fS1: + M = self._grad_projection(grad1, grad2) A_old = A.copy() - for cycle in xrange(self.max_iter): + for cycle in range(self.max_iter): # projection of constraints C1 and C2 satisfy = False - for it in xrange(self.max_proj): + for it in range(self.max_proj): # First constraint: # f(A) = \sum_{i,j \in S} d_ij' A d_ij <= t (1) @@ -133,7 +118,7 @@ def _fit_full(self, pairs, y): # PSD constraint A >= 0 # project A onto domain A>0 l, V = np.linalg.eigh((A + A.T) / 2) - A[:] = np.dot(V * np.maximum(0, l[None,:]), V.T) + A[:] = np.dot(V * np.maximum(0, l[None, :]), V.T) fDC2 = w.dot(A.ravel()) error2 = (fDC2 - t) / t @@ -169,12 +154,13 @@ def _fit_full(self, pairs, y): A[:] = A_old + alpha * M delta = np.linalg.norm(alpha * M) / np.linalg.norm(A_old) - if delta < self.convergence_threshold: + if delta < self.tol: break if self.verbose: - print('mmc iter: %d, conv = %f, projections = %d' % (cycle, delta, it+1)) + print('mmc iter: %d, conv = %f, projections = %d' % + (cycle, delta, it + 1)) - if delta > self.convergence_threshold: + if delta > self.tol: self.converged_ = False if self.verbose: print('mmc did not converge, conv = %f' % (delta,)) @@ -193,10 +179,10 @@ def _fit_diag(self, pairs, y): Parameters ---------- X : (n x d) data matrix - each row corresponds to a single instance + Each row corresponds to a single instance. constraints : 4-tuple of arrays - (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) - dissimilar pairs + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs. """ num_dim = pairs.shape[2] pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -208,14 +194,16 @@ def _fit_diag(self, pairs, y): reduction = 2.0 w = np.diag(self.A_).copy() - while error > self.convergence_threshold and it < self.max_iter: + while error > self.tol and it < self.max_iter: fD0, fD_1st_d, fD_2nd_d = self._D_constraint(neg_pairs, w) obj_initial = np.dot(s_sum, w) + self.diagonal_c * fD0 fS_1st_d = s_sum # first derivative of the similarity constraints - gradient = fS_1st_d - self.diagonal_c * fD_1st_d # gradient of the objective - hessian = -self.diagonal_c * fD_2nd_d + eps * np.eye(num_dim) # Hessian of the objective + # gradient of the objective: + gradient = fS_1st_d - self.diagonal_c * fD_1st_d + # Hessian of the objective: + hessian = -self.diagonal_c * fD_2nd_d + eps * np.eye(num_dim) step = np.dot(np.linalg.inv(hessian), gradient) # Newton-Rapshon update @@ -225,7 +213,7 @@ def _fit_diag(self, pairs, y): obj = (np.dot(s_sum, w_tmp) + self.diagonal_c * self._D_objective(neg_pairs, w_tmp)) assert_all_finite(obj) - obj_previous = obj + 1 # just to get the while-loop started + obj_previous = np.inf # just to get the while-loop started inner_it = 0 while obj < obj_previous: @@ -250,16 +238,17 @@ def _fit_diag(self, pairs, y): return self def _fD(self, neg_pairs, A): - """The value of the dissimilarity constraint function. + r"""The value of the dissimilarity constraint function. f = f(\sum_{ij \in D} distance(x_i, x_j)) i.e. 
distance can be L1: \sqrt{(x_i-x_j)A(x_i-x_j)'} """ diff = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] - return np.log(np.sum(np.sqrt(np.sum(np.dot(diff, A) * diff, axis=1))) + 1e-6) + return np.log(np.sum(np.sqrt(np.sum(np.dot(diff, A) * diff, axis=1))) + + 1e-6) def _fD1(self, neg_pairs, A): - """The gradient of the dissimilarity constraint function w.r.t. A. + r"""The gradient of the dissimilarity constraint function w.r.t. A. For example, let distance by L1 norm: f = f(\sum_{ij \in D} \sqrt{(x_i-x_j)A(x_i-x_j)'}) @@ -270,19 +259,19 @@ def _fD1(self, neg_pairs, A): df/dA = f'(\sum_{ij \in D} \sqrt{tr(d_ij'*d_ij*A)}) * 0.5*(\sum_{ij \in D} (1/sqrt{tr(d_ij'*d_ij*A)})*(d_ij'*d_ij)) """ - dim = neg_pairs.shape[2] diff = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] # outer products of all rows in `diff` M = np.einsum('ij,ik->ijk', diff, diff) # faster version of: dist = np.sqrt(np.sum(M * A[None,:,:], axis=(1,2))) dist = np.sqrt(np.einsum('ijk,jk', M, A)) - # faster version of: sum_deri = np.sum(M / (2 * (dist[:,None,None] + 1e-6)), axis=0) + # faster version of: sum_deri = np.sum(M / + # (2 * (dist[:,None,None] + 1e-6)), axis=0) sum_deri = np.einsum('ijk,i->jk', M, 0.5 / (dist + 1e-6)) sum_dist = dist.sum() return sum_deri / (sum_dist + 1e-6) def _fS1(self, pos_pairs, A): - """The gradient of the similarity constraint function w.r.t. A. + r"""The gradient of the similarity constraint function w.r.t. A. f = \sum_{ij}(x_i-x_j)A(x_i-x_j)' = \sum_{ij}d_ij*A*d_ij' df/dA = d(d_ij*A*d_ij')/dA @@ -290,9 +279,9 @@ def _fS1(self, pos_pairs, A): Note that d_ij*A*d_ij' = tr(d_ij*A*d_ij') = tr(d_ij'*d_ij*A) so, d(d_ij*A*d_ij')/dA = d_ij'*d_ij """ - dim = pos_pairs.shape[2] diff = pos_pairs[:, 0, :] - pos_pairs[:, 1, :] - return np.einsum('ij,ik->jk', diff, diff) # sum of outer products of all rows in `diff` + # sum of outer products of all rows in `diff`: + return np.einsum('ij,ik->jk', diff, diff) def _grad_projection(self, grad1, grad2): grad2 = grad2 / np.linalg.norm(grad2) @@ -303,7 +292,7 @@ def _grad_projection(self, grad1, grad2): def _D_objective(self, neg_pairs, w): return np.log(np.sum(np.sqrt(np.sum(((neg_pairs[:, 0, :] - neg_pairs[:, 1, :]) ** 2) * - w[None,:], axis=1) + 1e-6))) + w[None, :], axis=1) + 1e-6))) def _D_constraint(self, neg_pairs, w): """Compute the value, 1st derivative, second derivative (Hessian) of @@ -317,13 +306,14 @@ def _D_constraint(self, neg_pairs, w): sum_deri2 = np.einsum( 'ij,ik->jk', diff_sq, - diff_sq / (-4 * np.maximum(1e-6, dist**3))[:,None] + diff_sq / (-4 * np.maximum(1e-6, dist**3))[:, None] ) sum_dist = dist.sum() return ( - np.log(sum_dist), - sum_deri1 / sum_dist, - sum_deri2 / sum_dist - np.outer(sum_deri1, sum_deri1) / (sum_dist * sum_dist) + np.log(sum_dist), + sum_deri1 / sum_dist, + sum_deri2 / sum_dist - + np.outer(sum_deri1, sum_deri1) / (sum_dist * sum_dist) ) @@ -346,94 +336,92 @@ class MMC(_BaseMMC, _PairsClassifierMixin): Parameters ---------- max_iter : int, optional (default=100) - Maximum number of iterations of the convergence procedure. + Maximum number of iterations of the optimization procedure. max_proj : int, optional (default=10000) - Maximum number of projection steps. + Maximum number of projection steps. - convergence_threshold : float, optional (default=1e-6) - Convergence threshold for the convergence procedure. + tol : float, optional (default=1e-3) + Convergence threshold for the optimization procedure. - init : None, string or numpy array, optional (default=None) - Initialization of the Mahalanobis matrix. 
Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). If None, will be set - automatically to 'identity' (this is to raise a warning if - 'init' is not set, and stays to its default value (None), in v0.5.0). + init : string or numpy array, optional (default='identity') + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). - 'identity' - An identity matrix of shape (n_features, n_features). + 'identity' + An identity matrix of shape (n_features, n_features). - 'covariance' - The (pseudo-)inverse of the covariance matrix. + 'covariance' + The (pseudo-)inverse of the covariance matrix. - 'random' - The initial Mahalanobis matrix will be a random SPD matrix of - shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. + 'random' + The initial Mahalanobis matrix will be a random SPD matrix of + shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - numpy array - An SPD matrix of shape (n_features, n_features), that will - be used as such to initialize the metric. + numpy array + An SPD matrix of shape (n_features, n_features), that will + be used as such to initialize the metric. - verbose : bool, optional - if True, prints information while learning + diagonal : bool, optional (default=False) + If True, a diagonal metric will be learned, + i.e., a simple scaling of dimensions. The initialization will then + be the diagonal coefficients of the matrix given as 'init'. + + diagonal_c : float, optional (default=1.0) + Weight of the dissimilarity constraint for diagonal + metric learning. Ignored if ``diagonal=False``. + + verbose : bool, optional (default=False) + If True, prints information while learning preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be gotten like this: X[indices]. - A0 : Not used. - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'init' instead. - diagonal : bool, optional - if True, a diagonal metric will be learned, - i.e., a simple scaling of dimensions. The initialization will then - be the diagonal coefficients of the matrix given as 'init'. - diagonal_c : float, optional - weight of the dissimilarity constraint for diagonal - metric learning - verbose : bool, optional - if True, prints information while learning - preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be gotten like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to initialize the random - transformation. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. 
components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) threshold_ : `float` - If the distance metric between two points is lower than this threshold, - points will be classified as similar, otherwise they will be - classified as dissimilar. + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. Examples -------- - >>> from metric_learn import MMC_Supervised - >>> from sklearn.datasets import load_iris - >>> iris_data = load_iris() - >>> X = iris_data['data'] - >>> Y = iris_data['target'] - >>> mmc = MMC_Supervised(num_constraints=200) - >>> mmc.fit(X, Y) + >>> from metric_learn import MMC + >>> pairs = [[[1.2, 7.5], [1.3, 1.5]], + >>> [[6.4, 2.6], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6]], + >>> [[6.2, 5.5], [5.4, 5.4]]] + >>> y = [1, 1, -1, -1] + >>> # in this task we want points where the first feature is close to be + >>> # closer to each other, no matter how close the second feature is + >>> mmc = MMC() + >>> mmc.fit(pairs, y) References ---------- - .. [1] `Distance metric learning with application to clustering with - side-information `_ - Xing, Jordan, Russell, Ng. + .. [1] Xing, Jordan, Russell, Ng. `Distance metric learning with application + to clustering with side-information + `_. + NIPS 2002. See Also -------- @@ -452,19 +440,22 @@ def fit(self, pairs, y, calibration_params=None): ---------- pairs : array-like, shape=(n_constraints, 2, n_features) or \ (n_constraints, 2) - 3D Array of pairs with each row corresponding to two points, - or 2D array of indices of pairs if the metric learner uses a - preprocessor. + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. + y : array-like, of shape (n_constraints,) - Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + calibration_params : `dict` or `None` - Dictionary of parameters to give to `calibrate_threshold` for the - threshold calibration step done at the end of `fit`. If `None` is - given, `calibrate_threshold` will use the default parameters. + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. + Returns ------- self : object - Returns the instance. + Returns the instance. """ calibration_params = (calibration_params if calibration_params is not None else dict()) @@ -483,129 +474,128 @@ class MMC_Supervised(_BaseMMC, TransformerMixin): Parameters ---------- - max_iter : int, optional - max_proj : int, optional - convergence_threshold : float, optional - num_labeled : Not used - .. deprecated:: 0.5.0 - `num_labeled` was deprecated in version 0.5.0 and will - be removed in 0.6.0. - num_constraints: int, optional - number of constraints to generate - init : None, string or numpy array, optional (default=None) - Initialization of the Mahalanobis matrix. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). 
If None, will be set - automatically to 'identity' (this is to raise a warning if - 'init' is not set, and stays to its default value (None), in v0.5.0). - - 'identity' - An identity matrix of shape (n_features, n_features). - - 'covariance' - The (pseudo-)inverse of the covariance matrix. - - 'random' - The initial Mahalanobis matrix will be a random SPD matrix of - shape `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. - - numpy array - A numpy array of shape (n_features, n_features), that will - be used as such to initialize the metric. - - verbose : bool, optional - if True, prints information while learning + max_iter : int, optional (default=100) + Maximum number of iterations of the optimization procedure. + + max_proj : int, optional (default=10000) + Maximum number of projection steps. + + tol : float, optional (default=1e-3) + Convergence threshold for the optimization procedure. + + n_constraints: int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. + + init : string or numpy array, optional (default='identity') + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The (pseudo-)inverse of the covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random SPD matrix of + shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A numpy array of shape (n_features, n_features), that will + be used as such to initialize the metric. + + diagonal : bool, optional (default=False) + If True, a diagonal metric will be learned, + i.e., a simple scaling of dimensions. The initialization will then + be the diagonal coefficients of the matrix given as 'init'. + + diagonal_c : float, optional (default=1.0) + Weight of the dissimilarity constraint for diagonal + metric learning. Ignored if ``diagonal=False``. + + verbose : bool, optional (default=False) + If True, prints information while learning preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be gotten like this: X[indices]. - A0 : Not used. - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'init' instead. - diagonal : bool, optional - if True, a diagonal metric will be learned, - i.e., a simple scaling of dimensions - diagonal_c : float, optional - weight of the dissimilarity constraint for diagonal - metric learning - verbose : bool, optional - if True, prints information while learning - preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to initialize the random - Mahalanobis matrix. In any case, `random_state` is also used to - randomly sample constraints from labels. + A pseudo random number generator object or a seed for it if int. 
If + ``init='random'``, ``random_state`` is used to initialize the random + Mahalanobis matrix. In any case, `random_state` is also used to + randomly sample constraints from labels. - `MMC_Supervised` creates pairs of similar sample by taking same class - samples, and pairs of dissimilar samples by taking different class - samples. It then passes these pairs to `MMC` for training. + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 + + Examples + -------- + >>> from metric_learn import MMC_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> mmc = MMC_Supervised(n_constraints=200) + >>> mmc.fit(X, Y) Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) """ - def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, - num_labeled='deprecated', num_constraints=None, init=None, - A0='deprecated', diagonal=False, diagonal_c=1.0, verbose=False, - preprocessor=None, random_state=None): + def __init__(self, max_iter=100, max_proj=10000, tol=1e-6, + n_constraints=None, init='identity', + diagonal=False, diagonal_c=1.0, verbose=False, + preprocessor=None, random_state=None, + num_constraints='deprecated', + convergence_threshold='deprecated'): _BaseMMC.__init__(self, max_iter=max_iter, max_proj=max_proj, - convergence_threshold=convergence_threshold, - init=init, A0=A0, diagonal=diagonal, + tol=tol, + init=init, diagonal=diagonal, diagonal_c=diagonal_c, verbose=verbose, - preprocessor=preprocessor, random_state=random_state) - self.num_labeled = num_labeled - self.num_constraints = num_constraints + preprocessor=preprocessor, + random_state=random_state, + convergence_threshold=convergence_threshold) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed sholud be set) + self.num_constraints = 'deprecated' - def fit(self, X, y, random_state='deprecated'): + def fit(self, X, y): """Create constraints from labels and learn the MMC model. Parameters ---------- X : (n x d) matrix - Input data, where each row corresponds to a single instance. + Input data, where each row corresponds to a single instance. + y : (n) array-like - Data labels. - random_state : Not used - .. deprecated:: 0.5.0 - `random_state` in the `fit` function was deprecated in version 0.5.0 - and will be removed in 0.6.0. Set `random_state` at initialization - instead (when instantiating a new `MMC_Supervised` object). + Data labels. """ - if self.num_labeled != 'deprecated': - warnings.warn('"num_labeled" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0', DeprecationWarning) - if random_state != 'deprecated': - warnings.warn('"random_state" parameter in the `fit` function is ' - 'deprecated. 
Set `random_state` at initialization ' - 'instead (when instantiating a new `MMC_Supervised` ' - 'object).', DeprecationWarning) - else: - warnings.warn('As of v0.5.0, `MMC_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. ' - 'This warning will disappear in v0.6.0.', - ChangedBehaviorWarning) X, y = self._prepare_inputs(X, y, ensure_min_samples=2) - num_constraints = self.num_constraints - if num_constraints is None: + n_constraints = self.n_constraints + if n_constraints is None: num_classes = len(np.unique(y)) - num_constraints = 20 * num_classes**2 + n_constraints = 20 * num_classes**2 c = Constraints(y) - pos_neg = c.positive_negative_pairs(num_constraints, + pos_neg = c.positive_negative_pairs(n_constraints, random_state=self.random_state) pairs, y = wrap_pairs(X, pos_neg) return _BaseMMC._fit(self, pairs, y) diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 03abdc41..7b4423d3 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -2,16 +2,15 @@ Neighborhood Components Analysis (NCA) """ -from __future__ import absolute_import import warnings import time import sys import numpy as np from scipy.optimize import minimize -from sklearn.metrics import pairwise_distances -from sklearn.exceptions import ConvergenceWarning, ChangedBehaviorWarning -from sklearn.utils.fixes import logsumexp +from scipy.special import logsumexp from sklearn.base import TransformerMixin +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances from ._util import _initialize_components, _check_n_components from .base_metric import MahalanobisMixin @@ -33,71 +32,63 @@ class NCA(MahalanobisMixin, TransformerMixin): Parameters ---------- - init : None, string or numpy array, optional (default=None) - Initialization of the linear transformation. Possible options are - 'auto', 'pca', 'identity', 'random', and a numpy array of shape - (n_features_a, n_features_b). If None, will be set automatically to - 'auto' (this option is to raise a warning if 'init' is not set, - and stays to its default value None, in v0.5.0). - - 'auto' - Depending on ``n_components``, the most reasonable initialization - will be chosen. If ``n_components <= n_classes`` we use 'lda', as - it uses labels information. If not, but - ``n_components < min(n_features, n_samples)``, we use 'pca', as - it projects data in meaningful directions (those of higher - variance). Otherwise, we just use 'identity'. - - 'pca' - ``n_components`` principal components of the inputs passed - to :meth:`fit` will be used to initialize the transformation. - (See `sklearn.decomposition.PCA`) - - 'lda' - ``min(n_components, n_classes)`` most discriminative - components of the inputs passed to :meth:`fit` will be used to - initialize the transformation. (If ``n_components > n_classes``, - the rest of the components will be zero.) (See - `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) - - 'identity' - If ``n_components`` is strictly smaller than the - dimensionality of the inputs passed to :meth:`fit`, the identity - matrix will be truncated to the first ``n_components`` rows. - - 'random' - The initial transformation will be a random array of shape - `(n_components, n_features)`. Each value is sampled from the - standard normal distribution. 
- - numpy array - n_features_b must match the dimensionality of the inputs passed to - :meth:`fit` and n_features_a must be less than or equal to that. - If ``n_components`` is not None, n_features_a must match it. + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda', as + it uses labels information. If not, but + ``n_components < min(n_features, n_samples)``, we use 'pca', as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). - - num_dims : Not used - - .. deprecated:: 0.5.0 - `num_dims` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use `n_components` instead. + Dimensionality of reduced space (if None, defaults to dimension of X). max_iter : int, optional (default=100) Maximum number of iterations done by the optimization algorithm. tol : float, optional (default=None) - Convergence tolerance for the optimization. + Convergence tolerance for the optimization. verbose : bool, optional (default=False) Whether to print progress messages or not. random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to initialize the random - transformation. If ``init='pca'``, ``random_state`` is passed as an - argument to PCA when initializing the transformation. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. Examples -------- @@ -114,28 +105,27 @@ class NCA(MahalanobisMixin, TransformerMixin): Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. References ---------- .. [1] J. Goldberger, G. 
Hinton, S. Roweis, R. Salakhutdinov. `Neighbourhood Components Analysis `_. - Advances in Neural Information Processing Systems. 17, 513-520, 2005. + NIPS 2005. .. [2] Wikipedia entry on `Neighborhood Components Analysis `_ """ - def __init__(self, init=None, n_components=None, num_dims='deprecated', + def __init__(self, init='auto', n_components=None, max_iter=100, tol=None, verbose=False, preprocessor=None, random_state=None): self.n_components = n_components self.init = init - self.num_dims = num_dims self.max_iter = max_iter self.tol = tol self.verbose = verbose @@ -147,11 +137,6 @@ def fit(self, X, y): X: data matrix, (n x d) y: scalar labels, (n) """ - if self.num_dims != 'deprecated': - warnings.warn('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. Use "n_components" instead', - DeprecationWarning) X, labels = self._prepare_inputs(X, y, ensure_min_samples=2) n, d = X.shape n_components = _check_n_components(d, self.n_components) @@ -160,22 +145,8 @@ def fit(self, X, y): train_time = time.time() # Initialize A - # if the init is the default (None), we raise a warning - if self.init is None: - # TODO: replace init=None by init='auto' in v0.6.0 and remove the warning - msg = ("Warning, no init was set (`init=None`). As of version 0.5.0, " - "the default init will now be set to 'auto', instead of the " - "previous scaling matrix. If you still want to use the same " - "scaling matrix as before, set " - "init=np.eye(X.shape[1])/(np.maximum(X.max(axis=0)-X.min(axis=0)" - ", EPS))). This warning will disappear in v0.6.0, and `init` " - "parameter's default value will be set to 'auto'.") - warnings.warn(msg, ChangedBehaviorWarning) - init = 'auto' - else: - init = self.init - A = _initialize_components(n_components, X, labels, init, self.verbose, - self.random_state) + A = _initialize_components(n_components, X, labels, self.init, + self.verbose, self.random_state) # Run NCA mask = labels[:, np.newaxis] == labels[np.newaxis, :] diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 8471a1b1..253b9c92 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -2,13 +2,9 @@ Relative Components Analysis (RCA) """ -from __future__ import absolute_import import numpy as np import warnings -from six.moves import xrange -from sklearn import decomposition from sklearn.base import TransformerMixin -from sklearn.exceptions import ChangedBehaviorWarning from ._util import _check_n_components from .base_metric import MahalanobisMixin @@ -17,13 +13,13 @@ # mean center each chunklet separately def _chunk_mean_centering(data, chunks): - num_chunks = chunks.max() + 1 + n_chunks = chunks.max() + 1 chunk_mask = chunks != -1 # We need to ensure the data is float so that we can substract the # mean on it chunk_data = data[chunk_mask].astype(float, copy=False) chunk_labels = chunks[chunk_mask] - for c in xrange(num_chunks): + for c in range(n_chunks): mask = chunk_labels == c chunk_data[mask] -= chunk_data[mask].mean(axis=0) @@ -44,60 +40,49 @@ class RCA(MahalanobisMixin, TransformerMixin): Parameters ---------- n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). - - num_dims : Not used - - .. deprecated:: 0.5.0 - `num_dims` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use `n_components` instead. - - pca_comps : Not used - .. deprecated:: 0.5.0 - `pca_comps` was deprecated in version 0.5.0 and will - be removed in 0.6.0. 
+ Dimensionality of reduced space (if None, defaults to dimension of X). preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. Examples -------- - >>> from metric_learn import RCA_Supervised - >>> from sklearn.datasets import load_iris - >>> iris_data = load_iris() - >>> X = iris_data['data'] - >>> Y = iris_data['target'] - >>> rca = RCA_Supervised(num_chunks=30, chunk_size=2) - >>> rca.fit(X, Y) + >>> from metric_learn import RCA + >>> X = [[-0.05, 3.0],[0.05, -3.0], + >>> [0.1, -3.55],[-0.1, 3.55], + >>> [-0.95, -0.05],[0.95, 0.05], + >>> [0.4, 0.05],[-0.4, -0.05]] + >>> chunks = [0, 0, 1, 1, 2, 2, 3, 3] + >>> rca = RCA() + >>> rca.fit(X, chunks) References - ------------------ - .. [1] `Adjustment learning and relevant component analysis - `_ Noam - Shental, et al. + ---------- + .. [1] Noam Shental, et al. `Adjustment learning and relevant component + analysis `_ . + ECCV 2002. Attributes ---------- components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. """ - def __init__(self, n_components=None, num_dims='deprecated', - pca_comps='deprecated', preprocessor=None): + def __init__(self, n_components=None, preprocessor=None): self.n_components = n_components - self.num_dims = num_dims - self.pca_comps = pca_comps super(RCA, self).__init__(preprocessor) def _check_dimension(self, rank, X): d = X.shape[1] + if rank < d: warnings.warn('The inner covariance matrix is not invertible, ' 'so the transformation matrix may contain Nan values. ' - 'You should reduce the dimensionality of your input,' + 'You should remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' 'for instance using `sklearn.decomposition.PCA` as a ' 'preprocessing step.') @@ -110,34 +95,14 @@ def fit(self, X, chunks): Parameters ---------- data : (n x d) data matrix - Each row corresponds to a single instance + Each row corresponds to a single instance + chunks : (n,) array of ints - When ``chunks[i] == -1``, point i doesn't belong to any chunklet. - When ``chunks[i] == j``, point i belongs to chunklet j. + When ``chunks[i] == -1``, point i doesn't belong to any chunklet. + When ``chunks[i] == j``, point i belongs to chunklet j. """ - if self.num_dims != 'deprecated': - warnings.warn('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. Use "n_components" instead', - DeprecationWarning) - - if self.pca_comps != 'deprecated': - warnings.warn( - '"pca_comps" parameter is not used. ' - 'It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. RCA will not do PCA preprocessing anymore. If ' - 'you still want to do it, you could use ' - '`sklearn.decomposition.PCA` and an `sklearn.pipeline.Pipeline`.', - DeprecationWarning) - X, chunks = self._prepare_inputs(X, chunks, ensure_min_samples=2) - warnings.warn( - "RCA will no longer center the data before training. If you want " - "to do some preprocessing, you should do it manually (you can also " - "use an `sklearn.pipeline.Pipeline` for instance). 
This warning " - "will disappear in version 0.6.0.", ChangedBehaviorWarning) - chunks = np.asanyarray(chunks, dtype=int) chunk_mask, chunked_data = _chunk_mean_centering(X, chunks) @@ -147,7 +112,7 @@ def fit(self, X, chunks): # Fisher Linear Discriminant projection if dim < X.shape[1]: total_cov = np.cov(X[chunk_mask], rowvar=0) - tmp = np.linalg.lstsq(total_cov, inner_cov)[0] + tmp = np.linalg.lstsq(total_cov, inner_cov, rcond=None)[0] vals, vecs = np.linalg.eig(tmp) inds = np.argsort(vals)[:dim] A = vecs[:, inds] @@ -170,76 +135,83 @@ class RCA_Supervised(RCA): `RCA_Supervised` creates chunks of similar points by first sampling a class, taking `chunk_size` elements in it, and repeating the process - `num_chunks` times. + `n_chunks` times. Parameters ---------- n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). - - num_dims : Not used + Dimensionality of reduced space (if None, defaults to dimension of X). - .. deprecated:: 0.5.0 - `num_dims` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use `n_components` instead. + n_chunks: int, optional (default=100) + Number of chunks to generate. - num_chunks: int, optional - - chunk_size: int, optional + chunk_size: int, optional (default=2) + Number of points per chunk. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. - It is used to randomly sample constraints from labels. + A pseudo random number generator object or a seed for it if int. + It is used to randomly sample constraints from labels. + + num_chunks : Renamed to n_chunks. Will be deprecated in 0.7.0 + + Examples + -------- + >>> from metric_learn import RCA_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> rca = RCA_Supervised(n_chunks=30, chunk_size=2) + >>> rca.fit(X, Y) Attributes ---------- components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. """ - def __init__(self, num_dims='deprecated', n_components=None, - pca_comps='deprecated', num_chunks=100, chunk_size=2, - preprocessor=None, random_state=None): + def __init__(self, n_components=None, n_chunks=100, chunk_size=2, + preprocessor=None, random_state=None, + num_chunks='deprecated'): """Initialize the supervised version of `RCA`.""" - RCA.__init__(self, num_dims=num_dims, n_components=n_components, - pca_comps=pca_comps, preprocessor=preprocessor) - self.num_chunks = num_chunks + RCA.__init__(self, n_components=n_components, preprocessor=preprocessor) + if num_chunks != 'deprecated': + warnings.warn('"num_chunks" parameter has been renamed to' + ' "n_chunks". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_chunks = num_chunks + self.num_chunks = 'deprecated' # To avoid no_attribute error + self.n_chunks = n_chunks self.chunk_size = chunk_size self.random_state = random_state - def fit(self, X, y, random_state='deprecated'): + def fit(self, X, y): """Create constraints from labels and learn the RCA model. 
- Needs num_constraints specified in constructor. + Needs n_constraints specified in constructor. (Not true?) Parameters ---------- X : (n x d) data matrix - each row corresponds to a single instance + each row corresponds to a single instance + y : (n) data labels - random_state : Not used - .. deprecated:: 0.5.0 - `random_state` in the `fit` function was deprecated in version 0.5.0 - and will be removed in 0.6.0. Set `random_state` at initialization - instead (when instantiating a new `RCA_Supervised` object). """ - if random_state != 'deprecated': - warnings.warn('"random_state" parameter in the `fit` function is ' - 'deprecated. Set `random_state` at initialization ' - 'instead (when instantiating a new `RCA_Supervised` ' - 'object).', DeprecationWarning) - else: - warnings.warn('As of v0.5.0, `RCA_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. ' - 'This warning will disappear in v0.6.0.', - ChangedBehaviorWarning) X, y = self._prepare_inputs(X, y, ensure_min_samples=2) - chunks = Constraints(y).chunks(num_chunks=self.num_chunks, + chunks = Constraints(y).chunks(n_chunks=self.n_chunks, chunk_size=self.chunk_size, random_state=self.random_state) + + if self.n_chunks * (self.chunk_size - 1) < X.shape[1]: + warnings.warn('Due to the parameters of RCA_Supervised, ' + 'the inner covariance matrix is not invertible, ' + 'so the transformation matrix will contain Nan values. ' + 'Increase the number or size of the chunks to correct ' + 'this problem.' + ) + return RCA.fit(self, X, chunks) diff --git a/metric_learn/scml.py b/metric_learn/scml.py new file mode 100644 index 00000000..fedf393d --- /dev/null +++ b/metric_learn/scml.py @@ -0,0 +1,663 @@ +""" +Sparse Compositional Metric Learning (SCML) +""" + +from __future__ import print_function, absolute_import, division +import numpy as np +from .base_metric import _TripletsClassifierMixin, MahalanobisMixin +from ._util import components_from_metric +from sklearn.base import TransformerMixin +from .constraints import Constraints +from sklearn.preprocessing import normalize +from sklearn.neighbors import NearestNeighbors +from sklearn.cluster import KMeans +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.utils import check_array, check_random_state +import warnings + + +class _BaseSCML(MahalanobisMixin): + + _tuple_size = 3 # constraints are triplets + _authorized_basis = ['triplet_diffs'] + + def __init__(self, beta=1e-5, basis='triplet_diffs', n_basis=None, + gamma=5e-3, max_iter=10000, output_iter=500, batch_size=10, + verbose=False, preprocessor=None, random_state=None): + self.beta = beta + self.basis = basis + self.n_basis = n_basis + self.gamma = gamma + self.max_iter = max_iter + self.output_iter = output_iter + self.batch_size = batch_size + self.verbose = verbose + self.preprocessor = preprocessor + self.random_state = random_state + super(_BaseSCML, self).__init__(preprocessor) + + def _fit(self, triplets, basis=None, n_basis=None): + """ + Optimization procedure to find a sparse vector of weights to + construct the metric from the basis set. This is based on the + dual averaging method. 
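(Editorial note, not part of the diff: a toy rendering of the regularized dual-averaging update that the body of `_fit` below implements, with made-up shapes and data. `gamma`, `beta` and `delta` play the same roles as in the implementation.)

import numpy as np

rng = np.random.RandomState(0)
n_basis, batch_size = 4, 2
gamma, beta, delta = 5e-3, 1e-5, 0.001

w = np.zeros((1, n_basis))            # current weights over the basis set
avg_grad_w = np.zeros((1, n_basis))   # running average of subgradients
ada_grad_w = np.zeros((1, n_basis))   # running L2 norm of subgradients

dist_diff = rng.randn(batch_size, n_basis)          # d(a, p) - d(a, n) per basis
for it in range(3):
    slack = 1 + dist_diff.dot(w.T)                  # hinge slack per triplet
    violated = np.squeeze(slack > 0, axis=1)
    grad = dist_diff[violated].sum(axis=0, keepdims=True) / batch_size
    avg_grad_w = (it * avg_grad_w + grad) / (it + 1)
    ada_grad_w = np.sqrt(ada_grad_w ** 2 + grad ** 2)
    scale = -(it + 1) / (gamma * (delta + ada_grad_w))
    w = scale * np.minimum(avg_grad_w + beta, 0)    # proximal step keeps w >= 0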
+ """ + + if not isinstance(self.max_iter, int): + raise ValueError("max_iter should be an integer, instead it is of type" + " %s" % type(self.max_iter)) + if not isinstance(self.output_iter, int): + raise ValueError("output_iter should be an integer, instead it is of " + "type %s" % type(self.output_iter)) + if not isinstance(self.batch_size, int): + raise ValueError("batch_size should be an integer, instead it is of type" + " %s" % type(self.batch_size)) + + if self.output_iter > self.max_iter: + raise ValueError("The value of output_iter must be equal or smaller than" + " max_iter.") + + # Currently prepare_inputs makes triplets contain points and not indices + triplets = self._prepare_inputs(triplets, type_of_inputs='tuples') + + # TODO: + # This algorithm is built to work with indices, but in order to be + # compliant with the current handling of inputs it is converted + # back to indices by the following function. This should be improved + # in the future. + triplets, X = self._to_index_points(triplets) + + if basis is None: + basis, n_basis = self._initialize_basis(triplets, X) + + dist_diff = self._compute_dist_diff(triplets, X, basis) + + n_triplets = triplets.shape[0] + + # weight vector + w = np.zeros((1, n_basis)) + # avarage obj gradient wrt weights + avg_grad_w = np.zeros((1, n_basis)) + + # l2 norm in time of all obj gradients wrt weights + ada_grad_w = np.zeros((1, n_basis)) + # slack for not dividing by zero + delta = 0.001 + + best_obj = np.inf + + rng = check_random_state(self.random_state) + rand_int = rng.randint(low=0, high=n_triplets, + size=(self.max_iter, self.batch_size)) + for iter in range(self.max_iter): + + idx = rand_int[iter] + + slack_val = 1 + np.matmul(dist_diff[idx, :], w.T) + slack_mask = np.squeeze(slack_val > 0, axis=1) + + grad_w = np.sum(dist_diff[idx[slack_mask], :], + axis=0, keepdims=True)/self.batch_size + avg_grad_w = (iter * avg_grad_w + grad_w) / (iter+1) + + ada_grad_w = np.sqrt(np.square(ada_grad_w) + np.square(grad_w)) + + scale_f = -(iter+1) / (self.gamma * (delta + ada_grad_w)) + + # proximal operator with negative trimming equivalent + w = scale_f * np.minimum(avg_grad_w + self.beta, 0) + + if (iter + 1) % self.output_iter == 0: + # regularization part of obj function + obj1 = np.sum(w)*self.beta + + # Every triplet distance difference in the space given by L + # plus a slack of one + slack_val = 1 + np.matmul(dist_diff, w.T) + # Mask of places with positive slack + slack_mask = slack_val > 0 + + # loss function of learning task part of obj function + obj2 = np.sum(slack_val[slack_mask])/n_triplets + + obj = obj1 + obj2 + if self.verbose: + count = np.sum(slack_mask) + print("[%s] iter %d\t obj %.6f\t num_imp %d" % + (self.__class__.__name__, (iter+1), obj, count)) + + # update the best + if obj < best_obj: + best_obj = obj + best_w = w + + if self.verbose: + print("max iteration reached.") + + # return L matrix yielded from best weights + self.n_iter_ = iter + self.components_ = self._components_from_basis_weights(basis, best_w) + + return self + + def _compute_dist_diff(self, triplets, X, basis): + """ + Helper function to compute the distance difference of every triplet in the + space yielded by the basis set. 
+ """ + # Transformation of data by the basis set + XB = np.matmul(X, basis.T) + + n_triplets = triplets.shape[0] + # get all positive and negative pairs with lowest index first + # np.array (2*n_triplets,2) + triplets_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]], + triplets[:, [0, 2]])), + kind='stable') + # calculate all unique pairs and their indices + uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True, + axis=0) + # calculate L2 distance acording to bases only for unique pairs + dist = np.square(XB[uniqPairs[:, 0], :] - XB[uniqPairs[:, 1], :]) + + # return the diference of distances between all positive and negative + # pairs + return dist[indices[:n_triplets]] - dist[indices[n_triplets:]] + + def _components_from_basis_weights(self, basis, w): + """ + Get components matrix (L) from computed mahalanobis matrix. + """ + + # get rid of inactive bases + # TODO: Maybe have a tolerance over zero? + active_idx, = w > 0 + w = w[..., active_idx] + basis = basis[active_idx, :] + + n_basis, n_features = basis.shape + + if n_basis < n_features: # if metric is low-rank + warnings.warn("The number of bases with nonzero weight is less than the " + "number of features of the input, in consequence the " + "learned transformation reduces the dimension to %d." + % n_basis) + return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(basis) + + else: # if metric is full rank + return components_from_metric(np.matmul(basis.T, w.T*basis)) + + def _to_index_points(self, triplets): + shape = triplets.shape + X, triplets = np.unique(np.vstack(triplets), return_inverse=True, axis=0) + triplets = triplets.reshape(shape[:2]) + return triplets, X + + def _initialize_basis(self, triplets, X): + """ Checks if the basis array is well constructed or constructs it based + on one of the available options. + """ + n_features = X.shape[1] + + if isinstance(self.basis, np.ndarray): + # TODO: should copy? + basis = check_array(self.basis, copy=True) + if basis.shape[1] != n_features: + raise ValueError('The dimensionality ({}) of the provided bases must' + ' match the dimensionality of the data ' + '({}).'.format(basis.shape[1], n_features)) + elif self.basis not in self._authorized_basis: + raise ValueError( + "`basis` must be one of the options '{}' " + "or an array of shape (n_basis, n_features)." + .format("', '".join(self._authorized_basis))) + if self.basis == 'triplet_diffs': + basis, n_basis = self._generate_bases_dist_diff(triplets, X) + + return basis, n_basis + + def _generate_bases_dist_diff(self, triplets, X): + """ Constructs the basis set from the differences of positive and negative + pairs from the triplets constraints. + + The basis set is constructed iteratively by taking n_features triplets, + then adding and substracting respectively all the outerproducts of the + positive and negative pairs, and finally selecting the eigenvectors + of this matrix with positive eigenvalue. This is done until n_basis are + selected. 
+ """ + n_features = X.shape[1] + n_triplets = triplets.shape[0] + + if self.n_basis is None: + # TODO: Get a good default n_basis directive + n_basis = n_features*80 + warnings.warn('As no value for `n_basis` was selected, the number of ' + 'basis will be set to n_basis= %d' % n_basis) + elif isinstance(self.n_basis, int): + n_basis = self.n_basis + else: + raise ValueError("n_basis should be an integer, instead it is of type %s" + % type(self.n_basis)) + + if n_features > n_triplets: + raise ValueError( + "Number of features (%s) is greater than the number of triplets(%s).\n" + "Consider using dimensionality reduction or using another basis " + "generation scheme." % (n_features, n_triplets)) + + basis = np.zeros((n_basis, n_features)) + + # get all positive and negative pairs with lowest index first + # np.array (2*n_triplets,2) + triplets_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]], + triplets[:, [0, 2]])), + kind='stable') + # calculate all unique pairs and their indices + uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True, + axis=0) + # calculate differences only for unique pairs + diff = X[uniqPairs[:, 0], :] - X[uniqPairs[:, 1], :] + + diff_pos = diff[indices[:n_triplets], :] + diff_neg = diff[indices[n_triplets:], :] + + rng = check_random_state(self.random_state) + + start = 0 + finish = 0 + while finish != n_basis: + # Select triplets to yield diff + select_triplet = rng.choice(n_triplets, size=n_features, replace=False) + + # select n_features positive differences + d_pos = diff_pos[select_triplet, :] + + # select n_features negative differences + d_neg = diff_neg[select_triplet, :] + + # Yield matrix + diff_sum = d_pos.T.dot(d_pos) - d_neg.T.dot(d_neg) + + # Calculate eigenvalue and eigenvectors + w, v = np.linalg.eigh(diff_sum.T.dot(diff_sum)) + + # Add eigenvectors with positive eigenvalue to basis set + pos_eig_mask = w > 0 + start = finish + finish += pos_eig_mask.sum() + + try: + basis[start:finish, :] = v[pos_eig_mask] + except ValueError: + # if finish is greater than n_basis + basis[start:, :] = v[pos_eig_mask][:n_basis-start] + break + + # TODO: maybe add a warning in case there are no added bases, this could + # be caused by a bad triplet set. This would cause an infinite loop + + return basis, n_basis + + +class SCML(_BaseSCML, _TripletsClassifierMixin): + """Sparse Compositional Metric Learning (SCML) + + `SCML` learns an squared Mahalanobis distance from triplet constraints by + optimizing sparse positive weights assigned to a set of :math:`K` rank-one + PSD bases. This can be formulated as an optimization problem with only + :math:`K` parameters, that can be solved with an efficient stochastic + composite scheme. + + Read more in the :ref:`User Guide `. + + .. warning:: + SCML is still a bit experimental, don't hesitate to report if + something fails/doesn't work as expected. + + Parameters + ---------- + beta: float (default=1e-5) + L1 regularization parameter. + + basis : string or array-like, optional (default='triplet_diffs') + Set of bases to construct the metric. Possible options are + 'triplet_diffs', and an array-like of shape (n_basis, n_features). + + 'triplet_diffs' + The basis set is constructed iteratively from differences between points + of `n_features` positive or negative pairs randomly sampled from the + triplets constraints. Requires the number of training triplets to be + great or equal to `n_features`. 
+ + array-like + A matrix of shape (n_basis, n_features), that will be used as + the basis set for the metric construction. + + n_basis : int, optional + Number of basis to be yielded. In case it is not set it will be set based + on `basis`. If no value is selected a default will be computed based on + the input. + + gamma: float (default = 5e-3) + Learning rate for the optimization algorithm. + + max_iter : int (default = 10000) + Number of iterations for the algorithm. + + output_iter : int (default = 5000) + Number of iterations to check current weights performance and output this + information in case verbose is True. + + verbose : bool, optional + If True, prints information while learning. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get triplets from indices. If array-like, + triplets will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `_components_from_basis_weights`.) + + Examples + -------- + >>> from metric_learn import SCML + >>> triplets = [[[1.2, 7.5], [1.3, 1.5], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6], [5.4, 5.4]], + >>> [[3.2, 7.5], [3.3, 1.5], [8.2, 9.7]], + >>> [[3.3, 4.5], [5.2, 4.6], [7.4, 5.4]]] + >>> scml = SCML() + >>> scml.fit(triplets) + + References + ---------- + .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. + `_. \ + (AAAI), 2014. + + .. [2] Adapted from original `Matlab implementation. \ + `_. + + See Also + -------- + metric_learn.SCML_Supervised : The supervised version of the algorithm. + + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def fit(self, triplets): + """Learn the SCML model. + + Parameters + ---------- + triplets : array-like, shape=(n_constraints, 3, n_features) or \ + (n_constraints, 3) + 3D array-like of triplets of points or 2D array of triplets of + indicators. Triplets are assumed to be ordered such that: + d(triplets[i, 0],triplets[i, 1]) < d(triplets[i, 0], triplets[i, 2]). + + Returns + ------- + self : object + Returns the instance. + """ + + return self._fit(triplets) + + +class SCML_Supervised(_BaseSCML, TransformerMixin): + """Supervised version of Sparse Compositional Metric Learning (SCML) + + `SCML_Supervised` creates triplets by taking `k_genuine` neighbours + of the same class and `k_impostor` neighbours from different classes for each + point and then runs the SCML algorithm on these triplets. + + Read more in the :ref:`User Guide `. + + .. warning:: + SCML is still a bit experimental, don't hesitate to report if + something fails/doesn't work as expected. + + Parameters + ---------- + beta: float (default=1e-5) + L1 regularization parameter. + + basis : string or an array-like, optional (default='lda') + Set of bases to construct the metric. Possible options are + 'lda', and an array-like of shape (n_basis, n_features). + + 'lda' + The `n_basis` basis set is constructed from the LDA of significant + local regions in the feature space via clustering, for each region + center k-nearest neighbors are used to obtain the LDA scalings, + which correspond to the locally discriminative basis. 
+ + array-like + A matrix of shape (n_basis, n_features), that will be used as + the basis set for the metric construction. + + n_basis : int, optional + Number of basis to be yielded. In case it is not set it will be set based + on `basis`. If no value is selected a default will be computed based on + the input. + + gamma: float (default = 5e-3) + Learning rate for the optimization algorithm. + + max_iter : int (default = 100000) + Number of iterations for the algorithm. + + output_iter : int (default = 5000) + Number of iterations to check current weights performance and output this + information in case verbose is True. + + verbose : bool, optional + If True, prints information while learning. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get triplets from indices. If array-like, + triplets will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `_components_from_basis_weights`.) + + Examples + -------- + >>> from metric_learn import SCML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> scml = SCML_Supervised(random_state=33) + >>> scml.fit(X, Y) + SCML_Supervised(random_state=33) + >>> scml.score_pairs([[X[0], X[1]], [X[0], X[2]]]) + array([1.84640733, 1.55984363]) + >>> scml.get_metric()(X[0], X[1]) + 1.8464073327922157 + + References + ---------- + .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. + `_. \ + (AAAI), 2014. + + .. [2] Adapted from original `Matlab implementation. \ + `_. + + See Also + -------- + metric_learn.SCML : The weakly supervised version of this + algorithm. + """ + # Add supervised authorized basis construction options + _authorized_basis = _BaseSCML._authorized_basis + ['lda'] + + def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='lda', + n_basis=None, gamma=5e-3, max_iter=10000, output_iter=500, + batch_size=10, verbose=False, preprocessor=None, + random_state=None): + self.k_genuine = k_genuine + self.k_impostor = k_impostor + _BaseSCML.__init__(self, beta=beta, basis=basis, n_basis=n_basis, + max_iter=max_iter, output_iter=output_iter, + batch_size=batch_size, verbose=verbose, + preprocessor=preprocessor, random_state=random_state) + + def fit(self, X, y): + """Create constraints from labels and learn the SCML model. + + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. + + Returns + ------- + self : object + Returns the instance. 
+ """ + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + + basis, n_basis = self._initialize_basis_supervised(X, y) + + if not isinstance(self.k_genuine, int): + raise ValueError("k_genuine should be an integer, instead it is of type" + " %s" % type(self.k_genuine)) + if not isinstance(self.k_impostor, int): + raise ValueError("k_impostor should be an integer, instead it is of " + "type %s" % type(self.k_impostor)) + + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, self.k_genuine, + self.k_impostor) + + triplets = X[triplets] + + return self._fit(triplets, basis, n_basis) + + def _initialize_basis_supervised(self, X, y): + """ Constructs the basis set following one of the supervised options in + case one is selected. + """ + + if isinstance(self.basis, str) and self.basis == 'lda': + basis, n_basis = self._generate_bases_LDA(X, y) + else: + basis, n_basis = None, None + + return basis, n_basis + + def _generate_bases_LDA(self, X, y): + """ Generates bases for the 'lda' option. + + The basis set is constructed using Linear Discriminant Analysis of + significant local regions in the feature space via clustering, for + each region center k-nearest neighbors are used to obtain the LDA scalings, + which correspond to the locally discriminative basis. Currently this is + done at two scales `k={10,20}` if `n_feature < 50` or else `k={20,50}`. + """ + + labels, class_count = np.unique(y, return_counts=True) + n_class = len(labels) + + n_features = X.shape[1] + # Number of basis yielded from each LDA + num_eig = min(n_class-1, n_features) + + if self.n_basis is None: + # TODO: Get a good default n_basis directive + n_basis = min(20*n_features, X.shape[0]*2*num_eig - 1) + warnings.warn('As no value for `n_basis` was selected, the number of ' + 'basis will be set to n_basis= %d' % n_basis) + + elif isinstance(self.n_basis, int): + n_basis = self.n_basis + else: + raise ValueError("n_basis should be an integer, instead it is of type %s" + % type(self.n_basis)) + + # Number of clusters needed for 2 scales given the number of basis + # yielded by every LDA + n_clusters = int(np.ceil(n_basis/(2 * num_eig))) + + if n_basis < n_class: + warnings.warn("The number of basis is less than the number of classes, " + "which may lead to poor discriminative performance.") + elif n_basis >= X.shape[0]*2*num_eig: + raise ValueError("Not enough samples to generate %d LDA bases, n_basis" + "should be smaller than %d" % + (n_basis, X.shape[0]*2*num_eig)) + + kmeans = KMeans(n_clusters=n_clusters, n_init=10, + random_state=self.random_state, algorithm='elkan').fit(X) + cX = kmeans.cluster_centers_ + + n_scales = 2 + if n_features > 50: + scales = [20, 50] + else: + scales = [10, 20] + + k_class = np.vstack((np.minimum(class_count, scales[0]), + np.minimum(class_count, scales[1]))) + + idx_set = [np.zeros((n_clusters, sum(k_class[0, :])), dtype=np.int64), + np.zeros((n_clusters, sum(k_class[1, :])), dtype=np.int64)] + + start_finish_indices = np.hstack((np.zeros((2, 1), np.int64), + k_class)).cumsum(axis=1) + + neigh = NearestNeighbors() + + for c in range(n_class): + sel_c = np.where(y == labels[c]) + + # get k_class same class neighbors + neigh.fit(X=X[sel_c]) + # Only take the neighbors once for the biggest scale + neighbors = neigh.kneighbors(X=cX, n_neighbors=k_class[-1, c], + return_distance=False) + + # add index set of neighbors for every cluster center for both scales + for s, k in enumerate(k_class[:, c]): + start, finish = start_finish_indices[s, c:c+2] + idx_set[s][:, 
start:finish] = np.take(sel_c, neighbors[:, :k]) + + # Compute basis for every cluster in both scales + basis = np.zeros((n_basis, n_features)) + lda = LinearDiscriminantAnalysis() + start_finish_indices = np.hstack((np.vstack((0, n_clusters * num_eig)), + np.full((2, n_clusters), + num_eig))).cumsum(axis=1) + + for s in range(n_scales): + for c in range(n_clusters): + lda.fit(X[idx_set[s][c, :]], y[idx_set[s][c, :]]) + start, finish = start_finish_indices[s, c:c+2] + normalized_scalings = normalize(lda.scalings_.T) + try: + basis[start: finish, :] = normalized_scalings + except ValueError: + # handle tail + basis[start:, :] = normalized_scalings[:n_basis-start] + break + + return basis, n_basis diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 2d67e0b8..c4c427b9 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -2,13 +2,18 @@ Sparse High-Dimensional Metric Learning (SDML) """ -from __future__ import absolute_import import warnings import numpy as np from sklearn.base import TransformerMixin from scipy.linalg import pinvh -from sklearn.covariance import graphical_lasso -from sklearn.exceptions import ConvergenceWarning, ChangedBehaviorWarning +try: + from sklearn.covariance._graph_lasso import ( + _graphical_lasso as graphical_lasso + ) +except ImportError: + from sklearn.covariance import graphical_lasso + +from sklearn.exceptions import ConvergenceWarning from .base_metric import MahalanobisMixin, _PairsClassifierMixin from .constraints import Constraints, wrap_pairs @@ -25,23 +30,17 @@ class _BaseSDML(MahalanobisMixin): _tuple_size = 2 # constraints are pairs - def __init__(self, balance_param=0.5, sparsity_param=0.01, prior=None, - use_cov='deprecated', verbose=False, preprocessor=None, + def __init__(self, balance_param=0.5, sparsity_param=0.01, prior='identity', + verbose=False, preprocessor=None, random_state=None): self.balance_param = balance_param self.sparsity_param = sparsity_param self.prior = prior - self.use_cov = use_cov self.verbose = verbose self.random_state = random_state super(_BaseSDML, self).__init__(preprocessor) def _fit(self, pairs, y): - if self.use_cov != 'deprecated': - warnings.warn('"use_cov" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. Use "prior" instead.', - DeprecationWarning) if not HAS_SKGGM: if self.verbose: print("SDML will use scikit-learn's graphical lasso solver.") @@ -50,27 +49,16 @@ def _fit(self, pairs, y): print("SDML will use skggm's graphical lasso solver.") pairs, y = self._prepare_inputs(pairs, y, type_of_inputs='tuples') + n_features = pairs.shape[2] + if n_features < 2: + raise ValueError(f"Cannot fit SDML with {n_features} feature(s)") # set up (the inverse of) the prior M # if the prior is the default (None), we raise a warning - if self.prior is None: - # TODO: - # replace prior=None by prior='identity' in v0.6.0 and remove the - # warning - msg = ("Warning, no prior was set (`prior=None`). As of version 0.5.0, " - "the default prior will now be set to " - "'identity', instead of 'covariance'. If you still want to use " - "the inverse of the covariance matrix as a prior, " - "set prior='covariance'. 
This warning will disappear in " - "v0.6.0, and `prior` parameter's default value will be set to " - "'identity'.") - warnings.warn(msg, ChangedBehaviorWarning) - prior = 'identity' - else: - prior = self.prior - _, prior_inv = _initialize_metric_mahalanobis(pairs, prior, - return_inverse=True, strict_pd=True, matrix_name='prior', - random_state=self.random_state) + _, prior_inv = _initialize_metric_mahalanobis( + pairs, self.prior, + return_inverse=True, strict_pd=True, matrix_name='prior', + random_state=self.random_state) diff = pairs[:, 0] - pairs[:, 1] loss_matrix = (diff.T * y).dot(diff) emp_cov = prior_inv + self.balance_param * loss_matrix @@ -97,13 +85,14 @@ def _fit(self, pairs, y): msg=self.verbose, Theta0=theta0, Sigma0=sigma0) else: - _, M = graphical_lasso(emp_cov, alpha=self.sparsity_param, - verbose=self.verbose, - cov_init=sigma0) + _, M, *_ = graphical_lasso(emp_cov, alpha=self.sparsity_param, + verbose=self.verbose, + cov_init=sigma0) raised_error = None w_mahalanobis, _ = np.linalg.eigh(M) not_spd = any(w_mahalanobis < 0.) not_finite = not np.isfinite(M).all() + # TODO: Narrow this to the specific exceptions we expect. except Exception as e: raised_error = e not_spd = False # not_spd not applicable here so we set to False @@ -128,7 +117,7 @@ def _fit(self, pairs, y): class SDML(_BaseSDML, _PairsClassifierMixin): - """Sparse Distance Metric Learning (SDML) + r"""Sparse Distance Metric Learning (SDML) SDML is an efficient sparse metric learning in high-dimensional space via double regularization: an L1-penalization on the off-diagonal elements of the @@ -141,62 +130,55 @@ class SDML(_BaseSDML, _PairsClassifierMixin): Parameters ---------- - balance_param : float, optional - trade off between sparsity and M0 prior - - sparsity_param : float, optional - trade off between optimizer and sparseness (see graph_lasso) + balance_param : float, optional (default=0.5) + Trade off between sparsity and M0 prior. - prior : None, string or numpy array, optional (default=None) - Prior to set for the metric. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). For SDML, the prior should be strictly - positive definite (PD). If `None`, will be set - automatically to 'identity' (this is to raise a warning if - `prior` is not set, and stays to its default value (None), in v0.5.0). + sparsity_param : float, optional (default=0.01) + Trade off between optimizer and sparseness (see graph_lasso). - 'identity' - An identity matrix of shape (n_features, n_features). + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For SDML, the prior should be strictly + positive definite (PD). - 'covariance' - The inverse covariance matrix. + 'identity' + An identity matrix of shape (n_features, n_features). - 'random' - The prior will be a random positive definite (PD) matrix of shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. + 'covariance' + The inverse covariance matrix. - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. + 'random' + The prior will be a random positive definite (PD) matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - use_cov : Not used. - .. 
deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'prior' instead. + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. verbose : bool, optional (default=False) - if True, prints information while learning + If True, prints information while learning. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be gotten like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``prior='random'``, ``random_state`` is used to set the prior. + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. Attributes ---------- components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) threshold_ : `float` - If the distance metric between two points is lower than this threshold, - points will be classified as similar, otherwise they will be - classified as dissimilar. + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. Examples -------- @@ -205,19 +187,17 @@ class SDML(_BaseSDML, _PairsClassifierMixin): >>> iris_data = load_iris() >>> X = iris_data['data'] >>> Y = iris_data['target'] - >>> sdml = SDML_Supervised(num_constraints=200) + >>> sdml = SDML_Supervised(n_constraints=200) >>> sdml.fit(X, Y) References ---------- + .. [1] Qi et al. `An efficient sparse metric learning in high-dimensional + space via L1-penalized log-determinant regularization + `_. + ICML 2009. - .. [1] Qi et al. - An efficient sparse metric learning in high-dimensional space via - L1-penalized log-determinant regularization. ICML 2009. - http://lms.comp.nus.edu.sg/sites/default/files/publication\ --attachments/icml09-guojun.pdf - - .. [2] Adapted from https://gist.github.com/kcarnold/5439945 + .. [2] Code adapted from https://gist.github.com/kcarnold/5439945 """ def fit(self, pairs, y, calibration_params=None): @@ -230,20 +210,22 @@ def fit(self, pairs, y, calibration_params=None): ---------- pairs : array-like, shape=(n_constraints, 2, n_features) or \ (n_constraints, 2) - 3D Array of pairs with each row corresponding to two points, - or 2D array of indices of pairs if the metric learner uses a - preprocessor. + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. + y : array-like, of shape (n_constraints,) - Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + calibration_params : `dict` or `None` - Dictionary of parameters to give to `calibrate_threshold` for the - threshold calibration step done at the end of `fit`. If `None` is - given, `calibrate_threshold` will use the default parameters. 
+ Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. Returns ------- self : object - Returns the instance. + Returns the instance. """ calibration_params = (calibration_params if calibration_params is not None else dict()) @@ -263,60 +245,57 @@ class SDML_Supervised(_BaseSDML, TransformerMixin): Parameters ---------- balance_param : float, optional (default=0.5) - trade off between sparsity and M0 prior + Trade off between sparsity and M0 prior. + sparsity_param : float, optional (default=0.01) - trade off between optimizer and sparseness (see graph_lasso) - prior : None, string or numpy array, optional (default=None) - Prior to set for the metric. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). For SDML, the prior should be strictly - positive definite (PD). If `None`, will be set - automatically to 'identity' (this is to raise a warning if - `prior` is not set, and stays to its default value (None), in v0.5.0). - - 'identity' - An identity matrix of shape (n_features, n_features). - - 'covariance' - The inverse covariance matrix. - - 'random' - The prior will be a random SPD matrix of shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. - - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. - - use_cov : Not used. - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'prior' instead. - - num_labeled : Not used - .. deprecated:: 0.5.0 - `num_labeled` was deprecated in version 0.5.0 and will - be removed in 0.6.0. - num_constraints : int, optional (default=None) - number of constraints to generate + Trade off between optimizer and sparseness (see graph_lasso). + + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For SDML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + n_constraints : int, optional (default=None) + Number of constraints to generate. If None, defaults to `20 * + num_classes**2`. + verbose : bool, optional (default=False) - if True, prints information while learning + If True, prints information while learning. + preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to set the random - prior. In any case, `random_state` is also used to randomly sample - constraints from labels. 
+ A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. In any case, `random_state` is also used to randomly sample + constraints from labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 Attributes ---------- components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) See Also -------- @@ -325,61 +304,48 @@ class SDML_Supervised(_BaseSDML, TransformerMixin): that describes the supervised version of weakly supervised estimators. """ - def __init__(self, balance_param=0.5, sparsity_param=0.01, prior=None, - use_cov='deprecated', num_labeled='deprecated', - num_constraints=None, verbose=False, preprocessor=None, - random_state=None): + def __init__(self, balance_param=0.5, sparsity_param=0.01, prior='identity', + n_constraints=None, verbose=False, preprocessor=None, + random_state=None, num_constraints='deprecated'): _BaseSDML.__init__(self, balance_param=balance_param, sparsity_param=sparsity_param, prior=prior, - use_cov=use_cov, verbose=verbose, + verbose=verbose, preprocessor=preprocessor, random_state=random_state) - self.num_labeled = num_labeled - self.num_constraints = num_constraints + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed sholud be set) + self.num_constraints = 'deprecated' - def fit(self, X, y, random_state='deprecated'): + def fit(self, X, y): """Create constraints from labels and learn the SDML model. Parameters ---------- X : array-like, shape (n, d) - data matrix, where each row corresponds to a single instance + data matrix, where each row corresponds to a single instance + y : array-like, shape (n,) - data labels, one for each instance - random_state : Not used - .. deprecated:: 0.5.0 - `random_state` in the `fit` function was deprecated in version 0.5.0 - and will be removed in 0.6.0. Set `random_state` at initialization - instead (when instantiating a new `SDML_Supervised` object). + data labels, one for each instance Returns ------- self : object - Returns the instance. + Returns the instance. """ - if self.num_labeled != 'deprecated': - warnings.warn('"num_labeled" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0', DeprecationWarning) - if random_state != 'deprecated': - warnings.warn('"random_state" parameter in the `fit` function is ' - 'deprecated. Set `random_state` at initialization ' - 'instead (when instantiating a new `SDML_Supervised` ' - 'object).', DeprecationWarning) - else: - warnings.warn('As of v0.5.0, `SDML_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. 
' - 'This warning will disappear in v0.6.0.', - ChangedBehaviorWarning) X, y = self._prepare_inputs(X, y, ensure_min_samples=2) - num_constraints = self.num_constraints - if num_constraints is None: + n_constraints = self.n_constraints + if n_constraints is None: num_classes = len(np.unique(y)) - num_constraints = 20 * num_classes**2 + n_constraints = 20 * num_classes**2 c = Constraints(y) - pos_neg = c.positive_negative_pairs(num_constraints, + pos_neg = c.positive_negative_pairs(n_constraints, random_state=self.random_state) pairs, y = wrap_pairs(X, pos_neg) return _BaseSDML._fit(self, pairs, y) diff --git a/metric_learn/sklearn_shims.py b/metric_learn/sklearn_shims.py new file mode 100644 index 00000000..8d746890 --- /dev/null +++ b/metric_learn/sklearn_shims.py @@ -0,0 +1,25 @@ +"""This file is for fixing imports due to different APIs +depending on the scikit-learn version""" +import sklearn +from packaging import version +SKLEARN_AT_LEAST_0_22 = (version.parse(sklearn.__version__) + >= version.parse('0.22.0')) +if SKLEARN_AT_LEAST_0_22: + from sklearn.utils._testing import (set_random_state, + ignore_warnings, + assert_allclose_dense_sparse, + _get_args) + from sklearn.utils.estimator_checks import (_is_public_parameter + as is_public_parameter) + from sklearn.metrics._scorer import get_scorer +else: + from sklearn.utils.testing import (set_random_state, + ignore_warnings, + assert_allclose_dense_sparse, + _get_args) + from sklearn.utils.estimator_checks import is_public_parameter + from sklearn.metrics.scorer import get_scorer + +__all__ = ['set_random_state', 'set_random_state', + 'ignore_warnings', 'assert_allclose_dense_sparse', '_get_args', + 'is_public_parameter', 'get_scorer'] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..ef3c8acb --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + integration: mark a test as integration + unit: mark a test as unit \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 8d95aa1e..bc7695e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,4 +2,6 @@ universal = 1 [metadata] -description-file = README.rst \ No newline at end of file +description-file = README.rst +license_files = + LICENSE.txt diff --git a/setup.py b/setup.py index dfb20fc0..23392077 100755 --- a/setup.py +++ b/setup.py @@ -3,6 +3,32 @@ from setuptools import setup import os import io +import sys + + +CURRENT_PYTHON = sys.version_info[:2] +REQUIRED_PYTHON = (3, 6) + +# This check and everything above must remain compatible with Python 2.7. +if CURRENT_PYTHON < REQUIRED_PYTHON: + sys.stderr.write(""" +========================== +Unsupported Python version +========================== +This version of metric-learn requires Python {}.{}, but you're trying to +install it on Python {}.{}. +This may be because you are using a version of pip that doesn't +understand the python_requires classifier. Make sure you +have pip >= 9.0 and setuptools >= 24.2, then try again: + $ python -m pip install --upgrade pip setuptools + $ python -m pip install django +This will install the latest version of metric-learn which works on your +version of Python. 
If you can't upgrade your pip (or Python), request +an older version of metric-learn: + $ python -m pip install "metric-learn<0.6.0" +""".format(*(REQUIRED_PYTHON + CURRENT_PYTHON))) + sys.exit(1) + version = {} with io.open(os.path.join('metric_learn', '_version.py')) as fp: @@ -16,27 +42,34 @@ version=version['__version__'], description='Python implementations of metric learning algorithms', long_description=long_description, - author=['CJ Carey', 'Yuan Tang'], + python_requires='>={}.{}'.format(*REQUIRED_PYTHON), + author=[ + 'CJ Carey', + 'Yuan Tang', + 'William de Vazelhes', + 'Aurélien Bellet', + 'Nathalie Vauquier' + ], author_email='ccarey@cs.umass.edu', - url='http://github.com/metric-learn/metric-learn', + url='http://github.com/scikit-learn-contrib/metric-learn', license='MIT', classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', + 'Programming Language :: Python :: 3', 'Operating System :: OS Independent', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering' ], packages=['metric_learn'], install_requires=[ - 'numpy', - 'scipy', - 'scikit-learn', - 'six' + 'numpy>= 1.11.0', + 'scipy>= 0.17.0', + 'scikit-learn>=0.21.3', ], extras_require=dict( - docs=['sphinx', 'shinx_rtd_theme', 'numpydoc'], + docs=['sphinx', 'sphinx_rtd_theme', 'numpydoc', 'sphinx-gallery', + 'matplotlib'], demo=['matplotlib'], sdml=['skggm>=0.2.9'] ), diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 00314ad0..d457b52d 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -1,40 +1,41 @@ +import warnings import unittest import re import pytest import numpy as np import scipy from scipy.optimize import check_grad, approx_fprime -from six.moves import xrange from sklearn.metrics import pairwise_distances, euclidean_distances from sklearn.datasets import (load_iris, make_classification, make_regression, make_spd_matrix) from numpy.testing import (assert_array_almost_equal, assert_array_equal, assert_allclose) -from sklearn.utils.testing import assert_warns_message -from sklearn.exceptions import ConvergenceWarning, ChangedBehaviorWarning +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.validation import check_X_y +from sklearn.preprocessing import StandardScaler try: from inverse_covariance import quic + assert quic except ImportError: HAS_SKGGM = False else: HAS_SKGGM = True from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, - LSML_Supervised, ITML_Supervised, SDML_Supervised, - RCA_Supervised, MMC_Supervised, SDML, RCA, ITML, - LSML) + SCML_Supervised, LSML_Supervised, + ITML_Supervised, SDML_Supervised, RCA_Supervised, + MMC_Supervised, SDML, RCA, ITML, SCML) # Import this specially for testing. 
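# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this patch or of the test suite): the SCML
# estimators added in metric_learn/scml.py above compose the learned
# Mahalanobis matrix from rank-one PSD bases b_i and sparse nonnegative
# weights w_i, i.e. M = sum_i w_i * b_i b_i^T. The toy helper below only
# shows how a triplet would be scored under such a composed metric; the
# function name and its arguments are hypothetical, not metric-learn API.
# ---------------------------------------------------------------------------
def _toy_scml_triplet_margin(x, x_pos, x_neg, bases, weights):
    """Hedged sketch: `bases` (shape (n_basis, n_features)) and `weights`
    (nonnegative, shape (n_basis,)) are toy placeholders, not values produced
    by the estimators above."""
    # M = sum_i w_i * b_i b_i^T, written compactly as B^T diag(w) B
    M = bases.T @ (weights[:, None] * bases)
    d_pos = (x - x_pos) @ M @ (x - x_pos)  # squared Mahalanobis distance to the similar point
    d_neg = (x - x_neg) @ M @ (x - x_neg)  # squared Mahalanobis distance to the dissimilar point
    # Triplets are ordered so that d(x, x_pos) should stay below d(x, x_neg);
    # a positive return value therefore means the triplet is satisfied.
    return d_neg - d_pos


# Example with toy values (a single basis aligned with the first feature):
# _toy_scml_triplet_margin(np.array([0., 0.]), np.array([0., 1.]),
#                          np.array([3., 0.]), np.eye(1, 2), np.ones(1)) > 0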
-from metric_learn.constraints import wrap_pairs +from metric_learn.constraints import wrap_pairs, Constraints from metric_learn.lmnn import _sum_outer_products def class_separation(X, labels): unique_labels, label_inds = np.unique(labels, return_inverse=True) ratio = 0 - for li in xrange(len(unique_labels)): - Xc = X[label_inds==li] - Xnc = X[label_inds!=li] - ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc,Xnc).mean() + for li in range(len(unique_labels)): + Xc = X[label_inds == li] + Xnc = X[label_inds != li] + ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean() return ratio / len(unique_labels) @@ -75,168 +76,272 @@ def test_singular_returns_pseudo_inverse(self): pseudo_inverse) +class TestSCML(object): + @pytest.mark.parametrize('basis', ('lda', 'triplet_diffs')) + def test_iris(self, basis): + """ + SCML applied to Iris dataset should give better results when + computing class separation. + """ + X, y = load_iris(return_X_y=True) + before = class_separation(X, y) + scml = SCML_Supervised(basis=basis, n_basis=85, k_genuine=7, k_impostor=5, + random_state=42) + scml.fit(X, y) + after = class_separation(scml.transform(X), y) + assert before > after + 0.03 # It's better by a margin of 0.03 + + def test_big_n_features(self): + X, y = make_classification(n_samples=100, n_classes=3, n_features=60, + n_informative=60, n_redundant=0, n_repeated=0, + random_state=42) + X = StandardScaler().fit_transform(X) + scml = SCML_Supervised(random_state=42, n_basis=399) + scml.fit(X, y) + csep = class_separation(scml.transform(X), y) + assert csep < 0.7 + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.ones((3, 3, 3)),)), + (SCML_Supervised, (np.array([[0, 0], [0, 1], + [2, 0], [2, 1]]), + np.array([1, 0, 1, 0])))]) + def test_bad_basis(self, estimator, data): + model = estimator(basis='bad_basis', n_basis=33) # n_basis doesn't matter + msg = ("`basis` must be one of the options '{}' or an array of shape " + "(n_basis, n_features)." 
+ .format("', '".join(model._authorized_basis))) + with pytest.raises(ValueError) as raised_error: + model.fit(*data) + assert msg == raised_error.value.args[0] + + def test_dimension_reduction_msg(self): + scml = SCML(n_basis=2) + triplets = np.array([[[0, 1], [2, 1], [0, 0]], + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]) + msg = ("The number of bases with nonzero weight is less than the " + "number of features of the input, in consequence the " + "learned transformation reduces the dimension to 1.") + with pytest.warns(UserWarning) as raised_warning: + scml.fit(triplets) + assert msg == raised_warning[0].message.args[0] + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.array([[[0, 1], [2, 1], [0, 0]], + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]),)), + (SCML_Supervised, (np.array([[0, 0], [1, 1], + [3, 3]]), + np.array([1, 2, 3])))]) + def test_n_basis_wrong_type(self, estimator, data): + n_basis = 4.0 + model = estimator(n_basis=n_basis) + msg = ("n_basis should be an integer, instead it is of type %s" + % type(n_basis)) + with pytest.raises(ValueError) as raised_error: + model.fit(*data) + assert msg == raised_error.value.args[0] + + def test_small_n_basis_lda(self): + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + y = np.array([0, 0, 1, 1]) + + n_class = 2 + scml = SCML_Supervised(n_basis=n_class-1) + msg = ("The number of basis is less than the number of classes, which may" + " lead to poor discriminative performance.") + with pytest.warns(UserWarning) as raised_warning: + scml.fit(X, y) + assert msg == raised_warning[0].message.args[0] + + def test_big_n_basis_lda(self): + X = np.array([[0, 0], [1, 1], [3, 3]]) + y = np.array([1, 2, 3]) + + n_class = 3 + num_eig = min(n_class - 1, X.shape[1]) + n_basis = X.shape[0] * 2 * num_eig + + scml = SCML_Supervised(n_basis=n_basis) + msg = ("Not enough samples to generate %d LDA bases, n_basis" + "should be smaller than %d" % + (n_basis, n_basis)) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.random.rand(3, 3, 2),)), + (SCML_Supervised, (np.array([[0, 0], [0, 1], + [2, 0], [2, 1]]), + np.array([1, 0, 1, 0])))]) + def test_array_basis(self, estimator, data): + """ Test that the proper error is raised when the shape of the input basis + array is not consistent with the input + """ + basis = np.eye(3) + scml = estimator(n_basis=3, basis=basis) + + msg = ('The dimensionality ({}) of the provided bases must match the ' + 'dimensionality of the data ({}).' 
+ .format(basis.shape[1], data[0].shape[-1])) + with pytest.raises(ValueError) as raised_error: + scml.fit(*data) + assert msg == raised_error.value.args[0] + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.array([[0, 1, 2], [0, 1, 3], [1, 0, 2], + [1, 0, 3], [2, 3, 1], [2, 3, 0], + [3, 2, 1], [3, 2, 0]]),)), + (SCML_Supervised, (np.array([0, 1, 2, 3]), + np.array([0, 0, 1, 1])))]) + def test_verbose(self, estimator, data, capsys): + # assert there is proper output when verbose = True + model = estimator(preprocessor=np.array([[0, 0], [1, 1], [2, 2], [3, 3]]), + max_iter=1, output_iter=1, batch_size=1, + basis='triplet_diffs', random_state=42, verbose=True) + model.fit(*data) + out, _ = capsys.readouterr() + expected_out = ('[%s] iter 1\t obj 0.569946\t num_imp 2\n' + 'max iteration reached.\n' % estimator.__name__) + assert out == expected_out + + def test_triplet_diffs_toy(self): + expected_n_basis = 10 + model = SCML_Supervised(n_basis=expected_n_basis) + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + triplets = np.array([[0, 1, 2], [0, 1, 3], [1, 0, 2], [1, 0, 3], + [2, 3, 1], [2, 3, 0], [3, 2, 1], [3, 2, 0]]) + basis, n_basis = model._generate_bases_dist_diff(triplets, X) + # All points are along the same line, so the only possible basis will be + # the vector along that line normalized. + expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2) + assert n_basis == expected_n_basis + np.testing.assert_allclose(basis, expected_basis) + + def test_lda_toy(self): + expected_n_basis = 7 + model = SCML_Supervised(n_basis=expected_n_basis) + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + y = np.array([0, 0, 1, 1]) + basis, n_basis = model._generate_bases_LDA(X, y) + # All points are along the same line, so the only possible basis will be + # the vector along that line normalized. In this case it is possible to + # obtain it with positive or negative orientations. + expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2) + assert n_basis == expected_n_basis + np.testing.assert_allclose(np.abs(basis), expected_basis) + + @pytest.mark.parametrize('n_samples', [100, 500]) + @pytest.mark.parametrize('n_features', [10, 50, 100]) + @pytest.mark.parametrize('n_classes', [5, 10, 15]) + def test_triplet_diffs(self, n_samples, n_features, n_classes): + """ + Test that the correct value of n_basis is being generated with + different triplet constraints. + """ + X, y = make_classification(n_samples=n_samples, n_classes=n_classes, + n_features=n_features, n_informative=n_features, + n_redundant=0, n_repeated=0) + X = StandardScaler().fit_transform(X) + model = SCML_Supervised(n_basis=None) # Explicit n_basis=None + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, model.k_genuine, + model.k_impostor) + + msg = "As no value for `n_basis` was selected, " + with pytest.warns(UserWarning) as raised_warning: + basis, n_basis = model._generate_bases_dist_diff(triplets, X) + assert msg in str(raised_warning[0].message) + + expected_n_basis = n_features * 80 + assert n_basis == expected_n_basis + assert basis.shape == (expected_n_basis, n_features) + + @pytest.mark.parametrize('n_samples', [100, 500]) + @pytest.mark.parametrize('n_features', [10, 50, 100]) + @pytest.mark.parametrize('n_classes', [5, 10, 15]) + def test_lda(self, n_samples, n_features, n_classes): + """ + Test that when n_basis=None, the correct n_basis is generated, + for SCML_Supervised and different values of n_samples, n_features + and n_classes. 
+ """ + X, y = make_classification(n_samples=n_samples, n_classes=n_classes, + n_features=n_features, n_informative=n_features, + n_redundant=0, n_repeated=0) + X = StandardScaler().fit_transform(X) + + msg = "As no value for `n_basis` was selected, " + with pytest.warns(UserWarning) as raised_warning: + model = SCML_Supervised(n_basis=None) # Explicit n_basis=None + basis, n_basis = model._generate_bases_LDA(X, y) + assert msg in str(raised_warning[0].message) + + num_eig = min(n_classes - 1, n_features) + expected_n_basis = min(20 * n_features, n_samples * 2 * num_eig - 1) + assert n_basis == expected_n_basis + assert basis.shape == (expected_n_basis, n_features) + + @pytest.mark.parametrize('name', ['max_iter', 'output_iter', 'batch_size', + 'n_basis']) + def test_int_inputs(self, name): + value = 1.0 + d = {name: value} + scml = SCML(**d) + triplets = np.array([[[0, 1], [2, 1], [0, 0]]]) + + msg = ("%s should be an integer, instead it is of type" + " %s" % (name, type(value))) + with pytest.raises(ValueError) as raised_error: + scml.fit(triplets) + assert msg == raised_error.value.args[0] + + @pytest.mark.parametrize('name', ['max_iter', 'output_iter', 'batch_size', + 'k_genuine', 'k_impostor', 'n_basis']) + def test_int_inputs_supervised(self, name): + value = 1.0 + d = {name: value} + scml = SCML_Supervised(**d) + X = np.array([[0, 0], [1, 1], [3, 3], [4, 4]]) + y = np.array([1, 1, 0, 0]) + msg = ("%s should be an integer, instead it is of type" + " %s" % (name, type(value))) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + def test_large_output_iter(self): + scml = SCML(max_iter=1, output_iter=2, n_basis=33) # n_basis don't matter + triplets = np.array([[[0, 1], [2, 1], [0, 0]]]) + msg = ("The value of output_iter must be equal or smaller than" + " max_iter.") + + with pytest.raises(ValueError) as raised_error: + scml.fit(triplets) + assert msg == raised_error.value.args[0] + + class TestLSML(MetricTestCase): def test_iris(self): - lsml = LSML_Supervised(num_constraints=200) + lsml = LSML_Supervised(n_constraints=200) lsml.fit(self.iris_points, self.iris_labels) csep = class_separation(lsml.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.8) # it's pretty terrible - def test_deprecation_num_labeled(self): - # test that a deprecation message is thrown if num_labeled is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - lsml_supervised = LSML_Supervised(num_labeled=np.inf) - msg = ('"num_labeled" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0') - assert_warns_message(DeprecationWarning, msg, lsml_supervised.fit, X, y) - - def test_changed_behaviour_warning(self): - # test that a ChangedBehavior warning is thrown about the init, if the - # default parameters are used. - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - lsml_supervised = LSML_Supervised() - msg = ("Warning, no prior was set (`prior=None`). As of version 0.5.0, " - "the default prior will now be set to " - "'identity', instead of 'covariance'. If you still want to use " - "the inverse of the covariance matrix as a prior, " - "set prior='covariance'. 
This warning will disappear in " - "v0.6.0, and `prior` parameter's default value will be set to " - "'identity'.") - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - lsml_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - pairs = np.array([[[-10., 0.], [10., 0.], [-5., 3.], [5., 0.]], - [[0., 50.], [0., -60], [-10., 0.], [10., 0.]]]) - lsml = LSML() - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - lsml.fit(pairs) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_deprecation_random_state(self): - # test that a deprecation message is thrown if random_state is set at - # fit time - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - lsml_supervised = LSML_Supervised() - msg = ('"random_state" parameter in the `fit` function is ' - 'deprecated. Set `random_state` at initialization ' - 'instead (when instantiating a new `LSML_Supervised` ' - 'object).') - with pytest.warns(DeprecationWarning) as raised_warning: - lsml_supervised.fit(X, y, random_state=np.random) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_changed_behaviour_warning_random_state(self): - # test that a ChangedBehavior warning is thrown if the random_state is - # not set in fit. - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - lsml_supervised = LSML_Supervised() - msg = ('As of v0.5.0, `LSML_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. ' - 'This warning will disappear in v0.6.0.') - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - lsml_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - class TestITML(MetricTestCase): def test_iris(self): - itml = ITML_Supervised(num_constraints=200) + itml = ITML_Supervised(n_constraints=200) itml.fit(self.iris_points, self.iris_labels) csep = class_separation(itml.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.2) - def test_deprecation_num_labeled(self): - # test that a deprecation message is thrown if num_labeled is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - itml_supervised = ITML_Supervised(num_labeled=np.inf) - msg = ('"num_labeled" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0') - assert_warns_message(DeprecationWarning, msg, itml_supervised.fit, X, y) - - def test_deprecation_bounds(self): - # test that a deprecation message is thrown if bounds is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - itml_supervised = ITML_Supervised(bounds=None) - msg = ('"bounds" parameter from initialization is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. 
Use the "bounds" parameter of this ' - 'fit method instead.') - assert_warns_message(DeprecationWarning, msg, itml_supervised.fit, X, y) - - def test_deprecation_A0(self): - # test that a deprecation message is thrown if A0 is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - itml_supervised = ITML_Supervised(A0=np.ones_like(X)) - msg = ('"A0" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. Use "prior" instead.') - with pytest.warns(DeprecationWarning) as raised_warning: - itml_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) - y_pairs = [1, -1] - itml = ITML(A0=np.ones_like(X)) - with pytest.warns(DeprecationWarning) as raised_warning: - itml.fit(pairs, y_pairs) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_deprecation_random_state(self): - # test that a deprecation message is thrown if random_state is set at - # fit time - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - itml_supervised = ITML_Supervised() - msg = ('"random_state" parameter in the `fit` function is ' - 'deprecated. Set `random_state` at initialization ' - 'instead (when instantiating a new `ITML_Supervised` ' - 'object).') - with pytest.warns(DeprecationWarning) as raised_warning: - itml_supervised.fit(X, y, random_state=np.random) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_changed_behaviour_warning_random_state(self): - # test that a ChangedBehavior warning is thrown if the random_state is - # not set in fit. - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - itml_supervised = ITML_Supervised() - msg = ('As of v0.5.0, `ITML_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. ' - 'This warning will disappear in v0.6.0.') - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - itml_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - @pytest.mark.parametrize('bounds', [None, (20., 100.), [20., 100.], np.array([20., 100.]), @@ -277,7 +382,7 @@ def test_bounds_parameters_invalid(bounds): class TestLMNN(MetricTestCase): def test_iris(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) lmnn.fit(self.iris_points, self.iris_labels) csep = class_separation(lmnn.transform(self.iris_points), @@ -294,7 +399,7 @@ def test_loss_grad_lbfgs(self): L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) lmnn = LMNN() - k = lmnn.k + k = lmnn.n_neighbors reg = lmnn.regularization X, y = lmnn._prepare_inputs(X, y, dtype=float, @@ -327,35 +432,6 @@ def grad(x): np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon))) np.testing.assert_almost_equal(rel_diff, 0., decimal=5) - def test_changed_behaviour_warning(self): - # test that a ChangedBehavior warning is thrown about the init, if the - # default parameters are used. - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - lmnn = LMNN(k=2) - msg = ("Warning, no init was set (`init=None`). 
As of version 0.5.0, " - "the default init will now be set to 'auto', instead of the " - "previous identity matrix. If you still want to use the identity " - "matrix as before, set init='identity'. This warning " - "will disappear in v0.6.0, and `init` parameter's default value " - "will be set to 'auto'.") - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - lmnn.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_deprecation_use_pca(self): - # test that a DeprecationWarning is thrown about use_pca, if the - # default parameters are used. - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - lmnn = LMNN(k=2, use_pca=True) - msg = ('"use_pca" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0.') - assert_warns_message(DeprecationWarning, msg, lmnn.fit, X, y) - def test_loss_func(capsys): """Test the loss function (and its gradient) on a simple example, @@ -384,15 +460,15 @@ def loss_fn(L, X, y, target_neighbors, reg): for j in target_neighbors[i]: loss += (1 - reg) * np.sum((Lx[i] - Lx[j]) ** 2) grad += (1 - reg) * np.outer(Lx[i] - Lx[j], X[i] - X[j]) - for l in range(X.shape[0]): - if y[i] != y[l]: + for k in range(X.shape[0]): + if y[i] != y[k]: hin, active = hinge(1 + np.sum((Lx[i] - Lx[j])**2) - - np.sum((Lx[i] - Lx[l])**2)) + np.sum((Lx[i] - Lx[k])**2)) total_active += active if active: loss += reg * hin grad += (reg * (np.outer(Lx[i] - Lx[j], X[i] - X[j]) - - np.outer(Lx[i] - Lx[l], X[i] - X[l]))) + np.outer(Lx[i] - Lx[k], X[i] - X[k]))) grad = 2 * grad return grad, loss, total_active @@ -434,7 +510,7 @@ def __init__(self, callback, *args, **kwargs): def _loss_grad(self, *args, **kwargs): grad, objective, total_active = ( - super(LMNN_with_callback, self)._loss_grad(*args, **kwargs)) + super(LMNN_with_callback, self)._loss_grad(*args, **kwargs)) self.callback.append(grad) return grad, objective, total_active @@ -463,18 +539,18 @@ def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): out, _ = capsys.readouterr() lines = re.split("\n+", out) # we get every variable that is printed from the algorithm in verbose - num = '(-?\d+.?\d*(e[+|-]\d+)?)' - strings = [re.search("\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})" + num = r'(-?\d+.?\d*(e[+|-]\d+)?)' + strings = [re.search(r"\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})" .format(num, num, num), s) for s in lines] objectives[name] = [float(match.group(1)) for match in strings if match is not None] obj_diffs[name] = [float(match.group(3)) for match in strings if match is - not None] + not None] total_active[name] = [float(match.group(5)) for match in strings if match is not None] learn_rate[name] = [float(match.group(6)) for match in strings if match is - not None] + not None] assert len(strings) >= 10 # we ensure that we actually did more than 10 # iterations assert total_active[name][0] >= 2 # we ensure that we have some active @@ -499,9 +575,9 @@ def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): def test_toy_ex_lmnn(X, y, loss): """Test that the loss give the right result on a toy example""" L = np.array([[1]]) - lmnn = LMNN(k=1, regularization=0.5) + lmnn = LMNN(n_neighbors=1, regularization=0.5) - k = lmnn.k + k = lmnn.n_neighbors reg = lmnn.regularization X, y = lmnn._prepare_inputs(X, y, dtype=float, @@ -512,17 +588,15 @@ def test_toy_ex_lmnn(X, y, loss): lmnn.components_ = np.eye(n_components) target_neighbors = lmnn._select_targets(X, label_inds) - impostors = 
lmnn._find_impostors(target_neighbors[:, -1], X, label_inds, L) # sum outer products dfG = _sum_outer_products(X, target_neighbors.flatten(), np.repeat(np.arange(X.shape[0]), k)) - df = np.zeros_like(dfG) # storage - a1 = [None]*k - a2 = [None]*k - for nn_idx in xrange(k): + a1 = [None] * k + a2 = [None] * k + for nn_idx in range(k): a1[nn_idx] = np.array([]) a2[nn_idx] = np.array([]) @@ -530,9 +604,10 @@ def test_toy_ex_lmnn(X, y, loss): assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, k, reg, target_neighbors, label_inds)[1] == loss + def test_convergence_simple_example(capsys): # LMNN should converge on this simple example, which it did not with - # this issue: https://github.com/metric-learn/metric-learn/issues/88 + # this issue: https://github.com/scikit-learn-contrib/metric-learn/issues/88 X, y = make_classification(random_state=0) lmnn = LMNN(verbose=True) lmnn.fit(X, y) @@ -542,7 +617,7 @@ def test_convergence_simple_example(capsys): def test_no_twice_same_objective(capsys): # test that the objective function never has twice the same value - # see https://github.com/metric-learn/metric-learn/issues/88 + # see https://github.com/scikit-learn-contrib/metric-learn/issues/88 X, y = make_classification(random_state=0) lmnn = LMNN(verbose=True) lmnn.fit(X, y) @@ -553,7 +628,7 @@ def test_no_twice_same_objective(capsys): # number), and which is followed by a (signed) float (delta obj). It # matches for instance: # 3 **1113.7665747189938** -3.182774197440267 46431.0200999999999998e-06 - objectives = [re.search("\d* (?:(\d*.\d*))[ | -]\d*.\d*", s) + objectives = [re.search(r"\d* (?:(\d*.\d*))[ | -]\d*.\d*", s) for s in lines] objectives = [match.group(1) for match in objectives if match is not None] # we remove the last element because it can be equal to the penultimate @@ -574,8 +649,7 @@ def test_sdml_supervised_raises_warning_msg_not_installed_skggm(self): # load_iris: dataset where we know scikit-learn's graphical lasso fails # with a Floating Point error X, y = load_iris(return_X_y=True) - sdml_supervised = SDML_Supervised(balance_param=0.5, use_cov=True, - sparsity_param=0.01) + sdml_supervised = SDML_Supervised(balance_param=0.5, sparsity_param=0.01) msg = ("There was a problem in SDML when using scikit-learn's graphical " "lasso solver. skggm's graphical lasso can sometimes converge on " "non SPD cases where scikit-learn's graphical lasso fails to " @@ -661,12 +735,12 @@ def test_raises_no_warning_installed_skggm(self): pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]]) y_pairs = [1, -1] X, y = make_classification(random_state=42) - with pytest.warns(None) as records: + with warnings.catch_warnings(record=True) as records: sdml = SDML(prior='covariance') sdml.fit(pairs, y_pairs) for record in records: assert record.category is not ConvergenceWarning - with pytest.warns(None) as records: + with warnings.catch_warnings(record=True) as records: sdml_supervised = SDML_Supervised(prior='identity', balance_param=1e-5) sdml_supervised.fit(X, y) for record in records: @@ -677,25 +751,13 @@ def test_iris(self): # TODO: un-flake it! 
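    # Constraints are now sampled with the random_state given at construction:
    # the seed below is forwarded through SDML_Supervised's constructor rather
    # than through fit(), matching the removal of the deprecated fit-time
    # random_state argument elsewhere in this patch.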
rs = np.random.RandomState(5555) - sdml = SDML_Supervised(num_constraints=1500, prior='identity', - balance_param=5e-5) - sdml.fit(self.iris_points, self.iris_labels, random_state=rs) + sdml = SDML_Supervised(n_constraints=1500, prior='identity', + balance_param=5e-5, random_state=rs) + sdml.fit(self.iris_points, self.iris_labels) csep = class_separation(sdml.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.22) - def test_deprecation_num_labeled(self): - # test that a deprecation message is thrown if num_labeled is set at - # initialization - # TODO: remove in v.0.6 - X, y = make_classification(random_state=42) - sdml_supervised = SDML_Supervised(num_labeled=np.inf, prior='identity', - balance_param=5e-5) - msg = ('"num_labeled" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0') - assert_warns_message(DeprecationWarning, msg, sdml_supervised.fit, X, y) - def test_sdml_raises_warning_non_psd(self): """Tests that SDML raises a warning on a toy example where we know the pseudo-covariance matrix is not PSD""" @@ -738,83 +800,6 @@ def test_sdml_works_on_non_spd_pb_with_skggm(self): random_state=np.random.RandomState(42)) sdml.fit(X, y) - def test_deprecation_use_cov(self): - # test that a deprecation message is thrown if use_cov is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - sdml_supervised = SDML_Supervised(use_cov=np.ones_like(X), - balance_param=1e-5) - msg = ('"use_cov" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. Use "prior" instead.') - with pytest.warns(DeprecationWarning) as raised_warning: - sdml_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) - y_pairs = [1, -1] - sdml = SDML(use_cov=np.ones_like(X), balance_param=1e-5) - with pytest.warns(DeprecationWarning) as raised_warning: - sdml.fit(pairs, y_pairs) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_changed_behaviour_warning(self): - # test that a ChangedBehavior warning is thrown about the init, if the - # default parameters are used (except for the balance_param that we need - # to set for the algorithm to not diverge) - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - sdml_supervised = SDML_Supervised(balance_param=1e-5) - msg = ("Warning, no prior was set (`prior=None`). As of version 0.5.0, " - "the default prior will now be set to " - "'identity', instead of 'covariance'. If you still want to use " - "the inverse of the covariance matrix as a prior, " - "set prior='covariance'. 
This warning will disappear in " - "v0.6.0, and `prior` parameter's default value will be set to " - "'identity'.") - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - sdml_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) - y_pairs = [1, -1] - sdml = SDML(balance_param=1e-5) - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - sdml.fit(pairs, y_pairs) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_deprecation_random_state(self): - # test that a deprecation message is thrown if random_state is set at - # fit time - # TODO: remove in v.0.6 - X, y = load_iris(return_X_y=True) - sdml_supervised = SDML_Supervised(balance_param=5e-5) - msg = ('"random_state" parameter in the `fit` function is ' - 'deprecated. Set `random_state` at initialization ' - 'instead (when instantiating a new `SDML_Supervised` ' - 'object).') - with pytest.warns(DeprecationWarning) as raised_warning: - sdml_supervised.fit(X, y, random_state=np.random) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_changed_behaviour_warning_random_state(self): - # test that a ChangedBehavior warning is thrown if the random_state is - # not set in fit. - # TODO: remove in v.0.6 - X, y = load_iris(return_X_y=True) - sdml_supervised = SDML_Supervised(balance_param=5e-5) - msg = ('As of v0.5.0, `SDML_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. ' - 'This warning will disappear in v0.6.0.') - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - sdml_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - @pytest.mark.skipif(not HAS_SKGGM, reason='The message should be printed only if skggm is ' @@ -964,7 +949,7 @@ def test_singleton_class(self): X = X[[ind_0[0], ind_1[0], ind_2[0]]] y = y[[ind_0[0], ind_1[0], ind_2[0]]] - A = make_spd_matrix(X.shape[1], X.shape[1]) + A = make_spd_matrix(n_dim=X.shape[1], random_state=X.shape[1]) nca = NCA(init=A, max_iter=30, n_components=X.shape[1]) nca.fit(X, y) assert_array_equal(nca.components_, A) @@ -975,45 +960,11 @@ def test_one_class(self): X = self.iris_points[self.iris_labels == 0] y = self.iris_labels[self.iris_labels == 0] - A = make_spd_matrix(X.shape[1], X.shape[1]) + A = make_spd_matrix(n_dim=X.shape[1], random_state=X.shape[1]) nca = NCA(init=A, max_iter=30, n_components=X.shape[1]) nca.fit(X, y) assert_array_equal(nca.components_, A) - def test_changed_behaviour_warning(self): - # test that a ChangedBehavior warning is thrown about the init, if the - # default parameters are used. - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - nca = NCA() - msg = ("Warning, no init was set (`init=None`). As of version 0.5.0, " - "the default init will now be set to 'auto', instead of the " - "previous scaling matrix. If you still want to use the same " - "scaling matrix as before, set " - "init=np.eye(X.shape[1])/(np.maximum(X.max(axis=0)-X.min(axis=0)" - ", EPS))). 
This warning will disappear in v0.6.0, and `init` " - "parameter's default value will be set to 'auto'.") - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - nca.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - -@pytest.mark.parametrize('num_dims', [None, 2]) -def test_deprecation_num_dims_nca(num_dims): - # test that a deprecation message is thrown if num_dims is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - nca = NCA(num_dims=num_dims) - msg = ('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. Use "n_components" instead') - with pytest.warns(DeprecationWarning) as raised_warning: - nca.fit(X, y) - assert (str(raised_warning[0].message) == msg) - class TestLFDA(MetricTestCase): def test_iris(self): @@ -1027,69 +978,13 @@ def test_iris(self): self.assertEqual(lfda.components_.shape, (2, 4)) -@pytest.mark.parametrize('num_dims', [None, 2]) -def test_deprecation_num_dims_lfda(num_dims): - # test that a deprecation message is thrown if num_dims is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - lfda = LFDA(num_dims=num_dims) - msg = ('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. Use "n_components" instead') - with pytest.warns(DeprecationWarning) as raised_warning: - lfda.fit(X, y) - assert (str(raised_warning[0].message) == msg) - - class TestRCA(MetricTestCase): def test_iris(self): - rca = RCA_Supervised(n_components=2, num_chunks=30, chunk_size=2) + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2) rca.fit(self.iris_points, self.iris_labels) csep = class_separation(rca.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.29) - def test_deprecation_pca_comps(self): - # test that a deprecation message is thrown if pca_comps is set at - # initialization - # TODO: remove in v.0.6 - X, y = make_classification(random_state=42, n_samples=100) - rca_supervised = RCA_Supervised(pca_comps=X.shape[1], num_chunks=20) - msg = ('"pca_comps" parameter is not used. ' - 'It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. RCA will not do PCA preprocessing anymore. If ' - 'you still want to do it, you could use ' - '`sklearn.decomposition.PCA` and an `sklearn.pipeline.Pipeline`.') - with pytest.warns(ChangedBehaviorWarning) as expected_msg: - rca_supervised.fit(X, y) - assert any(str(w.message) == msg for w in expected_msg) - - rca = RCA(pca_comps=X.shape[1]) - with pytest.warns(ChangedBehaviorWarning) as expected_msg: - rca.fit(X, y) - assert any(str(w.message) == msg for w in expected_msg) - - def test_changedbehaviorwarning_preprocessing(self): - # test that a ChangedBehaviorWarning is thrown when using RCA - # TODO: remove in v.0.6 - - msg = ("RCA will no longer center the data before training. If you want " - "to do some preprocessing, you should do it manually (you can also " - "use an `sklearn.pipeline.Pipeline` for instance). 
This warning " - "will disappear in version 0.6.0.") - - X, y = make_classification(random_state=42, n_samples=100) - rca_supervised = RCA_Supervised(num_chunks=20) - with pytest.warns(ChangedBehaviorWarning) as expected_msg: - rca_supervised.fit(X, y) - assert any(str(w.message) == msg for w in expected_msg) - - rca = RCA() - with pytest.warns(ChangedBehaviorWarning) as expected_msg: - rca.fit(X, y) - assert any(str(w.message) == msg for w in expected_msg) - def test_rank_deficient_returns_warning(self): """Checks that if the covariance matrix is not invertible, we raise a warning message advising to use PCA""" @@ -1100,65 +995,49 @@ def test_rank_deficient_returns_warning(self): rca = RCA() msg = ('The inner covariance matrix is not invertible, ' 'so the transformation matrix may contain Nan values. ' - 'You should reduce the dimensionality of your input,' + 'You should remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' 'for instance using `sklearn.decomposition.PCA` as a ' 'preprocessing step.') - with pytest.warns(None) as raised_warnings: + + with warnings.catch_warnings(record=True) as raised_warnings: rca.fit(X, y) assert any(str(w.message) == msg for w in raised_warnings) - def test_deprecation_random_state(self): - # test that a deprecation message is thrown if random_state is set at - # fit time - # TODO: remove in v.0.6 - X, y = make_classification(random_state=42, n_samples=100) - rca_supervised = RCA_Supervised(num_chunks=20) - msg = ('"random_state" parameter in the `fit` function is ' - 'deprecated. Set `random_state` at initialization ' - 'instead (when instantiating a new `RCA_Supervised` ' - 'object).') - with pytest.warns(DeprecationWarning) as raised_warning: - rca_supervised.fit(X, y, random_state=np.random) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_changed_behaviour_warning_random_state(self): - # test that a ChangedBehavior warning is thrown if the random_state is - # not set in fit. - # TODO: remove in v.0.6 - X, y = make_classification(random_state=42, n_samples=100) - rca_supervised = RCA_Supervised(num_chunks=20) - msg = ('As of v0.5.0, `RCA_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. ' - 'This warning will disappear in v0.6.0.') - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - rca_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - -@pytest.mark.parametrize('num_dims', [None, 2]) -def test_deprecation_num_dims_rca(num_dims): - # test that a deprecation message is thrown if num_dims is set at - # initialization - # TODO: remove in v.0.6 - X, y = load_iris(return_X_y=True) - rca = RCA(num_dims=num_dims) - msg = ('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. Use "n_components" instead') - with pytest.warns(DeprecationWarning) as raised_warning: - rca.fit(X, y) - assert any(str(w.message) == msg for w in raised_warning) - - # we take a small number of chunks so that RCA works on iris - rca_supervised = RCA_Supervised(num_dims=num_dims, num_chunks=10) - msg = ('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. 
Use "n_components" instead') - with pytest.warns(DeprecationWarning) as raised_warning: - rca_supervised.fit(X, y) - assert any(str(w.message) == msg for w in raised_warning) + def test_unknown_labels(self): + n = 200 + n_chunks = 50 + X, y = make_classification(random_state=42, n_samples=2 * n, + n_features=6, n_informative=6, n_redundant=0) + y2 = np.concatenate((y[:n], -np.ones(n))) + + rca = RCA_Supervised(n_chunks=n_chunks, random_state=42) + rca.fit(X[:n], y[:n]) + + rca2 = RCA_Supervised(n_chunks=n_chunks, random_state=42) + rca2.fit(X, y2) + + assert not np.any(np.isnan(rca.components_)) + assert not np.any(np.isnan(rca2.components_)) + + np.testing.assert_array_equal(rca.components_, rca2.components_) + + def test_bad_parameters(self): + n = 200 + n_chunks = 3 + X, y = make_classification(random_state=42, n_samples=n, + n_features=6, n_informative=6, n_redundant=0) + + rca = RCA_Supervised(n_chunks=n_chunks, random_state=42) + msg = ('Due to the parameters of RCA_Supervised, ' + 'the inner covariance matrix is not invertible, ' + 'so the transformation matrix will contain Nan values. ' + 'Increase the number or size of the chunks to correct ' + 'this problem.' + ) + with warnings.catch_warnings(record=True) as raised_warning: + rca.fit(X, y) + assert any(str(w.message) == msg for w in raised_warning) class TestMLKR(MetricTestCase): @@ -1191,65 +1070,19 @@ def grad_fn(M): rel_diff = check_grad(fun, grad_fn, M.ravel()) / np.linalg.norm(grad_fn(M)) np.testing.assert_almost_equal(rel_diff, 0.) - def test_deprecation_A0(self): - # test that a deprecation message is thrown if A0 is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - mlkr = MLKR(A0=np.ones_like(X)) - msg = ('"A0" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. Use "init" instead.') - with pytest.warns(DeprecationWarning) as raised_warning: - mlkr.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_changed_behaviour_warning(self): - # test that a ChangedBehavior warning is thrown about the init, if the - # default parameters are used. - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([0.1, 0.2, 0.3, 0.4]) - mlkr = MLKR() - msg = ("Warning, no init was set (`init=None`). As of version 0.5.0, " - "the default init will now be set to 'auto', instead of 'pca'. " - "If you still want to use PCA as an init, set init='pca'. " - "This warning will disappear in v0.6.0, and `init` parameter's" - " default value will be set to 'auto'.") - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - mlkr.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - -@pytest.mark.parametrize('num_dims', [None, 2]) -def test_deprecation_num_dims_mlkr(num_dims): - # test that a deprecation message is thrown if num_dims is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - mlkr = MLKR(num_dims=num_dims) - msg = ('"num_dims" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0. 
Use "n_components" instead') - with pytest.warns(DeprecationWarning) as raised_warning: - mlkr.fit(X, y) - assert (str(raised_warning[0].message) == msg) - class TestMMC(MetricTestCase): def test_iris(self): - # Generate full set of constraints for comparison with reference implementation - n = self.iris_points.shape[0] - mask = (self.iris_labels[None] == self.iris_labels[:,None]) + # Generate full set of constraints for comparison with reference + # implementation + mask = self.iris_labels[None] == self.iris_labels[:, None] a, b = np.nonzero(np.triu(mask, k=1)) c, d = np.nonzero(np.triu(~mask, k=1)) # Full metric n_features = self.iris_points.shape[1] - mmc = MMC(convergence_threshold=0.01, init=np.eye(n_features) / 10) + mmc = MMC(tol=0.01, init=np.eye(n_features) / 10) mmc.fit(*wrap_pairs(self.iris_points, [a, b, c, d])) expected = [[+0.000514, +0.000868, -0.001195, -0.001703], [+0.000868, +0.001468, -0.002021, -0.002879], @@ -1260,7 +1093,7 @@ def test_iris(self): # Diagonal metric mmc = MMC(diagonal=True) - mmc.fit(*wrap_pairs(self.iris_points, [a,b,c,d])) + mmc.fit(*wrap_pairs(self.iris_points, [a, b, c, d])) expected = [0, 0, 1.210220, 1.228596] assert_array_almost_equal(np.diag(expected), mmc.get_mahalanobis_matrix(), decimal=6) @@ -1270,103 +1103,13 @@ def test_iris(self): mmc.fit(self.iris_points, self.iris_labels) csep = class_separation(mmc.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.15) - + # Supervised Diagonal mmc = MMC_Supervised(diagonal=True) mmc.fit(self.iris_points, self.iris_labels) csep = class_separation(mmc.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.2) - def test_deprecation_num_labeled(self): - # test that a deprecation message is thrown if num_labeled is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - mmc_supervised = MMC_Supervised(num_labeled=np.inf) - msg = ('"num_labeled" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - ' removed in 0.6.0') - assert_warns_message(DeprecationWarning, msg, mmc_supervised.fit, X, y) - - def test_deprecation_A0(self): - # test that a deprecation message is thrown if A0 is set at - # initialization - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - mmc_supervised = MMC_Supervised(A0=np.ones_like(X)) - msg = ('"A0" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0. Use "init" instead.') - with pytest.warns(DeprecationWarning) as raised_warning: - mmc_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) - y_pairs = [1, -1] - mmc = MMC(A0=np.ones_like(X)) - with pytest.warns(DeprecationWarning) as raised_warning: - mmc.fit(pairs, y_pairs) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_changed_behaviour_warning(self): - # test that a ChangedBehavior warning is thrown about the init, if the - # default parameters are used. - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - mmc_supervised = MMC_Supervised() - msg = ("Warning, no init was set (`init=None`). As of version 0.5.0, " - "the default init will now be set to 'identity', instead of the " - "identity divided by a scaling factor of 10. 
" - "If you still want to use the same init as in previous " - "versions, set init=np.eye(d)/10, where d is the dimension " - "of your input space (d=pairs.shape[1]). " - "This warning will disappear in v0.6.0, and `init` parameter's" - " default value will be set to 'auto'.") - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - mmc_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) - y_pairs = [1, -1] - mmc = MMC() - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - mmc.fit(pairs, y_pairs) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_deprecation_random_state(self): - # test that a deprecation message is thrown if random_state is set at - # fit time - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - mmc_supervised = MMC_Supervised() - msg = ('"random_state" parameter in the `fit` function is ' - 'deprecated. Set `random_state` at initialization ' - 'instead (when instantiating a new `MMC_Supervised` ' - 'object).') - with pytest.warns(DeprecationWarning) as raised_warning: - mmc_supervised.fit(X, y, random_state=np.random) - assert any(msg == str(wrn.message) for wrn in raised_warning) - - def test_changed_behaviour_warning_random_state(self): - # test that a ChangedBehavior warning is thrown if the random_state is - # not set in fit. - # TODO: remove in v.0.6 - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - mmc_supervised = MMC_Supervised() - msg = ('As of v0.5.0, `MMC_Supervised` now uses the ' - '`random_state` given at initialization to sample ' - 'constraints, not the default `np.random` from the `fit` ' - 'method, since this argument is now deprecated. 
' - 'This warning will disappear in v0.6.0.') - with pytest.warns(ChangedBehaviorWarning) as raised_warning: - mmc_supervised.fit(X, y) - assert any(msg == str(wrn.message) for wrn in raised_warning) - @pytest.mark.parametrize(('algo_class', 'dataset'), [(NCA, make_classification()), @@ -1388,10 +1131,10 @@ def test_verbose(algo_class, dataset, capsys): for line in lines[3:-2]: # The following regex will match for instance: # '[NCA] 0 6.988936e+01 0.01' - assert re.match("\[" + algo_class.__name__ + "\]\ *\d+\ *\d\.\d{6}e[+|-]" - "\d+\ *\d+\.\d{2}", line) - assert re.match("\[" + algo_class.__name__ + "\] Training took\ *" - "\d+\.\d{2}s\.", lines[-2]) + assert re.match(r"\[" + algo_class.__name__ + r"\]\ *\d+\ *\d\.\d{6}e[+|-]" + r"\d+\ *\d+\.\d{2}", line) + assert re.match(r"\[" + algo_class.__name__ + r"\] Training took\ *" + r"\d+\.\d{2}s\.", lines[-2]) assert lines[-1] == '' @@ -1415,9 +1158,10 @@ def test_convergence_warning(dataset, algo_class): X, y = dataset model = algo_class(max_iter=2, verbose=True) cls_name = model.__class__.__name__ - assert_warns_message(ConvergenceWarning, - '[{}] {} did not converge'.format(cls_name, cls_name), - model.fit, X, y) + msg = '[{}] {} did not converge'.format(cls_name, cls_name) + with pytest.warns(Warning) as raised_warning: + model.fit(X, y) + assert any([msg in str(warn.message) for warn in raised_warning]) if __name__ == '__main__': diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 725df31a..b1e71020 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -1,126 +1,167 @@ +from numpy.core.numeric import array_equal +import warnings import pytest import re import unittest import metric_learn import numpy as np from sklearn import clone -from sklearn.utils.testing import set_random_state -from test.test_utils import ids_metric_learners, metric_learners +from test.test_utils import ids_metric_learners, metric_learners, remove_y +from metric_learn.sklearn_shims import set_random_state, SKLEARN_AT_LEAST_0_22 def remove_spaces(s): - return re.sub('\s+', '', s) + return re.sub(r'\s+', '', s) + + +def sk_repr_kwargs(def_kwargs, nndef_kwargs): + """Given the non-default arguments, and the default + keywords arguments, build the string that will appear + in the __repr__ of the estimator, depending on the + version of scikit-learn. 
+ """ + if SKLEARN_AT_LEAST_0_22: + def_kwargs = {} + def_kwargs.update(nndef_kwargs) + args_str = ",".join(f"{key}={repr(value)}" + for key, value in def_kwargs.items()) + return args_str class TestStringRepr(unittest.TestCase): def test_covariance(self): + def_kwargs = {'preprocessor': None} + nndef_kwargs = {} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) self.assertEqual(remove_spaces(str(metric_learn.Covariance())), - remove_spaces("Covariance(preprocessor=None)")) + remove_spaces(f"Covariance({merged_kwargs})")) def test_lmnn(self): + def_kwargs = {'convergence_tol': 0.001, 'init': 'auto', 'n_neighbors': 3, + 'learn_rate': 1e-07, 'max_iter': 1000, 'min_iter': 50, + 'n_components': None, 'preprocessor': None, + 'random_state': None, 'regularization': 0.5, + 'verbose': False} + nndef_kwargs = {'convergence_tol': 0.01, 'n_neighbors': 6} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) self.assertEqual( - remove_spaces(str(metric_learn.LMNN())), - remove_spaces( - "LMNN(convergence_tol=0.001, init=None, k=3, " - "learn_rate=1e-07, " - "max_iter=1000, min_iter=50, n_components=None, " - "num_dims='deprecated', preprocessor=None, random_state=None, " - "regularization=0.5, use_pca='deprecated', verbose=False)")) + remove_spaces(str(metric_learn.LMNN(convergence_tol=0.01, + n_neighbors=6))), + remove_spaces(f"LMNN({merged_kwargs})")) def test_nca(self): - self.assertEqual(remove_spaces(str(metric_learn.NCA())), - remove_spaces("NCA(init=None, max_iter=100," - "n_components=None, " - "num_dims='deprecated', " - "preprocessor=None, random_state=None, " - "tol=None, verbose=False)")) + def_kwargs = {'init': 'auto', 'max_iter': 100, 'n_components': None, + 'preprocessor': None, 'random_state': None, 'tol': None, + 'verbose': False} + nndef_kwargs = {'max_iter': 42} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.NCA(max_iter=42))), + remove_spaces(f"NCA({merged_kwargs})")) def test_lfda(self): - self.assertEqual(remove_spaces(str(metric_learn.LFDA())), - remove_spaces( - "LFDA(embedding_type='weighted', k=None, " - "n_components=None, num_dims='deprecated'," - "preprocessor=None)")) + def_kwargs = {'embedding_type': 'weighted', 'k': None, + 'n_components': None, 'preprocessor': None} + nndef_kwargs = {'k': 2} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.LFDA(k=2))), + remove_spaces(f"LFDA({merged_kwargs})")) def test_itml(self): - self.assertEqual(remove_spaces(str(metric_learn.ITML())), - remove_spaces(""" -ITML(A0='deprecated', convergence_threshold=0.001, gamma=1.0, - max_iter=1000, preprocessor=None, prior='identity', random_state=None, - verbose=False) -""")) - self.assertEqual(remove_spaces(str(metric_learn.ITML_Supervised())), - remove_spaces(""" -ITML_Supervised(A0='deprecated', bounds='deprecated', - convergence_threshold=0.001, gamma=1.0, - max_iter=1000, num_constraints=None, num_labeled='deprecated', - preprocessor=None, prior='identity', random_state=None, verbose=False) -""")) + def_kwargs = {'tol': 0.001, 'gamma': 1.0, + 'max_iter': 1000, 'preprocessor': None, + 'prior': 'identity', 'random_state': None, 'verbose': False} + nndef_kwargs = {'gamma': 0.5} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.ITML(gamma=0.5))), + remove_spaces(f"ITML({merged_kwargs})")) + def_kwargs = {'tol': 0.001, 'gamma': 1.0, + 'max_iter': 1000, 'n_constraints': None, + 'preprocessor': None, 
'prior': 'identity', + 'random_state': None, 'verbose': False} + nndef_kwargs = {'n_constraints': 7} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.ITML_Supervised(n_constraints=7))), + remove_spaces(f"ITML_Supervised({merged_kwargs})")) def test_lsml(self): - self.assertEqual(remove_spaces(str(metric_learn.LSML())), - remove_spaces(""" -LSML(max_iter=1000, preprocessor=None, prior=None, - random_state=None, tol=0.001, verbose=False) -""")) - self.assertEqual(remove_spaces(str(metric_learn.LSML_Supervised())), - remove_spaces(""" -LSML_Supervised(max_iter=1000, num_constraints=None, - num_labeled='deprecated', preprocessor=None, prior=None, - random_state=None, tol=0.001, verbose=False, weights=None) -""")) + def_kwargs = {'max_iter': 1000, 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'tol': 0.001, 'verbose': False} + nndef_kwargs = {'tol': 0.1} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.LSML(tol=0.1))), + remove_spaces(f"LSML({merged_kwargs})")) + def_kwargs = {'max_iter': 1000, 'n_constraints': None, + 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'tol': 0.001, 'verbose': False, + 'weights': None} + nndef_kwargs = {'verbose': True} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.LSML_Supervised(verbose=True))), + remove_spaces(f"LSML_Supervised({merged_kwargs})")) def test_sdml(self): - self.assertEqual(remove_spaces(str(metric_learn.SDML())), - remove_spaces(""" -SDML(balance_param=0.5, preprocessor=None, prior=None, random_state=None, - sparsity_param=0.01, use_cov='deprecated', verbose=False) -""")) - self.assertEqual(remove_spaces(str(metric_learn.SDML_Supervised())), - remove_spaces(""" -SDML_Supervised(balance_param=0.5, num_constraints=None, - num_labeled='deprecated', preprocessor=None, prior=None, - random_state=None, sparsity_param=0.01, use_cov='deprecated', - verbose=False) -""")) + def_kwargs = {'balance_param': 0.5, 'preprocessor': None, + 'prior': 'identity', 'random_state': None, + 'sparsity_param': 0.01, 'verbose': False} + nndef_kwargs = {'verbose': True} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.SDML(verbose=True))), + remove_spaces(f"SDML({merged_kwargs})")) + def_kwargs = {'balance_param': 0.5, 'n_constraints': None, + 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'sparsity_param': 0.01, + 'verbose': False} + nndef_kwargs = {'sparsity_param': 0.5} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.SDML_Supervised(sparsity_param=0.5))), + remove_spaces(f"SDML_Supervised({merged_kwargs})")) def test_rca(self): - self.assertEqual(remove_spaces(str(metric_learn.RCA())), - remove_spaces("RCA(n_components=None, " - "num_dims='deprecated', " - "pca_comps='deprecated', " - "preprocessor=None)")) - self.assertEqual(remove_spaces(str(metric_learn.RCA_Supervised())), - remove_spaces( - "RCA_Supervised(chunk_size=2, " - "n_components=None, num_chunks=100, " - "num_dims='deprecated', pca_comps='deprecated', " - "preprocessor=None, random_state=None)")) + def_kwargs = {'n_components': None, 'preprocessor': None} + nndef_kwargs = {'n_components': 3} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.RCA(n_components=3))), + 
remove_spaces(f"RCA({merged_kwargs})")) + def_kwargs = {'chunk_size': 2, 'n_components': None, 'n_chunks': 100, + 'preprocessor': None, 'random_state': None} + nndef_kwargs = {'n_chunks': 5} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.RCA_Supervised(n_chunks=5))), + remove_spaces(f"RCA_Supervised({merged_kwargs})")) def test_mlkr(self): - self.assertEqual(remove_spaces(str(metric_learn.MLKR())), - remove_spaces("MLKR(A0='deprecated', init=None," - "max_iter=1000, n_components=None," - "num_dims='deprecated', preprocessor=None," - "random_state=None, tol=None, " - "verbose=False)" - )) + def_kwargs = {'init': 'auto', 'max_iter': 1000, + 'n_components': None, 'preprocessor': None, + 'random_state': None, 'tol': None, 'verbose': False} + nndef_kwargs = {'max_iter': 777} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.MLKR(max_iter=777))), + remove_spaces(f"MLKR({merged_kwargs})")) def test_mmc(self): - self.assertEqual(remove_spaces(str(metric_learn.MMC())), - remove_spaces(""" -MMC(A0='deprecated', convergence_threshold=0.001, diagonal=False, - diagonal_c=1.0, init=None, max_iter=100, max_proj=10000, - preprocessor=None, random_state=None, verbose=False) -""")) - self.assertEqual(remove_spaces(str(metric_learn.MMC_Supervised())), - remove_spaces(""" -MMC_Supervised(A0='deprecated', convergence_threshold=1e-06, diagonal=False, - diagonal_c=1.0, init=None, max_iter=100, max_proj=10000, - num_constraints=None, num_labeled='deprecated', preprocessor=None, - random_state=None, verbose=False) -""")) + def_kwargs = {'tol': 0.001, 'diagonal': False, + 'diagonal_c': 1.0, 'init': 'identity', 'max_iter': 100, + 'max_proj': 10000, 'preprocessor': None, + 'random_state': None, 'verbose': False} + nndef_kwargs = {'diagonal': True} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.MMC(diagonal=True))), + remove_spaces(f"MMC({merged_kwargs})")) + def_kwargs = {'tol': 1e-06, 'diagonal': False, + 'diagonal_c': 1.0, 'init': 'identity', 'max_iter': 100, + 'max_proj': 10000, 'n_constraints': None, + 'preprocessor': None, 'random_state': None, + 'verbose': False} + nndef_kwargs = {'max_iter': 1} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.MMC_Supervised(max_iter=1))), + remove_spaces(f"MMC_Supervised({merged_kwargs})")) @pytest.mark.parametrize('estimator, build_dataset', metric_learners, @@ -135,12 +176,12 @@ def test_get_metric_is_independent_from_metric_learner(estimator, # we fit the metric learner on it and then we compute the metric on some # points - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) metric = model.get_metric() score = metric(X[0], X[1]) # then we refit the estimator on another dataset - model.fit(np.sin(input_data), labels) + model.fit(*remove_y(model, np.sin(input_data), labels)) # we recompute the distance between the two points: it should be the same score_bis = metric(X[0], X[1]) @@ -155,7 +196,7 @@ def test_get_metric_raises_error(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) metric = model.get_metric() list_test_get_metric_raises = [(X[0].tolist() + [5.2], X[1]), # vectors with @@ -178,7 +219,7 @@ def test_get_metric_works_does_not_raise(estimator, 
build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) metric = model.get_metric() list_test_get_metric_doesnt_raise = [(X[0], X[1]), @@ -186,7 +227,7 @@ def test_get_metric_works_does_not_raise(estimator, build_dataset): (X[0][None], X[1][None])] for u, v in list_test_get_metric_doesnt_raise: - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: metric(u, v) assert len(record) == 0 @@ -194,7 +235,7 @@ def test_get_metric_works_does_not_raise(estimator, build_dataset): model.components_ = np.array([3.1]) metric = model.get_metric() for u, v in [(5, 6.7), ([5], [6.7]), ([[5]], [[6.7]])]: - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: metric(u, v) assert len(record) == 0 @@ -210,20 +251,20 @@ def test_n_components(estimator, build_dataset): if hasattr(model, 'n_components'): set_random_state(model) model.set_params(n_components=None) - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) assert model.components_.shape == (X.shape[1], X.shape[1]) model = clone(estimator) set_random_state(model) model.set_params(n_components=X.shape[1] - 1) - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) assert model.components_.shape == (X.shape[1] - 1, X.shape[1]) model = clone(estimator) set_random_state(model) model.set_params(n_components=X.shape[1] + 1) with pytest.raises(ValueError) as expected_err: - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) assert (str(expected_err.value) == 'Invalid n_components, must be in [1, {}]'.format(X.shape[1])) @@ -231,10 +272,33 @@ def test_n_components(estimator, build_dataset): set_random_state(model) model.set_params(n_components=0) with pytest.raises(ValueError) as expected_err: - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) assert (str(expected_err.value) == 'Invalid n_components, must be in [1, {}]'.format(X.shape[1])) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_score_pairs_warning(estimator, build_dataset): + """Tests that score_pairs returns a FutureWarning regarding deprecation. + Also that score_pairs and pair_distance have the same behaviour""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + + # We fit the metric learner on it and then we call score_pairs on some + # points + model.fit(*remove_y(model, input_data, labels)) + + msg = ("score_pairs will be deprecated in release 0.7.0. 
" + "Use pair_score to compute similarity scores, or " + "pair_distances to compute distances.") + with pytest.warns(FutureWarning) as raised_warning: + score = model.score_pairs([[X[0], X[1]], ]) + dist = model.pair_distance([[X[0], X[1]], ]) + assert array_equal(score, dist) + assert any([str(warning.message) == msg for warning in raised_warning]) + + if __name__ == '__main__': unittest.main() diff --git a/test/test_components_metric_conversion.py b/test/test_components_metric_conversion.py index c0a7bbd4..c6113957 100644 --- a/test/test_components_metric_conversion.py +++ b/test/test_components_metric_conversion.py @@ -1,11 +1,10 @@ import unittest import numpy as np import pytest -from numpy.linalg import LinAlgError from scipy.stats import ortho_group from sklearn.datasets import load_iris from numpy.testing import assert_array_almost_equal, assert_allclose -from sklearn.utils.testing import ignore_warnings +from metric_learn.sklearn_shims import ignore_warnings from metric_learn import ( LMNN, NCA, LFDA, Covariance, MLKR, @@ -30,27 +29,27 @@ def test_cov(self): def test_lsml_supervised(self): seed = np.random.RandomState(1234) - lsml = LSML_Supervised(num_constraints=200, random_state=seed) + lsml = LSML_Supervised(n_constraints=200, random_state=seed) lsml.fit(self.X, self.y) L = lsml.components_ assert_array_almost_equal(L.T.dot(L), lsml.get_mahalanobis_matrix()) def test_itml_supervised(self): seed = np.random.RandomState(1234) - itml = ITML_Supervised(num_constraints=200) - itml.fit(self.X, self.y, random_state=seed) + itml = ITML_Supervised(n_constraints=200, random_state=seed) + itml.fit(self.X, self.y) L = itml.components_ assert_array_almost_equal(L.T.dot(L), itml.get_mahalanobis_matrix()) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) lmnn.fit(self.X, self.y) L = lmnn.components_ assert_array_almost_equal(L.T.dot(L), lmnn.get_mahalanobis_matrix()) def test_sdml_supervised(self): seed = np.random.RandomState(1234) - sdml = SDML_Supervised(num_constraints=1500, prior='identity', + sdml = SDML_Supervised(n_constraints=1500, prior='identity', balance_param=1e-5, random_state=seed) sdml.fit(self.X, self.y) L = sdml.components_ @@ -58,7 +57,7 @@ def test_sdml_supervised(self): def test_nca(self): n = self.X.shape[0] - nca = NCA(max_iter=(100000//n)) + nca = NCA(max_iter=(100000 // n)) nca.fit(self.X, self.y) L = nca.components_ assert_array_almost_equal(L.T.dot(L), nca.get_mahalanobis_matrix()) @@ -70,9 +69,8 @@ def test_lfda(self): assert_array_almost_equal(L.T.dot(L), lfda.get_mahalanobis_matrix()) def test_rca_supervised(self): - seed = np.random.RandomState(1234) - rca = RCA_Supervised(n_components=2, num_chunks=30, chunk_size=2) - rca.fit(self.X, self.y, random_state=seed) + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2) + rca.fit(self.X, self.y) L = rca.components_ assert_array_almost_equal(L.T.dot(L), rca.get_mahalanobis_matrix()) @@ -94,7 +92,7 @@ def test_components_from_metric_edge_cases(self): # matrix with all its coefficients very low (to check that the algorithm # does not consider it as a diagonal matrix)(non regression test for - # https://github.com/metric-learn/metric-learn/issues/175) + # https://github.com/scikit-learn-contrib/metric-learn/issues/175) M = np.diag([1e-15, 2e-16, 3e-15, 4e-16, 5e-15, 6e-16, 7e-15]) M = P.dot(M).dot(P.T) L = components_from_metric(M) @@ -118,17 +116,14 @@ def test_components_from_metric_edge_cases(self): L = 
components_from_metric(M) assert_allclose(L.T.dot(L), M) - # matrix with a determinant still high but which should be considered as a - # non-definite matrix (to check we don't test the definiteness with the - # determinant which is a bad strategy) + # matrix with a determinant still high but which is + # undefinite w.r.t to numpy standards M = np.diag([1e5, 1e5, 1e5, 1e5, 1e5, 1e5, 1e-20]) M = P.dot(M).dot(P.T) assert np.abs(np.linalg.det(M)) > 10 assert np.linalg.slogdet(M)[1] > 1 # (just to show that the computed # determinant is far from null) - with pytest.raises(LinAlgError) as err_msg: - np.linalg.cholesky(M) - assert str(err_msg.value) == 'Matrix is not positive definite' + assert np.linalg.matrix_rank(M) < M.shape[0] # (just to show that this case is indeed considered by numpy as an # indefinite case) L = components_from_metric(M) diff --git a/test/test_constraints.py b/test/test_constraints.py new file mode 100644 index 00000000..3429d9cc --- /dev/null +++ b/test/test_constraints.py @@ -0,0 +1,188 @@ +import pytest +import numpy as np +from sklearn.utils import shuffle +from metric_learn.constraints import Constraints +from sklearn.datasets import make_blobs + +SEED = 42 + + +def gen_labels_for_chunks(n_chunks, chunk_size, + n_classes=10, n_unknown_labels=5): + """Generates n_chunks*chunk_size labels that split in n_chunks chunks, + that are homogeneous in the label.""" + assert min(n_chunks, chunk_size) > 0 + classes = shuffle(np.arange(n_classes), random_state=SEED) + n_per_class = chunk_size * (n_chunks // n_classes) + n_maj_class = chunk_size * n_chunks - n_per_class * (n_classes - 1) + + first_labels = classes[0] * np.ones(n_maj_class, dtype=int) + remaining_labels = np.concatenate([k * np.ones(n_per_class, dtype=int) + for k in classes[1:]]) + unknown_labels = -1 * np.ones(n_unknown_labels, dtype=int) + + labels = np.concatenate([first_labels, remaining_labels, unknown_labels]) + return shuffle(labels, random_state=SEED) + + +@pytest.mark.parametrize("n_chunks, chunk_size", [(5, 10), (10, 50)]) +def test_exact_num_points_for_chunks(n_chunks, chunk_size): + """Checks that the chunk generation works well with just enough points.""" + labels = gen_labels_for_chunks(n_chunks, chunk_size) + + constraints = Constraints(labels) + chunks = constraints.chunks(n_chunks=n_chunks, chunk_size=chunk_size, + random_state=SEED) + + chunk_no, size_each_chunk = np.unique(chunks[chunks >= 0], + return_counts=True) + + np.testing.assert_array_equal(size_each_chunk, chunk_size) + assert chunk_no.shape[0] == n_chunks + + +@pytest.mark.parametrize("n_chunks, chunk_size", [(5, 10), (10, 50)]) +def test_chunk_case_one_miss_point(n_chunks, chunk_size): + """Checks that the chunk generation breaks when one point is missing.""" + labels = gen_labels_for_chunks(n_chunks, chunk_size) + + assert len(labels) >= 1 + constraints = Constraints(labels[1:]) + with pytest.raises(ValueError) as e: + constraints.chunks(n_chunks=n_chunks, chunk_size=chunk_size, + random_state=SEED) + + expected_message = (('Not enough possible chunks of %d elements in each' + ' class to form expected %d chunks - maximum number' + ' of chunks is %d' + ) % (chunk_size, n_chunks, n_chunks - 1)) + + assert str(e.value) == expected_message + + +@pytest.mark.parametrize("n_chunks, chunk_size", [(5, 10), (10, 50)]) +def test_unknown_labels_not_in_chunks(n_chunks, chunk_size): + """Checks that unknown labels are not assigned to any chunk.""" + labels = gen_labels_for_chunks(n_chunks, chunk_size) + + constraints = Constraints(labels) + 
chunks = constraints.chunks(n_chunks=n_chunks, chunk_size=chunk_size, + random_state=SEED) + + assert np.all(chunks[labels < 0] < 0) + + +@pytest.mark.parametrize("k_genuine, k_impostor, T_test", + [(2, 2, + [[0, 1, 3], [0, 1, 4], [0, 2, 3], [0, 2, 4], + [1, 0, 3], [1, 0, 4], [1, 2, 3], [1, 2, 4], + [2, 0, 3], [2, 0, 4], [2, 1, 3], [2, 1, 4], + [3, 4, 1], [3, 4, 2], [3, 5, 1], [3, 5, 2], + [4, 3, 1], [4, 3, 2], [4, 5, 1], [4, 5, 2], + [5, 3, 1], [5, 3, 2], [5, 4, 1], [5, 4, 2]]), + (1, 3, + [[0, 1, 3], [0, 1, 4], [0, 1, 5], [1, 0, 3], + [1, 0, 4], [1, 0, 5], [2, 1, 3], [2, 1, 4], + [2, 1, 5], [3, 4, 0], [3, 4, 1], [3, 4, 2], + [4, 3, 0], [4, 3, 1], [4, 3, 2], [5, 4, 0], + [5, 4, 1], [5, 4, 2]]), + (1, 2, + [[0, 1, 3], [0, 1, 4], [1, 0, 3], [1, 0, 4], + [2, 1, 3], [2, 1, 4], [3, 4, 1], [3, 4, 2], + [4, 3, 1], [4, 3, 2], [5, 4, 1], [5, 4, 2]])]) +def test_generate_knntriplets_under_edge(k_genuine, k_impostor, T_test): + """Checks under the edge cases of knn triplet construction with enough + neighbors""" + + X = np.array([[0, 0], [2, 2], [4, 4], [8, 8], [16, 16], [32, 32], [33, 33]]) + y = np.array([1, 1, 1, 2, 2, 2, -1]) + + T = Constraints(y).generate_knntriplets(X, k_genuine, k_impostor) + + assert np.array_equal(sorted(T.tolist()), T_test) + + +@pytest.mark.parametrize("k_genuine, k_impostor,", + [(3, 3), (2, 4), (3, 4), (10, 9), (144, 33)]) +def test_generate_knntriplets(k_genuine, k_impostor): + """Checks edge and over the edge cases of knn triplet construction with not + enough neighbors""" + + T_test = [[0, 1, 3], [0, 1, 4], [0, 1, 5], [0, 2, 3], [0, 2, 4], [0, 2, 5], + [1, 0, 3], [1, 0, 4], [1, 0, 5], [1, 2, 3], [1, 2, 4], [1, 2, 5], + [2, 0, 3], [2, 0, 4], [2, 0, 5], [2, 1, 3], [2, 1, 4], [2, 1, 5], + [3, 4, 0], [3, 4, 1], [3, 4, 2], [3, 5, 0], [3, 5, 1], [3, 5, 2], + [4, 3, 0], [4, 3, 1], [4, 3, 2], [4, 5, 0], [4, 5, 1], [4, 5, 2], + [5, 3, 0], [5, 3, 1], [5, 3, 2], [5, 4, 0], [5, 4, 1], [5, 4, 2]] + + X = np.array([[0, 0], [2, 2], [4, 4], [8, 8], [16, 16], [32, 32], [33, 33]]) + y = np.array([1, 1, 1, 2, 2, 2, -1]) + + msg1 = ("The class 1 has 3 elements, which is not sufficient to " + f"generate {k_genuine+1} genuine neighbors " + "as specified by k_genuine") + msg2 = ("The class 2 has 3 elements, which is not sufficient to " + f"generate {k_genuine+1} genuine neighbors " + "as specified by k_genuine") + msg3 = ("The class 1 has 3 elements of other classes, which is " + f"not sufficient to generate {k_impostor} impostor " + "neighbors as specified by k_impostor") + msg4 = ("The class 2 has 3 elements of other classes, which is " + f"not sufficient to generate {k_impostor} impostor " + "neighbors as specified by k_impostor") + msgs = [msg1, msg2, msg3, msg4] + with pytest.warns(UserWarning) as user_warning: + T = Constraints(y).generate_knntriplets(X, k_genuine, k_impostor) + assert any([[msg in str(warn.message) for msg in msgs] + for warn in user_warning]) + assert np.array_equal(sorted(T.tolist()), T_test) + + +def test_generate_knntriplets_k_genuine(): + """Checks the correct error raised when k_genuine is too big """ + X, y = shuffle(*make_blobs(random_state=SEED), + random_state=SEED) + + label, labels_count = np.unique(y, return_counts=True) + labels_count_min = np.min(labels_count) + idx_smallest_label, = np.where(labels_count == labels_count_min) + k_genuine = labels_count_min + + warn_msgs = [] + for idx in idx_smallest_label: + warn_msgs.append("The class {} has {} elements, which is not sufficient " + "to generate {} genuine neighbors as specified by " + "k_genuine. 
Will generate {} genuine neighbors instead." + "\n" + .format(label[idx], k_genuine, k_genuine+1, k_genuine-1)) + + with pytest.warns(UserWarning) as raised_warning: + Constraints(y).generate_knntriplets(X, k_genuine, 1) + for warn in raised_warning: + assert str(warn.message) in warn_msgs + + +def test_generate_knntriplets_k_impostor(): + """Checks the correct error raised when k_impostor is too big """ + X, y = shuffle(*make_blobs(random_state=SEED), + random_state=SEED) + + length = len(y) + label, labels_count = np.unique(y, return_counts=True) + labels_count_max = np.max(labels_count) + idx_biggest_label, = np.where(labels_count == labels_count_max) + k_impostor = length - labels_count_max + 1 + + warn_msgs = [] + for idx in idx_biggest_label: + warn_msgs.append("The class {} has {} elements of other classes, which is" + " not sufficient to generate {} impostor neighbors as " + "specified by k_impostor. Will generate {} impostor " + "neighbors instead.\n" + .format(label[idx], k_impostor-1, k_impostor, + k_impostor-1)) + + with pytest.warns(UserWarning) as raised_warning: + Constraints(y).generate_knntriplets(X, 1, k_impostor) + for warn in raised_warning: + assert str(warn.message) in warn_msgs diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index a9b2719e..246223b0 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -1,4 +1,3 @@ -import pytest import unittest import numpy as np from sklearn.datasets import load_iris @@ -30,47 +29,47 @@ def test_cov(self): def test_lsml_supervised(self): seed = np.random.RandomState(1234) - lsml = LSML_Supervised(num_constraints=200, random_state=seed) + lsml = LSML_Supervised(n_constraints=200, random_state=seed) lsml.fit(self.X, self.y) res_1 = lsml.transform(self.X) seed = np.random.RandomState(1234) - lsml = LSML_Supervised(num_constraints=200, random_state=seed) + lsml = LSML_Supervised(n_constraints=200, random_state=seed) res_2 = lsml.fit_transform(self.X, self.y) assert_array_almost_equal(res_1, res_2) def test_itml_supervised(self): seed = np.random.RandomState(1234) - itml = ITML_Supervised(num_constraints=200, random_state=seed) + itml = ITML_Supervised(n_constraints=200, random_state=seed) itml.fit(self.X, self.y) res_1 = itml.transform(self.X) seed = np.random.RandomState(1234) - itml = ITML_Supervised(num_constraints=200, random_state=seed) + itml = ITML_Supervised(n_constraints=200, random_state=seed) res_2 = itml.fit_transform(self.X, self.y) assert_array_almost_equal(res_1, res_2) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) lmnn.fit(self.X, self.y) res_1 = lmnn.transform(self.X) - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) res_2 = lmnn.fit_transform(self.X, self.y) assert_array_almost_equal(res_1, res_2) def test_sdml_supervised(self): seed = np.random.RandomState(1234) - sdml = SDML_Supervised(num_constraints=1500, balance_param=1e-5, + sdml = SDML_Supervised(n_constraints=1500, balance_param=1e-5, prior='identity', random_state=seed) sdml.fit(self.X, self.y) res_1 = sdml.transform(self.X) seed = np.random.RandomState(1234) - sdml = SDML_Supervised(num_constraints=1500, balance_param=1e-5, + sdml = SDML_Supervised(n_constraints=1500, balance_param=1e-5, prior='identity', random_state=seed) res_2 = sdml.fit_transform(self.X, self.y) @@ -78,11 +77,11 @@ def test_sdml_supervised(self): def test_nca(self): n = self.X.shape[0] - nca = 
NCA(max_iter=(100000//n)) + nca = NCA(max_iter=(100000 // n)) nca.fit(self.X, self.y) res_1 = nca.transform(self.X) - nca = NCA(max_iter=(100000//n)) + nca = NCA(max_iter=(100000 // n)) res_2 = nca.fit_transform(self.X, self.y) assert_array_almost_equal(res_1, res_2) @@ -100,13 +99,13 @@ def test_lfda(self): def test_rca_supervised(self): seed = np.random.RandomState(1234) - rca = RCA_Supervised(n_components=2, num_chunks=30, chunk_size=2, + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2, random_state=seed) rca.fit(self.X, self.y) res_1 = rca.transform(self.X) seed = np.random.RandomState(1234) - rca = RCA_Supervised(n_components=2, num_chunks=30, chunk_size=2, + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2, random_state=seed) res_2 = rca.fit_transform(self.X, self.y) @@ -124,12 +123,12 @@ def test_mlkr(self): def test_mmc_supervised(self): seed = np.random.RandomState(1234) - mmc = MMC_Supervised(num_constraints=200, random_state=seed) + mmc = MMC_Supervised(n_constraints=200, random_state=seed) mmc.fit(self.X, self.y) res_1 = mmc.transform(self.X) seed = np.random.RandomState(1234) - mmc = MMC_Supervised(num_constraints=200, random_state=seed) + mmc = MMC_Supervised(n_constraints=200, random_state=seed) res_2 = mmc.fit_transform(self.X, self.y) assert_array_almost_equal(res_1, res_2) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index a812d185..9378ac60 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -3,39 +3,62 @@ import pytest import numpy as np from numpy.linalg import LinAlgError -from numpy.testing import assert_array_almost_equal, assert_allclose +from numpy.testing import assert_array_almost_equal, assert_allclose, \ + assert_array_equal from scipy.spatial.distance import pdist, squareform, mahalanobis from scipy.stats import ortho_group from sklearn import clone from sklearn.cluster import DBSCAN -from sklearn.datasets import make_spd_matrix -from sklearn.utils import check_random_state +from sklearn.datasets import make_spd_matrix, make_blobs +from sklearn.utils import check_random_state, shuffle from sklearn.utils.multiclass import type_of_target -from sklearn.utils.testing import set_random_state +from metric_learn.sklearn_shims import set_random_state -from metric_learn._util import make_context +from metric_learn._util import make_context, _initialize_metric_mahalanobis +from metric_learn.sdml import _BaseSDML from metric_learn.base_metric import (_QuadrupletsClassifierMixin, + _TripletsClassifierMixin, _PairsClassifierMixin) from metric_learn.exceptions import NonPSDError from test.test_utils import (ids_metric_learners, metric_learners, - remove_y_quadruplets, ids_classifiers) + remove_y, ids_classifiers) RNG = check_random_state(0) @pytest.mark.parametrize('estimator, build_dataset', metric_learners, ids=ids_metric_learners) -def test_score_pairs_pairwise(estimator, build_dataset): +def test_pair_distance_pair_score_equivalent(estimator, build_dataset): + """ + For Mahalanobis learners, pair_score should be equivalent to the + opposite of the pair_distance result. 
+ """ + input_data, labels, _, X = build_dataset() + n_samples = 20 + X = X[:n_samples] + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + + distances = model.pair_distance(np.array(list(product(X, X)))) + scores = model.pair_score(np.array(list(product(X, X)))) + + assert_array_equal(distances, -1 * scores) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_pair_distance_pairwise(estimator, build_dataset): # Computing pairwise scores should return a euclidean distance matrix. input_data, labels, _, X = build_dataset() n_samples = 20 X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) - pairwise = model.score_pairs(np.array(list(product(X, X))))\ + pairwise = model.pair_distance(np.array(list(product(X, X))))\ .reshape(n_samples, n_samples) check_is_distance_matrix(pairwise) @@ -50,52 +73,52 @@ def test_score_pairs_pairwise(estimator, build_dataset): @pytest.mark.parametrize('estimator, build_dataset', metric_learners, ids=ids_metric_learners) -def test_score_pairs_toy_example(estimator, build_dataset): - # Checks that score_pairs works on a toy example +def test_pair_distance_toy_example(estimator, build_dataset): + # Checks that pair_distance works on a toy example input_data, labels, _, X = build_dataset() n_samples = 20 X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) pairs = np.stack([X[:10], X[10:20]], axis=1) embedded_pairs = pairs.dot(model.components_.T) distances = np.sqrt(np.sum((embedded_pairs[:, 1] - embedded_pairs[:, 0])**2, axis=-1)) - assert_array_almost_equal(model.score_pairs(pairs), distances) + assert_array_almost_equal(model.pair_distance(pairs), distances) @pytest.mark.parametrize('estimator, build_dataset', metric_learners, ids=ids_metric_learners) -def test_score_pairs_finite(estimator, build_dataset): +def test_pair_distance_finite(estimator, build_dataset): # tests that the score is finite input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) pairs = np.array(list(product(X, X))) - assert np.isfinite(model.score_pairs(pairs)).all() + assert np.isfinite(model.pair_distance(pairs)).all() @pytest.mark.parametrize('estimator, build_dataset', metric_learners, ids=ids_metric_learners) -def test_score_pairs_dim(estimator, build_dataset): +def test_pair_distance_dim(estimator, build_dataset): # scoring of 3D arrays should return 1D array (several tuples), # and scoring of 2D arrays (one tuple) should return an error (like # scikit-learn's error when scoring 1D arrays) input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) tuples = np.array(list(product(X, X))) - assert model.score_pairs(tuples).shape == (tuples.shape[0],) + assert model.pair_distance(tuples).shape == (tuples.shape[0],) context = make_context(estimator) msg = ("3D array of formed tuples expected{}. Found 2D array " "instead:\ninput={}. 
Reshape your data and/or use a preprocessor.\n" .format(context, tuples[1])) with pytest.raises(ValueError) as raised_error: - model.score_pairs(tuples[1]) + model.pair_distance(tuples[1]) assert str(raised_error.value) == msg @@ -118,7 +141,7 @@ def test_embed_toy_example(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) embedded_points = X.dot(model.components_.T) assert_array_almost_equal(model.transform(X), embedded_points) @@ -130,7 +153,7 @@ def test_embed_dim(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert model.transform(X).shape == X.shape # assert that ValueError is thrown if input shape is 1D @@ -139,12 +162,12 @@ def test_embed_dim(estimator, build_dataset): "instead:\ninput={}. Reshape your data and/or use a " "preprocessor.\n".format(context, X[0])) with pytest.raises(ValueError) as raised_error: - model.score_pairs(model.transform(X[0, :])) + model.pair_distance(model.transform(X[0, :])) assert str(raised_error.value) == err_msg # we test that the shape is also OK when doing dimensionality reduction if hasattr(model, 'n_components'): model.set_params(n_components=2) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert model.transform(X).shape == (X.shape[0], 2) # assert that ValueError is thrown if input shape is 1D with pytest.raises(ValueError) as raised_error: @@ -159,7 +182,7 @@ def test_embed_finite(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert np.isfinite(model.transform(X)).all() @@ -170,7 +193,7 @@ def test_embed_is_linear(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert_array_almost_equal(model.transform(X[:10] + X[10:20]), model.transform(X[:10]) + model.transform(X[10:20])) @@ -189,12 +212,11 @@ def test_get_metric_equivalent_to_explicit_mahalanobis(estimator, input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] a, b = (rng.randn(n_features), rng.randn(n_features)) - expected_dist = mahalanobis(a[None], b[None], - VI=model.get_mahalanobis_matrix()) + expected_dist = mahalanobis(a, b, VI=model.get_mahalanobis_matrix()) assert_allclose(metric(a, b), expected_dist, rtol=1e-13) @@ -208,7 +230,7 @@ def test_get_metric_is_pseudo_metric(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] @@ -225,24 +247,6 @@ def test_get_metric_is_pseudo_metric(estimator, build_dataset): np.isclose(metric(a, c), metric(a, b) + 
metric(b, c), rtol=1e-20)) -@pytest.mark.parametrize('estimator, build_dataset', metric_learners, - ids=ids_metric_learners) -def test_metric_raises_deprecation_warning(estimator, build_dataset): - """assert that a deprecation warning is raised if someones wants to call - the `metric` function""" - # TODO: remove this method in version 0.6.0 - input_data, labels, _, X = build_dataset() - model = clone(estimator) - set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) - - with pytest.warns(DeprecationWarning) as raised_warning: - model.metric() - assert (str(raised_warning[0].message) == - ("`metric` is deprecated since version 0.5.0 and will be removed " - "in 0.6.0. Use `get_mahalanobis_matrix` instead.")) - - @pytest.mark.parametrize('estimator, build_dataset', metric_learners, ids=ids_metric_learners) def test_get_metric_compatible_with_scikit_learn(estimator, build_dataset): @@ -251,7 +255,7 @@ def test_get_metric_compatible_with_scikit_learn(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) clustering = DBSCAN(metric=model.get_metric()) clustering.fit(X) @@ -264,7 +268,7 @@ def test_get_squared_metric(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] @@ -284,26 +288,35 @@ def test_components_is_2D(estimator, build_dataset): model = clone(estimator) set_random_state(model) # test that it works for X.shape[1] features - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert model.components_.shape == (X.shape[1], X.shape[1]) - # test that it works for 1 feature - trunc_data = input_data[..., :1] + if isinstance(estimator, _BaseSDML): + # SDML doesn't support running on a single feature. + return + + # test that it works for 1 feature. Use 2nd dimension, to avoid border cases + trunc_data = input_data[..., 1:2] # we drop duplicates that might have been formed, i.e. of the form # aabc or abcc or aabb for quadruplets, and aa for pairs. 
+ if isinstance(estimator, _QuadrupletsClassifierMixin): - for slice_idx in [slice(0, 2), slice(2, 4)]: - pairs = trunc_data[:, slice_idx, :] - diffs = pairs[:, 1, :] - pairs[:, 0, :] - to_keep = np.where(np.abs(diffs.ravel()) > 1e-9) - trunc_data = trunc_data[to_keep] - labels = labels[to_keep] + pairs_idx = [[0, 1], [2, 3]] + elif isinstance(estimator, _TripletsClassifierMixin): + pairs_idx = [[0, 1], [0, 2]] elif isinstance(estimator, _PairsClassifierMixin): - diffs = trunc_data[:, 1, :] - trunc_data[:, 0, :] - to_keep = np.where(np.abs(diffs.ravel()) > 1e-9) + pairs_idx = [[0, 1]] + else: + pairs_idx = [] + + for pair_idx in pairs_idx: + pairs = trunc_data[:, pair_idx, :] + diffs = pairs[:, 1, :] - pairs[:, 0, :] + to_keep = np.abs(diffs.ravel()) > 1e-9 trunc_data = trunc_data[to_keep] labels = labels[to_keep] - model.fit(*remove_y_quadruplets(estimator, trunc_data, labels)) + + model.fit(*remove_y(estimator, trunc_data, labels)) assert model.components_.shape == (1, 1) # the components must be 2D @@ -429,7 +442,7 @@ def test_auto_init_transformation(n_samples, n_features, n_classes, random_state=rng) # To make the test work for LMNN: if 'LMNN' in model_base.__class__.__name__: - model_base.set_params(k=1) + model_base.set_params(n_neighbors=1) # To make the test faster for estimators that have a max_iter: if hasattr(model_base, 'max_iter'): model_base.set_params(max_iter=1) @@ -515,12 +528,12 @@ def test_init_mahalanobis(estimator, build_dataset): model.fit(input_data, labels) # Initialize with a random spd matrix - init = make_spd_matrix(X.shape[1], random_state=rng) + init = make_spd_matrix(n_dim=X.shape[1], random_state=rng) model.set_params(**{param: init}) model.fit(input_data, labels) # init.shape[1] must match X.shape[1] - init = make_spd_matrix(X.shape[1] + 1, X.shape[1] + 1) + init = make_spd_matrix(n_dim=X.shape[1] + 1, random_state=rng) model.set_params(**{param: init}) msg = ('The input dimensionality {} of the given ' 'mahalanobis matrix `{}` must match the ' @@ -569,12 +582,12 @@ def test_init_mahalanobis(estimator, build_dataset): in zip(ids_metric_learners, metric_learners) if idml[:4] in ['ITML', 'SDML', 'LSML']]) -def test_singular_covariance_init_or_prior(estimator, build_dataset): +def test_singular_covariance_init_or_prior_strictpd(estimator, build_dataset): """Tests that when using the 'covariance' init or prior, it returns the appropriate error if the covariance matrix is singular, for algorithms that need a strictly PD prior or init (see - https://github.com/metric-learn/metric-learn/issues/202 and - https://github.com/metric-learn/metric-learn/pull/195#issuecomment + https://github.com/scikit-learn-contrib/metric-learn/issues/202 and + https://github.com/scikit-learn-contrib/metric-learn/pull/195#issuecomment -492332451) """ matrices_to_set = [] @@ -603,6 +616,48 @@ def test_singular_covariance_init_or_prior(estimator, build_dataset): assert str(raised_err.value) == msg +@pytest.mark.integration +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if idml[:3] in ['MMC']], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if idml[:3] in ['MMC']]) +def test_singular_covariance_init_of_non_strict_pd(estimator, build_dataset): + """Tests that when using the 'covariance' init or prior, it returns the + appropriate warning if the covariance matrix is singular, for algorithms + that don't need a strictly PD init. 
Also checks that the returned + inverse matrix has finite values + """ + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + # We create a feature that is a linear combination of the first two + # features: + input_data = np.concatenate([input_data, input_data[:, ..., :2].dot([[2], + [3]])], + axis=-1) + model.set_params(init='covariance') + msg = ('The covariance matrix is not invertible: ' + 'using the pseudo-inverse instead.' + 'To make the covariance matrix invertible' + ' you can remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + with pytest.warns(UserWarning) as raised_warning: + model.fit(input_data, labels) + assert any([str(warning.message) == msg for warning in raised_warning]) + M, _ = _initialize_metric_mahalanobis(X, init='covariance', + random_state=RNG, + return_inverse=True, + strict_pd=False) + assert np.isfinite(M).all() + + @pytest.mark.integration @pytest.mark.parametrize('estimator, build_dataset', [(ml, bd) for idml, (ml, bd) @@ -614,12 +669,12 @@ def test_singular_covariance_init_or_prior(estimator, build_dataset): metric_learners) if idml[:4] in ['ITML', 'SDML', 'LSML']]) @pytest.mark.parametrize('w0', [1e-20, 0., -1e-20]) -def test_singular_array_init_or_prior(estimator, build_dataset, w0): +def test_singular_array_init_or_prior_strictpd(estimator, build_dataset, w0): """Tests that when using a custom array init (or prior), it returns the appropriate error if it is singular, for algorithms that need a strictly PD prior or init (see - https://github.com/metric-learn/metric-learn/issues/202 and - https://github.com/metric-learn/metric-learn/pull/195#issuecomment + https://github.com/scikit-learn-contrib/metric-learn/issues/202 and + https://github.com/scikit-learn-contrib/metric-learn/pull/195#issuecomment -492332451) """ matrices_to_set = [] @@ -654,6 +709,31 @@ def test_singular_array_init_or_prior(estimator, build_dataset, w0): assert str(raised_err.value) == msg +@pytest.mark.parametrize('w0', [1e-20, 0., -1e-20]) +def test_singular_array_init_of_non_strict_pd(w0): + """Tests that when using a custom array init, it returns the + appropriate warning if it is singular. Also checks if the returned + inverse matrix is finite. This isn't checked for model fitting as no + model curently uses this setting. 
+ """ + rng = np.random.RandomState(42) + X, y = shuffle(*make_blobs(random_state=rng), + random_state=rng) + P = ortho_group.rvs(X.shape[1], random_state=rng) + w = np.abs(rng.randn(X.shape[1])) + w[0] = w0 + M = P.dot(np.diag(w)).dot(P.T) + msg = ('The initialization matrix is not invertible: ' + 'using the pseudo-inverse instead.') + with pytest.warns(UserWarning) as raised_warning: + _, M_inv = _initialize_metric_mahalanobis(X, init=M, + random_state=rng, + return_inverse=True, + strict_pd=False) + assert str(raised_warning[0].message) == msg + assert np.isfinite(M_inv).all() + + @pytest.mark.integration @pytest.mark.parametrize('estimator, build_dataset', metric_learners, ids=ids_metric_learners) @@ -668,9 +748,9 @@ def test_deterministic_initialization(estimator, build_dataset): model.set_params(prior='random') model1 = clone(model) set_random_state(model1, 42) - model1 = model1.fit(input_data, labels) + model1 = model1.fit(*remove_y(model, input_data, labels)) model2 = clone(model) set_random_state(model2, 42) - model2 = model2.fit(input_data, labels) + model2 = model2.fit(*remove_y(model, input_data, labels)) np.testing.assert_allclose(model1.get_mahalanobis_matrix(), model2.get_mahalanobis_matrix()) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index affc70f6..bfedefea 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -1,7 +1,6 @@ -from __future__ import division - from functools import partial +import warnings import pytest from numpy.testing import assert_array_equal from scipy.spatial.distance import euclidean @@ -13,7 +12,7 @@ from sklearn.model_selection import train_test_split from test.test_utils import pairs_learners, ids_pairs_learners -from sklearn.utils.testing import set_random_state +from metric_learn.sklearn_shims import set_random_state from sklearn import clone import numpy as np from itertools import product @@ -51,14 +50,14 @@ def test_predict_monotonous(estimator, build_dataset, pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, labels) estimator.fit(pairs_train, y_train) - distances = estimator.score_pairs(pairs_test) + scores = estimator.pair_score(pairs_test) predictions = estimator.predict(pairs_test) - min_dissimilar = np.min(distances[predictions == -1]) - max_similar = np.max(distances[predictions == 1]) - assert max_similar <= min_dissimilar - separator = np.mean([min_dissimilar, max_similar]) - assert (predictions[distances > separator] == -1).all() - assert (predictions[distances < separator] == 1).all() + max_dissimilar = np.max(scores[predictions == -1]) + min_similar = np.min(scores[predictions == 1]) + assert max_dissimilar <= min_similar + separator = np.mean([max_dissimilar, min_similar]) + assert (predictions[scores < separator] == -1).all() + assert (predictions[scores > separator] == 1).all() @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -66,12 +65,31 @@ def test_predict_monotonous(estimator, build_dataset, ids=ids_pairs_learners) def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, with_preprocessor): - """Test that a NotFittedError is raised if someone tries to predict and - the metric learner has not been fitted.""" + """Test that a NotFittedError is raised if someone tries to use + pair_score, score_pairs, decision_function, get_metric, transform or + get_mahalanobis_matrix on input data and the metric learner + has not been fitted.""" input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = 
clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) + with pytest.raises(NotFittedError): # Remove in 0.8.0 + estimator.score_pairs(input_data) + with pytest.raises(NotFittedError): + estimator.pair_score(input_data) + with pytest.raises(NotFittedError): + estimator.decision_function(input_data) + with pytest.raises(NotFittedError): + estimator.get_metric() + with pytest.raises(NotFittedError): + estimator.transform(input_data) + with pytest.raises(NotFittedError): + estimator.get_mahalanobis_matrix() + with pytest.raises(NotFittedError): + estimator.calibrate_threshold(input_data, labels) + + with pytest.raises(NotFittedError): + estimator.set_threshold(0.5) with pytest.raises(NotFittedError): estimator.predict(input_data) @@ -119,7 +137,7 @@ def test_threshold_different_scores_is_finite(estimator, build_dataset, estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) estimator.fit(input_data, labels) - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: estimator.calibrate_threshold(input_data, labels, **kwargs) assert len(record) == 0 @@ -133,10 +151,26 @@ def fit(self, pairs, y): pairs, y = self._prepare_inputs(pairs, y, type_of_inputs='tuples') self.components_ = np.atleast_2d(np.identity(pairs.shape[2])) - self.threshold_ = 'I am not set.' + # self.threshold_ is not set. return self +def test_unset_threshold(): + """Tests that the "threshold is unset" error is raised when using predict + (performs binary classification on pairs) with an unset threshold.""" + identity_pairs_classifier = IdentityPairsClassifier() + pairs = np.array([[[0.], [1.]], [[1.], [3.]], [[2.], [5.]], [[3.], [7.]]]) + y = np.array([1, 1, -1, -1]) + identity_pairs_classifier.fit(pairs, y) + with pytest.raises(AttributeError) as e: + identity_pairs_classifier.predict(pairs) + + expected_msg = ("A threshold for this estimator has not been set, " + "call its set_threshold or calibrate_threshold method.") + + assert str(e.value) == expected_msg + + def test_set_threshold(): # test that set_threshold indeed sets the threshold identity_pairs_classifier = IdentityPairsClassifier() @@ -147,6 +181,25 @@ def test_set_threshold(): assert identity_pairs_classifier.threshold_ == 0.5 +@pytest.mark.parametrize('value', ["ABC", None, [1, 2, 3], {'key': None}, + (1, 2), set(), + np.array([[[0.], [1.]], [[1.], [3.]]])]) +def test_set_wrong_type_threshold(value): + """ + Test that `set_threshold` indeed sets the threshold + and cannot accept nothing but float or integers, but + being permissive with boolean True=1.0 and False=0.0 + """ + model = IdentityPairsClassifier() + model.fit(np.array([[[0.], [1.]]]), np.array([1])) + msg = ('Parameter threshold must be a real number. 
' + 'Got {} instead.'.format(type(value))) + + with pytest.raises(ValueError) as e: # String + model.set_threshold(value) + assert str(e.value).startswith(msg) + + def test_f_beta_1_is_f_1(): # test that putting beta to 1 indeed finds the best threshold to optimize # the f1_score @@ -331,7 +384,7 @@ def test_calibrate_threshold_valid_parameters(valid_args): pairs, y = rng.randn(20, 2, 5), rng.choice([-1, 1], size=20) pairs_learner = IdentityPairsClassifier() pairs_learner.fit(pairs, y) - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: pairs_learner.calibrate_threshold(pairs, y, **valid_args) assert len(record) == 0 @@ -347,6 +400,7 @@ class MockBadPairsClassifier(MahalanobisMixin, _PairsClassifierMixin): """ def fit(self, pairs, y, calibration_params=None): + self.preprocessor_ = 'not used' self.components_ = 'not used' self.calibrate_threshold(pairs, y, **(calibration_params if calibration_params is not None else @@ -465,7 +519,7 @@ def test_validate_calibration_params_valid_parameters( # test that no warning message is returned if valid arguments are given to # _validate_calibration_params for all pairs metric learners, as well as # a mocking example, and the class itself - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: estimator._validate_calibration_params(**valid_args) assert len(record) == 0 diff --git a/test/test_quadruplets_classifiers.py b/test/test_quadruplets_classifiers.py index efe10030..a8319961 100644 --- a/test/test_quadruplets_classifiers.py +++ b/test/test_quadruplets_classifiers.py @@ -3,7 +3,7 @@ from sklearn.model_selection import train_test_split from test.test_utils import quadruplets_learners, ids_quadruplets_learners -from sklearn.utils.testing import set_random_state +from metric_learn.sklearn_shims import set_random_state from sklearn import clone import numpy as np diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index b2056c09..798d9036 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -4,80 +4,85 @@ from sklearn.base import TransformerMixin from sklearn.pipeline import make_pipeline from sklearn.utils import check_random_state -from sklearn.utils.estimator_checks import is_public_parameter -from sklearn.utils.testing import (assert_allclose_dense_sparse, - set_random_state) - +from metric_learn.sklearn_shims import (assert_allclose_dense_sparse, + set_random_state, _get_args, + is_public_parameter, get_scorer) from metric_learn import (Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, - MMC_Supervised, RCA_Supervised, SDML_Supervised) + MMC_Supervised, RCA_Supervised, SDML_Supervised, + SCML_Supervised) from sklearn import clone import numpy as np from sklearn.model_selection import (cross_val_score, cross_val_predict, train_test_split, KFold) -from sklearn.metrics.scorer import get_scorer -from sklearn.utils.testing import _get_args from test.test_utils import (metric_learners, ids_metric_learners, mock_preprocessor, tuples_learners, ids_tuples_learners, pairs_learners, - ids_pairs_learners, remove_y_quadruplets, - quadruplets_learners) + ids_pairs_learners, remove_y, + metric_learners_pipeline, + ids_metric_learners_pipeline) class Stable_RCA_Supervised(RCA_Supervised): - def __init__(self, n_components=None, pca_comps=None, + def __init__(self, n_components=None, chunk_size=2, preprocessor=None, random_state=None): # this init makes RCA stable for scikit-learn examples. 
super(Stable_RCA_Supervised, self).__init__( - num_chunks=2, n_components=n_components, pca_comps=pca_comps, + n_chunks=2, n_components=n_components, chunk_size=chunk_size, preprocessor=preprocessor, random_state=random_state) class Stable_SDML_Supervised(SDML_Supervised): - def __init__(self, sparsity_param=0.01, num_labeled='deprecated', - num_constraints=None, verbose=False, preprocessor=None, + def __init__(self, sparsity_param=0.01, + n_constraints=None, verbose=False, preprocessor=None, random_state=None): # this init makes SDML stable for scikit-learn examples. super(Stable_SDML_Supervised, self).__init__( - sparsity_param=sparsity_param, num_labeled=num_labeled, - num_constraints=num_constraints, verbose=verbose, + sparsity_param=sparsity_param, + n_constraints=n_constraints, verbose=verbose, preprocessor=preprocessor, balance_param=1e-5, prior='identity', random_state=random_state) class TestSklearnCompat(unittest.TestCase): def test_covariance(self): - check_estimator(Covariance) + check_estimator(Covariance()) def test_lmnn(self): - check_estimator(LMNN) + check_estimator(LMNN()) def test_lfda(self): - check_estimator(LFDA) + check_estimator(LFDA()) def test_mlkr(self): - check_estimator(MLKR) + check_estimator(MLKR()) def test_nca(self): - check_estimator(NCA) + check_estimator(NCA()) def test_lsml(self): - check_estimator(LSML_Supervised) + check_estimator(LSML_Supervised()) def test_itml(self): - check_estimator(ITML_Supervised) + check_estimator(ITML_Supervised()) def test_mmc(self): - check_estimator(MMC_Supervised) + check_estimator(MMC_Supervised()) def test_sdml(self): - check_estimator(Stable_SDML_Supervised) + check_estimator(Stable_SDML_Supervised()) def test_rca(self): - check_estimator(Stable_RCA_Supervised) + check_estimator(Stable_RCA_Supervised()) + + def test_scml(self): + msg = "As no value for `n_basis` was selected, " + with pytest.warns(UserWarning) as raised_warning: + check_estimator(SCML_Supervised()) + assert msg in str(raised_warning[0].message) RNG = check_random_state(0) @@ -116,7 +121,8 @@ def test_array_like_inputs(estimator, build_dataset, with_preprocessor): # we subsample the data for the test to be more efficient input_data, _, labels, _ = train_test_split(input_data, labels, - train_size=20) + train_size=40, + random_state=42) X = X[:10] estimator = clone(estimator) @@ -125,8 +131,7 @@ def test_array_like_inputs(estimator, build_dataset, with_preprocessor): input_variants, label_variants = generate_array_like(input_data, labels) for input_variant in input_variants: for label_variant in label_variants: - estimator.fit(*remove_y_quadruplets(estimator, input_variant, - label_variant)) + estimator.fit(*remove_y(estimator, input_variant, label_variant)) if hasattr(estimator, "predict"): estimator.predict(input_variant) if hasattr(estimator, "predict_proba"): @@ -137,8 +142,7 @@ def test_array_like_inputs(estimator, build_dataset, with_preprocessor): estimator.decision_function(input_variant) if hasattr(estimator, "score"): for label_variant in label_variants: - estimator.score(*remove_y_quadruplets(estimator, input_variant, - label_variant)) + estimator.score(*remove_y(estimator, input_variant, label_variant)) X_variants, _ = generate_array_like(X) for X_variant in X_variants: @@ -146,8 +150,19 @@ def test_array_like_inputs(estimator, build_dataset, with_preprocessor): pairs = np.array([[X[0], X[1]], [X[0], X[2]]]) pairs_variants, _ = generate_array_like(pairs) + + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the 
message that says + # "This learner does not have pair_distance" + for pairs_variant in pairs_variants: - estimator.score_pairs(pairs_variant) + estimator.pair_score(pairs_variant) # All learners have pair_score + + # But not all of them will have pair_distance + try: + estimator.pair_distance(pairs_variant) + except Exception as raised_exception: + assert raised_exception.value.args[0] == not_implemented_msg @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -157,7 +172,7 @@ def test_various_scoring_on_tuples_learners(estimator, build_dataset, with_preprocessor): """Tests that scikit-learn's scoring returns something finite, for other scoring than default scoring. (List of scikit-learn's scores can be - found in sklearn.metrics.scorer). For each type of output (predict, + found in sklearn.metrics._scorer). For each type of output (predict, predict_proba, decision_function), we test a bunch of scores. We only test on pairs learners because quadruplets don't have a y argument. """ @@ -199,13 +214,10 @@ def test_cross_validation_is_finite(estimator, build_dataset): estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) assert np.isfinite(cross_val_score(estimator, - *remove_y_quadruplets(estimator, - input_data, - labels))).all() + *remove_y(estimator, input_data, labels) + )).all() assert np.isfinite(cross_val_predict(estimator, - *remove_y_quadruplets(estimator, - input_data, - labels) + *remove_y(estimator, input_data, labels) )).all() @@ -226,7 +238,7 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, n_splits = 3 kfold = KFold(shuffle=False, n_splits=n_splits) n_samples = input_data.shape[0] - fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int) + fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int64) fold_sizes[:n_samples % n_splits] += 1 current = 0 scores, predictions = [], np.zeros(input_data.shape[0]) @@ -237,28 +249,26 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, train_mask = np.ones(input_data.shape[0], bool) train_mask[test_slice] = False y_train, y_test = labels[train_mask], labels[test_slice] - estimator.fit(*remove_y_quadruplets(estimator, - input_data[train_mask], - y_train)) + estimator.fit(*remove_y(estimator, input_data[train_mask], y_train)) if hasattr(estimator, "score"): - scores.append(estimator.score(*remove_y_quadruplets( + scores.append(estimator.score(*remove_y( estimator, input_data[test_slice], y_test))) if hasattr(estimator, "predict"): predictions[test_slice] = estimator.predict(input_data[test_slice]) if hasattr(estimator, "score"): assert all(scores == cross_val_score( - estimator, *remove_y_quadruplets(estimator, input_data, labels), + estimator, *remove_y(estimator, input_data, labels), cv=kfold)) if hasattr(estimator, "predict"): assert all(predictions == cross_val_predict( estimator, - *remove_y_quadruplets(estimator, input_data, labels), + *remove_y(estimator, input_data, labels), cv=kfold)) def check_score(estimator, tuples, y): if hasattr(estimator, "score"): - score = estimator.score(*remove_y_quadruplets(estimator, tuples, y)) + score = estimator.score(*remove_y(estimator, tuples, y)) assert np.isfinite(score) @@ -282,7 +292,7 @@ def test_simple_estimator(estimator, build_dataset, with_preprocessor): estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) - estimator.fit(*remove_y_quadruplets(estimator, tuples_train, y_train)) + estimator.fit(*remove_y(estimator, tuples_train, y_train)) check_score(estimator, 
tuples_test, y_test) check_predict(estimator, tuples_test) @@ -329,62 +339,53 @@ def test_estimators_fit_returns_self(estimator, build_dataset, input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) - assert estimator.fit(*remove_y_quadruplets(estimator, - input_data, - labels)) is estimator + assert estimator.fit(*remove_y(estimator, input_data, labels)) is estimator @pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', metric_learners, - ids=ids_metric_learners) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners_pipeline, + ids=ids_metric_learners_pipeline) def test_pipeline_consistency(estimator, build_dataset, with_preprocessor): # Adapted from scikit learn # check that make_pipeline(est) gives same score as est - # we do this test on all except quadruplets (since they don't have a y - # in fit): - if estimator.__class__.__name__ not in [e.__class__.__name__ - for (e, _) in - quadruplets_learners]: - input_data, y, preprocessor, _ = build_dataset(with_preprocessor) - - def make_random_state(estimator, in_pipeline): - rs = {} - name_estimator = estimator.__class__.__name__ - if name_estimator[-11:] == '_Supervised': - name_param = 'random_state' - if in_pipeline: - name_param = name_estimator.lower() + '__' + name_param - rs[name_param] = check_random_state(0) - return rs - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - pipeline = make_pipeline(estimator) - estimator.fit(*remove_y_quadruplets(estimator, input_data, y), - **make_random_state(estimator, False)) - pipeline.fit(*remove_y_quadruplets(estimator, input_data, y), - **make_random_state(estimator, True)) - - if hasattr(estimator, 'score'): - result = estimator.score(*remove_y_quadruplets(estimator, - input_data, - y)) - result_pipe = pipeline.score(*remove_y_quadruplets(estimator, - input_data, - y)) - assert_allclose_dense_sparse(result, result_pipe) + input_data, y, preprocessor, _ = build_dataset(with_preprocessor) - if hasattr(estimator, 'predict'): - result = estimator.predict(input_data) - result_pipe = pipeline.predict(input_data) - assert_allclose_dense_sparse(result, result_pipe) + def make_random_state(estimator, in_pipeline): + rs = {} + name_estimator = estimator.__class__.__name__ + if name_estimator[-11:] == '_Supervised': + name_param = 'random_state' + if in_pipeline: + name_param = name_estimator.lower() + '__' + name_param + rs[name_param] = check_random_state(0) + return rs - if issubclass(estimator.__class__, TransformerMixin): - if hasattr(estimator, 'transform'): - result = estimator.transform(input_data) - result_pipe = pipeline.transform(input_data) - assert_allclose_dense_sparse(result, result_pipe) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor, + **make_random_state(estimator, False)) + pipeline = make_pipeline(estimator) + estimator.fit(input_data, y) + estimator.set_params(preprocessor=preprocessor) + pipeline.set_params(**make_random_state(estimator, True)) + pipeline.fit(input_data, y) + + if hasattr(estimator, 'score'): + result = estimator.score(input_data, y) + result_pipe = pipeline.score(input_data, y) + assert_allclose_dense_sparse(result, result_pipe) + + if hasattr(estimator, 'predict'): + result = estimator.predict(input_data) + result_pipe = pipeline.predict(input_data) + assert_allclose_dense_sparse(result, result_pipe) + + if 
issubclass(estimator.__class__, TransformerMixin): + if hasattr(estimator, 'transform'): + result = estimator.transform(input_data) + result_pipe = pipeline.transform(input_data) + assert_allclose_dense_sparse(result, result_pipe) @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -398,7 +399,7 @@ def test_dict_unchanged(estimator, build_dataset, with_preprocessor): estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "n_components"): estimator.n_components = 1 - estimator.fit(*remove_y_quadruplets(estimator, input_data, labels)) + estimator.fit(*remove_y(estimator, input_data, labels)) def check_dict(): assert estimator.__dict__ == dict_before, ( @@ -429,7 +430,7 @@ def test_dont_overwrite_parameters(estimator, build_dataset, estimator.n_components = 1 dict_before_fit = estimator.__dict__.copy() - estimator.fit(*remove_y_quadruplets(estimator, input_data, labels)) + estimator.fit(*remove_y(estimator, input_data, labels)) dict_after_fit = estimator.__dict__ public_keys_after_fit = [key for key in dict_after_fit.keys() diff --git a/test/test_triplets_classifiers.py b/test/test_triplets_classifiers.py new file mode 100644 index 00000000..515a0a33 --- /dev/null +++ b/test/test_triplets_classifiers.py @@ -0,0 +1,127 @@ +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split + +from metric_learn import SCML +from test.test_utils import ( + triplets_learners, + ids_triplets_learners, + build_triplets +) +from metric_learn.sklearn_shims import set_random_state +from sklearn import clone +import numpy as np +from numpy.testing import assert_array_equal + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_predict_only_one_or_minus_one(estimator, build_dataset, + with_preprocessor): + """Test that all predicted values are either +1 or -1""" + input_data, _, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + triplets_train, triplets_test = train_test_split(input_data) + estimator.fit(triplets_train) + predictions = estimator.predict(triplets_test) + + not_valid = [e for e in predictions if e not in [-1, 1]] + assert len(not_valid) == 0 + + +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_no_zero_prediction(estimator, build_dataset): + """ + Test that all predicted values are not zero, even when the + distances d(x,y) and d(x,z) are the same for a triplet of the + form (x, y, z), i.e. border cases. + """ + triplets, _, _, X = build_dataset(with_preprocessor=False) + # Force 3 dimensions only, to use cross product and get easy orthogonal vec.
+ triplets = np.array([[t[0][:3], t[1][:3], t[2][:3]] for t in triplets]) + X = X[:, :3] + # Dummy fit + estimator = clone(estimator) + set_random_state(estimator) + estimator.fit(triplets) + # We force the transformation to be identity, to force euclidean distance + estimator.components_ = np.eye(X.shape[1]) + + # Get two orthogonal vectors in respect to X[1] + k = X[1] / np.linalg.norm(X[1]) # Normalize first vector + x = X[2] - X[2].dot(k) * k # Get random orthogonal vector + x /= np.linalg.norm(x) # Normalize + y = np.cross(k, x) # Get orthogonal vector to x + # Assert these orthogonal vectors are different + with pytest.raises(AssertionError): + assert_array_equal(X[1], x) + with pytest.raises(AssertionError): + assert_array_equal(X[1], y) + # Assert the distance is the same for both + assert estimator.get_metric()(X[1], x) == estimator.get_metric()(X[1], y) + + # Form the three scenarios where predict() gives 0 with numpy.sign + triplets_test = np.array( # Critical examples + [[X[0], X[2], X[2]], + [X[1], X[1], X[1]], + [X[1], x, y]]) + # Predict + predictions = estimator.predict(triplets_test) + # Check there are no zero values + assert np.sum(predictions == 0) == 0 + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, + with_preprocessor): + """Test that a NotFittedError is raised if someone tries to predict and + the metric learner has not been fitted.""" + input_data, _, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + with pytest.raises(NotFittedError): + estimator.predict(input_data) + + +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_accuracy_toy_example(estimator, build_dataset): + """Test that the default scoring for triplets (accuracy) works on some + toy example""" + triplets, _, _, X = build_dataset(with_preprocessor=False) + estimator = clone(estimator) + set_random_state(estimator) + estimator.fit(triplets) + # We take the two first points and we build 4 regularly spaced points on the + # line they define, so that it's easy to build triplets of different + # similarities. + X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4 + + triplets_test = np.array( + [[X_test[0], X_test[2], X_test[1]], + [X_test[1], X_test[3], X_test[0]], + [X_test[1], X_test[2], X_test[3]], + [X_test[3], X_test[0], X_test[2]]]) + # we force the transformation to be identity so that we control what it does + estimator.components_ = np.eye(X.shape[1]) + assert estimator.score(triplets_test) == 0.25 + + +def test_raise_big_number_of_features(): + triplets, _, _, X = build_triplets(with_preprocessor=False) + triplets = triplets[:3, :, :] + estimator = SCML(n_basis=320) + set_random_state(estimator) + with pytest.raises(ValueError) as exc_info: + estimator.fit(triplets) + assert exc_info.value.args[0] == \ + "Number of features (4) is greater than the number of triplets(3)." \ + "\nConsider using dimensionality reduction or using another basis " \ + "generation scheme." 
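A minimal usage sketch of the triplets API exercised by the new test file above; it is not part of the patch. It assumes the patched metric-learn (SCML and Constraints.generate_knntriplets, as used elsewhere in this diff) and borrows the iris data purely for illustration: fit takes raw (anchor, positive, negative) triplets, predict returns +1 when the anchor ends up closer to the positive than to the negative point and -1 otherwise, and score is the fraction of correctly ordered triplets.

import numpy as np
from sklearn.datasets import load_iris
from metric_learn import SCML, Constraints

X, y = load_iris(return_X_y=True)
# Build (anchor, positive, negative) index triplets from the labels, then
# form the corresponding 3D array of samples, as build_triplets does below.
triplet_idx = Constraints(y).generate_knntriplets(X, k_genuine=3, k_impostor=4)
triplets = X[triplet_idx]

scml = SCML(n_basis=320, random_state=42).fit(triplets)
predictions = scml.predict(triplets)  # array of +1 / -1, one value per triplet
accuracy = scml.score(triplets)       # fraction of correctly ordered triplets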
diff --git a/test/test_utils.py b/test/test_utils.py index 37abb307..c0383792 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,24 +1,26 @@ +import warnings import pytest +from scipy.linalg import eigh, pinvh from collections import namedtuple import numpy as np from numpy.testing import assert_array_equal, assert_equal from sklearn.model_selection import train_test_split -from sklearn.exceptions import DataConversionWarning from sklearn.utils import check_random_state, shuffle -from sklearn.utils.testing import set_random_state +from metric_learn.sklearn_shims import set_random_state from sklearn.base import clone from metric_learn._util import (check_input, make_context, preprocess_tuples, make_name, preprocess_points, check_collapsed_pairs, validate_vector, _check_sdp_from_eigen, _check_n_components, check_y_valid_values_for_pairs, - _auto_select_init) + _auto_select_init, _pseudo_inverse_from_eig) from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised, - Constraints) + SCML, SCML_Supervised, Constraints) from metric_learn.base_metric import (ArrayIndexer, MahalanobisMixin, _PairsClassifierMixin, + _TripletsClassifierMixin, _QuadrupletsClassifierMixin) from metric_learn.exceptions import PreprocessorError, NonPSDError from sklearn.datasets import make_regression, make_blobs, load_iris @@ -59,11 +61,11 @@ def build_regression(with_preprocessor=False): def build_data(): input_data, labels = load_iris(return_X_y=True) X, y = shuffle(input_data, labels, random_state=SEED) - num_constraints = 50 + n_constraints = 50 constraints = Constraints(y) pairs = ( constraints - .positive_negative_pairs(num_constraints, same_length=True, + .positive_negative_pairs(n_constraints, same_length=True, random_state=check_random_state(SEED))) return X, pairs @@ -83,6 +85,19 @@ def build_pairs(with_preprocessor=False): return Dataset(X[c], target, None, X[c[:, 0]]) +def build_triplets(with_preprocessor=False): + input_data, labels = load_iris(return_X_y=True) + X, y = shuffle(input_data, labels, random_state=SEED) + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, k_genuine=3, k_impostor=4) + if with_preprocessor: + # if preprocessor, we build a 2D array of triplets of indices + return Dataset(triplets, np.ones(len(triplets)), X, np.arange(len(X))) + else: + # if not, we build a 3D array of triplets of samples + return Dataset(X[triplets], np.ones(len(triplets)), None, X) + + def build_quadruplets(with_preprocessor=False): # builds a toy quadruplets problem X, indices = build_data() @@ -103,6 +118,11 @@ def build_quadruplets(with_preprocessor=False): [learner for (learner, _) in quadruplets_learners])) +triplets_learners = [(SCML(n_basis=320), build_triplets)] +ids_triplets_learners = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + triplets_learners])) + pairs_learners = [(ITML(max_iter=2), build_pairs), # max_iter=2 to be faster (MMC(max_iter=2), build_pairs), # max_iter=2 to be faster (SDML(prior='identity', balance_param=1e-5), build_pairs)] @@ -118,9 +138,10 @@ def build_quadruplets(with_preprocessor=False): (ITML_Supervised(max_iter=5), build_classification), (LSML_Supervised(), build_classification), (MMC_Supervised(max_iter=5), build_classification), - (RCA_Supervised(num_chunks=5), build_classification), + (RCA_Supervised(n_chunks=5), build_classification), (SDML_Supervised(prior='identity', balance_param=1e-5), - 
build_classification)] + build_classification), + (SCML_Supervised(n_basis=80), build_classification)] ids_classifiers = list(map(lambda x: x.__class__.__name__, [learner for (learner, _) in classifiers])) @@ -130,10 +151,12 @@ def build_quadruplets(with_preprocessor=False): [learner for (learner, _) in regressors])) WeaklySupervisedClasses = (_PairsClassifierMixin, + _TripletsClassifierMixin, _QuadrupletsClassifierMixin) -tuples_learners = pairs_learners + quadruplets_learners -ids_tuples_learners = ids_pairs_learners + ids_quadruplets_learners +tuples_learners = pairs_learners + triplets_learners + quadruplets_learners +ids_tuples_learners = ids_pairs_learners + ids_triplets_learners \ + + ids_quadruplets_learners supervised_learners = classifiers + regressors ids_supervised_learners = ids_classifiers + ids_regressors @@ -141,14 +164,17 @@ def build_quadruplets(with_preprocessor=False): metric_learners = tuples_learners + supervised_learners ids_metric_learners = ids_tuples_learners + ids_supervised_learners +metric_learners_pipeline = pairs_learners + supervised_learners +ids_metric_learners_pipeline = ids_pairs_learners + ids_supervised_learners + -def remove_y_quadruplets(estimator, X, y): - """Quadruplets learners have no y in fit, but to write test for all - estimators, it is convenient to have this function, that will return X and y - if the estimator needs a y to fit on, and just X otherwise.""" +def remove_y(estimator, X, y): + """Quadruplets and triplets learners have no y in fit, but to write test for + all estimators, it is convenient to have this function, that will return X + and y if the estimator needs a y to fit on, and just X otherwise.""" + no_y_fit = quadruplets_learners + triplets_learners if estimator.__class__.__name__ in [e.__class__.__name__ - for (e, _) in - quadruplets_learners]: + for (e, _) in no_y_fit]: return (X,) else: return (X, y) @@ -328,7 +354,7 @@ def test_check_tuples_valid_tuple_size(tuple_size): checks that checking the number of tuples (pairs, quadruplets, etc) raises no warning if there is the right number of points in a tuple. 
""" - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: check_input(tuples_prep(), type_of_inputs='tuples', preprocessor=mock_preprocessor, tuple_size=tuple_size) check_input(tuples_no_prep(), type_of_inputs='tuples', preprocessor=None, @@ -353,7 +379,7 @@ def test_check_tuples_valid_tuple_size(tuple_size): [[2.6, 2.3], [3.4, 5.0]]])]) def test_check_tuples_valid_with_preprocessor(tuples): """Test that valid inputs when using a preprocessor raises no warning""" - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: check_input(tuples, type_of_inputs='tuples', preprocessor=mock_preprocessor) assert len(record) == 0 @@ -374,7 +400,7 @@ def test_check_tuples_valid_with_preprocessor(tuples): ((3, 1), (4, 4), (29, 4)))]) def test_check_tuples_valid_without_preprocessor(tuples): """Test that valid inputs when using no preprocessor raises no warning""" - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: check_input(tuples, type_of_inputs='tuples', preprocessor=None) assert len(record) == 0 @@ -383,12 +409,12 @@ def test_check_tuples_behaviour_auto_dtype(): """Checks that check_tuples allows by default every type if using a preprocessor, and numeric types if using no preprocessor""" tuples_prep = [['img1.png', 'img2.png'], ['img3.png', 'img5.png']] - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: check_input(tuples_prep, type_of_inputs='tuples', preprocessor=mock_preprocessor) assert len(record) == 0 - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: check_input(tuples_no_prep(), type_of_inputs='tuples') # numeric type assert len(record) == 0 @@ -524,7 +550,7 @@ def test_check_classic_invalid_dtype_not_convertible(preprocessor, points): [2.6, 2.3]])]) def test_check_classic_valid_with_preprocessor(points): """Test that valid inputs when using a preprocessor raises no warning""" - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: check_input(points, type_of_inputs='classic', preprocessor=mock_preprocessor) assert len(record) == 0 @@ -545,7 +571,7 @@ def test_check_classic_valid_with_preprocessor(points): (3, 1, 4, 4, 29, 4))]) def test_check_classic_valid_without_preprocessor(points): """Test that valid inputs when using no preprocessor raises no warning""" - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: check_input(points, type_of_inputs='classic', preprocessor=None) assert len(record) == 0 @@ -560,12 +586,12 @@ def test_check_classic_behaviour_auto_dtype(): """Checks that check_input (for points) allows by default every type if using a preprocessor, and numeric types if using no preprocessor""" points_prep = ['img1.png', 'img2.png', 'img3.png', 'img5.png'] - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: check_input(points_prep, type_of_inputs='classic', preprocessor=mock_preprocessor) assert len(record) == 0 - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: check_input(points_no_prep(), type_of_inputs='classic') # numeric type assert len(record) == 0 @@ -750,6 +776,8 @@ def test_array_like_indexer_array_like_valid_classic(input_data, indices): """Checks that any array-like is valid in the 'preprocessor' argument, and in the indices, for a classic input""" class MockMetricLearner(MahalanobisMixin): + def fit(self): + pass pass 
mock_algo = MockMetricLearner(preprocessor=input_data) @@ -764,6 +792,8 @@ def test_array_like_indexer_array_like_valid_tuples(input_data, indices): """Checks that any array-like is valid in the 'preprocessor' argument, and in the indices, for a classic input""" class MockMetricLearner(MahalanobisMixin): + def fit(self): + pass pass mock_algo = MockMetricLearner(preprocessor=input_data) @@ -792,13 +822,12 @@ def test_error_message_tuple_size(estimator, _): per tuple, it throws an error message""" estimator = clone(estimator) set_random_state(estimator) - invalid_pairs = np.array([[[1.3, 6.3], [3., 6.8], [6.5, 4.4]], - [[1.9, 5.3], [1., 7.8], [3.2, 1.2]]]) + invalid_pairs = np.ones((2, 5, 2)) y = [1, 1] with pytest.raises(ValueError) as raised_err: - estimator.fit(*remove_y_quadruplets(estimator, invalid_pairs, y)) - expected_msg = ("Tuples of {} element(s) expected{}. Got tuples of 3 " - "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" + estimator.fit(*remove_y(estimator, invalid_pairs, y)) + expected_msg = ("Tuples of {} element(s) expected{}. Got tuples of 5 " + "element(s) instead (shape=(2, 5, 2)):\ninput={}.\n" .format(estimator._tuple_size, make_context(estimator), invalid_pairs)) assert str(raised_err.value) == expected_msg @@ -806,9 +835,9 @@ def test_error_message_tuple_size(estimator, _): @pytest.mark.parametrize('estimator, _', metric_learners, ids=ids_metric_learners) -def test_error_message_t_score_pairs(estimator, _): - """tests that if you want to score_pairs on triplets for instance, it returns - the right error message +def test_error_message_t_pair_distance_or_score(estimator, _): + """Tests that if you want to pair_distance or pair_score on triplets + for instance, it returns the right error message """ estimator = clone(estimator) set_random_state(estimator) @@ -816,12 +845,22 @@ def test_error_message_t_score_pairs(estimator, _): triplets = np.array([[[1.3, 6.3], [3., 6.8], [6.5, 4.4]], [[1.9, 5.3], [1., 7.8], [3.2, 1.2]]]) with pytest.raises(ValueError) as raised_err: - estimator.score_pairs(triplets) + estimator.pair_score(triplets) expected_msg = ("Tuples of 2 element(s) expected{}. 
Got tuples of 3 " "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" .format(make_context(estimator), triplets)) assert str(raised_err.value) == expected_msg + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have pair_distance" + + # One exception will trigger for sure + with pytest.raises(Exception) as raised_exception: + estimator.pair_distance(triplets) + err_value = raised_exception.value.args[0] + assert err_value == expected_msg or err_value == not_implemented_msg + def test_preprocess_tuples_simple_example(): """Test the preprocessor on a very simple example of tuples to ensure the @@ -872,35 +911,21 @@ def test_same_with_or_without_preprocessor(estimator, build_dataset): dataset_formed.data, random_state=SEED) - def make_random_state(estimator): - rs = {} - if estimator.__class__.__name__[-11:] == '_Supervised': - rs['random_state'] = check_random_state(SEED) - return rs - estimator_with_preprocessor = clone(estimator) set_random_state(estimator_with_preprocessor) estimator_with_preprocessor.set_params(preprocessor=X) - estimator_with_preprocessor.fit(*remove_y_quadruplets(estimator, - indices_train, - y_train), - **make_random_state(estimator)) + estimator_with_preprocessor.fit(*remove_y(estimator, indices_train, y_train)) estimator_without_preprocessor = clone(estimator) set_random_state(estimator_without_preprocessor) estimator_without_preprocessor.set_params(preprocessor=None) - estimator_without_preprocessor.fit(*remove_y_quadruplets(estimator, - formed_train, - y_train), - **make_random_state(estimator)) + estimator_without_preprocessor.fit(*remove_y(estimator, formed_train, + y_train)) estimator_with_prep_formed = clone(estimator) set_random_state(estimator_with_prep_formed) estimator_with_prep_formed.set_params(preprocessor=X) - estimator_with_prep_formed.fit(*remove_y_quadruplets(estimator, - indices_train, - y_train), - **make_random_state(estimator)) + estimator_with_prep_formed.fit(*remove_y(estimator, indices_train, y_train)) # test prediction methods for method in ["predict", "decision_function"]: @@ -916,31 +941,59 @@ def make_random_state(estimator): method)(formed_test) assert np.array(output_with_prep == output_with_prep_formed).all() - # test score_pairs - output_with_prep = estimator_with_preprocessor.score_pairs( - indicators_to_transform[[[[0, 2], [5, 3]]]]) - output_without_prep = estimator_without_preprocessor.score_pairs( - formed_points_to_transform[[[[0, 2], [5, 3]]]]) + # Test pair_score, all learners have it. 
+ idx1 = np.array([[0, 2], [5, 3]], dtype=int) + output_with_prep = estimator_with_preprocessor.pair_score( + indicators_to_transform[idx1]) + output_without_prep = estimator_without_preprocessor.pair_score( + formed_points_to_transform[idx1]) assert np.array(output_with_prep == output_without_prep).all() - output_with_prep = estimator_with_preprocessor.score_pairs( - indicators_to_transform[[[[0, 2], [5, 3]]]]) - output_without_prep = estimator_with_prep_formed.score_pairs( - formed_points_to_transform[[[[0, 2], [5, 3]]]]) + output_with_prep = estimator_with_preprocessor.pair_score( + indicators_to_transform[idx1]) + output_without_prep = estimator_with_prep_formed.pair_score( + formed_points_to_transform[idx1]) assert np.array(output_with_prep == output_without_prep).all() - # test transform - output_with_prep = estimator_with_preprocessor.transform( - indicators_to_transform) - output_without_prep = estimator_without_preprocessor.transform( - formed_points_to_transform) - assert np.array(output_with_prep == output_without_prep).all() - - output_with_prep = estimator_with_preprocessor.transform( - indicators_to_transform) - output_without_prep = estimator_with_prep_formed.transform( - formed_points_to_transform) - assert np.array(output_with_prep == output_without_prep).all() + # Test pair_distance + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have pair_distance" + try: + output_with_prep = estimator_with_preprocessor.pair_distance( + indicators_to_transform[idx1]) + output_without_prep = estimator_without_preprocessor.pair_distance( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.pair_distance( + indicators_to_transform[idx1]) + output_without_prep = estimator_with_prep_formed.pair_distance( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + except Exception as raised_exception: + assert raised_exception.value.args[0] == not_implemented_msg + + # Test transform + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have transform" + try: + output_with_prep = estimator_with_preprocessor.transform( + indicators_to_transform) + output_without_prep = estimator_without_preprocessor.transform( + formed_points_to_transform) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.transform( + indicators_to_transform) + output_without_prep = estimator_with_prep_formed.transform( + formed_points_to_transform) + assert np.array(output_with_prep == output_without_prep).all() + + except Exception as raised_exception: + assert raised_exception.value.args[0] == not_implemented_msg def test_check_collapsed_pairs_raises_no_error(): @@ -964,6 +1017,7 @@ def test_check_collapsed_pairs_raises_error(): "the same as the right element), out of 3 pairs in" " total.") + def test__validate_vector(): """Replica of scipy.spatial.tests.test_distance.test__validate_vector""" x = [1, 2, 3] @@ -1040,6 +1094,53 @@ def test__check_sdp_from_eigen_returns_definiteness(w, is_definite): assert _check_sdp_from_eigen(w) == is_definite +@pytest.mark.unit +@pytest.mark.parametrize('w, tol, is_definite', + [(np.array([5., 3.]), 2, True), + (np.array([5., 1.]), 2, False), + (np.array([5., -1.]), 2, False)]) +def test__check_sdp_from_eigen_tol_psd(w, tol, 
is_definite): + """Tests that _check_sdp_from_eigen, for PSD matrices, returns + False if an eigenvalue is lower than tol""" + assert _check_sdp_from_eigen(w, tol=tol) == is_definite + + +@pytest.mark.unit +@pytest.mark.parametrize('w, tol', + [(np.array([5., -3.]), 2), + (np.array([1., -3.]), 2)]) +def test__check_sdp_from_eigen_tol_non_psd(w, tol): + """Tests that _check_sdp_from_eigen raises a NonPSDError + when there is a negative value with abs value higher than tol""" + with pytest.raises(NonPSDError): + _check_sdp_from_eigen(w, tol=tol) + + +@pytest.mark.unit +@pytest.mark.parametrize('w, is_definite', + [(np.array([1e5, 1e5, 1e5, 1e5, + 1e5, 1e5, 1e-20]), False), + (np.array([1e-10, 1e-10]), True)]) +def test__check_sdp_from_eigen_tol_default_psd(w, is_definite): + """Tests that the default tol argument gives good results for edge cases + like even if the determinant is high but clearly one eigenvalue is low + (indefinite so returns False) or when all eigenvalues are low (definite so + returns True)""" + assert _check_sdp_from_eigen(w, tol=None) == is_definite + + +@pytest.mark.unit +@pytest.mark.parametrize('w', + [np.array([1., -1.]), + np.array([-1e-10, 1e-10])]) +def test__check_sdp_from_eigen_tol_default_non_psd(w): + """Tests that the default tol argument is good for raising + NonPSDError, e.g. that when a value is clearly relatively + negative it raises such an error""" + with pytest.raises(NonPSDError): + _check_sdp_from_eigen(w, tol=None) + + def test__check_n_components(): """Checks that n_components returns what is expected (including the errors)""" @@ -1146,3 +1247,27 @@ def test__auto_select_init(has_classes, n_features, n_samples, n_components, """Checks that the auto selection of the init works as expected""" assert (_auto_select_init(has_classes, n_features, n_samples, n_components, n_classes) == result) + + +@pytest.mark.parametrize('w0', [1e-20, 0., -1e-20]) +def test_pseudo_inverse_from_eig_and_pinvh_singular(w0): + """Checks that _pseudo_inverse_from_eig returns the same result as + scipy.linalg.pinvh for a singular matrix""" + rng = np.random.RandomState(SEED) + A = rng.rand(100, 100) + A = A + A.T + w, V = eigh(A) + w[0] = w0 + A = V.dot(np.diag(w)).dot(V.T) + np.testing.assert_allclose(_pseudo_inverse_from_eig(w, V), pinvh(A), + rtol=1e-05) + + +def test_pseudo_inverse_from_eig_and_pinvh_nonsingular(): + """Checks that _pseudo_inverse_from_eig returns the same result as + scipy.linalg.pinvh for a non-singular matrix""" + rng = np.random.RandomState(SEED) + A = rng.rand(100, 100) + A = A + A.T + w, V = eigh(A, check_finite=False) + np.testing.assert_allclose(_pseudo_inverse_from_eig(w, V), pinvh(A))
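To close, a minimal sketch (not part of the patch) of what the two pseudo-inverse tests above compare: for a symmetric matrix A = V diag(w) V.T, a Moore-Penrose pseudo-inverse can be built by inverting only the eigenvalues that are significantly non-zero, which is essentially what scipy.linalg.pinvh does. The cutoff used below (1e-10) is purely illustrative and not necessarily the tolerance metric-learn's private helper applies.

import numpy as np
from scipy.linalg import eigh, pinvh

rng = np.random.RandomState(0)
A = rng.rand(6, 6)
A = A + A.T                       # symmetric test matrix
w, V = eigh(A)
w[0] = 0.                         # force singularity
A = V.dot(np.diag(w)).dot(V.T)

inv_w = np.zeros_like(w)
keep = np.abs(w) > 1e-10          # illustrative cutoff, not the library's
inv_w[keep] = 1. / w[keep]
pseudo_inverse = V.dot(np.diag(inv_w)).dot(V.T)

np.testing.assert_allclose(pseudo_inverse, pinvh(A), rtol=1e-5, atol=1e-8)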