diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 00000000..f01db0a4 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,27 @@ +ignore: + - "test" + +# taken from scikit-learn: +# https://github.com/scikit-learn/scikit-learn/blob/a7e17117bb15eb3f51ebccc1bd53e42fcb4e6cd8/.codecov.yml +comment: false + +coverage: + status: + project: + default: + # Commits pushed to master should not make the overall + # project coverage decrease by more than 1%: + target: auto + threshold: 1% + patch: + default: + # Be tolerant on slight code coverage diff on PRs to limit + # noisy red coverage status on github PRs. + # Note The coverage stats are still uploaded + # to codecov so that PR reviewers can see uncovered lines + # in the github diff if they install the codecov browser + # extension: + # https://github.com/codecov/browser-extension + target: auto + threshold: 1% + diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..ae757838 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,56 @@ +--- +name: Reproducible bug report +about: Create a reproducible bug report. Not for support requests. +labels: 'bug' +--- + +#### Description + + +#### Steps/Code to Reproduce + + +#### Expected Results + + +#### Actual Results + + +#### Versions + + + +--- + +**Message from the maintainers**: + +Impacted by this bug? Give it a 👍. We prioritise the issues with the most 👍. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..415acfcd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,18 @@ +blank_issues_enabled: false + +contact_links: + - name: Have you read the docs? 
+ url: http://contrib.scikit-learn.org/metric-learn/ + about: Much help can be found in the docs + - name: Ask a question + url: https://github.com/scikit-learn-contrib/metric-learn/discussions/new + about: Ask a question or start a discussion about metric-learn + - name: Stack Overflow + url: https://stackoverflow.com + about: Please ask and answer metric-learn usage questions (API, installation...) on Stack Overflow + - name: Cross Validated + url: https://stats.stackexchange.com + about: Please ask and answer metric learning questions (use cases, algorithms & theory...) on Cross Validated + - name: Blank issue + url: https://github.com/scikit-learn-contrib/metric-learn/issues/new + about: Please note that Github Discussions should be used in most cases instead diff --git a/.github/ISSUE_TEMPLATE/doc_improvement.md b/.github/ISSUE_TEMPLATE/doc_improvement.md new file mode 100644 index 00000000..753cf2f7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/doc_improvement.md @@ -0,0 +1,23 @@ +--- +name: Documentation improvement +about: Create a report to help us improve the documentation. Alternatively you can just open a pull request with the suggested change. +labels: Documentation +--- + +#### Describe the issue linked to the documentation + + + +#### Suggest a potential alternative/fix + + + +--- + +**Message from the maintainers**: + +Confused by this part of the doc too? Give it a 👍. We prioritise the issues with the most 👍. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/enhancement_proposal.md b/.github/ISSUE_TEMPLATE/enhancement_proposal.md new file mode 100644 index 00000000..01dfb1d7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/enhancement_proposal.md @@ -0,0 +1,18 @@ +--- +name: Enhancement proposal +about: Propose an enhancement for metric-learn +labels: 'enhancement' +--- +# Summary + +What change needs making? + +# Use Cases + +When would you use this? + +--- + +**Message from the maintainers**: + +Want to see this feature happen? Give it a 👍. 
We prioritise the issues with the most 👍. \ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 00000000..0935a109 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,42 @@ +name: CI + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the master branch + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + # Run normal testing with the latest versions of all dependencies + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + python-version: ['3.8', '3.9', '3.10', '3.11'] + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Run Tests without skggm + run: | + sudo apt-get install liblapack-dev + pip install --upgrade pip pytest + pip install wheel cython numpy scipy codecov pytest-cov scikit-learn + pytest test --cov + bash <(curl -s https://codecov.io/bash) + - name: Run Tests with skggm + env: + SKGGM_VERSION: a0ed406586c4364ea3297a658f415e13b5cbdaf8 + run: | + pip install git+https://github.com/skggm/skggm.git@${SKGGM_VERSION} + pytest test --cov + bash <(curl -s https://codecov.io/bash) + - name: Syntax checking with flake8 + run: | + pip install flake8 + flake8 --extend-ignore=E111,E114 --show-source; diff --git a/.gitignore b/.gitignore index 32ed7270..66eb3551 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,10 @@ build/ dist/ *.egg-info .coverage +htmlcov/ +.cache/ +.pytest_cache/ +doc/auto_examples/* +doc/generated/* +venv/ +.vscode/ diff --git a/.landscape.yml b/.landscape.yml new file mode 100644 index 00000000..ae342735 --- /dev/null +++ b/.landscape.yml @@ -0,0 +1,16 @@ +strictness: medium +pep8: + disable: + - E111 + - E114 + - E231 + - E225 + - E402 + - W503 +pylint: + disable: + - bad-indentation + - invalid-name + - too-many-arguments +ignore-paths: + 
- bench/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 378cc5f5..00000000 --- a/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -language: python -sudo: false -cache: pip -python: - - "2.7" - - "3.4" -before_install: - - pip install --upgrade pip - - pip install wheel - - pip install numpy scipy scikit-learn -script: python setup.py test diff --git a/README.rst b/README.rst index 38c088aa..b2f6e6d4 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,9 @@ -|Travis-CI Build Status| |License| |PyPI version| +|GitHub Actions Build Status| |License| |PyPI version| |Code coverage| -metric-learn -============= +metric-learn: Metric Learning in Python +======================================= -Metric Learning algorithms in Python. +metric-learn contains efficient Python implementations of several popular supervised and weakly-supervised metric learning algorithms. As part of `scikit-learn-contrib `_, the API of metric-learn is compatible with `scikit-learn `_, the leading library for machine learning in Python. This allows to use all the scikit-learn routines (for pipelining, model selection, etc) with metric learning algorithms through a unified interface. **Algorithms** @@ -11,60 +11,67 @@ Metric Learning algorithms in Python. 
- Information Theoretic Metric Learning (ITML) - Sparse Determinant Metric Learning (SDML) - Least Squares Metric Learning (LSML) +- Sparse Compositional Metric Learning (SCML) - Neighborhood Components Analysis (NCA) - Local Fisher Discriminant Analysis (LFDA) - Relative Components Analysis (RCA) +- Metric Learning for Kernel Regression (MLKR) +- Mahalanobis Metric for Clustering (MMC) **Dependencies** -- Python 2.7+, 3.4+ -- numpy, scipy, scikit-learn -- (for running the examples only: matplotlib) +- Python 3.6+ (the last version supporting Python 2 and Python 3.5 was + `v0.5.0 `_) +- numpy>= 1.11.0, scipy>= 0.17.0, scikit-learn>=0.21.3 -**Installation/Setup** +**Optional dependencies** -Run ``pip install metric-learn`` to download and install from PyPI. +- For SDML, using skggm will allow the algorithm to solve problematic cases + (install from commit `a0ed406 `_). + ``pip install 'git+https://github.com/skggm/skggm.git@a0ed406586c4364ea3297a658f415e13b5cbdaf8'`` to install the required version of skggm from GitHub. +- For running the examples only: matplotlib -Run ``python setup.py install`` for default installation. +**Installation/Setup** -Run ``python setup.py test`` to run all tests. +- If you use Anaconda: ``conda install -c conda-forge metric-learn``. See more options `here `_. -**Usage** +- To install from PyPI: ``pip install metric-learn``. -For full usage examples, see the `sphinx documentation`_. +- For a manual install of the latest code, download the source repository and run ``python setup.py install``. You may then run ``pytest test`` to run all tests (you will need to have the ``pytest`` package installed). -Each metric is a subclass of ``BaseMetricLearner``, which provides -default implementations for the methods ``metric``, ``transformer``, and -``transform``. Subclasses must provide an implementation for either -``metric`` or ``transformer``. 
+**Usage** -For an instance of a metric learner named ``foo`` learning from a set of -``d``-dimensional points, ``foo.metric()`` returns a ``d`` by ``d`` -matrix ``M`` such that a distance between vectors ``x`` and ``y`` is -expressed ``(x-y).dot(M).dot(x-y)``. +See the `sphinx documentation`_ for full documentation about installation, API, usage, and examples. -In the same scenario, ``foo.transformer()`` returns a ``d`` by ``d`` -matrix ``L`` such that a vector ``x`` can be represented in the learned -space as the vector ``x.dot(L.T)``. +**Citation** -For convenience, the function ``foo.transform(X)`` is provided for -converting a matrix of points (``X``) into the learned space, in which -standard Euclidean distance can be used. +If you use metric-learn in a scientific publication, we would appreciate +citations to the following paper: -**Notes** +`metric-learn: Metric Learning Algorithms in Python +`_, de Vazelhes +*et al.*, Journal of Machine Learning Research, 21(138):1-6, 2020. -If a recent version of the Shogun Python modular (``modshogun``) library -is available, the LMNN implementation will use the fast C++ version from -there. The two implementations differ slightly, and the C++ version is -more complete. +Bibtex entry:: + @article{metric-learn, + title = {metric-learn: {M}etric {L}earning {A}lgorithms in {P}ython}, + author = {{de Vazelhes}, William and {Carey}, CJ and {Tang}, Yuan and + {Vauquier}, Nathalie and {Bellet}, Aur{\'e}lien}, + journal = {Journal of Machine Learning Research}, + year = {2020}, + volume = {21}, + number = {138}, + pages = {1--6} + } -.. _sphinx documentation: http://all-umass.github.io/metric-learn/ +.. _sphinx documentation: http://contrib.scikit-learn.org/metric-learn/ -.. |Travis-CI Build Status| image:: https://api.travis-ci.org/all-umass/metric-learn.svg?branch=master - :target: https://travis-ci.org/all-umass/metric-learn +.. 
|GitHub Actions Build Status| image:: https://github.com/scikit-learn-contrib/metric-learn/workflows/CI/badge.svg + :target: https://github.com/scikit-learn-contrib/metric-learn/actions?query=event%3Apush+branch%3Amaster .. |License| image:: http://img.shields.io/:license-mit-blue.svg?style=flat :target: http://badges.mit-license.org .. |PyPI version| image:: https://badge.fury.io/py/metric-learn.svg :target: http://badge.fury.io/py/metric-learn - +.. |Code coverage| image:: https://codecov.io/gh/scikit-learn-contrib/metric-learn/branch/master/graph/badge.svg + :target: https://codecov.io/gh/scikit-learn-contrib/metric-learn diff --git a/bench/.gitignore b/bench/.gitignore new file mode 100644 index 00000000..824e23ac --- /dev/null +++ b/bench/.gitignore @@ -0,0 +1,4 @@ +results +env +metric-learn +html diff --git a/bench/asv.conf.json b/bench/asv.conf.json new file mode 100644 index 00000000..782d3ab2 --- /dev/null +++ b/bench/asv.conf.json @@ -0,0 +1,74 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "metric-learn", + + // The project's homepage + "project_url": "https://github.com/all-umass/metric-learn", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "tip" (for mercurial). + "branches": ["master"], // for git + // "branches": ["tip"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. 
+ // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "virtualenv", + + // the base URL to show a commit for the project. + "show_commit_url": "http://github.com/all-umass/metric-learn/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["2.7", "3.3"], + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty + // list indicates to just test against the default (latest) + // version. + "matrix": { + "numpy": ["1.12"], + "scipy": ["0.18"], + "scikit-learn": ["0.18"] + }, + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + // "env_dir": "env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + // "results_dir": "results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + // "html_dir": "html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache wheels of the recent builds in each + // environment, making them faster to install next time. This is + // number of builds to keep, per environment. 
+ "wheel_cache_size": 4 +} diff --git a/bench/benchmarks/__init__.py b/bench/benchmarks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bench/benchmarks/iris.py b/bench/benchmarks/iris.py new file mode 100644 index 00000000..05035085 --- /dev/null +++ b/bench/benchmarks/iris.py @@ -0,0 +1,31 @@ +import numpy as np +from sklearn.datasets import load_iris + +import metric_learn + +CLASSES = { + 'Covariance': metric_learn.Covariance(), + 'ITML_Supervised': metric_learn.ITML_Supervised(n_constraints=200), + 'LFDA': metric_learn.LFDA(k=2, dim=2), + 'LMNN': metric_learn.LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False), + 'LSML_Supervised': metric_learn.LSML_Supervised(n_constraints=200), + 'MLKR': metric_learn.MLKR(), + 'NCA': metric_learn.NCA(max_iter=700, n_components=2), + 'RCA_Supervised': metric_learn.RCA_Supervised(dim=2, n_chunks=30, + chunk_size=2), + 'SDML_Supervised': metric_learn.SDML_Supervised(n_constraints=1500) +} + + +class IrisDataset(object): + params = [sorted(CLASSES)] + param_names = ['alg'] + + def setup(self, alg): + iris_data = load_iris() + self.iris_points = iris_data['data'] + self.iris_labels = iris_data['target'] + + def time_fit(self, alg): + np.random.seed(5555) + CLASSES[alg].fit(self.iris_points, self.iris_labels) diff --git a/doc/_static/css/styles.css b/doc/_static/css/styles.css new file mode 100644 index 00000000..6d350ae4 --- /dev/null +++ b/doc/_static/css/styles.css @@ -0,0 +1,36 @@ +.hatnote { + border-color: #e1e4e5 ; + border-style: solid ; + border-width: 1px ; + font-size: x-small ; + font-style: italic ; + margin-left: auto ; + margin-right: auto ; + margin-bottom: 24px; + padding: 12px; +} +.hatnote-gray { + background-color: #f5f5f5 +} +.hatnote li { + list-style-type: square; + margin-left: 12px !important; +} +.hatnote ul { + list-style-type: square; + margin-left: 0px !important; + margin-bottom: 0px !important; +} +.deprecated { + color: #b94a48; + background-color: #F3E5E5; + border-color: 
#eed3d7; + margin-top: 0.5rem; + padding: 0.5rem; + border-radius: 0.5rem; + margin-bottom: 0.5rem; +} + +.deprecated p { + margin-bottom: 0 !important; +} \ No newline at end of file diff --git a/doc/_templates/class.rst b/doc/_templates/class.rst new file mode 100644 index 00000000..f0c1b5bc --- /dev/null +++ b/doc/_templates/class.rst @@ -0,0 +1,16 @@ +:mod:`{{module}}`.{{objname}} +{{ underline }}============== + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :members: + :undoc-members: + :inherited-members: + :special-members: __init__ + +.. include:: {{module}}.{{objname}}.examples + +.. raw:: html + +
diff --git a/doc/conf.py b/doc/conf.py index 5e3f2cd9..c472cc21 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +import sys +import os +import warnings extensions = [ 'sphinx.ext.autodoc', @@ -7,6 +10,9 @@ 'sphinx.ext.viewcode', 'sphinx.ext.mathjax', 'numpydoc', + 'sphinx_gallery.gen_gallery', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx' ] templates_path = ['_templates'] @@ -15,19 +21,62 @@ # General information about the project. project = u'metric-learn' -copyright = u'2015, CJ Carey and Yuan Tang' -author = u'CJ Carey and Yuan Tang' -version = '0.2.1' -release = '0.2.1' +copyright = (u'2015-2023, CJ Carey, Yuan Tang, William de Vazelhes, Aurélien ' + u'Bellet and Nathalie Vauquier') +author = (u'CJ Carey, Yuan Tang, William de Vazelhes, Aurélien Bellet and ' + u'Nathalie Vauquier') +version = '0.7.0' +release = '0.7.0' language = 'en' exclude_patterns = ['_build'] pygments_style = 'sphinx' todo_include_todos = True -numpydoc_show_class_members = False # Options for HTML output html_theme = 'sphinx_rtd_theme' html_static_path = ['_static'] htmlhelp_basename = 'metric-learndoc' +# Option to hide doctests comments in the documentation (like # doctest: +# +NORMALIZE_WHITESPACE for instance) +trim_doctest_flags = True + +# intersphinx configuration +intersphinx_mapping = { + 'python': ('https://docs.python.org/{.major}'.format( + sys.version_info), None), + 'numpy': ('https://docs.scipy.org/doc/numpy/', None), + 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), + 'scikit-learn': ('https://scikit-learn.org/stable/', None) +} + + +# sphinx-gallery configuration +sphinx_gallery_conf = { + # to generate mini-galleries at the end of each docstring in the API + # section: (see https://sphinx-gallery.github.io/configuration.html + # #references-to-examples) + 'doc_module': 'metric_learn', + 'backreferences_dir': os.path.join('generated'), +} + +# generate autosummary even if no references +autosummary_generate = True + + 
+# Temporary work-around for spacing problem between parameter and parameter +# type in the doc, see https://github.com/numpy/numpydoc/issues/215. The bug +# has been fixed in sphinx (https://github.com/sphinx-doc/sphinx/pull/5976) but +# through a change in sphinx basic.css except rtd_theme does not use basic.css. +# In an ideal world, this would get fixed in this PR: +# https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files +def setup(app): + app.add_js_file('js/copybutton.js') + app.add_css_file('css/styles.css') + + +# Remove matplotlib agg warnings from generated doc when using plt.show +warnings.filterwarnings("ignore", category=UserWarning, + message='Matplotlib is currently using agg, which is a' + ' non-GUI backend, so cannot show the figure.') diff --git a/doc/getting_started.rst b/doc/getting_started.rst new file mode 100644 index 00000000..90b7c7ee --- /dev/null +++ b/doc/getting_started.rst @@ -0,0 +1,47 @@ +############### +Getting started +############### + +Installation and Setup +====================== + +**Installation** + +metric-learn can be installed in either of the following ways: + +- If you use Anaconda: ``conda install -c conda-forge metric-learn``. See more options `here `_. + +- To install from PyPI: ``pip install metric-learn``. + +- For a manual install of the latest code, download the source repository and run ``python setup.py install``. You may then run ``pytest test`` to run all tests (you will need to have the ``pytest`` package installed). + +**Dependencies** + +- Python 3.6+ (the last version supporting Python 2 and Python 3.5 was + `v0.5.0 `_) +- numpy>= 1.11.0, scipy>= 0.17.0, scikit-learn>=0.21.3 + +**Optional dependencies** + +- For SDML, using skggm will allow the algorithm to solve problematic cases + (install from commit `a0ed406 `_). + ``pip install 'git+https://github.com/skggm/skggm.git@a0ed406586c4364ea3297a658f415e13b5cbdaf8'`` to install the required version of skggm from GitHub. 
+- For running the examples only: matplotlib + +Quick start +=========== + +This example loads the iris dataset, and evaluates a k-nearest neighbors +algorithm on an embedding space learned with `NCA`. + +:: + + from metric_learn import NCA + from sklearn.datasets import load_iris + from sklearn.model_selection import cross_val_score + from sklearn.pipeline import make_pipeline + from sklearn.neighbors import KNeighborsClassifier + + X, y = load_iris(return_X_y=True) + clf = make_pipeline(NCA(), KNeighborsClassifier()) + cross_val_score(clf, X, y) diff --git a/doc/index.rst b/doc/index.rst index df4ed8a6..f9dfd83d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,80 +1,67 @@ metric-learn: Metric Learning in Python ======================================= -|License| |PyPI version| +|GitHub Actions Build Status| |License| |PyPI version| |Code coverage| -Distance metrics are widely used in the machine learning literature. -Traditionally, practicioners would choose a standard distance metric -(Euclidean, City-Block, Cosine, etc.) using a priori knowledge of -the domain. -Distance metric learning (or simply, metric learning) is the sub-field of -machine learning dedicated to automatically constructing optimal distance -metrics. +`metric-learn `_ +contains efficient Python implementations of several popular supervised and +weakly-supervised metric learning algorithms. As part of `scikit-learn-contrib +`_, the API of metric-learn is compatible with `scikit-learn +`_, the leading library for machine learning in +Python. This allows to use all the scikit-learn routines (for pipelining, +model selection, etc) with metric learning algorithms through a unified +interface. -This package contains efficient Python implementations of several popular -metric learning algorithms. - -.. 
toctree:: - :caption: Algorithms - :maxdepth: 1 - - metric_learn.covariance - metric_learn.lmnn - metric_learn.itml - metric_learn.sdml - metric_learn.lsml - metric_learn.nca - metric_learn.lfda - metric_learn.rca - -Each metric supports the following methods: - -- ``fit(...)``, which learns the model. -- ``transformer()``, which returns a transformation matrix - :math:`L \in \mathbb{R}^{D \times d}`, which can be used to convert a - data matrix :math:`X \in \mathbb{R}^{n \times d}` to the - :math:`D`-dimensional learned metric space :math:`X L^{\top}`, - in which standard Euclidean distances may be used. -- ``transform(X)``, which applies the aforementioned transformation. -- ``metric()``, which returns a Mahalanobis matrix - :math:`M = L^{\top}L` such that distance between vectors ``x`` and - ``y`` can be computed as :math:`\left(x-y\right)M\left(x-y\right)`. +If you use metric-learn in a scientific publication, we would appreciate +citations to the following paper: +`metric-learn: Metric Learning Algorithms in Python +`_, de Vazelhes +*et al.*, Journal of Machine Learning Research, 21(138):1-6, 2020. -Installation and Setup -====================== +Bibtex entry:: -Run ``pip install metric-learn`` to download and install from PyPI. + @article{metric-learn, + title = {metric-learn: {M}etric {L}earning {A}lgorithms in {P}ython}, + author = {{de Vazelhes}, William and {Carey}, CJ and {Tang}, Yuan and + {Vauquier}, Nathalie and {Bellet}, Aur{\'e}lien}, + journal = {Journal of Machine Learning Research}, + year = {2020}, + volume = {21}, + number = {138}, + pages = {1--6} + } -Alternately, download the source repository and run: -- ``python setup.py install`` for default installation. -- ``python setup.py test`` to run all tests. +Documentation outline +--------------------- -**Dependencies** +.. toctree:: + :maxdepth: 2 -- Python 2.7+, 3.4+ -- numpy, scipy, scikit-learn -- (for running the examples only: matplotlib) + getting_started -**Notes** +.. 
toctree:: + :maxdepth: 2 -If a recent version of the Shogun Python modular (``modshogun``) library -is available, the LMNN implementation will use the fast C++ version from -there. The two implementations differ slightly, and the C++ version is -more complete. + user_guide -Naviagtion ----------- +.. toctree:: + :maxdepth: 2 -:ref:`genindex` | :ref:`modindex` | :ref:`search` + Package Contents .. toctree:: - :maxdepth: 4 - :hidden: + :maxdepth: 2 + + auto_examples/index - Package Overview +:ref:`genindex` | :ref:`search` +.. |GitHub Actions Build Status| image:: https://github.com/scikit-learn-contrib/metric-learn/workflows/CI/badge.svg + :target: https://github.com/scikit-learn-contrib/metric-learn/actions?query=event%3Apush+branch%3Amaster .. |PyPI version| image:: https://badge.fury.io/py/metric-learn.svg :target: http://badge.fury.io/py/metric-learn .. |License| image:: http://img.shields.io/:license-mit-blue.svg?style=flat :target: http://badges.mit-license.org +.. |Code coverage| image:: https://codecov.io/gh/scikit-learn-contrib/metric-learn/branch/master/graph/badge.svg + :target: https://codecov.io/gh/scikit-learn-contrib/metric-learn diff --git a/doc/introduction.rst b/doc/introduction.rst new file mode 100644 index 00000000..e9ff0015 --- /dev/null +++ b/doc/introduction.rst @@ -0,0 +1,125 @@ +.. _intro_metric_learning: + +======================== +What is Metric Learning? +======================== + +Many approaches in machine learning require a measure of distance between data +points. Traditionally, practitioners would choose a standard distance metric +(Euclidean, City-Block, Cosine, etc.) using a priori knowledge of the +domain. However, it is often difficult to design metrics that are well-suited +to the particular data and task of interest. + +Distance metric learning (or simply, metric learning) aims at +automatically constructing task-specific distance metrics from (weakly) +supervised data, in a machine learning manner. 
The learned distance metric can +then be used to perform various tasks (e.g., k-NN classification, clustering, +information retrieval). + +Problem Setting +=============== + +Metric learning problems fall into two main categories depending on the type +of supervision available about the training data: + +- :doc:`Supervised learning `: the algorithm has access to + a set of data points, each of them belonging to a class (label) as in a + standard classification problem. + Broadly speaking, the goal in this setting is to learn a distance metric + that puts points with the same label close together while pushing away + points with different labels. +- :doc:`Weakly supervised learning `: the + algorithm has access to a set of data points with supervision only + at the tuple level (typically pairs, triplets, or quadruplets of + data points). A classic example of such weaker supervision is a set of + positive and negative pairs: in this case, the goal is to learn a distance + metric that puts positive pairs close together and negative pairs far away. + +Based on the above (weakly) supervised data, the metric learning problem is +generally formulated as an optimization problem where one seeks to find the +parameters of a distance function that optimize some objective function +measuring the agreement with the training data. + +.. _mahalanobis_distances: + +Mahalanobis Distances +===================== + +In the metric-learn package, all algorithms currently implemented learn +so-called Mahalanobis distances. Given a real-valued parameter matrix +:math:`L` of shape ``(num_dims, n_features)`` where ``n_features`` is the +number features describing the data, the Mahalanobis distance associated with +:math:`L` is defined as follows: + +.. 
math:: D(x, x') = \sqrt{(Lx-Lx')^\top(Lx-Lx')} + +In other words, a Mahalanobis distance is a Euclidean distance after a +linear transformation of the feature space defined by :math:`L` (taking +:math:`L` to be the identity matrix recovers the standard Euclidean distance). +Mahalanobis distance metric learning can thus be seen as learning a new +embedding space of dimension ``num_dims``. Note that when ``num_dims`` is +smaller than ``n_features``, this achieves dimensionality reduction. + +Strictly speaking, Mahalanobis distances are "pseudo-metrics": they satisfy +three of the `properties of a metric `_ (non-negativity, symmetry, triangle inequality) but not +necessarily the identity of indiscernibles. + +.. note:: + + Mahalanobis distances can also be parameterized by a `positive semi-definite + (PSD) matrix + `_ + :math:`M`: + + .. math:: D(x, x') = \sqrt{(x-x')^\top M(x-x')} + + Using the fact that a PSD matrix :math:`M` can always be decomposed as + :math:`M=L^\top L` for some :math:`L`, one can show that both + parameterizations are equivalent. In practice, an algorithm may thus solve + the metric learning problem with respect to either :math:`M` or :math:`L`. + +.. _use_cases: + +Use-cases +========= + +There are many use-cases for metric learning. We list here a few popular +examples (for code illustrating some of these use-cases, see the +:doc:`examples ` section of the documentation): + +- `Nearest neighbors models + `_: the learned + metric can be used to improve nearest neighbors learning models for + classification, regression, anomaly detection... +- `Clustering `_: + metric learning provides a way to bias the clusters found by algorithms like + K-Means towards the intended semantics. +- Information retrieval: the learned metric can be used to retrieve the + elements of a database that are semantically closest to a query element. 
+- Dimensionality reduction: metric learning may be seen as a way to reduce the + data dimension in a (weakly) supervised setting. +- More generally, the learned transformation :math:`L` can be used to project + the data into a new embedding space before feeding it into another machine + learning algorithm. + +The API of metric-learn is compatible with `scikit-learn +`_, the leading library for machine +learning in Python. This allows to easily pipeline metric learners with other +scikit-learn estimators to realize the above use-cases, to perform joint +hyperparameter tuning, etc. + +Further reading +=============== + +For more information about metric learning and its applications, one can refer +to the following resources: + +- **Tutorial:** `Similarity and Distance Metric Learning with Applications to + Computer Vision + `_ (2015) +- **Surveys:** `A Survey on Metric Learning for Feature Vectors and Structured + Data `_ (2013), `Metric Learning: A + Survey `_ (2012) +- **Book:** `Metric Learning + `_ (2015) diff --git a/doc/metric_learn.base_metric.rst b/doc/metric_learn.base_metric.rst deleted file mode 100644 index 050a360b..00000000 --- a/doc/metric_learn.base_metric.rst +++ /dev/null @@ -1,7 +0,0 @@ -metric_learn.base_metric module -=============================== - -.. automodule:: metric_learn.base_metric - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/metric_learn.covariance.rst b/doc/metric_learn.covariance.rst deleted file mode 100644 index d24229a3..00000000 --- a/doc/metric_learn.covariance.rst +++ /dev/null @@ -1,21 +0,0 @@ -Covariance metric (baseline method) -=================================== - -.. 
automodule:: metric_learn.covariance - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import Covariance - from sklearn.datasets import load_iris - - iris_data = load_iris() - - cov = Covariance() - x = cov.fit_transform(iris_data['data']) diff --git a/doc/metric_learn.itml.rst b/doc/metric_learn.itml.rst deleted file mode 100644 index d6fb2221..00000000 --- a/doc/metric_learn.itml.rst +++ /dev/null @@ -1,27 +0,0 @@ -Information Theoretic Metric Learning (ITML) -============================================ - -.. automodule:: metric_learn.itml - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import ITML_Supervised - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - itml = ITML_Supervised(num_constraints=200) - itml.fit(X, Y) - -References ----------- -`Information-theoretic Metric Learning `_ Jason V. Davis, et al. diff --git a/doc/metric_learn.lfda.rst b/doc/metric_learn.lfda.rst deleted file mode 100644 index 95cde90d..00000000 --- a/doc/metric_learn.lfda.rst +++ /dev/null @@ -1,30 +0,0 @@ -Local Fisher Discriminant Analysis (LFDA) -========================================= - -.. automodule:: metric_learn.lfda - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - import numpy as np - from metric_learn import LFDA - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - lfda = LFDA(k=2, dim=2) - lfda.fit(X, Y) - -References ------------------- -`Dimensionality Reduction of Multimodal Labeled Data by Local Fisher Discriminant Analysis `_ Masashi Sugiyama. - -`Local Fisher Discriminant Analysis on Beer Style Clustering `_ Yuan Tang. 
diff --git a/doc/metric_learn.lmnn.rst b/doc/metric_learn.lmnn.rst deleted file mode 100644 index 4062bfa0..00000000 --- a/doc/metric_learn.lmnn.rst +++ /dev/null @@ -1,33 +0,0 @@ -Large Margin Nearest Neighbor (LMNN) -==================================== - -.. automodule:: metric_learn.lmnn - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - import numpy as np - from metric_learn import LMNN - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - lmnn = LMNN(k=5, learn_rate=1e-6) - lmnn.fit(X, Y, verbose=False) - -If a recent version of the Shogun Python modular (``modshogun``) library -is available, the LMNN implementation will use the fast C++ version from -there. Otherwise, the included pure-Python version will be used. -The two implementations differ slightly, and the C++ version is more complete. - -References ----------- -`Distance Metric Learning for Large Margin Nearest Neighbor Classification `_ Kilian Q. Weinberger, John Blitzer, Lawrence K. Saul diff --git a/doc/metric_learn.lsml.rst b/doc/metric_learn.lsml.rst deleted file mode 100644 index 12be71b8..00000000 --- a/doc/metric_learn.lsml.rst +++ /dev/null @@ -1,27 +0,0 @@ -Least Squares Metric Learning (LSML) -==================================== - -.. 
automodule:: metric_learn.lsml - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import LSML_Supervised - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - lsml = LSML_Supervised(num_constraints=200) - isml.fit(X, Y) - -References ----------- - diff --git a/doc/metric_learn.nca.rst b/doc/metric_learn.nca.rst deleted file mode 100644 index 6a2675e5..00000000 --- a/doc/metric_learn.nca.rst +++ /dev/null @@ -1,28 +0,0 @@ -Neighborhood Components Analysis (NCA) -====================================== - -.. automodule:: metric_learn.nca - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - import numpy as np - from metric_learn import NCA - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - nca = NCA(max_iter=1000, learning_rate=0.01) - nca.fit(X, Y) - -References ----------- - diff --git a/doc/metric_learn.rca.rst b/doc/metric_learn.rca.rst deleted file mode 100644 index 2430cd82..00000000 --- a/doc/metric_learn.rca.rst +++ /dev/null @@ -1,27 +0,0 @@ -Relative Components Analysis (RCA) -================================== - -.. automodule:: metric_learn.rca - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import RCA_Supervised - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - rca = RCA_Supervised(num_chunks=30, chunk_size=2) - rca.fit(X, Y) - -References ------------------- -`Adjustment learning and relevant component analysis `_ Noam Shental, et al. 
diff --git a/doc/metric_learn.rst b/doc/metric_learn.rst index 226fd324..4d0676b9 100644 --- a/doc/metric_learn.rst +++ b/doc/metric_learn.rst @@ -1,24 +1,60 @@ metric_learn package ==================== -Submodules ----------- +Module Contents +--------------- -.. toctree:: +Base Classes +------------ - metric_learn.base_metric - metric_learn.itml - metric_learn.lfda - metric_learn.lmnn - metric_learn.lsml - metric_learn.nca - metric_learn.rca - metric_learn.sdml +.. autosummary:: + :toctree: generated/ + :template: class.rst -Module contents ---------------- + metric_learn.Constraints + metric_learn.base_metric.BaseMetricLearner + metric_learn.base_metric.MetricTransformer + metric_learn.base_metric.MahalanobisMixin + metric_learn.base_metric._PairsClassifierMixin + metric_learn.base_metric._TripletsClassifierMixin + metric_learn.base_metric._QuadrupletsClassifierMixin + +Supervised Learning Algorithms +------------------------------ +.. autosummary:: + :toctree: generated/ + :template: class.rst + + metric_learn.LFDA + metric_learn.LMNN + metric_learn.MLKR + metric_learn.NCA + metric_learn.RCA + metric_learn.ITML_Supervised + metric_learn.LSML_Supervised + metric_learn.MMC_Supervised + metric_learn.SDML_Supervised + metric_learn.RCA_Supervised + metric_learn.SCML_Supervised + +Weakly Supervised Learning Algorithms +------------------------------------- + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + metric_learn.ITML + metric_learn.LSML + metric_learn.MMC + metric_learn.SDML + metric_learn.SCML + +Unsupervised Learning Algorithms +-------------------------------- + +.. autosummary:: + :toctree: generated/ + :template: class.rst -.. 
automodule:: metric_learn - :members: - :undoc-members: - :show-inheritance: + metric_learn.Covariance \ No newline at end of file diff --git a/doc/metric_learn.sdml.rst b/doc/metric_learn.sdml.rst deleted file mode 100644 index 83570483..00000000 --- a/doc/metric_learn.sdml.rst +++ /dev/null @@ -1,26 +0,0 @@ -Sparse Determinant Metric Learning (SDML) -========================================= - -.. automodule:: metric_learn.sdml - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - -Example Code ------------- - -:: - - from metric_learn import SDML_Supervised - from sklearn.datasets import load_iris - - iris_data = load_iris() - X = iris_data['data'] - Y = iris_data['target'] - - sdml = SDML_Supervised(num_constraints=200) - sdml.fit(X, Y) - -References ------------------- diff --git a/doc/preprocessor.rst b/doc/preprocessor.rst new file mode 100644 index 00000000..ad1ffd8f --- /dev/null +++ b/doc/preprocessor.rst @@ -0,0 +1,111 @@ +.. _preprocessor_section: + +============ +Preprocessor +============ + +Estimators in metric-learn all have a ``preprocessor`` option at instantiation. +Filling this argument allows them to take more compact input representation +when fitting, predicting etc... + +If ``preprocessor=None``, no preprocessor will be used and the user must +provide the classical representation to the fit/predict/score/etc... methods of +the estimators (see the documentation of the particular estimator to know the +type of input it accepts). Otherwise, two types of objects can be put in this +argument: + +Array-like +---------- +You can specify ``preprocessor=X`` where ``X`` is an array-like containing the +dataset of points. In this case, the fit/predict/score/etc... methods of the +estimator will be able to take as inputs an array-like of indices, replacing +under the hood each index by the corresponding sample. 
+ + +Example with a supervised metric learner: + +>>> from metric_learn import NCA +>>> +>>> X = np.array([[-0.7 , -0.23], +>>> [-0.43, -0.49], +>>> [ 0.14, -0.37]]) # array of 3 samples of 2 features +>>> points_indices = np.array([2, 0, 1, 0]) +>>> y = np.array([1, 0, 1, 1]) +>>> +>>> nca = NCA(preprocessor=X) +>>> nca.fit(points_indices, y) +>>> # under the hood the algorithm will create +>>> # points = np.array([[ 0.14, -0.37], +>>> # [-0.7 , -0.23], +>>> # [-0.43, -0.49], +>>> # [ 0.14, -0.37]]) and fit on it + + +Example with a weakly supervised metric learner: + +>>> from metric_learn import MMC +>>> X = np.array([[-0.7 , -0.23], +>>> [-0.43, -0.49], +>>> [ 0.14, -0.37]]) # array of 3 samples of 2 features +>>> pairs_indices = np.array([[2, 0], [1, 0]]) +>>> y_pairs = np.array([1, -1]) +>>> +>>> mmc = MMC(preprocessor=X) +>>> mmc.fit(pairs_indices, y_pairs) +>>> # under the hood the algorithm will create +>>> # pairs = np.array([[[ 0.14, -0.37], [-0.7 , -0.23]], +>>> # [[-0.43, -0.49], [-0.7 , -0.23]]]) and fit on it + +Callable +-------- +Alternatively, you can provide a callable as ``preprocessor``. Then the +estimator will accept indicators of points instead of points. Under the hood, +the estimator will call this callable on the indicators you provide as input +when fitting, predicting etc... Using a callable can be really useful to +represent lazily a dataset of images stored on the file system for instance. +The callable should take as an input a 1D array-like, and return a 2D +array-like. For supervised learners it will be applied on the whole 1D array of +indicators at once, and for weakly supervised learners it will be applied on +each column of the 2D array of tuples. 
+ +Example with a supervised metric learner: + +>>> def find_images(file_paths): +>>> # each file contains a small image to use as an input datapoint +>>> return np.row_stack([imread(f).ravel() for f in file_paths]) +>>> +>>> nca = NCA(preprocessor=find_images) +>>> nca.fit(['img01.png', 'img00.png', 'img02.png'], [1, 0, 1]) +>>> # under the hood preprocessor(indicators) will be called + + +Example with a weakly supervised metric learner: + +>>> pairs_images_paths = [['img02.png', 'img00.png'], +>>> ['img01.png', 'img00.png']] +>>> y_pairs = np.array([1, -1]) +>>> +>>> mmc = NCA(preprocessor=find_images) +>>> mmc.fit(pairs_images_paths, y_pairs) +>>> # under the hood preprocessor(pairs_indicators[i]) will be called for each +>>> # i in [0, 1] + + +.. note:: Note that when you fill the ``preprocessor`` option, it allows you + to give more compact inputs, but the classical way of providing inputs + stays valid (2D array-like for supervised learners and 3D array-like of + tuples for weakly supervised learners). If a classical input + is provided, the metric learner will not use the preprocessor. 
+ + Example: This will work: + + >>> from metric_learn import MMC + >>> def preprocessor_wip(array): + >>> raise NotImplementedError("This preprocessor does nothing yet.") + >>> + >>> pairs = np.array([[[ 0.14, -0.37], [-0.7 , -0.23]], + >>> [[-0.43, -0.49], [-0.7 , -0.23]]]) + >>> y_pairs = np.array([1, -1]) + >>> + >>> mmc = MMC(preprocessor=preprocessor_wip) + >>> mmc.fit(pairs, y_pairs) # preprocessor_wip will not be called here diff --git a/doc/supervised.rst b/doc/supervised.rst new file mode 100644 index 00000000..49548b83 --- /dev/null +++ b/doc/supervised.rst @@ -0,0 +1,434 @@ +========================== +Supervised Metric Learning +========================== + +Supervised metric learning algorithms take as inputs points `X` and target +labels `y`, and learn a distance matrix that make points from the same class +(for classification) or with close target value (for regression) close to each +other, and points from different classes or with distant target values far away +from each other. + +General API +=========== + +Supervised metric learning algorithms essentially use the same API as +scikit-learn. + +Input data +---------- +In order to train a model, you need two `array-like `_ objects, `X` and `y`. `X` +should be a 2D array-like of shape `(n_samples, n_features)`, where +`n_samples` is the number of points of your dataset and `n_features` is the +number of attributes describing each point. `y` should be a 1D +array-like +of shape `(n_samples,)`, containing for each point in `X` the class it +belongs to (or the value to regress for this sample, if you use `MLKR` for +instance). + +Here is an example of a dataset of two dogs and one +cat (the classes are 'dog' and 'cat') an animal being represented by +two numbers. + +>>> import numpy as np +>>> X = np.array([[2.3, 3.6], [0.2, 0.5], [6.7, 2.1]]) +>>> y = np.array(['dog', 'cat', 'dog']) + +.. note:: + + You can also use a preprocessor instead of directly giving the inputs as + 2D arrays. 
See the :ref:`preprocessor_section` section for more details. + +Fit, transform, and so on +------------------------- +The goal of supervised metric-learning algorithms is to transform +points in a new space, in which the distance between two points from the +same class will be small, and the distance between two points from different +classes will be large. To do so, we fit the metric learner (example: +`NCA`). + +>>> from metric_learn import NCA +>>> nca = NCA(random_state=42) +>>> nca.fit(X, y) +NCA(init='auto', max_iter=100, n_components=None, + preprocessor=None, random_state=42, tol=None, verbose=False) + + +Now that the estimator is fitted, you can use it on new data for several +purposes. + +First, you can transform the data in the learned space, using `transform`: +Here we transform two points in the new embedding space. + +>>> X_new = np.array([[9.4, 4.1], [2.1, 4.4]]) +>>> nca.transform(X_new) +array([[ 5.91884732, 10.25406973], + [ 3.1545886 , 6.80350083]]) + +Also, as explained before, our metric learner has learned a distance between +points. You can use this distance in two main ways: + +- You can either return the distance between pairs of points using the + `pair_distance` function: + +>>> nca.pair_distance([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]], [[3.3, 7.8], [10.9, 0.1]]]) +array([0.49627072, 3.65287282, 6.06079877]) + +- Or you can return a function that will return the distance (in the new + space) between two 1D arrays (the coordinates of the points in the original + space), similarly to distance functions in `scipy.spatial.distance`. + +>>> metric_fun = nca.get_metric() +>>> metric_fun([3.5, 3.6], [5.6, 2.4]) +0.4962707194621285 + +- Alternatively, you can use `pair_score` to return the **score** between + pairs of points (the larger the score, the more similar the pair). + For Mahalanobis learners, it is equal to the opposite of the distance. 
+ +>>> score = nca.pair_score([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]], [[3.3, 7.8], [10.9, 0.1]]]) +>>> score +array([-0.49627072, -3.65287282, -6.06079877]) + +This is useful because `pair_score` matches the **score** semantic of +scikit-learn's `Classification metrics +`_. + +.. note:: + + If the metric learner that you use learns a :ref:`Mahalanobis distance + ` (like it is the case for all algorithms + currently in metric-learn), you can get the plain learned Mahalanobis + matrix using `get_mahalanobis_matrix`. + + >>> nca.get_mahalanobis_matrix() + array([[0.43680409, 0.89169412], + [0.89169412, 1.9542479 ]]) + + +Scikit-learn compatibility +-------------------------- + +All supervised algorithms are scikit-learn estimators +(`sklearn.base.BaseEstimator`) and transformers +(`sklearn.base.TransformerMixin`) so they are compatible with pipelines +(`sklearn.pipeline.Pipeline`) and +scikit-learn model selection routines +(`sklearn.model_selection.cross_val_score`, +`sklearn.model_selection.GridSearchCV`, etc). +You can also use some of the scoring functions from `sklearn.metrics`. + +Algorithms +========== + +.. _lmnn: + +:py:class:`LMNN ` +----------------------------------------- + +Large Margin Nearest Neighbor Metric Learning +(:py:class:`LMNN `) + +`LMNN` learns a Mahalanobis distance metric in the kNN classification +setting. The learned metric attempts to keep close k-nearest neighbors +from the same class, while keeping examples from different classes +separated by a large margin. This algorithm makes no assumptions about +the distribution of the data. + +The distance is learned by solving the following optimization problem: + +.. 
math:: + + \min_\mathbf{L}\sum_{i, j}\eta_{ij}||\mathbf{L(x_i-x_j)}||^2 + + c\sum_{i, j, l}\eta_{ij}(1-y_{ij})[1+||\mathbf{L(x_i-x_j)}||^2-|| + \mathbf{L(x_i-x_l)}||^2]_+) + +where :math:`\mathbf{x}_i` is a data point, :math:`\mathbf{x}_j` is one +of its k-nearest neighbors sharing the same label, and :math:`\mathbf{x}_l` +are all the other instances within that region with different labels, +:math:`\eta_{ij}, y_{ij} \in \{0, 1\}` are both the indicators, +:math:`\eta_{ij}` represents :math:`\mathbf{x}_{j}` is the k-nearest +neighbors (with same labels) of :math:`\mathbf{x}_{i}`, :math:`y_{ij}=0` +indicates :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong to different classes, +:math:`[\cdot]_+=\max(0, \cdot)` is the Hinge loss. + +.. rubric:: Example Code + +:: + + import numpy as np + from metric_learn import LMNN + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + lmnn.fit(X, Y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Weinberger et al. `Distance Metric Learning for Large Margin Nearest Neighbor Classification `_. JMLR 2009. + + [2]. `Wikipedia entry on Large Margin Nearest Neighbor `_. + + +.. _nca: + +:py:class:`NCA ` +-------------------------------------- + +Neighborhood Components Analysis (:py:class:`NCA `) + +`NCA` is a distance metric learning algorithm which aims to improve the +accuracy of nearest neighbors classification compared to the standard +Euclidean distance. The algorithm directly maximizes a stochastic variant +of the leave-one-out k-nearest neighbors (KNN) score on the training set. +It can also learn a low-dimensional linear transformation of data that can +be used for data visualization and fast classification. 
 + +They use the decomposition :math:`\mathbf{M} = \mathbf{L}^T\mathbf{L}` and +define the probability :math:`p_{ij}` that :math:`\mathbf{x}_i` is the +neighbor of :math:`\mathbf{x}_j` by calculating the softmax likelihood of +the Mahalanobis distance: + +.. math:: + + p_{ij} = \frac{\exp(-|| \mathbf{Lx}_i - \mathbf{Lx}_j ||_2^2)} + {\sum_{l\neq i}\exp(-||\mathbf{Lx}_i - \mathbf{Lx}_l||_2^2)}, + \qquad p_{ii}=0 + +Then the probability that :math:`\mathbf{x}_i` will be correctly classified +by the stochastic nearest neighbors rule is: + +.. math:: + + p_{i} = \sum_{j:j\neq i, y_j=y_i}p_{ij} + +The optimization problem is to find matrix :math:`\mathbf{L}` that maximizes +the sum of probability of being correctly classified: + +.. math:: + + \mathbf{L} = \text{argmax}\sum_i p_i + +.. rubric:: Example Code + +:: + + import numpy as np + from metric_learn import NCA + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + nca = NCA(max_iter=1000) + nca.fit(X, Y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Goldberger et al. `Neighbourhood Components Analysis `_. NIPS 2005. + + [2]. `Wikipedia entry on Neighborhood Components Analysis `_. + + +.. _lfda: + +:py:class:`LFDA ` +----------------------------------------- + +Local Fisher Discriminant Analysis (:py:class:`LFDA `) + +`LFDA` is a linear supervised dimensionality reduction method which effectively combines the ideas of Linear Discriminant Analysis and Locality-Preserving Projection. It is +particularly useful when dealing with multi-modality, where one or more classes +consist of separate clusters in input space. The core optimization problem of +LFDA is solved as a generalized eigenvalue problem. + + +The algorithm defines the Fisher local within-/between-class scatter matrix +:math:`\mathbf{S}^{(w)}/ \mathbf{S}^{(b)}` in a pairwise fashion: + +.. 
math:: + + \mathbf{S}^{(w)} = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(w)}(\mathbf{x}_i - + \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T,\\ + \mathbf{S}^{(b)} = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(b)}(\mathbf{x}_i - + \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T,\\ + +where + +.. math:: + + W_{ij}^{(w)} = \left\{\begin{aligned}0 \qquad y_i\neq y_j \\ + \,\,\mathbf{A}_{i,j}/n_l \qquad y_i = y_j\end{aligned}\right.\\ + W_{ij}^{(b)} = \left\{\begin{aligned}1/n \qquad y_i\neq y_j \\ + \,\,\mathbf{A}_{i,j}(1/n-1/n_l) \qquad y_i = y_j\end{aligned}\right.\\ + +here :math:`\mathbf{A}_{i,j}` is the :math:`(i,j)`-th entry of the affinity +matrix :math:`\mathbf{A}`, which can be calculated with local scaling methods, `n` and `n_l` are the total number of points and the number of points per cluster `l` respectively. + +Then the learning problem becomes deriving the LFDA transformation matrix +:math:`\mathbf{L}_{LFDA}`: + +.. math:: + + \mathbf{L}_{LFDA} = \arg\max_\mathbf{L} + [\text{tr}((\mathbf{L}^T\mathbf{S}^{(w)} + \mathbf{L})^{-1}\mathbf{L}^T\mathbf{S}^{(b)}\mathbf{L})] + +That is, it is looking for a transformation matrix :math:`\mathbf{L}` such that +nearby data pairs in the same class are made close and the data pairs in +different classes are separated from each other; far apart data pairs in the +same class are not imposed to be close. + +.. rubric:: Example Code + +:: + + import numpy as np + from metric_learn import LFDA + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + lfda = LFDA(k=2, dim=2) + lfda.fit(X, Y) + +.. note:: + LFDA suffers from a problem called “sign indeterminacy”, which means the sign of the ``components`` and the output from transform depend on a random state. This is directly related to the calculation of eigenvectors in the algorithm. The same input run at different times might lead to different transforms, but both are valid. 
+ + To work around this, fit instances of this class to data once, then keep the instance around to do transformations. + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Sugiyama. `Dimensionality Reduction of Multimodal Labeled Data by Local Fisher Discriminant Analysis `_. JMLR 2007. + + [2]. Tang. `Local Fisher Discriminant Analysis on Beer Style Clustering `_. + +.. _mlkr: + +:py:class:`MLKR ` +----------------------------------------- + +Metric Learning for Kernel Regression (:py:class:`MLKR `) + +`MLKR` is an algorithm for supervised metric learning, which learns a +distance function by directly minimizing the leave-one-out regression error. +This algorithm can also be viewed as a supervised variation of PCA and can be +used for dimensionality reduction and high dimensional data visualization. + +Theoretically, `MLKR` can be applied with many types of kernel functions and +distance metrics, we hereafter focus the exposition on a particular instance +of the Gaussian kernel and Mahalanobis metric, as these are used in our +empirical development. The Gaussian kernel is denoted as: + +.. math:: + + k_{ij} = \frac{1}{\sqrt{2\pi}\sigma}\exp(-\frac{d(\mathbf{x}_i, + \mathbf{x}_j)}{\sigma^2}) + +where :math:`d(\cdot, \cdot)` is the squared distance under some metrics, +here in the fashion of Mahalanobis, it should be :math:`d(\mathbf{x}_i, +\mathbf{x}_j) = ||\mathbf{L}(\mathbf{x}_i - \mathbf{x}_j)||`, the transition +matrix :math:`\mathbf{L}` is derived from the decomposition of Mahalanobis +matrix :math:`\mathbf{M=L^TL}`. + +Since :math:`\sigma^2` can be integrated into :math:`d(\cdot)`, we can set +:math:`\sigma^2=1` for the sake of simplicity. Here we use the cumulative +leave-one-out quadratic regression error of the training samples as the +loss function: + +.. 
math:: + + \mathcal{L} = \sum_i(y_i - \hat{y}_i)^2 + +where the prediction :math:`\hat{y}_i` is derived from kernel regression by +calculating a weighted average of all the training samples: + +.. math:: + + \hat{y}_i = \frac{\sum_{j\neq i}y_jk_{ij}}{\sum_{j\neq i}k_{ij}} + +.. rubric:: Example Code + +:: + + from metric_learn import MLKR + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + mlkr = MLKR() + mlkr.fit(X, Y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Weinberger et al. `Metric Learning for Kernel Regression `_. AISTATS 2007. + + +.. _supervised_version: + +Supervised versions of weakly-supervised algorithms +--------------------------------------------------- + +Each :ref:`weakly-supervised algorithm ` +has a supervised version of the form `*_Supervised` where similarity tuples are +randomly generated from the labels information and passed to the underlying +algorithm. + +.. warning:: + Supervised versions of weakly-supervised algorithms interpret label -1 + (or any negative label) as a point with unknown label. + Those points are discarded in the learning process. + +For pairs learners (see :ref:`learning_on_pairs`), pairs (tuple of two points +from the dataset), and pair labels (`int` indicating whether the two points +are similar (+1) or dissimilar (-1)), are sampled with the function +`metric_learn.constraints.positive_negative_pairs`. To sample positive pairs +(of label +1), this method will look at all the samples from the same label and +sample randomly a pair among them. To sample negative pairs (of label -1), this +method will look at all the samples from a different class and sample randomly +a pair among them. 
The method will try to build `n_constraints` positive +pairs and `n_constraints` negative pairs, but sometimes it cannot find enough +of one of those, so forcing `same_length=True` will return both times the +minimum of the two lengths. + +For using quadruplets learners (see :ref:`learning_on_quadruplets`) in a +supervised way, positive and negative pairs are sampled as above and +concatenated so that we have a 3D array of +quadruplets, where for each quadruplet the two first points are from the same +class, and the two last points are from a different class (so indeed the two +last points should be less similar than the two first points). + +.. rubric:: Example Code + +:: + + from metric_learn import MMC_Supervised + from sklearn.datasets import load_iris + + iris_data = load_iris() + X = iris_data['data'] + Y = iris_data['target'] + + mmc = MMC_Supervised(n_constraints=200) + mmc.fit(X, Y) diff --git a/doc/unsupervised.rst b/doc/unsupervised.rst new file mode 100644 index 00000000..110b07f9 --- /dev/null +++ b/doc/unsupervised.rst @@ -0,0 +1,40 @@ +============================ +Unsupervised Metric Learning +============================ + +Unsupervised metric learning algorithms only take as input an (unlabeled) +dataset `X`. For now, in metric-learn, there is only `Covariance`, which is a +simple baseline algorithm (see below). + + +Algorithms +========== +.. _covariance: + +Covariance +---------- + +`Covariance` does not "learn" anything, rather it calculates +the covariance matrix of the input data. This is a simple baseline method. +It can be used for ZCA whitening of the data (see the Wikipedia page of +`whitening transformation `_). + +.. rubric:: Example Code + +:: + + from metric_learn import Covariance + from sklearn.datasets import load_iris + + iris = load_iris()['data'] + + cov = Covariance().fit(iris) + x = cov.transform(iris) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. 
On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936. \ No newline at end of file diff --git a/doc/user_guide.rst b/doc/user_guide.rst new file mode 100644 index 00000000..5472107a --- /dev/null +++ b/doc/user_guide.rst @@ -0,0 +1,16 @@ +.. title:: User guide: contents + +.. _user_guide: + +========== +User Guide +========== + +.. toctree:: + :numbered: + + introduction.rst + supervised.rst + weakly_supervised.rst + unsupervised.rst + preprocessor.rst \ No newline at end of file diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst new file mode 100644 index 00000000..76f7c14e --- /dev/null +++ b/doc/weakly_supervised.rst @@ -0,0 +1,974 @@ +.. _weakly_supervised_section: + +================================= +Weakly Supervised Metric Learning +================================= + +Weakly supervised algorithms work on weaker information about the data points +than supervised algorithms. Rather than labeled points, they take as input +similarity judgments on tuples of data points, for instance pairs of similar +and dissimilar points. Refer to the documentation of each algorithm for its +particular form of input data. + + +General API +=========== + +Input data +---------- + +In the following paragraph we talk about tuples for the sake of generality. These +can be pairs, triplets, quadruplets etc, depending on the particular metric +learning algorithm we use. + +Basic form +^^^^^^^^^^ + +Every weakly supervised algorithm will take as input tuples of +points, and if needed labels for these tuples. The tuples of points can +also be called "constraints". They are a set of points that we consider (ex: +two points, three points, etc...). The label is some information we have +about this set of points (e.g. "these two points are similar"). Note that +some information can be contained in the ordering of these tuples (see for +instance the section :ref:`learning_on_quadruplets`). 
For more details about +specific forms of tuples, refer to the appropriate sections +(:ref:`learning_on_pairs` or :ref:`learning_on_quadruplets`). + +The `tuples` argument is the first argument of every method (like the `X` +argument for classical algorithms in scikit-learn). The second argument is the +label of the tuple: its semantic depends on the algorithm used. For instance +for pairs learners `y` is a label indicating whether the pair is of similar +samples or dissimilar samples. + +Then one can fit a Weakly Supervised Metric Learner on this tuple, like this: + +>>> my_algo.fit(tuples, y) + +Like in a classical setting we split the points `X` between train and test, +here we split the `tuples` between train and test. + +>>> from sklearn.model_selection import train_test_split +>>> pairs_train, pairs_test, y_train, y_test = train_test_split(pairs, y) + +These are two data structures that can be used to represent tuple in metric +learn: + +3D array of tuples +^^^^^^^^^^^^^^^^^^ + +The most intuitive way to represent tuples is to provide the algorithm with a +3D array-like of tuples of shape `(n_tuples, tuple_size, n_features)`, where +`n_tuples` is the number of tuples, `tuple_size` is the number of elements +in a tuple (2 for pairs, 3 for triplets for instance), and `n_features` is +the number of features of each point. + +.. rubric:: Example Code + +Here is an artificial dataset of 4 pairs of 2 points of 3 features each: + +>>> import numpy as np +>>> tuples = np.array([[[-0.12, -1.21, -0.20], +>>> [+0.05, -0.19, -0.05]], +>>> +>>> [[-2.16, +0.11, -0.02], +>>> [+1.58, +0.16, +0.93]], +>>> +>>> [[+1.58, +0.16, +0.93], # same as tuples[1, 1, :] +>>> [+0.89, -0.34, +2.41]], +>>> +>>> [[-0.12, -1.21, -0.20], # same as tuples[0, 0, :] +>>> [-2.16, +0.11, -0.02]]]) # same as tuples[1, 0, :] +>>> y = np.array([-1, 1, 1, -1]) + +.. 
warning:: This way of specifying pairs is not recommended for a large number
+   of tuples, as it is redundant (see the comments in the example) and hence
+   takes a lot of memory. Indeed each feature vector of a point will be
+   replicated as many times as a point is involved in a tuple. The second way
+   to specify pairs is more efficient.
+
+
+2D array of indicators + preprocessor
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Instead of forming each point in each tuple, a more efficient representation
+would be to keep the dataset of points `X` aside, and just represent tuples
+as a collection of tuples of *indices* from the points in `X`. Since we lose
+the feature dimension there, the resulting array is 2D.
+
+.. rubric:: Example Code
+
+An equivalent representation of the above pairs would be:
+
+>>> X = np.array([[-0.12, -1.21, -0.20],
+>>>               [+0.05, -0.19, -0.05],
+>>>               [-2.16, +0.11, -0.02],
+>>>               [+1.58, +0.16, +0.93],
+>>>               [+0.89, -0.34, +2.41]])
+>>>
+>>> tuples_indices = np.array([[0, 1],
+>>>                            [2, 3],
+>>>                            [3, 4],
+>>>                            [0, 2]])
+>>> y = np.array([-1, 1, 1, -1])
+
+In order to fit metric learning algorithms with this type of input, we need to
+give the original dataset of points `X` to the estimator so that it knows
+the points the indices refer to. We do this when initializing the estimator,
+through the argument `preprocessor` (see below :ref:`fit_ws`)
+
+
+.. note::
+
+   Instead of an array-like, you can give a callable in the argument
+   `preprocessor`, which will go fetch and form the tuples. This allows to
+   give more general indicators than just indices from an array (for instance
+   paths in the filesystem, name of records in a database etc...) See section
+   :ref:`preprocessor_section` for more details on how to use the preprocessor.
+
+.. 
_fit_ws:
+
+Fit, transform, and so on
+-------------------------
+
+The goal of weakly-supervised metric-learning algorithms is to transform
+points in a new space, in which the tuple-wise constraints between points
+are respected.
+
+>>> from metric_learn import MMC
+>>> mmc = MMC(random_state=42)
+>>> mmc.fit(tuples, y)
+MMC(A0='deprecated', tol=0.001, diagonal=False,
+    diagonal_c=1.0, init='auto', max_iter=100, max_proj=10000,
+    preprocessor=None, random_state=42, verbose=False)
+
+Or alternatively (using a preprocessor):
+
+>>> from metric_learn import MMC
+>>> mmc = MMC(preprocessor=X, random_state=42)
+>>> mmc.fit(pairs_indices, y)
+
+
+Now that the estimator is fitted, you can use it on new data for several
+purposes.
+
+First, you can transform the data in the learned space, using `transform`:
+Here we transform two points in the new embedding space.
+
+>>> X_new = np.array([[9.4, 4.1, 4.2], [2.1, 4.4, 2.3]])
+>>> mmc.transform(X_new)
+array([[-3.24667162e+01,  4.62622348e-07,  3.88325421e-08],
+       [-3.61531114e+01,  4.86778289e-07,  2.12654397e-08]])
+
+Also, as explained before, our metric learner has learned a distance between
+points. You can use this distance in two main ways:
+
+- You can either return the distance between pairs of points using the
+  `pair_distance` function:
+
+>>> mmc.pair_distance([[[3.5, 3.6, 5.2], [5.6, 2.4, 6.7]],
+...                    [[1.2, 4.2, 7.7], [2.1, 6.4, 0.9]]])
+array([7.27607365, 0.88853014])
+
+- Or you can return a function that will return the distance
+  (in the new space) between two 1D arrays (the coordinates of the points in
+  the original space), similarly to distance functions in
+  `scipy.spatial.distance`. To do that, use the `get_metric` method.
+
+>>> metric_fun = mmc.get_metric()
+>>> metric_fun([3.5, 3.6, 5.2], [5.6, 2.4, 6.7])
+7.276073646278203
+
+- Alternatively, you can use `pair_score` to return the **score** between
+  pairs of points (the larger the score, the more similar the pair). 
+ For Mahalanobis learners, it is equal to the opposite of the distance. + +>>> score = mmc.pair_score([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]], [[3.3, 7.8], [10.9, 0.1]]]) +>>> score +array([-0.49627072, -3.65287282, -6.06079877]) + + This is useful because `pair_score` matches the **score** semantic of + scikit-learn's `Classification metrics + `_. + +.. note:: + + If the metric learner that you use learns a :ref:`Mahalanobis distance + ` (like it is the case for all algorithms + currently in metric-learn), you can get the plain Mahalanobis matrix using + `get_mahalanobis_matrix`. + +>>> mmc.get_mahalanobis_matrix() +array([[ 0.58603894, -5.69883982, -1.66614919], + [-5.69883982, 55.41743549, 16.20219519], + [-1.66614919, 16.20219519, 4.73697721]]) + +.. _sklearn_compat_ws: + +Prediction and scoring +---------------------- + +Since weakly supervised are also able, after being fitted, to predict for a +given tuple what is its label (for pairs) or ordering (for quadruplets). See +the appropriate section for more details, either :ref:`this +one ` for pairs, or :ref:`this one +` for quadruplets. + +They also implement a default scoring method, `score`, that can be +used to evaluate the performance of a metric-learner on a test dataset. See +the appropriate section for more details, either :ref:`this +one ` for pairs, or :ref:`this one ` +for quadruplets. + +Scikit-learn compatibility +-------------------------- + +Weakly supervised estimators are compatible with scikit-learn routines for +model selection (`sklearn.model_selection.cross_val_score`, +`sklearn.model_selection.GridSearchCV`, etc). 
+ +Example: + +>>> from metric_learn import MMC +>>> import numpy as np +>>> from sklearn.datasets import load_iris +>>> from sklearn.model_selection import cross_val_score +>>> rng = np.random.RandomState(42) +>>> X, _ = load_iris(return_X_y=True) +>>> # let's sample 30 random pairs and labels of pairs +>>> pairs_indices = rng.randint(X.shape[0], size=(30, 2)) +>>> y = 2 * rng.randint(2, size=30) - 1 +>>> mmc = MMC(preprocessor=X) +>>> cross_val_score(mmc, pairs_indices, y) + +.. _learning_on_pairs: + +Learning on pairs +================= + +Some metric learning algorithms learn on pairs of samples. In this case, one +should provide the algorithm with `n_samples` pairs of points, with a +corresponding target containing `n_samples` values being either +1 or -1. +These values indicate whether the given pairs are similar points or +dissimilar points. + +Fitting +------- +Here is an example for fitting on pairs (see :ref:`fit_ws` for more details on +the input data format and how to fit, in the general case of learning on +tuples). + +>>> from metric_learn import MMC +>>> pairs = np.array([[[1.2, 3.2], [2.3, 5.5]], +>>> [[4.5, 2.3], [2.1, 2.3]]]) +>>> y_pairs = np.array([1, -1]) +>>> mmc = MMC(random_state=42) +>>> mmc.fit(pairs, y_pairs) +MMC(tol=0.001, diagonal=False, + diagonal_c=1.0, init='auto', max_iter=100, max_proj=10000, preprocessor=None, + random_state=42, verbose=False) + +Here, we learned a metric that puts the two first points closer +together in the transformed space, and the two next points further away from +each other. + +.. _pairs_predicting: + +Prediction +---------- + +When a pairs learner is fitted, it is also able to predict, for an unseen +pair, whether it is a pair of similar or dissimilar points. + +>>> mmc.predict([[[0.6, 1.6], [1.15, 2.75]], +... [[3.2, 1.1], [5.4, 6.1]]]) +array([1, -1]) + +.. 
_calibration: + +Prediction threshold +^^^^^^^^^^^^^^^^^^^^ + +Predicting whether a new pair represents similar or dissimilar +samples requires to set a threshold on the learned distance, so that points +closer (in the learned space) than this threshold are predicted as similar, +and points further away are predicted as dissimilar. Several methods are +possible for this thresholding. + +- **Calibration at fit time**: The threshold is set with `calibrate_threshold` + (see below) on the training set. You can specify the calibration + parameters directly + in the `fit` method with the `threshold_params` parameter (see the + documentation of the `fit` method of any metric learner that learns on pairs + of points for more information). Note that calibrating on the training set + may cause some overfitting. If you want to avoid that, calibrate the + threshold after fitting, on a validation set. + + >>> mmc.fit(pairs, y) # will fit the threshold automatically after fitting + +- **Calibration on validation set**: calling `calibrate_threshold` will + calibrate the threshold to achieve a particular score on a validation set, + the score being among the classical scores for classification (accuracy, f1 + score...). + + >>> mmc.calibrate_threshold(pairs, y) + +- **Manual threshold**: calling `set_threshold` will set the threshold to a + particular value. + + >>> mmc.set_threshold(0.4) + +See also: `sklearn.calibration`. + +.. _pairs_scoring: + +Scoring +------- + +Pair metric learners can also return a `decision_function` for a set of pairs. +It is basically the "score" that will be thresholded to find the prediction +for the pair. This score corresponds to the opposite of the distance in the +new space (higher score means points are similar, and lower score dissimilar). + +>>> mmc.decision_function([[[0.6, 1.6], [1.15, 2.75]], +... 
[[3.2, 1.1], [5.4, 6.1]]]) +array([-0.12811124, -0.74750256]) + +This allows to use common scoring functions for binary classification, like +`sklearn.metrics.accuracy_score` for instance, which +can be used inside cross-validation routines: + +>>> from sklearn.model_selection import cross_val_score +>>> pairs_test = np.array([[[0.6, 1.6], [1.15, 2.75]], +... [[3.2, 1.1], [5.4, 6.1]], +... [[7.7, 5.6], [1.23, 8.4]]]) +>>> y_test = np.array([-1., 1., -1.]) +>>> cross_val_score(mmc, pairs_test, y_test, scoring='accuracy') +array([1., 0., 1.]) + +Pairs learners also have a default score, which basically +returns the `sklearn.metrics.roc_auc_score` (which is threshold-independent). + +>>> pairs_test = np.array([[[0.6, 1.6], [1.15, 2.75]], +... [[3.2, 1.1], [5.4, 6.1]], +... [[7.7, 5.6], [1.23, 8.4]]]) +>>> y_test = np.array([1., -1., -1.]) +>>> mmc.score(pairs_test, y_test) +1.0 + +.. note:: + See :ref:`fit_ws` for more details on metric learners functions that are + not specific to learning on pairs, like `transform`, `pair_distance`, + `pair_score`, `get_metric` and `get_mahalanobis_matrix`. + +Algorithms +---------- + +.. _itml: + +:py:class:`ITML ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Information Theoretic Metric Learning (:py:class:`ITML `) + +`ITML` minimizes the (differential) relative entropy, aka Kullback–Leibler +divergence, between two multivariate Gaussians subject to constraints on the +associated Mahalanobis distance, which can be formulated into a Bregman +optimization problem by minimizing the LogDet divergence subject to +linear constraints. This algorithm can handle a wide variety of constraints +and can optionally incorporate a prior on the distance function. Unlike some +other methods, `ITML` does not rely on an eigenvalue computation or +semi-definite programming. + + +Given a Mahalanobis distance parameterized by :math:`M`, its corresponding +multivariate Gaussian is denoted as: + +.. 
math:: + p(\mathbf{x}; \mathbf{M}) = \frac{1}{Z}\exp(-\frac{1}{2}d_\mathbf{M} + (\mathbf{x}, \mu)) + = \frac{1}{Z}\exp(-\frac{1}{2}((\mathbf{x} - \mu)^T\mathbf{M} + (\mathbf{x} - \mu)) + +where :math:`Z` is the normalization constant, the inverse of Mahalanobis +matrix :math:`\mathbf{M}^{-1}` is the covariance of the Gaussian. + +Given pairs of similar points :math:`S` and pairs of dissimilar points +:math:`D`, the distance metric learning problem is to minimize the LogDet +divergence, which is equivalent as minimizing :math:`\textbf{KL}(p(\mathbf{x}; +\mathbf{M}_0) || p(\mathbf{x}; \mathbf{M}))`: + +.. math:: + + \min_\mathbf{A} D_{\ell \mathrm{d}}\left(M, M_{0}\right) = + \operatorname{tr}\left(M M_{0}^{-1}\right)-\log \operatorname{det} + \left(M M_{0}^{-1}\right)-n\\ + \text{subject to } \quad d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j) + \leq u \qquad (\mathbf{x}_i, \mathbf{x}_j)\in S \\ + d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j) \geq l \qquad (\mathbf{x}_i, + \mathbf{x}_j)\in D + + +where :math:`u` and :math:`l` is the upper and the lower bound of distance +for similar and dissimilar pairs respectively, and :math:`\mathbf{M}_0` +is the prior distance metric, set to identity matrix by default, +:math:`D_{\ell \mathrm{d}}(\cdot)` is the log determinant. + +.. rubric:: Example Code + +:: + + from metric_learn import ITML + + pairs = [[[1.2, 7.5], [1.3, 1.5]], + [[6.4, 2.6], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6]], + [[6.2, 5.5], [5.4, 5.4]]] + y = [1, 1, -1, -1] + + # in this task we want points where the first feature is close to be closer + # to each other, no matter how close the second feature is + + + itml = ITML() + itml.fit(pairs, y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Jason V. Davis, et al. `Information-theoretic Metric Learning `_. ICML 2007. + + [2]. Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/itml/ . + + +.. 
_sdml: + +:py:class:`SDML ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sparse High-Dimensional Metric Learning +(:py:class:`SDML `) + +`SDML` is an efficient sparse metric learning in high-dimensional space via +double regularization: an L1-penalization on the off-diagonal elements of the +Mahalanobis matrix :math:`\mathbf{M}`, and a log-determinant divergence between +:math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either :math:`\mathbf{I}` +or :math:`\mathbf{\Omega}^{-1}`, where :math:`\mathbf{\Omega}` is the +covariance matrix). + +The formulated optimization on the semidefinite matrix :math:`\mathbf{M}` +is convex: + +.. math:: + + \min_{\mathbf{M}} = \text{tr}((\mathbf{M}_0 + \eta \mathbf{XLX}^{T}) + \cdot \mathbf{M}) - \log\det \mathbf{M} + \lambda ||\mathbf{M}||_{1, off} + +where :math:`\mathbf{X}=[\mathbf{x}_1, \mathbf{x}_2, ..., \mathbf{x}_n]` is +the training data, the incidence matrix :math:`\mathbf{K}_{ij} = 1` if +:math:`(\mathbf{x}_i, \mathbf{x}_j)` is a similar pair, otherwise -1. The +Laplacian matrix :math:`\mathbf{L}=\mathbf{D}-\mathbf{K}` is calculated from +:math:`\mathbf{K}` and :math:`\mathbf{D}`, a diagonal matrix whose entries are +the sums of the row elements of :math:`\mathbf{K}`., :math:`||\cdot||_{1, off}` +is the off-diagonal L1 norm. + + +.. rubric:: Example Code + +:: + + from metric_learn import SDML + + pairs = [[[1.2, 7.5], [1.3, 1.5]], + [[6.4, 2.6], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6]], + [[6.2, 5.5], [5.4, 5.4]]] + y = [1, 1, -1, -1] + + # in this task we want points where the first feature is close to be closer + # to each other, no matter how close the second feature is + + sdml = SDML() + sdml.fit(pairs, y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Qi et al. `An efficient sparse metric learning in high-dimensional space via L1-penalized log-determinant regularization `_. ICML 2009. + + [2]. Code adapted from https://gist.github.com/kcarnold/5439945 . + +.. 
_rca: + +:py:class:`RCA ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Relative Components Analysis (:py:class:`RCA `) + +`RCA` learns a full rank Mahalanobis distance metric based on a weighted sum of +in-chunklets covariance matrices. It applies a global linear transformation to +assign large weights to relevant dimensions and low weights to irrelevant +dimensions. Those relevant dimensions are estimated using "chunklets", subsets +of points that are known to belong to the same class. + +For a training set with :math:`n` training points in :math:`k` chunklets, the +algorithm is efficient since it simply amounts to computing + +.. math:: + + \mathbf{C} = \frac{1}{n}\sum_{j=1}^k\sum_{i=1}^{n_j} + (\mathbf{x}_{ji}-\hat{\mathbf{m}}_j) + (\mathbf{x}_{ji}-\hat{\mathbf{m}}_j)^T + + +where chunklet :math:`j` consists of :math:`\{\mathbf{x}_{ji}\}_{i=1}^{n_j}` +with a mean :math:`\hat{m}_j`. The inverse of :math:`\mathbf{C}^{-1}` is used +as the Mahalanobis matrix. + +.. rubric:: Example Code + +:: + + from metric_learn import RCA + + X = [[-0.05, 3.0],[0.05, -3.0], + [0.1, -3.55],[-0.1, 3.55], + [-0.95, -0.05],[0.95, 0.05], + [0.4, 0.05],[-0.4, -0.05]] + chunks = [0, 0, 1, 1, 2, 2, 3, 3] + + rca = RCA() + rca.fit(X, chunks) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Shental et al. `Adjustment learning and relevant component analysis `_. ECCV 2002. + + [2]. Bar-Hillel et al. `Learning distance functions using equivalence relations `_. ICML 2003. + + [3]. Bar-Hillel et al. `Learning a Mahalanobis metric from equivalence constraints `_. JMLR 2005. + +.. _mmc: + +:py:class:`MMC ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Metric Learning with Application for Clustering with Side Information +(:py:class:`MMC `) + +`MMC` minimizes the sum of squared distances between similar points, while +enforcing the sum of distances between dissimilar ones to be greater than one. 
+This leads to a convex and, thus, local-minima-free optimization problem that +can be solved efficiently. +However, the algorithm involves the computation of eigenvalues, which is the +main speed-bottleneck. Since it has initially been designed for clustering +applications, one of the implicit assumptions of MMC is that all classes form +a compact set, i.e., follow a unimodal distribution, which restricts the +possible use-cases of this method. However, it is one of the earliest and a +still often cited technique. + +The algorithm aims at minimizing the sum of distances between all the similar +points, while constrains the sum of distances between dissimilar points: + +.. math:: + + \min_{\mathbf{M}\in\mathbb{S}_+^d}\sum_{(\mathbf{x}_i, + \mathbf{x}_j)\in S} d_{\mathbf{M}}(\mathbf{x}_i, \mathbf{x}_j) + \qquad \qquad \text{s.t.} \qquad \sum_{(\mathbf{x}_i, \mathbf{x}_j) + \in D} d^2_{\mathbf{M}}(\mathbf{x}_i, \mathbf{x}_j) \geq 1 + +.. rubric:: Example Code + +:: + + from metric_learn import MMC + + pairs = [[[1.2, 7.5], [1.3, 1.5]], + [[6.4, 2.6], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6]], + [[6.2, 5.5], [5.4, 5.4]]] + y = [1, 1, -1, -1] + + # in this task we want points where the first feature is close to be closer + # to each other, no matter how close the second feature is + + mmc = MMC() + mmc.fit(pairs, y) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Xing et al. `Distance metric learning with application to clustering with side-information `_. NIPS 2002. + + [2]. Adapted from Matlab code http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz . + +.. _learning_on_triplets: + +Learning on triplets +==================== + +Some metric learning algorithms learn on triplets of samples. In this case, +one should provide the algorithm with `n_samples` triplets of points. The +semantic of each triplet is that the first point should be closer to the +second point than to the third one. 
+
+Fitting
+-------
+Here is an example for fitting on triplets (see :ref:`fit_ws` for more
+details on the input data format and how to fit, in the general case of
+learning on tuples).
+
+>>> from metric_learn import SCML
+>>> triplets = np.array([[[1.2, 3.2], [2.3, 5.5], [2.1, 0.6]],
+>>>                      [[4.5, 2.3], [2.1, 2.3], [7.3, 3.4]]])
+>>> scml = SCML(random_state=42)
+>>> scml.fit(triplets)
+SCML(beta=1e-5, B=None, max_iter=100000, verbose=False,
+     preprocessor=None, random_state=None)
+
+Or alternatively (using a preprocessor):
+
+>>> X = np.array([[1.2, 3.2],
+>>>               [2.3, 5.5],
+>>>               [2.1, 0.6],
+>>>               [4.5, 2.3],
+>>>               [2.1, 2.3],
+>>>               [7.3, 3.4]])
+>>> triplets_indices = np.array([[0, 1, 2], [3, 4, 5]])
+>>> scml = SCML(preprocessor=X, random_state=42)
+>>> scml.fit(triplets_indices)
+SCML(beta=1e-5, B=None, max_iter=100000, verbose=False,
+     preprocessor=array([[1.2, 3.2],
+       [2.3, 5.5],
+       [2.1, 0.6],
+       [4.5, 2.3],
+       [2.1, 2.3],
+       [7.3, 3.4]]),
+     random_state=None)
+
+
+Here, we want to learn a metric that, for each of the two
+`triplets`, will make the first point closer to the
+second point than to the third one.
+
+.. _triplets_predicting:
+
+Prediction
+----------
+
+When a triplets learner is fitted, it is also able to predict, for an
+upcoming triplet, whether the first point is closer to the second point
+than to the third one (+1), or not (-1).
+
+>>> triplets_test = np.array(
+... [[[5.6, 5.3], [2.2, 2.1], [1.2, 3.4]],
+...  [[6.0, 4.2], [4.3, 1.2], [0.1, 7.8]]])
+>>> scml.predict(triplets_test)
+array([-1.,  1.])
+
+.. _triplets_scoring:
+
+Scoring
+-------
+
+Triplet metric learners can also return a `decision_function` for a set of triplets,
+which corresponds to the distance between the first two points minus the distance
+between the first and last points of the triplet (the higher the value, the more
+similar the first point to the second point compared to the last one). 
This "score" +can be interpreted as a measure of likeliness of having a +1 prediction for this +triplet. + +>>> scml.decision_function(triplets_test) +array([-1.75700306, 4.98982131]) + +In the above example, for the first triplet in `triplets_test`, the first +point is predicted less similar to the second point than to the last point +(they are further away in the transformed space). + +Unlike pairs learners, triplets learners do not allow to give a `y` when fitting: we +assume that the ordering of points within triplets is such that the training triplets +are all positive. Therefore, it is not possible to use scikit-learn scoring functions +(such as 'f1_score') for triplets learners. + +However, triplets learners do have a default scoring function, which will +basically return the accuracy score on a given test set, i.e. the proportion +of triplets that have the right predicted ordering. + +>>> scml.score(triplets_test) +0.5 + +.. note:: + See :ref:`fit_ws` for more details on metric learners functions that are + not specific to learning on pairs, like `transform`, `pair_distance`, + `pair_score`, `get_metric` and `get_mahalanobis_matrix`. + + + + +Algorithms +---------- + +.. _scml: + +:py:class:`SCML ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sparse Compositional Metric Learning +(:py:class:`SCML `) + +`SCML` learns a squared Mahalanobis distance from triplet constraints by +optimizing sparse positive weights assigned to a set of :math:`K` rank-one +PSD bases. This can be formulated as an optimization problem with only +:math:`K` parameters, that can be solved with an efficient stochastic +composite scheme. + +The Mahalanobis matrix :math:`M` is built from a basis set :math:`B = \{b_i\}_{i=\{1,...,K\}}` +weighted by a :math:`K` dimensional vector :math:`w = \{w_i\}_{i=\{1,...,K\}}` as: + +.. 
math:: + + M = \sum_{i=1}^K w_i b_i b_i^T = B \cdot diag(w) \cdot B^T \quad w_i \geq 0 + +Learning :math:`M` in this form makes it PSD by design, as it is a +nonnegative sum of PSD matrices. The basis set :math:`B` is fixed in advance +and it is possible to construct it from the data. The optimization problem +over :math:`w` is formulated as a classic margin-based hinge loss function +involving the set :math:`C` of triplets. A regularization :math:`\ell_1` +is added to yield a sparse combination. The formulation is the following: + +.. math:: + + \min_{w\geq 0} \sum_{(x_i,x_j,x_k)\in C} [1 + d_w(x_i,x_j)-d_w(x_i,x_k)]_+ + \beta||w||_1 + +where :math:`[\cdot]_+` is the hinge loss. + +.. rubric:: Example Code + +:: + + from metric_learn import SCML + + triplets = [[[1.2, 7.5], [1.3, 1.5], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6], [5.4, 5.4]], + [[3.2, 7.5], [3.3, 1.5], [8.2, 9.7]], + [[3.3, 4.5], [5.2, 4.6], [7.4, 5.4]]] + + scml = SCML() + scml.fit(triplets) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. `_. (AAAI), 2014. + + [2]. Adapted from original `Matlab implementation. `_. + + +.. _learning_on_quadruplets: + +Learning on quadruplets +======================= + +Some metric learning algorithms learn on quadruplets of samples. In this case, +one should provide the algorithm with `n_samples` quadruplets of points. The +semantic of each quadruplet is that the first two points should be closer +together than the last two points. + +Fitting +------- +Here is an example for fitting on quadruplets (see :ref:`fit_ws` for more +details on the input data format and how to fit, in the general case of +learning on tuples). 
+ +>>> from metric_learn import LSML +>>> quadruplets = np.array([[[1.2, 3.2], [2.3, 5.5], [2.4, 6.7], [2.1, 0.6]], +>>> [[4.5, 2.3], [2.1, 2.3], [0.6, 1.2], [7.3, 3.4]]]) +>>> lsml = LSML(random_state=42) +>>> lsml.fit(quadruplets) +LSML(max_iter=1000, preprocessor=None, prior=None, random_state=42, tol=0.001, + verbose=False) + +Or alternatively (using a preprocessor): + +>>> X = np.array([[1.2, 3.2], +>>> [2.3, 5.5], +>>> [2.4, 6.7], +>>> [2.1, 0.6], +>>> [4.5, 2.3], +>>> [2.1, 2.3], +>>> [0.6, 1.2], +>>> [7.3, 3.4]]) +>>> quadruplets_indices = np.array([[0, 1, 2, 3], [4, 5, 6, 7]]) +>>> lsml = LSML(preprocessor=X, random_state=42) +>>> lsml.fit(quadruplets_indices) +LSML(max_iter=1000, + preprocessor=array([[1.2, 3.2], + [2.3, 5.5], + [2.4, 6.7], + [2.1, 0.6], + [4.5, 2.3], + [2.1, 2.3], + [0.6, 1.2], + [7.3, 3.4]]), + prior=None, random_state=42, tol=0.001, verbose=False) + + +Here, we want to learn a metric that, for each of the two +`quadruplets`, will put the two first points closer together than the two +last points. + +.. _quadruplets_predicting: + +Prediction +---------- + +When a quadruplets learner is fitted, it is also able to predict, for an +upcoming quadruplet, whether the two first points are more similar than the +two last points (+1), or not (-1). + +>>> quadruplets_test = np.array( +... [[[5.6, 5.3], [2.2, 2.1], [0.4, 0.6], [1.2, 3.4]], +... [[6.0, 4.2], [4.3, 1.2], [4.5, 0.6], [0.1, 7.8]]]) +>>> lsml.predict(quadruplets_test) +array([-1., 1.]) + +.. _quadruplets_scoring: + +Scoring +------- + +Quadruplet metric learners can also return a `decision_function` for a set of +quadruplets, which corresponds to the distance between the first pair of points minus +the distance between the second pair of points of the triplet (the higher the value, +the more similar the first pair is than the last pair). +This "score" can be interpreted as a measure of likeliness of having a +1 prediction +for this quadruplet. 
+
+>>> lsml.decision_function(quadruplets_test)
+array([-1.75700306,  4.98982131])
+
+In the above example, for the first quadruplet in `quadruplets_test`, the
+two first points are predicted less similar than the two last points (they
+are further away in the transformed space).
+
+Like triplet learners, quadruplets learners do not allow to give a `y` when fitting: we
+assume that the ordering of points within quadruplets is such that the training
+quadruplets are all positive. Therefore, it is not possible to use scikit-learn
+scoring functions (such as 'f1_score') for quadruplets learners.
+
+However, quadruplets learners do have a default scoring function, which will
+basically return the accuracy score on a given test set, i.e. the proportion
+of quadruplets that have the right predicted ordering.
+
+>>> lsml.score(quadruplets_test)
+0.5
+
+.. note::
+   See :ref:`fit_ws` for more details on metric learners functions that are
+   not specific to learning on pairs, like `transform`, `pair_distance`,
+   `pair_score`, `get_metric` and `get_mahalanobis_matrix`.
+
+
+
+
+Algorithms
+----------
+
+.. _lsml:
+
+:py:class:`LSML <metric_learn.LSML>`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Metric Learning from Relative Comparisons by Minimizing Squared Residual
+(:py:class:`LSML <metric_learn.LSML>`)
+
+`LSML` proposes a simple, yet effective, algorithm that minimizes a convex
+objective function corresponding to the sum of squared residuals of
+constraints. This algorithm uses the constraints in the form of the
+relative distance comparisons, such method is especially useful where
+pairwise constraints are not natural to obtain, thus pairwise constraints
+based algorithms become infeasible to be deployed. Furthermore, its sparsity
+extension leads to more stable estimation when the dimension is high and
+only a small amount of constraints is given.
+
+The loss function of each constraint
+:math:`d(\mathbf{x}_i, \mathbf{x}_j) < d(\mathbf{x}_k, \mathbf{x}_l)` is
+denoted as:
+
+.. 
math:: + + H(d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j) + - d_\mathbf{M}(\mathbf{x}_k, \mathbf{x}_l)) + +where :math:`H(\cdot)` is the squared Hinge loss function defined as: + +.. math:: + + H(x) = \left\{\begin{aligned}0 \qquad x\leq 0 \\ + \,\,x^2 \qquad x>0\end{aligned}\right.\\ + +The summed loss function :math:`L(C)` is the simple sum over all constraints +:math:`C = \{(\mathbf{x}_i , \mathbf{x}_j , \mathbf{x}_k , \mathbf{x}_l) +: d(\mathbf{x}_i , \mathbf{x}_j) < d(\mathbf{x}_k , \mathbf{x}_l)\}`. The +original paper suggested here should be a weighted sum since the confidence +or probability of each constraint might differ. However, for the sake of +simplicity and assumption of no extra knowledge provided, we just deploy +the simple sum here as well as what the authors did in the experiments. + +The distance metric learning problem becomes minimizing the summed loss +function of all constraints plus a regularization term w.r.t. the prior +knowledge: + +.. math:: + + \min_\mathbf{M}(D_{ld}(\mathbf{M, M_0}) + \sum_{(\mathbf{x}_i, + \mathbf{x}_j, \mathbf{x}_k, \mathbf{x}_l)\in C}H(d_\mathbf{M}( + \mathbf{x}_i, \mathbf{x}_j) - d_\mathbf{M}(\mathbf{x}_k, \mathbf{x}_l))\\ + +where :math:`\mathbf{M}_0` is the prior metric matrix, set as identity +by default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence: + +.. math:: + + D_{ld}(\mathbf{M, M_0}) = \text{tr}(\mathbf{MM_0}) − \text{logdet} + (\mathbf{M}) + +.. rubric:: Example Code + +:: + + from metric_learn import LSML + + quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]], + [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]], + [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]] + + # we want to make closer points where the first feature is close, and + # further if the second feature is close + + lsml = LSML() + lsml.fit(quadruplets) + +.. rubric:: References + + +.. container:: hatnote hatnote-gray + + [1]. Liu et al. 
`Metric Learning from Relative Comparisons by Minimizing Squared Residual `_. ICDM 2012. + + [2]. Code adapted from https://gist.github.com/kcarnold/5439917 . + + diff --git a/examples/README.txt b/examples/README.txt new file mode 100644 index 00000000..10dbe0d5 --- /dev/null +++ b/examples/README.txt @@ -0,0 +1,4 @@ +Examples +======== + +Below is a gallery of example metric-learn use cases. \ No newline at end of file diff --git a/examples/plot_metric_learning_examples.py b/examples/plot_metric_learning_examples.py new file mode 100644 index 00000000..32759636 --- /dev/null +++ b/examples/plot_metric_learning_examples.py @@ -0,0 +1,495 @@ +""" +Algorithms walkthrough +~~~~~~~~~~~~~~~~~~~~~~ + +This is a small walkthrough which illustrates most of the Metric Learning +algorithms implemented in metric-learn by using them on synthetic data, +with some visualizations to provide intuitions into what they are designed +to achieve. +""" + +# License: BSD 3 clause +# Authors: Bhargav Srinivasa Desikan +# William de Vazelhes + +###################################################################### +# Imports +# ^^^^^^^ +# .. note:: +# +# In order to show the charts of the examples you need a graphical +# ``matplotlib`` backend installed. For intance, use ``pip install pyqt5`` +# to get Qt graphical interface or use your favorite one. + +from sklearn.manifold import TSNE + +import metric_learn +import numpy as np +from sklearn.datasets import make_classification, make_regression + +# visualisation imports +import matplotlib.pyplot as plt +np.random.seed(42) + + +###################################################################### +# Loading our dataset and setting up plotting +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# We will be using a synthetic dataset to illustrate the plotting, +# using the function `sklearn.datasets.make_classification` from +# scikit-learn. 
The dataset will contain: +# - 100 points in 3 classes with 2 clusters per class +# - 5 features, among which 3 are informative (correlated with the class +# labels) and two are random noise with large magnitude + +X, y = make_classification(n_samples=100, n_classes=3, n_clusters_per_class=2, + n_informative=3, class_sep=4., n_features=5, + n_redundant=0, shuffle=True, + scale=[1, 1, 20, 20, 20]) + +########################################################################### +# Note that the dimensionality of the data is 5, so to plot the +# transformed data in 2D, we will use the t-sne algorithm. (See +# `sklearn.manifold.TSNE`). + + +def plot_tsne(X, y, colormap=plt.cm.Paired): + plt.figure(figsize=(8, 6)) + + # clean the figure + plt.clf() + + tsne = TSNE() + X_embedded = tsne.fit_transform(X) + plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, cmap=colormap) + + plt.xticks(()) + plt.yticks(()) + + plt.show() + +################################### +# Let's now plot the dataset as is. + + +plot_tsne(X, y) + +######################################################################### +# We can see that the classes appear mixed up: this is because t-sne +# is based on preserving the original neighborhood of points in the embedding +# space, but this original neighborhood is based on the euclidean +# distance in the input space, in which the contribution of the noisy +# features is high. So even if points from the same class are close to each +# other in some subspace of the input space, this is not the case when +# considering all dimensions of the input space. +# +# Metric Learning +# ^^^^^^^^^^^^^^^ +# +# Why is Metric Learning useful? We can, with prior knowledge of which +# points are supposed to be closer, figure out a better way to compute +# distances between points for the task at hand. Especially in higher +# dimensions when Euclidean distances are a poor way to measure distance, this +# becomes very useful. 
+# +# Basically, we learn this distance: +# :math:`D(x, x') = \sqrt{(x-x')^\top M(x-x')}`. And we learn the parameters +# :math:`M` of this distance to satisfy certain constraints on the distance +# between points, for example requiring that points of the same class are +# close together and points of different class are far away. +# +# For more information, check the :ref:`intro_metric_learning` section +# from the documentation. Some good reading material can also be found +# `here `__. It serves as a +# good literature review of Metric Learning. +# +# We will briefly explain the metric learning algorithms implemented by +# metric-learn, before providing some examples for its usage, and also +# discuss how to perform metric learning with weaker supervision than class +# labels. +# +# Metric-learn can be easily integrated with your other machine learning +# pipelines, and follows scikit-learn conventions. +# + + +###################################################################### +# Large Margin Nearest Neighbour +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# LMNN is a metric learning algorithm primarily designed for k-nearest +# neighbor classification. The algorithm is based on semidefinite +# programming, a sub-class of convex programming (as most Metric Learning +# algorithms are). +# +# The main intuition behind LMNN is to learn a pseudometric under which +# all data instances in the training set are surrounded by at least k +# instances that share the same class label. If this is achieved, the +# leave-one-out error (a special case of cross validation) is minimized. +# You'll notice that the points from the same labels are closer together, +# but they are not necessary in a same cluster. This is particular to LMNN +# and we'll see that some other algorithms implicitly enforce points from +# the same class to cluster together. 
+# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`LMNN +# ` + + +###################################################################### +# Fit and then transform! +# ----------------------- +# + +# setting up LMNN +lmnn = metric_learn.LMNN(n_neighbors=5, learn_rate=1e-6) + +# fit the data! +lmnn.fit(X, y) + +# transform our input space +X_lmnn = lmnn.transform(X) + + +###################################################################### +# So what have we learned? The matrix :math:`M` we talked about before. + + +###################################################################### +# Now let us plot the transformed space - this tells us what the original +# space looks like after being transformed with the new learned metric. +# + +plot_tsne(X_lmnn, y) + + +###################################################################### +# Pretty neat, huh? +# +# The rest of this notebook will briefly explain the other Metric Learning +# algorithms before plotting them. Also, while we have first run ``fit`` +# and then ``transform`` to see our data transformed, we can also use +# ``fit_transform``. The rest of the examples and illustrations will use +# ``fit_transform``. + +###################################################################### +# Information Theoretic Metric Learning +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# ITML uses a regularizer that automatically enforces a Semi-Definite +# Positive Matrix condition - the LogDet divergence. It uses soft +# must-link or cannot-link constraints, and a simple algorithm based on +# Bregman projections. Unlike LMNN, ITML will implicitly enforce points from +# the same class to belong to the same cluster, as you can see below. 
+# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`ITML +# ` + +itml = metric_learn.ITML_Supervised() +X_itml = itml.fit_transform(X, y) + +plot_tsne(X_itml, y) + + +###################################################################### +# Mahalanobis Metric for Clustering +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# MMC is an algorithm that will try to minimize the distance between similar +# points, while ensuring that the sum of distances between dissimilar points is +# higher than a threshold. This is done by optimizing a cost function +# subject to an inequality constraint. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`MMC +# ` + +mmc = metric_learn.MMC_Supervised() +X_mmc = mmc.fit_transform(X, y) + +plot_tsne(X_mmc, y) + +###################################################################### +# Sparse Determinant Metric Learning +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Implements an efficient sparse metric learning algorithm in high +# dimensional space via an :math:`l_1`-penalized log-determinant +# regularization. Compared to the most existing distance metric learning +# algorithms, the algorithm exploits the sparsity nature underlying the +# intrinsic high dimensional feature space. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`SDML +# ` + +sdml = metric_learn.SDML_Supervised(sparsity_param=0.1, balance_param=0.0015, + prior='covariance') +X_sdml = sdml.fit_transform(X, y) + +plot_tsne(X_sdml, y) + + +###################################################################### +# Least Squares Metric Learning +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# LSML is a simple, yet effective, algorithm that learns a Mahalanobis +# metric from a given set of relative comparisons. 
This is done by +# formulating and minimizing a convex loss function that corresponds to +# the sum of squared hinge loss of violated constraints. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`LSML +# ` + +lsml = metric_learn.LSML_Supervised(tol=0.0001, max_iter=10000, + prior='covariance') +X_lsml = lsml.fit_transform(X, y) + +plot_tsne(X_lsml, y) + + +###################################################################### +# Neighborhood Components Analysis +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# NCA is an extremly popular metric learning algorithm. +# +# Neighborhood components analysis aims at "learning" a distance metric +# by finding a linear transformation of input data such that the average +# leave-one-out (LOO) classification performance of a soft-nearest +# neighbors rule is maximized in the transformed space. The key insight to +# the algorithm is that a matrix :math:`A` corresponding to the +# transformation can be found by defining a differentiable objective function +# for :math:`A`, followed by use of an iterative solver such as +# `scipy.optimize.fmin_l_bfgs_b`. Like LMNN, this algorithm does not try to +# cluster points from the same class in a unique cluster, because it +# enforces conditions at a local neighborhood scale. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`NCA +# ` + +nca = metric_learn.NCA(max_iter=1000) +X_nca = nca.fit_transform(X, y) + +plot_tsne(X_nca, y) + +###################################################################### +# Local Fisher Discriminant Analysis +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# LFDA is a linear supervised dimensionality reduction method. It is +# particularly useful when dealing with multimodality, where one ore more +# classes consist of separate clusters in input space. The core +# optimization problem of LFDA is solved as a generalized eigenvalue +# problem. 
Like LMNN, and NCA, this algorithm does not try to cluster points +# from the same class in a unique cluster. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`LFDA +# ` + +lfda = metric_learn.LFDA(k=2, n_components=2) +X_lfda = lfda.fit_transform(X, y) + +plot_tsne(X_lfda, y) + + +###################################################################### +# Relative Components Analysis +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# RCA is another one of the older algorithms. It learns a full rank +# Mahalanobis distance metric based on a weighted sum of in-class +# covariance matrices. It applies a global linear transformation to assign +# large weights to relevant dimensions and low weights to irrelevant +# dimensions. Those relevant dimensions are estimated using "chunklets", +# subsets of points that are known to belong to the same class. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`RCA +# ` + +rca = metric_learn.RCA_Supervised(n_chunks=30, chunk_size=2) +X_rca = rca.fit_transform(X, y) + +plot_tsne(X_rca, y) + +###################################################################### +# Regression example: Metric Learning for Kernel Regression +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# The previous algorithms took as input a dataset with class labels. Metric +# learning can also be useful for regression, when the labels are real numbers. +# An algorithm very similar to NCA but for regression is Metric +# Learning for Kernel Regression (MLKR). It will optimize for the average +# leave-one-out *regression* performance from a soft-nearest neighbors +# regression. +# +# - See more in the :ref:`User Guide ` +# - See more in the documentation of the class :py:class:`MLKR +# ` +# +# To illustrate MLKR, let's use the dataset +# `sklearn.datasets.make_regression` the same way as we did with the +# classification before. 
The dataset will contain: 100 points of 5 features +# each, among which 3 are informative (i.e., used to generate the +# regression target from a linear model), and two are random noise with the +# same magnitude. + +X_reg, y_reg = make_regression(n_samples=100, n_informative=3, n_features=5, + shuffle=True) + +###################################################################### +# Let's plot the dataset as is + +plot_tsne(X_reg, y_reg, plt.cm.Oranges) + +###################################################################### +# And let's plot the dataset after transformation by MLKR: +mlkr = metric_learn.MLKR() +X_mlkr = mlkr.fit_transform(X_reg, y_reg) +plot_tsne(X_mlkr, y_reg, plt.cm.Oranges) + +###################################################################### +# Points that have the same value to regress are now closer to each +# other ! This would improve the performance of +# `sklearn.neighbors.KNeighborsRegressor` for instance. + + +###################################################################### +# Metric Learning from Weaker Supervision +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# To learn the metric, so far we have always given the labels of the +# data to supervise the algorithms. However, in many applications, +# it is easier to obtain information about whether two samples are +# similar or dissimilar. For instance, when annotating a dataset of face +# images, it is easier for an annotator to tell if two faces belong to the same +# person or not, rather than finding the ID of the face among a huge database +# of every person's faces. +# Note that for some problems (e.g., in information +# retrieval where the goal is to rank documents by similarity to a query +# document), there is no notion of individual label but one can gather +# information on which pairs of points are similar or dissimilar. +# Fortunately, one of the strength of metric learning is the ability to +# learn from such weaker supervision. 
Indeed, some of the algorithms we've +# used above have alternate ways to pass some supervision about the metric +# we want to learn. The way to go is to pass a 2D array `pairs` of pairs, +# as well as an array of labels `pairs_labels` such that for each `i` between +# `0` and `n_pairs` we want `X[pairs[i, 0], :]` and `X[pairs[i, 1], :]` to be +# similar if `pairs_labels[i] == 1`, and we want them to be dissimilar if +# `pairs_labels[i] == -1`. In other words, we +# want to enforce a metric that projects similar points closer together and +# dissimilar points further away from each other. This kind of input is +# possible for ITML, SDML, and MMC. See :ref:`weakly_supervised_section` for +# details on other kinds of weak supervision that some algorithms can work +# with. +# +# For the purpose of this example, we're going to explicitly create these +# pairwise constraints through the labels we have, i.e. `y`. +# Do keep in mind that we are doing this method because we know the labels +# - we can actually create the constraints any way we want to depending on +# the data! +# +# Note that this is what metric-learn did under the hood in the previous +# examples (do check out the +# `constraints` module!) - but we'll try our own version of this. We're +# going to go ahead and assume that two points labeled the same will be +# closer than two points in different labels. + + +def create_constraints(labels): + import itertools + import random + + # aggregate indices of same class + zeros = np.where(y == 0)[0] + ones = np.where(y == 1)[0] + twos = np.where(y == 2)[0] + # make permutations of all those points in the same class + zeros_ = list(itertools.combinations(zeros, 2)) + ones_ = list(itertools.combinations(ones, 2)) + twos_ = list(itertools.combinations(twos, 2)) + # put them together! 
+ sim = np.array(zeros_ + ones_ + twos_) + + # similarily, put together indices in different classes + dis = [] + for zero in zeros: + for one in ones: + dis.append((zero, one)) + for two in twos: + dis.append((zero, two)) + for one in ones: + for two in twos: + dis.append((one, two)) + + # pick up just enough dissimilar examples as we have similar examples + dis = np.array(random.sample(dis, len(sim))) + + # return an array of pairs of indices of shape=(2*len(sim), 2), and the + # corresponding labels, array of shape=(2*len(sim)) + # Each pair of similar points have a label of +1 and each pair of + # dissimilar points have a label of -1 + return (np.vstack([np.column_stack([sim[:, 0], sim[:, 1]]), + np.column_stack([dis[:, 0], dis[:, 1]])]), + np.concatenate([np.ones(len(sim)), -np.ones(len(sim))])) + + +pairs, pairs_labels = create_constraints(y) + + +###################################################################### +# Now that we've created our constraints, let's see what it looks like! +# + +print(pairs) +print(pairs_labels) + + +###################################################################### +# Using our constraints, let's now train ITML again. Note that we are no +# longer calling the supervised class :py:class:`ITML_Supervised +# ` but the more generic +# (weakly-supervised) :py:class:`ITML `, which +# takes the dataset `X` through the `preprocessor` argument (see +# :ref:`this section ` of the documentation to learn +# about more advanced uses of `preprocessor`) and the pair information `pairs` +# and `pairs_labels` in the fit method. + +itml = metric_learn.ITML(preprocessor=X) +itml.fit(pairs, pairs_labels) + +X_itml = itml.transform(X) + +plot_tsne(X_itml, y) + + +###################################################################### +# And that's the result of ITML after being trained on our manually +# constructed constraints! A bit different from our old result, but not too +# different. 
+# +# RCA and LSML also have their own specific ways of taking in inputs - +# it's worth one's while to poke around in the constraints.py file to see +# how exactly this is going on. +# +# Finally, one of the main advantages of metric-learn is its out-of-the box +# compatibility with scikit-learn, for doing `model selection +# `__, +# cross-validation, and scoring for instance. Indeed, supervised algorithms are +# regular `sklearn.base.TransformerMixin` that can be plugged into any +# pipeline or cross-validation procedure. And weakly-supervised estimators are +# also compatible with scikit-learn, since their input dataset format described +# above allows to be sliced along the first dimension when doing +# cross-validations (see also this :ref:`section `). You +# can also look at some :ref:`use cases ` where you could combine +# metric-learn with scikit-learn estimators. + +######################################################################## +# This brings us to the end of this tutorial! Have fun Metric Learning :) diff --git a/examples/sandwich.py b/examples/plot_sandwich.py similarity index 74% rename from examples/sandwich.py rename to examples/plot_sandwich.py index 34b48a00..740852be 100644 --- a/examples/sandwich.py +++ b/examples/plot_sandwich.py @@ -1,13 +1,25 @@ +# -*- coding: utf-8 -*- """ +Sandwich demo +============= + Sandwich demo based on code from http://nbviewer.ipython.org/6576096 """ +###################################################################### +# .. note:: +# +# In order to show the charts of the examples you need a graphical +# ``matplotlib`` backend installed. For intance, use ``pip install pyqt5`` +# to get Qt graphical interface or use your favorite one. 
+ import numpy as np from matplotlib import pyplot as plt from sklearn.metrics import pairwise_distances from sklearn.neighbors import NearestNeighbors -from metric_learn import LMNN, ITML_Supervised, LSML_Supervised, SDML_Supervised +from metric_learn import (LMNN, ITML_Supervised, LSML_Supervised, + SDML_Supervised) def sandwich_demo(): @@ -23,14 +35,14 @@ def sandwich_demo(): mls = [ LMNN(), - ITML_Supervised(num_constraints=200), - SDML_Supervised(num_constraints=200), - LSML_Supervised(num_constraints=200), + ITML_Supervised(n_constraints=200), + SDML_Supervised(n_constraints=200, balance_param=0.001), + LSML_Supervised(n_constraints=200), ] for ax_num, ml in enumerate(mls, start=3): ml.fit(x, y) - tx = ml.transform() + tx = ml.transform(x) ml_knn = nearest_neighbors(tx, k=2) ax = plt.subplot(3, 2, ax_num) plot_sandwich_data(tx, y, axis=ax) @@ -43,10 +55,10 @@ def sandwich_demo(): # TODO: use this somewhere def visualize_class_separation(X, labels): - _, (ax1,ax2) = plt.subplots(ncols=2) + _, (ax1, ax2) = plt.subplots(ncols=2) label_order = np.argsort(labels) ax1.imshow(pairwise_distances(X[label_order]), interpolation='nearest') - ax2.imshow(pairwise_distances(labels[label_order,None]), + ax2.imshow(pairwise_distances(labels[label_order, None]), interpolation='nearest') @@ -73,19 +85,19 @@ def sandwich_data(): for k, xc in enumerate(x_centers): data[i, k, 0] = np.random.normal(xc, 0.1) data[i, k, 1] = np.random.normal(yc, 0.1) - labels[i,:] = i + labels[i, :] = i return data.reshape((-1, 2)), labels.ravel() def plot_sandwich_data(x, y, axis=plt, colors='rbgmky'): for idx, val in enumerate(np.unique(y)): - xi = x[y==val] + xi = x[y == val] axis.scatter(*xi.T, s=50, facecolors='none', edgecolors=colors[idx]) def plot_neighborhood_graph(x, nn, y, axis=plt, colors='rbgmky'): for i, a in enumerate(x): - b = x[nn[i,1]] + b = x[nn[i, 1]] axis.plot((a[0], b[0]), (a[1], b[1]), colors[y[i]]) diff --git a/metric_learn/__init__.py b/metric_learn/__init__.py index 
cc60049d..92823fb1 100644 --- a/metric_learn/__init__.py +++ b/metric_learn/__init__.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from .constraints import Constraints from .covariance import Covariance from .itml import ITML, ITML_Supervised @@ -9,3 +7,14 @@ from .nca import NCA from .lfda import LFDA from .rca import RCA, RCA_Supervised +from .mlkr import MLKR +from .mmc import MMC, MMC_Supervised +from .scml import SCML, SCML_Supervised + +from ._version import __version__ + +__all__ = ['Constraints', 'Covariance', 'ITML', 'ITML_Supervised', + 'LMNN', 'LSML', 'LSML_Supervised', 'SDML', + 'SDML_Supervised', 'NCA', 'LFDA', 'RCA', 'RCA_Supervised', + 'MLKR', 'MMC', 'MMC_Supervised', 'SCML', + 'SCML_Supervised', '__version__'] diff --git a/metric_learn/_util.py b/metric_learn/_util.py new file mode 100644 index 00000000..868ececa --- /dev/null +++ b/metric_learn/_util.py @@ -0,0 +1,787 @@ +import numpy as np +from numpy.linalg import LinAlgError +from sklearn.datasets import make_spd_matrix +from sklearn.decomposition import PCA +from sklearn.utils import check_array +from sklearn.utils.validation import check_X_y, check_random_state +from .exceptions import PreprocessorError, NonPSDError +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from scipy.linalg import pinvh, eigh +import sys +import time +import warnings + +# hack around lack of axis kwarg in older numpy versions +try: + np.linalg.norm([[4]], axis=1) +except TypeError: + def vector_norm(X): + return np.apply_along_axis(np.linalg.norm, 1, X) +else: + def vector_norm(X): + return np.linalg.norm(X, axis=1) + + +def check_input(input_data, y=None, preprocessor=None, + type_of_inputs='classic', tuple_size=None, accept_sparse=False, + dtype='numeric', order=None, + copy=False, force_all_finite=True, + multi_output=False, ensure_min_samples=1, + ensure_min_features=1, y_numeric=False, estimator=None): + """Checks that the input format is valid, and converts it if specified + 
(this is the equivalent of scikit-learn's `check_array` or `check_X_y`). + All arguments following tuple_size are scikit-learn's `check_X_y` + arguments that will be enforced on the data and labels array. If + indicators are given as an input data array, the returned data array + will be the formed points/tuples, using the given preprocessor. + + Parameters + ---------- + input : array-like + The input data array to check. + + y : array-like + The input labels array to check. + + preprocessor : callable (default=`None`) + The preprocessor to use. If None, no preprocessor is used. + + type_of_inputs : `str` {'classic', 'tuples'} + The type of inputs to check. If 'classic', the input should be + a 2D array-like of points or a 1D array like of indicators of points. If + 'tuples', the input should be a 3D array-like of tuples or a 2D + array-like of indicators of tuples. + + accept_sparse : `bool` + Set to true to allow sparse inputs (only works for sparse inputs with + dim < 3). + + tuple_size : int + The number of elements in a tuple (e.g. 2 for pairs). + + dtype : string, type, list of types or None (default='numeric') + Data type of result. If None, the dtype of the input is preserved. + If 'numeric', dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=`None`) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in X. This parameter + does not influence whether y can have np.inf or np.nan values. + The possibilities are: + - True: Force all values of X to be finite. + - False: accept both np.inf and np.nan in X. 
+ - 'allow-nan': accept only np.nan values in X. Values cannot be + infinite. + + ensure_min_samples : int (default=1) + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when X has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. + + estimator : str or estimator instance (default=`None`) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + X : `numpy.ndarray` + The checked input data array. + + y: `numpy.ndarray` (optional) + The checked input labels array. + """ + + context = make_context(estimator) + + args_for_sk_checks = dict(accept_sparse=accept_sparse, + dtype=dtype, order=order, + copy=copy, force_all_finite=force_all_finite, + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, + estimator=estimator) + + # We need to convert input_data into a numpy.ndarray if possible, before + # any further checks or conversions, and deal with y if needed. Therefore + # we use check_array/check_X_y with fixed permissive arguments. 
+ if y is None: + input_data = check_array(input_data, ensure_2d=False, allow_nd=True, + copy=False, force_all_finite=False, + accept_sparse=True, dtype=None, + ensure_min_features=0, ensure_min_samples=0) + else: + input_data, y = check_X_y(input_data, y, ensure_2d=False, allow_nd=True, + copy=False, force_all_finite=False, + accept_sparse=True, dtype=None, + ensure_min_features=0, ensure_min_samples=0, + multi_output=multi_output, + y_numeric=y_numeric) + + if type_of_inputs == 'classic': + input_data = check_input_classic(input_data, context, preprocessor, + args_for_sk_checks) + + elif type_of_inputs == 'tuples': + input_data = check_input_tuples(input_data, context, preprocessor, + args_for_sk_checks, tuple_size) + + # if we have y and the input data are pairs, we need to ensure + # the labels are in [-1, 1]: + if y is not None and input_data.shape[1] == 2: + check_y_valid_values_for_pairs(y) + + else: + raise ValueError("Unknown value {} for type_of_inputs. Valid values are " + "'classic' or 'tuples'.".format(type_of_inputs)) + + return input_data if y is None else (input_data, y) + + +def check_input_tuples(input_data, context, preprocessor, args_for_sk_checks, + tuple_size): + preprocessor_has_been_applied = False + if input_data.ndim == 2: + if preprocessor is not None: + input_data = preprocess_tuples(input_data, preprocessor) + preprocessor_has_been_applied = True + else: + make_error_input(201, input_data, context) + elif input_data.ndim == 3: + pass + else: + if preprocessor is not None: + make_error_input(420, input_data, context) + else: + make_error_input(200, input_data, context) + input_data = check_array(input_data, allow_nd=True, ensure_2d=False, + **args_for_sk_checks) + # we need to check num_features because check_array does not check it + # for 3D inputs: + if args_for_sk_checks['ensure_min_features'] > 0: + n_features = input_data.shape[2] + if n_features < args_for_sk_checks['ensure_min_features']: + raise ValueError("Found array with {} 
feature(s) (shape={}) while" + " a minimum of {} is required{}." + .format(n_features, input_data.shape, + args_for_sk_checks['ensure_min_features'], + context)) + # normally we don't need to check_tuple_size too because tuple_size + # shouldn't be able to be modified by any preprocessor + if input_data.ndim != 3: + # we have to ensure this because check_array above does not + if preprocessor_has_been_applied: + make_error_input(211, input_data, context) + else: + make_error_input(201, input_data, context) + check_tuple_size(input_data, tuple_size, context) + return input_data + + +def check_input_classic(input_data, context, preprocessor, args_for_sk_checks): + preprocessor_has_been_applied = False + if input_data.ndim == 1: + if preprocessor is not None: + input_data = preprocess_points(input_data, preprocessor) + preprocessor_has_been_applied = True + else: + make_error_input(101, input_data, context) + elif input_data.ndim == 2: + pass # OK + else: + if preprocessor is not None: + make_error_input(320, input_data, context) + else: + make_error_input(100, input_data, context) + + input_data = check_array(input_data, allow_nd=True, ensure_2d=False, + **args_for_sk_checks) + if input_data.ndim != 2: + # we have to ensure this because check_array above does not + if preprocessor_has_been_applied: + make_error_input(111, input_data, context) + else: + make_error_input(101, input_data, context) + return input_data + + +def make_error_input(code, input_data, context): + code_str = {'expected_input': {'1': '2D array of formed points', + '2': '3D array of formed tuples', + '3': ('1D array of indicators or 2D array of ' + 'formed points'), + '4': ('2D array of indicators or 3D array ' + 'of formed tuples')}, + 'additional_context': {'0': '', + '2': ' when using a preprocessor', + '1': (' after the preprocessor has been ' + 'applied')}, + 'possible_preprocessor': {'0': '', + '1': ' and/or use a preprocessor' + }} + code_list = str(code) + err_args = 
dict(expected_input=code_str['expected_input'][code_list[0]], + additional_context=code_str['additional_context'] + [code_list[1]], + possible_preprocessor=code_str['possible_preprocessor'] + [code_list[2]], + input_data=input_data, context=context, + found_size=input_data.ndim) + err_msg = ('{expected_input} expected' + '{context}{additional_context}. Found {found_size}D array ' + 'instead:\ninput={input_data}. Reshape your data' + '{possible_preprocessor}.\n') + raise ValueError(err_msg.format(**err_args)) + + +def preprocess_tuples(tuples, preprocessor): + try: + tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for + i in range(tuples.shape[1])]) + except Exception as e: + raise PreprocessorError(e) + return tuples + + +def preprocess_points(points, preprocessor): + """form points if there is a preprocessor else keep them as such (assumes + that check_points has already been called)""" + try: + points = preprocessor(points) + except Exception as e: + raise PreprocessorError(e) + return points + + +def make_context(estimator): + """Helper function to create a string with the estimator name. + Taken from check_array function in scikit-learn. + Will return the following for instance: + NCA: ' by NCA' + 'NCA': ' by NCA' + None: '' + """ + estimator_name = make_name(estimator) + context = (' by ' + estimator_name) if estimator_name is not None else '' + return context + + +def make_name(estimator): + """Helper function that returns the name of estimator or the given string + if a string is given + """ + if estimator is not None: + if isinstance(estimator, str): + estimator_name = estimator + else: + estimator_name = estimator.__class__.__name__ + else: + estimator_name = None + return estimator_name + + +def check_tuple_size(tuples, tuple_size, context): + """Helper function to check that the number of points in each tuple is + equal to tuple_size (e.g. 
2 for pairs), and raise a `ValueError` otherwise""" + if tuple_size is not None and tuples.shape[1] != tuple_size: + msg_t = (("Tuples of {} element(s) expected{}. Got tuples of {} " + "element(s) instead (shape={}):\ninput={}.\n") + .format(tuple_size, context, tuples.shape[1], tuples.shape, + tuples)) + raise ValueError(msg_t) + + +def check_y_valid_values_for_pairs(y): + """Checks that y values are in [-1, 1]""" + if not np.array_equal(np.abs(y), np.ones_like(y)): + raise ValueError("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. Found an incorrect value.") + + +class ArrayIndexer: + + def __init__(self, X): + # we check the array-like preprocessor here, and we as much permissive + # as possible (because the user will check for the desired + # format with arguments in check_input, and only this latter function + # should return the appropriate errors). We do this only to have a numpy + # array object which can be indexed by another numpy array object. + X = check_array(X, + accept_sparse=True, dtype=None, + force_all_finite=False, + ensure_2d=False, allow_nd=True, + ensure_min_samples=0, ensure_min_features=0, + estimator=None) + self.X = X + + def __call__(self, indices): + return self.X[indices] + + +def check_collapsed_pairs(pairs): + num_ident = (vector_norm(pairs[:, 0] - pairs[:, 1]) < 1e-9).sum() + if num_ident: + raise ValueError("{} collapsed pairs found (where the left element is " + "the same as the right element), out of {} pairs " + "in total.".format(num_ident, pairs.shape[0])) + + +def _check_sdp_from_eigen(w, tol=None): + """Checks if some of the eigenvalues given are negative, up to a tolerance + level, with a default value of the tolerance depending on the eigenvalues. + It also returns whether the matrix is positive definite, up to the above + tolerance. + + Parameters + ---------- + w : array-like, shape=(n_eigenvalues,) + Eigenvalues to check for non semidefinite positiveness. 
+ + tol : positive `float`, optional + Absolute eigenvalues below tol are considered zero. If + tol is None, and eps is the epsilon value for datatype of w, then tol + is set to abs(w).max() * len(w) * eps. + + Returns + ------- + is_definite : bool + Whether the matrix is positive definite or not. + + See Also + -------- + np.linalg.matrix_rank for more details on the choice of tolerance (the same + strategy is applied here) + """ + if tol is None: + tol = np.abs(w).max() * len(w) * np.finfo(w.dtype).eps + if tol < 0: + raise ValueError("tol should be positive.") + if any(w < - tol): + raise NonPSDError() + if any(abs(w) < tol): + return False + return True + + +def components_from_metric(metric, tol=None): + """Returns the transformation matrix from the Mahalanobis matrix. + + Returns the transformation matrix from the Mahalanobis matrix, i.e. the + matrix L such that metric=L.T.dot(L). + + Parameters + ---------- + metric : symmetric `np.ndarray`, shape=(d x d) + The input metric, from which we want to extract a transformation matrix. + + tol : positive `float`, optional + Eigenvalues of `metric` between 0 and - tol are considered zero. If tol is + None, and w_max is `metric`'s largest eigenvalue, and eps is the epsilon + value for datatype of w, then tol is set to w_max * metric.shape[0] * eps. + + Returns + ------- + L : np.ndarray, shape=(d x d) + The transformation matrix, such that L.T.dot(L) == metric. 
+ """ + if not np.allclose(metric, metric.T): + raise ValueError("The input metric should be symmetric.") + # If M is diagonal, we will just return the elementwise square root: + if np.array_equal(metric, np.diag(np.diag(metric))): + _check_sdp_from_eigen(np.diag(metric), tol) + return np.diag(np.sqrt(np.maximum(0, np.diag(metric)))) + else: + try: + # if `M` is positive semi-definite, it will admit a Cholesky + # decomposition: L = cholesky(M).T + return np.linalg.cholesky(metric).T + except LinAlgError: + # However, currently np.linalg.cholesky does not support indefinite + # matrices. So if the latter does not work we will return L = V.T w^( + # -1/2), with M = V*w*V.T being the eigenvector decomposition of M with + # the eigenvalues in the diagonal matrix w and the columns of V being the + # eigenvectors. + w, V = np.linalg.eigh(metric) + _check_sdp_from_eigen(w, tol) + return V.T * np.sqrt(np.maximum(0, w[:, None])) + + +def validate_vector(u, dtype=None): + # replica of scipy.spatial.distance._validate_vector, for making scipy + # compatible functions on vectors (such as distances computations) + u = np.asarray(u, dtype=dtype, order='c').squeeze() + # Ensure values such as u=1 and u=[1] still return 1-D arrays. + u = np.atleast_1d(u) + if u.ndim > 1: + raise ValueError("Input vector should be 1-D.") + return u + + +def _initialize_components(n_components, input, y=None, init='auto', + verbose=False, random_state=None, + has_classes=True): + """Returns the initial transformation to be used depending on the arguments. + + Parameters + ---------- + n_components : int + The number of components to take. (Note: it should have been checked + before, meaning it should not be None and it should be a value in + [1, X.shape[1]]) + + input : array-like + The input samples (can be tuples or regular samples). + + y : array-like or None + The input labels (or not if there are no labels). 
+ + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda' (see + the description of 'lda' init), as it uses labels information. If + not, but ``n_components < min(n_features, n_samples)``, we use 'pca', + as it projects data onto meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`). + This initialization is possible only if `has_classes == True`. + + 'identity' + The identity matrix. If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + verbose : bool + Whether to print the details of the initialization or not. 
+ + random_state : int or `numpy.RandomState` or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + has_classes : bool (default=True) + Whether the labels are in fact classes. If true, this will allow to use + the 'lda' initialization. + + Returns + ------- + init_components : `numpy.ndarray` + The initial transformation to use. + """ + # if we are doing a regression we cannot use lda: + n_features = input.shape[-1] + authorized_inits = ['auto', 'pca', 'identity', 'random'] + if has_classes: + authorized_inits.append('lda') + + if isinstance(init, np.ndarray): + # we copy the array, so that if we update the metric, we don't want to + # update the init + init = check_array(init, copy=True) + + # Assert that init.shape[1] = X.shape[1] + if init.shape[1] != n_features: + raise ValueError('The input dimensionality ({}) of the given ' + 'linear transformation `init` must match the ' + 'dimensionality of the given inputs `X` ({}).' + .format(init.shape[1], n_features)) + + # Assert that init.shape[0] <= init.shape[1] + if init.shape[0] > init.shape[1]: + raise ValueError('The output dimensionality ({}) of the given ' + 'linear transformation `init` cannot be ' + 'greater than its input dimensionality ({}).' + .format(init.shape[0], init.shape[1])) + + # Assert that self.n_components = init.shape[0] + if n_components != init.shape[0]: + raise ValueError('The preferred dimensionality of the ' + 'projected space `n_components` ({}) does' + ' not match the output dimensionality of ' + 'the given linear transformation ' + '`init` ({})!' + .format(n_components, + init.shape[0])) + elif init not in authorized_inits: + raise ValueError( + "`init` must be '{}' " + "or a numpy array of shape (n_components, n_features)." 
+ .format("', '".join(authorized_inits))) + + random_state = check_random_state(random_state) + if isinstance(init, np.ndarray): + return init + n_samples = input.shape[0] + if init == 'auto': + if has_classes: + n_classes = len(np.unique(y)) + else: + n_classes = -1 + init = _auto_select_init(has_classes, n_features, n_samples, n_components, + n_classes) + if init == 'identity': + return np.eye(n_components, input.shape[-1]) + elif init == 'random': + return random_state.randn(n_components, input.shape[-1]) + elif init in {'pca', 'lda'}: + init_time = time.time() + if init == 'pca': + pca = PCA(n_components=n_components, + random_state=random_state) + if verbose: + print('Finding principal components... ') + sys.stdout.flush() + pca.fit(input) + transformation = pca.components_ + elif init == 'lda': + lda = LinearDiscriminantAnalysis(n_components=n_components) + if verbose: + print('Finding most discriminative components... ') + sys.stdout.flush() + lda.fit(input, y) + transformation = lda.scalings_.T[:n_components] + if verbose: + print('done in {:5.2f}s'.format(time.time() - init_time)) + return transformation + + +def _auto_select_init(has_classes, n_features, n_samples, n_components, + n_classes): + if has_classes and n_components <= min(n_features, n_classes - 1): + init = 'lda' + elif n_components < min(n_features, n_samples): + init = 'pca' + else: + init = 'identity' + return init + + +def _initialize_metric_mahalanobis(input, init='identity', random_state=None, + return_inverse=False, strict_pd=False, + matrix_name='matrix'): + """Returns a PSD matrix that can be used as a prior or an initialization + for the Mahalanobis distance + + Parameters + ---------- + input : array-like + The input samples (can be tuples or regular samples). + + init : string or numpy array, optional (default='identity') + Specification for the matrix to initialize. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). 
+ + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The (pseudo-)inverse covariance matrix (raises an error if the + covariance matrix is not definite and `strict_pd == True`) + + 'random' + A random positive definite (PD) matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A PSD matrix (or strictly PD if strict_pd==True) of + shape (n_features, n_features), that will be used as such to + initialize the metric, or set the prior. + + random_state : int or `numpy.RandomState` or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random Mahalanobis + matrix. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the matrix. + + return_inverse : bool, optional (default=False) + Whether to return the inverse of the specified matrix. This + can be sometimes useful. It will return the pseudo-inverse (which is the + same as the inverse if the matrix is definite (i.e. invertible)). If + `strict_pd == True` and the matrix is not definite, it will return an + error. + + strict_pd : bool, optional (default=False) + Whether to enforce that the provided matrix is definite (in addition to + being PSD). + + param_name : str, optional (default='matrix') + The name of the matrix used (example: 'init', 'prior'). Will be used in + error messages. + + Returns + ------- + M, or (M, M_inv) : `numpy.ndarray` + The initial matrix to use M, and its inverse if `return_inverse=True`. 
+ """ + n_features = input.shape[-1] + if isinstance(init, np.ndarray): + # we copy the array, so that if we update the metric, we don't want to + # update the init + init = check_array(init, copy=True) + + # Assert that init.shape[1] = n_features + if init.shape != (n_features,) * 2: + raise ValueError('The input dimensionality {} of the given ' + 'mahalanobis matrix `{}` must match the ' + 'dimensionality of the given inputs ({}).' + .format(init.shape, matrix_name, n_features)) + + # Assert that the matrix is symmetric + if not np.allclose(init, init.T): + raise ValueError("`{}` is not symmetric.".format(matrix_name)) + + elif init not in ['identity', 'covariance', 'random']: + raise ValueError( + "`{}` must be 'identity', 'covariance', 'random' " + "or a numpy array of shape (n_features, n_features)." + .format(matrix_name)) + + random_state = check_random_state(random_state) + M = init + if isinstance(M, np.ndarray): + w, V = eigh(M, check_finite=False) + init_is_definite = _check_sdp_from_eigen(w) + if strict_pd and not init_is_definite: + raise LinAlgError("You should provide a strictly positive definite " + "matrix as `{}`. This one is not definite. Try another" + " {}, or an algorithm that does not " + "require the {} to be strictly positive definite." 
+ .format(*((matrix_name,) * 3))) + elif return_inverse and not init_is_definite: + warnings.warn('The initialization matrix is not invertible: ' + 'using the pseudo-inverse instead.') + if return_inverse: + M_inv = _pseudo_inverse_from_eig(w, V) + return M, M_inv + else: + return M + elif init == 'identity': + M = np.eye(n_features, n_features) + if return_inverse: + M_inv = M.copy() + return M, M_inv + else: + return M + elif init == 'covariance': + if input.ndim == 3: + # if the input are tuples, we need to form an X by deduplication + X = np.unique(np.vstack(input), axis=0) + else: + X = input + # atleast2d is necessary to deal with scalar covariance matrices + M_inv = np.atleast_2d(np.cov(X, rowvar=False)) + w, V = eigh(M_inv, check_finite=False) + cov_is_definite = _check_sdp_from_eigen(w) + if strict_pd and not cov_is_definite: + raise LinAlgError("Unable to get a true inverse of the covariance " + "matrix since it is not definite. Try another " + "`{}`, or an algorithm that does not " + "require the `{}` to be strictly positive definite." + .format(*((matrix_name,) * 2))) + elif not cov_is_definite: + warnings.warn('The covariance matrix is not invertible: ' + 'using the pseudo-inverse instead.' 
+ 'To make the covariance matrix invertible' + ' you can remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + M = _pseudo_inverse_from_eig(w, V) + if return_inverse: + return M, M_inv + else: + return M + elif init == 'random': + # we need to create a random symmetric matrix + M = make_spd_matrix(n_features, random_state=random_state) + if return_inverse: + # we use pinvh even if we know the matrix is definite, just because + # we need the returned matrix to be symmetric (and sometimes + # np.linalg.inv returns not symmetric inverses of symmetric matrices) + # TODO: there might be a more efficient method to do so + M_inv = pinvh(M) + return M, M_inv + else: + return M + + +def _check_n_components(n_features, n_components): + """Checks that n_components is less than n_features and deal with the None + case""" + if n_components is None: + return n_features + if 0 < n_components <= n_features: + return n_components + raise ValueError('Invalid n_components, must be in [1, %d]' % n_features) + + +def _pseudo_inverse_from_eig(w, V, tol=None): + """Compute the (Moore-Penrose) pseudo-inverse of the EVD of a symetric + matrix. + + Parameters + ---------- + w : (..., M) ndarray + The eigenvalues in ascending order, each repeated according to + its multiplicity. + + v : {(..., M, M) ndarray, (..., M, M) matrix} + The column ``v[:, i]`` is the normalized eigenvector corresponding + to the eigenvalue ``w[i]``. Will return a matrix object if `a` is + a matrix object. + + tol : positive `float`, optional + Absolute eigenvalues below tol are considered zero. + + Returns + ------- + output : (..., M, N) array_like + The pseudo-inverse given by the EVD. 
+ """ + if tol is None: + tol = np.amax(w) * np.max(w.shape) * np.finfo(w.dtype).eps + # discard small eigenvalues and invert the rest + large = np.abs(w) > tol + w = np.divide(1, w, where=large, out=w) + w[~large] = 0 + + return np.dot(V * w, np.conjugate(V).T) diff --git a/metric_learn/_version.py b/metric_learn/_version.py new file mode 100644 index 00000000..a71c5c7f --- /dev/null +++ b/metric_learn/_version.py @@ -0,0 +1 @@ +__version__ = '0.7.0' diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 5fe2ca14..47efe4b7 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,75 +1,926 @@ -from numpy.linalg import inv,cholesky +""" +Base module. +""" +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils.extmath import stable_cumsum +from sklearn.utils.validation import _is_arraylike, check_is_fitted +from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve +import numpy as np +from abc import ABCMeta, abstractmethod +from ._util import ArrayIndexer, check_input, validate_vector +import warnings -class BaseMetricLearner(object): - def __init__(self): - raise NotImplementedError('BaseMetricLearner should not be instantiated') - def metric(self): - """Computes the Mahalanobis matrix from the transformation matrix. +class BaseMetricLearner(BaseEstimator, metaclass=ABCMeta): + """ + Base class for all metric-learners. - .. math:: M = L^{\\top} L + Parameters + ---------- + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + """ + + def __init__(self, preprocessor=None): + self.preprocessor = preprocessor + + @abstractmethod + def score_pairs(self, pairs): + """ + Returns the score between pairs + (can be a similarity, or a distance/metric depending on the algorithm) + + .. deprecated:: 0.7.0 + Refer to `pair_distance` and `pair_score`. + + .. 
warning:: + This method will be removed in 0.8.0. Please refer to `pair_distance` + or `pair_score`. This change will occur in order to add learners + that don't necessarily learn a Mahalanobis distance. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- - M : (d x d) matrix + scores : `numpy.ndarray` of shape=(n_pairs,) + The score of every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference between `score_pairs` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. """ - L = self.transformer() - return L.T.dot(L) - def transformer(self): - """Computes the transformation matrix from the Mahalanobis matrix. + @abstractmethod + def pair_score(self, pairs): + """ + .. versionadded:: 0.7.0 Compute the similarity score between pairs - L = inv(cholesky(M)) + Returns the similarity score between pairs of points (the larger the score, + the more similar the pair). For metric learners that learn a distance, + the score is simply the opposite of the distance between pairs. All + learners have access to this method. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- - L : (d x d) matrix + scores : `numpy.ndarray` of shape=(n_pairs,) + The score of every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. 
The difference with `pair_score` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. """ - return inv(cholesky(self.metric())) - def transform(self, X=None): + @abstractmethod + def pair_distance(self, pairs): + """ + .. versionadded:: 0.7.0 Compute the distance between pairs + + Returns the (pseudo) distance between pairs, when available. For metric + learners that do not learn a (pseudo) distance, an error is thrown + instead. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs for which to compute the distance, with each + row corresponding to two points, for 2D array of indices of pairs + if the metric learner uses a preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The distance between every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `pair_distance` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + """ + + def _check_preprocessor(self): + """Initializes the preprocessor""" + if _is_arraylike(self.preprocessor): + self.preprocessor_ = ArrayIndexer(self.preprocessor) + elif callable(self.preprocessor) or self.preprocessor is None: + self.preprocessor_ = self.preprocessor + else: + raise ValueError("Invalid type for the preprocessor: {}. You should " + "provide either None, an array-like object, " + "or a callable.".format(type(self.preprocessor))) + + def _prepare_inputs(self, X, y=None, type_of_inputs='classic', + **kwargs): + """Initializes the preprocessor and processes inputs. See `check_input` + for more details. 
+ + Parameters + ---------- + X : array-like + The input data array to check. + + y : array-like + The input labels array to check. + + type_of_inputs : `str` {'classic', 'tuples'} + The type of inputs to check. If 'classic', the input should be + a 2D array-like of points or a 1D array like of indicators of points. If + 'tuples', the input should be a 3D array-like of tuples or a 2D + array-like of indicators of tuples. + + **kwargs : dict + Arguments to pass to check_input. + + Returns + ------- + X : `numpy.ndarray` + The checked input data array. + + y : `numpy.ndarray` (optional) + The checked input labels array. + """ + self._check_preprocessor() + + check_is_fitted(self, ['preprocessor_']) + outs = check_input(X, y, + type_of_inputs=type_of_inputs, + preprocessor=self.preprocessor_, + estimator=self, + tuple_size=getattr(self, '_tuple_size', None), + **kwargs) + # Conform to SLEP010 + if not hasattr(self, 'n_features_in_'): + self.n_features_in_ = (outs if y is None else outs[0]).shape[1] + return outs + + @abstractmethod + def get_metric(self): + """Returns a function that takes as input two 1D arrays and outputs + the value of the learned metric on these two points. Depending on the + algorithm, it can return a distance or a similarity function between + pairs. + + This function will be independent from the metric learner that learned it + (it will not be modified if the initial metric learner is modified), + and it can be directly plugged into the `metric` argument of + scikit-learn's estimators. + + Returns + ------- + metric_fun : function + The function described above. + + + Examples + -------- + .. 
doctest:: + + >>> from metric_learn import NCA + >>> from sklearn.datasets import make_classification + >>> from sklearn.neighbors import KNeighborsClassifier + >>> nca = NCA() + >>> X, y = make_classification() + >>> nca.fit(X, y) + >>> knn = KNeighborsClassifier(metric=nca.get_metric()) + >>> knn.fit(X, y) # doctest: +NORMALIZE_WHITESPACE + KNeighborsClassifier(algorithm='auto', leaf_size=30, + metric=.metric_fun + at 0x...>, + metric_params=None, n_jobs=None, n_neighbors=5, p=2, + weights='uniform') + + See Also + -------- + pair_distance : a method that returns the distance between several + pairs of points. Unlike `get_metric`, this is a method of the metric + learner and therefore can change if the metric learner changes. Besides, + it can use the metric learner's preprocessor, and works on concatenated + arrays. + + pair_score : a method that returns the similarity score between + several pairs of points. Unlike `get_metric`, this is a method of the + metric learner and therefore can change if the metric learner changes. + Besides, it can use the metric learner's preprocessor, and works on + concatenated arrays. + """ + + +class MetricTransformer(metaclass=ABCMeta): + """ + Base class for all learners that can transform data into a new space + with the metric learned. + """ + @abstractmethod + def transform(self, X): """Applies the metric transformation. Parameters ---------- - X : (n x d) matrix, optional - Data to transform. If not supplied, the training data will be used. + X : (n x d) matrix + Data to transform. Returns ------- transformed : (n x d) matrix - Input data transformed to the metric space by :math:`XL^{\\top}` + Input data transformed to the metric space by :math:`XL^{\\top}` + """ + + +class MahalanobisMixin(BaseMetricLearner, MetricTransformer, + metaclass=ABCMeta): + r"""Mahalanobis metric learning algorithms. 
+ + Algorithm that learns a Mahalanobis (pseudo) distance :math:`d_M(x, x')`, + defined between two column vectors :math:`x` and :math:`x'` by: :math:`d_M(x, + x') = \sqrt{(x-x')^T M (x-x')}`, where :math:`M` is a learned symmetric + positive semi-definite (PSD) matrix. The metric between points can then be + expressed as the euclidean distance between points embedded in a new space + through a linear transformation. Indeed, the above matrix can be decomposed + into the product of two transpose matrices (through SVD or Cholesky + decomposition): :math:`d_M(x, x')^2 = (x-x')^T M (x-x') = (x-x')^T L^T L + (x-x') = (L x - L x')^T (L x- L x')` + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + """ + + def score_pairs(self, pairs): + r""" + Returns the learned Mahalanobis distance between pairs. + + This distance is defined as: :math:`d_M(x, x') = \\sqrt{(x-x')^T M (x-x')}` + where ``M`` is the learned Mahalanobis matrix, for every pair of points + ``x`` and ``x'``. This corresponds to the euclidean distance between + embeddings of the points in a new space, obtained through a linear + transformation. Indeed, we have also: :math:`d_M(x, x') = \\sqrt{(x_e - + x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See + :class:`MahalanobisMixin`). + + .. deprecated:: 0.7.0 + Please use `pair_distance` instead. + + .. warning:: + This method will be removed in 0.8.0. Please refer to `pair_distance` + or `pair_score`. This change will occur in order to add learners + that don't necessarily learn a Mahalanobis distance. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The learned Mahalanobis distance for every pair. 
+ + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `score_pairs` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + + :ref:`mahalanobis_distances` : The section of the project documentation + that describes Mahalanobis Distances. + """ + dpr_msg = ("score_pairs will be deprecated in release 0.7.0. " + "Use pair_score to compute similarity scores, or " + "pair_distances to compute distances.") + warnings.warn(dpr_msg, category=FutureWarning) + return self.pair_distance(pairs) + + def pair_score(self, pairs): + """ + Returns the opposite of the learned Mahalanobis distance between pairs. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The opposite of the learned Mahalanobis distance for every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `pair_score` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + + :ref:`mahalanobis_distances` : The section of the project documentation + that describes Mahalanobis Distances. + """ + return -1 * self.pair_distance(pairs) + + def pair_distance(self, pairs): + """ + Returns the learned Mahalanobis distance between pairs. + + This distance is defined as: :math:`d_M(x, x') = \\sqrt{(x-x')^T M (x-x')}` + where ``M`` is the learned Mahalanobis matrix, for every pair of points + ``x`` and ``x'``. 
This corresponds to the euclidean distance between + embeddings of the points in a new space, obtained through a linear + transformation. Indeed, we have also: :math:`d_M(x, x') = \\sqrt{(x_e - + x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See + :class:`MahalanobisMixin`). + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + for 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + scores : `numpy.ndarray` of shape=(n_pairs,) + The learned Mahalanobis distance for every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `pair_distance` is that it works on two + 1D arrays and cannot use a preprocessor. Besides, the returned function + is independent of the metric learner and hence is not modified if the + metric learner is. + + :ref:`mahalanobis_distances` : The section of the project documentation + that describes Mahalanobis Distances. + """ + check_is_fitted(self, ['preprocessor_']) + pairs = check_input(pairs, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=2) + pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :]) + # (for MahalanobisMixin, the embedding is linear so we can just embed the + # difference) + return np.sqrt(np.sum(pairwise_diffs**2, axis=-1)) + + def transform(self, X): + """Embeds data points in the learned linear embedding space. + + Transforms samples in ``X`` into ``X_embedded``, samples inside a new + embedding space such that: ``X_embedded = X.dot(L.T)``, where ``L`` is + the learned linear transformation (See :class:`MahalanobisMixin`). + + Parameters + ---------- + X : `numpy.ndarray`, shape=(n_samples, n_features) + The data points to embed. 
+ + Returns + ------- + X_embedded : `numpy.ndarray`, shape=(n_samples, n_components) + The embedded data points. + """ + check_is_fitted(self, ['preprocessor_', 'components_']) + X_checked = check_input(X, type_of_inputs='classic', estimator=self, + preprocessor=self.preprocessor_, + accept_sparse=True) + return X_checked.dot(self.components_.T) + + def get_metric(self): + check_is_fitted(self, 'components_') + components_T = self.components_.T.copy() + + def metric_fun(u, v, squared=False): + """This function computes the metric between u and v, according to the + previously learned metric. + + Parameters + ---------- + u : array-like, shape=(n_features,) + The first point involved in the distance computation. + + v : array-like, shape=(n_features,) + The second point involved in the distance computation. + + squared : `bool` + If True, the function will return the squared metric between u and + v, which is faster to compute. + + Returns + ------- + distance : float + The distance between u and v according to the new metric. + """ + u = validate_vector(u) + v = validate_vector(v) + transformed_diff = (u - v).dot(components_T) + dist = np.dot(transformed_diff, transformed_diff.T) + if not squared: + dist = np.sqrt(dist) + return dist + + return metric_fun + + get_metric.__doc__ = BaseMetricLearner.get_metric.__doc__ + + def get_mahalanobis_matrix(self): + """Returns a copy of the Mahalanobis matrix learned by the metric learner. + + Returns + ------- + M : `numpy.ndarray`, shape=(n_features, n_features) + The copy of the learned Mahalanobis matrix. + """ + check_is_fitted(self, 'components_') + return self.components_.T.dot(self.components_) + + +class _PairsClassifierMixin(BaseMetricLearner, ClassifierMixin): + """Base class for pairs learners. + + Attributes + ---------- + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. 
+ """ + + classes_ = np.array([0, 1]) + _tuple_size = 2 # number of points in a tuple, 2 for pairs + + def predict(self, pairs): + """Predicts the learned metric between input pairs. (For now it just + calls decision function). + + Returns the learned metric value between samples in every pair. It should + ideally be low for similar samples and high for dissimilar samples. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to predict, with each row corresponding to two + points, or 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) + The predicted learned metric value between samples in every pair. """ - if X is None: - X = self.X - L = self.transformer() - return X.dot(L.T) + check_is_fitted(self, 'preprocessor_') - def get_params(self, deep=False): - """Get parameters for this metric learner. + if "threshold_" not in vars(self): + msg = ("A threshold for this estimator has not been set, " + "call its set_threshold or calibrate_threshold method.") + raise AttributeError(msg) + return 2 * (- self.decision_function(pairs) <= self.threshold_) - 1 + + def decision_function(self, pairs): + """Returns the decision function used to classify the pairs. + + Returns the opposite of the learned metric value between samples in every + pair, to be consistent with scikit-learn conventions. Hence it should + ideally be low for dissimilar samples and high for similar samples. + This is the decision function that is used to classify pairs as similar + (+1), or dissimilar (-1). Parameters ---------- - deep: boolean, optional - @WARNING doesn't do anything, only exists because - scikit-learn has this on BaseEstimator. 
+ pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to predict, with each row corresponding to two + points, or 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- - params : mapping of string to any - Parameter names mapped to their values. + y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) + The predicted decision function value for each pair. """ - return self.params + check_is_fitted(self, 'preprocessor_') + pairs = check_input(pairs, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=self._tuple_size) + return self.pair_score(pairs) - def set_params(self, **kwarg): - """Set the parameters of this metric learner. + def score(self, pairs, y): + """Computes score of pairs similarity prediction. - Overwrites any default parameters or parameters specified in constructor. + Returns the ``roc_auc`` score of the fitted metric learner. It is + computed in the following way: for every value of a threshold + ``t`` we classify all pairs of samples where the predicted distance is + inferior to ``t`` as belonging to the "similar" class, and the other as + belonging to the "dissimilar" class, and we count false positive and + true positives as in a classical ``roc_auc`` curve. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs, with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. + + y : array-like, shape=(n_constraints,) + The corresponding labels. + + Returns + ------- + score : float + The ``roc_auc`` score. + """ + return roc_auc_score(y, self.decision_function(pairs)) + + def set_threshold(self, threshold): + """Sets the threshold of the metric learner to the given value `threshold`. + + See more in the :ref:`User Guide `. + + Parameters + ---------- + threshold : float + The threshold value we want to set. 
It is the value to which the + predicted distance for test pairs will be compared. If they are superior + to the threshold they will be classified as similar (+1), + and dissimilar (-1) if not. Returns ------- - self + self : `_PairsClassifier` + The pairs classifier with the new threshold set. """ - self.params.update(kwarg) + check_is_fitted(self, 'preprocessor_') + try: + self.threshold_ = float(threshold) + except TypeError: + raise ValueError('Parameter threshold must be a real number. ' + 'Got {} instead.'.format(type(threshold))) + except ValueError: + raise ValueError('Parameter threshold must be a real number. ' + 'Got {} instead.'.format(type(threshold))) return self + + def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', + min_rate=None, beta=1.): + """Decision threshold calibration for pairwise binary classification + + Method that calibrates the decision threshold (cutoff point) of the metric + learner. This threshold will then be used when calling the method + `predict`. The methods for picking cutoff points make use of traditional + binary classification evaluation statistics such as the true positive and + true negative rates and F-scores. The threshold will be found to maximize + the chosen score on the validation set ``(pairs_valid, y_valid)``. + + See more in the :ref:`User Guide `. + + Parameters + ---------- + strategy : str, optional (default='accuracy') + The strategy to use for choosing the cutoff threshold. + + 'accuracy' + Selects a decision threshold that maximizes the accuracy. + 'f_beta' + Selects a decision threshold that maximizes the f_beta score, + with beta given by the parameter `beta`. + 'max_tpr' + Selects a decision threshold that yields the highest true positive + rate with true negative rate at least equal to the value of the + parameter `min_rate`. 
+ 'max_tnr' + Selects a decision threshold that yields the highest true negative + rate with true positive rate at least equal to the value of the + parameter `min_rate`. + + beta : float in [0, 1], optional (default=None) + Beta value to be used in case strategy == 'f_beta'. + + min_rate : float in [0, 1] or None, (default=None) + In case strategy is 'max_tpr' or 'max_tnr' this parameter must be set + to specify the minimal value for the true negative rate or true positive + rate respectively that needs to be achieved. + + pairs_valid : array-like, shape=(n_pairs_valid, 2, n_features) + The validation set of pairs to use to set the threshold. + + y_valid : array-like, shape=(n_pairs_valid,) + The labels of the pairs of the validation set to use to set the + threshold. They must be +1 for positive pairs and -1 for negative pairs. + + References + ---------- + .. [1] Receiver-operating characteristic (ROC) plots: a fundamental + evaluation tool in clinical medicine, MH Zweig, G Campbell - + Clinical chemistry, 1993 + + .. 
[2] Most of the code of this function is from scikit-learn's PR #10117 + + See Also + -------- + sklearn.calibration : scikit-learn's module for calibrating classifiers + """ + check_is_fitted(self, 'preprocessor_') + + self._validate_calibration_params(strategy, min_rate, beta) + + pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid, + type_of_inputs='tuples') + + n_samples = pairs_valid.shape[0] + if strategy == 'accuracy': + scores = self.decision_function(pairs_valid) + scores_sorted_idces = np.argsort(scores)[::-1] + scores_sorted = scores[scores_sorted_idces] + # true labels ordered by decision_function value: (higher first) + y_ordered = y_valid[scores_sorted_idces] + # we need to add a threshold that will reject all points + scores_sorted = np.concatenate([[scores_sorted[0] + 1], scores_sorted]) + + # finds the threshold that maximizes the accuracy: + cum_tp = stable_cumsum(y_ordered == 1) # cumulative number of true + # positives + # we need to add the point where all samples are rejected: + cum_tp = np.concatenate([[0.], cum_tp]) + cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1) + cum_tn = np.concatenate([[0.], cum_tn_inverted])[::-1] + cum_accuracy = (cum_tp + cum_tn) / n_samples + imax = np.argmax(cum_accuracy) + # we set the threshold to the lowest accepted score + # note: we are working with negative distances but we want the threshold + # to be with respect to the actual distances so we take minus sign + self.threshold_ = - scores_sorted[imax] + # note: if the best is to reject all points it's already one of the + # thresholds (scores_sorted[0]) + return self + + if strategy == 'f_beta': + precision, recall, thresholds = precision_recall_curve( + y_valid, self.decision_function(pairs_valid), pos_label=1) + + # here the thresholds are decreasing + # We ignore the warnings here, in the same taste as + # https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065 + # 
f4332fd74eee57fcf73/sklearn/metrics/classification.py#L1284 + with np.errstate(divide='ignore', invalid='ignore'): + f_beta = ((1 + beta**2) * (precision * recall) / + (beta**2 * precision + recall)) + # We need to set nans to zero otherwise they will be considered higher + # than the others (also discussed in https://github.com/scikit-learn/ + # scikit-learn/pull/10117/files#r262115773) + f_beta[np.isnan(f_beta)] = 0. + imax = np.argmax(f_beta) + # we set the threshold to the lowest accepted score + # note: we are working with negative distances but we want the threshold + # to be with respect to the actual distances so we take minus sign + self.threshold_ = - thresholds[imax] + # Note: we don't need to deal with rejecting all points (i.e. threshold = + # max_scores + 1), since this can never happen to be optimal + # (see a more detailed discussion in test_calibrate_threshold_extreme) + return self + + fpr, tpr, thresholds = roc_curve(y_valid, + self.decision_function(pairs_valid), + pos_label=1) + # here the thresholds are decreasing + fpr, tpr, thresholds = fpr, tpr, thresholds + + if strategy in ['max_tpr', 'max_tnr']: + if strategy == 'max_tpr': + indices = np.where(1 - fpr >= min_rate)[0] + imax = np.argmax(tpr[indices]) + + if strategy == 'max_tnr': + indices = np.where(tpr >= min_rate)[0] + imax = np.argmax(1 - fpr[indices]) + + imax_valid = indices[imax] + # note: we are working with negative distances but we want the threshold + # to be with respect to the actual distances so we take minus sign + if indices[imax] == len(thresholds): # we want to accept everything + self.threshold_ = - (thresholds[imax_valid] - 1) + else: + # thanks to roc_curve, the first point will always be max_scores + # + 1, see: https://github.com/scikit-learn/scikit-learn/pull/13523 + self.threshold_ = - thresholds[imax_valid] + return self + + @staticmethod + def _validate_calibration_params(strategy='accuracy', min_rate=None, + beta=1.): + """Ensure that calibration parameters 
have allowed values""" + if strategy not in ('accuracy', 'f_beta', 'max_tpr', + 'max_tnr'): + raise ValueError('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "{}" instead.' + .format(strategy)) + if strategy == 'max_tpr' or strategy == 'max_tnr': + if (min_rate is None or not isinstance(min_rate, (int, float)) or + not min_rate >= 0 or not min_rate <= 1): + raise ValueError('Parameter min_rate must be a number in' + '[0, 1]. ' + 'Got {} instead.'.format(min_rate)) + if strategy == 'f_beta': + if beta is None or not isinstance(beta, (int, float)): + raise ValueError('Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + + +class _TripletsClassifierMixin(BaseMetricLearner, ClassifierMixin): + """ + Base class for triplets learners. + """ + + classes_ = np.array([0, 1]) + _tuple_size = 3 # number of points in a tuple, 3 for triplets + + def predict(self, triplets): + """Predicts the ordering between sample distances in input triplets. + + For each triplets, returns 1 if the first element is closer to the second + than to the last and -1 if not. + + Parameters + ---------- + triplets : array-like, shape=(n_triplets, 3, n_features) or (n_triplets, 3) + 3D array of triplets to predict, with each row corresponding to three + points, or 2D array of indices of triplets if the metric learner + uses a preprocessor. + + Returns + ------- + prediction : `numpy.ndarray` of floats, shape=(n_constraints,) + Predictions of the ordering of pairs, for each triplet. + """ + return 2 * (self.decision_function(triplets) > 0) - 1 + + def decision_function(self, triplets): + """Predicts differences between sample distances in input triplets. + + For each triplet (X_a, X_b, X_c) in the samples, computes the difference + between the learned distance of the second pair (X_a, X_c) minus the + learned distance of the first pair (X_a, X_b). 
The higher it is, the more + probable it is that the pairs in the triplets are presented in the right + order, i.e. that the label of the triplet is 1. The lower it is, the more + probable it is that the label of the triplet is -1. + + Parameters + ---------- + triplet : array-like, shape=(n_triplets, 3, n_features) or \ + (n_triplets, 3) + 3D array of triplets to predict, with each row corresponding to three + points, or 2D array of indices of triplets if the metric learner + uses a preprocessor. + + Returns + ------- + decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) + Metric differences. + """ + check_is_fitted(self, 'preprocessor_') + triplets = check_input(triplets, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=self._tuple_size) + return (self.pair_score(triplets[:, :2]) - + self.pair_score(triplets[:, [0, 2]])) + + def score(self, triplets): + """Computes score on input triplets. + + Returns the accuracy score of the following classification task: a triplet + (X_a, X_b, X_c) is correctly classified if the predicted similarity between + the first pair (X_a, X_b) is higher than that of the second pair (X_a, X_c) + + Parameters + ---------- + triplets : array-like, shape=(n_triplets, 3, n_features) or \ + (n_triplets, 3) + 3D array of triplets to score, with each row corresponding to three + points, or 2D array of indices of triplets if the metric learner + uses a preprocessor. + + Returns + ------- + score : float + The triplets score. + """ + # Since the prediction is a vector of values in {-1, +1}, we need to + # rescale them to {0, 1} to compute the accuracy using the mean (because + # then 1 means a correctly classified result (pairs are in the right + # order), and a 0 an incorrectly classified result (pairs are in the + # wrong order). 
+ return self.predict(triplets).mean() / 2 + 0.5 + + +class _QuadrupletsClassifierMixin(BaseMetricLearner, ClassifierMixin): + """ + Base class for quadruplets learners. + """ + + classes_ = np.array([0, 1]) + _tuple_size = 4 # number of points in a tuple, 4 for quadruplets + + def predict(self, quadruplets): + """Predicts the ordering between sample distances in input quadruplets. + + For each quadruplet, returns 1 if the quadruplet is in the right order ( + first pair is more similar than second pair), and -1 if not. + + Parameters + ---------- + quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \ + (n_quadruplets, 4) + 3D Array of quadruplets to predict, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. + + Returns + ------- + prediction : `numpy.ndarray` of floats, shape=(n_constraints,) + Predictions of the ordering of pairs, for each quadruplet. + """ + return np.sign(self.decision_function(quadruplets)) + + def decision_function(self, quadruplets): + """Predicts differences between sample distances in input quadruplets. + + For each quadruplet in the samples, computes the difference between the + learned metric of the second pair minus the learned metric of the first + pair. The higher it is, the more probable it is that the pairs in the + quadruplet are presented in the right order, i.e. that the label of the + quadruplet is 1. The lower it is, the more probable it is that the label of + the quadruplet is -1. + + Parameters + ---------- + quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \ + (n_quadruplets, 4) + 3D Array of quadruplets to predict, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. + + Returns + ------- + decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) + Metric differences. 
+ """ + check_is_fitted(self, 'preprocessor_') + quadruplets = check_input(quadruplets, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=self._tuple_size) + return (self.pair_score(quadruplets[:, :2]) - + self.pair_score(quadruplets[:, 2:])) + + def score(self, quadruplets): + """Computes score on input quadruplets + + Returns the accuracy score of the following classification task: a record + is correctly classified if the predicted similarity between the first two + samples is higher than that of the last two. + + Parameters + ---------- + quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \ + (n_quadruplets, 4) + 3D Array of quadruplets to score, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. + + Returns + ------- + score : float + The quadruplets score. + """ + # Since the prediction is a vector of values in {-1, +1}, we need to + # rescale them to {0, 1} to compute the accuracy using the mean (because + # then 1 means a correctly classified result (pairs are in the right + # order), and a 0 an incorrectly classified result (pairs are in the + # wrong order). + return self.predict(quadruplets).mean() / 2 + 0.5 diff --git a/metric_learn/constraints.py b/metric_learn/constraints.py index 0f57b3e8..4993e9ef 100644 --- a/metric_learn/constraints.py +++ b/metric_learn/constraints.py @@ -3,87 +3,309 @@ from supervised data labels. """ import numpy as np -import random import warnings -from six.moves import xrange -from scipy.sparse import coo_matrix +from sklearn.utils import check_random_state +from sklearn.neighbors import NearestNeighbors + __all__ = ['Constraints'] class Constraints(object): + """ + Class to build constraints from labeled data. + + See more in the :ref:`User Guide `. + + Parameters + ---------- + partial_labels : `numpy.ndarray` of ints, shape=(n_samples,) + Array of labels, with -1 indicating unknown label. 
+ + Attributes + ---------- + partial_labels : `numpy.ndarray` of ints, shape=(n_samples,) + Array of labels, with -1 indicating unknown label. + """ + def __init__(self, partial_labels): - '''partial_labels : int arraylike, -1 indicating unknown label''' - partial_labels = np.asanyarray(partial_labels) - self.num_points, = partial_labels.shape - self.known_label_idx, = np.where(partial_labels >= 0) - self.known_labels = partial_labels[self.known_label_idx] - - def adjacency_matrix(self, num_constraints): - a, b, c, d = self.positive_negative_pairs(num_constraints) - row = np.concatenate((a, c)) - col = np.concatenate((b, d)) - data = np.ones_like(row, dtype=int) - data[len(a):] = -1 - adj = coo_matrix((data, (row, col)), shape=(self.num_points,)*2) - # symmetrize - return adj + adj.T - - def positive_negative_pairs(self, num_constraints, same_length=False): - a, b = self._pairs(num_constraints, same_label=True) - c, d = self._pairs(num_constraints, same_label=False) + partial_labels = np.asanyarray(partial_labels, dtype=int) + self.partial_labels = partial_labels + + def positive_negative_pairs(self, n_constraints, same_length=False, + random_state=None, num_constraints='deprecated'): + """ + Generates positive pairs and negative pairs from labeled data. + + Positive pairs are formed by randomly drawing ``n_constraints`` pairs of + points with the same label. Negative pairs are formed by randomly drawing + ``n_constraints`` pairs of points with different label. + + In the case where it is not possible to generate enough positive or + negative pairs, a smaller number of pairs will be returned with a warning. + + Parameters + ---------- + n_constraints : int + Number of positive and negative constraints to generate. + + same_length : bool, optional (default=False) + If True, forces the number of positive and negative pairs to be + equal by ignoring some pairs from the larger set. 
+ + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + Returns + ------- + a : array-like, shape=(n_constraints,) + 1D array of indicators for the left elements of positive pairs. + + b : array-like, shape=(n_constraints,) + 1D array of indicators for the right elements of positive pairs. + + c : array-like, shape=(n_constraints,) + 1D array of indicators for the left elements of negative pairs. + + d : array-like, shape=(n_constraints,) + 1D array of indicators for the right elements of negative pairs. + """ + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + random_state = check_random_state(random_state) + a, b = self._pairs(n_constraints, same_label=True, + random_state=random_state) + c, d = self._pairs(n_constraints, same_label=False, + random_state=random_state) if same_length and len(a) != len(c): n = min(len(a), len(c)) return a[:n], b[:n], c[:n], d[:n] return a, b, c, d - def _pairs(self, num_constraints, same_label=True, max_iter=10): - num_labels = len(self.known_labels) + def generate_knntriplets(self, X, k_genuine, k_impostor): + """ + Generates triplets from labeled data. + + For every point (X_a) the triplets (X_a, X_b, X_c) are constructed from all + the combinations of taking one of its `k_genuine`-nearest neighbors of the + same class (X_b) and taking one of its `k_impostor`-nearest neighbors of + other classes (X_c). 
+ + In the case a class doesn't have enough points in the same class (other + classes) to yield `k_genuine` (`k_impostor`) neighbors a warning will be + raised and the maximum value of genuine (impostor) neighbors will be used + for that class. + + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + k_genuine : int + Number of neighbors of the same class to be taken into account. + + k_impostor : int + Number of neighbors of different classes to be taken into account. + + Returns + ------- + triplets : array-like, shape=(n_constraints, 3) + 2D array of triplets of indicators. + """ + # Ignore unlabeled samples + known_labels_mask = self.partial_labels >= 0 + known_labels = self.partial_labels[known_labels_mask] + X = X[known_labels_mask] + + labels, labels_count = np.unique(known_labels, return_counts=True) + len_input = known_labels.shape[0] + + # Handle the case where there are too few elements to yield k_genuine or + # k_impostor neighbors for every class. + + k_genuine_vec = np.full_like(labels, k_genuine) + k_impostor_vec = np.full_like(labels, k_impostor) + + for i, count in enumerate(labels_count): + if k_genuine + 1 > count: + k_genuine_vec[i] = count-1 + warnings.warn("The class {} has {} elements, which is not sufficient " + "to generate {} genuine neighbors as specified by " + "k_genuine. Will generate {} genuine neighbors instead." + "\n" + .format(labels[i], count, k_genuine+1, + k_genuine_vec[i])) + if k_impostor > len_input - count: + k_impostor_vec[i] = len_input - count + warnings.warn("The class {} has {} elements of other classes, which is" + " not sufficient to generate {} impostor neighbors as " + "specified by k_impostor. 
Will generate {} impostor " + "neighbors instead.\n" + .format(labels[i], k_impostor_vec[i], k_impostor, + k_impostor_vec[i])) + + # The total number of possible triplets combinations per label comes from + # taking one of the k_genuine_vec[i] genuine neighbors and one of the + # k_impostor_vec[i] impostor neighbors for the labels_count[i] elements + comb_per_label = labels_count * k_genuine_vec * k_impostor_vec + + # Get start and finish for later triplet assigning + # append zero at the begining for start and get cumulative sum + start_finish_indices = np.hstack((0, comb_per_label)).cumsum() + + # Total number of triplets is the sum of all possible combinations per + # label + num_triplets = start_finish_indices[-1] + triplets = np.empty((num_triplets, 3), dtype=np.intp) + + neigh = NearestNeighbors() + + for i, label in enumerate(labels): + + # generate mask for current label + gen_mask = known_labels == label + gen_indx = np.where(gen_mask) + + # get k_genuine genuine neighbors + neigh.fit(X=X[gen_indx]) + # Take elements of gen_indx according to the yielded k-neighbors + gen_relative_indx = neigh.kneighbors(n_neighbors=k_genuine_vec[i], + return_distance=False) + gen_neigh = np.take(gen_indx, gen_relative_indx) + + # generate mask for impostors of current label + imp_indx = np.where(~gen_mask) + + # get k_impostor impostor neighbors + neigh.fit(X=X[imp_indx]) + # Take elements of imp_indx according to the yielded k-neighbors + imp_relative_indx = neigh.kneighbors(n_neighbors=k_impostor_vec[i], + X=X[gen_mask], + return_distance=False) + imp_neigh = np.take(imp_indx, imp_relative_indx) + + # length = len_label*k_genuine*k_impostor + start, finish = start_finish_indices[i:i+2] + + triplets[start:finish, :] = comb(gen_indx, gen_neigh, imp_neigh, + k_genuine_vec[i], + k_impostor_vec[i]) + + return triplets + + def _pairs(self, n_constraints, same_label=True, max_iter=10, + random_state=np.random): + known_label_idx, = np.where(self.partial_labels >= 0) + 
known_labels = self.partial_labels[known_label_idx] + num_labels = len(known_labels) ab = set() it = 0 - while it < max_iter and len(ab) < num_constraints: - nc = num_constraints - len(ab) - for aidx in np.random.randint(num_labels, size=nc): + while it < max_iter and len(ab) < n_constraints: + nc = n_constraints - len(ab) + for aidx in random_state.randint(num_labels, size=nc): if same_label: - mask = self.known_labels[aidx] == self.known_labels + mask = known_labels[aidx] == known_labels mask[aidx] = False # avoid identity pairs else: - mask = self.known_labels[aidx] != self.known_labels + mask = known_labels[aidx] != known_labels b_choices, = np.where(mask) if len(b_choices) > 0: - ab.add((aidx, np.random.choice(b_choices))) + ab.add((aidx, random_state.choice(b_choices))) it += 1 - if len(ab) < num_constraints: + if len(ab) < n_constraints: warnings.warn("Only generated %d %s constraints (requested %d)" % ( - len(ab), 'positive' if same_label else 'negative', num_constraints)) - ab = np.array(list(ab)[:num_constraints], dtype=int) - return self.known_label_idx[ab.T] - - def chunks(self, num_chunks=100, chunk_size=2): - chunks = -np.ones_like(self.known_label_idx, dtype=int) - uniq, lookup = np.unique(self.known_labels, return_inverse=True) - all_inds = [set(np.where(lookup==c)[0]) for c in xrange(len(uniq))] + len(ab), 'positive' if same_label else 'negative', n_constraints)) + ab = np.array(list(ab)[:n_constraints], dtype=int) + return known_label_idx[ab.T] + + def chunks(self, n_chunks=100, chunk_size=2, random_state=None, + num_chunks='deprecated'): + """ + Generates chunks from labeled data. + + Each of ``n_chunks`` chunks is composed of ``chunk_size`` points from + the same class drawn at random. Each point can belong to at most 1 chunk. + + In the case where there is not enough points to generate ``n_chunks`` + chunks of size ``chunk_size``, a ValueError will be raised. 
+ + Parameters + ---------- + n_chunks : int, optional (default=100) + Number of chunks to generate. + + chunk_size : int, optional (default=2) + Number of points in each chunk. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + num_chunks : Renamed to n_chunks. Will be deprecated in 0.7.0 + + Returns + ------- + chunks : array-like, shape=(n_samples,) + 1D array of chunk indicators, where -1 indicates that the point does not + belong to any chunk. + """ + if num_chunks != 'deprecated': + warnings.warn('"num_chunks" parameter has been renamed to' + ' "n_chunks". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_chunks = num_chunks + random_state = check_random_state(random_state) + chunks = -np.ones_like(self.partial_labels, dtype=int) + uniq, lookup = np.unique(self.partial_labels, return_inverse=True) + unknown_uniq = np.where(uniq < 0)[0] + all_inds = [set(np.where(lookup == c)[0]) for c in range(len(uniq)) + if c not in unknown_uniq] + max_chunks = int(np.sum([len(s) // chunk_size for s in all_inds])) + if max_chunks < n_chunks: + raise ValueError(('Not enough possible chunks of %d elements in each' + ' class to form expected %d chunks - maximum number' + ' of chunks is %d' + ) % (chunk_size, n_chunks, max_chunks)) idx = 0 - while idx < num_chunks and all_inds: - c = random.randint(0, len(all_inds)-1) + while idx < n_chunks and all_inds: + if len(all_inds) == 1: + c = 0 + else: + c = random_state.randint(0, high=len(all_inds) - 1) inds = all_inds[c] if len(inds) < chunk_size: del all_inds[c] continue - ii = random.sample(inds, chunk_size) + ii = random_state.choice(list(inds), chunk_size, replace=False) inds.difference_update(ii) chunks[ii] = idx idx += 1 - if idx < num_chunks: - raise ValueError('Unable to make %d chunks of %d examples each' % - (num_chunks, chunk_size)) return chunks - @staticmethod - def 
random_subset(all_labels, num_preserved=np.inf): - n = len(all_labels) - num_ignored = max(0, n - num_preserved) - idx = np.random.randint(n, size=num_ignored) - partial_labels = np.array(all_labels, copy=True) - partial_labels[idx] = -1 - return Constraints(partial_labels) + +def comb(A, B, C, sizeB, sizeC): + # generate_knntriplets helper function + # generate an array with all combinations of choosing + # an element from A, B and C + return np.vstack((np.tile(A, (sizeB*sizeC, 1)).ravel(order='F'), + np.tile(np.hstack(B), (sizeC, 1)).ravel(order='F'), + np.tile(C, (1, sizeB)).ravel())).T + + +def wrap_pairs(X, constraints): + a = np.array(constraints[0]) + b = np.array(constraints[1]) + c = np.array(constraints[2]) + d = np.array(constraints[3]) + constraints = np.vstack((np.column_stack((a, b)), np.column_stack((c, d)))) + y = np.concatenate([np.ones_like(a), -np.ones_like(c)]) + pairs = X[constraints] + return pairs, y diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 541cbfa9..2c05b28d 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -1,30 +1,60 @@ """ Covariance metric (baseline method) - -This method does not "learn" anything, rather it calculates -the covariance matrix of the input data. - -This is a simple baseline method first introduced in -On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936 """ -from __future__ import absolute_import import numpy as np +import scipy +from sklearn.base import TransformerMixin + +from .base_metric import MahalanobisMixin +from ._util import components_from_metric + + +class Covariance(MahalanobisMixin, TransformerMixin): + """Covariance metric (baseline method) -from .base_metric import BaseMetricLearner + This method does not "learn" anything, rather it calculates + the covariance matrix of the input data. 
+ This is a simple baseline method first introduced in + On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936 -class Covariance(BaseMetricLearner): - def __init__(self): - self.params = {} + Read more in the :ref:`User Guide `. - def metric(self): - return self.M + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + Examples + -------- + >>> from metric_learn import Covariance + >>> from sklearn.datasets import load_iris + >>> iris = load_iris()['data'] + >>> cov = Covariance().fit(iris) + >>> x = cov.transform(iris) + + """ + + def __init__(self, preprocessor=None): + super(Covariance, self).__init__(preprocessor) def fit(self, X, y=None): """ - X: data matrix, (n x d) - y: unused, optional + Calculates the covariance matrix of the input data. + + Parameters + ---------- + X : data matrix, (n x d) + y : unused """ - self.M = np.cov(X.T) + X = self._prepare_inputs(X, ensure_min_samples=2) + M = np.atleast_2d(np.cov(X, rowvar=False)) + if M.size == 1: + M = 1. / M + else: + M = scipy.linalg.pinvh(M) + + self.components_ = components_from_metric(np.atleast_2d(M)) return self diff --git a/metric_learn/exceptions.py b/metric_learn/exceptions.py new file mode 100644 index 00000000..76f09778 --- /dev/null +++ b/metric_learn/exceptions.py @@ -0,0 +1,20 @@ +""" +The :mod:`metric_learn.exceptions` module includes all custom warnings and +error classes used across metric-learn. +""" +from numpy.linalg import LinAlgError + + +class PreprocessorError(Exception): + + def __init__(self, original_error): + err_msg = ("An error occurred when trying to use the " + "preprocessor: {}").format(repr(original_error)) + super(PreprocessorError, self).__init__(err_msg) + + +class NonPSDError(LinAlgError): + + def __init__(self): + err_msg = "Matrix is not positive semidefinite (PSD)." 
+ super(LinAlgError, self).__init__(err_msg) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 7f2118bd..9537eec2 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -1,184 +1,407 @@ """ -Information Theoretic Metric Learning, Kulis et al., ICML 2007 - -ITML minimizes the differential relative entropy between two multivariate -Gaussians under constraints on the distance function, -which can be formulated into a Bregman optimization problem by minimizing the -LogDet divergence subject to linear constraints. -This algorithm can handle a wide variety of constraints and can optionally -incorporate a prior on the distance function. -Unlike some other methods, ITML does not rely on an eigenvalue computation -or semi-definite programming. +Information Theoretic Metric Learning (ITML) """ -from __future__ import print_function, absolute_import import numpy as np -from six.moves import xrange from sklearn.metrics import pairwise_distances +from sklearn.utils.validation import check_array +from sklearn.base import TransformerMixin +from .base_metric import _PairsClassifierMixin, MahalanobisMixin +from .constraints import Constraints, wrap_pairs +from ._util import components_from_metric, _initialize_metric_mahalanobis +import warnings -from .base_metric import BaseMetricLearner -from .constraints import Constraints - -class ITML(BaseMetricLearner): +class _BaseITML(MahalanobisMixin): """Information Theoretic Metric Learning (ITML)""" - def __init__(self, gamma=1., max_iters=1000, convergence_threshold=1e-3, - verbose=False): - """Initialize the learner. 
- Parameters - ---------- - gamma : float, optional - value for slack variables - max_iters : int, optional - convergence_threshold : float, optional - verbose : bool, optional - if True, prints information while learning - """ - self.params = { - 'gamma': gamma, - 'max_iters': max_iters, - 'convergence_threshold': convergence_threshold, - 'verbose': verbose, - } - - def _process_inputs(self, X, constraints, bounds, A0): - self.X = X - # check to make sure that no two constrained vectors are identical - a,b,c,d = constraints - ident = _vector_norm(self.X[a] - self.X[b]) > 1e-9 - a, b = a[ident], b[ident] - ident = _vector_norm(self.X[c] - self.X[d]) > 1e-9 - c, d = c[ident], d[ident] + _tuple_size = 2 # constraints are pairs + + def __init__(self, gamma=1., max_iter=1000, tol=1e-3, + prior='identity', verbose=False, + preprocessor=None, random_state=None, + convergence_threshold='deprecated'): + if convergence_threshold != 'deprecated': + warnings.warn('"convergence_threshold" parameter has been ' + ' renamed to "tol". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + tol = convergence_threshold + self.convergence_threshold = 'deprecated' # Avoid errors + self.gamma = gamma + self.max_iter = max_iter + self.tol = tol + self.prior = prior + self.verbose = verbose + self.random_state = random_state + super(_BaseITML, self).__init__(preprocessor) + + def _fit(self, pairs, y, bounds=None): + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') # init bounds if bounds is None: - self.bounds = np.percentile(pairwise_distances(X), (5, 95)) - else: - assert len(bounds) == 2 - self.bounds = bounds - # init metric - if A0 is None: - self.A = np.identity(X.shape[1]) + X = np.unique(np.vstack(pairs), axis=0) + self.bounds_ = np.percentile(pairwise_distances(X), (5, 95)) else: - self.A = A0 - return a,b,c,d - - def fit(self, X, constraints, bounds=None, A0=None): - """Learn the ITML model. 
- - Parameters - ---------- - X : (n x d) data matrix - each row corresponds to a single instance - constraints : 4-tuple of arrays - (a,b,c,d) indices into X, such that d(X[a],X[b]) < d(X[c],X[d]) - bounds : list (pos,neg) pairs, optional - bounds on similarity, s.t. d(X[a],X[b]) < pos and d(X[c],X[d]) > neg - A0 : (d x d) matrix, optional - initial regularization matrix, defaults to identity - """ - verbose = self.params['verbose'] - a,b,c,d = self._process_inputs(X, constraints, bounds, A0) - gamma = self.params['gamma'] - conv_thresh = self.params['convergence_threshold'] - num_pos = len(a) - num_neg = len(c) + bounds = check_array(bounds, allow_nd=False, ensure_min_samples=0, + ensure_2d=False) + bounds = bounds.ravel() + if bounds.size != 2: + raise ValueError("`bounds` should be an array-like of two elements.") + self.bounds_ = bounds + self.bounds_[self.bounds_ == 0] = 1e-9 + # set the prior + # pairs will be deduplicated into X two times, TODO: avoid that + A = _initialize_metric_mahalanobis(pairs, self.prior, self.random_state, + strict_pd=True, + matrix_name='prior') + gamma = self.gamma + pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] + num_pos = len(pos_pairs) + num_neg = len(neg_pairs) _lambda = np.zeros(num_pos + num_neg) lambdaold = np.zeros_like(_lambda) - gamma_proj = 1. if gamma is np.inf else gamma/(gamma+1.) - pos_bhat = np.zeros(num_pos) + self.bounds[0] - neg_bhat = np.zeros(num_neg) + self.bounds[1] - A = self.A + gamma_proj = 1. if gamma is np.inf else gamma / (gamma + 1.) 
+ pos_bhat = np.zeros(num_pos) + self.bounds_[0] + neg_bhat = np.zeros(num_neg) + self.bounds_[1] + pos_vv = pos_pairs[:, 0, :] - pos_pairs[:, 1, :] + neg_vv = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] - for it in xrange(self.params['max_iters']): + for it in range(self.max_iter): # update positives - vv = self.X[a] - self.X[b] - for i,v in enumerate(vv): + for i, v in enumerate(pos_vv): wtw = v.dot(A).dot(v) # scalar - alpha = min(_lambda[i], gamma_proj*(1./wtw - 1./pos_bhat[i])) + alpha = min(_lambda[i], gamma_proj * (1. / wtw - 1. / pos_bhat[i])) _lambda[i] -= alpha - beta = alpha/(1 - alpha*wtw) - pos_bhat[i] = 1./((1 / pos_bhat[i]) + (alpha / gamma)) - A += beta * A.dot(np.outer(v,v)).dot(A) + beta = alpha / (1 - alpha * wtw) + pos_bhat[i] = 1. / ((1 / pos_bhat[i]) + (alpha / gamma)) + Av = A.dot(v) + A += np.outer(Av, Av * beta) # update negatives - vv = self.X[c] - self.X[d] - for i,v in enumerate(vv): + for i, v in enumerate(neg_vv): wtw = v.dot(A).dot(v) # scalar - alpha = min(_lambda[i+num_pos],gamma_proj*(1./neg_bhat[i] - 1./wtw)) - _lambda[i+num_pos] -= alpha - beta = -alpha/(1 + alpha*wtw) - neg_bhat[i] = 1./((1 / neg_bhat[i]) - (alpha / gamma)) - A += beta * A.dot(np.outer(v,v)).dot(A) + alpha = min(_lambda[i + num_pos], + gamma_proj * (1. / neg_bhat[i] - 1. / wtw)) + _lambda[i + num_pos] -= alpha + beta = -alpha / (1 + alpha * wtw) + neg_bhat[i] = 1. 
/ ((1 / neg_bhat[i]) - (alpha / gamma)) + Av = A.dot(v) + A += np.outer(Av, Av * beta) normsum = np.linalg.norm(_lambda) + np.linalg.norm(lambdaold) if normsum == 0: conv = np.inf break conv = np.abs(lambdaold - _lambda).sum() / normsum - if conv < conv_thresh: + if conv < self.tol: break lambdaold = _lambda.copy() - if verbose: + if self.verbose: print('itml iter: %d, conv = %f' % (it, conv)) - if verbose: + + if self.verbose: print('itml converged at iter: %d, conv = %f' % (it, conv)) + self.n_iter_ = it + + self.components_ = components_from_metric(A) return self - def metric(self): - return self.A -# hack around lack of axis kwarg in older numpy versions -try: - np.linalg.norm([[4]], axis=1) -except TypeError: - def _vector_norm(X): - return np.apply_along_axis(np.linalg.norm, 1, X) -else: - def _vector_norm(X): - return np.linalg.norm(X, axis=1) +class ITML(_BaseITML, _PairsClassifierMixin): + """Information Theoretic Metric Learning (ITML) + `ITML` minimizes the (differential) relative entropy, aka Kullback-Leibler + divergence, between two multivariate Gaussians subject to constraints on the + associated Mahalanobis distance, which can be formulated into a Bregman + optimization problem by minimizing the LogDet divergence subject to + linear constraints. This algorithm can handle a wide variety of constraints + and can optionally incorporate a prior on the distance function. Unlike some + other methods, `ITML` does not rely on an eigenvalue computation or + semi-definite programming. -class ITML_Supervised(ITML): - """Information Theoretic Metric Learning (ITML)""" - def __init__(self, gamma=1., max_iters=1000, convergence_threshold=1e-3, - num_labeled=np.inf, num_constraints=None, bounds=None, A0=None, - verbose=False): - """Initialize the learner. + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + gamma : float, optional (default=1.0) + Value for slack variables + + max_iter : int, optional (default=1000) + Maximum number of iteration of the optimization procedure. + + tol : float, optional (default=1e-3) + Convergence tolerance. + + prior : string or numpy array, optional (default='identity') + The Mahalanobis matrix to use as a prior. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). For ITML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 + + Attributes + ---------- + bounds_ : `numpy.ndarray`, shape=(2,) + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. 
If + not provided at initialization, bounds_[0] and bounds_[1] are set at + train time to the 5th and 95th percentile of the pairwise distances among + all points present in the input `pairs`. + + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. + + Examples + -------- + >>> from metric_learn import ITML + >>> pairs = [[[1.2, 7.5], [1.3, 1.5]], + >>> [[6.4, 2.6], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6]], + >>> [[6.2, 5.5], [5.4, 5.4]]] + >>> y = [1, 1, -1, -1] + >>> # in this task we want points where the first feature is close to be + >>> # closer to each other, no matter how close the second feature is + >>> itml = ITML() + >>> itml.fit(pairs, y) + + References + ---------- + .. [1] Jason V. Davis, et al. `Information-theoretic Metric Learning + `_. ICML 2007. + """ + + def fit(self, pairs, y, bounds=None, calibration_params=None): + """Learn the ITML model. + + The threshold will be calibrated on the trainset using the parameters + `calibration_params`. Parameters ---------- - gamma : float, optional - value for slack variables - max_iters : int, optional - convergence_threshold : float, optional - num_labeled : int, optional - number of labels to preserve for training - num_constraints: int, optional - number of constraints to generate - verbose : bool, optional - if True, prints information while learning + pairs: array-like, shape=(n_constraints, 2, n_features) or \ + (n_constraints, 2) + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. 
+ + y: array-like, of shape (n_constraints,) + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + + bounds : array-like of two numbers + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] will be + set to the 5th and 95th percentile of the pairwise distances among all + points present in the input `pairs`. + + calibration_params : `dict` or `None` + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. + + Returns + ------- + self : object + Returns the instance. """ - ITML.__init__(self, gamma=gamma, max_iters=max_iters, - convergence_threshold=convergence_threshold, verbose=verbose) - self.params.update(num_labeled=num_labeled, num_constraints=num_constraints, - bounds=bounds, A0=A0) + calibration_params = (calibration_params if calibration_params is not + None else dict()) + self._validate_calibration_params(**calibration_params) + self._fit(pairs, y, bounds=bounds) + self.calibrate_threshold(pairs, y, **calibration_params) + return self + + +class ITML_Supervised(_BaseITML, TransformerMixin): + """Supervised version of Information Theoretic Metric Learning (ITML) + + `ITML_Supervised` creates pairs of similar sample by taking same class + samples, and pairs of dissimilar samples by taking different class + samples. It then passes these pairs to `ITML` for training. + + Parameters + ---------- + gamma : float, optional (default=1.0) + Value for slack variables + + max_iter : int, optional (default=1000) + Maximum number of iterations of the optimization procedure. 
+ + tol : float, optional (default=1e-3) + Tolerance of the optimization procedure. + + n_constraints : int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. + + prior : string or numpy array, optional (default='identity') + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). For ITML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. - def fit(self, X, labels): + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. In any + case, `random_state` is also used to randomly sample constraints from + labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 + + Attributes + ---------- + bounds_ : `numpy.ndarray`, shape=(2,) + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. 
+ If not provided at initialization, bounds_[0] and bounds_[1] are set at + train time to the 5th and 95th percentile of the pairwise distances + among all points in the training data `X`. + + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + Examples + -------- + >>> from metric_learn import ITML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> itml = ITML_Supervised(n_constraints=200) + >>> itml.fit(X, Y) + + See Also + -------- + metric_learn.ITML : The original weakly-supervised algorithm + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def __init__(self, gamma=1.0, max_iter=1000, tol=1e-3, + n_constraints=None, prior='identity', + verbose=False, preprocessor=None, random_state=None, + num_constraints='deprecated', + convergence_threshold='deprecated'): + _BaseITML.__init__(self, gamma=gamma, max_iter=max_iter, + tol=tol, + prior=prior, verbose=verbose, + preprocessor=preprocessor, + random_state=random_state, + convergence_threshold=convergence_threshold) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_constraints = num_constraints + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed sholud be set) + self.num_constraints = 'deprecated' + + def fit(self, X, y, bounds=None): """Create constraints from labels and learn the ITML model. - Needs num_constraints specified in constructor. 
+ Parameters ---------- - X : (n x d) data matrix - each row corresponds to a single instance - labels : (n) data labels + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. + + bounds : array-like of two numbers + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] will be + set to the 5th and 95th percentile of the pairwise distances among all + points in the training data `X`. """ - num_constraints = self.params['num_constraints'] - if num_constraints is None: - num_classes = np.unique(labels) - num_constraints = 20*(len(num_classes))**2 - - c = Constraints.random_subset(labels, self.params['num_labeled']) - return ITML.fit(self, X, c.positive_negative_pairs(num_constraints), - bounds=self.params['bounds'], A0=self.params['A0']) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + n_constraints = self.n_constraints + if n_constraints is None: + num_classes = len(np.unique(y)) + n_constraints = 20 * num_classes**2 + + c = Constraints(y) + pos_neg = c.positive_negative_pairs(n_constraints, + random_state=self.random_state) + pairs, y = wrap_pairs(X, pos_neg) + return _BaseITML._fit(self, pairs, y, bounds=bounds) diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 097379de..82ae20eb 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -1,122 +1,171 @@ """ Local Fisher Discriminant Analysis (LFDA) - -Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction -Sugiyama, ICML 2006 - -LFDA is a linear supervised dimensionality reduction method. -It is particularly useful when dealing with multimodality, -where one ore more classes consist of separate clusters in input space. 
-The core optimization problem of LFDA is solved as a generalized -eigenvalue problem. """ -from __future__ import division, absolute_import import numpy as np import scipy -from six.moves import xrange +import warnings from sklearn.metrics import pairwise_distances +from sklearn.base import TransformerMixin -from .base_metric import BaseMetricLearner +from ._util import _check_n_components +from .base_metric import MahalanobisMixin -class LFDA(BaseMetricLearner): +class LFDA(MahalanobisMixin, TransformerMixin): ''' Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction - Sugiyama, ICML 2006 - ''' - def __init__(self, dim=None, k=7, metric='weighted'): - ''' - dim : dimensionality of reduced space (defaults to dimension of X) - k : nearest neighbor used in local scaling method (default: 7) - metric : type of metric in the embedding space (default: 'weighted') - 'weighted' - weighted eigenvectors - 'orthonormalized' - orthonormalized - 'plain' - raw eigenvectors - ''' - if metric not in ('weighted', 'orthonormalized', 'plain'): - raise ValueError('Invalid metric: %r' % metric) - self.params = { - 'dim': dim, - 'metric': metric, - 'k': k, - } - - def transformer(self): - return self._transformer + LFDA is a linear supervised dimensionality reduction method. It is + particularly useful when dealing with multimodality, where one ore more + classes consist of separate clusters in input space. The core optimization + problem of LFDA is solved as a generalized eigenvalue problem. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + k : int, optional (default=None) + Number of nearest neighbors used in local scaling method. If None, + defaults to min(7, n_features - 1). + + embedding_type : str, optional (default: 'weighted') + Type of metric in the embedding space. 
+ + 'weighted' + weighted eigenvectors + + 'orthonormalized' + orthonormalized + + 'plain' + raw eigenvectors + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + Examples + -------- + + >>> import numpy as np + >>> from metric_learn import LFDA + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> lfda = LFDA(k=2, dim=2) + >>> lfda.fit(X, Y) + + References + ---------- + .. [1] Masashi Sugiyama. `Dimensionality Reduction of Multimodal Labeled + Data by Local Fisher Discriminant Analysis + `_. JMLR 2007. + + .. [2] Yuan Tang. `Local Fisher Discriminant Analysis on Beer Style + Clustering + `_. + ''' - def _process_inputs(self, X, Y): - X = np.asanyarray(X) - self.X = X + def __init__(self, n_components=None, + k=None, embedding_type='weighted', preprocessor=None): + if embedding_type not in ('weighted', 'orthonormalized', 'plain'): + raise ValueError('Invalid embedding_type: %r' % embedding_type) + self.n_components = n_components + self.embedding_type = embedding_type + self.k = k + super(LFDA, self).__init__(preprocessor) + + def fit(self, X, y): + '''Fit the LFDA model. + + Parameters + ---------- + X : (n, d) array-like + Input data. + + y : (n,) array-like + Class labels, one per point of data. 
+ ''' + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + unique_classes, y = np.unique(y, return_inverse=True) n, d = X.shape - unique_classes, Y = np.unique(Y, return_inverse=True) num_classes = len(unique_classes) - if self.params['dim'] is None: - self.params['dim'] = d - elif not 0 < self.params['dim'] <= d: - raise ValueError('Invalid embedding dimension, must be in [1,%d]' % d) - - if not 0 < self.params['k'] < d: - raise ValueError('Invalid k, must be in [0,%d]' % (d-1)) + dim = _check_n_components(d, self.n_components) - return X, Y, num_classes, n, d - - def fit(self, X, Y): - ''' - X: (n, d) array-like of samples - Y: (n,) array-like of class labels - ''' - X, Y, num_classes, n, d = self._process_inputs(X, Y) - tSb = np.zeros((d,d)) - tSw = np.zeros((d,d)) + if self.k is None: + k = min(7, d - 1) + elif self.k >= d: + warnings.warn('Chosen k (%d) too large, using %d instead.' + % (self.k, d - 1)) + k = d - 1 + else: + k = int(self.k) + tSb = np.zeros((d, d)) + tSw = np.zeros((d, d)) - for c in xrange(num_classes): - Xc = X[Y==c] + for c in range(num_classes): + Xc = X[y == c] nc = Xc.shape[0] # classwise affinity matrix dist = pairwise_distances(Xc, metric='l2', squared=True) # distances to k-th nearest neighbor - k = min(self.params['k'], nc-1) - sigma = np.sqrt(np.partition(dist, k, axis=0)[:,k]) + k = min(k, nc - 1) + sigma = np.sqrt(np.partition(dist, k, axis=0)[:, k]) local_scale = np.outer(sigma, sigma) with np.errstate(divide='ignore', invalid='ignore'): - A = np.exp(-dist/local_scale) - A[local_scale==0] = 0 + A = np.exp(-dist / local_scale) + A[local_scale == 0] = 0 - G = Xc.T.dot(A.sum(axis=0)[:,None] * Xc) - Xc.T.dot(A).dot(Xc) - tSb += G/n + (1-nc/n)*Xc.T.dot(Xc) + _sum_outer(Xc)/n - tSw += G/nc + G = Xc.T.dot(A.sum(axis=0)[:, None] * Xc) - Xc.T.dot(A).dot(Xc) + tSb += G / n + (1 - nc / n) * Xc.T.dot(Xc) + _sum_outer(Xc) / n + tSw += G / nc - tSb -= _sum_outer(X)/n - tSw + tSb -= _sum_outer(X) / n - tSw # symmetrize - tSb += tSb.T - 
tSb /= 2 - tSw += tSw.T - tSw /= 2 + tSb = (tSb + tSb.T) / 2 + tSw = (tSw + tSw.T) / 2 - if self.params['dim'] == d: - vals, vecs = scipy.linalg.eigh(tSb, tSw) - else: - vals, vecs = scipy.sparse.linalg.eigsh(tSb, k=self.params['dim'], M=tSw, - which='LA') - - order = np.argsort(-vals)[:self.params['dim']] - vals = vals[order] - vecs = vecs[:,order] + vals, vecs = _eigh(tSb, tSw, dim) + order = np.argsort(-vals)[:dim] + vals = vals[order].real + vecs = vecs[:, order] - if self.params['metric'] == 'weighted': + if self.embedding_type == 'weighted': vecs *= np.sqrt(vals) - elif self.params['metric'] == 'orthonormalized': + elif self.embedding_type == 'orthonormalized': vecs, _ = np.linalg.qr(vecs) - self._transformer = vecs.T + self.components_ = vecs.T return self def _sum_outer(x): s = x.sum(axis=0) return np.outer(s, s) + + +def _eigh(a, b, dim): + try: + return scipy.sparse.linalg.eigsh(a, k=dim, M=b, which='LA') + except np.linalg.LinAlgError: + pass # scipy already tried eigh for us + except (ValueError, scipy.sparse.linalg.ArpackNoConvergence): + try: + return scipy.linalg.eigh(a, b) + except np.linalg.LinAlgError: + pass + return scipy.linalg.eig(a, b) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 757d1be5..47bb065f 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,208 +1,320 @@ """ -Large-margin nearest neighbor metric learning. (Weinberger 2005) - -LMNN learns a Mahanalobis distance metric in the kNN classification setting -using semidefinite programming. -The learned metric attempts to keep k-nearest neighbors in the same class, -while keeping examples from different classes separated by a large margin. -This algorithm makes no assumptions about the distribution of the data. 
+Large Margin Nearest Neighbor Metric learning (LMNN) """ -#TODO: periodic recalculation of impostors, PCA initialization - -from __future__ import print_function, absolute_import import numpy as np from collections import Counter -from six.moves import xrange -from sklearn.metrics import pairwise_distances - -from .base_metric import BaseMetricLearner - - -# commonality between LMNN implementations -class _base_LMNN(BaseMetricLearner): - def __init__(self, **kwargs): - self.params = kwargs - - def transformer(self): - return self.L - - -# slower Python version -class python_LMNN(_base_LMNN): - def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, - regularization=0.5, convergence_tol=0.001, verbose=False): - """Initialize the LMNN object - - k: number of neighbors to consider. (does not include self-edges) - regularization: weighting of pull and push terms - """ - _base_LMNN.__init__(self, k=k, min_iter=min_iter, max_iter=max_iter, - learn_rate=learn_rate, regularization=regularization, - convergence_tol=convergence_tol, verbose=verbose) - - def _process_inputs(self, X, labels): - num_pts = X.shape[0] - assert len(labels) == num_pts - unique_labels, self.label_inds = np.unique(labels, return_inverse=True) - self.labels = np.arange(len(unique_labels)) - self.X = X - self.L = np.eye(X.shape[1]) - required_k = np.bincount(self.label_inds).min() - assert self.params['k'] <= required_k, ( - 'not enough class labels for specified k' - ' (smallest class has %d)' % required_k) - - def fit(self, X, labels): - k = self.params['k'] - verbose = self.params['verbose'] - reg = self.params['regularization'] - learn_rate = self.params['learn_rate'] - convergence_tol = self.params['convergence_tol'] - min_iter = self.params['min_iter'] - self._process_inputs(X, labels) - - target_neighbors = self._select_targets() - impostors = self._find_impostors(target_neighbors[:,-1]) +from sklearn.metrics import euclidean_distances +from sklearn.base import TransformerMixin 
+import warnings + +from ._util import _initialize_components, _check_n_components +from .base_metric import MahalanobisMixin + + +class LMNN(MahalanobisMixin, TransformerMixin): + """Large Margin Nearest Neighbor (LMNN) + + LMNN learns a Mahalanobis distance metric in the kNN classification + setting. The learned metric attempts to keep close k-nearest neighbors + from the same class, while keeping examples from different classes + separated by a large margin. This algorithm makes no assumptions about + the distribution of the data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda', as + it uses labels information. If not, but + ``n_components < min(n_features, n_samples)``, we use 'pca', as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. 
Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + n_neighbors : int, optional (default=3) + Number of neighbors to consider, not including self-edges. + + min_iter : int, optional (default=50) + Minimum number of iterations of the optimization procedure. + + max_iter : int, optional (default=1000) + Maximum number of iterations of the optimization procedure. + + learn_rate : float, optional (default=1e-7) + Learning rate of the optimization procedure + + tol : float, optional (default=0.001) + Tolerance of the optimization procedure. If the objective value varies + less than `tol`, we consider the algorithm has converged and stop it. + + verbose : bool, optional (default=False) + Whether to print the progress of the optimization procedure. + + regularization: float, optional (default=0.5) + Relative weight between pull and push terms, with 0.5 meaning equal + weight. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + k : Renamed to n_neighbors. Will be deprecated in 0.7.0 + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. 
+ + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + Examples + -------- + + >>> import numpy as np + >>> from metric_learn import LMNN + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> lmnn = LMNN(n_neighbors=5, learn_rate=1e-6) + >>> lmnn.fit(X, Y, verbose=False) + + References + ---------- + .. [1] K. Q. Weinberger, J. Blitzer, L. K. Saul. `Distance Metric + Learning for Large Margin Nearest Neighbor Classification + `_. NIPS + 2005. + """ + + def __init__(self, init='auto', n_neighbors=3, min_iter=50, max_iter=1000, + learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, + verbose=False, preprocessor=None, + n_components=None, random_state=None, k='deprecated'): + self.init = init + if k != 'deprecated': + warnings.warn('"k" parameter has been renamed to' + ' "n_neighbors". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_neighbors = k + self.k = 'deprecated' # To avoid no_attribute error + self.n_neighbors = n_neighbors + self.min_iter = min_iter + self.max_iter = max_iter + self.learn_rate = learn_rate + self.regularization = regularization + self.convergence_tol = convergence_tol + self.verbose = verbose + self.n_components = n_components + self.random_state = random_state + super(LMNN, self).__init__(preprocessor) + + def fit(self, X, y): + k = self.n_neighbors + reg = self.regularization + learn_rate = self.learn_rate + + X, y = self._prepare_inputs(X, y, dtype=float, + ensure_min_samples=2) + num_pts, d = X.shape + output_dim = _check_n_components(d, self.n_components) + unique_labels, label_inds = np.unique(y, return_inverse=True) + if len(label_inds) != num_pts: + raise ValueError('Must have one label per point.') + self.labels_ = np.arange(len(unique_labels)) + + self.components_ = _initialize_components(output_dim, X, y, self.init, + 
self.verbose, + random_state=self.random_state) + required_k = np.bincount(label_inds).min() + if self.n_neighbors > required_k: + raise ValueError('not enough class labels for specified k' + ' (smallest class has %d)' % required_k) + + target_neighbors = self._select_targets(X, label_inds) # sum outer products - dfG = _sum_outer_products(self.X, target_neighbors.flatten(), - np.repeat(np.arange(self.X.shape[0]), k)) - df = np.zeros_like(dfG) - - # storage - a1 = [None]*k - a2 = [None]*k - for nn_idx in xrange(k): - a1[nn_idx] = np.array([]) - a2[nn_idx] = np.array([]) - - # initialize gradient and L - G = dfG * reg + df * (1-reg) - L = self.L - objective = np.inf + dfG = _sum_outer_products(X, target_neighbors.flatten(), + np.repeat(np.arange(X.shape[0]), k)) + + # initialize L + L = self.components_ + + # first iteration: we compute variables (including objective and gradient) + # at initialization point + G, objective, total_active = self._loss_grad(X, L, dfG, k, + reg, target_neighbors, + label_inds) + + it = 1 # we already made one iteration + + if self.verbose: + print("iter | objective | objective difference | active constraints", + "| learning rate") # main loop - for it in xrange(1, self.params['max_iter']): - df_old = df.copy() - a1_old = [a.copy() for a in a1] - a2_old = [a.copy() for a in a2] - objective_old = objective - # Compute pairwise distances under current metric - Lx = L.dot(self.X.T).T - g0 = _inplace_paired_L2(*Lx[impostors]) - Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:,None,:]) - g1,g2 = Ni[impostors] - - # compute the gradient - total_active = 0 - for nn_idx in reversed(xrange(k)): - act1 = g0 < g1[:,nn_idx] - act2 = g0 < g2[:,nn_idx] - total_active += act1.sum() + act2.sum() - - if it > 1: - plus1 = act1 & ~a1[nn_idx] - minus1 = a1[nn_idx] & ~act1 - plus2 = act2 & ~a2[nn_idx] - minus2 = a2[nn_idx] & ~act2 + for it in range(2, self.max_iter): + # then at each iteration, we try to find a value of L that has better + # objective 
than the previous L, following the gradient: + while True: + # the next point next_L to try out is found by a gradient step + L_next = L - learn_rate * G + # we compute the objective at next point + # we copy variables that can be modified by _loss_grad, because if we + # retry we don t want to modify them several times + (G_next, objective_next, total_active_next) = ( + self._loss_grad(X, L_next, dfG, k, reg, target_neighbors, + label_inds)) + assert not np.isnan(objective) + delta_obj = objective_next - objective + if delta_obj > 0: + # if we did not find a better objective, we retry with an L closer to + # the starting point, by decreasing the learning rate (making the + # gradient step smaller) + learn_rate /= 2 else: - plus1 = act1 - plus2 = act2 - minus1 = np.zeros(0, dtype=int) - minus2 = np.zeros(0, dtype=int) - - targets = target_neighbors[:,nn_idx] - PLUS, pweight = _count_edges(plus1, plus2, impostors, targets) - df += _sum_outer_products(self.X, PLUS[:,0], PLUS[:,1], pweight) - MINUS, mweight = _count_edges(minus1, minus2, impostors, targets) - df -= _sum_outer_products(self.X, MINUS[:,0], MINUS[:,1], mweight) - - in_imp, out_imp = impostors - df += _sum_outer_products(self.X, in_imp[minus1], out_imp[minus1]) - df += _sum_outer_products(self.X, in_imp[minus2], out_imp[minus2]) - - df -= _sum_outer_products(self.X, in_imp[plus1], out_imp[plus1]) - df -= _sum_outer_products(self.X, in_imp[plus2], out_imp[plus2]) - - a1[nn_idx] = act1 - a2[nn_idx] = act2 - - # do the gradient update - assert not np.isnan(df).any() - G = dfG * reg + df * (1-reg) - - # compute the objective function - objective = total_active * (1-reg) - objective += G.flatten().dot(L.T.dot(L).flatten()) - assert not np.isnan(objective) - delta_obj = objective - objective_old - - if verbose: + # otherwise, if we indeed found a better obj, we get out of the loop + break + # when the better L is found (and the related variables), we set the + # old variables to these new ones before next 
iteration and we + # slightly increase the learning rate + L = L_next + G, objective, total_active = G_next, objective_next, total_active_next + learn_rate *= 1.01 + + if self.verbose: print(it, objective, delta_obj, total_active, learn_rate) - # update step size - if delta_obj > 0: - # we're getting worse... roll back! - learn_rate /= 2.0 - df = df_old - a1 = a1_old - a2 = a2_old - objective = objective_old - else: - # update L - L -= learn_rate * 2 * L.dot(G) - learn_rate *= 1.01 - # check for convergence - if it > min_iter and abs(delta_obj) < convergence_tol: - if verbose: + if it > self.min_iter and abs(delta_obj) < self.convergence_tol: + if self.verbose: print("LMNN converged with objective", objective) break else: - if verbose: - print("LMNN didn't converge in %(max_iter)d steps." % self.params) + if self.verbose: + print("LMNN didn't converge in %d steps." % self.max_iter) # store the last L - self.L = L + self.components_ = L + self.n_iter_ = it return self - def metric(self): - return self.L.T.dot(self.L) - - def transform(self, X=None): - if X is None: - X = self.X - return self.L.dot(X.T).T - - def _select_targets(self): - k = self.params['k'] - target_neighbors = np.empty((self.X.shape[0], k), dtype=int) - for label in self.labels: - inds, = np.nonzero(self.label_inds == label) - dd = pairwise_distances(self.X[inds]) + def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): + # Compute pairwise distances under current metric + Lx = L.dot(X.T).T + + # we need to find the furthest neighbor: + Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:, None, :]) + furthest_neighbors = np.take_along_axis(target_neighbors, + Ni.argmax(axis=1)[:, None], 1) + impostors = self._find_impostors(furthest_neighbors.ravel(), X, + label_inds, L) + + g0 = _inplace_paired_L2(*Lx[impostors]) + + # we reorder the target neighbors + g1, g2 = Ni[impostors] + # compute the gradient + total_active = 0 + df = np.zeros((X.shape[1], X.shape[1])) + for nn_idx in 
reversed(range(k)): # note: reverse not useful here + act1 = g0 < g1[:, nn_idx] + act2 = g0 < g2[:, nn_idx] + total_active += act1.sum() + act2.sum() + + targets = target_neighbors[:, nn_idx] + PLUS, pweight = _count_edges(act1, act2, impostors, targets) + df += _sum_outer_products(X, PLUS[:, 0], PLUS[:, 1], pweight) + + in_imp, out_imp = impostors + df -= _sum_outer_products(X, in_imp[act1], out_imp[act1]) + df -= _sum_outer_products(X, in_imp[act2], out_imp[act2]) + + # do the gradient update + assert not np.isnan(df).any() + G = dfG * reg + df * (1 - reg) + G = L.dot(G) + # compute the objective function + objective = total_active * (1 - reg) + objective += G.flatten().dot(L.flatten()) + return 2 * G, objective, total_active + + def _select_targets(self, X, label_inds): + target_neighbors = np.empty((X.shape[0], self.n_neighbors), dtype=int) + for label in self.labels_: + inds, = np.nonzero(label_inds == label) + dd = euclidean_distances(X[inds], squared=True) np.fill_diagonal(dd, np.inf) - nn = np.argsort(dd)[...,:k] + nn = np.argsort(dd)[..., :self.n_neighbors] target_neighbors[inds] = inds[nn] return target_neighbors - def _find_impostors(self, furthest_neighbors): - Lx = self.transform() + def _find_impostors(self, furthest_neighbors, X, label_inds, L): + Lx = X.dot(L.T) margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx) impostors = [] - for label in self.labels[:-1]: - in_inds, = np.nonzero(self.label_inds == label) - out_inds, = np.nonzero(self.label_inds > label) - dist = pairwise_distances(Lx[out_inds], Lx[in_inds]) - i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None]) - i2,j2 = np.nonzero(dist < margin_radii[in_inds]) - i = np.hstack((i1,i2)) - j = np.hstack((j1,j2)) + for label in self.labels_[:-1]: + in_inds, = np.nonzero(label_inds == label) + out_inds, = np.nonzero(label_inds > label) + dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True) + i1, j1 = np.nonzero(dist < margin_radii[out_inds][:, None]) + i2, j2 = 
np.nonzero(dist < margin_radii[in_inds]) + i = np.hstack((i1, i2)) + j = np.hstack((j1, j2)) if i.size > 0: # get unique (i,j) pairs using index trickery - shape = (i.max()+1, j.max()+1) - tmp = np.ravel_multi_index((i,j), shape) - i,j = np.unravel_index(np.unique(tmp), shape) + shape = (i.max() + 1, j.max() + 1) + tmp = np.ravel_multi_index((i, j), shape) + i, j = np.unravel_index(np.unique(tmp), shape) impostors.append(np.vstack((in_inds[j], out_inds[i]))) + if len(impostors) == 0: + # No impostors detected + return impostors return np.hstack(impostors) @@ -213,53 +325,19 @@ def _inplace_paired_L2(A, B): def _count_edges(act1, act2, impostors, targets): - imp = impostors[0,act1] + imp = impostors[0, act1] c = Counter(zip(imp, targets[imp])) - imp = impostors[1,act2] + imp = impostors[1, act2] c.update(zip(imp, targets[imp])) if c: active_pairs = np.array(list(c.keys())) else: - active_pairs = np.empty((0,2), dtype=int) + active_pairs = np.empty((0, 2), dtype=int) return active_pairs, np.array(list(c.values())) def _sum_outer_products(data, a_inds, b_inds, weights=None): Xab = data[a_inds] - data[b_inds] if weights is not None: - return np.dot(Xab.T, Xab * weights[:,None]) + return np.dot(Xab.T, Xab * weights[:, None]) return np.dot(Xab.T, Xab) - - -try: - # use the fast C++ version, if available - from modshogun import LMNN as shogun_LMNN - from modshogun import RealFeatures, MulticlassLabels - - class LMNN(_base_LMNN): - def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, - regularization=0.5, convergence_tol=0.001, use_pca=True, - verbose=False): - _base_LMNN.__init__(self, k=k, min_iter=min_iter, max_iter=max_iter, - learn_rate=learn_rate, regularization=regularization, - convergence_tol=convergence_tol, use_pca=use_pca, - verbose=verbose) - - def fit(self, X, labels): - self.X = X - self.L = np.eye(X.shape[1]) - labels = MulticlassLabels(labels.astype(np.float64)) - self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.params['k']) - 
self._lmnn.set_maxiter(self.params['max_iter']) - self._lmnn.set_obj_threshold(self.params['convergence_tol']) - self._lmnn.set_regularization(self.params['regularization']) - self._lmnn.set_stepsize(self.params['learn_rate']) - if self.params['use_pca']: - self._lmnn.train() - else: - self._lmnn.train(self.L) - self.L = self._lmnn.get_linear_transform() - return self - -except ImportError: - LMNN = python_LMNN diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 3a576ab8..af7fa95b 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -1,176 +1,348 @@ """ -Liu et al. -"Metric Learning from Relative Comparisons by Minimizing Squared Residual". -ICDM 2012. - -Adapted from https://gist.github.com/kcarnold/5439917 -Paper: http://www.cs.ucla.edu/~weiwang/paper/ICDM12.pdf +Metric Learning from Relative Comparisons by Minimizing Squared Residual (LSML) """ -from __future__ import print_function, absolute_import import numpy as np import scipy.linalg -from six.moves import xrange +from sklearn.base import TransformerMixin -from .base_metric import BaseMetricLearner +from .base_metric import _QuadrupletsClassifierMixin, MahalanobisMixin from .constraints import Constraints +from ._util import components_from_metric, _initialize_metric_mahalanobis +import warnings -class LSML(BaseMetricLearner): - def __init__(self, tol=1e-3, max_iter=1000, verbose=False): - """Initialize the learner. 
+class _BaseLSML(MahalanobisMixin): - Parameters - ---------- - tol : float, optional - max_iter : int, optional - verbose : bool, optional - if True, prints information while learning - """ - self.params = { - 'tol': tol, - 'max_iter': max_iter, - 'verbose': verbose, - } - - def _prepare_inputs(self, X, constraints, weights, prior): - self.X = X - a,b,c,d = constraints - self.vab = X[a] - X[b] - self.vcd = X[c] - X[d] - assert self.vab.shape == self.vcd.shape, 'Constraints must have same length' - if weights is None: - self.w = np.ones(self.vab.shape[0]) - else: - self.w = weights - self.w /= self.w.sum() # weights must sum to 1 - if prior is None: - self.M = np.cov(X.T) - else: - self.M = prior + _tuple_size = 4 # constraints are quadruplets - def metric(self): - return self.M + def __init__(self, tol=1e-3, max_iter=1000, prior='identity', + verbose=False, preprocessor=None, random_state=None): + self.prior = prior + self.tol = tol + self.max_iter = max_iter + self.verbose = verbose + self.random_state = random_state + super(_BaseLSML, self).__init__(preprocessor) - def fit(self, X, constraints, weights=None, prior=None): - """Learn the LSML model. 
+ def _fit(self, quadruplets, weights=None): + quadruplets = self._prepare_inputs(quadruplets, + type_of_inputs='tuples') + + # check to make sure that no two constrained vectors are identical + vab = quadruplets[:, 0, :] - quadruplets[:, 1, :] + vcd = quadruplets[:, 2, :] - quadruplets[:, 3, :] + if vab.shape != vcd.shape: + raise ValueError('Constraints must have same length') + if weights is None: + self.w_ = np.ones(vab.shape[0]) + else: + self.w_ = weights + self.w_ /= self.w_.sum() # weights must sum to 1 + M, prior_inv = _initialize_metric_mahalanobis( + quadruplets, self.prior, + return_inverse=True, strict_pd=True, matrix_name='prior', + random_state=self.random_state) - Parameters - ---------- - X : (n x d) data matrix - each row corresponds to a single instance - constraints : 4-tuple of arrays - (a,b,c,d) indices into X, such that d(X[a],X[b]) < d(X[c],X[d]) - weights : (m,) array of floats, optional - scale factor for each constraint - prior : (d x d) matrix, optional - guess at a metric [default: covariance(X)] - """ - verbose = self.params['verbose'] - self._prepare_inputs(X, constraints, weights, prior) - prior_inv = scipy.linalg.inv(self.M) - s_best = self._total_loss(self.M, prior_inv) step_sizes = np.logspace(-10, 0, 10) - if verbose: + # Keep track of the best step size and the loss at that step. 
+ l_best = 0 + s_best = self._total_loss(M, vab, vcd, prior_inv) + if self.verbose: print('initial loss', s_best) - tol = self.params['tol'] - for it in xrange(1, self.params['max_iter']+1): - grad = self._gradient(self.M, prior_inv) + for it in range(1, self.max_iter + 1): + grad = self._gradient(M, vab, vcd, prior_inv) grad_norm = scipy.linalg.norm(grad) - if grad_norm < tol: + if grad_norm < self.tol: break - if verbose: + if self.verbose: print('gradient norm', grad_norm) M_best = None for step_size in step_sizes: step_size /= grad_norm - new_metric = self.M - step_size * grad + new_metric = M - step_size * grad w, v = scipy.linalg.eigh(new_metric) new_metric = v.dot((np.maximum(w, 1e-8) * v).T) - cur_s = self._total_loss(new_metric, prior_inv) + cur_s = self._total_loss(new_metric, vab, vcd, prior_inv) if cur_s < s_best: l_best = step_size s_best = cur_s M_best = new_metric - if verbose: + if self.verbose: print('iter', it, 'cost', s_best, 'best step', l_best * grad_norm) if M_best is None: break - self.M = M_best + M = M_best else: - if verbose: + if self.verbose: print("Didn't converge after", it, "iterations. 
Final loss:", s_best) + self.n_iter_ = it + + self.components_ = components_from_metric(M) return self - def _comparison_loss(self, metric): - dab = np.sum(self.vab.dot(metric) * self.vab, axis=1) - dcd = np.sum(self.vcd.dot(metric) * self.vcd, axis=1) + def _comparison_loss(self, metric, vab, vcd): + dab = np.sum(vab.dot(metric) * vab, axis=1) + dcd = np.sum(vcd.dot(metric) * vcd, axis=1) violations = dab > dcd - return self.w[violations].dot((np.sqrt(dab[violations]) - - np.sqrt(dcd[violations]))**2) + return self.w_[violations].dot((np.sqrt(dab[violations]) - + np.sqrt(dcd[violations]))**2) - def _total_loss(self, metric, prior_inv): - return (self._comparison_loss(metric) + - _regularization_loss(metric, prior_inv)) + def _total_loss(self, metric, vab, vcd, prior_inv): + # Regularization loss + sign, logdet = np.linalg.slogdet(metric) + reg_loss = np.sum(metric * prior_inv) - sign * logdet + return self._comparison_loss(metric, vab, vcd) + reg_loss - def _gradient(self, metric, prior_inv): - dMetric = prior_inv - scipy.linalg.inv(metric) - dabs = np.sum(self.vab.dot(metric) * self.vab, axis=1) - dcds = np.sum(self.vcd.dot(metric) * self.vcd, axis=1) + def _gradient(self, metric, vab, vcd, prior_inv): + dMetric = prior_inv - np.linalg.inv(metric) + dabs = np.sum(vab.dot(metric) * vab, axis=1) + dcds = np.sum(vcd.dot(metric) * vcd, axis=1) violations = dabs > dcds # TODO: vectorize - for vab, dab, vcd, dcd in zip(self.vab[violations], dabs[violations], - self.vcd[violations], dcds[violations]): - dMetric += ((1-np.sqrt(dcd/dab))*np.outer(vab, vab) + - (1-np.sqrt(dab/dcd))*np.outer(vcd, vcd)) + for vab, dab, vcd, dcd in zip(vab[violations], dabs[violations], + vcd[violations], dcds[violations]): + dMetric += ((1 - np.sqrt(dcd / dab)) * np.outer(vab, vab) + + (1 - np.sqrt(dab / dcd)) * np.outer(vcd, vcd)) return dMetric -def _regularization_loss(metric, prior_inv): - sign, logdet = np.linalg.slogdet(metric) - return np.sum(metric * prior_inv) - sign * logdet +class 
LSML(_BaseLSML, _QuadrupletsClassifierMixin): + """Least Squared-residual Metric Learning (LSML) + + `LSML` proposes a simple, yet effective, algorithm that minimizes a convex + objective function corresponding to the sum of squared residuals of + constraints. This algorithm uses the constraints in the form of the + relative distance comparisons, such method is especially useful where + pairwise constraints are not natural to obtain, thus pairwise constraints + based algorithms become infeasible to be deployed. Furthermore, its sparsity + extension leads to more stable estimation when the dimension is high and + only a small amount of constraints is given. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For LSML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random positive definite + (PD) matrix of shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + tol : float, optional (default=1e-3) + Convergence tolerance of the optimization procedure. + + max_iter : int, optional (default=1000) + Maximum number of iteration of the optimization procedure. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
+ + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + Examples + -------- + >>> from metric_learn import LSML + >>> quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]], + >>> [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]], + >>> [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]] + >>> # we want to make closer points where the first feature is close, and + >>> # further if the second feature is close + >>> lsml = LSML() + >>> lsml.fit(quadruplets) + References + ---------- + .. [1] Liu et al. `Metric Learning from Relative Comparisons by Minimizing + Squared Residual + `_. ICDM 2012. -class LSML_Supervised(LSML): - def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf, - num_constraints=None, weights=None, verbose=False): - """Initialize the learner. + .. [2] Code adapted from https://gist.github.com/kcarnold/5439917 + + See Also + -------- + metric_learn.LSML : The original weakly-supervised algorithm + + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def fit(self, quadruplets, weights=None): + """Learn the LSML model. 
 Parameters ---------- - tol : float, optional - max_iter : int, optional - prior : (d x d) matrix, optional - guess at a metric [default: covariance(X)] - num_labeled : int, optional - number of labels to preserve for training - num_constraints: int, optional - number of constraints to generate - weights : (m,) array of floats, optional - scale factor for each constraint - verbose : bool, optional - if True, prints information while learning + quadruplets : array-like, shape=(n_constraints, 4, n_features) or \ + (n_constraints, 4) + 3D array-like of quadruplets of points or 2D array of quadruplets of + indicators. In order to supervise the algorithm in the right way, we + should have the four samples ordered in a way such that: + d(X[i, 0], X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < + n_constraints. + + weights : (n_constraints,) array of floats, optional + scale factor for each constraint + + Returns + ------- + self : object + Returns the instance. """ - LSML.__init__(self, tol=tol, max_iter=max_iter, verbose=verbose) - self.params.update(prior=prior, num_labeled=num_labeled, - num_constraints=num_constraints, weights=weights) + return self._fit(quadruplets, weights=weights) + + +class LSML_Supervised(_BaseLSML, TransformerMixin): + """Supervised version of Least Squared-residual Metric Learning (LSML) + + `LSML_Supervised` creates quadruplets from labeled samples by taking two + samples from the same class, and two samples from different classes. + This way it builds quadruplets where the two first points must be more + similar than the two last points. + + Parameters + ---------- + tol : float, optional (default=1e-3) + Convergence tolerance of the optimization procedure. + + max_iter : int, optional (default=1000) + Number of maximum iterations of the optimization procedure. + + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. 
Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For LSML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random positive definite + (PD) matrix of shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - def fit(self, X, labels): + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + n_constraints: int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. + + weights : (n_constraints,) array of floats, optional (default=None) + Relative weight given to each constraint. If None, defaults to uniform + weights. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. In any case, `random_state` is also used to randomly sample + constraints from labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + Examples + -------- + >>> from metric_learn import LSML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> lsml = LSML_Supervised(n_constraints=200) + >>> lsml.fit(X, Y) + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. 
+ + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + """ + + def __init__(self, tol=1e-3, max_iter=1000, prior='identity', + n_constraints=None, weights=None, + verbose=False, preprocessor=None, random_state=None, + num_constraints='deprecated'): + _BaseLSML.__init__(self, tol=tol, max_iter=max_iter, prior=prior, + verbose=verbose, preprocessor=preprocessor, + random_state=random_state) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed should be set) + self.num_constraints = 'deprecated' + self.weights = weights + + def fit(self, X, y): """Create constraints from labels and learn the LSML model. - Needs num_constraints specified in constructor. Parameters ---------- - X : (n x d) data matrix - each row corresponds to a single instance - labels : (n) data labels + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. 
""" - num_constraints = self.params['num_constraints'] - if num_constraints is None: - num_classes = np.unique(labels) - num_constraints = 20*(len(num_classes))**2 - - c = Constraints.random_subset(labels, self.params['num_labeled']) - pairs = c.positive_negative_pairs(num_constraints, same_length=True) - return LSML.fit(self, X, pairs, weights=self.params['weights'], - prior=self.params['prior']) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + n_constraints = self.n_constraints + if n_constraints is None: + num_classes = len(np.unique(y)) + n_constraints = 20 * num_classes**2 + + c = Constraints(y) + pos_neg = c.positive_negative_pairs(n_constraints, same_length=True, + random_state=self.random_state) + return _BaseLSML._fit(self, X[np.column_stack(pos_neg)], + weights=self.weights) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py new file mode 100644 index 00000000..01d185e7 --- /dev/null +++ b/metric_learn/mlkr.py @@ -0,0 +1,208 @@ +""" +Metric Learning for Kernel Regression (MLKR) +""" +import time +import sys +import warnings +import numpy as np +from scipy.optimize import minimize +from scipy.special import logsumexp +from sklearn.base import TransformerMixin +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances + +from .base_metric import MahalanobisMixin +from ._util import _initialize_components, _check_n_components + +EPS = np.finfo(float).eps + + +class MLKR(MahalanobisMixin, TransformerMixin): + """Metric Learning for Kernel Regression (MLKR) + + MLKR is an algorithm for supervised metric learning, which learns a + distance function by directly minimizing the leave-one-out regression error. + This algorithm can also be viewed as a supervised variation of PCA and can be + used for dimensionality reduction and high dimensional data visualization. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components < min(n_features, n_samples)``, + we use 'pca', as it projects data in meaningful directions (those + of higher variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + tol : float, optional (default=None) + Convergence tolerance for the optimization. + + max_iter : int, optional (default=1000) + Cap on number of conjugate gradient iterations. + + verbose : bool, optional (default=False) + Whether to print progress messages or not. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
+ + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + Examples + -------- + + >>> from metric_learn import MLKR + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> mlkr = MLKR() + >>> mlkr.fit(X, Y) + + References + ---------- + .. [1] K.Q. Weinberger and G. Tesauto. `Metric Learning for Kernel + Regression `_. AISTATS 2007. + """ + + def __init__(self, n_components=None, init='auto', + tol=None, max_iter=1000, verbose=False, + preprocessor=None, random_state=None): + self.n_components = n_components + self.init = init + self.tol = tol + self.max_iter = max_iter + self.verbose = verbose + self.random_state = random_state + super(MLKR, self).__init__(preprocessor) + + def fit(self, X, y): + """ + Fit MLKR model + + Parameters + ---------- + X : (n x d) array of samples + y : (n) data labels + """ + X, y = self._prepare_inputs(X, y, y_numeric=True, + ensure_min_samples=2) + n, d = X.shape + if y.shape[0] != n: + raise ValueError('Data and label lengths mismatch: %d != %d' + % (n, y.shape[0])) + + m = _check_n_components(d, self.n_components) + m = self.n_components + if m is None: + m = d + # if the init is the default (None), we raise a warning + A = _initialize_components(m, X, y, init=self.init, + random_state=self.random_state, + # MLKR works on regression targets: + has_classes=False) + + # Measure the total training time + train_time = time.time() + + self.n_iter_ = 0 + res = 
minimize(self._loss, A.ravel(), (X, y), method='L-BFGS-B', + jac=True, tol=self.tol, + options=dict(maxiter=self.max_iter)) + self.components_ = res.x.reshape(A.shape) + + # Stop timer + train_time = time.time() - train_time + if self.verbose: + cls_name = self.__class__.__name__ + # Warn the user if the algorithm did not converge + if not res.success: + warnings.warn('[{}] MLKR did not converge: {}' + .format(cls_name, res.message), ConvergenceWarning) + print('[{}] Training took {:8.2f}s.'.format(cls_name, train_time)) + + return self + + def _loss(self, flatA, X, y): + + if self.n_iter_ == 0 and self.verbose: + header_fields = ['Iteration', 'Objective Value', 'Time(s)'] + header_fmt = '{:>10} {:>20} {:>10}' + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print('[{cls}]'.format(cls=cls_name)) + print('[{cls}] {header}\n[{cls}] {sep}'.format(cls=cls_name, + header=header, + sep='-' * len(header))) + + start_time = time.time() + + A = flatA.reshape((-1, X.shape[1])) + X_embedded = np.dot(X, A.T) + dist = pairwise_distances(X_embedded, squared=True) + np.fill_diagonal(dist, np.inf) + softmax = np.exp(- dist - logsumexp(- dist, axis=1)[:, np.newaxis]) + yhat = softmax.dot(y) + ydiff = yhat - y + cost = (ydiff ** 2).sum() + + # also compute the gradient + W = softmax * ydiff[:, np.newaxis] * (y - yhat[:, np.newaxis]) + W_sym = W + W.T + np.fill_diagonal(W_sym, - W.sum(axis=0)) + grad = 4 * (X_embedded.T.dot(W_sym)).dot(X) + + if self.verbose: + start_time = time.time() - start_time + values_fmt = '[{cls}] {n_iter:>10} {loss:>20.6e} {start_time:>10.2f}' + print(values_fmt.format(cls=self.__class__.__name__, + n_iter=self.n_iter_, loss=cost, + start_time=start_time)) + sys.stdout.flush() + + self.n_iter_ += 1 + + return cost, grad.ravel() diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py new file mode 100644 index 00000000..5cf166fd --- /dev/null +++ b/metric_learn/mmc.py @@ -0,0 +1,601 @@ +"""Mahalanobis Metric for Clustering 
(MMC)""" +import numpy as np +from sklearn.base import TransformerMixin +from sklearn.utils.validation import assert_all_finite + +from .base_metric import _PairsClassifierMixin, MahalanobisMixin +from .constraints import Constraints, wrap_pairs +from ._util import components_from_metric, _initialize_metric_mahalanobis +import warnings + + +class _BaseMMC(MahalanobisMixin): + + _tuple_size = 2 # constraints are pairs + + def __init__(self, max_iter=100, max_proj=10000, tol=1e-3, + init='identity', diagonal=False, + diagonal_c=1.0, verbose=False, preprocessor=None, + random_state=None, + convergence_threshold='deprecated'): + if convergence_threshold != 'deprecated': + warnings.warn('"convergence_threshold" parameter has been ' + ' renamed to "tol". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + tol = convergence_threshold + self.convergence_threshold = 'deprecated' # Avoid errors + self.max_iter = max_iter + self.max_proj = max_proj + self.tol = tol + self.init = init + self.diagonal = diagonal + self.diagonal_c = diagonal_c + self.verbose = verbose + self.random_state = random_state + super(_BaseMMC, self).__init__(preprocessor) + + def _fit(self, pairs, y): + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') + + self.A_ = _initialize_metric_mahalanobis(pairs, self.init, + random_state=self.random_state, + matrix_name='init') + + if self.diagonal: + return self._fit_diag(pairs, y) + else: + return self._fit_full(pairs, y) + + def _fit_full(self, pairs, y): + """Learn full metric using MMC. + + Parameters + ---------- + X : (n x d) data matrix + Each row corresponds to a single instance. + constraints : 4-tuple of arrays + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs. 
+ """ + num_dim = pairs.shape[2] + + error2 = 1e10 + eps = 0.01 # error-bound of iterative projection on C1 and C2 + A = self.A_ + + pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] + + # Create weight vector from similar samples + pos_diff = pos_pairs[:, 0, :] - pos_pairs[:, 1, :] + w = np.einsum('ij,ik->jk', pos_diff, pos_diff).ravel() + # `w` is the sum of all outer products of the rows in `pos_diff`. + # The above `einsum` is equivalent to the much more inefficient: + # w = np.apply_along_axis( + # lambda x: np.outer(x,x).ravel(), + # 1, + # X[a] - X[b] + # ).sum(axis = 0) + t = w.dot(A.ravel()) / 100.0 + + w_norm = np.linalg.norm(w) + w1 = w / w_norm # make `w` a unit vector + t1 = t / w_norm # distance from origin to `w^T*x=t` plane + + cycle = 1 + alpha = 0.1 # initial step size along gradient + grad1 = self._fS1(pos_pairs, A) # gradient of similarity + # constraint function + grad2 = self._fD1(neg_pairs, A) # gradient of dissimilarity + # constraint function + # gradient of fD1 orthogonal to fS1: + M = self._grad_projection(grad1, grad2) + + A_old = A.copy() + + for cycle in range(self.max_iter): + + # projection of constraints C1 and C2 + satisfy = False + + for it in range(self.max_proj): + + # First constraint: + # f(A) = \sum_{i,j \in S} d_ij' A d_ij <= t (1) + # (1) can be rewritten as a linear constraint: w^T x = t, + # where x is the unrolled matrix of A, + # w is also an unrolled matrix of W where + # W_{kl}= \sum_{i,j \in S}d_ij^k * d_ij^l + x0 = A.ravel() + if w.dot(x0) <= t: + x = x0 + else: + x = x0 + (t1 - w1.dot(x0)) * w1 + A[:] = x.reshape(num_dim, num_dim) + + # Second constraint: + # PSD constraint A >= 0 + # project A onto domain A>0 + l, V = np.linalg.eigh((A + A.T) / 2) + A[:] = np.dot(V * np.maximum(0, l[None, :]), V.T) + + fDC2 = w.dot(A.ravel()) + error2 = (fDC2 - t) / t + if error2 < eps: + satisfy = True + break + + # third constraint: gradient ascent + # max: g(A) >= 1 + # here we suppose g(A) = fD(A) = \sum_{I,J \in D} 
sqrt(d_ij' A d_ij) + + obj_previous = self._fD(neg_pairs, A_old) # g(A_old) + obj = self._fD(neg_pairs, A) # g(A) + + if satisfy and (obj > obj_previous or cycle == 0): + + # If projection of 1 and 2 is successful, and such projection + # improves objective function, slightly increase learning rate + # and update from the current A. + alpha *= 1.05 + A_old[:] = A + grad2 = self._fS1(pos_pairs, A) + grad1 = self._fD1(neg_pairs, A) + M = self._grad_projection(grad1, grad2) + A += alpha * M + + else: + + # If projection of 1 and 2 failed, or obj <= obj_previous due + # to projection of 1 and 2, shrink learning rate and re-update + # from the previous A. + alpha /= 2 + A[:] = A_old + alpha * M + + delta = np.linalg.norm(alpha * M) / np.linalg.norm(A_old) + if delta < self.tol: + break + if self.verbose: + print('mmc iter: %d, conv = %f, projections = %d' % + (cycle, delta, it + 1)) + + if delta > self.tol: + self.converged_ = False + if self.verbose: + print('mmc did not converge, conv = %f' % (delta,)) + else: + self.converged_ = True + if self.verbose: + print('mmc converged at iter %d, conv = %f' % (cycle, delta)) + self.A_[:] = A_old + self.n_iter_ = cycle + + self.components_ = components_from_metric(self.A_) + return self + + def _fit_diag(self, pairs, y): + """Learn diagonal metric using MMC. + Parameters + ---------- + X : (n x d) data matrix + Each row corresponds to a single instance. + constraints : 4-tuple of arrays + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs. 
+ """ + num_dim = pairs.shape[2] + pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] + s_sum = np.sum((pos_pairs[:, 0, :] - pos_pairs[:, 1, :]) ** 2, axis=0) + + it = 0 + error = 1.0 + eps = 1e-6 + reduction = 2.0 + w = np.diag(self.A_).copy() + + while error > self.tol and it < self.max_iter: + + fD0, fD_1st_d, fD_2nd_d = self._D_constraint(neg_pairs, w) + obj_initial = np.dot(s_sum, w) + self.diagonal_c * fD0 + fS_1st_d = s_sum # first derivative of the similarity constraints + + # gradient of the objective: + gradient = fS_1st_d - self.diagonal_c * fD_1st_d + # Hessian of the objective: + hessian = -self.diagonal_c * fD_2nd_d + eps * np.eye(num_dim) + step = np.dot(np.linalg.inv(hessian), gradient) + + # Newton-Rapshon update + # search over optimal lambda + lambd = 1 # initial step-size + w_tmp = np.maximum(0, w - lambd * step) + obj = (np.dot(s_sum, w_tmp) + self.diagonal_c * + self._D_objective(neg_pairs, w_tmp)) + assert_all_finite(obj) + obj_previous = np.inf # just to get the while-loop started + + inner_it = 0 + while obj < obj_previous: + obj_previous = obj + w_previous = w_tmp.copy() + lambd /= reduction + w_tmp = np.maximum(0, w - lambd * step) + obj = (np.dot(s_sum, w_tmp) + self.diagonal_c * + self._D_objective(neg_pairs, w_tmp)) + inner_it += 1 + assert_all_finite(obj) + + w[:] = w_previous + error = np.abs((obj_previous - obj_initial) / obj_previous) + if self.verbose: + print('mmc iter: %d, conv = %f' % (it, error)) + it += 1 + + self.A_ = np.diag(w) + + self.components_ = components_from_metric(self.A_) + return self + + def _fD(self, neg_pairs, A): + r"""The value of the dissimilarity constraint function. + + f = f(\sum_{ij \in D} distance(x_i, x_j)) + i.e. distance can be L1: \sqrt{(x_i-x_j)A(x_i-x_j)'} + """ + diff = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] + return np.log(np.sum(np.sqrt(np.sum(np.dot(diff, A) * diff, axis=1))) + + 1e-6) + + def _fD1(self, neg_pairs, A): + r"""The gradient of the dissimilarity constraint function w.r.t. A. 
+ + For example, let distance by L1 norm: + f = f(\sum_{ij \in D} \sqrt{(x_i-x_j)A(x_i-x_j)'}) + df/dA_{kl} = f'* d(\sum_{ij \in D} \sqrt{(x_i-x_j)^k*(x_i-x_j)^l})/dA_{kl} + + Note that d_ij*A*d_ij' = tr(d_ij*A*d_ij') = tr(d_ij'*d_ij*A) + so, d(d_ij*A*d_ij')/dA = d_ij'*d_ij + df/dA = f'(\sum_{ij \in D} \sqrt{tr(d_ij'*d_ij*A)}) + * 0.5*(\sum_{ij \in D} (1/sqrt{tr(d_ij'*d_ij*A)})*(d_ij'*d_ij)) + """ + diff = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] + # outer products of all rows in `diff` + M = np.einsum('ij,ik->ijk', diff, diff) + # faster version of: dist = np.sqrt(np.sum(M * A[None,:,:], axis=(1,2))) + dist = np.sqrt(np.einsum('ijk,jk', M, A)) + # faster version of: sum_deri = np.sum(M / + # (2 * (dist[:,None,None] + 1e-6)), axis=0) + sum_deri = np.einsum('ijk,i->jk', M, 0.5 / (dist + 1e-6)) + sum_dist = dist.sum() + return sum_deri / (sum_dist + 1e-6) + + def _fS1(self, pos_pairs, A): + r"""The gradient of the similarity constraint function w.r.t. A. + + f = \sum_{ij}(x_i-x_j)A(x_i-x_j)' = \sum_{ij}d_ij*A*d_ij' + df/dA = d(d_ij*A*d_ij')/dA + + Note that d_ij*A*d_ij' = tr(d_ij*A*d_ij') = tr(d_ij'*d_ij*A) + so, d(d_ij*A*d_ij')/dA = d_ij'*d_ij + """ + diff = pos_pairs[:, 0, :] - pos_pairs[:, 1, :] + # sum of outer products of all rows in `diff`: + return np.einsum('ij,ik->jk', diff, diff) + + def _grad_projection(self, grad1, grad2): + grad2 = grad2 / np.linalg.norm(grad2) + gtemp = grad1 - np.sum(grad1 * grad2) * grad2 + gtemp /= np.linalg.norm(gtemp) + return gtemp + + def _D_objective(self, neg_pairs, w): + return np.log(np.sum(np.sqrt(np.sum(((neg_pairs[:, 0, :] - + neg_pairs[:, 1, :]) ** 2) * + w[None, :], axis=1) + 1e-6))) + + def _D_constraint(self, neg_pairs, w): + """Compute the value, 1st derivative, second derivative (Hessian) of + a dissimilarity constraint function gF(sum_ij distance(d_ij A d_ij)) + where A is a diagonal matrix (in the form of a column vector 'w'). 
+ """ + diff = neg_pairs[:, 0, :] - neg_pairs[:, 1, :] + diff_sq = diff * diff + dist = np.sqrt(diff_sq.dot(w)) + sum_deri1 = np.einsum('ij,i', diff_sq, 0.5 / np.maximum(dist, 1e-6)) + sum_deri2 = np.einsum( + 'ij,ik->jk', + diff_sq, + diff_sq / (-4 * np.maximum(1e-6, dist**3))[:, None] + ) + sum_dist = dist.sum() + return ( + np.log(sum_dist), + sum_deri1 / sum_dist, + sum_deri2 / sum_dist - + np.outer(sum_deri1, sum_deri1) / (sum_dist * sum_dist) + ) + + +class MMC(_BaseMMC, _PairsClassifierMixin): + """Mahalanobis Metric for Clustering (MMC) + + MMC minimizes the sum of squared distances between similar points, while + enforcing the sum of distances between dissimilar ones to be greater than + one. This leads to a convex and, thus, local-minima-free optimization + problem that can be solved efficiently. + However, the algorithm involves the computation of eigenvalues, which is the + main speed-bottleneck. Since it has initially been designed for clustering + applications, one of the implicit assumptions of MMC is that all classes form + a compact set, i.e., follow a unimodal distribution, which restricts the + possible use-cases of this method. However, it is one of the earliest and a + still often cited technique. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + max_iter : int, optional (default=100) + Maximum number of iterations of the optimization procedure. + + max_proj : int, optional (default=10000) + Maximum number of projection steps. + + tol : float, optional (default=1e-3) + Convergence threshold for the optimization procedure. + + init : string or numpy array, optional (default='identity') + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The (pseudo-)inverse of the covariance matrix. 
+ + 'random' + The initial Mahalanobis matrix will be a random SPD matrix of + shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + An SPD matrix of shape (n_features, n_features), that will + be used as such to initialize the metric. + + diagonal : bool, optional (default=False) + If True, a diagonal metric will be learned, + i.e., a simple scaling of dimensions. The initialization will then + be the diagonal coefficients of the matrix given as 'init'. + + diagonal_c : float, optional (default=1.0) + Weight of the dissimilarity constraint for diagonal + metric learning. Ignored if ``diagonal=False``. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. 
+ + Examples + -------- + >>> from metric_learn import MMC + >>> pairs = [[[1.2, 7.5], [1.3, 1.5]], + >>> [[6.4, 2.6], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6]], + >>> [[6.2, 5.5], [5.4, 5.4]]] + >>> y = [1, 1, -1, -1] + >>> # in this task we want points where the first feature is close to be + >>> # closer to each other, no matter how close the second feature is + >>> mmc = MMC() + >>> mmc.fit(pairs, y) + + References + ---------- + .. [1] Xing, Jordan, Russell, Ng. `Distance metric learning with application + to clustering with side-information + `_. + NIPS 2002. + + See Also + -------- + metric_learn.MMC : The original weakly-supervised algorithm + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def fit(self, pairs, y, calibration_params=None): + """Learn the MMC model. + + The threshold will be calibrated on the trainset using the parameters + `calibration_params`. + + Parameters + ---------- + pairs : array-like, shape=(n_constraints, 2, n_features) or \ + (n_constraints, 2) + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. + + y : array-like, of shape (n_constraints,) + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + + calibration_params : `dict` or `None` + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. + + Returns + ------- + self : object + Returns the instance. 
+ """ + calibration_params = (calibration_params if calibration_params is not + None else dict()) + self._validate_calibration_params(**calibration_params) + self._fit(pairs, y) + self.calibrate_threshold(pairs, y, **calibration_params) + return self + + +class MMC_Supervised(_BaseMMC, TransformerMixin): + """Supervised version of Mahalanobis Metric for Clustering (MMC) + + `MMC_Supervised` creates pairs of similar sample by taking same class + samples, and pairs of dissimilar samples by taking different class + samples. It then passes these pairs to `MMC` for training. + + Parameters + ---------- + max_iter : int, optional (default=100) + Maximum number of iterations of the optimization procedure. + + max_proj : int, optional (default=10000) + Maximum number of projection steps. + + tol : float, optional (default=1e-3) + Convergence threshold for the optimization procedure. + + n_constraints: int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. + + init : string or numpy array, optional (default='identity') + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The (pseudo-)inverse of the covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random SPD matrix of + shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A numpy array of shape (n_features, n_features), that will + be used as such to initialize the metric. + + diagonal : bool, optional (default=False) + If True, a diagonal metric will be learned, + i.e., a simple scaling of dimensions. The initialization will then + be the diagonal coefficients of the matrix given as 'init'. 
+ + diagonal_c : float, optional (default=1.0) + Weight of the dissimilarity constraint for diagonal + metric learning. Ignored if ``diagonal=False``. + + verbose : bool, optional (default=False) + If True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + Mahalanobis matrix. In any case, `random_state` is also used to + randomly sample constraints from labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0 + + Examples + -------- + >>> from metric_learn import MMC_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> mmc = MMC_Supervised(n_constraints=200) + >>> mmc.fit(X, Y) + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) 
+ """ + + def __init__(self, max_iter=100, max_proj=10000, tol=1e-6, + n_constraints=None, init='identity', + diagonal=False, diagonal_c=1.0, verbose=False, + preprocessor=None, random_state=None, + num_constraints='deprecated', + convergence_threshold='deprecated'): + _BaseMMC.__init__(self, max_iter=max_iter, max_proj=max_proj, + tol=tol, + init=init, diagonal=diagonal, + diagonal_c=diagonal_c, verbose=verbose, + preprocessor=preprocessor, + random_state=random_state, + convergence_threshold=convergence_threshold) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed sholud be set) + self.num_constraints = 'deprecated' + + def fit(self, X, y): + """Create constraints from labels and learn the MMC model. + + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. 
+ """ + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + n_constraints = self.n_constraints + if n_constraints is None: + num_classes = len(np.unique(y)) + n_constraints = 20 * num_classes**2 + + c = Constraints(y) + pos_neg = c.positive_negative_pairs(n_constraints, + random_state=self.random_state) + pairs, y = wrap_pairs(X, pos_neg) + return _BaseMMC._fit(self, pairs, y) diff --git a/metric_learn/nca.py b/metric_learn/nca.py index c0616e2f..7b4423d3 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -1,54 +1,225 @@ """ Neighborhood Components Analysis (NCA) -Ported to Python from https://github.com/vomjom/nca """ -from __future__ import absolute_import +import warnings +import time +import sys import numpy as np -from six.moves import xrange +from scipy.optimize import minimize +from scipy.special import logsumexp +from sklearn.base import TransformerMixin +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances -from .base_metric import BaseMetricLearner +from ._util import _initialize_components, _check_n_components +from .base_metric import MahalanobisMixin +EPS = np.finfo(float).eps -class NCA(BaseMetricLearner): - def __init__(self, max_iter=100, learning_rate=0.01): - self.params = { - 'max_iter': max_iter, - 'learning_rate': learning_rate, - } - self.A = None - def transformer(self): - return self.A +class NCA(MahalanobisMixin, TransformerMixin): + """Neighborhood Components Analysis (NCA) - def fit(self, X, labels): + NCA is a distance metric learning algorithm which aims to improve the + accuracy of nearest neighbors classification compared to the standard + Euclidean distance. The algorithm directly maximizes a stochastic variant + of the leave-one-out k-nearest neighbors(KNN) score on the training set. + It can also learn a low-dimensional linear transformation of data that can + be used for data visualization and fast classification. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + init : string or numpy array, optional (default='auto') + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda', as + it uses labels information. If not, but + ``n_components < min(n_features, n_samples)``, we use 'pca', as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + max_iter : int, optional (default=100) + Maximum number of iterations done by the optimization algorithm. 
+ + tol : float, optional (default=None) + Convergence tolerance for the optimization. + + verbose : bool, optional (default=False) + Whether to print progress messages or not. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + Examples + -------- + + >>> import numpy as np + >>> from metric_learn import NCA + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> nca = NCA(max_iter=1000) + >>> nca.fit(X, Y) + + Attributes + ---------- + n_iter_ : `int` + The number of iterations the solver has run. + + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + References + ---------- + .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov. `Neighbourhood + Components Analysis + `_. + NIPS 2005. + + .. 
[2] Wikipedia entry on `Neighborhood Components Analysis + `_ + """ + + def __init__(self, init='auto', n_components=None, + max_iter=100, tol=None, verbose=False, preprocessor=None, + random_state=None): + self.n_components = n_components + self.init = init + self.max_iter = max_iter + self.tol = tol + self.verbose = verbose + self.random_state = random_state + super(NCA, self).__init__(preprocessor) + + def fit(self, X, y): """ X: data matrix, (n x d) - labels: scalar labels, (n) + y: scalar labels, (n) """ + X, labels = self._prepare_inputs(X, y, ensure_min_samples=2) n, d = X.shape - # Initialize A to a scaling matrix - A = np.zeros((d, d)) - np.fill_diagonal(A, 1./(X.max(axis=0)-X.min(axis=0))) + n_components = _check_n_components(d, self.n_components) + + # Measure the total training time + train_time = time.time() + + # Initialize A + A = _initialize_components(n_components, X, labels, self.init, + self.verbose, self.random_state) # Run NCA - dX = X[:,None] - X[None] # shape (n, n, d) - tmp = np.einsum('...i,...j->...ij', dX, dX) # shape (n, n, d, d) - masks = labels[:,None] == labels[None] - learning_rate = self.params['learning_rate'] - for it in xrange(self.params['max_iter']): - for i, label in enumerate(labels): - mask = masks[i] - Ax = A.dot(X.T).T # shape (n, d) - - softmax = np.exp(-((Ax[i] - Ax)**2).sum(axis=1)) # shape (n) - softmax[i] = 0 - softmax /= softmax.sum() - - t = softmax[:, None, None] * tmp[i] # shape (n, d, d) - d = softmax[mask].sum() * t.sum(axis=0) - t[mask].sum(axis=0) - A += learning_rate * A.dot(d) - - self.X = X - self.A = A + mask = labels[:, np.newaxis] == labels[np.newaxis, :] + optimizer_params = {'method': 'L-BFGS-B', + 'fun': self._loss_grad_lbfgs, + 'args': (X, mask, -1.0), + 'jac': True, + 'x0': A.ravel(), + 'options': dict(maxiter=self.max_iter), + 'tol': self.tol + } + + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) + + self.components_ = opt_result.x.reshape(-1, X.shape[1]) + 
self.n_iter_ = opt_result.nit + + # Stop timer + train_time = time.time() - train_time + if self.verbose: + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warnings.warn('[{}] NCA did not converge: {}'.format( + cls_name, opt_result.message), ConvergenceWarning) + + print('[{}] Training took {:8.2f}s.'.format(cls_name, train_time)) + return self + + def _loss_grad_lbfgs(self, A, X, mask, sign=1.0): + + if self.n_iter_ == 0 and self.verbose: + header_fields = ['Iteration', 'Objective Value', 'Time(s)'] + header_fmt = '{:>10} {:>20} {:>10}' + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print('[{cls}]'.format(cls=cls_name)) + print('[{cls}] {header}\n[{cls}] {sep}'.format(cls=cls_name, + header=header, + sep='-' * len(header))) + + start_time = time.time() + + A = A.reshape(-1, X.shape[1]) + X_embedded = np.dot(X, A.T) # (n_samples, n_components) + # Compute softmax distances + p_ij = pairwise_distances(X_embedded, squared=True) + np.fill_diagonal(p_ij, np.inf) + p_ij = np.exp(-p_ij - logsumexp(-p_ij, axis=1)[:, np.newaxis]) + # (n_samples, n_samples) + + # Compute loss + masked_p_ij = p_ij * mask + p = masked_p_ij.sum(axis=1, keepdims=True) # (n_samples, 1) + loss = p.sum() + + # Compute gradient of loss w.r.t. 
`transform` + weighted_p_ij = masked_p_ij - p_ij * p + weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T + np.fill_diagonal(weighted_p_ij_sym, - weighted_p_ij.sum(axis=0)) + gradient = 2 * (X_embedded.T.dot(weighted_p_ij_sym)).dot(X) + + if self.verbose: + start_time = time.time() - start_time + values_fmt = '[{cls}] {n_iter:>10} {loss:>20.6e} {start_time:>10.2f}' + print(values_fmt.format(cls=self.__class__.__name__, + n_iter=self.n_iter_, loss=loss, + start_time=start_time)) + sys.stdout.flush() + + self.n_iter_ += 1 + return sign * loss, sign * gradient.ravel() diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 9e91167e..253b9c92 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -1,93 +1,125 @@ -"""Relative Components Analysis (RCA) - -RCA learns a full rank Mahalanobis distance metric based on a -weighted sum of in-class covariance matrices. -It applies a global linear transformation to assign large weights to -relevant dimensions and low weights to irrelevant dimensions. -Those relevant dimensions are estimated using "chunklets", -subsets of points that are known to belong to the same class. - -'Learning distance functions using equivalence relations', ICML 2003 +""" +Relative Components Analysis (RCA) """ -from __future__ import absolute_import import numpy as np -from six.moves import xrange +import warnings +from sklearn.base import TransformerMixin -from .base_metric import BaseMetricLearner +from ._util import _check_n_components +from .base_metric import MahalanobisMixin from .constraints import Constraints -class RCA(BaseMetricLearner): - """Relevant Components Analysis (RCA)""" - def __init__(self, dim=None): - """Initialize the learner. 
- - Parameters - ---------- - dim : int, optional - embedding dimension (default: original dimension of data) - """ - self.params = { - 'dim': dim, - } - - def transformer(self): - return self._transformer - - def _process_inputs(self, X, Y): - X = np.asanyarray(X) - self.X = X - n, d = X.shape - - if self.params['dim'] is None: - self.params['dim'] = d - elif not 0 < self.params['dim'] <= d: - raise ValueError('Invalid embedding dimension, must be in [1,%d]' % d) - - Y = np.asanyarray(Y) - num_chunks = Y.max() + 1 - - return X, Y, num_chunks, d - - def fit(self, data, chunks): +# mean center each chunklet separately +def _chunk_mean_centering(data, chunks): + n_chunks = chunks.max() + 1 + chunk_mask = chunks != -1 + # We need to ensure the data is float so that we can substract the + # mean on it + chunk_data = data[chunk_mask].astype(float, copy=False) + chunk_labels = chunks[chunk_mask] + for c in range(n_chunks): + mask = chunk_labels == c + chunk_data[mask] -= chunk_data[mask].mean(axis=0) + + return chunk_mask, chunk_data + + +class RCA(MahalanobisMixin, TransformerMixin): + """Relevant Components Analysis (RCA) + + RCA learns a full rank Mahalanobis distance metric based on a weighted sum of + in-chunklets covariance matrices. It applies a global linear transformation + to assign large weights to relevant dimensions and low weights to irrelevant + dimensions. Those relevant dimensions are estimated using "chunklets", + subsets of points that are known to belong to the same class. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
+ + Examples + -------- + >>> from metric_learn import RCA + >>> X = [[-0.05, 3.0],[0.05, -3.0], + >>> [0.1, -3.55],[-0.1, 3.55], + >>> [-0.95, -0.05],[0.95, 0.05], + >>> [0.4, 0.05],[-0.4, -0.05]] + >>> chunks = [0, 0, 1, 1, 2, 2, 3, 3] + >>> rca = RCA() + >>> rca.fit(X, chunks) + + References + ---------- + .. [1] Noam Shental, et al. `Adjustment learning and relevant component + analysis `_ . + ECCV 2002. + + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + """ + + def __init__(self, n_components=None, preprocessor=None): + self.n_components = n_components + super(RCA, self).__init__(preprocessor) + + def _check_dimension(self, rank, X): + d = X.shape[1] + + if rank < d: + warnings.warn('The inner covariance matrix is not invertible, ' + 'so the transformation matrix may contain Nan values. ' + 'You should remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + + dim = _check_n_components(d, self.n_components) + return dim + + def fit(self, X, chunks): """Learn the RCA model. Parameters ---------- - X : (n x d) data matrix - each row corresponds to a single instance + data : (n x d) data matrix + Each row corresponds to a single instance + chunks : (n,) array of ints - when ``chunks[i] == -1``, point i doesn't belong to any chunklet, - when ``chunks[i] == j``, point i belongs to chunklet j. + When ``chunks[i] == -1``, point i doesn't belong to any chunklet. + When ``chunks[i] == j``, point i belongs to chunklet j. 
""" - data, chunks, num_chunks, d = self._process_inputs(data, chunks) + X, chunks = self._prepare_inputs(X, chunks, ensure_min_samples=2) - # mean center - data -= data.mean(axis=0) + chunks = np.asanyarray(chunks, dtype=int) + chunk_mask, chunked_data = _chunk_mean_centering(X, chunks) - # mean center each chunklet separately - chunk_mask = chunks != -1 - chunk_data = data[chunk_mask] - chunk_labels = chunks[chunk_mask] - for c in xrange(num_chunks): - mask = chunk_labels == c - chunk_data[mask] -= chunk_data[mask].mean(axis=0) - - # "inner" covariance of chunk deviations - inner_cov = np.cov(chunk_data, rowvar=0, bias=1) + inner_cov = np.atleast_2d(np.cov(chunked_data, rowvar=0, bias=1)) + dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X) # Fisher Linear Discriminant projection - if self.params['dim'] < d: - total_cov = np.cov(data[chunk_mask], rowvar=0) - tmp = np.linalg.lstsq(total_cov, inner_cov)[0] + if dim < X.shape[1]: + total_cov = np.cov(X[chunk_mask], rowvar=0) + tmp = np.linalg.lstsq(total_cov, inner_cov, rcond=None)[0] vals, vecs = np.linalg.eig(tmp) - inds = np.argsort(vals)[:self.params['dim']] - A = vecs[:,inds] - inner_cov = A.T.dot(inner_cov).dot(A) - self._transformer = _inv_sqrtm(inner_cov).dot(A.T) + inds = np.argsort(vals)[:dim] + A = vecs[:, inds] + inner_cov = np.atleast_2d(A.T.dot(inner_cov).dot(A)) + self.components_ = _inv_sqrtm(inner_cov).dot(A.T) else: - self._transformer = _inv_sqrtm(inner_cov).T + self.components_ = _inv_sqrtm(inner_cov).T return self @@ -99,29 +131,87 @@ def _inv_sqrtm(x): class RCA_Supervised(RCA): - def __init__(self, dim=None, num_chunks=100, chunk_size=2): - """Initialize the learner. 
- - Parameters - ---------- - dim : int, optional - embedding dimension (default: original dimension of data) - num_chunks: int, optional - chunk_size: int, optional - """ - RCA.__init__(self, dim=dim) - self.params.update(num_chunks=num_chunks, chunk_size=chunk_size) - - def fit(self, X, labels): - """Create constraints from labels and learn the LSML model. - Needs num_constraints specified in constructor. + """Supervised version of Relevant Components Analysis (RCA) + + `RCA_Supervised` creates chunks of similar points by first sampling a + class, taking `chunk_size` elements in it, and repeating the process + `n_chunks` times. + + Parameters + ---------- + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + + n_chunks: int, optional (default=100) + Number of chunks to generate. + + chunk_size: int, optional (default=2) + Number of points per chunk. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + It is used to randomly sample constraints from labels. + + num_chunks : Renamed to n_chunks. Will be deprecated in 0.7.0 + + Examples + -------- + >>> from metric_learn import RCA_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> rca = RCA_Supervised(n_chunks=30, chunk_size=2) + >>> rca.fit(X, Y) + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. 
+ """ + + def __init__(self, n_components=None, n_chunks=100, chunk_size=2, + preprocessor=None, random_state=None, + num_chunks='deprecated'): + """Initialize the supervised version of `RCA`.""" + RCA.__init__(self, n_components=n_components, preprocessor=preprocessor) + if num_chunks != 'deprecated': + warnings.warn('"num_chunks" parameter has been renamed to' + ' "n_chunks". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + n_chunks = num_chunks + self.num_chunks = 'deprecated' # To avoid no_attribute error + self.n_chunks = n_chunks + self.chunk_size = chunk_size + self.random_state = random_state + + def fit(self, X, y): + """Create constraints from labels and learn the RCA model. + Needs n_constraints specified in constructor. (Not true?) Parameters ---------- X : (n x d) data matrix - each row corresponds to a single instance - labels : (n) data labels + each row corresponds to a single instance + + y : (n) data labels """ - chunks = Constraints(labels).chunks(num_chunks=self.params['num_chunks'], - chunk_size=self.params['chunk_size']) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + chunks = Constraints(y).chunks(n_chunks=self.n_chunks, + chunk_size=self.chunk_size, + random_state=self.random_state) + + if self.n_chunks * (self.chunk_size - 1) < X.shape[1]: + warnings.warn('Due to the parameters of RCA_Supervised, ' + 'the inner covariance matrix is not invertible, ' + 'so the transformation matrix will contain Nan values. ' + 'Increase the number or size of the chunks to correct ' + 'this problem.' 
+ ) + return RCA.fit(self, X, chunks) diff --git a/metric_learn/scml.py b/metric_learn/scml.py new file mode 100644 index 00000000..fedf393d --- /dev/null +++ b/metric_learn/scml.py @@ -0,0 +1,663 @@ +""" +Sparse Compositional Metric Learning (SCML) +""" + +from __future__ import print_function, absolute_import, division +import numpy as np +from .base_metric import _TripletsClassifierMixin, MahalanobisMixin +from ._util import components_from_metric +from sklearn.base import TransformerMixin +from .constraints import Constraints +from sklearn.preprocessing import normalize +from sklearn.neighbors import NearestNeighbors +from sklearn.cluster import KMeans +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.utils import check_array, check_random_state +import warnings + + +class _BaseSCML(MahalanobisMixin): + + _tuple_size = 3 # constraints are triplets + _authorized_basis = ['triplet_diffs'] + + def __init__(self, beta=1e-5, basis='triplet_diffs', n_basis=None, + gamma=5e-3, max_iter=10000, output_iter=500, batch_size=10, + verbose=False, preprocessor=None, random_state=None): + self.beta = beta + self.basis = basis + self.n_basis = n_basis + self.gamma = gamma + self.max_iter = max_iter + self.output_iter = output_iter + self.batch_size = batch_size + self.verbose = verbose + self.preprocessor = preprocessor + self.random_state = random_state + super(_BaseSCML, self).__init__(preprocessor) + + def _fit(self, triplets, basis=None, n_basis=None): + """ + Optimization procedure to find a sparse vector of weights to + construct the metric from the basis set. This is based on the + dual averaging method. 
+ """ + + if not isinstance(self.max_iter, int): + raise ValueError("max_iter should be an integer, instead it is of type" + " %s" % type(self.max_iter)) + if not isinstance(self.output_iter, int): + raise ValueError("output_iter should be an integer, instead it is of " + "type %s" % type(self.output_iter)) + if not isinstance(self.batch_size, int): + raise ValueError("batch_size should be an integer, instead it is of type" + " %s" % type(self.batch_size)) + + if self.output_iter > self.max_iter: + raise ValueError("The value of output_iter must be equal or smaller than" + " max_iter.") + + # Currently prepare_inputs makes triplets contain points and not indices + triplets = self._prepare_inputs(triplets, type_of_inputs='tuples') + + # TODO: + # This algorithm is built to work with indices, but in order to be + # compliant with the current handling of inputs it is converted + # back to indices by the following function. This should be improved + # in the future. + triplets, X = self._to_index_points(triplets) + + if basis is None: + basis, n_basis = self._initialize_basis(triplets, X) + + dist_diff = self._compute_dist_diff(triplets, X, basis) + + n_triplets = triplets.shape[0] + + # weight vector + w = np.zeros((1, n_basis)) + # avarage obj gradient wrt weights + avg_grad_w = np.zeros((1, n_basis)) + + # l2 norm in time of all obj gradients wrt weights + ada_grad_w = np.zeros((1, n_basis)) + # slack for not dividing by zero + delta = 0.001 + + best_obj = np.inf + + rng = check_random_state(self.random_state) + rand_int = rng.randint(low=0, high=n_triplets, + size=(self.max_iter, self.batch_size)) + for iter in range(self.max_iter): + + idx = rand_int[iter] + + slack_val = 1 + np.matmul(dist_diff[idx, :], w.T) + slack_mask = np.squeeze(slack_val > 0, axis=1) + + grad_w = np.sum(dist_diff[idx[slack_mask], :], + axis=0, keepdims=True)/self.batch_size + avg_grad_w = (iter * avg_grad_w + grad_w) / (iter+1) + + ada_grad_w = np.sqrt(np.square(ada_grad_w) + 
np.square(grad_w)) + + scale_f = -(iter+1) / (self.gamma * (delta + ada_grad_w)) + + # proximal operator with negative trimming equivalent + w = scale_f * np.minimum(avg_grad_w + self.beta, 0) + + if (iter + 1) % self.output_iter == 0: + # regularization part of obj function + obj1 = np.sum(w)*self.beta + + # Every triplet distance difference in the space given by L + # plus a slack of one + slack_val = 1 + np.matmul(dist_diff, w.T) + # Mask of places with positive slack + slack_mask = slack_val > 0 + + # loss function of learning task part of obj function + obj2 = np.sum(slack_val[slack_mask])/n_triplets + + obj = obj1 + obj2 + if self.verbose: + count = np.sum(slack_mask) + print("[%s] iter %d\t obj %.6f\t num_imp %d" % + (self.__class__.__name__, (iter+1), obj, count)) + + # update the best + if obj < best_obj: + best_obj = obj + best_w = w + + if self.verbose: + print("max iteration reached.") + + # return L matrix yielded from best weights + self.n_iter_ = iter + self.components_ = self._components_from_basis_weights(basis, best_w) + + return self + + def _compute_dist_diff(self, triplets, X, basis): + """ + Helper function to compute the distance difference of every triplet in the + space yielded by the basis set. 
+ """ + # Transformation of data by the basis set + XB = np.matmul(X, basis.T) + + n_triplets = triplets.shape[0] + # get all positive and negative pairs with lowest index first + # np.array (2*n_triplets,2) + triplets_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]], + triplets[:, [0, 2]])), + kind='stable') + # calculate all unique pairs and their indices + uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True, + axis=0) + # calculate L2 distance acording to bases only for unique pairs + dist = np.square(XB[uniqPairs[:, 0], :] - XB[uniqPairs[:, 1], :]) + + # return the diference of distances between all positive and negative + # pairs + return dist[indices[:n_triplets]] - dist[indices[n_triplets:]] + + def _components_from_basis_weights(self, basis, w): + """ + Get components matrix (L) from computed mahalanobis matrix. + """ + + # get rid of inactive bases + # TODO: Maybe have a tolerance over zero? + active_idx, = w > 0 + w = w[..., active_idx] + basis = basis[active_idx, :] + + n_basis, n_features = basis.shape + + if n_basis < n_features: # if metric is low-rank + warnings.warn("The number of bases with nonzero weight is less than the " + "number of features of the input, in consequence the " + "learned transformation reduces the dimension to %d." + % n_basis) + return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(basis) + + else: # if metric is full rank + return components_from_metric(np.matmul(basis.T, w.T*basis)) + + def _to_index_points(self, triplets): + shape = triplets.shape + X, triplets = np.unique(np.vstack(triplets), return_inverse=True, axis=0) + triplets = triplets.reshape(shape[:2]) + return triplets, X + + def _initialize_basis(self, triplets, X): + """ Checks if the basis array is well constructed or constructs it based + on one of the available options. + """ + n_features = X.shape[1] + + if isinstance(self.basis, np.ndarray): + # TODO: should copy? 
+ basis = check_array(self.basis, copy=True) + if basis.shape[1] != n_features: + raise ValueError('The dimensionality ({}) of the provided bases must' + ' match the dimensionality of the data ' + '({}).'.format(basis.shape[1], n_features)) + elif self.basis not in self._authorized_basis: + raise ValueError( + "`basis` must be one of the options '{}' " + "or an array of shape (n_basis, n_features)." + .format("', '".join(self._authorized_basis))) + if self.basis == 'triplet_diffs': + basis, n_basis = self._generate_bases_dist_diff(triplets, X) + + return basis, n_basis + + def _generate_bases_dist_diff(self, triplets, X): + """ Constructs the basis set from the differences of positive and negative + pairs from the triplets constraints. + + The basis set is constructed iteratively by taking n_features triplets, + then adding and substracting respectively all the outerproducts of the + positive and negative pairs, and finally selecting the eigenvectors + of this matrix with positive eigenvalue. This is done until n_basis are + selected. + """ + n_features = X.shape[1] + n_triplets = triplets.shape[0] + + if self.n_basis is None: + # TODO: Get a good default n_basis directive + n_basis = n_features*80 + warnings.warn('As no value for `n_basis` was selected, the number of ' + 'basis will be set to n_basis= %d' % n_basis) + elif isinstance(self.n_basis, int): + n_basis = self.n_basis + else: + raise ValueError("n_basis should be an integer, instead it is of type %s" + % type(self.n_basis)) + + if n_features > n_triplets: + raise ValueError( + "Number of features (%s) is greater than the number of triplets(%s).\n" + "Consider using dimensionality reduction or using another basis " + "generation scheme." 
% (n_features, n_triplets)) + + basis = np.zeros((n_basis, n_features)) + + # get all positive and negative pairs with lowest index first + # np.array (2*n_triplets,2) + triplets_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]], + triplets[:, [0, 2]])), + kind='stable') + # calculate all unique pairs and their indices + uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True, + axis=0) + # calculate differences only for unique pairs + diff = X[uniqPairs[:, 0], :] - X[uniqPairs[:, 1], :] + + diff_pos = diff[indices[:n_triplets], :] + diff_neg = diff[indices[n_triplets:], :] + + rng = check_random_state(self.random_state) + + start = 0 + finish = 0 + while finish != n_basis: + # Select triplets to yield diff + select_triplet = rng.choice(n_triplets, size=n_features, replace=False) + + # select n_features positive differences + d_pos = diff_pos[select_triplet, :] + + # select n_features negative differences + d_neg = diff_neg[select_triplet, :] + + # Yield matrix + diff_sum = d_pos.T.dot(d_pos) - d_neg.T.dot(d_neg) + + # Calculate eigenvalue and eigenvectors + w, v = np.linalg.eigh(diff_sum.T.dot(diff_sum)) + + # Add eigenvectors with positive eigenvalue to basis set + pos_eig_mask = w > 0 + start = finish + finish += pos_eig_mask.sum() + + try: + basis[start:finish, :] = v[pos_eig_mask] + except ValueError: + # if finish is greater than n_basis + basis[start:, :] = v[pos_eig_mask][:n_basis-start] + break + + # TODO: maybe add a warning in case there are no added bases, this could + # be caused by a bad triplet set. This would cause an infinite loop + + return basis, n_basis + + +class SCML(_BaseSCML, _TripletsClassifierMixin): + """Sparse Compositional Metric Learning (SCML) + + `SCML` learns an squared Mahalanobis distance from triplet constraints by + optimizing sparse positive weights assigned to a set of :math:`K` rank-one + PSD bases. 
This can be formulated as an optimization problem with only + :math:`K` parameters, that can be solved with an efficient stochastic + composite scheme. + + Read more in the :ref:`User Guide `. + + .. warning:: + SCML is still a bit experimental, don't hesitate to report if + something fails/doesn't work as expected. + + Parameters + ---------- + beta: float (default=1e-5) + L1 regularization parameter. + + basis : string or array-like, optional (default='triplet_diffs') + Set of bases to construct the metric. Possible options are + 'triplet_diffs', and an array-like of shape (n_basis, n_features). + + 'triplet_diffs' + The basis set is constructed iteratively from differences between points + of `n_features` positive or negative pairs randomly sampled from the + triplets constraints. Requires the number of training triplets to be + great or equal to `n_features`. + + array-like + A matrix of shape (n_basis, n_features), that will be used as + the basis set for the metric construction. + + n_basis : int, optional + Number of basis to be yielded. In case it is not set it will be set based + on `basis`. If no value is selected a default will be computed based on + the input. + + gamma: float (default = 5e-3) + Learning rate for the optimization algorithm. + + max_iter : int (default = 10000) + Number of iterations for the algorithm. + + output_iter : int (default = 5000) + Number of iterations to check current weights performance and output this + information in case verbose is True. + + verbose : bool, optional + If True, prints information while learning. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get triplets from indices. If array-like, + triplets will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. 
+ + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `_components_from_basis_weights`.) + + Examples + -------- + >>> from metric_learn import SCML + >>> triplets = [[[1.2, 7.5], [1.3, 1.5], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6], [5.4, 5.4]], + >>> [[3.2, 7.5], [3.3, 1.5], [8.2, 9.7]], + >>> [[3.3, 4.5], [5.2, 4.6], [7.4, 5.4]]] + >>> scml = SCML() + >>> scml.fit(triplets) + + References + ---------- + .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. + `_. \ + (AAAI), 2014. + + .. [2] Adapted from original `Matlab implementation. \ + `_. + + See Also + -------- + metric_learn.SCML_Supervised : The supervised version of the algorithm. + + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def fit(self, triplets): + """Learn the SCML model. + + Parameters + ---------- + triplets : array-like, shape=(n_constraints, 3, n_features) or \ + (n_constraints, 3) + 3D array-like of triplets of points or 2D array of triplets of + indicators. Triplets are assumed to be ordered such that: + d(triplets[i, 0],triplets[i, 1]) < d(triplets[i, 0], triplets[i, 2]). + + Returns + ------- + self : object + Returns the instance. + """ + + return self._fit(triplets) + + +class SCML_Supervised(_BaseSCML, TransformerMixin): + """Supervised version of Sparse Compositional Metric Learning (SCML) + + `SCML_Supervised` creates triplets by taking `k_genuine` neighbours + of the same class and `k_impostor` neighbours from different classes for each + point and then runs the SCML algorithm on these triplets. + + Read more in the :ref:`User Guide `. + + .. warning:: + SCML is still a bit experimental, don't hesitate to report if + something fails/doesn't work as expected. 
+ + Parameters + ---------- + beta: float (default=1e-5) + L1 regularization parameter. + + basis : string or an array-like, optional (default='lda') + Set of bases to construct the metric. Possible options are + 'lda', and an array-like of shape (n_basis, n_features). + + 'lda' + The `n_basis` basis set is constructed from the LDA of significant + local regions in the feature space via clustering, for each region + center k-nearest neighbors are used to obtain the LDA scalings, + which correspond to the locally discriminative basis. + + array-like + A matrix of shape (n_basis, n_features), that will be used as + the basis set for the metric construction. + + n_basis : int, optional + Number of basis to be yielded. In case it is not set it will be set based + on `basis`. If no value is selected a default will be computed based on + the input. + + gamma: float (default = 5e-3) + Learning rate for the optimization algorithm. + + max_iter : int (default = 100000) + Number of iterations for the algorithm. + + output_iter : int (default = 5000) + Number of iterations to check current weights performance and output this + information in case verbose is True. + + verbose : bool, optional + If True, prints information while learning. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get triplets from indices. If array-like, + triplets will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `_components_from_basis_weights`.) 
+ + Examples + -------- + >>> from metric_learn import SCML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> scml = SCML_Supervised(random_state=33) + >>> scml.fit(X, Y) + SCML_Supervised(random_state=33) + >>> scml.score_pairs([[X[0], X[1]], [X[0], X[2]]]) + array([1.84640733, 1.55984363]) + >>> scml.get_metric()(X[0], X[1]) + 1.8464073327922157 + + References + ---------- + .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. + `_. \ + (AAAI), 2014. + + .. [2] Adapted from original `Matlab implementation. \ + `_. + + See Also + -------- + metric_learn.SCML : The weakly supervised version of this + algorithm. + """ + # Add supervised authorized basis construction options + _authorized_basis = _BaseSCML._authorized_basis + ['lda'] + + def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='lda', + n_basis=None, gamma=5e-3, max_iter=10000, output_iter=500, + batch_size=10, verbose=False, preprocessor=None, + random_state=None): + self.k_genuine = k_genuine + self.k_impostor = k_impostor + _BaseSCML.__init__(self, beta=beta, basis=basis, n_basis=n_basis, + max_iter=max_iter, output_iter=output_iter, + batch_size=batch_size, verbose=verbose, + preprocessor=preprocessor, random_state=random_state) + + def fit(self, X, y): + """Create constraints from labels and learn the SCML model. + + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. + + Returns + ------- + self : object + Returns the instance. 
+ """ + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + + basis, n_basis = self._initialize_basis_supervised(X, y) + + if not isinstance(self.k_genuine, int): + raise ValueError("k_genuine should be an integer, instead it is of type" + " %s" % type(self.k_genuine)) + if not isinstance(self.k_impostor, int): + raise ValueError("k_impostor should be an integer, instead it is of " + "type %s" % type(self.k_impostor)) + + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, self.k_genuine, + self.k_impostor) + + triplets = X[triplets] + + return self._fit(triplets, basis, n_basis) + + def _initialize_basis_supervised(self, X, y): + """ Constructs the basis set following one of the supervised options in + case one is selected. + """ + + if isinstance(self.basis, str) and self.basis == 'lda': + basis, n_basis = self._generate_bases_LDA(X, y) + else: + basis, n_basis = None, None + + return basis, n_basis + + def _generate_bases_LDA(self, X, y): + """ Generates bases for the 'lda' option. + + The basis set is constructed using Linear Discriminant Analysis of + significant local regions in the feature space via clustering, for + each region center k-nearest neighbors are used to obtain the LDA scalings, + which correspond to the locally discriminative basis. Currently this is + done at two scales `k={10,20}` if `n_feature < 50` or else `k={20,50}`. 
+ """ + + labels, class_count = np.unique(y, return_counts=True) + n_class = len(labels) + + n_features = X.shape[1] + # Number of basis yielded from each LDA + num_eig = min(n_class-1, n_features) + + if self.n_basis is None: + # TODO: Get a good default n_basis directive + n_basis = min(20*n_features, X.shape[0]*2*num_eig - 1) + warnings.warn('As no value for `n_basis` was selected, the number of ' + 'basis will be set to n_basis= %d' % n_basis) + + elif isinstance(self.n_basis, int): + n_basis = self.n_basis + else: + raise ValueError("n_basis should be an integer, instead it is of type %s" + % type(self.n_basis)) + + # Number of clusters needed for 2 scales given the number of basis + # yielded by every LDA + n_clusters = int(np.ceil(n_basis/(2 * num_eig))) + + if n_basis < n_class: + warnings.warn("The number of basis is less than the number of classes, " + "which may lead to poor discriminative performance.") + elif n_basis >= X.shape[0]*2*num_eig: + raise ValueError("Not enough samples to generate %d LDA bases, n_basis" + "should be smaller than %d" % + (n_basis, X.shape[0]*2*num_eig)) + + kmeans = KMeans(n_clusters=n_clusters, n_init=10, + random_state=self.random_state, algorithm='elkan').fit(X) + cX = kmeans.cluster_centers_ + + n_scales = 2 + if n_features > 50: + scales = [20, 50] + else: + scales = [10, 20] + + k_class = np.vstack((np.minimum(class_count, scales[0]), + np.minimum(class_count, scales[1]))) + + idx_set = [np.zeros((n_clusters, sum(k_class[0, :])), dtype=np.int64), + np.zeros((n_clusters, sum(k_class[1, :])), dtype=np.int64)] + + start_finish_indices = np.hstack((np.zeros((2, 1), np.int64), + k_class)).cumsum(axis=1) + + neigh = NearestNeighbors() + + for c in range(n_class): + sel_c = np.where(y == labels[c]) + + # get k_class same class neighbors + neigh.fit(X=X[sel_c]) + # Only take the neighbors once for the biggest scale + neighbors = neigh.kneighbors(X=cX, n_neighbors=k_class[-1, c], + return_distance=False) + + # add index set of 
neighbors for every cluster center for both scales + for s, k in enumerate(k_class[:, c]): + start, finish = start_finish_indices[s, c:c+2] + idx_set[s][:, start:finish] = np.take(sel_c, neighbors[:, :k]) + + # Compute basis for every cluster in both scales + basis = np.zeros((n_basis, n_features)) + lda = LinearDiscriminantAnalysis() + start_finish_indices = np.hstack((np.vstack((0, n_clusters * num_eig)), + np.full((2, n_clusters), + num_eig))).cumsum(axis=1) + + for s in range(n_scales): + for c in range(n_clusters): + lda.fit(X[idx_set[s][c, :]], y[idx_set[s][c, :]]) + start, finish = start_finish_indices[s, c:c+2] + normalized_scalings = normalize(lda.scalings_.T) + try: + basis[start: finish, :] = normalized_scalings + except ValueError: + # handle tail + basis[start:, :] = normalized_scalings[:n_basis-start] + break + + return basis, n_basis diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index aba1b9be..c4c427b9 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -1,108 +1,351 @@ """ -Qi et al. -An efficient sparse metric learning in high-dimensional space via -L1-penalized log-determinant regularization. 
-ICML 2009 - -Adapted from https://gist.github.com/kcarnold/5439945 -Paper: http://lms.comp.nus.edu.sg/sites/default/files/publication-attachments/icml09-guojun.pdf +Sparse High-Dimensional Metric Learning (SDML) """ -from __future__ import absolute_import +import warnings import numpy as np -from scipy.sparse.csgraph import laplacian -from sklearn.covariance import graph_lasso -from sklearn.utils.extmath import pinvh - -from .base_metric import BaseMetricLearner -from .constraints import Constraints - - -class SDML(BaseMetricLearner): - def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, - verbose=False): - ''' - balance_param: float, optional - trade off between sparsity and M0 prior - sparsity_param: float, optional - trade off between optimizer and sparseness (see graph_lasso) - use_cov: bool, optional - controls prior matrix, will use the identity if use_cov=False - verbose : bool, optional - if True, prints information while learning - ''' - self.params = { - 'balance_param': balance_param, - 'sparsity_param': sparsity_param, - 'use_cov': use_cov, - 'verbose': verbose, - } - - def _prepare_inputs(self, X, W): - self.X = X - # set up prior M - if self.params['use_cov']: - self.M = np.cov(X.T) +from sklearn.base import TransformerMixin +from scipy.linalg import pinvh +try: + from sklearn.covariance._graph_lasso import ( + _graphical_lasso as graphical_lasso + ) +except ImportError: + from sklearn.covariance import graphical_lasso + +from sklearn.exceptions import ConvergenceWarning + +from .base_metric import MahalanobisMixin, _PairsClassifierMixin +from .constraints import Constraints, wrap_pairs +from ._util import components_from_metric, _initialize_metric_mahalanobis +try: + from inverse_covariance import quic +except ImportError: + HAS_SKGGM = False +else: + HAS_SKGGM = True + + +class _BaseSDML(MahalanobisMixin): + + _tuple_size = 2 # constraints are pairs + + def __init__(self, balance_param=0.5, sparsity_param=0.01, 
prior='identity', + verbose=False, preprocessor=None, + random_state=None): + self.balance_param = balance_param + self.sparsity_param = sparsity_param + self.prior = prior + self.verbose = verbose + self.random_state = random_state + super(_BaseSDML, self).__init__(preprocessor) + + def _fit(self, pairs, y): + if not HAS_SKGGM: + if self.verbose: + print("SDML will use scikit-learn's graphical lasso solver.") else: - self.M = np.identity(X.shape[1]) - L = laplacian(W, normed=False) - self.loss_matrix = self.X.T.dot(L.dot(self.X)) + if self.verbose: + print("SDML will use skggm's graphical lasso solver.") + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') + n_features = pairs.shape[2] + if n_features < 2: + raise ValueError(f"Cannot fit SDML with {n_features} feature(s)") - def metric(self): - return self.M + # set up (the inverse of) the prior M + # if the prior is the default (None), we raise a warning + _, prior_inv = _initialize_metric_mahalanobis( + pairs, self.prior, + return_inverse=True, strict_pd=True, matrix_name='prior', + random_state=self.random_state) + diff = pairs[:, 0] - pairs[:, 1] + loss_matrix = (diff.T * y).dot(diff) + emp_cov = prior_inv + self.balance_param * loss_matrix - def fit(self, X, W): - """ - X: data matrix, (n x d) - each row corresponds to a single instance - W: connectivity graph, (n x n) - +1 for positive pairs, -1 for negative. + # our initialization will be the matrix with emp_cov's eigenvalues, + # with a constant added so that they are all positive (plus an epsilon + # to ensure definiteness). This is empirical. + w, V = np.linalg.eigh(emp_cov) + min_eigval = np.min(w) + if min_eigval < 0.: + warnings.warn("Warning, the input matrix of graphical lasso is not " + "positive semi-definite (PSD). The algorithm may diverge, " + "and lead to degenerate solutions. 
" + "To prevent that, try to decrease the balance parameter " + "`balance_param` and/or to set prior='identity'.", + ConvergenceWarning) + w -= min_eigval # we translate the eigenvalues to make them all positive + w += 1e-10 # we add a small offset to avoid definiteness problems + sigma0 = (V * w).dot(V.T) + try: + if HAS_SKGGM: + theta0 = pinvh(sigma0) + M, _, _, _, _, _ = quic(emp_cov, lam=self.sparsity_param, + msg=self.verbose, + Theta0=theta0, Sigma0=sigma0) + else: + _, M, *_ = graphical_lasso(emp_cov, alpha=self.sparsity_param, + verbose=self.verbose, + cov_init=sigma0) + raised_error = None + w_mahalanobis, _ = np.linalg.eigh(M) + not_spd = any(w_mahalanobis < 0.) + not_finite = not np.isfinite(M).all() + # TODO: Narrow this to the specific exceptions we expect. + except Exception as e: + raised_error = e + not_spd = False # not_spd not applicable here so we set to False + not_finite = False # not_finite not applicable here so we set to False + if raised_error is not None or not_spd or not_finite: + msg = ("There was a problem in SDML when using {}'s graphical " + "lasso solver.").format("skggm" if HAS_SKGGM else "scikit-learn") + if not HAS_SKGGM: + skggm_advice = (" skggm's graphical lasso can sometimes converge " + "on non SPD cases where scikit-learn's graphical " + "lasso fails to converge. 
Try to install skggm and " + "rerun the algorithm (see the README.md for the " + "right version of skggm).") + msg += skggm_advice + if raised_error is not None: + msg += " The following error message was thrown: {}.".format( + raised_error) + raise RuntimeError(msg) + + self.components_ = components_from_metric(np.atleast_2d(M)) + return self + + +class SDML(_BaseSDML, _PairsClassifierMixin): + r"""Sparse Distance Metric Learning (SDML) + + SDML is an efficient sparse metric learning in high-dimensional space via + double regularization: an L1-penalization on the off-diagonal elements of the + Mahalanobis matrix :math:`\mathbf{M}`, and a log-determinant divergence + between :math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either + :math:`\mathbf{I}` or :math:`\mathbf{\Omega}^{-1}`, where + :math:`\mathbf{\Omega}` is the covariance matrix). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + balance_param : float, optional (default=0.5) + Trade off between sparsity and M0 prior. + + sparsity_param : float, optional (default=0.01) + Trade off between optimizer and sparseness (see graph_lasso). + + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For SDML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The prior will be a random positive definite (PD) matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + verbose : bool, optional (default=False) + If True, prints information while learning. 
+ + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) + + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. + + Examples + -------- + >>> from metric_learn import SDML_Supervised + >>> from sklearn.datasets import load_iris + >>> iris_data = load_iris() + >>> X = iris_data['data'] + >>> Y = iris_data['target'] + >>> sdml = SDML_Supervised(n_constraints=200) + >>> sdml.fit(X, Y) + + References + ---------- + .. [1] Qi et al. `An efficient sparse metric learning in high-dimensional + space via L1-penalized log-determinant regularization + `_. + ICML 2009. + + .. [2] Code adapted from https://gist.github.com/kcarnold/5439945 + """ + + def fit(self, pairs, y, calibration_params=None): + """Learn the SDML model. + + The threshold will be calibrated on the trainset using the parameters + `calibration_params`. + + Parameters + ---------- + pairs : array-like, shape=(n_constraints, 2, n_features) or \ + (n_constraints, 2) + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. + + y : array-like, of shape (n_constraints,) + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. 
+ + calibration_params : `dict` or `None` + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. + + Returns + ------- + self : object + Returns the instance. """ - self._prepare_inputs(X, W) - P = pinvh(self.M) + self.params['balance_param'] * self.loss_matrix - emp_cov = pinvh(P) - # hack: ensure positive semidefinite - emp_cov = emp_cov.T.dot(emp_cov) - self.M, _ = graph_lasso(emp_cov, self.params['sparsity_param'], - verbose=self.params['verbose']) + calibration_params = (calibration_params if calibration_params is not + None else dict()) + self._validate_calibration_params(**calibration_params) + self._fit(pairs, y) + self.calibrate_threshold(pairs, y, **calibration_params) return self -class SDML_Supervised(SDML): - def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, - num_labeled=np.inf, num_constraints=None, verbose=False): - SDML.__init__(self, balance_param=balance_param, - sparsity_param=sparsity_param, use_cov=use_cov, - verbose=verbose) - ''' - balance_param: float, optional - trade off between sparsity and M0 prior - sparsity_param: float, optional - trade off between optimizer and sparseness (see graph_lasso) - use_cov: bool, optional - controls prior matrix, will use the identity if use_cov=False - num_labeled : int, optional - number of labels to preserve for training - num_constraints: int, optional - number of constraints to generate - verbose : bool, optional - if True, prints information while learning - ''' - self.params.update(num_labeled=num_labeled, num_constraints=num_constraints) - - def fit(self, X, labels): +class SDML_Supervised(_BaseSDML, TransformerMixin): + """Supervised version of Sparse Distance Metric Learning (SDML) + + `SDML_Supervised` creates pairs of similar sample by taking same class + samples, and pairs of dissimilar samples by taking different class + 
samples. It then passes these pairs to `SDML` for training. + + Parameters + ---------- + balance_param : float, optional (default=0.5) + Trade off between sparsity and M0 prior. + + sparsity_param : float, optional (default=0.01) + Trade off between optimizer and sparseness (see graph_lasso). + + prior : string or numpy array, optional (default='identity') + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For SDML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + n_constraints : int, optional (default=None) + Number of constraints to generate. If None, defaults to `20 * + num_classes**2`. + + verbose : bool, optional (default=False) + If True, prints information while learning. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. In any case, `random_state` is also used to randomly sample + constraints from labels. + + num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0 + + Attributes + ---------- + components_ : `numpy.ndarray`, shape=(n_features, n_features) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) 
+ + See Also + -------- + metric_learn.SDML : The original weakly-supervised algorithm + :ref:`supervised_version` : The section of the project documentation + that describes the supervised version of weakly supervised estimators. + """ + + def __init__(self, balance_param=0.5, sparsity_param=0.01, prior='identity', + n_constraints=None, verbose=False, preprocessor=None, + random_state=None, num_constraints='deprecated'): + _BaseSDML.__init__(self, balance_param=balance_param, + sparsity_param=sparsity_param, prior=prior, + verbose=verbose, + preprocessor=preprocessor, random_state=random_state) + if num_constraints != 'deprecated': + warnings.warn('"num_constraints" parameter has been renamed to' + ' "n_constraints". It has been deprecated in' + ' version 0.6.3 and will be removed in 0.7.0' + '', FutureWarning) + self.n_constraints = num_constraints + else: + self.n_constraints = n_constraints + # Avoid test get_params from failing (all params passed sholud be set) + self.num_constraints = 'deprecated' + + def fit(self, X, y): """Create constraints from labels and learn the SDML model. Parameters ---------- - X: data matrix, (n x d) - each row corresponds to a single instance - labels: data labels, (n,) array-like + X : array-like, shape (n, d) + data matrix, where each row corresponds to a single instance + + y : array-like, shape (n,) + data labels, one for each instance + + Returns + ------- + self : object + Returns the instance. 
""" - num_constraints = self.params['num_constraints'] - if num_constraints is None: - num_classes = np.unique(labels) - num_constraints = 20*(len(num_classes))**2 + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + n_constraints = self.n_constraints + if n_constraints is None: + num_classes = len(np.unique(y)) + n_constraints = 20 * num_classes**2 - c = Constraints.random_subset(labels, self.params['num_labeled']) - return SDML.fit(self, X, c.adjacency_matrix(num_constraints)) + c = Constraints(y) + pos_neg = c.positive_negative_pairs(n_constraints, + random_state=self.random_state) + pairs, y = wrap_pairs(X, pos_neg) + return _BaseSDML._fit(self, pairs, y) diff --git a/metric_learn/sklearn_shims.py b/metric_learn/sklearn_shims.py new file mode 100644 index 00000000..8d746890 --- /dev/null +++ b/metric_learn/sklearn_shims.py @@ -0,0 +1,25 @@ +"""This file is for fixing imports due to different APIs +depending on the scikit-learn version""" +import sklearn +from packaging import version +SKLEARN_AT_LEAST_0_22 = (version.parse(sklearn.__version__) + >= version.parse('0.22.0')) +if SKLEARN_AT_LEAST_0_22: + from sklearn.utils._testing import (set_random_state, + ignore_warnings, + assert_allclose_dense_sparse, + _get_args) + from sklearn.utils.estimator_checks import (_is_public_parameter + as is_public_parameter) + from sklearn.metrics._scorer import get_scorer +else: + from sklearn.utils.testing import (set_random_state, + ignore_warnings, + assert_allclose_dense_sparse, + _get_args) + from sklearn.utils.estimator_checks import is_public_parameter + from sklearn.metrics.scorer import get_scorer + +__all__ = ['set_random_state', 'set_random_state', + 'ignore_warnings', 'assert_allclose_dense_sparse', '_get_args', + 'is_public_parameter', 'get_scorer'] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..ef3c8acb --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + integration: mark a test as integration + unit: mark a 
test as unit \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 8d95aa1e..bc7695e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,4 +2,6 @@ universal = 1 [metadata] -description-file = README.rst \ No newline at end of file +description-file = README.rst +license_files = + LICENSE.txt diff --git a/setup.py b/setup.py index 2031754a..23392077 100755 --- a/setup.py +++ b/setup.py @@ -1,33 +1,77 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- from setuptools import setup +import os +import io +import sys + + +CURRENT_PYTHON = sys.version_info[:2] +REQUIRED_PYTHON = (3, 6) + +# This check and everything above must remain compatible with Python 2.7. +if CURRENT_PYTHON < REQUIRED_PYTHON: + sys.stderr.write(""" +========================== +Unsupported Python version +========================== +This version of metric-learn requires Python {}.{}, but you're trying to +install it on Python {}.{}. +This may be because you are using a version of pip that doesn't +understand the python_requires classifier. Make sure you +have pip >= 9.0 and setuptools >= 24.2, then try again: + $ python -m pip install --upgrade pip setuptools + $ python -m pip install django +This will install the latest version of metric-learn which works on your +version of Python. 
If you can't upgrade your pip (or Python), request +an older version of metric-learn: + $ python -m pip install "metric-learn<0.6.0" +""".format(*(REQUIRED_PYTHON + CURRENT_PYTHON))) + sys.exit(1) + + +version = {} +with io.open(os.path.join('metric_learn', '_version.py')) as fp: + exec(fp.read(), version) + +# Get the long description from README.md +with io.open('README.rst', encoding='utf-8') as f: + long_description = f.read() -version = "0.3.0" setup(name='metric-learn', - version=version, + version=version['__version__'], description='Python implementations of metric learning algorithms', - author=['CJ Carey', 'Yuan Tang'], + long_description=long_description, + python_requires='>={}.{}'.format(*REQUIRED_PYTHON), + author=[ + 'CJ Carey', + 'Yuan Tang', + 'William de Vazelhes', + 'Aurélien Bellet', + 'Nathalie Vauquier' + ], author_email='ccarey@cs.umass.edu', - url='http://github.com/all-umass/metric-learn', + url='http://github.com/scikit-learn-contrib/metric-learn', license='MIT', classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', + 'Programming Language :: Python :: 3', 'Operating System :: OS Independent', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering' ], packages=['metric_learn'], install_requires=[ - 'numpy', - 'scipy', - 'scikit-learn', - 'six' + 'numpy>= 1.11.0', + 'scipy>= 0.17.0', + 'scikit-learn>=0.21.3', ], extras_require=dict( - docs=['sphinx', 'shinx_rtd_theme', 'numpydoc'], + docs=['sphinx', 'sphinx_rtd_theme', 'numpydoc', 'sphinx-gallery', + 'matplotlib'], demo=['matplotlib'], + sdml=['skggm>=0.2.9'] ), test_suite='test', keywords=[ @@ -36,5 +80,9 @@ 'Information Theoretic Metric Learning', 'Sparse Determinant Metric Learning', 'Least Squares Metric Learning', - 'Neighborhood Components Analysis' + 'Neighborhood Components Analysis', + 'Local Fisher Discriminant Analysis', + 'Relative Components Analysis', + 'Mahalanobis Metric for 
Clustering', + 'Metric Learning for Kernel Regression' ]) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 1a745596..d457b52d 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -1,24 +1,41 @@ +import warnings import unittest +import re +import pytest import numpy as np -from six.moves import xrange -from sklearn.metrics import pairwise_distances -from sklearn.datasets import load_iris -from numpy.testing import assert_array_almost_equal - -from metric_learn import ( - LMNN, NCA, LFDA, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised) +import scipy +from scipy.optimize import check_grad, approx_fprime +from sklearn.metrics import pairwise_distances, euclidean_distances +from sklearn.datasets import (load_iris, make_classification, make_regression, + make_spd_matrix) +from numpy.testing import (assert_array_almost_equal, assert_array_equal, + assert_allclose) +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils.validation import check_X_y +from sklearn.preprocessing import StandardScaler +try: + from inverse_covariance import quic + assert quic +except ImportError: + HAS_SKGGM = False +else: + HAS_SKGGM = True +from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, + SCML_Supervised, LSML_Supervised, + ITML_Supervised, SDML_Supervised, RCA_Supervised, + MMC_Supervised, SDML, RCA, ITML, SCML) # Import this specially for testing. 
-from metric_learn.lmnn import python_LMNN +from metric_learn.constraints import wrap_pairs, Constraints +from metric_learn.lmnn import _sum_outer_products def class_separation(X, labels): unique_labels, label_inds = np.unique(labels, return_inverse=True) ratio = 0 - for li in xrange(len(unique_labels)): - Xc = X[label_inds==li] - Xnc = X[label_inds!=li] - ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc,Xnc).mean() + for li in range(len(unique_labels)): + Xc = X[label_inds == li] + Xnc = X[label_inds != li] + ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean() return ratio / len(unique_labels) @@ -32,77 +49,1120 @@ def setUpClass(self): np.random.seed(1234) +class TestCovariance(MetricTestCase): + def test_iris(self): + cov = Covariance() + cov.fit(self.iris_points) + + csep = class_separation(cov.transform(self.iris_points), self.iris_labels) + # deterministic result + self.assertAlmostEqual(csep, 0.72981476) + + def test_singular_returns_pseudo_inverse(self): + """Checks that if the input covariance matrix is singular, we return + the pseudo inverse""" + X, y = load_iris(return_X_y=True) + # We add a virtual column that is a linear combination of the other + # columns so that the covariance matrix will be singular + X = np.concatenate([X, X[:, :2].dot([[2], [3]])], axis=1) + cov_matrix = np.cov(X, rowvar=False) + covariance = Covariance() + covariance.fit(X) + pseudo_inverse = covariance.get_mahalanobis_matrix() + # here is the definition of a pseudo inverse according to wikipedia: + assert_allclose(cov_matrix.dot(pseudo_inverse).dot(cov_matrix), + cov_matrix) + assert_allclose(pseudo_inverse.dot(cov_matrix).dot(pseudo_inverse), + pseudo_inverse) + + +class TestSCML(object): + @pytest.mark.parametrize('basis', ('lda', 'triplet_diffs')) + def test_iris(self, basis): + """ + SCML applied to Iris dataset should give better results when + computing class separation. 
+ """ + X, y = load_iris(return_X_y=True) + before = class_separation(X, y) + scml = SCML_Supervised(basis=basis, n_basis=85, k_genuine=7, k_impostor=5, + random_state=42) + scml.fit(X, y) + after = class_separation(scml.transform(X), y) + assert before > after + 0.03 # It's better by a margin of 0.03 + + def test_big_n_features(self): + X, y = make_classification(n_samples=100, n_classes=3, n_features=60, + n_informative=60, n_redundant=0, n_repeated=0, + random_state=42) + X = StandardScaler().fit_transform(X) + scml = SCML_Supervised(random_state=42, n_basis=399) + scml.fit(X, y) + csep = class_separation(scml.transform(X), y) + assert csep < 0.7 + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.ones((3, 3, 3)),)), + (SCML_Supervised, (np.array([[0, 0], [0, 1], + [2, 0], [2, 1]]), + np.array([1, 0, 1, 0])))]) + def test_bad_basis(self, estimator, data): + model = estimator(basis='bad_basis', n_basis=33) # n_basis doesn't matter + msg = ("`basis` must be one of the options '{}' or an array of shape " + "(n_basis, n_features)." 
+ .format("', '".join(model._authorized_basis))) + with pytest.raises(ValueError) as raised_error: + model.fit(*data) + assert msg == raised_error.value.args[0] + + def test_dimension_reduction_msg(self): + scml = SCML(n_basis=2) + triplets = np.array([[[0, 1], [2, 1], [0, 0]], + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]) + msg = ("The number of bases with nonzero weight is less than the " + "number of features of the input, in consequence the " + "learned transformation reduces the dimension to 1.") + with pytest.warns(UserWarning) as raised_warning: + scml.fit(triplets) + assert msg == raised_warning[0].message.args[0] + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.array([[[0, 1], [2, 1], [0, 0]], + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]),)), + (SCML_Supervised, (np.array([[0, 0], [1, 1], + [3, 3]]), + np.array([1, 2, 3])))]) + def test_n_basis_wrong_type(self, estimator, data): + n_basis = 4.0 + model = estimator(n_basis=n_basis) + msg = ("n_basis should be an integer, instead it is of type %s" + % type(n_basis)) + with pytest.raises(ValueError) as raised_error: + model.fit(*data) + assert msg == raised_error.value.args[0] + + def test_small_n_basis_lda(self): + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + y = np.array([0, 0, 1, 1]) + + n_class = 2 + scml = SCML_Supervised(n_basis=n_class-1) + msg = ("The number of basis is less than the number of classes, which may" + " lead to poor discriminative performance.") + with pytest.warns(UserWarning) as raised_warning: + scml.fit(X, y) + assert msg == raised_warning[0].message.args[0] + + def test_big_n_basis_lda(self): + X = np.array([[0, 0], [1, 1], [3, 3]]) + y = np.array([1, 2, 3]) + + n_class = 3 + num_eig = min(n_class - 1, X.shape[1]) + n_basis = X.shape[0] * 2 * num_eig + + scml = SCML_Supervised(n_basis=n_basis) + msg = ("Not enough samples to generate %d LDA bases, n_basis" + "should be smaller than %d" 
% + (n_basis, n_basis)) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.random.rand(3, 3, 2),)), + (SCML_Supervised, (np.array([[0, 0], [0, 1], + [2, 0], [2, 1]]), + np.array([1, 0, 1, 0])))]) + def test_array_basis(self, estimator, data): + """ Test that the proper error is raised when the shape of the input basis + array is not consistent with the input + """ + basis = np.eye(3) + scml = estimator(n_basis=3, basis=basis) + + msg = ('The dimensionality ({}) of the provided bases must match the ' + 'dimensionality of the data ({}).' + .format(basis.shape[1], data[0].shape[-1])) + with pytest.raises(ValueError) as raised_error: + scml.fit(*data) + assert msg == raised_error.value.args[0] + + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.array([[0, 1, 2], [0, 1, 3], [1, 0, 2], + [1, 0, 3], [2, 3, 1], [2, 3, 0], + [3, 2, 1], [3, 2, 0]]),)), + (SCML_Supervised, (np.array([0, 1, 2, 3]), + np.array([0, 0, 1, 1])))]) + def test_verbose(self, estimator, data, capsys): + # assert there is proper output when verbose = True + model = estimator(preprocessor=np.array([[0, 0], [1, 1], [2, 2], [3, 3]]), + max_iter=1, output_iter=1, batch_size=1, + basis='triplet_diffs', random_state=42, verbose=True) + model.fit(*data) + out, _ = capsys.readouterr() + expected_out = ('[%s] iter 1\t obj 0.569946\t num_imp 2\n' + 'max iteration reached.\n' % estimator.__name__) + assert out == expected_out + + def test_triplet_diffs_toy(self): + expected_n_basis = 10 + model = SCML_Supervised(n_basis=expected_n_basis) + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + triplets = np.array([[0, 1, 2], [0, 1, 3], [1, 0, 2], [1, 0, 3], + [2, 3, 1], [2, 3, 0], [3, 2, 1], [3, 2, 0]]) + basis, n_basis = model._generate_bases_dist_diff(triplets, X) + # All points are along the same line, so the only possible basis will be + # the vector along that line normalized. 
+ expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2) + assert n_basis == expected_n_basis + np.testing.assert_allclose(basis, expected_basis) + + def test_lda_toy(self): + expected_n_basis = 7 + model = SCML_Supervised(n_basis=expected_n_basis) + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + y = np.array([0, 0, 1, 1]) + basis, n_basis = model._generate_bases_LDA(X, y) + # All points are along the same line, so the only possible basis will be + # the vector along that line normalized. In this case it is possible to + # obtain it with positive or negative orientations. + expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2) + assert n_basis == expected_n_basis + np.testing.assert_allclose(np.abs(basis), expected_basis) + + @pytest.mark.parametrize('n_samples', [100, 500]) + @pytest.mark.parametrize('n_features', [10, 50, 100]) + @pytest.mark.parametrize('n_classes', [5, 10, 15]) + def test_triplet_diffs(self, n_samples, n_features, n_classes): + """ + Test that the correct value of n_basis is being generated with + different triplet constraints. 
+ """ + X, y = make_classification(n_samples=n_samples, n_classes=n_classes, + n_features=n_features, n_informative=n_features, + n_redundant=0, n_repeated=0) + X = StandardScaler().fit_transform(X) + model = SCML_Supervised(n_basis=None) # Explicit n_basis=None + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, model.k_genuine, + model.k_impostor) + + msg = "As no value for `n_basis` was selected, " + with pytest.warns(UserWarning) as raised_warning: + basis, n_basis = model._generate_bases_dist_diff(triplets, X) + assert msg in str(raised_warning[0].message) + + expected_n_basis = n_features * 80 + assert n_basis == expected_n_basis + assert basis.shape == (expected_n_basis, n_features) + + @pytest.mark.parametrize('n_samples', [100, 500]) + @pytest.mark.parametrize('n_features', [10, 50, 100]) + @pytest.mark.parametrize('n_classes', [5, 10, 15]) + def test_lda(self, n_samples, n_features, n_classes): + """ + Test that when n_basis=None, the correct n_basis is generated, + for SCML_Supervised and different values of n_samples, n_features + and n_classes. 
+ """ + X, y = make_classification(n_samples=n_samples, n_classes=n_classes, + n_features=n_features, n_informative=n_features, + n_redundant=0, n_repeated=0) + X = StandardScaler().fit_transform(X) + + msg = "As no value for `n_basis` was selected, " + with pytest.warns(UserWarning) as raised_warning: + model = SCML_Supervised(n_basis=None) # Explicit n_basis=None + basis, n_basis = model._generate_bases_LDA(X, y) + assert msg in str(raised_warning[0].message) + + num_eig = min(n_classes - 1, n_features) + expected_n_basis = min(20 * n_features, n_samples * 2 * num_eig - 1) + assert n_basis == expected_n_basis + assert basis.shape == (expected_n_basis, n_features) + + @pytest.mark.parametrize('name', ['max_iter', 'output_iter', 'batch_size', + 'n_basis']) + def test_int_inputs(self, name): + value = 1.0 + d = {name: value} + scml = SCML(**d) + triplets = np.array([[[0, 1], [2, 1], [0, 0]]]) + + msg = ("%s should be an integer, instead it is of type" + " %s" % (name, type(value))) + with pytest.raises(ValueError) as raised_error: + scml.fit(triplets) + assert msg == raised_error.value.args[0] + + @pytest.mark.parametrize('name', ['max_iter', 'output_iter', 'batch_size', + 'k_genuine', 'k_impostor', 'n_basis']) + def test_int_inputs_supervised(self, name): + value = 1.0 + d = {name: value} + scml = SCML_Supervised(**d) + X = np.array([[0, 0], [1, 1], [3, 3], [4, 4]]) + y = np.array([1, 1, 0, 0]) + msg = ("%s should be an integer, instead it is of type" + " %s" % (name, type(value))) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + def test_large_output_iter(self): + scml = SCML(max_iter=1, output_iter=2, n_basis=33) # n_basis don't matter + triplets = np.array([[[0, 1], [2, 1], [0, 0]]]) + msg = ("The value of output_iter must be equal or smaller than" + " max_iter.") + + with pytest.raises(ValueError) as raised_error: + scml.fit(triplets) + assert msg == raised_error.value.args[0] + + class 
TestLSML(MetricTestCase): def test_iris(self): - lsml = LSML_Supervised(num_constraints=200) + lsml = LSML_Supervised(n_constraints=200) lsml.fit(self.iris_points, self.iris_labels) - csep = class_separation(lsml.transform(), self.iris_labels) + csep = class_separation(lsml.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.8) # it's pretty terrible class TestITML(MetricTestCase): def test_iris(self): - itml = ITML_Supervised(num_constraints=200) + itml = ITML_Supervised(n_constraints=200) itml.fit(self.iris_points, self.iris_labels) - csep = class_separation(itml.transform(), self.iris_labels) - self.assertLess(csep, 0.4) # it's not great + csep = class_separation(itml.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.2) + + +@pytest.mark.parametrize('bounds', [None, (20., 100.), [20., 100.], + np.array([20., 100.]), + np.array([[20., 100.]]), + np.array([[20], [100]])]) +def test_bounds_parameters_valid(bounds): + """Asserts that we can provide any array-like of two elements as bounds, + and that the attribute bound_ is a numpy array""" + + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y_pairs = [1, -1] + itml = ITML() + itml.fit(pairs, y_pairs, bounds=bounds) + + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + itml_supervised = ITML_Supervised() + itml_supervised.fit(X, y, bounds=bounds) + + +@pytest.mark.parametrize('bounds', ['weird', ['weird1', 'weird2'], + np.array([1, 2, 3])]) +def test_bounds_parameters_invalid(bounds): + """Assert that if a non array-like is put for bounds, or an array-like + of length different than 2, an error is returned""" + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y_pairs = [1, -1] + itml = ITML() + with pytest.raises(Exception): + itml.fit(pairs, y_pairs, bounds=bounds) + + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + itml_supervised = ITML_Supervised() + with 
pytest.raises(Exception): + itml_supervised.fit(X, y, bounds=bounds) class TestLMNN(MetricTestCase): def test_iris(self): - # Test both impls, if available. - for LMNN_cls in set((LMNN, python_LMNN)): - lmnn = LMNN_cls(k=5, learn_rate=1e-6, verbose=False) - lmnn.fit(self.iris_points, self.iris_labels) + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + lmnn.fit(self.iris_points, self.iris_labels) + + csep = class_separation(lmnn.transform(self.iris_points), + self.iris_labels) + self.assertLess(csep, 0.25) + + def test_loss_grad_lbfgs(self): + """Test gradient of loss function + Assert that the gradient is almost equal to its finite differences + approximation. + """ + rng = np.random.RandomState(42) + X, y = make_classification(random_state=rng) + L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) + lmnn = LMNN() + + k = lmnn.n_neighbors + reg = lmnn.regularization + + X, y = lmnn._prepare_inputs(X, y, dtype=float, + ensure_min_samples=2) + num_pts, n_components = X.shape + unique_labels, label_inds = np.unique(y, return_inverse=True) + lmnn.labels_ = np.arange(len(unique_labels)) + lmnn.components_ = np.eye(n_components) + + target_neighbors = lmnn._select_targets(X, label_inds) + + # sum outer products + dfG = _sum_outer_products(X, target_neighbors.flatten(), + np.repeat(np.arange(X.shape[0]), k)) + + # initialize L + def loss_grad(flat_L): + return lmnn._loss_grad(X, flat_L.reshape(-1, X.shape[1]), dfG, + k, reg, target_neighbors, label_inds) + + def fun(x): + return loss_grad(x)[1] + + def grad(x): + return loss_grad(x)[0].ravel() + + # compute relative error + epsilon = np.sqrt(np.finfo(float).eps) + rel_diff = (check_grad(fun, grad, L.ravel()) / + np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon))) + np.testing.assert_almost_equal(rel_diff, 0., decimal=5) + + +def test_loss_func(capsys): + """Test the loss function (and its gradient) on a simple example, + by comparing the results with the actual implementation of metric-learn, + with 
a very simple (but nonperformant) implementation""" + + # toy dataset to use + X, y = make_classification(n_samples=10, n_classes=2, + n_features=6, + n_redundant=0, shuffle=True, + scale=[1, 1, 20, 20, 20, 20], random_state=42) + + def hinge(a): + if a > 0: + return a, 1 + else: + return 0, 0 + + def loss_fn(L, X, y, target_neighbors, reg): + L = L.reshape(-1, X.shape[1]) + Lx = np.dot(X, L.T) + loss = 0 + total_active = 0 + grad = np.zeros_like(L) + for i in range(X.shape[0]): + for j in target_neighbors[i]: + loss += (1 - reg) * np.sum((Lx[i] - Lx[j]) ** 2) + grad += (1 - reg) * np.outer(Lx[i] - Lx[j], X[i] - X[j]) + for k in range(X.shape[0]): + if y[i] != y[k]: + hin, active = hinge(1 + np.sum((Lx[i] - Lx[j])**2) - + np.sum((Lx[i] - Lx[k])**2)) + total_active += active + if active: + loss += reg * hin + grad += (reg * (np.outer(Lx[i] - Lx[j], X[i] - X[j]) - + np.outer(Lx[i] - Lx[k], X[i] - X[k]))) + grad = 2 * grad + return grad, loss, total_active + + # we check that the gradient we have computed in the non-performant implem + # is indeed the true gradient on a toy example: + + def _select_targets(X, y, k): + target_neighbors = np.empty((X.shape[0], k), dtype=int) + for label in np.unique(y): + inds, = np.nonzero(y == label) + dd = euclidean_distances(X[inds], squared=True) + np.fill_diagonal(dd, np.inf) + nn = np.argsort(dd)[..., :k] + target_neighbors[inds] = inds[nn] + return target_neighbors + + target_neighbors = _select_targets(X, y, 2) + regularization = 0.5 + n_features = X.shape[1] + x0 = np.random.randn(1, n_features) + + def loss(x0): + return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, + regularization)[1] + + def grad(x0): + return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, + regularization)[0].ravel() + + scipy.optimize.check_grad(loss, grad, x0.ravel()) + + class LMNN_with_callback(LMNN): + """ We will use a callback to get the gradient (see later) + """ + + def __init__(self, callback, *args, **kwargs): + 
self.callback = callback + super(LMNN_with_callback, self).__init__(*args, **kwargs) + + def _loss_grad(self, *args, **kwargs): + grad, objective, total_active = ( + super(LMNN_with_callback, self)._loss_grad(*args, **kwargs)) + self.callback.append(grad) + return grad, objective, total_active + + class LMNN_nonperformant(LMNN_with_callback): + + def fit(self, X, y): + self.y = y + return super(LMNN_nonperformant, self).fit(X, y) + + def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): + grad, loss, total_active = loss_fn(L.ravel(), X, self.y, + target_neighbors, self.regularization) + self.callback.append(grad) + return grad, loss, total_active + + mem1, mem2 = [], [] + lmnn_perf = LMNN_with_callback(verbose=True, random_state=42, + init='identity', max_iter=30, callback=mem1) + lmnn_nonperf = LMNN_nonperformant(verbose=True, random_state=42, + init='identity', max_iter=30, + callback=mem2) + objectives, obj_diffs, learn_rate, total_active = (dict(), dict(), dict(), + dict()) + for algo, name in zip([lmnn_perf, lmnn_nonperf], ['perf', 'nonperf']): + algo.fit(X, y) + out, _ = capsys.readouterr() + lines = re.split("\n+", out) + # we get every variable that is printed from the algorithm in verbose + num = r'(-?\d+.?\d*(e[+|-]\d+)?)' + strings = [re.search(r"\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})" + .format(num, num, num), s) for s in lines] + objectives[name] = [float(match.group(1)) for match in strings if match is + not None] + obj_diffs[name] = [float(match.group(3)) for match in strings if match is + not None] + total_active[name] = [float(match.group(5)) for match in strings if + match is not + None] + learn_rate[name] = [float(match.group(6)) for match in strings if match is + not None] + assert len(strings) >= 10 # we ensure that we actually did more than 10 + # iterations + assert total_active[name][0] >= 2 # we ensure that we have some active + # constraints (that's the case we want to test) + # we remove the last element because it can be 
equal to the penultimate + # if the last gradient update is null + for i in range(len(mem1)): + np.testing.assert_allclose(lmnn_perf.callback[i], + lmnn_nonperf.callback[i], + err_msg='Gradient different at position ' + '{}'.format(i)) + np.testing.assert_allclose(objectives['perf'], objectives['nonperf']) + np.testing.assert_allclose(obj_diffs['perf'], obj_diffs['nonperf']) + np.testing.assert_allclose(total_active['perf'], total_active['nonperf']) + np.testing.assert_allclose(learn_rate['perf'], learn_rate['nonperf']) + + +@pytest.mark.parametrize('X, y, loss', [(np.array([[0], [1], [2], [3]]), + [1, 1, 0, 0], 3.0), + (np.array([[0], [1], [2], [3]]), + [1, 0, 0, 1], 26.)]) +def test_toy_ex_lmnn(X, y, loss): + """Test that the loss give the right result on a toy example""" + L = np.array([[1]]) + lmnn = LMNN(n_neighbors=1, regularization=0.5) + + k = lmnn.n_neighbors + reg = lmnn.regularization + + X, y = lmnn._prepare_inputs(X, y, dtype=float, + ensure_min_samples=2) + num_pts, n_components = X.shape + unique_labels, label_inds = np.unique(y, return_inverse=True) + lmnn.labels_ = np.arange(len(unique_labels)) + lmnn.components_ = np.eye(n_components) + + target_neighbors = lmnn._select_targets(X, label_inds) + + # sum outer products + dfG = _sum_outer_products(X, target_neighbors.flatten(), + np.repeat(np.arange(X.shape[0]), k)) + + # storage + a1 = [None] * k + a2 = [None] * k + for nn_idx in range(k): + a1[nn_idx] = np.array([]) + a2[nn_idx] = np.array([]) + + # assert that the loss equals the one computed by hand + assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, k, + reg, target_neighbors, label_inds)[1] == loss + - csep = class_separation(lmnn.transform(), self.iris_labels) - self.assertLess(csep, 0.25) +def test_convergence_simple_example(capsys): + # LMNN should converge on this simple example, which it did not with + # this issue: https://github.com/scikit-learn-contrib/metric-learn/issues/88 + X, y = make_classification(random_state=0) + lmnn = 
LMNN(verbose=True) + lmnn.fit(X, y) + out, _ = capsys.readouterr() + assert "LMNN converged with objective" in out + + +def test_no_twice_same_objective(capsys): + # test that the objective function never has twice the same value + # see https://github.com/scikit-learn-contrib/metric-learn/issues/88 + X, y = make_classification(random_state=0) + lmnn = LMNN(verbose=True) + lmnn.fit(X, y) + out, _ = capsys.readouterr() + lines = re.split("\n+", out) + # we get only objectives from each line: + # the regexp matches a float that follows an integer (the iteration + # number), and which is followed by a (signed) float (delta obj). It + # matches for instance: + # 3 **1113.7665747189938** -3.182774197440267 46431.0200999999999998e-06 + objectives = [re.search(r"\d* (?:(\d*.\d*))[ | -]\d*.\d*", s) + for s in lines] + objectives = [match.group(1) for match in objectives if match is not None] + # we remove the last element because it can be equal to the penultimate + # if the last gradient update is null + assert len(objectives[:-1]) == len(set(objectives[:-1])) class TestSDML(MetricTestCase): + + @pytest.mark.skipif(HAS_SKGGM, + reason="The warning can be thrown only if skggm is " + "not installed.") + def test_sdml_supervised_raises_warning_msg_not_installed_skggm(self): + """Tests that the right warning message is raised if someone tries to + use SDML_Supervised but has not installed skggm, and that the algorithm + fails to converge""" + # TODO: remove if we don't need skggm anymore + # load_iris: dataset where we know scikit-learn's graphical lasso fails + # with a Floating Point error + X, y = load_iris(return_X_y=True) + sdml_supervised = SDML_Supervised(balance_param=0.5, sparsity_param=0.01) + msg = ("There was a problem in SDML when using scikit-learn's graphical " + "lasso solver. skggm's graphical lasso can sometimes converge on " + "non SPD cases where scikit-learn's graphical lasso fails to " + "converge. 
Try to install skggm and rerun the algorithm (see " + "the README.md for the right version of skggm). The following " + "error message was thrown:") + with pytest.raises(RuntimeError) as raised_error: + sdml_supervised.fit(X, y) + assert str(raised_error.value).startswith(msg) + + @pytest.mark.skipif(HAS_SKGGM, + reason="The warning can be thrown only if skggm is " + "not installed.") + def test_sdml_raises_warning_msg_not_installed_skggm(self): + """Tests that the right warning message is raised if someone tries to + use SDML but has not installed skggm, and that the algorithm fails to + converge""" + # TODO: remove if we don't need skggm anymore + # case on which we know that scikit-learn's graphical lasso fails + # because it will return a non SPD matrix + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y_pairs = [1, -1] + sdml = SDML(prior='identity', balance_param=100, verbose=True) + + msg = ("There was a problem in SDML when using scikit-learn's graphical " + "lasso solver. skggm's graphical lasso can sometimes converge on " + "non SPD cases where scikit-learn's graphical lasso fails to " + "converge. 
Try to install skggm and rerun the algorithm (see " + "the README.md for the right version of skggm).") + with pytest.raises(RuntimeError) as raised_error: + sdml.fit(pairs, y_pairs) + assert msg == str(raised_error.value) + + @pytest.mark.skipif(not HAS_SKGGM, + reason="The warning can be thrown only if skggm is " + "installed.") + def test_sdml_raises_warning_msg_installed_skggm(self): + """Tests that the right warning message is raised if someone tries to + use SDML and has installed skggm, and that the algorithm fails to + converge""" + # TODO: remove if we don't need skggm anymore + # case on which we know that skggm's graphical lasso fails + # because it will return non finite values + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y_pairs = [1, -1] + sdml = SDML(prior='identity', balance_param=100, verbose=True) + + msg = ("There was a problem in SDML when using skggm's graphical " + "lasso solver.") + with pytest.raises(RuntimeError) as raised_error: + sdml.fit(pairs, y_pairs) + assert msg == str(raised_error.value) + + @pytest.mark.skipif(not HAS_SKGGM, + reason="The warning can be thrown only if skggm is " + "installed.") + def test_sdml_supervised_raises_warning_msg_installed_skggm(self): + """Tests that the right warning message is raised if someone tries to + use SDML_Supervised but has not installed skggm, and that the algorithm + fails to converge""" + # TODO: remove if we don't need skggm anymore + # case on which we know that skggm's graphical lasso fails + # because it will return non finite values + rng = np.random.RandomState(42) + # This example will create a diagonal em_cov with a negative coeff ( + # pathological case) + X = np.array([[-10., 0.], [10., 0.], [5., 0.], [3., 0.]]) + y = [0, 0, 1, 1] + sdml_supervised = SDML_Supervised(balance_param=0.5, prior='identity', + sparsity_param=0.01, random_state=rng) + msg = ("There was a problem in SDML when using skggm's graphical " + "lasso solver.") + with 
pytest.raises(RuntimeError) as raised_error: + sdml_supervised.fit(X, y) + assert msg == str(raised_error.value) + + @pytest.mark.skipif(not HAS_SKGGM, + reason="It's only in the case where skggm is installed" + "that no warning should be thrown.") + def test_raises_no_warning_installed_skggm(self): + # otherwise we should be able to instantiate and fit SDML and it + # should raise no error and no ConvergenceWarning + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]]) + y_pairs = [1, -1] + X, y = make_classification(random_state=42) + with warnings.catch_warnings(record=True) as records: + sdml = SDML(prior='covariance') + sdml.fit(pairs, y_pairs) + for record in records: + assert record.category is not ConvergenceWarning + with warnings.catch_warnings(record=True) as records: + sdml_supervised = SDML_Supervised(prior='identity', balance_param=1e-5) + sdml_supervised.fit(X, y) + for record in records: + assert record.category is not ConvergenceWarning + def test_iris(self): # Note: this is a flaky test, which fails for certain seeds. # TODO: un-flake it! - np.random.seed(5555) + rs = np.random.RandomState(5555) - sdml = SDML_Supervised(num_constraints=1500) + sdml = SDML_Supervised(n_constraints=1500, prior='identity', + balance_param=5e-5, random_state=rs) sdml.fit(self.iris_points, self.iris_labels) - csep = class_separation(sdml.transform(), self.iris_labels) - self.assertLess(csep, 0.25) + csep = class_separation(sdml.transform(self.iris_points), + self.iris_labels) + self.assertLess(csep, 0.22) + + def test_sdml_raises_warning_non_psd(self): + """Tests that SDML raises a warning on a toy example where we know the + pseudo-covariance matrix is not PSD""" + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., 50.], [0., -60]]]) + y = [1, -1] + sdml = SDML(prior='covariance', sparsity_param=0.01, balance_param=0.5) + msg = ("Warning, the input matrix of graphical lasso is not " + "positive semi-definite (PSD). 
The algorithm may diverge, " + "and lead to degenerate solutions. " + "To prevent that, try to decrease the balance parameter " + "`balance_param` and/or to set prior='identity'.") + with pytest.warns(ConvergenceWarning) as raised_warning: + try: + sdml.fit(pairs, y) + except Exception: + pass + # we assert that this warning is in one of the warning raised by the + # estimator + assert msg in list(map(lambda w: str(w.message), raised_warning)) + + def test_sdml_converges_if_psd(self): + """Tests that sdml converges on a simple problem where we know the + pseudo-covariance matrix is PSD""" + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]]) + y = [1, -1] + sdml = SDML(prior='covariance', sparsity_param=0.01, balance_param=0.5) + sdml.fit(pairs, y) + assert np.isfinite(sdml.get_mahalanobis_matrix()).all() + + @pytest.mark.skipif(not HAS_SKGGM, + reason="sklearn's graphical_lasso can sometimes not " + "work on some non SPD problems. We test that " + "is works only if skggm is installed.") + def test_sdml_works_on_non_spd_pb_with_skggm(self): + """Test that SDML works on a certain non SPD problem on which we know + it should work, but scikit-learn's graphical_lasso does not work""" + X, y = load_iris(return_X_y=True) + sdml = SDML_Supervised(balance_param=0.5, sparsity_param=0.01, + prior='covariance', + random_state=np.random.RandomState(42)) + sdml.fit(X, y) + + +@pytest.mark.skipif(not HAS_SKGGM, + reason='The message should be printed only if skggm is ' + 'installed.') +def test_verbose_has_installed_skggm_sdml(capsys): + # Test that if users have installed skggm, a message is printed telling them + # skggm's solver is used (when they use SDML) + # TODO: remove if we don't need skggm anymore + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]]) + y_pairs = [1, -1] + sdml = SDML(verbose=True, prior='covariance') + sdml.fit(pairs, y_pairs) + out, _ = capsys.readouterr() + assert "SDML will use skggm's graphical lasso solver." 
in out + + +@pytest.mark.skipif(not HAS_SKGGM, + reason='The message should be printed only if skggm is ' + 'installed.') +def test_verbose_has_installed_skggm_sdml_supervised(capsys): + # Test that if users have installed skggm, a message is printed telling them + # skggm's solver is used (when they use SDML_Supervised) + # TODO: remove if we don't need skggm anymore + X, y = load_iris(return_X_y=True) + sdml = SDML_Supervised(verbose=True, prior='identity', balance_param=1e-5) + sdml.fit(X, y) + out, _ = capsys.readouterr() + assert "SDML will use skggm's graphical lasso solver." in out + + +@pytest.mark.skipif(HAS_SKGGM, + reason='The message should be printed only if skggm is ' + 'not installed.') +def test_verbose_has_not_installed_skggm_sdml(capsys): + # Test that if users have installed skggm, a message is printed telling them + # skggm's solver is used (when they use SDML) + # TODO: remove if we don't need skggm anymore + pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]]) + y_pairs = [1, -1] + sdml = SDML(verbose=True, prior='covariance') + sdml.fit(pairs, y_pairs) + out, _ = capsys.readouterr() + assert "SDML will use scikit-learn's graphical lasso solver." in out + + +@pytest.mark.skipif(HAS_SKGGM, + reason='The message should be printed only if skggm is ' + 'not installed.') +def test_verbose_has_not_installed_skggm_sdml_supervised(capsys): + # Test that if users have installed skggm, a message is printed telling them + # skggm's solver is used (when they use SDML_Supervised) + # TODO: remove if we don't need skggm anymore + X, y = make_classification(random_state=42) + sdml = SDML_Supervised(verbose=True, balance_param=1e-5, prior='identity') + sdml.fit(X, y) + out, _ = capsys.readouterr() + assert "SDML will use scikit-learn's graphical lasso solver." 
in out class TestNCA(MetricTestCase): def test_iris(self): n = self.iris_points.shape[0] - nca = NCA(max_iter=(100000//n), learning_rate=0.01) + + # Without dimension reduction + nca = NCA(max_iter=(100000 // n)) + nca.fit(self.iris_points, self.iris_labels) + csep = class_separation(nca.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.15) + + # With dimension reduction + nca = NCA(max_iter=(100000 // n), n_components=2) nca.fit(self.iris_points, self.iris_labels) + csep = class_separation(nca.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.20) + + def test_finite_differences(self): + """Test gradient of loss function + + Assert that the gradient is almost equal to its finite differences + approximation. + """ + # Initialize the transformation `M`, as well as `X` and `y` and `NCA` + X, y = make_classification() + M = np.random.randn(np.random.randint(1, X.shape[1] + 1), X.shape[1]) + mask = y[:, np.newaxis] == y[np.newaxis, :] + nca = NCA() + nca.n_iter_ = 0 + + def fun(M): + return nca._loss_grad_lbfgs(M, X, mask)[0] + + def grad(M): + return nca._loss_grad_lbfgs(M, X, mask)[1].ravel() + + # compute relative error + epsilon = np.sqrt(np.finfo(float).eps) + rel_diff = (check_grad(fun, grad, M.ravel()) / + np.linalg.norm(approx_fprime(M.ravel(), fun, epsilon))) + np.testing.assert_almost_equal(rel_diff, 0., decimal=6) + + def test_simple_example(self): + """Test on a simple example. + + Puts four points in the input space where the opposite labels points are + next to each other. After transform the same labels points should be next + to each other. 
+ + """ + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + nca = NCA(n_components=2,) + nca.fit(X, y) + Xansformed = nca.transform(X) + np.testing.assert_equal(pairwise_distances(Xansformed).argsort()[:, 1], + np.array([2, 3, 0, 1])) + + def test_singleton_class(self): + X = self.iris_points + y = self.iris_labels + + # one singleton class: test fitting works + singleton_class = 1 + ind_singleton, = np.where(y == singleton_class) + y[ind_singleton] = 2 + y[ind_singleton[0]] = singleton_class + + nca = NCA(max_iter=30) + nca.fit(X, y) + + # One non-singleton class: test fitting works + ind_1, = np.where(y == 1) + ind_2, = np.where(y == 2) + y[ind_1] = 0 + y[ind_1[0]] = 1 + y[ind_2] = 0 + y[ind_2[0]] = 2 - # Result copied from Iris example at - # https://github.com/vomjom/nca/blob/master/README.mkd - expected = [[-0.09935, -0.2215, 0.3383, 0.443], - [+0.2532, 0.5835, -0.8461, -0.8915], - [-0.729, -0.6386, 1.767, 1.832], - [-0.9405, -0.8461, 2.281, 2.794]] - assert_array_almost_equal(expected, nca.transformer(), decimal=3) + nca = NCA(max_iter=30) + nca.fit(X, y) + + # Only singleton classes: test fitting does nothing (the gradient + # must be null in this case, so the final matrix must stay like + # the initialization) + ind_0, = np.where(y == 0) + ind_1, = np.where(y == 1) + ind_2, = np.where(y == 2) + X = X[[ind_0[0], ind_1[0], ind_2[0]]] + y = y[[ind_0[0], ind_1[0], ind_2[0]]] + + A = make_spd_matrix(n_dim=X.shape[1], random_state=X.shape[1]) + nca = NCA(init=A, max_iter=30, n_components=X.shape[1]) + nca.fit(X, y) + assert_array_equal(nca.components_, A) + + def test_one_class(self): + # if there is only one class the gradient is null, so the final matrix + # must stay like the initialization + X = self.iris_points[self.iris_labels == 0] + y = self.iris_labels[self.iris_labels == 0] + + A = make_spd_matrix(n_dim=X.shape[1], random_state=X.shape[1]) + nca = NCA(init=A, max_iter=30, n_components=X.shape[1]) + nca.fit(X, y) + 
assert_array_equal(nca.components_, A) class TestLFDA(MetricTestCase): def test_iris(self): - lfda = LFDA(k=2, dim=2) + lfda = LFDA(k=2, n_components=2) lfda.fit(self.iris_points, self.iris_labels) - csep = class_separation(lfda.transform(), self.iris_labels) + csep = class_separation(lfda.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.15) + # Sanity checks for learned matrices. + self.assertEqual(lfda.get_mahalanobis_matrix().shape, (4, 4)) + self.assertEqual(lfda.components_.shape, (2, 4)) + class TestRCA(MetricTestCase): def test_iris(self): - rca = RCA_Supervised(dim=2, num_chunks=30, chunk_size=2) + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2) rca.fit(self.iris_points, self.iris_labels) - csep = class_separation(rca.transform(), self.iris_labels) + csep = class_separation(rca.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.29) + + def test_rank_deficient_returns_warning(self): + """Checks that if the covariance matrix is not invertible, we raise a + warning message advising to use PCA""" + X, y = load_iris(return_X_y=True) + # we make the fourth column a linear combination of the two first, + # so that the covariance matrix will not be invertible: + X[:, 3] = X[:, 0] + 3 * X[:, 1] + rca = RCA() + msg = ('The inner covariance matrix is not invertible, ' + 'so the transformation matrix may contain Nan values. 
' + 'You should remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + + with warnings.catch_warnings(record=True) as raised_warnings: + rca.fit(X, y) + assert any(str(w.message) == msg for w in raised_warnings) + + def test_unknown_labels(self): + n = 200 + n_chunks = 50 + X, y = make_classification(random_state=42, n_samples=2 * n, + n_features=6, n_informative=6, n_redundant=0) + y2 = np.concatenate((y[:n], -np.ones(n))) + + rca = RCA_Supervised(n_chunks=n_chunks, random_state=42) + rca.fit(X[:n], y[:n]) + + rca2 = RCA_Supervised(n_chunks=n_chunks, random_state=42) + rca2.fit(X, y2) + + assert not np.any(np.isnan(rca.components_)) + assert not np.any(np.isnan(rca2.components_)) + + np.testing.assert_array_equal(rca.components_, rca2.components_) + + def test_bad_parameters(self): + n = 200 + n_chunks = 3 + X, y = make_classification(random_state=42, n_samples=n, + n_features=6, n_informative=6, n_redundant=0) + + rca = RCA_Supervised(n_chunks=n_chunks, random_state=42) + msg = ('Due to the parameters of RCA_Supervised, ' + 'the inner covariance matrix is not invertible, ' + 'so the transformation matrix will contain Nan values. ' + 'Increase the number or size of the chunks to correct ' + 'this problem.' + ) + with warnings.catch_warnings(record=True) as raised_warning: + rca.fit(X, y) + assert any(str(w.message) == msg for w in raised_warning) + + +class TestMLKR(MetricTestCase): + def test_iris(self): + mlkr = MLKR() + mlkr.fit(self.iris_points, self.iris_labels) + csep = class_separation(mlkr.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.25) + def test_finite_differences(self): + """Test gradient of loss function + + Assert that the gradient is almost equal to its finite differences + approximation. 
+ """ + # Initialize the transformation `M`, as well as `X`, and `y` and `MLKR` + X, y = make_regression(n_features=4, random_state=1, n_samples=20) + X, y = check_X_y(X, y) + M = np.random.randn(2, X.shape[1]) + mlkr = MLKR() + mlkr.n_iter_ = 0 + + def fun(M): + return mlkr._loss(M, X, y)[0] + + def grad_fn(M): + return mlkr._loss(M, X, y)[1].ravel() + + # compute relative error + rel_diff = check_grad(fun, grad_fn, M.ravel()) / np.linalg.norm(grad_fn(M)) + np.testing.assert_almost_equal(rel_diff, 0.) + + +class TestMMC(MetricTestCase): + def test_iris(self): + + # Generate full set of constraints for comparison with reference + # implementation + mask = self.iris_labels[None] == self.iris_labels[:, None] + a, b = np.nonzero(np.triu(mask, k=1)) + c, d = np.nonzero(np.triu(~mask, k=1)) + + # Full metric + n_features = self.iris_points.shape[1] + mmc = MMC(tol=0.01, init=np.eye(n_features) / 10) + mmc.fit(*wrap_pairs(self.iris_points, [a, b, c, d])) + expected = [[+0.000514, +0.000868, -0.001195, -0.001703], + [+0.000868, +0.001468, -0.002021, -0.002879], + [-0.001195, -0.002021, +0.002782, +0.003964], + [-0.001703, -0.002879, +0.003964, +0.005648]] + assert_array_almost_equal(expected, mmc.get_mahalanobis_matrix(), + decimal=6) + + # Diagonal metric + mmc = MMC(diagonal=True) + mmc.fit(*wrap_pairs(self.iris_points, [a, b, c, d])) + expected = [0, 0, 1.210220, 1.228596] + assert_array_almost_equal(np.diag(expected), mmc.get_mahalanobis_matrix(), + decimal=6) + + # Supervised Full + mmc = MMC_Supervised() + mmc.fit(self.iris_points, self.iris_labels) + csep = class_separation(mmc.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.15) + + # Supervised Diagonal + mmc = MMC_Supervised(diagonal=True) + mmc.fit(self.iris_points, self.iris_labels) + csep = class_separation(mmc.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.2) + + +@pytest.mark.parametrize(('algo_class', 'dataset'), + [(NCA, make_classification()), + (MLKR, 
make_regression())]) +def test_verbose(algo_class, dataset, capsys): + # assert there is proper output when verbose = True + X, y = dataset + model = algo_class(verbose=True) + model.fit(X, y) + out, _ = capsys.readouterr() + + # check output + lines = re.split('\n+', out) + header = '{:>10} {:>20} {:>10}'.format('Iteration', 'Objective Value', + 'Time(s)') + assert lines[0] == '[{}]'.format(algo_class.__name__) + assert lines[1] == '[{}] {}'.format(algo_class.__name__, header) + assert lines[2] == '[{}] {}'.format(algo_class.__name__, '-' * len(header)) + for line in lines[3:-2]: + # The following regex will match for instance: + # '[NCA] 0 6.988936e+01 0.01' + assert re.match(r"\[" + algo_class.__name__ + r"\]\ *\d+\ *\d\.\d{6}e[+|-]" + r"\d+\ *\d+\.\d{2}", line) + assert re.match(r"\[" + algo_class.__name__ + r"\] Training took\ *" + r"\d+\.\d{2}s\.", lines[-2]) + assert lines[-1] == '' + + +@pytest.mark.parametrize(('algo_class', 'dataset'), + [(NCA, make_classification()), + (MLKR, make_regression(n_features=10))]) +def test_no_verbose(dataset, algo_class, capsys): + # assert by default there is no output (verbose=False) + X, y = dataset + model = algo_class() + model.fit(X, y) + out, _ = capsys.readouterr() + # check output + assert (out == '') + + +@pytest.mark.parametrize(('algo_class', 'dataset'), + [(NCA, make_classification()), + (MLKR, make_regression(n_features=10))]) +def test_convergence_warning(dataset, algo_class): + X, y = dataset + model = algo_class(max_iter=2, verbose=True) + cls_name = model.__class__.__name__ + msg = '[{}] {} did not converge'.format(cls_name, cls_name) + with pytest.warns(Warning) as raised_warning: + model.fit(X, y) + assert any([msg in str(warn.message) for warn in raised_warning]) + if __name__ == '__main__': unittest.main() diff --git a/test/test_base_metric.py b/test/test_base_metric.py new file mode 100644 index 00000000..b1e71020 --- /dev/null +++ b/test/test_base_metric.py @@ -0,0 +1,304 @@ +from numpy.core.numeric 
def remove_spaces(s):
    """Return *s* with every whitespace character (spaces, tabs,
    newlines) removed, so string comparisons ignore formatting."""
    # str.split() with no separator splits on any whitespace run and
    # drops it, so joining the pieces strips all whitespace.
    return ''.join(s.split())
remove_spaces(f"NCA({merged_kwargs})")) + + def test_lfda(self): + def_kwargs = {'embedding_type': 'weighted', 'k': None, + 'n_components': None, 'preprocessor': None} + nndef_kwargs = {'k': 2} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.LFDA(k=2))), + remove_spaces(f"LFDA({merged_kwargs})")) + + def test_itml(self): + def_kwargs = {'tol': 0.001, 'gamma': 1.0, + 'max_iter': 1000, 'preprocessor': None, + 'prior': 'identity', 'random_state': None, 'verbose': False} + nndef_kwargs = {'gamma': 0.5} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.ITML(gamma=0.5))), + remove_spaces(f"ITML({merged_kwargs})")) + def_kwargs = {'tol': 0.001, 'gamma': 1.0, + 'max_iter': 1000, 'n_constraints': None, + 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'verbose': False} + nndef_kwargs = {'n_constraints': 7} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.ITML_Supervised(n_constraints=7))), + remove_spaces(f"ITML_Supervised({merged_kwargs})")) + + def test_lsml(self): + def_kwargs = {'max_iter': 1000, 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'tol': 0.001, 'verbose': False} + nndef_kwargs = {'tol': 0.1} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.LSML(tol=0.1))), + remove_spaces(f"LSML({merged_kwargs})")) + def_kwargs = {'max_iter': 1000, 'n_constraints': None, + 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'tol': 0.001, 'verbose': False, + 'weights': None} + nndef_kwargs = {'verbose': True} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.LSML_Supervised(verbose=True))), + remove_spaces(f"LSML_Supervised({merged_kwargs})")) + + def test_sdml(self): + def_kwargs = {'balance_param': 0.5, 'preprocessor': 
None, + 'prior': 'identity', 'random_state': None, + 'sparsity_param': 0.01, 'verbose': False} + nndef_kwargs = {'verbose': True} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.SDML(verbose=True))), + remove_spaces(f"SDML({merged_kwargs})")) + def_kwargs = {'balance_param': 0.5, 'n_constraints': None, + 'preprocessor': None, 'prior': 'identity', + 'random_state': None, 'sparsity_param': 0.01, + 'verbose': False} + nndef_kwargs = {'sparsity_param': 0.5} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.SDML_Supervised(sparsity_param=0.5))), + remove_spaces(f"SDML_Supervised({merged_kwargs})")) + + def test_rca(self): + def_kwargs = {'n_components': None, 'preprocessor': None} + nndef_kwargs = {'n_components': 3} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.RCA(n_components=3))), + remove_spaces(f"RCA({merged_kwargs})")) + def_kwargs = {'chunk_size': 2, 'n_components': None, 'n_chunks': 100, + 'preprocessor': None, 'random_state': None} + nndef_kwargs = {'n_chunks': 5} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual( + remove_spaces(str(metric_learn.RCA_Supervised(n_chunks=5))), + remove_spaces(f"RCA_Supervised({merged_kwargs})")) + + def test_mlkr(self): + def_kwargs = {'init': 'auto', 'max_iter': 1000, + 'n_components': None, 'preprocessor': None, + 'random_state': None, 'tol': None, 'verbose': False} + nndef_kwargs = {'max_iter': 777} + merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) + self.assertEqual(remove_spaces(str(metric_learn.MLKR(max_iter=777))), + remove_spaces(f"MLKR({merged_kwargs})")) + + def test_mmc(self): + def_kwargs = {'tol': 0.001, 'diagonal': False, + 'diagonal_c': 1.0, 'init': 'identity', 'max_iter': 100, + 'max_proj': 10000, 'preprocessor': None, + 'random_state': None, 'verbose': False} + nndef_kwargs = {'diagonal': 
@pytest.mark.parametrize('estimator, build_dataset', metric_learners,
                         ids=ids_metric_learners)
def test_get_metric_is_independent_from_metric_learner(estimator,
                                                       build_dataset):
    """The callable returned by get_metric must be a snapshot: refitting the
    learner afterwards must not change distances it computes."""
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)

    # Fit once and capture the metric and a reference distance.
    model.fit(*remove_y(model, input_data, labels))
    metric = model.get_metric()
    first_score = metric(X[0], X[1])

    # Refit on different data; the captured metric must be unaffected.
    model.fit(*remove_y(model, np.sin(input_data), labels))
    assert metric(X[0], X[1]) == first_score
@pytest.mark.parametrize('estimator, build_dataset', metric_learners,
                         ids=ids_metric_learners)
def test_get_metric_works_does_not_raise(estimator, build_dataset):
    """Tests that the metric returned by get_metric does not raise errors (or
    warnings) similarly to the distance functions in scipy.spatial.distance"""
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y(model, input_data, labels))
    metric = model.get_metric()

    def assert_silent(u, v):
        # calling the metric must emit no warning at all
        with warnings.catch_warnings(record=True) as record:
            metric(u, v)
        assert len(record) == 0

    # arrays, plain lists, and 2D one-row inputs must all be accepted
    for u, v in [(X[0], X[1]),
                 (X[0].tolist(), X[1].tolist()),
                 (X[0][None], X[1][None])]:
        assert_silent(u, v)

    # Test that the scalar case works
    model.components_ = np.array([3.1])
    metric = model.get_metric()
    for u, v in [(5, 6.7), ([5], [6.7]), ([[5]], [[6.7]])]:
        assert_silent(u, v)
model.fit(*remove_y(model, input_data, labels)) + assert model.components_.shape == (X.shape[1] - 1, X.shape[1]) + + model = clone(estimator) + set_random_state(model) + model.set_params(n_components=X.shape[1] + 1) + with pytest.raises(ValueError) as expected_err: + model.fit(*remove_y(model, input_data, labels)) + assert (str(expected_err.value) == + 'Invalid n_components, must be in [1, {}]'.format(X.shape[1])) + + model = clone(estimator) + set_random_state(model) + model.set_params(n_components=0) + with pytest.raises(ValueError) as expected_err: + model.fit(*remove_y(model, input_data, labels)) + assert (str(expected_err.value) == + 'Invalid n_components, must be in [1, {}]'.format(X.shape[1])) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_score_pairs_warning(estimator, build_dataset): + """Tests that score_pairs returns a FutureWarning regarding deprecation. + Also that score_pairs and pair_distance have the same behaviour""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + + # We fit the metric learner on it and then we call score_pairs on some + # points + model.fit(*remove_y(model, input_data, labels)) + + msg = ("score_pairs will be deprecated in release 0.7.0. 
" + "Use pair_score to compute similarity scores, or " + "pair_distances to compute distances.") + with pytest.warns(FutureWarning) as raised_warning: + score = model.score_pairs([[X[0], X[1]], ]) + dist = model.pair_distance([[X[0], X[1]], ]) + assert array_equal(score, dist) + assert any([str(warning.message) == msg for warning in raised_warning]) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_components_metric_conversion.py b/test/test_components_metric_conversion.py new file mode 100644 index 00000000..c6113957 --- /dev/null +++ b/test/test_components_metric_conversion.py @@ -0,0 +1,181 @@ +import unittest +import numpy as np +import pytest +from scipy.stats import ortho_group +from sklearn.datasets import load_iris +from numpy.testing import assert_array_almost_equal, assert_allclose +from metric_learn.sklearn_shims import ignore_warnings + +from metric_learn import ( + LMNN, NCA, LFDA, Covariance, MLKR, + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised) +from metric_learn._util import components_from_metric +from metric_learn.exceptions import NonPSDError + + +class TestTransformerMetricConversion(unittest.TestCase): + @classmethod + def setUpClass(self): + # runs once per test class + iris_data = load_iris() + self.X = iris_data['data'] + self.y = iris_data['target'] + + def test_cov(self): + cov = Covariance() + cov.fit(self.X) + L = cov.components_ + assert_array_almost_equal(L.T.dot(L), cov.get_mahalanobis_matrix()) + + def test_lsml_supervised(self): + seed = np.random.RandomState(1234) + lsml = LSML_Supervised(n_constraints=200, random_state=seed) + lsml.fit(self.X, self.y) + L = lsml.components_ + assert_array_almost_equal(L.T.dot(L), lsml.get_mahalanobis_matrix()) + + def test_itml_supervised(self): + seed = np.random.RandomState(1234) + itml = ITML_Supervised(n_constraints=200, random_state=seed) + itml.fit(self.X, self.y) + L = itml.components_ + assert_array_almost_equal(L.T.dot(L), 
itml.get_mahalanobis_matrix()) + + def test_lmnn(self): + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + lmnn.fit(self.X, self.y) + L = lmnn.components_ + assert_array_almost_equal(L.T.dot(L), lmnn.get_mahalanobis_matrix()) + + def test_sdml_supervised(self): + seed = np.random.RandomState(1234) + sdml = SDML_Supervised(n_constraints=1500, prior='identity', + balance_param=1e-5, random_state=seed) + sdml.fit(self.X, self.y) + L = sdml.components_ + assert_array_almost_equal(L.T.dot(L), sdml.get_mahalanobis_matrix()) + + def test_nca(self): + n = self.X.shape[0] + nca = NCA(max_iter=(100000 // n)) + nca.fit(self.X, self.y) + L = nca.components_ + assert_array_almost_equal(L.T.dot(L), nca.get_mahalanobis_matrix()) + + def test_lfda(self): + lfda = LFDA(k=2, n_components=2) + lfda.fit(self.X, self.y) + L = lfda.components_ + assert_array_almost_equal(L.T.dot(L), lfda.get_mahalanobis_matrix()) + + def test_rca_supervised(self): + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2) + rca.fit(self.X, self.y) + L = rca.components_ + assert_array_almost_equal(L.T.dot(L), rca.get_mahalanobis_matrix()) + + def test_mlkr(self): + mlkr = MLKR(n_components=2) + mlkr.fit(self.X, self.y) + L = mlkr.components_ + assert_array_almost_equal(L.T.dot(L), mlkr.get_mahalanobis_matrix()) + + @ignore_warnings + def test_components_from_metric_edge_cases(self): + """Test that components_from_metric returns the right result in various + edge cases""" + rng = np.random.RandomState(42) + + # an orthonormal matrix useful for creating matrices with given + # eigenvalues: + P = ortho_group.rvs(7, random_state=rng) + + # matrix with all its coefficients very low (to check that the algorithm + # does not consider it as a diagonal matrix)(non regression test for + # https://github.com/scikit-learn-contrib/metric-learn/issues/175) + M = np.diag([1e-15, 2e-16, 3e-15, 4e-16, 5e-15, 6e-16, 7e-15]) + M = P.dot(M).dot(P.T) + L = components_from_metric(M) + 
assert_allclose(L.T.dot(L), M) + + # diagonal matrix + M = np.diag(np.abs(rng.randn(5))) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # low-rank matrix (with zeros) + M = np.zeros((7, 7)) + small_random = rng.randn(3, 3) + M[:3, :3] = small_random.T.dot(small_random) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # low-rank matrix (without necessarily zeros) + R = np.abs(rng.randn(7, 7)) + M = R.dot(np.diag([1, 5, 3, 2, 0, 0, 0])).dot(R.T) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # matrix with a determinant still high but which is + # undefinite w.r.t to numpy standards + M = np.diag([1e5, 1e5, 1e5, 1e5, 1e5, 1e5, 1e-20]) + M = P.dot(M).dot(P.T) + assert np.abs(np.linalg.det(M)) > 10 + assert np.linalg.slogdet(M)[1] > 1 # (just to show that the computed + # determinant is far from null) + assert np.linalg.matrix_rank(M) < M.shape[0] + # (just to show that this case is indeed considered by numpy as an + # indefinite case) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # matrix with lots of small nonzeros that make a big zero when multiplied + M = np.diag([1e-3, 1e-3, 1e-3, 1e-3, 1e-3, 1e-3, 1e-3]) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + # full rank matrix + M = rng.randn(10, 10) + M = M.T.dot(M) + assert np.linalg.matrix_rank(M) == 10 + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + def test_non_symmetric_matrix_raises(self): + """Checks that if a non symmetric matrix is given to + components_from_metric, an error is thrown""" + rng = np.random.RandomState(42) + M = rng.randn(10, 10) + with pytest.raises(ValueError) as raised_error: + components_from_metric(M) + assert str(raised_error.value) == "The input metric should be symmetric." + + def test_non_psd_raises(self): + """Checks that a non PSD matrix (i.e. 
with negative eigenvalues) will + raise an error when passed to components_from_metric""" + rng = np.random.RandomState(42) + D = np.diag([1, 5, 3, 4.2, -4, -2, 1]) + P = ortho_group.rvs(7, random_state=rng) + M = P.dot(D).dot(P.T) + msg = ("Matrix is not positive semidefinite (PSD).") + with pytest.raises(NonPSDError) as raised_error: + components_from_metric(M) + assert str(raised_error.value) == msg + with pytest.raises(NonPSDError) as raised_error: + components_from_metric(D) + assert str(raised_error.value) == msg + + def test_almost_psd_dont_raise(self): + """Checks that if the metric is almost PSD (i.e. it has some negative + eigenvalues very close to zero), then components_from_metric will still + work""" + rng = np.random.RandomState(42) + D = np.diag([1, 5, 3, 4.2, -1e-20, -2e-20, -1e-20]) + P = ortho_group.rvs(7, random_state=rng) + M = P.dot(D).dot(P.T) + L = components_from_metric(M) + assert_allclose(L.T.dot(L), M) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_constraints.py b/test/test_constraints.py new file mode 100644 index 00000000..3429d9cc --- /dev/null +++ b/test/test_constraints.py @@ -0,0 +1,188 @@ +import pytest +import numpy as np +from sklearn.utils import shuffle +from metric_learn.constraints import Constraints +from sklearn.datasets import make_blobs + +SEED = 42 + + +def gen_labels_for_chunks(n_chunks, chunk_size, + n_classes=10, n_unknown_labels=5): + """Generates n_chunks*chunk_size labels that split in n_chunks chunks, + that are homogeneous in the label.""" + assert min(n_chunks, chunk_size) > 0 + classes = shuffle(np.arange(n_classes), random_state=SEED) + n_per_class = chunk_size * (n_chunks // n_classes) + n_maj_class = chunk_size * n_chunks - n_per_class * (n_classes - 1) + + first_labels = classes[0] * np.ones(n_maj_class, dtype=int) + remaining_labels = np.concatenate([k * np.ones(n_per_class, dtype=int) + for k in classes[1:]]) + unknown_labels = -1 * np.ones(n_unknown_labels, dtype=int) + + 
@pytest.mark.parametrize("n_chunks, chunk_size", [(5, 10), (10, 50)])
def test_unknown_labels_not_in_chunks(n_chunks, chunk_size):
    """Checks that unknown labels are not assigned to any chunk."""
    labels = gen_labels_for_chunks(n_chunks, chunk_size)
    chunks = Constraints(labels).chunks(n_chunks=n_chunks,
                                        chunk_size=chunk_size,
                                        random_state=SEED)
    # Points labelled -1 (unknown) must keep a negative chunk id.
    assert np.all(chunks[labels < 0] < 0)
@pytest.mark.parametrize("k_genuine, k_impostor,",
                         [(3, 3), (2, 4), (3, 4), (10, 9), (144, 33)])
def test_generate_knntriplets(k_genuine, k_impostor):
    """Checks edge and over the edge cases of knn triplet construction with not
    enough neighbors"""

    # Expected triplets once both k's are clipped to the available neighbors.
    T_test = [[0, 1, 3], [0, 1, 4], [0, 1, 5], [0, 2, 3], [0, 2, 4], [0, 2, 5],
              [1, 0, 3], [1, 0, 4], [1, 0, 5], [1, 2, 3], [1, 2, 4], [1, 2, 5],
              [2, 0, 3], [2, 0, 4], [2, 0, 5], [2, 1, 3], [2, 1, 4], [2, 1, 5],
              [3, 4, 0], [3, 4, 1], [3, 4, 2], [3, 5, 0], [3, 5, 1], [3, 5, 2],
              [4, 3, 0], [4, 3, 1], [4, 3, 2], [4, 5, 0], [4, 5, 1], [4, 5, 2],
              [5, 3, 0], [5, 3, 1], [5, 3, 2], [5, 4, 0], [5, 4, 1], [5, 4, 2]]

    X = np.array([[0, 0], [2, 2], [4, 4], [8, 8], [16, 16], [32, 32],
                  [33, 33]])
    y = np.array([1, 1, 1, 2, 2, 2, -1])

    msg1 = ("The class 1 has 3 elements, which is not sufficient to "
            f"generate {k_genuine+1} genuine neighbors "
            "as specified by k_genuine")
    msg2 = ("The class 2 has 3 elements, which is not sufficient to "
            f"generate {k_genuine+1} genuine neighbors "
            "as specified by k_genuine")
    msg3 = ("The class 1 has 3 elements of other classes, which is "
            f"not sufficient to generate {k_impostor} impostor "
            "neighbors as specified by k_impostor")
    msg4 = ("The class 2 has 3 elements of other classes, which is "
            f"not sufficient to generate {k_impostor} impostor "
            "neighbors as specified by k_impostor")
    msgs = [msg1, msg2, msg3, msg4]

    with pytest.warns(UserWarning) as user_warning:
        T = Constraints(y).generate_knntriplets(X, k_genuine, k_impostor)
    # BUG FIX: the original asserted `any([[msg in ... for msg in msgs]
    # for warn in user_warning])`, i.e. the truthiness of non-empty inner
    # *lists* — vacuously True whenever any warning was raised. Flatten the
    # generator so we really check that some warning carries an expected
    # message.
    assert any(msg in str(warn.message)
               for warn in user_warning for msg in msgs)
    assert np.array_equal(sorted(T.tolist()), T_test)
+ "\n" + .format(label[idx], k_genuine, k_genuine+1, k_genuine-1)) + + with pytest.warns(UserWarning) as raised_warning: + Constraints(y).generate_knntriplets(X, k_genuine, 1) + for warn in raised_warning: + assert str(warn.message) in warn_msgs + + +def test_generate_knntriplets_k_impostor(): + """Checks the correct error raised when k_impostor is too big """ + X, y = shuffle(*make_blobs(random_state=SEED), + random_state=SEED) + + length = len(y) + label, labels_count = np.unique(y, return_counts=True) + labels_count_max = np.max(labels_count) + idx_biggest_label, = np.where(labels_count == labels_count_max) + k_impostor = length - labels_count_max + 1 + + warn_msgs = [] + for idx in idx_biggest_label: + warn_msgs.append("The class {} has {} elements of other classes, which is" + " not sufficient to generate {} impostor neighbors as " + "specified by k_impostor. Will generate {} impostor " + "neighbors instead.\n" + .format(label[idx], k_impostor-1, k_impostor, + k_impostor-1)) + + with pytest.warns(UserWarning) as raised_warning: + Constraints(y).generate_knntriplets(X, 1, k_impostor) + for warn in raised_warning: + assert str(warn.message) in warn_msgs diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py new file mode 100644 index 00000000..246223b0 --- /dev/null +++ b/test/test_fit_transform.py @@ -0,0 +1,138 @@ +import unittest +import numpy as np +from sklearn.datasets import load_iris +from numpy.testing import assert_array_almost_equal + +from metric_learn import ( + LMNN, NCA, LFDA, Covariance, MLKR, + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, + MMC_Supervised) + + +class TestFitTransform(unittest.TestCase): + @classmethod + def setUpClass(self): + # runs once per test class + iris_data = load_iris() + self.X = iris_data['data'] + self.y = iris_data['target'] + + def test_cov(self): + cov = Covariance() + cov.fit(self.X) + res_1 = cov.transform(self.X) + + cov = Covariance() + res_2 = cov.fit_transform(self.X) + # 
deterministic result + assert_array_almost_equal(res_1, res_2) + + def test_lsml_supervised(self): + seed = np.random.RandomState(1234) + lsml = LSML_Supervised(n_constraints=200, random_state=seed) + lsml.fit(self.X, self.y) + res_1 = lsml.transform(self.X) + + seed = np.random.RandomState(1234) + lsml = LSML_Supervised(n_constraints=200, random_state=seed) + res_2 = lsml.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_itml_supervised(self): + seed = np.random.RandomState(1234) + itml = ITML_Supervised(n_constraints=200, random_state=seed) + itml.fit(self.X, self.y) + res_1 = itml.transform(self.X) + + seed = np.random.RandomState(1234) + itml = ITML_Supervised(n_constraints=200, random_state=seed) + res_2 = itml.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_lmnn(self): + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + lmnn.fit(self.X, self.y) + res_1 = lmnn.transform(self.X) + + lmnn = LMNN(n_neighbors=5, learn_rate=1e-6, verbose=False) + res_2 = lmnn.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_sdml_supervised(self): + seed = np.random.RandomState(1234) + sdml = SDML_Supervised(n_constraints=1500, balance_param=1e-5, + prior='identity', random_state=seed) + sdml.fit(self.X, self.y) + res_1 = sdml.transform(self.X) + + seed = np.random.RandomState(1234) + sdml = SDML_Supervised(n_constraints=1500, balance_param=1e-5, + prior='identity', random_state=seed) + res_2 = sdml.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_nca(self): + n = self.X.shape[0] + nca = NCA(max_iter=(100000 // n)) + nca.fit(self.X, self.y) + res_1 = nca.transform(self.X) + + nca = NCA(max_iter=(100000 // n)) + res_2 = nca.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_lfda(self): + lfda = LFDA(k=2, n_components=2) + lfda.fit(self.X, self.y) + res_1 = lfda.transform(self.X) + + lfda 
= LFDA(k=2, n_components=2) + res_2 = lfda.fit_transform(self.X, self.y) + + # signs may be flipped, that's okay + assert_array_almost_equal(abs(res_1), abs(res_2)) + + def test_rca_supervised(self): + seed = np.random.RandomState(1234) + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2, + random_state=seed) + rca.fit(self.X, self.y) + res_1 = rca.transform(self.X) + + seed = np.random.RandomState(1234) + rca = RCA_Supervised(n_components=2, n_chunks=30, chunk_size=2, + random_state=seed) + res_2 = rca.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_mlkr(self): + mlkr = MLKR(n_components=2) + mlkr.fit(self.X, self.y) + res_1 = mlkr.transform(self.X) + + mlkr = MLKR(n_components=2) + res_2 = mlkr.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + def test_mmc_supervised(self): + seed = np.random.RandomState(1234) + mmc = MMC_Supervised(n_constraints=200, random_state=seed) + mmc.fit(self.X, self.y) + res_1 = mmc.transform(self.X) + + seed = np.random.RandomState(1234) + mmc = MMC_Supervised(n_constraints=200, random_state=seed) + res_2 = mmc.fit_transform(self.X, self.y) + + assert_array_almost_equal(res_1, res_2) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py new file mode 100644 index 00000000..9378ac60 --- /dev/null +++ b/test/test_mahalanobis_mixin.py @@ -0,0 +1,756 @@ +from itertools import product + +import pytest +import numpy as np +from numpy.linalg import LinAlgError +from numpy.testing import assert_array_almost_equal, assert_allclose, \ + assert_array_equal +from scipy.spatial.distance import pdist, squareform, mahalanobis +from scipy.stats import ortho_group +from sklearn import clone +from sklearn.cluster import DBSCAN +from sklearn.datasets import make_spd_matrix, make_blobs +from sklearn.utils import check_random_state, shuffle +from sklearn.utils.multiclass import type_of_target +from 
@pytest.mark.parametrize('estimator, build_dataset', metric_learners,
                         ids=ids_metric_learners)
def test_pair_distance_pair_score_equivalent(estimator, build_dataset):
    """
    For Mahalanobis learners, pair_score should be equivalent to the
    opposite of the pair_distance result.
    """
    input_data, labels, _, X = build_dataset()
    X = X[:20]
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y(estimator, input_data, labels))

    # score every ordered pair once and compare both APIs on it
    all_pairs = np.array(list(product(X, X)))
    assert_array_equal(model.pair_distance(all_pairs),
                       -1 * model.pair_score(all_pairs))
@pytest.mark.parametrize('estimator, build_dataset', metric_learners,
                         ids=ids_metric_learners)
def test_pair_distance_toy_example(estimator, build_dataset):
    """pair_distance must match Euclidean distances computed by hand in the
    learned embedding space."""
    input_data, labels, _, X = build_dataset()
    X = X[:20]
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y(estimator, input_data, labels))

    # pair the first ten points with the next ten and embed both sides
    pairs = np.stack([X[:10], X[10:20]], axis=1)
    embedded = pairs.dot(model.components_.T)
    expected = np.sqrt(np.sum((embedded[:, 1] - embedded[:, 0]) ** 2,
                              axis=-1))
    assert_array_almost_equal(model.pair_distance(pairs), expected)
and scoring of 2D arrays (one tuple) should return an error (like + # scikit-learn's error when scoring 1D arrays) + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + tuples = np.array(list(product(X, X))) + assert model.pair_distance(tuples).shape == (tuples.shape[0],) + context = make_context(estimator) + msg = ("3D array of formed tuples expected{}. Found 2D array " + "instead:\ninput={}. Reshape your data and/or use a preprocessor.\n" + .format(context, tuples[1])) + with pytest.raises(ValueError) as raised_error: + model.pair_distance(tuples[1]) + assert str(raised_error.value) == msg + + +def check_is_distance_matrix(pairwise): + assert (pairwise >= 0).all() # positivity + assert np.array_equal(pairwise, pairwise.T) # symmetry + assert (pairwise.diagonal() == 0).all() # identity + # triangular inequality + tol = 1e-12 + assert (pairwise <= pairwise[:, :, np.newaxis] + + pairwise[:, np.newaxis, :] + tol).all() + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_embed_toy_example(estimator, build_dataset): + # Checks that embed works on a toy example + input_data, labels, _, X = build_dataset() + n_samples = 20 + X = X[:n_samples] + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + embedded_points = X.dot(model.components_.T) + assert_array_almost_equal(model.transform(X), embedded_points) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_embed_dim(estimator, build_dataset): + # Checks that the the dimension of the output space is as expected + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + assert model.transform(X).shape == X.shape + + # assert that ValueError is thrown if input 
shape is 1D + context = make_context(estimator) + err_msg = ("2D array of formed points expected{}. Found 1D array " + "instead:\ninput={}. Reshape your data and/or use a " + "preprocessor.\n".format(context, X[0])) + with pytest.raises(ValueError) as raised_error: + model.pair_distance(model.transform(X[0, :])) + assert str(raised_error.value) == err_msg + # we test that the shape is also OK when doing dimensionality reduction + if hasattr(model, 'n_components'): + model.set_params(n_components=2) + model.fit(*remove_y(estimator, input_data, labels)) + assert model.transform(X).shape == (X.shape[0], 2) + # assert that ValueError is thrown if input shape is 1D + with pytest.raises(ValueError) as raised_error: + model.transform(model.transform(X[0, :])) + assert str(raised_error.value) == err_msg + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_embed_finite(estimator, build_dataset): + # Checks that embed returns vectors with finite values + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + assert np.isfinite(model.transform(X)).all() + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_embed_is_linear(estimator, build_dataset): + # Checks that the embedding is linear + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + assert_array_almost_equal(model.transform(X[:10] + X[10:20]), + model.transform(X[:10]) + + model.transform(X[10:20])) + assert_array_almost_equal(model.transform(5 * X[:10]), + 5 * model.transform(X[:10])) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_equivalent_to_explicit_mahalanobis(estimator, + build_dataset): + """Tests that using the get_metric 
method of mahalanobis metric learners is + equivalent to explicitely calling scipy's mahalanobis metric + """ + rng = np.random.RandomState(42) + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + metric = model.get_metric() + n_features = X.shape[1] + a, b = (rng.randn(n_features), rng.randn(n_features)) + expected_dist = mahalanobis(a, b, VI=model.get_mahalanobis_matrix()) + assert_allclose(metric(a, b), expected_dist, rtol=1e-13) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_is_pseudo_metric(estimator, build_dataset): + """Tests that the get_metric method of mahalanobis metric learners returns a + pseudo-metric (metric but without one side of the equivalence of + the identity of indiscernables property) + """ + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + metric = model.get_metric() + + n_features = X.shape[1] + for seed in range(10): + rng = np.random.RandomState(seed) + a, b, c = (rng.randn(n_features) for _ in range(3)) + assert metric(a, b) >= 0 # positivity + assert metric(a, b) == metric(b, a) # symmetry + # one side of identity indiscernables: x == y => d(x, y) == 0. The other + # side of the equivalence is not always true for Mahalanobis distances. 
+ assert metric(a, a) == 0 + # triangular inequality + assert (metric(a, c) < metric(a, b) + metric(b, c) or + np.isclose(metric(a, c), metric(a, b) + metric(b, c), rtol=1e-20)) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_compatible_with_scikit_learn(estimator, build_dataset): + """Check that the metric returned by get_metric is compatible with + scikit-learn's algorithms using a custom metric, DBSCAN for instance""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + clustering = DBSCAN(metric=model.get_metric()) + clustering.fit(X) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_squared_metric(estimator, build_dataset): + """Test that the squared metric returned is indeed the square of the + metric""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(*remove_y(estimator, input_data, labels)) + metric = model.get_metric() + + n_features = X.shape[1] + for seed in range(10): + rng = np.random.RandomState(seed) + a, b = (rng.randn(n_features) for _ in range(2)) + assert_allclose(metric(a, b, squared=True), + metric(a, b, squared=False)**2, + rtol=1e-15) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_components_is_2D(estimator, build_dataset): + """Tests that the transformation matrix of metric learners is 2D""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + # test that it works for X.shape[1] features + model.fit(*remove_y(estimator, input_data, labels)) + assert model.components_.shape == (X.shape[1], X.shape[1]) + + if isinstance(estimator, _BaseSDML): + # SDML doesn't support running on a single feature. 
+ return + + # test that it works for 1 feature. Use 2nd dimension, to avoid border cases + trunc_data = input_data[..., 1:2] + # we drop duplicates that might have been formed, i.e. of the form + # aabc or abcc or aabb for quadruplets, and aa for pairs. + + if isinstance(estimator, _QuadrupletsClassifierMixin): + pairs_idx = [[0, 1], [2, 3]] + elif isinstance(estimator, _TripletsClassifierMixin): + pairs_idx = [[0, 1], [0, 2]] + elif isinstance(estimator, _PairsClassifierMixin): + pairs_idx = [[0, 1]] + else: + pairs_idx = [] + + for pair_idx in pairs_idx: + pairs = trunc_data[:, pair_idx, :] + diffs = pairs[:, 1, :] - pairs[:, 0, :] + to_keep = np.abs(diffs.ravel()) > 1e-9 + trunc_data = trunc_data[to_keep] + labels = labels[to_keep] + + model.fit(*remove_y(estimator, trunc_data, labels)) + assert model.components_.shape == (1, 1) # the components must be 2D + + +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if hasattr(ml, 'n_components') and + hasattr(ml, 'init')], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if hasattr(ml, 'n_components') and + hasattr(ml, 'init')]) +def test_init_transformation(estimator, build_dataset): + input_data, labels, _, X = build_dataset() + is_classification = (type_of_target(labels) in ['multiclass', 'binary']) + model = clone(estimator) + rng = np.random.RandomState(42) + + # Start learning from scratch + model.set_params(init='identity') + model.fit(input_data, labels) + + # Initialize with random + model.set_params(init='random') + model.fit(input_data, labels) + + # Initialize with auto + model.set_params(init='auto') + model.fit(input_data, labels) + + # Initialize with PCA + model.set_params(init='pca') + model.fit(input_data, labels) + + # Initialize with LDA + if is_classification: + model.set_params(init='lda') + model.fit(input_data, labels) + + # Initialize with a numpy array + init = 
rng.rand(X.shape[1], X.shape[1]) + model.set_params(init=init) + model.fit(input_data, labels) + + # init.shape[1] must match X.shape[1] + init = rng.rand(X.shape[1], X.shape[1] + 1) + model.set_params(init=init) + msg = ('The input dimensionality ({}) of the given ' + 'linear transformation `init` must match the ' + 'dimensionality of the given inputs `X` ({}).' + .format(init.shape[1], X.shape[1])) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # init.shape[0] must be <= init.shape[1] + init = rng.rand(X.shape[1] + 1, X.shape[1]) + model.set_params(init=init) + msg = ('The output dimensionality ({}) of the given ' + 'linear transformation `init` cannot be ' + 'greater than its input dimensionality ({}).' + .format(init.shape[0], init.shape[1])) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # init.shape[0] must match n_components + init = rng.rand(X.shape[1], X.shape[1]) + n_components = X.shape[1] - 1 + model.set_params(init=init, n_components=n_components) + msg = ('The preferred dimensionality of the ' + 'projected space `n_components` ({}) does not match ' + 'the output dimensionality of the given ' + 'linear transformation `init` ({})!' + .format(n_components, init.shape[0])) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # init must be as specified in the docstring + model.set_params(init=1) + msg = ("`init` must be 'auto', 'pca', 'identity', " + "'random'{} or a numpy array of shape " + "(n_components, n_features)." 
+ .format(", 'lda'" if is_classification else '')) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('n_samples', [3, 5, 7, 11]) +@pytest.mark.parametrize('n_features', [3, 5, 7, 11]) +@pytest.mark.parametrize('n_classes', [5, 7, 11]) +@pytest.mark.parametrize('n_components', [3, 5, 7, 11]) +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if hasattr(ml, 'n_components') and + hasattr(ml, 'init')], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if hasattr(ml, 'n_components') and + hasattr(ml, 'init')]) +def test_auto_init_transformation(n_samples, n_features, n_classes, + n_components, estimator, build_dataset): + # Test that auto choose the init transformation as expected with every + # configuration of order of n_samples, n_features, n_classes and + # n_components, for all metric learners that learn a transformation. 
+ if n_classes >= n_samples: + pass + # n_classes > n_samples is impossible, and n_classes == n_samples + # throws an error from lda but is an absurd case + else: + input_data, labels, _, X = build_dataset() + model_base = clone(estimator) + rng = np.random.RandomState(42) + model_base.set_params(init='auto', + n_components=n_components, + random_state=rng) + # To make the test work for LMNN: + if 'LMNN' in model_base.__class__.__name__: + model_base.set_params(n_neighbors=1) + # To make the test faster for estimators that have a max_iter: + if hasattr(model_base, 'max_iter'): + model_base.set_params(max_iter=1) + if n_components > n_features: + # this would return a ValueError, which is tested in + # test_init_transformation + pass + else: + # We need to build a dataset of the right shape: + num_to_pad_n_samples = ((n_samples // input_data.shape[0] + 1)) + num_to_pad_n_features = ((n_features // input_data.shape[-1] + 1)) + if input_data.ndim == 3: + input_data = np.tile(input_data, + (num_to_pad_n_samples, input_data.shape[1], + num_to_pad_n_features)) + else: + input_data = np.tile(input_data, + (num_to_pad_n_samples, num_to_pad_n_features)) + input_data = input_data[:n_samples, ..., :n_features] + assert input_data.shape[0] == n_samples + assert input_data.shape[-1] == n_features + has_classes = model_base.__class__.__name__ in ids_classifiers + if has_classes: + labels = np.tile(range(n_classes), n_samples // + n_classes + 1)[:n_samples] + else: + labels = np.tile(labels, n_samples // labels.shape[0] + 1)[:n_samples] + model = clone(model_base) + model.fit(input_data, labels) + if n_components <= min(n_classes - 1, n_features) and has_classes: + model_other = clone(model_base).set_params(init='lda') + elif n_components < min(n_features, n_samples): + model_other = clone(model_base).set_params(init='pca') + else: + model_other = clone(model_base).set_params(init='identity') + model_other.fit(input_data, labels) + assert_array_almost_equal(model.components_, + 
model_other.components_) + + +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if not hasattr(ml, 'n_components') and + hasattr(ml, 'init')], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if not hasattr(ml, 'n_components') and + hasattr(ml, 'init')]) +def test_init_mahalanobis(estimator, build_dataset): + """Tests that for estimators that learn a mahalanobis matrix + instead of a linear transformation, i.e. those that are mahalanobis metric + learners + where we can change the init, but not choose the n_components, + (TODO: be more explicit on this characterization, for instance with + safe_flags like in scikit-learn) that the init has an expected behaviour. + """ + input_data, labels, _, X = build_dataset() + + matrices_to_set = [] + if hasattr(estimator, 'init'): + matrices_to_set.append('init') + if hasattr(estimator, 'prior'): + matrices_to_set.append('prior') + + for param in matrices_to_set: + model = clone(estimator) + set_random_state(model) + rng = np.random.RandomState(42) + + # Start learning from scratch + model.set_params(**{param: 'identity'}) + model.fit(input_data, labels) + + # Initialize with random + model.set_params(**{param: 'random'}) + model.fit(input_data, labels) + + # Initialize with covariance + model.set_params(**{param: 'covariance'}) + model.fit(input_data, labels) + + # Initialize with a random spd matrix + init = make_spd_matrix(n_dim=X.shape[1], random_state=rng) + model.set_params(**{param: init}) + model.fit(input_data, labels) + + # init.shape[1] must match X.shape[1] + init = make_spd_matrix(n_dim=X.shape[1] + 1, random_state=rng) + model.set_params(**{param: init}) + msg = ('The input dimensionality {} of the given ' + 'mahalanobis matrix `{}` must match the ' + 'dimensionality of the given inputs ({}).' 
+ .format(init.shape, param, input_data.shape[-1])) + + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # The input matrix must be symmetric + init = rng.rand(X.shape[1], X.shape[1]) + model.set_params(**{param: init}) + msg = ("`{}` is not symmetric.".format(param)) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + # The input matrix must be SPD + P = ortho_group.rvs(X.shape[1], random_state=rng) + w = np.abs(rng.randn(X.shape[1])) + w[0] = -10. + M = P.dot(np.diag(w)).dot(P.T) + model.set_params(**{param: M}) + msg = ("Matrix is not positive semidefinite (PSD).") + with pytest.raises(NonPSDError) as raised_err: + model.fit(input_data, labels) + assert str(raised_err.value) == msg + + # init must be as specified in the docstring + model.set_params(**{param: 1}) + msg = ("`{}` must be 'identity', 'covariance', " + "'random' or a numpy array of shape " + "(n_features, n_features).".format(param)) + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, labels) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if idml[:4] in ['ITML', 'SDML', 'LSML']], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if idml[:4] in ['ITML', 'SDML', 'LSML']]) +def test_singular_covariance_init_or_prior_strictpd(estimator, build_dataset): + """Tests that when using the 'covariance' init or prior, it returns the + appropriate error if the covariance matrix is singular, for algorithms + that need a strictly PD prior or init (see + https://github.com/scikit-learn-contrib/metric-learn/issues/202 and + https://github.com/scikit-learn-contrib/metric-learn/pull/195#issuecomment + -492332451) + """ + matrices_to_set = [] + if hasattr(estimator, 'init'): + 
matrices_to_set.append('init') + if hasattr(estimator, 'prior'): + matrices_to_set.append('prior') + + input_data, labels, _, X = build_dataset() + for param in matrices_to_set: + model = clone(estimator) + set_random_state(model) + # We create a feature that is a linear combination of the first two + # features: + input_data = np.concatenate([input_data, input_data[:, ..., :2] + .dot([[2], [3]])], + axis=-1) + model.set_params(**{param: 'covariance'}) + msg = ("Unable to get a true inverse of the covariance " + "matrix since it is not definite. Try another " + "`{}`, or an algorithm that does not " + "require the `{}` to be strictly positive definite." + .format(param, param)) + with pytest.raises(LinAlgError) as raised_err: + model.fit(input_data, labels) + assert str(raised_err.value) == msg + + +@pytest.mark.integration +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if idml[:3] in ['MMC']], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if idml[:3] in ['MMC']]) +def test_singular_covariance_init_of_non_strict_pd(estimator, build_dataset): + """Tests that when using the 'covariance' init or prior, it returns the + appropriate warning if the covariance matrix is singular, for algorithms + that don't need a strictly PD init. Also checks that the returned + inverse matrix has finite values + """ + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + # We create a feature that is a linear combination of the first two + # features: + input_data = np.concatenate([input_data, input_data[:, ..., :2].dot([[2], + [3]])], + axis=-1) + model.set_params(init='covariance') + msg = ('The covariance matrix is not invertible: ' + 'using the pseudo-inverse instead.' 
+ 'To make the covariance matrix invertible' + ' you can remove any linearly dependent features and/or ' + 'reduce the dimensionality of your input, ' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + with pytest.warns(UserWarning) as raised_warning: + model.fit(input_data, labels) + assert any([str(warning.message) == msg for warning in raised_warning]) + M, _ = _initialize_metric_mahalanobis(X, init='covariance', + random_state=RNG, + return_inverse=True, + strict_pd=False) + assert np.isfinite(M).all() + + +@pytest.mark.integration +@pytest.mark.parametrize('estimator, build_dataset', + [(ml, bd) for idml, (ml, bd) + in zip(ids_metric_learners, + metric_learners) + if idml[:4] in ['ITML', 'SDML', 'LSML']], + ids=[idml for idml, (ml, _) + in zip(ids_metric_learners, + metric_learners) + if idml[:4] in ['ITML', 'SDML', 'LSML']]) +@pytest.mark.parametrize('w0', [1e-20, 0., -1e-20]) +def test_singular_array_init_or_prior_strictpd(estimator, build_dataset, w0): + """Tests that when using a custom array init (or prior), it returns the + appropriate error if it is singular, for algorithms + that need a strictly PD prior or init (see + https://github.com/scikit-learn-contrib/metric-learn/issues/202 and + https://github.com/scikit-learn-contrib/metric-learn/pull/195#issuecomment + -492332451) + """ + matrices_to_set = [] + if hasattr(estimator, 'init'): + matrices_to_set.append('init') + if hasattr(estimator, 'prior'): + matrices_to_set.append('prior') + + rng = np.random.RandomState(42) + input_data, labels, _, X = build_dataset() + for param in matrices_to_set: + model = clone(estimator) + set_random_state(model) + + P = ortho_group.rvs(X.shape[1], random_state=rng) + w = np.abs(rng.randn(X.shape[1])) + w[0] = w0 + M = P.dot(np.diag(w)).dot(P.T) + if hasattr(model, 'init'): + model.set_params(init=M) + if hasattr(model, 'prior'): + model.set_params(prior=M) + if not hasattr(model, 'prior') and not hasattr(model, 'init'): + raise 
RuntimeError("Neither prior or init could be set in the model.") + msg = ("You should provide a strictly positive definite " + "matrix as `{}`. This one is not definite. Try another" + " {}, or an algorithm that does not " + "require the {} to be strictly positive definite." + .format(*(param,) * 3)) + with pytest.raises(LinAlgError) as raised_err: + model.fit(input_data, labels) + assert str(raised_err.value) == msg + + +@pytest.mark.parametrize('w0', [1e-20, 0., -1e-20]) +def test_singular_array_init_of_non_strict_pd(w0): + """Tests that when using a custom array init, it returns the + appropriate warning if it is singular. Also checks if the returned + inverse matrix is finite. This isn't checked for model fitting as no + model curently uses this setting. + """ + rng = np.random.RandomState(42) + X, y = shuffle(*make_blobs(random_state=rng), + random_state=rng) + P = ortho_group.rvs(X.shape[1], random_state=rng) + w = np.abs(rng.randn(X.shape[1])) + w[0] = w0 + M = P.dot(np.diag(w)).dot(P.T) + msg = ('The initialization matrix is not invertible: ' + 'using the pseudo-inverse instead.') + with pytest.warns(UserWarning) as raised_warning: + _, M_inv = _initialize_metric_mahalanobis(X, init=M, + random_state=rng, + return_inverse=True, + strict_pd=False) + assert str(raised_warning[0].message) == msg + assert np.isfinite(M_inv).all() + + +@pytest.mark.integration +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_deterministic_initialization(estimator, build_dataset): + """Test that estimators that have a prior or an init are deterministic + when it is set to to random and when the random_state is fixed.""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + if hasattr(estimator, 'init'): + model.set_params(init='random') + if hasattr(estimator, 'prior'): + model.set_params(prior='random') + model1 = clone(model) + set_random_state(model1, 42) + model1 = model1.fit(*remove_y(model, 
input_data, labels)) + model2 = clone(model) + set_random_state(model2, 42) + model2 = model2.fit(*remove_y(model, input_data, labels)) + np.testing.assert_allclose(model1.get_mahalanobis_matrix(), + model2.get_mahalanobis_matrix()) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py new file mode 100644 index 00000000..bfedefea --- /dev/null +++ b/test/test_pairs_classifiers.py @@ -0,0 +1,574 @@ +from functools import partial + +import warnings +import pytest +from numpy.testing import assert_array_equal +from scipy.spatial.distance import euclidean + +from metric_learn.base_metric import _PairsClassifierMixin, MahalanobisMixin +from sklearn.exceptions import NotFittedError +from sklearn.metrics import (f1_score, accuracy_score, fbeta_score, + precision_score) +from sklearn.model_selection import train_test_split + +from test.test_utils import pairs_learners, ids_pairs_learners +from metric_learn.sklearn_shims import set_random_state +from sklearn import clone +import numpy as np +from itertools import product + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_predict_only_one_or_minus_one(estimator, build_dataset, + with_preprocessor): + """Test that all predicted values are either +1 or -1""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, + labels) + estimator.fit(pairs_train, y_train) + predictions = estimator.predict(pairs_test) + not_valid = [e for e in predictions if e not in [-1, 1]] + assert len(not_valid) == 0 + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def 
test_predict_monotonous(estimator, build_dataset, + with_preprocessor): + """Test that there is a threshold distance separating points labeled as + similar and points labeled as dissimilar """ + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, + labels) + estimator.fit(pairs_train, y_train) + scores = estimator.pair_score(pairs_test) + predictions = estimator.predict(pairs_test) + max_dissimilar = np.max(scores[predictions == -1]) + min_similar = np.min(scores[predictions == 1]) + assert max_dissimilar <= min_similar + separator = np.mean([max_dissimilar, min_similar]) + assert (predictions[scores < separator] == -1).all() + assert (predictions[scores > separator] == 1).all() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, + with_preprocessor): + """Test that a NotFittedError is raised if someone tries to use + pair_score, score_pairs, decision_function, get_metric, transform or + get_mahalanobis_matrix on input data and the metric learner + has not been fitted.""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + with pytest.raises(NotFittedError): # Remove in 0.8.0 + estimator.score_pairs(input_data) + with pytest.raises(NotFittedError): + estimator.pair_score(input_data) + with pytest.raises(NotFittedError): + estimator.decision_function(input_data) + with pytest.raises(NotFittedError): + estimator.get_metric() + with pytest.raises(NotFittedError): + estimator.transform(input_data) + with pytest.raises(NotFittedError): + 
estimator.get_mahalanobis_matrix() + with pytest.raises(NotFittedError): + estimator.calibrate_threshold(input_data, labels) + + with pytest.raises(NotFittedError): + estimator.set_threshold(0.5) + with pytest.raises(NotFittedError): + estimator.predict(input_data) + + +@pytest.mark.parametrize('calibration_params', + [None, {}, dict(), {'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [0., 0.1, 0.2, 1., 5.]] + ) +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_fit_with_valid_threshold_params(estimator, build_dataset, + with_preprocessor, + calibration_params): + """Tests that fitting `calibration_params` with appropriate parameters works + as expected""" + pairs, y, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + estimator.fit(pairs, y, calibration_params=calibration_params) + estimator.predict(pairs) + + +@pytest.mark.parametrize('kwargs', + [{'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [0., 0.1, 0.2, 1., 5.]] + ) +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_threshold_different_scores_is_finite(estimator, build_dataset, + with_preprocessor, kwargs): + # test that calibrating the threshold works for every metric learner + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + 
set_random_state(estimator) + estimator.fit(input_data, labels) + with warnings.catch_warnings(record=True) as record: + estimator.calibrate_threshold(input_data, labels, **kwargs) + assert len(record) == 0 + + +class IdentityPairsClassifier(MahalanobisMixin, _PairsClassifierMixin): + """A simple pairs classifier for testing purposes, that will just have + identity as components_, and a string threshold so that it returns an + error if not explicitely set. + """ + def fit(self, pairs, y): + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') + self.components_ = np.atleast_2d(np.identity(pairs.shape[2])) + # self.threshold_ is not set. + return self + + +def test_unset_threshold(): + """Tests that the "threshold is unset" error is raised when using predict + (performs binary classification on pairs) with an unset threshold.""" + identity_pairs_classifier = IdentityPairsClassifier() + pairs = np.array([[[0.], [1.]], [[1.], [3.]], [[2.], [5.]], [[3.], [7.]]]) + y = np.array([1, 1, -1, -1]) + identity_pairs_classifier.fit(pairs, y) + with pytest.raises(AttributeError) as e: + identity_pairs_classifier.predict(pairs) + + expected_msg = ("A threshold for this estimator has not been set, " + "call its set_threshold or calibrate_threshold method.") + + assert str(e.value) == expected_msg + + +def test_set_threshold(): + # test that set_threshold indeed sets the threshold + identity_pairs_classifier = IdentityPairsClassifier() + pairs = np.array([[[0.], [1.]], [[1.], [3.]], [[2.], [5.]], [[3.], [7.]]]) + y = np.array([1, 1, -1, -1]) + identity_pairs_classifier.fit(pairs, y) + identity_pairs_classifier.set_threshold(0.5) + assert identity_pairs_classifier.threshold_ == 0.5 + + +@pytest.mark.parametrize('value', ["ABC", None, [1, 2, 3], {'key': None}, + (1, 2), set(), + np.array([[[0.], [1.]], [[1.], [3.]]])]) +def test_set_wrong_type_threshold(value): + """ + Test that `set_threshold` indeed sets the threshold + and cannot accept nothing but float or 
integers, but + being permissive with boolean True=1.0 and False=0.0 + """ + model = IdentityPairsClassifier() + model.fit(np.array([[[0.], [1.]]]), np.array([1])) + msg = ('Parameter threshold must be a real number. ' + 'Got {} instead.'.format(type(value))) + + with pytest.raises(ValueError) as e: # String + model.set_threshold(value) + assert str(e.value).startswith(msg) + + +def test_f_beta_1_is_f_1(): + # test that putting beta to 1 indeed finds the best threshold to optimize + # the f1_score + rng = np.random.RandomState(42) + n_samples = 100 + pairs, y = rng.randn(n_samples, 2, 5), rng.choice([-1, 1], size=n_samples) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + pairs_learner.calibrate_threshold(pairs, y, strategy='f_beta', beta=1) + best_f1_score = f1_score(y, pairs_learner.predict(pairs)) + for threshold in - pairs_learner.decision_function(pairs): + pairs_learner.set_threshold(threshold) + assert f1_score(y, pairs_learner.predict(pairs)) <= best_f1_score + + +def true_pos_true_neg_rates(y_true, y_pred): + """A function that returns the true positive rates and the true negatives + rate. For testing purposes (optimized for readability not performance).""" + assert y_pred.shape[0] == y_true.shape[0] + tp = np.sum((y_pred == 1) * (y_true == 1)) + tn = np.sum((y_pred == -1) * (y_true == -1)) + fn = np.sum((y_pred == -1) * (y_true == 1)) + fp = np.sum((y_pred == 1) * (y_true == -1)) + tpr = tp / (tp + fn) + tnr = tn / (tn + fp) + tpr = tpr if not np.isnan(tpr) else 0. + tnr = tnr if not np.isnan(tnr) else 0. + return tpr, tnr + + +def tpr_threshold(y_true, y_pred, tnr_threshold=0.): + """A function that returns the true positive rate if the true negative + rate is higher or equal than `threshold`, and -1 otherwise. 
For testing + purposes""" + tpr, tnr = true_pos_true_neg_rates(y_true, y_pred) + if tnr < tnr_threshold: + return -1 + else: + return tpr + + +def tnr_threshold(y_true, y_pred, tpr_threshold=0.): + """A function that returns the true negative rate if the true positive + rate is higher or equal than `threshold`, and -1 otherwise. For testing + purposes""" + tpr, tnr = true_pos_true_neg_rates(y_true, y_pred) + if tpr < tpr_threshold: + return -1 + else: + return tnr + + +@pytest.mark.parametrize('kwargs, scoring', + [({'strategy': 'accuracy'}, accuracy_score)] + + [({'strategy': 'f_beta', 'beta': b}, + partial(fbeta_score, beta=b)) + for b in [0.1, 0.5, 1.]] + + [({'strategy': 'f_beta', 'beta': 0}, + precision_score)] + + [({'strategy': 'max_tpr', 'min_rate': t}, + partial(tpr_threshold, tnr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]] + + [({'strategy': 'max_tnr', 'min_rate': t}, + partial(tnr_threshold, tpr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]], + ) +def test_found_score_is_best_score(kwargs, scoring): + # test that when we use calibrate threshold, it will indeed be the + # threshold that have the best score + rng = np.random.RandomState(42) + n_samples = 50 + pairs, y = rng.randn(n_samples, 2, 5), rng.choice([-1, 1], size=n_samples) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + pairs_learner.calibrate_threshold(pairs, y, **kwargs) + best_score = scoring(y, pairs_learner.predict(pairs)) + scores = [] + predicted_scores = pairs_learner.decision_function(pairs) + predicted_scores = np.hstack([[np.min(predicted_scores) - 1], + predicted_scores, + [np.max(predicted_scores) + 1]]) + for threshold in - predicted_scores: + pairs_learner.set_threshold(threshold) + score = scoring(y, pairs_learner.predict(pairs)) + assert score <= best_score + scores.append(score) + assert len(set(scores)) > 1 # assert that we didn't always have the same + # value for the score (which could be a hint for some bug, but would still + # silently 
pass the test)) + + +@pytest.mark.parametrize('kwargs, scoring', + [({'strategy': 'accuracy'}, accuracy_score)] + + [({'strategy': 'f_beta', 'beta': b}, + partial(fbeta_score, beta=b)) + for b in [0.1, 0.5, 1.]] + + [({'strategy': 'f_beta', 'beta': 0}, + precision_score)] + + [({'strategy': 'max_tpr', 'min_rate': t}, + partial(tpr_threshold, tnr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]] + + [({'strategy': 'max_tnr', 'min_rate': t}, + partial(tnr_threshold, tpr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]] + ) +def test_found_score_is_best_score_duplicates(kwargs, scoring): + # test that when we use calibrate threshold, it will indeed be the + # threshold that have the best score. It's the same as the previous test + # except this time we test that the scores are coherent even if there are + # duplicates (i.e. points that have the same score returned by + # `decision_function`). + rng = np.random.RandomState(42) + n_samples = 50 + pairs, y = rng.randn(n_samples, 2, 5), rng.choice([-1, 1], size=n_samples) + # we create some duplicates points, which will also have the same score + # predicted + pairs[6:10] = pairs[10:14] + y[6:10] = y[10:14] + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + pairs_learner.calibrate_threshold(pairs, y, **kwargs) + best_score = scoring(y, pairs_learner.predict(pairs)) + scores = [] + predicted_scores = pairs_learner.decision_function(pairs) + predicted_scores = np.hstack([[np.min(predicted_scores) - 1], + predicted_scores, + [np.max(predicted_scores) + 1]]) + for threshold in - predicted_scores: + pairs_learner.set_threshold(threshold) + score = scoring(y, pairs_learner.predict(pairs)) + assert score <= best_score + scores.append(score) + assert len(set(scores)) > 1 # assert that we didn't always have the same + # value for the score (which could be a hint for some bug, but would still + # silently pass the test)) + + +@pytest.mark.parametrize('invalid_args, expected_msg', + [({'strategy': 'weird'}, 
+ ('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "weird" instead.'))] + + [({'strategy': strategy, 'min_rate': min_rate}, + 'Parameter min_rate must be a number in' + '[0, 1]. Got {} instead.'.format(min_rate)) + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [None, 'weird', -0.2, 1.2, 3 + 2j])] + + [({'strategy': 'f_beta', 'beta': beta}, + 'Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + for beta in [None, 'weird', 3 + 2j]] + ) +def test_calibrate_threshold_invalid_parameters_right_error(invalid_args, + expected_msg): + # test that the right error message is returned if invalid arguments are + # given to calibrate_threshold + rng = np.random.RandomState(42) + pairs, y = rng.randn(20, 2, 5), rng.choice([-1, 1], size=20) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + with pytest.raises(ValueError) as raised_error: + pairs_learner.calibrate_threshold(pairs, y, **invalid_args) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('valid_args', + [{'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [-5., -1., 0., 0.1, 0.2, 1., 5.]] + # Note that we authorize beta < 0 (even if + # in fact it will be squared, so it would be useless + # to do that) + ) +def test_calibrate_threshold_valid_parameters(valid_args): + # test that no warning message is returned if valid arguments are given to + # calibrate threshold + rng = np.random.RandomState(42) + pairs, y = rng.randn(20, 2, 5), rng.choice([-1, 1], size=20) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + with warnings.catch_warnings(record=True) as record: + pairs_learner.calibrate_threshold(pairs, y, **valid_args) + assert len(record) == 0 + + +def test_calibrate_threshold_extreme(): + 
"""Test that in the (rare) case where we should accept all points or + reject all points, this is effectively what + is done""" + + class MockBadPairsClassifier(MahalanobisMixin, _PairsClassifierMixin): + """A pairs classifier that returns bad scores (i.e. in the inverse order + of what we would expect from a good pairs classifier + """ + + def fit(self, pairs, y, calibration_params=None): + self.preprocessor_ = 'not used' + self.components_ = 'not used' + self.calibrate_threshold(pairs, y, **(calibration_params if + calibration_params is not None else + dict())) + return self + + def decision_function(self, pairs): + return np.arange(pairs.shape[0], dtype=float) + + rng = np.random.RandomState(42) + pairs = rng.randn(7, 2, 5) # the info in X is not used, it's just for the + # API + + y = [1., 1., 1., -1., -1., -1., -1.] + mock_clf = MockBadPairsClassifier() + # case of bad scoring with more negative than positives. In + # this case, when: + # optimizing for accuracy we should reject all points + mock_clf.fit(pairs, y, calibration_params={'strategy': 'accuracy'}) + assert_array_equal(mock_clf.predict(pairs), - np.ones(7)) + + # optimizing for max_tpr we should accept all points if min_rate == 0. ( + # because by convention then tnr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tpr', + 'min_rate': 0.}) + assert_array_equal(mock_clf.predict(pairs), np.ones(7)) + # optimizing for max_tnr we should reject all points if min_rate = 0. ( + # because by convention then tpr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tnr', + 'min_rate': 0.}) + assert_array_equal(mock_clf.predict(pairs), - np.ones(7)) + + y = [1., 1., 1., 1., -1., -1., -1.] + # case of bad scoring with more positives than negatives. 
In + # this case, when: + # optimizing for accuracy we should accept all points + mock_clf.fit(pairs, y, calibration_params={'strategy': 'accuracy'}) + assert_array_equal(mock_clf.predict(pairs), np.ones(7)) + # optimizing for max_tpr we should accept all points if min_rate == 0. ( + # because by convention then tnr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tpr', + 'min_rate': 0.}) + assert_array_equal(mock_clf.predict(pairs), np.ones(7)) + # optimizing for max_tnr we should reject all points if min_rate = 0. ( + # because by convention then tpr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tnr', + 'min_rate': 0.}) + assert_array_equal(mock_clf.predict(pairs), - np.ones(7)) + + # Note: we'll never find a case where we would reject all points for + # maximizing tpr (we can always accept more points), and accept all + # points for maximizing tnr (we can always reject more points) + + # case of alternated scores: for optimizing the f_1 score we should accept + # all points (because this way we have max recall (1) and max precision ( + # here: 0.5)) + y = [1., -1., 1., -1., 1., -1.] + mock_clf.fit(pairs[:6], y, calibration_params={'strategy': 'f_beta', + 'beta': 1.}) + assert_array_equal(mock_clf.predict(pairs[:6]), np.ones(6)) + + # Note: for optimizing f_1 score, we will never find an optimal case where we + # reject all points because in this case we would have 0 precision (by + # convention, because it's 0/0), and 0 recall (and we could always decrease + # the threshold to increase the recall, and we couldn't do worse for + # precision so it would be better) + + +@pytest.mark.parametrize('estimator, _', + pairs_learners + [(IdentityPairsClassifier(), None), + (_PairsClassifierMixin, None)], + ids=ids_pairs_learners + ['mock', 'class']) +@pytest.mark.parametrize('invalid_args, expected_msg', + [({'strategy': 'weird'}, + ('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". 
Got "weird" instead.'))] + + [({'strategy': strategy, 'min_rate': min_rate}, + 'Parameter min_rate must be a number in' + '[0, 1]. Got {} instead.'.format(min_rate)) + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [None, 'weird', -0.2, 1.2, 3 + 2j])] + + [({'strategy': 'f_beta', 'beta': beta}, + 'Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + for beta in [None, 'weird', 3 + 2j]] + ) +def test_validate_calibration_params_invalid_parameters_right_error( + estimator, _, invalid_args, expected_msg): + # test that the right error message is returned if invalid arguments are + # given to _validate_calibration_params, for all pairs metric learners as + # well as a mocking general identity pairs classifier and the class itself + with pytest.raises(ValueError) as raised_error: + estimator._validate_calibration_params(**invalid_args) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator, _', + pairs_learners + [(IdentityPairsClassifier(), None), + (_PairsClassifierMixin, None)], + ids=ids_pairs_learners + ['mock', 'class']) +@pytest.mark.parametrize('valid_args', + [{}, {'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [-5., -1., 0., 0.1, 0.2, 1., 5.]] + # Note that we authorize beta < 0 (even if + # in fact it will be squared, so it would be useless + # to do that) + ) +def test_validate_calibration_params_valid_parameters( + estimator, _, valid_args): + # test that no warning message is returned if valid arguments are given to + # _validate_calibration_params for all pairs metric learners, as well as + # a mocking example, and the class itself + with warnings.catch_warnings(record=True) as record: + estimator._validate_calibration_params(**valid_args) + assert len(record) == 0 + + +@pytest.mark.parametrize('estimator, 
build_dataset', + pairs_learners, + ids=ids_pairs_learners) +def test_validate_calibration_params_invalid_parameters_error_before__fit( + estimator, build_dataset): + """For all pairs metric learners (which currently all have a _fit method), + make sure that calibration parameters are validated before fitting""" + estimator = clone(estimator) + input_data, labels, _, _ = build_dataset() + + def breaking_fun(**args): # a function that fails so that we will miss + # the calibration at the end and therefore the right error message from + # validating params should be thrown before + raise RuntimeError('Game over.') + estimator._fit = breaking_fun + expected_msg = ('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "weird" instead.') + with pytest.raises(ValueError) as raised_error: + estimator.fit(input_data, labels, calibration_params={'strategy': 'weird'}) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_accuracy_toy_example(estimator, build_dataset): + """Test that the accuracy works on some toy example (hence that the + prediction is OK)""" + input_data, labels, preprocessor, X = build_dataset(with_preprocessor=False) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + estimator.fit(input_data, labels) + # we force the transformation to be identity so that we control what it does + estimator.components_ = np.eye(X.shape[1]) + # the threshold for similar or dissimilar pairs is half of the distance + # between X[0] and X[1] + estimator.set_threshold(euclidean(X[0], X[1]) / 2) + # We take the two first points and we build 4 regularly spaced points on the + # line they define, so that it's easy to build quadruplets of different + # similarities. 
+ X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4 + pairs_test = np.array( + [[X_test[0], X_test[1]], # similar + [X_test[0], X_test[3]], # dissimilar + [X_test[1], X_test[2]], # similar + [X_test[2], X_test[3]]]) # similar + y = np.array([-1, 1, 1, -1]) # [F, F, T, F] + assert accuracy_score(estimator.predict(pairs_test), y) == 0.25 diff --git a/test/test_quadruplets_classifiers.py b/test/test_quadruplets_classifiers.py new file mode 100644 index 00000000..a8319961 --- /dev/null +++ b/test/test_quadruplets_classifiers.py @@ -0,0 +1,65 @@ +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split + +from test.test_utils import quadruplets_learners, ids_quadruplets_learners +from metric_learn.sklearn_shims import set_random_state +from sklearn import clone +import numpy as np + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, + ids=ids_quadruplets_learners) +def test_predict_only_one_or_minus_one(estimator, build_dataset, + with_preprocessor): + """Test that all predicted values are either +1 or -1""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + (quadruplets_train, + quadruplets_test, y_train, y_test) = train_test_split(input_data, labels) + estimator.fit(quadruplets_train) + predictions = estimator.predict(quadruplets_test) + not_valid = [e for e in predictions if e not in [-1, 1]] + assert len(not_valid) == 0 + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, + ids=ids_quadruplets_learners) +def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, + with_preprocessor): + """Test that a NotFittedError is raised if someone tries to predict and + the metric 
learner has not been fitted.""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + with pytest.raises(NotFittedError): + estimator.predict(input_data) + + +@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, + ids=ids_quadruplets_learners) +def test_accuracy_toy_example(estimator, build_dataset): + """Test that the default scoring for quadruplets (accuracy) works on some + toy example""" + input_data, labels, preprocessor, X = build_dataset(with_preprocessor=False) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + estimator.fit(input_data) + # We take the two first points and we build 4 regularly spaced points on the + # line they define, so that it's easy to build quadruplets of different + # similarities. + X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4 + quadruplets_test = np.array( + [[X_test[0], X_test[2], X_test[0], X_test[1]], + [X_test[1], X_test[3], X_test[1], X_test[0]], + [X_test[1], X_test[2], X_test[0], X_test[3]], + [X_test[3], X_test[0], X_test[2], X_test[1]]]) + # we force the transformation to be identity so that we control what it does + estimator.components_ = np.eye(X.shape[1]) + assert estimator.score(quadruplets_test) == 0.25 diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py new file mode 100644 index 00000000..798d9036 --- /dev/null +++ b/test/test_sklearn_compat.py @@ -0,0 +1,465 @@ +import pytest +import unittest +from sklearn.utils.estimator_checks import check_estimator +from sklearn.base import TransformerMixin +from sklearn.pipeline import make_pipeline +from sklearn.utils import check_random_state +from metric_learn.sklearn_shims import (assert_allclose_dense_sparse, + set_random_state, _get_args, + is_public_parameter, get_scorer) +from metric_learn import (Covariance, LFDA, LMNN, 
MLKR, NCA, + ITML_Supervised, LSML_Supervised, + MMC_Supervised, RCA_Supervised, SDML_Supervised, + SCML_Supervised) +from sklearn import clone +import numpy as np +from sklearn.model_selection import (cross_val_score, cross_val_predict, + train_test_split, KFold) +from test.test_utils import (metric_learners, ids_metric_learners, + mock_preprocessor, tuples_learners, + ids_tuples_learners, pairs_learners, + ids_pairs_learners, remove_y, + metric_learners_pipeline, + ids_metric_learners_pipeline) + + +class Stable_RCA_Supervised(RCA_Supervised): + + def __init__(self, n_components=None, + chunk_size=2, preprocessor=None, random_state=None): + # this init makes RCA stable for scikit-learn examples. + super(Stable_RCA_Supervised, self).__init__( + n_chunks=2, n_components=n_components, + chunk_size=chunk_size, preprocessor=preprocessor, + random_state=random_state) + + +class Stable_SDML_Supervised(SDML_Supervised): + + def __init__(self, sparsity_param=0.01, + n_constraints=None, verbose=False, preprocessor=None, + random_state=None): + # this init makes SDML stable for scikit-learn examples. 
+ super(Stable_SDML_Supervised, self).__init__( + sparsity_param=sparsity_param, + n_constraints=n_constraints, verbose=verbose, + preprocessor=preprocessor, balance_param=1e-5, prior='identity', + random_state=random_state) + + +class TestSklearnCompat(unittest.TestCase): + def test_covariance(self): + check_estimator(Covariance()) + + def test_lmnn(self): + check_estimator(LMNN()) + + def test_lfda(self): + check_estimator(LFDA()) + + def test_mlkr(self): + check_estimator(MLKR()) + + def test_nca(self): + check_estimator(NCA()) + + def test_lsml(self): + check_estimator(LSML_Supervised()) + + def test_itml(self): + check_estimator(ITML_Supervised()) + + def test_mmc(self): + check_estimator(MMC_Supervised()) + + def test_sdml(self): + check_estimator(Stable_SDML_Supervised()) + + def test_rca(self): + check_estimator(Stable_RCA_Supervised()) + + def test_scml(self): + msg = "As no value for `n_basis` was selected, " + with pytest.warns(UserWarning) as raised_warning: + check_estimator(SCML_Supervised()) + assert msg in str(raised_warning[0].message) + + +RNG = check_random_state(0) + + +# ---------------------- Test scikit-learn compatibility ---------------------- + +def generate_array_like(input_data, labels=None): + """Helper function to generate array-like variants of numpy datasets, + for testing purposes.""" + list_data = input_data.tolist() + input_data_changed = [input_data, list_data, tuple(list_data)] + if input_data.ndim >= 2: + input_data_changed.append(tuple(tuple(x) for x in list_data)) + if input_data.ndim >= 3: + input_data_changed.append(tuple(tuple(tuple(x) for x in y) for y in + list_data)) + if input_data.ndim == 2: + pd = pytest.importorskip('pandas') + input_data_changed.append(pd.DataFrame(input_data)) + if labels is not None: + labels_changed = [labels, list(labels), tuple(labels)] + else: + labels_changed = [labels] + return input_data_changed, labels_changed + + +@pytest.mark.integration +@pytest.mark.parametrize('with_preprocessor', 
[True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_array_like_inputs(estimator, build_dataset, with_preprocessor): + """Test that metric-learners can have as input (of all functions that are + applied on data) any array-like object.""" + input_data, labels, preprocessor, X = build_dataset(with_preprocessor) + + # we subsample the data for the test to be more efficient + input_data, _, labels, _ = train_test_split(input_data, labels, + train_size=40, + random_state=42) + X = X[:10] + + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + input_variants, label_variants = generate_array_like(input_data, labels) + for input_variant in input_variants: + for label_variant in label_variants: + estimator.fit(*remove_y(estimator, input_variant, label_variant)) + if hasattr(estimator, "predict"): + estimator.predict(input_variant) + if hasattr(estimator, "predict_proba"): + estimator.predict_proba(input_variant) # anticipation in case some + # time we have that, or if ppl want to contribute with new algorithms + # it will be checked automatically + if hasattr(estimator, "decision_function"): + estimator.decision_function(input_variant) + if hasattr(estimator, "score"): + for label_variant in label_variants: + estimator.score(*remove_y(estimator, input_variant, label_variant)) + + X_variants, _ = generate_array_like(X) + for X_variant in X_variants: + estimator.transform(X_variant) + + pairs = np.array([[X[0], X[1]], [X[0], X[2]]]) + pairs_variants, _ = generate_array_like(pairs) + + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have pair_distance" + + for pairs_variant in pairs_variants: + estimator.pair_score(pairs_variant) # All learners have pair_score + + # But not all of them will have pair_distance + try: + estimator.pair_distance(pairs_variant) + except 
Exception as raised_exception: + assert raised_exception.value.args[0] == not_implemented_msg + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_various_scoring_on_tuples_learners(estimator, build_dataset, + with_preprocessor): + """Tests that scikit-learn's scoring returns something finite, + for other scoring than default scoring. (List of scikit-learn's scores can be + found in sklearn.metrics._scorer). For each type of output (predict, + predict_proba, decision_function), we test a bunch of scores. + We only test on pairs learners because quadruplets don't have a y argument. + """ + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + + # scores that need a predict function: every tuples learner should have a + # predict function (whether the pair is of positive samples or negative + # samples) + for scoring in ['accuracy', 'f1']: + check_score_is_finite(scoring, estimator, input_data, labels) + # scores that need a predict_proba: + if hasattr(estimator, "predict_proba"): + for scoring in ['neg_log_loss', 'brier_score']: + check_score_is_finite(scoring, estimator, input_data, labels) + # scores that need a decision_function: every tuples learner should have a + # decision function (the metric between points) + for scoring in ['roc_auc', 'average_precision', 'precision', 'recall']: + check_score_is_finite(scoring, estimator, input_data, labels) + + +def check_score_is_finite(scoring, estimator, input_data, labels): + estimator = clone(estimator) + assert np.isfinite(cross_val_score(estimator, input_data, labels, + scoring=scoring)).all() + estimator.fit(input_data, labels) + assert np.isfinite(get_scorer(scoring)(estimator, input_data, labels)) + + +@pytest.mark.parametrize('estimator, build_dataset', 
tuples_learners, + ids=ids_tuples_learners) +def test_cross_validation_is_finite(estimator, build_dataset): + """Tests that validation on metric-learn estimators returns something finite + """ + input_data, labels, preprocessor, _ = build_dataset() + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + assert np.isfinite(cross_val_score(estimator, + *remove_y(estimator, input_data, labels) + )).all() + assert np.isfinite(cross_val_predict(estimator, + *remove_y(estimator, input_data, labels) + )).all() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_cross_validation_manual_vs_scikit(estimator, build_dataset, + with_preprocessor): + """Tests that if we make a manual cross-validation, the result will be the + same as scikit-learn's cross-validation (some code for generating the + folds is taken from scikit-learn). 
+ """ + if any(hasattr(estimator, method) for method in ["predict", "score"]): + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + n_splits = 3 + kfold = KFold(shuffle=False, n_splits=n_splits) + n_samples = input_data.shape[0] + fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int64) + fold_sizes[:n_samples % n_splits] += 1 + current = 0 + scores, predictions = [], np.zeros(input_data.shape[0]) + for fold_size in fold_sizes: + start, stop = current, current + fold_size + current = stop + test_slice = slice(start, stop) + train_mask = np.ones(input_data.shape[0], bool) + train_mask[test_slice] = False + y_train, y_test = labels[train_mask], labels[test_slice] + estimator.fit(*remove_y(estimator, input_data[train_mask], y_train)) + if hasattr(estimator, "score"): + scores.append(estimator.score(*remove_y( + estimator, input_data[test_slice], y_test))) + if hasattr(estimator, "predict"): + predictions[test_slice] = estimator.predict(input_data[test_slice]) + if hasattr(estimator, "score"): + assert all(scores == cross_val_score( + estimator, *remove_y(estimator, input_data, labels), + cv=kfold)) + if hasattr(estimator, "predict"): + assert all(predictions == cross_val_predict( + estimator, + *remove_y(estimator, input_data, labels), + cv=kfold)) + + +def check_score(estimator, tuples, y): + if hasattr(estimator, "score"): + score = estimator.score(*remove_y(estimator, tuples, y)) + assert np.isfinite(score) + + +def check_predict(estimator, tuples): + if hasattr(estimator, "predict"): + y_predicted = estimator.predict(tuples) + assert len(y_predicted), len(tuples) + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_simple_estimator(estimator, build_dataset, with_preprocessor): + """Tests that fit, 
predict and scoring works. + """ + if any(hasattr(estimator, method) for method in ["predict", "score"]): + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + (tuples_train, tuples_test, y_train, + y_test) = train_test_split(input_data, labels, random_state=RNG) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + + estimator.fit(*remove_y(estimator, tuples_train, y_train)) + check_score(estimator, tuples_test, y_test) + check_predict(estimator, tuples_test) + + +@pytest.mark.parametrize('estimator', [est[0] for est in metric_learners], + ids=ids_metric_learners) +@pytest.mark.parametrize('preprocessor', [None, mock_preprocessor]) +def test_no_attributes_set_in_init(estimator, preprocessor): + """Check setting during init. Adapted from scikit-learn.""" + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(type(estimator).__init__, "deprecated_original"): + return + + init_params = _get_args(type(estimator).__init__) + parents_init_params = [param for params_parent in + (_get_args(parent) for parent in + type(estimator).__mro__) + for param in params_parent] + + # Test for no setting apart from parameters during init + invalid_attr = (set(vars(estimator)) - set(init_params) - + set(parents_init_params)) + assert not invalid_attr, \ + ("Estimator %s should not set any attribute apart" + " from parameters during init. Found attributes %s." + % (type(estimator).__name__, sorted(invalid_attr))) + # Ensure that each parameter is set in init + invalid_attr = (set(init_params) - set(vars(estimator)) - + set(["self"])) + assert not invalid_attr, \ + ("Estimator %s should store all parameters" + " as an attribute during init. Did not find " + "attributes %s." 
% (type(estimator).__name__, sorted(invalid_attr))) + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_estimators_fit_returns_self(estimator, build_dataset, + with_preprocessor): + """Check if self is returned when calling fit""" + # Adapted from scikit-learn + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + assert estimator.fit(*remove_y(estimator, input_data, labels)) is estimator + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners_pipeline, + ids=ids_metric_learners_pipeline) +def test_pipeline_consistency(estimator, build_dataset, + with_preprocessor): + # Adapted from scikit learn + # check that make_pipeline(est) gives same score as est + + input_data, y, preprocessor, _ = build_dataset(with_preprocessor) + + def make_random_state(estimator, in_pipeline): + rs = {} + name_estimator = estimator.__class__.__name__ + if name_estimator[-11:] == '_Supervised': + name_param = 'random_state' + if in_pipeline: + name_param = name_estimator.lower() + '__' + name_param + rs[name_param] = check_random_state(0) + return rs + + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor, + **make_random_state(estimator, False)) + pipeline = make_pipeline(estimator) + estimator.fit(input_data, y) + estimator.set_params(preprocessor=preprocessor) + pipeline.set_params(**make_random_state(estimator, True)) + pipeline.fit(input_data, y) + + if hasattr(estimator, 'score'): + result = estimator.score(input_data, y) + result_pipe = pipeline.score(input_data, y) + assert_allclose_dense_sparse(result, result_pipe) + + if hasattr(estimator, 'predict'): + result = estimator.predict(input_data) + result_pipe = pipeline.predict(input_data) + 
assert_allclose_dense_sparse(result, result_pipe) + + if issubclass(estimator.__class__, TransformerMixin): + if hasattr(estimator, 'transform'): + result = estimator.transform(input_data) + result_pipe = pipeline.transform(input_data) + assert_allclose_dense_sparse(result, result_pipe) + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_dict_unchanged(estimator, build_dataset, with_preprocessor): + # Adapted from scikit-learn + (input_data, labels, preprocessor, + to_transform) = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + estimator.fit(*remove_y(estimator, input_data, labels)) + + def check_dict(): + assert estimator.__dict__ == dict_before, ( + "Estimator changes __dict__ during %s" % method) + for method in ["predict", "decision_function", "predict_proba"]: + if hasattr(estimator, method): + dict_before = estimator.__dict__.copy() + getattr(estimator, method)(input_data) + check_dict() + if hasattr(estimator, "transform"): + dict_before = estimator.__dict__.copy() + # we transform only dataset of points + estimator.transform(to_transform) + check_dict() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_dont_overwrite_parameters(estimator, build_dataset, + with_preprocessor): + # Adapted from scikit-learn + # check that fit method only changes or sets private attributes + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + dict_before_fit = estimator.__dict__.copy() + + estimator.fit(*remove_y(estimator, 
input_data, labels)) + dict_after_fit = estimator.__dict__ + + public_keys_after_fit = [key for key in dict_after_fit.keys() + if is_public_parameter(key)] + + attrs_added_by_fit = [key for key in public_keys_after_fit + if key not in dict_before_fit.keys()] + + # check that fit doesn't add any public attribute + assert not attrs_added_by_fit, ( + "Estimator adds public attribute(s) during" + " the fit method." + " Estimators are only allowed to add private " + "attributes" + " either started with _ or ended" + " with _ but %s added" % ', '.join(attrs_added_by_fit)) + + # check that fit doesn't change any public attribute + attrs_changed_by_fit = [key for key in public_keys_after_fit + if (dict_before_fit[key] + is not dict_after_fit[key])] + + assert not attrs_changed_by_fit, ( + "Estimator changes public attribute(s) during" + " the fit method. Estimators are only allowed" + " to change attributes started" + " or ended with _, but" + " %s changed" % ', '.join(attrs_changed_by_fit)) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_triplets_classifiers.py b/test/test_triplets_classifiers.py new file mode 100644 index 00000000..515a0a33 --- /dev/null +++ b/test/test_triplets_classifiers.py @@ -0,0 +1,127 @@ +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split + +from metric_learn import SCML +from test.test_utils import ( + triplets_learners, + ids_triplets_learners, + build_triplets +) +from metric_learn.sklearn_shims import set_random_state +from sklearn import clone +import numpy as np +from numpy.testing import assert_array_equal + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_predict_only_one_or_minus_one(estimator, build_dataset, + with_preprocessor): + """Test that all predicted values are either +1 or -1""" + input_data, _, preprocessor, _ = 
build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + triplets_train, triplets_test = train_test_split(input_data) + estimator.fit(triplets_train) + predictions = estimator.predict(triplets_test) + + not_valid = [e for e in predictions if e not in [-1, 1]] + assert len(not_valid) == 0 + + +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_no_zero_prediction(estimator, build_dataset): + """ + Test that all predicted values are not zero, even when the + distance d(x,y) and d(x,z) is the same for a triplet of the + form (x, y, z). i.e border cases. + """ + triplets, _, _, X = build_dataset(with_preprocessor=False) + # Force 3 dimentions only, to use cross product and get easy orthogonal vec. + triplets = np.array([[t[0][:3], t[1][:3], t[2][:3]] for t in triplets]) + X = X[:, :3] + # Dummy fit + estimator = clone(estimator) + set_random_state(estimator) + estimator.fit(triplets) + # We force the transformation to be identity, to force euclidean distance + estimator.components_ = np.eye(X.shape[1]) + + # Get two orthogonal vectors in respect to X[1] + k = X[1] / np.linalg.norm(X[1]) # Normalize first vector + x = X[2] - X[2].dot(k) * k # Get random orthogonal vector + x /= np.linalg.norm(x) # Normalize + y = np.cross(k, x) # Get orthogonal vector to x + # Assert these orthogonal vectors are different + with pytest.raises(AssertionError): + assert_array_equal(X[1], x) + with pytest.raises(AssertionError): + assert_array_equal(X[1], y) + # Assert the distance is the same for both + assert estimator.get_metric()(X[1], x) == estimator.get_metric()(X[1], y) + + # Form the three scenarios where predict() gives 0 with numpy.sign + triplets_test = np.array( # Critical examples + [[X[0], X[2], X[2]], + [X[1], X[1], X[1]], + [X[1], x, y]]) + # Predict + predictions = estimator.predict(triplets_test) + # Check there are no zero 
values + assert np.sum(predictions == 0) == 0 + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, + with_preprocessor): + """Test that a NotFittedError is raised if someone tries to predict and + the metric learner has not been fitted.""" + input_data, _, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + with pytest.raises(NotFittedError): + estimator.predict(input_data) + + +@pytest.mark.parametrize('estimator, build_dataset', triplets_learners, + ids=ids_triplets_learners) +def test_accuracy_toy_example(estimator, build_dataset): + """Test that the default scoring for triplets (accuracy) works on some + toy example""" + triplets, _, _, X = build_dataset(with_preprocessor=False) + estimator = clone(estimator) + set_random_state(estimator) + estimator.fit(triplets) + # We take the two first points and we build 4 regularly spaced points on the + # line they define, so that it's easy to build triplets of different + # similarities. 
+ X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4 + + triplets_test = np.array( + [[X_test[0], X_test[2], X_test[1]], + [X_test[1], X_test[3], X_test[0]], + [X_test[1], X_test[2], X_test[3]], + [X_test[3], X_test[0], X_test[2]]]) + # we force the transformation to be identity so that we control what it does + estimator.components_ = np.eye(X.shape[1]) + assert estimator.score(triplets_test) == 0.25 + + +def test_raise_big_number_of_features(): + triplets, _, _, X = build_triplets(with_preprocessor=False) + triplets = triplets[:3, :, :] + estimator = SCML(n_basis=320) + set_random_state(estimator) + with pytest.raises(ValueError) as exc_info: + estimator.fit(triplets) + assert exc_info.value.args[0] == \ + "Number of features (4) is greater than the number of triplets(3)." \ + "\nConsider using dimensionality reduction or using another basis " \ + "generation scheme." diff --git a/test/test_utils.py b/test/test_utils.py new file mode 100644 index 00000000..c0383792 --- /dev/null +++ b/test/test_utils.py @@ -0,0 +1,1273 @@ +import warnings +import pytest +from scipy.linalg import eigh, pinvh +from collections import namedtuple +import numpy as np +from numpy.testing import assert_array_equal, assert_equal +from sklearn.model_selection import train_test_split +from sklearn.utils import check_random_state, shuffle +from metric_learn.sklearn_shims import set_random_state +from sklearn.base import clone +from metric_learn._util import (check_input, make_context, preprocess_tuples, + make_name, preprocess_points, + check_collapsed_pairs, validate_vector, + _check_sdp_from_eigen, _check_n_components, + check_y_valid_values_for_pairs, + _auto_select_init, _pseudo_inverse_from_eig) +from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, + LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, + MMC_Supervised, RCA_Supervised, SDML_Supervised, + SCML, SCML_Supervised, Constraints) +from metric_learn.base_metric import (ArrayIndexer, 
MahalanobisMixin, + _PairsClassifierMixin, + _TripletsClassifierMixin, + _QuadrupletsClassifierMixin) +from metric_learn.exceptions import PreprocessorError, NonPSDError +from sklearn.datasets import make_regression, make_blobs, load_iris + + +SEED = 42 +RNG = check_random_state(SEED) + +Dataset = namedtuple('Dataset', ('data target preprocessor to_transform')) +# Data and target are what we will fit on. Preprocessor is the additional +# data if we use a preprocessor (which should be the default ArrayIndexer), +# and to_transform is some additional data that we would want to transform + + +def build_classification(with_preprocessor=False): + """Basic array for testing when using a preprocessor""" + X, y = shuffle(*make_blobs(random_state=SEED), + random_state=SEED) + indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int) + if with_preprocessor: + return Dataset(indices, y[indices], X, indices) + else: + return Dataset(X[indices], y[indices], None, X[indices]) + + +def build_regression(with_preprocessor=False): + """Basic array for testing when using a preprocessor""" + X, y = shuffle(*make_regression(n_samples=100, n_features=5, + random_state=SEED), + random_state=SEED) + indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int) + if with_preprocessor: + return Dataset(indices, y[indices], X, indices) + else: + return Dataset(X[indices], y[indices], None, X[indices]) + + +def build_data(): + input_data, labels = load_iris(return_X_y=True) + X, y = shuffle(input_data, labels, random_state=SEED) + n_constraints = 50 + constraints = Constraints(y) + pairs = ( + constraints + .positive_negative_pairs(n_constraints, same_length=True, + random_state=check_random_state(SEED))) + return X, pairs + + +def build_pairs(with_preprocessor=False): + # builds a toy pairs problem + X, indices = build_data() + c = np.vstack([np.column_stack(indices[:2]), np.column_stack(indices[2:])]) + target = np.concatenate([np.ones(indices[0].shape[0]), + - 
np.ones(indices[0].shape[0])]) + c, target = shuffle(c, target, random_state=SEED) + if with_preprocessor: + # if preprocessor, we build a 2D array of pairs of indices + return Dataset(c, target, X, c[:, 0]) + else: + # if not, we build a 3D array of pairs of samples + return Dataset(X[c], target, None, X[c[:, 0]]) + + +def build_triplets(with_preprocessor=False): + input_data, labels = load_iris(return_X_y=True) + X, y = shuffle(input_data, labels, random_state=SEED) + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, k_genuine=3, k_impostor=4) + if with_preprocessor: + # if preprocessor, we build a 2D array of triplets of indices + return Dataset(triplets, np.ones(len(triplets)), X, np.arange(len(X))) + else: + # if not, we build a 3D array of triplets of samples + return Dataset(X[triplets], np.ones(len(triplets)), None, X) + + +def build_quadruplets(with_preprocessor=False): + # builds a toy quadruplets problem + X, indices = build_data() + c = np.column_stack(indices) + target = np.ones(c.shape[0]) # quadruplets targets are not used + # anyways + c, target = shuffle(c, target, random_state=SEED) + if with_preprocessor: + # if preprocessor, we build a 2D array of quadruplets of indices + return Dataset(c, target, X, c[:, 0]) + else: + # if not, we build a 3D array of quadruplets of samples + return Dataset(X[c], target, None, X[c[:, 0]]) + + +quadruplets_learners = [(LSML(), build_quadruplets)] +ids_quadruplets_learners = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + quadruplets_learners])) + +triplets_learners = [(SCML(n_basis=320), build_triplets)] +ids_triplets_learners = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + triplets_learners])) + +pairs_learners = [(ITML(max_iter=2), build_pairs), # max_iter=2 to be faster + (MMC(max_iter=2), build_pairs), # max_iter=2 to be faster + (SDML(prior='identity', balance_param=1e-5), build_pairs)] +ids_pairs_learners = list(map(lambda x: 
x.__class__.__name__, + [learner for (learner, _) in + pairs_learners])) + +classifiers = [(Covariance(), build_classification), + (LFDA(), build_classification), + (LMNN(), build_classification), + (NCA(), build_classification), + (RCA(), build_classification), + (ITML_Supervised(max_iter=5), build_classification), + (LSML_Supervised(), build_classification), + (MMC_Supervised(max_iter=5), build_classification), + (RCA_Supervised(n_chunks=5), build_classification), + (SDML_Supervised(prior='identity', balance_param=1e-5), + build_classification), + (SCML_Supervised(n_basis=80), build_classification)] +ids_classifiers = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + classifiers])) + +regressors = [(MLKR(init='pca'), build_regression)] +ids_regressors = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in regressors])) + +WeaklySupervisedClasses = (_PairsClassifierMixin, + _TripletsClassifierMixin, + _QuadrupletsClassifierMixin) + +tuples_learners = pairs_learners + triplets_learners + quadruplets_learners +ids_tuples_learners = ids_pairs_learners + ids_triplets_learners \ + + ids_quadruplets_learners + +supervised_learners = classifiers + regressors +ids_supervised_learners = ids_classifiers + ids_regressors + +metric_learners = tuples_learners + supervised_learners +ids_metric_learners = ids_tuples_learners + ids_supervised_learners + +metric_learners_pipeline = pairs_learners + supervised_learners +ids_metric_learners_pipeline = ids_pairs_learners + ids_supervised_learners + + +def remove_y(estimator, X, y): + """Quadruplets and triplets learners have no y in fit, but to write test for + all estimators, it is convenient to have this function, that will return X + and y if the estimator needs a y to fit on, and just X otherwise.""" + no_y_fit = quadruplets_learners + triplets_learners + if estimator.__class__.__name__ in [e.__class__.__name__ + for (e, _) in no_y_fit]: + return (X,) + else: + return (X, y) + + +def 
mock_preprocessor(indices): + """A preprocessor for testing purposes that returns an all ones 3D array + """ + return np.ones((indices.shape[0], 3)) + + +@pytest.mark.parametrize('type_of_inputs', ['other', 'tuple', 'classics', 2, + int, NCA()]) +def test_check_input_invalid_type_of_inputs(type_of_inputs): + """Tests that an invalid type of inputs in check_inputs raises an error.""" + with pytest.raises(ValueError) as e: + check_input([[0.2, 2.1], [0.2, .8]], type_of_inputs=type_of_inputs) + msg = ("Unknown value {} for type_of_inputs. Valid values are " + "'classic' or 'tuples'.".format(type_of_inputs)) + assert str(e.value) == msg + + +# ---------------- test check_input with 'tuples' type_of_input' ------------ + + +def tuples_prep(): + """Basic array for testing when using a preprocessor""" + tuples = np.array([[1, 2], + [2, 3]]) + return tuples + + +def tuples_no_prep(): + """Basic array for testing when using no preprocessor""" + tuples = np.array([[[1., 2.3], [2.3, 5.3]], + [[2.3, 4.3], [0.2, 0.4]]]) + return tuples + + +@pytest.mark.parametrize('estimator, expected', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +def test_make_context(estimator, expected): + """test the make_name function""" + assert make_context(estimator) == expected + + +@pytest.mark.parametrize('estimator, expected', + [(NCA(), "NCA"), ('NCA', "NCA"), (None, None)]) +def test_make_name(estimator, expected): + """test the make_name function""" + assert make_name(estimator) == expected + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('load_tuples, preprocessor', + [(tuples_prep, mock_preprocessor), + (tuples_no_prep, None), + (tuples_no_prep, mock_preprocessor)]) +def test_check_tuples_invalid_tuple_size(estimator, context, load_tuples, + preprocessor): + """Checks that the exception are raised if tuple_size is not the one + expected""" + tuples = load_tuples() + preprocessed_tuples = 
(preprocess_tuples(tuples, preprocessor) + if (preprocessor is not None and + tuples.ndim == 2) else tuples) + expected_msg = ("Tuples of 3 element(s) expected{}. Got tuples of 2 " + "element(s) instead (shape={}):\ninput={}.\n" + .format(context, preprocessed_tuples.shape, + preprocessed_tuples)) + with pytest.raises(ValueError) as raised_error: + check_input(tuples, type_of_inputs='tuples', tuple_size=3, + preprocessor=preprocessor, estimator=estimator) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('tuples, found, expected, preprocessor', + [(5, '0', '2D array of indicators or 3D array of ' + 'formed tuples', mock_preprocessor), + (5, '0', '3D array of formed tuples', None), + ([1, 2], '1', '2D array of indicators or 3D array ' + 'of formed tuples', mock_preprocessor), + ([1, 2], '1', '3D array of formed tuples', None), + ([[[[5]]]], '4', '2D array of indicators or 3D array' + ' of formed tuples', + mock_preprocessor), + ([[[[5]]]], '4', '3D array of formed tuples', None), + ([[1], [3]], '2', '3D array of formed ' + 'tuples', None)]) +def test_check_tuples_invalid_shape(estimator, context, tuples, found, + expected, preprocessor): + """Checks that a value error with the appropriate message is raised if + shape is invalid (not 2D with preprocessor or 3D with no preprocessor) + """ + tuples = np.array(tuples) + msg = ("{} expected{}{}. Found {}D array instead:\ninput={}. 
Reshape your " + "data{}.\n" + .format(expected, context, ' when using a preprocessor' + if preprocessor else '', found, tuples, + ' and/or use a preprocessor' if + (not preprocessor and tuples.ndim == 2) else '')) + with pytest.raises(ValueError) as raised_error: + check_input(tuples, type_of_inputs='tuples', + preprocessor=preprocessor, ensure_min_samples=0, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +def test_check_tuples_invalid_n_features(estimator, context): + """Checks that the right warning is printed if not enough features + Here we only test if no preprocessor (otherwise we don't ensure this) + """ + msg = ("Found array with 2 feature(s) (shape={}) while" + " a minimum of 3 is required{}.".format(tuples_no_prep().shape, + context)) + with pytest.raises(ValueError) as raised_error: + check_input(tuples_no_prep(), type_of_inputs='tuples', + preprocessor=None, ensure_min_features=3, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('load_tuples, preprocessor', + [(tuples_prep, mock_preprocessor), + (tuples_no_prep, None), + (tuples_no_prep, mock_preprocessor)]) +def test_check_tuples_invalid_n_samples(estimator, context, load_tuples, + preprocessor): + """Checks that the right warning is printed if n_samples is too small""" + tuples = load_tuples() + msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " + "is required{}.".format((preprocess_tuples(tuples, preprocessor) + if (preprocessor is not None and + tuples.ndim == 2) else tuples).shape, + context)) + with pytest.raises(ValueError) as raised_error: + check_input(tuples, type_of_inputs='tuples', + preprocessor=preprocessor, + ensure_min_samples=3, estimator=estimator) + assert str(raised_error.value) == msg + + +def 
test_check_tuples_invalid_dtype_not_convertible_with_preprocessor(): + """Checks that a value error is thrown if attempting to convert an + input not convertible to float, when using a preprocessor + """ + + def preprocessor(indices): + # preprocessor that returns objects + return np.full((indices.shape[0], 3), 'a') + + with pytest.raises(ValueError): + check_input(tuples_prep(), type_of_inputs='tuples', + preprocessor=preprocessor, dtype=np.float64) + + +def test_check_tuples_invalid_dtype_not_convertible_without_preprocessor(): + """Checks that a value error is thrown if attempting to convert an + input not convertible to float, when using no preprocessor + """ + tuples = np.full_like(tuples_no_prep(), 'a', dtype=object) + with pytest.raises(ValueError): + check_input(tuples, type_of_inputs='tuples', + preprocessor=None, dtype=np.float64) + + +@pytest.mark.parametrize('tuple_size', [2, None]) +def test_check_tuples_valid_tuple_size(tuple_size): + """For inputs that have the right matrix dimension (2D or 3D for instance), + checks that checking the number of tuples (pairs, quadruplets, etc) raises + no warning if there is the right number of points in a tuple. 
+ """ + with warnings.catch_warnings(record=True) as record: + check_input(tuples_prep(), type_of_inputs='tuples', + preprocessor=mock_preprocessor, tuple_size=tuple_size) + check_input(tuples_no_prep(), type_of_inputs='tuples', preprocessor=None, + tuple_size=tuple_size) + assert len(record) == 0 + + +@pytest.mark.parametrize('tuples', + [np.array([[2.5, 0.1, 2.6], + [1.6, 4.8, 9.1]]), + np.array([[2, 0, 2], + [1, 4, 9]]), + np.array([["img1.png", "img3.png"], + ["img2.png", "img4.png"]]), + [[2, 0, 2], + [1, 4, 9]], + [np.array([2, 0, 2]), + np.array([1, 4, 9])], + ((2, 0, 2), + (1, 4, 9)), + np.array([[[1.2, 2.2], [1.4, 3.3]], + [[2.6, 2.3], [3.4, 5.0]]])]) +def test_check_tuples_valid_with_preprocessor(tuples): + """Test that valid inputs when using a preprocessor raises no warning""" + with warnings.catch_warnings(record=True) as record: + check_input(tuples, type_of_inputs='tuples', + preprocessor=mock_preprocessor) + assert len(record) == 0 + + +@pytest.mark.parametrize('tuples', + [np.array([[[2.5], [0.1], [2.6]], + [[1.6], [4.8], [9.1]], + [[5.6], [2.8], [6.1]]]), + np.array([[[2], [0], [2]], + [[1], [4], [9]], + [[1], [5], [3]]]), + [[[2], [0], [2]], + [[1], [4], [9]], + [[3], [4], [29]]], + (((2, 1), (0, 2), (2, 3)), + ((1, 2), (4, 4), (9, 3)), + ((3, 1), (4, 4), (29, 4)))]) +def test_check_tuples_valid_without_preprocessor(tuples): + """Test that valid inputs when using no preprocessor raises no warning""" + with warnings.catch_warnings(record=True) as record: + check_input(tuples, type_of_inputs='tuples', preprocessor=None) + assert len(record) == 0 + + +def test_check_tuples_behaviour_auto_dtype(): + """Checks that check_tuples allows by default every type if using a + preprocessor, and numeric types if using no preprocessor""" + tuples_prep = [['img1.png', 'img2.png'], ['img3.png', 'img5.png']] + with warnings.catch_warnings(record=True) as record: + check_input(tuples_prep, type_of_inputs='tuples', + preprocessor=mock_preprocessor) + assert 
len(record) == 0 + + with warnings.catch_warnings(record=True) as record: + check_input(tuples_no_prep(), type_of_inputs='tuples') # numeric type + assert len(record) == 0 + + # not numeric type + tuples_no_prep_bis = np.array([[['img1.png'], ['img2.png']], + [['img3.png'], ['img5.png']]]) + tuples_no_prep_bis = tuples_no_prep_bis.astype(object) + with pytest.raises(ValueError): + check_input(tuples_no_prep_bis, type_of_inputs='tuples') + + +def test_check_tuples_invalid_complex_data(): + """Checks that the right error message is thrown if given complex data ( + this comes from sklearn's check_array's message)""" + tuples = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], + [[1 + 3j, 2 + 4j], [5 + 8j, 1 + 7j]]]) + msg = ("Complex data not supported\n" + "{}\n".format(tuples)) + with pytest.raises(ValueError) as raised_error: + check_input(tuples, type_of_inputs='tuples') + assert str(raised_error.value) == msg + + +# ------------- test check_input with 'classic' type_of_inputs ---------------- + + +def points_prep(): + """Basic array for testing when using a preprocessor""" + points = np.array([1, 2]) + return points + + +def points_no_prep(): + """Basic array for testing when using no preprocessor""" + points = np.array([[1., 2.3], + [2.3, 4.3]]) + return points + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('points, found, expected, preprocessor', + [(5, '0', '1D array of indicators or 2D array of ' + 'formed points', mock_preprocessor), + (5, '0', '2D array of formed points', None), + ([1, 2], '1', '2D array of formed points', None), + ([[[5]]], '3', '1D array of indicators or 2D ' + 'array of formed points', + mock_preprocessor), + ([[[5]]], '3', '2D array of formed points', None)]) +def test_check_classic_invalid_shape(estimator, context, points, found, + expected, preprocessor): + """Checks that a value error with the appropriate message is raised if + shape is invalid 
(valid being 1D or 2D with preprocessor or 2D with no + preprocessor) + """ + points = np.array(points) + msg = ("{} expected{}{}. Found {}D array instead:\ninput={}. Reshape your " + "data{}.\n" + .format(expected, context, ' when using a preprocessor' + if preprocessor else '', found, points, + ' and/or use a preprocessor' if + (not preprocessor and points.ndim == 1) else '')) + with pytest.raises(ValueError) as raised_error: + check_input(points, type_of_inputs='classic', preprocessor=preprocessor, + ensure_min_samples=0, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +def test_check_classic_invalid_n_features(estimator, context): + """Checks that the right warning is printed if not enough features + Here we only test if no preprocessor (otherwise we don't ensure this) + """ + msg = ("Found array with 2 feature(s) (shape={}) while" + " a minimum of 3 is required{}.".format(points_no_prep().shape, + context)) + with pytest.raises(ValueError) as raised_error: + check_input(points_no_prep(), type_of_inputs='classic', + preprocessor=None, ensure_min_features=3, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('load_points, preprocessor', + [(points_prep, mock_preprocessor), + (points_no_prep, None), + (points_no_prep, mock_preprocessor)]) +def test_check_classic_invalid_n_samples(estimator, context, load_points, + preprocessor): + """Checks that the right warning is printed if n_samples is too small""" + points = load_points() + msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " + "is required{}.".format((preprocess_points(points, + preprocessor) + if preprocessor is not None and + points.ndim == 1 else + points).shape, + context)) + with pytest.raises(ValueError) as 
raised_error: + check_input(points, type_of_inputs='classic', preprocessor=preprocessor, + ensure_min_samples=3, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('preprocessor, points', + [(mock_preprocessor, np.array([['a', 'b'], + ['e', 'b']])), + (None, np.array([[['b', 'v'], ['a', 'd']], + [['x', 'u'], ['c', 'a']]]))]) +def test_check_classic_invalid_dtype_not_convertible(preprocessor, points): + """Checks that a value error is thrown if attempting to convert an + input not convertible to float + """ + with pytest.raises(ValueError): + check_input(points, type_of_inputs='classic', + preprocessor=preprocessor, dtype=np.float64) + + +@pytest.mark.parametrize('points', + [["img1.png", "img3.png", "img2.png"], + np.array(["img1.png", "img3.png", "img2.png"]), + [2, 0, 2, 1, 4, 9], + range(10), + np.array([2, 0, 2]), + (2, 0, 2), + np.array([[1.2, 2.2], + [2.6, 2.3]])]) +def test_check_classic_valid_with_preprocessor(points): + """Test that valid inputs when using a preprocessor raises no warning""" + with warnings.catch_warnings(record=True) as record: + check_input(points, type_of_inputs='classic', + preprocessor=mock_preprocessor) + assert len(record) == 0 + + +@pytest.mark.parametrize('points', + [np.array([[2.5, 0.1, 2.6], + [1.6, 4.8, 9.1], + [5.6, 2.8, 6.1]]), + np.array([[2, 0, 2], + [1, 4, 9], + [1, 5, 3]]), + [[2, 0, 2], + [1, 4, 9], + [3, 4, 29]], + ((2, 1, 0, 2, 2, 3), + (1, 2, 4, 4, 9, 3), + (3, 1, 4, 4, 29, 4))]) +def test_check_classic_valid_without_preprocessor(points): + """Test that valid inputs when using no preprocessor raises no warning""" + with warnings.catch_warnings(record=True) as record: + check_input(points, type_of_inputs='classic', preprocessor=None) + assert len(record) == 0 + + +def test_check_classic_by_default(): + """Checks that 'classic' is the default behaviour of check_input""" + assert (check_input([[2, 3], [3, 2]]) == + check_input([[2, 3], [3, 2]], type_of_inputs='classic')).all() + 
+ +def test_check_classic_behaviour_auto_dtype(): + """Checks that check_input (for points) allows by default every type if + using a preprocessor, and numeric types if using no preprocessor""" + points_prep = ['img1.png', 'img2.png', 'img3.png', 'img5.png'] + with warnings.catch_warnings(record=True) as record: + check_input(points_prep, type_of_inputs='classic', + preprocessor=mock_preprocessor) + assert len(record) == 0 + + with warnings.catch_warnings(record=True) as record: + check_input(points_no_prep(), type_of_inputs='classic') # numeric type + assert len(record) == 0 + + # not numeric type + points_no_prep_bis = np.array(['img1.png', 'img2.png', 'img3.png', + 'img5.png']) + points_no_prep_bis = points_no_prep_bis.astype(object) + with pytest.raises(ValueError): + check_input(points_no_prep_bis, type_of_inputs='classic') + + +def test_check_classic_invalid_complex_data(): + """Checks that the right error message is thrown if given complex data ( + this comes from sklearn's check_array's message)""" + points = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], + [[1 + 3j, 2 + 4j], [5 + 8j, 1 + 7j]]]) + msg = ("Complex data not supported\n" + "{}\n".format(points)) + with pytest.raises(ValueError) as raised_error: + check_input(points, type_of_inputs='classic') + assert str(raised_error.value) == msg + + +# ----------------------------- Test preprocessor ----------------------------- + + +X = np.array([[0.89, 0.11, 1.48, 0.12], + [2.63, 1.08, 1.68, 0.46], + [1.00, 0.59, 0.62, 1.15]]) + + +class MockFileLoader: + """Preprocessor that takes a root file path at construction and simulates + fetching the file in the specific root folder when given the name of the + file""" + + def __init__(self, root): + self.root = root + self.folders = {'fake_root': {'img0.png': X[0], + 'img1.png': X[1], + 'img2.png': X[2] + }, + 'other_folder': {} # empty folder + } + + def __call__(self, path_list): + images = list() + for path in path_list: + 
images.append(self.folders[self.root][path]) + return np.array(images) + + +def mock_id_loader(list_of_indicators): + """A preprocessor as a function that takes indicators (strings) and + returns the corresponding samples""" + points = [] + for indicator in list_of_indicators: + points.append(X[int(indicator[2:])]) + return np.array(points) + + +tuples_list = [np.array([[0, 1], + [2, 1]]), + + np.array([['img0.png', 'img1.png'], + ['img2.png', 'img1.png']]), + + np.array([['id0', 'id1'], + ['id2', 'id1']]) + ] + +points_list = [np.array([0, 1, 2, 1]), + + np.array(['img0.png', 'img1.png', 'img2.png', 'img1.png']), + + np.array(['id0', 'id1', 'id2', 'id1']) + ] + +preprocessors = [X, MockFileLoader('fake_root'), mock_id_loader] + + +@pytest.fixture +def y_tuples(): + y = [-1, 1] + return y + + +@pytest.fixture +def y_points(): + y = [0, 1, 0, 0] + return y + + +@pytest.mark.parametrize('preprocessor, tuples', zip(preprocessors, + tuples_list)) +def test_preprocessor_weakly_supervised(preprocessor, tuples, y_tuples): + """Tests different ways to use the preprocessor argument: an array, + a class callable, and a function callable, with a weakly supervised + algorithm + """ + nca = ITML(preprocessor=preprocessor) + nca.fit(tuples, y_tuples) + + +@pytest.mark.parametrize('preprocessor, points', zip(preprocessors, + points_list)) +def test_preprocessor_supervised(preprocessor, points, y_points): + """Tests different ways to use the preprocessor argument: an array, + a class callable, and a function callable, with a supervised algorithm + """ + lfda = LFDA(preprocessor=preprocessor) + lfda.fit(points, y_points) + + +@pytest.mark.parametrize('estimator', ['NCA', NCA(), None]) +def test_preprocess_tuples_invalid_message(estimator): + """Checks that if the preprocessor does some weird stuff, the preprocessed + input is detected as weird. 
Checks this for preprocess_tuples.""" + + context = make_context(estimator) + (' after the preprocessor ' + 'has been applied') + + def preprocessor(sequence): + return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D + + with pytest.raises(ValueError) as raised_error: + check_input(np.ones((3, 2)), type_of_inputs='tuples', + preprocessor=preprocessor, estimator=estimator) + expected_msg = ("3D array of formed tuples expected{}. Found 4D " + "array instead:\ninput={}. Reshape your data{}.\n" + .format(context, np.ones((3, 2, 2, 2)), + ' and/or use a preprocessor' if preprocessor + is not None else '')) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator', ['NCA', NCA(), None]) +def test_preprocess_points_invalid_message(estimator): + """Checks that if the preprocessor does some weird stuff, the preprocessed + input is detected as weird.""" + + context = make_context(estimator) + (' after the preprocessor ' + 'has been applied') + + def preprocessor(sequence): + return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D + + with pytest.raises(ValueError) as raised_error: + check_input(np.ones((3,)), type_of_inputs='classic', + preprocessor=preprocessor, estimator=estimator) + expected_msg = ("2D array of formed points expected{}. " + "Found 3D array instead:\ninput={}. 
Reshape your data{}.\n" + .format(context, np.ones((3, 2, 2)), + ' and/or use a preprocessor' if preprocessor + is not None else '')) + assert str(raised_error.value) == expected_msg + + +def test_preprocessor_error_message(): + """Tests whether the preprocessor returns a preprocessor error when there + is a problem using the preprocessor + """ + preprocessor = ArrayIndexer(np.array([[1.2, 3.3], [3.1, 3.2]])) + + # with tuples + X = np.array([[[2, 3], [3, 3]], [[2, 3], [3, 2]]]) + # There are less samples than the max index we want to preprocess + with pytest.raises(PreprocessorError): + preprocess_tuples(X, preprocessor) + + # with points + X = np.array([[1], [2], [3], [3]]) + with pytest.raises(PreprocessorError): + preprocess_points(X, preprocessor) + + +@pytest.mark.parametrize('input_data', [[[5, 3], [3, 2]], + ((5, 3), (3, 2)) + ]) +@pytest.mark.parametrize('indices', [[0, 1], (1, 0)]) +def test_array_like_indexer_array_like_valid_classic(input_data, indices): + """Checks that any array-like is valid in the 'preprocessor' argument, + and in the indices, for a classic input""" + class MockMetricLearner(MahalanobisMixin): + def fit(self): + pass + pass + + mock_algo = MockMetricLearner(preprocessor=input_data) + mock_algo._prepare_inputs(indices, type_of_inputs='classic') + + +@pytest.mark.parametrize('input_data', [[[5, 3], [3, 2]], + ((5, 3), (3, 2)) + ]) +@pytest.mark.parametrize('indices', [[[0, 1], [1, 0]], ((1, 0), (1, 0))]) +def test_array_like_indexer_array_like_valid_tuples(input_data, indices): + """Checks that any array-like is valid in the 'preprocessor' argument, + and in the indices, for a classic input""" + class MockMetricLearner(MahalanobisMixin): + def fit(self): + pass + pass + + mock_algo = MockMetricLearner(preprocessor=input_data) + mock_algo._prepare_inputs(indices, type_of_inputs='tuples') + + +@pytest.mark.parametrize('preprocessor', [4, NCA()]) +def test_error_message_check_preprocessor(preprocessor): + """Checks that if the 
preprocessor given is not an array-like or a + callable, the right error message is returned""" + class MockMetricLearner(MahalanobisMixin): + pass + + mock_algo = MockMetricLearner(preprocessor=preprocessor) + with pytest.raises(ValueError) as e: + mock_algo._check_preprocessor() + assert str(e.value) == ("Invalid type for the preprocessor: {}. You should " + "provide either None, an array-like object, " + "or a callable.".format(type(preprocessor))) + + +@pytest.mark.parametrize('estimator, _', tuples_learners, + ids=ids_tuples_learners) +def test_error_message_tuple_size(estimator, _): + """Tests that if a tuples learner is not given the good number of points + per tuple, it throws an error message""" + estimator = clone(estimator) + set_random_state(estimator) + invalid_pairs = np.ones((2, 5, 2)) + y = [1, 1] + with pytest.raises(ValueError) as raised_err: + estimator.fit(*remove_y(estimator, invalid_pairs, y)) + expected_msg = ("Tuples of {} element(s) expected{}. Got tuples of 5 " + "element(s) instead (shape=(2, 5, 2)):\ninput={}.\n" + .format(estimator._tuple_size, make_context(estimator), + invalid_pairs)) + assert str(raised_err.value) == expected_msg + + +@pytest.mark.parametrize('estimator, _', metric_learners, + ids=ids_metric_learners) +def test_error_message_t_pair_distance_or_score(estimator, _): + """Tests that if you want to pair_distance or pair_score on triplets + for instance, it returns the right error message + """ + estimator = clone(estimator) + set_random_state(estimator) + estimator._check_preprocessor() + triplets = np.array([[[1.3, 6.3], [3., 6.8], [6.5, 4.4]], + [[1.9, 5.3], [1., 7.8], [3.2, 1.2]]]) + with pytest.raises(ValueError) as raised_err: + estimator.pair_score(triplets) + expected_msg = ("Tuples of 2 element(s) expected{}. 
Got tuples of 3 " + "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" + .format(make_context(estimator), triplets)) + assert str(raised_err.value) == expected_msg + + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have pair_distance" + + # One exception will trigger for sure + with pytest.raises(Exception) as raised_exception: + estimator.pair_distance(triplets) + err_value = raised_exception.value.args[0] + assert err_value == expected_msg or err_value == not_implemented_msg + + +def test_preprocess_tuples_simple_example(): + """Test the preprocessor on a very simple example of tuples to ensure the + result is as expected""" + array = np.array([[1, 2], + [2, 3], + [4, 5]]) + + def fun(row): + return np.array([[1, 1], [3, 3], [4, 4]]) + + expected_result = np.array([[[1, 1], [1, 1]], + [[3, 3], [3, 3]], + [[4, 4], [4, 4]]]) + + assert (preprocess_tuples(array, fun) == expected_result).all() + + +def test_preprocess_points_simple_example(): + """Test the preprocessor on very simple examples of points to ensure the + result is as expected""" + array = np.array([1, 2, 4]) + + def fun(row): + return [[1, 1], [3, 3], [4, 4]] + + expected_result = np.array([[1, 1], + [3, 3], + [4, 4]]) + + assert (preprocess_points(array, fun) == expected_result).all() + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_same_with_or_without_preprocessor(estimator, build_dataset): + """Test that algorithms using a preprocessor behave consistently +# with their no-preprocessor equivalent + """ + dataset_indices = build_dataset(with_preprocessor=True) + dataset_formed = build_dataset(with_preprocessor=False) + X = dataset_indices.preprocessor + indicators_to_transform = dataset_indices.to_transform + formed_points_to_transform = dataset_formed.to_transform + (indices_train, indices_test, y_train, y_test, formed_train, + formed_test) = 
train_test_split(dataset_indices.data, + dataset_indices.target, + dataset_formed.data, + random_state=SEED) + + estimator_with_preprocessor = clone(estimator) + set_random_state(estimator_with_preprocessor) + estimator_with_preprocessor.set_params(preprocessor=X) + estimator_with_preprocessor.fit(*remove_y(estimator, indices_train, y_train)) + + estimator_without_preprocessor = clone(estimator) + set_random_state(estimator_without_preprocessor) + estimator_without_preprocessor.set_params(preprocessor=None) + estimator_without_preprocessor.fit(*remove_y(estimator, formed_train, + y_train)) + + estimator_with_prep_formed = clone(estimator) + set_random_state(estimator_with_prep_formed) + estimator_with_prep_formed.set_params(preprocessor=X) + estimator_with_prep_formed.fit(*remove_y(estimator, indices_train, y_train)) + + # test prediction methods + for method in ["predict", "decision_function"]: + if hasattr(estimator, method): + output_with_prep = getattr(estimator_with_preprocessor, + method)(indices_test) + output_without_prep = getattr(estimator_without_preprocessor, + method)(formed_test) + assert np.array(output_with_prep == output_without_prep).all() + output_with_prep = getattr(estimator_with_preprocessor, + method)(indices_test) + output_with_prep_formed = getattr(estimator_with_prep_formed, + method)(formed_test) + assert np.array(output_with_prep == output_with_prep_formed).all() + + # Test pair_score, all learners have it. 
+ idx1 = np.array([[0, 2], [5, 3]], dtype=int) + output_with_prep = estimator_with_preprocessor.pair_score( + indicators_to_transform[idx1]) + output_without_prep = estimator_without_preprocessor.pair_score( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.pair_score( + indicators_to_transform[idx1]) + output_without_prep = estimator_with_prep_formed.pair_score( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + # Test pair_distance + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have pair_distance" + try: + output_with_prep = estimator_with_preprocessor.pair_distance( + indicators_to_transform[idx1]) + output_without_prep = estimator_without_preprocessor.pair_distance( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.pair_distance( + indicators_to_transform[idx1]) + output_without_prep = estimator_with_prep_formed.pair_distance( + formed_points_to_transform[idx1]) + assert np.array(output_with_prep == output_without_prep).all() + + except Exception as raised_exception: + assert raised_exception.value.args[0] == not_implemented_msg + + # Test transform + not_implemented_msg = "" + # Todo in 0.7.0: Change 'not_implemented_msg' for the message that says + # "This learner does not have transform" + try: + output_with_prep = estimator_with_preprocessor.transform( + indicators_to_transform) + output_without_prep = estimator_without_preprocessor.transform( + formed_points_to_transform) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.transform( + indicators_to_transform) + output_without_prep = estimator_with_prep_formed.transform( + formed_points_to_transform) + assert 
np.array(output_with_prep == output_without_prep).all() + + except Exception as raised_exception: + assert raised_exception.value.args[0] == not_implemented_msg + + +def test_check_collapsed_pairs_raises_no_error(): + """Checks that check_collapsed_pairs raises no error if no collapsed pairs + is present""" + pairs_ok = np.array([[[0.1, 3.3], [3.3, 0.1]], + [[0.1, 3.3], [3.3, 0.1]], + [[2.5, 8.1], [0.1, 3.3]]]) + check_collapsed_pairs(pairs_ok) + + +def test_check_collapsed_pairs_raises_error(): + """Checks that check_collapsed_pairs raises no error if no collapsed pairs + is present""" + pairs_not_ok = np.array([[[0.1, 3.3], [0.1, 3.3]], + [[0.1, 3.3], [3.3, 0.1]], + [[2.5, 8.1], [2.5, 8.1]]]) + with pytest.raises(ValueError) as e: + check_collapsed_pairs(pairs_not_ok) + assert str(e.value) == ("2 collapsed pairs found (where the left element is " + "the same as the right element), out of 3 pairs in" + " total.") + + +def test__validate_vector(): + """Replica of scipy.spatial.tests.test_distance.test__validate_vector""" + x = [1, 2, 3] + y = validate_vector(x) + assert_array_equal(y, x) + + y = validate_vector(x, dtype=np.float64) + assert_array_equal(y, x) + assert_equal(y.dtype, np.float64) + + x = [1] + y = validate_vector(x) + assert_equal(y.ndim, 1) + assert_equal(y, x) + + x = 1 + y = validate_vector(x) + assert_equal(y.ndim, 1) + assert_equal(y, [x]) + + x = np.arange(5).reshape(1, -1, 1) + y = validate_vector(x) + assert_equal(y.ndim, 1) + assert_array_equal(y, x[0, :, 0]) + + x = [[1, 2], [3, 4]] + with pytest.raises(ValueError): + validate_vector(x) + + +def test__check_sdp_from_eigen_positive_err_messages(): + """Tests that if _check_sdp_from_eigen is given a negative tol it returns + an error, and if positive (or None) it does not""" + w = np.abs(np.random.RandomState(42).randn(10)) + 1 + with pytest.raises(ValueError) as raised_error: + _check_sdp_from_eigen(w, -5.) + assert str(raised_error.value) == "tol should be positive." 
+ with pytest.raises(ValueError) as raised_error: + _check_sdp_from_eigen(w, -1e-10) + assert str(raised_error.value) == "tol should be positive." + _check_sdp_from_eigen(w, 1.) + _check_sdp_from_eigen(w, 0.) + _check_sdp_from_eigen(w, None) + + +@pytest.mark.unit +@pytest.mark.parametrize('w', [np.array([-1.2, 5.5, 6.6]), + np.array([-1.2, -5.6])]) +def test__check_sdp_from_eigen_positive_eigenvalues(w): + """Tests that _check_sdp_from_eigen, returns a NonPSDError when + the eigenvalues are negatives or null.""" + with pytest.raises(NonPSDError): + _check_sdp_from_eigen(w) + + +@pytest.mark.unit +@pytest.mark.parametrize('w', [np.array([0., 2.3, 5.3]), + np.array([1e-20, 3.5]), + np.array([1.5, 2.4, 4.6])]) +def test__check_sdp_from_eigen_negative_eigenvalues(w): + """Tests that _check_sdp_from_eigen, returns no error when the + eigenvalues are positive.""" + _check_sdp_from_eigen(w) + + +@pytest.mark.unit +@pytest.mark.parametrize('w, is_definite', [(np.array([1e-15, 5.6]), False), + (np.array([-1e-15, 5.6]), False), + (np.array([3.2, 5.6, 0.01]), True), + ]) +def test__check_sdp_from_eigen_returns_definiteness(w, is_definite): + """Tests that _check_sdp_from_eigen returns the definiteness of the + matrix (when it is PSD), based on the given eigenvalues""" + assert _check_sdp_from_eigen(w) == is_definite + + +@pytest.mark.unit +@pytest.mark.parametrize('w, tol, is_definite', + [(np.array([5., 3.]), 2, True), + (np.array([5., 1.]), 2, False), + (np.array([5., -1.]), 2, False)]) +def test__check_sdp_from_eigen_tol_psd(w, tol, is_definite): + """Tests that _check_sdp_from_eigen, for PSD matrices, returns + False if an eigenvalue is lower than tol""" + assert _check_sdp_from_eigen(w, tol=tol) == is_definite + + +@pytest.mark.unit +@pytest.mark.parametrize('w, tol', + [(np.array([5., -3.]), 2), + (np.array([1., -3.]), 2)]) +def test__check_sdp_from_eigen_tol_non_psd(w, tol): + """Tests that _check_sdp_from_eigen raises a NonPSDError + when there is a negative value 
with abs value higher than tol""" + with pytest.raises(NonPSDError): + _check_sdp_from_eigen(w, tol=tol) + + +@pytest.mark.unit +@pytest.mark.parametrize('w, is_definite', + [(np.array([1e5, 1e5, 1e5, 1e5, + 1e5, 1e5, 1e-20]), False), + (np.array([1e-10, 1e-10]), True)]) +def test__check_sdp_from_eigen_tol_default_psd(w, is_definite): + """Tests that the default tol argument gives good results for edge cases + like even if the determinant is high but clearly one eigenvalue is low, + (undefinite so returns False) or when all eigenvalues are low (definite so + returns True)""" + assert _check_sdp_from_eigen(w, tol=None) == is_definite + + +@pytest.mark.unit +@pytest.mark.parametrize('w', + [np.array([1., -1.]), + np.array([-1e-10, 1e-10])]) +def test__check_sdp_from_eigen_tol_default_non_psd(w): + """Tests that the default tol argument is good for raising + NonPSDError, e.g. that when a value is clearly relatively + negative it raises such an error""" + with pytest.raises(NonPSDError): + _check_sdp_from_eigen(w, tol=None) + + +def test__check_n_components(): + """Checks that n_components returns what is expected + (including the errors)""" + dim = _check_n_components(5, None) + assert dim == 5 + + dim = _check_n_components(5, 3) + assert dim == 3 + + with pytest.raises(ValueError) as expected_err: + _check_n_components(5, 10) + assert str(expected_err.value) == 'Invalid n_components, must be in [1, 5]' + + with pytest.raises(ValueError) as expected_err: + _check_n_components(5, 0) + assert str(expected_err.value) == 'Invalid n_components, must be in [1, 5]' + + +@pytest.mark.unit +@pytest.mark.parametrize('wrong_labels', + [[0.5, 0.6, 0.7, 0.8, 0.9], + np.random.RandomState(42).randn(5), + np.random.RandomState(42).choice([0, 1], size=5)]) +def test_check_y_valid_values_for_pairs(wrong_labels): + expected_msg = ("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. 
Found an incorrect value.") + with pytest.raises(ValueError) as raised_error: + check_y_valid_values_for_pairs(wrong_labels) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.integration +@pytest.mark.parametrize('wrong_labels', + [[0.5, 0.6, 0.7, 0.8, 0.9], + np.random.RandomState(42).randn(5), + np.random.RandomState(42).choice([0, 1], size=5)]) +def test_check_input_invalid_tuples_without_preprocessor(wrong_labels): + pairs = np.random.RandomState(42).randn(5, 2, 3) + expected_msg = ("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. Found an incorrect value.") + with pytest.raises(ValueError) as raised_error: + check_input(pairs, wrong_labels, preprocessor=None, + type_of_inputs='tuples') + assert str(raised_error.value) == expected_msg + + +@pytest.mark.integration +@pytest.mark.parametrize('wrong_labels', + [[0.5, 0.6, 0.7, 0.8, 0.9], + np.random.RandomState(42).randn(5), + np.random.RandomState(42).choice([0, 1], size=5)]) +def test_check_input_invalid_tuples_with_preprocessor(wrong_labels): + n_samples, n_features, n_pairs = 10, 4, 5 + rng = np.random.RandomState(42) + pairs = rng.randint(10, size=(n_pairs, 2)) + preprocessor = rng.randn(n_samples, n_features) + expected_msg = ("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. 
Found an incorrect value.") + with pytest.raises(ValueError) as raised_error: + check_input(pairs, wrong_labels, preprocessor=ArrayIndexer(preprocessor), + type_of_inputs='tuples') + assert str(raised_error.value) == expected_msg + + +@pytest.mark.integration +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_check_input_pairs_learners_invalid_y(estimator, build_dataset, + with_preprocessor): + """checks that the only allowed labels for learning pairs are +1 and -1""" + input_data, labels, _, X = build_dataset() + wrong_labels_list = [labels + 0.5, + np.random.RandomState(42).randn(len(labels)), + np.random.RandomState(42).choice([0, 1], + size=len(labels))] + model = clone(estimator) + set_random_state(model) + + expected_msg = ("When training on pairs, the labels (y) should contain " + "only values in [-1, 1]. Found an incorrect value.") + + for wrong_labels in wrong_labels_list: + with pytest.raises(ValueError) as raised_error: + model.fit(input_data, wrong_labels) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('has_classes, n_features, n_samples, n_components, ' + 'n_classes, result', + [(False, 3, 20, 3, 0, 'identity'), + (False, 3, 2, 3, 0, 'identity'), + (False, 5, 3, 4, 0, 'identity'), + (False, 4, 5, 3, 0, 'pca'), + (True, 5, 6, 3, 4, 'lda'), + (True, 6, 3, 3, 3, 'identity'), + (True, 5, 6, 4, 2, 'pca'), + (True, 2, 6, 2, 10, 'lda'), + (True, 4, 6, 2, 3, 'lda') + ]) +def test__auto_select_init(has_classes, n_features, n_samples, n_components, + n_classes, + result): + """Checks that the auto selection of the init works as expected""" + assert (_auto_select_init(has_classes, n_features, + n_samples, n_components, n_classes) == result) + + +@pytest.mark.parametrize('w0', [1e-20, 0., -1e-20]) +def test_pseudo_inverse_from_eig_and_pinvh_singular(w0): + """Checks that _pseudo_inverse_from_eig returns the same 
result as + scipy.linalg.pinvh for a singular matrix""" + rng = np.random.RandomState(SEED) + A = rng.rand(100, 100) + A = A + A.T + w, V = eigh(A) + w[0] = w0 + A = V.dot(np.diag(w)).dot(V.T) + np.testing.assert_allclose(_pseudo_inverse_from_eig(w, V), pinvh(A), + rtol=1e-05) + + +def test_pseudo_inverse_from_eig_and_pinvh_nonsingular(): + """Checks that _pseudo_inverse_from_eig returns the same result as + scipy.linalg.pinvh for a non singular matrix""" + rng = np.random.RandomState(SEED) + A = rng.rand(100, 100) + A = A + A.T + w, V = eigh(A, check_finite=False) + np.testing.assert_allclose(_pseudo_inverse_from_eig(w, V), pinvh(A))