[MRG] Openml loader #9908

Closed
wants to merge 32 commits into from
Commits
268f533
start on openml dataset loader
amueller Oct 10, 2017
4f3e93e
working on stuff
amueller Oct 11, 2017
fabaa90
first version working
amueller Oct 12, 2017
1804b14
docstrings, use version="active" as default
amueller Oct 12, 2017
ffd4335
add caching to openml loader
amueller Oct 12, 2017
fe0904b
pep8 annoyance
amueller Oct 12, 2017
eea026e
fix download url, allow datasets without target
amueller Oct 12, 2017
bca12e9
allow specifying the target column, starting on docs
amueller Oct 12, 2017
f59ce8b
add openml to the narrative docs
amueller Oct 12, 2017
d7dee6d
get more people to upload stuff to openml.
amueller Oct 12, 2017
4c19ad9
store metadata, convert to dtype object if there is nominal data.
amueller Oct 12, 2017
16b7fed
fix doctests, add fetch_openml to __init__
amueller Oct 12, 2017
b3f6c36
make arff reading work in python2.7
amueller Oct 17, 2017
dc401f2
ignore doctests for now because of unicode issues
amueller Oct 24, 2017
d8cfd37
add version filter.
amueller Oct 24, 2017
6f6bb57
some typos, addressing joel's comments, working on better errors
amueller Nov 14, 2017
b5c72d9
nicer error message on non-existing ID
amueller Nov 15, 2017
64483f8
minor improvements to data wrangling
amueller Nov 15, 2017
26aaff2
allow downloading inactive datasets if specified by name and version
amueller Nov 15, 2017
b3b9276
update mice version 4 dataset id
amueller Nov 15, 2017
b2c283a
Merge branch 'master' of github.com:scikit-learn/scikit-learn into op…
amueller Nov 15, 2017
7e91c71
add whatsnew entry
amueller Nov 15, 2017
11909d5
add unicode and normalize whitespace flags to pytest config
amueller Nov 15, 2017
126f406
Merge branch 'master' into openml_loader
amueller Nov 22, 2017
7e16203
add test for fetch_openml
amueller Nov 22, 2017
8dcb26b
test error messages
amueller Nov 22, 2017
0d562b6
fix command for make test-coverage
amueller Nov 22, 2017
c2266c5
Merge branch 'fix_test_coverage' into openml_loader
amueller Nov 22, 2017
e274ad3
make flake8 green
amueller Nov 22, 2017
eb39a01
py35 compatiility
amueller Nov 22, 2017
c56b549
Merge branch 'master' into openml_loader
amueller Dec 21, 2017
67825e8
trying to use CSV interface
amueller Dec 21, 2017
3 changes: 3 additions & 0 deletions doc/datasets/index.rst
@@ -318,6 +318,7 @@ writing data in that format.
olivetti_faces
twenty_newsgroups
mldata
openml
labeled_faces
covtype
rcv1
@@ -328,6 +329,8 @@ writing data in that format.

.. include:: twenty_newsgroups.rst

.. include:: openml.rst

.. include:: mldata.rst

.. include:: labeled_faces.rst
5 changes: 3 additions & 2 deletions doc/datasets/mldata.rst
@@ -16,9 +16,10 @@ Downloading datasets from the mldata.org repository

`mldata.org <http://mldata.org>`_ is a public repository for machine learning
data, supported by the `PASCAL network <http://www.pascal-network.org>`_ .
It is no longer actively maintained, and it's suggested to use :ref:`openml` instead.

The ``sklearn.datasets`` package is able to directly download data
sets from the repository using the function
The ``sklearn.datasets`` package is able to directly download datasets
from the repository using the function
:func:`sklearn.datasets.fetch_mldata`.

For example, to download the MNIST digit recognition database::
146 changes: 146 additions & 0 deletions doc/datasets/openml.rst
@@ -0,0 +1,146 @@
..
For doctests:

>>> import numpy as np
>>> import os
>>> import tempfile
>>> # Create a temporary folder for the data fetcher
>>> custom_data_home = tempfile.mkdtemp()
>>> os.makedirs(os.path.join(custom_data_home, 'openml'))


.. _openml:

Downloading datasets from the openml.org repository
===================================================

`openml.org <https://openml.org>`_ is a public repository for machine learning
data and experiments that allows everybody to upload open datasets.

The ``sklearn.datasets`` package is able to directly download datasets
from the repository using the function
:func:`sklearn.datasets.fetch_openml`.

For example, to download a dataset of gene expressions in mice brains::

>>> from sklearn.datasets import fetch_openml
>>> mice = fetch_openml('miceprotein', version=4, data_home=custom_data_home)

To fully specify a dataset, you need to provide a name and a version, though
the version is optional; see :ref:`openml_versions` below.
The dataset contains a total of 1080 examples belonging to 8 different classes::

>>> mice.data.shape
(1080, 81)
>>> mice.target.shape
(1080,)
>>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
array([b"'c-CS-m'", b"'c-CS-s'", b"'c-SC-m'", b"'c-SC-s'", b"'t-CS-m'",
b"'t-CS-s'", b"'t-SC-m'", b"'t-SC-s'"], dtype='|S8')
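The targets come back from the ARFF file as raw byte strings. If integer class codes are needed, numpy can produce them directly; a minimal sketch (using a shortened, made-up subset of the labels above, not the full mice target array):

```python
import numpy as np

# Sketch: np.unique with return_inverse=True maps byte-string class
# labels to integer codes (label list abbreviated for illustration).
target = np.array([b"'c-CS-m'", b"'t-SC-s'", b"'c-CS-m'", b"'c-CS-s'"])
classes, y = np.unique(target, return_inverse=True)
print(classes)  # the sorted distinct labels
print(y)        # -> [0 2 0 1]
```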

You can get more information on the dataset by looking at the ``DESCR``
and ``details`` attributes::

>>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
**Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015
**Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing
Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down
Syndrome. PLoS ONE 10(6): e0129126...

>>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
{'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
'creator': ...,
'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url':
'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id':
'1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C,
Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins
Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6):
e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14',
'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum':
'3c479a6885bfa0438971388283a1ce32'}


The ``DESCR`` contains a free-text description of the data, while ``details``
contains a dictionary of meta-data stored by openml, like the dataset id.
The id of the mice protein dataset is 40966, and you can use this (or the name)
to get more information on the dataset on the openml website::

>>> print(mice.url)
https://www.openml.org/d/40966
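``details`` is an ordinary Python dict, so the metadata can be inspected like any mapping; a minimal sketch using a few fields copied from the abridged output above (not the full metadata record):

```python
# Sketch: a few fields from the `details` metadata dict shown above,
# abridged -- not the complete record returned by the loader.
details = {
    'name': 'MiceProtein',
    'version': '1',
    'default_target_attribute': 'class',
    'status': 'active',
}
# The default target attribute names the column that becomes `target`.
print("%s v%s, target column %r"
      % (details['name'], details['version'],
         details['default_target_attribute']))
```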

The id is also the best way to specify how to fetch a dataset from OpenML::

>>> mice = fetch_openml(40966, data_home=custom_data_home)
>>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
{'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
'creator': ...,
'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url':
'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id':
'1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C,
Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins
Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6):
e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14',
'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum':
'3c479a6885bfa0438971388283a1ce32'}

.. _openml_versions:

Dataset Versions
----------------

A dataset is uniquely specified by its id, but not necessarily by its name.
Several different "versions" of a dataset with the same name can exist, and
these versions can contain entirely different data.
If a particular version of a dataset has been found to contain significant
issues, it might be inactivated. Using a name to specify a dataset will yield
the earliest version of a dataset that is still active. That means that
``fetch_openml("miceprotein")`` can yield different results at different times
if earlier versions become inactive.
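The resolution rule can be sketched as follows. This is only an illustration of the rule described above, not scikit-learn's actual implementation, and the records are invented for the example:

```python
# Illustration of the name-resolution rule above -- NOT scikit-learn's
# internal code. The records below are made up for the example.
def resolve_by_name(records):
    """Pick the lowest still-active version, as fetching by name does."""
    active = [r for r in records if r['status'] == 'active']
    if not active:
        raise ValueError("no active version for this name")
    return min(active, key=lambda r: r['version'])

records = [
    {'id': 1001, 'version': 1, 'status': 'deactivated'},
    {'id': 1002, 'version': 2, 'status': 'active'},
    {'id': 1003, 'version': 3, 'status': 'active'},
]
# Version 1 is inactive, so resolving by name yields version 2.
print(resolve_by_name(records)['id'])  # -> 1002
```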
You can see that the dataset with id 40966 that we fetched above is the version 1
of the "miceprotein" dataset::

>>> mice.details['version'] #doctest: +SKIP
'1'

In fact, this dataset only has one version. The iris dataset on the other hand
has multiple versions::

>>> iris = fetch_openml("iris", data_home=custom_data_home)
>>> iris.details['version'] #doctest: +SKIP
'1'
>>> iris.details['id'] #doctest: +SKIP
'61'

>>> iris_61 = fetch_openml(61, data_home=custom_data_home)
>>> iris_61.details['version'] #doctest: +SKIP
'1'
>>> iris_61.details['id'] #doctest: +SKIP
'61'

>>> iris_969 = fetch_openml(969, data_home=custom_data_home)
>>> iris_969.details['version'] #doctest: +SKIP
'3'
>>> iris_969.details['id'] #doctest: +SKIP
'969'

Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61.
To make sure you always get this exact dataset, it is safest to specify it by the dataset id.
The other dataset, with id 969, is version 3 (version 2 has become inactive), and contains
a binarized version of the data::

>>> np.unique(iris_969.target) #doctest: +SKIP
array([b'N', b'P'],
dtype='|S1')
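If plain text labels are preferred to the byte strings shown above, numpy can decode them; a small sketch (with a made-up target array of the same dtype):

```python
import numpy as np

# Sketch: decoding byte-string labels like those above into unicode text.
# The array here is invented; it only mimics the '|S1' dtype shown above.
target = np.array([b'N', b'P', b'P', b'N'], dtype='|S1')
decoded = np.char.decode(target, 'ascii')
print(np.unique(decoded))  # -> ['N' 'P']
```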

You can also specify both the name and the version, which also uniquely identifies the dataset::

>>> iris_version_3 = fetch_openml("iris", version=3, data_home=custom_data_home)
>>> iris_version_3.details['version']
'3'
>>> iris_version_3.details['id']
'969'


..
>>> import shutil
>>> shutil.rmtree(custom_data_home)
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -228,6 +228,7 @@ Loaders
datasets.fetch_lfw_people
datasets.fetch_mldata
datasets.fetch_olivetti_faces
datasets.fetch_openml
datasets.fetch_rcv1
datasets.fetch_species_distributions
datasets.get_data_home
10 changes: 8 additions & 2 deletions doc/whats_new/v0.20.rst
@@ -49,6 +49,9 @@ Classifiers and regressors
Naive Bayes classifier described in Rennie et al. (2003).
:issue:`8190` by :user:`Michael A. Alcorn <airalcorn2>`.

- Added :class:`multioutput.RegressorChain` for multi-target
regression. :issue:`9257` by :user:`Kumar Ashutosh <thechargedneutron>`.

Preprocessing

- Added :class:`preprocessing.CategoricalEncoder`, which allows to encode
@@ -74,8 +77,11 @@ Model evaluation
``'balanced_accuracy'`` scorer for binary classification.
:issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia <dalmia>`.

- Added :class:`multioutput.RegressorChain` for multi-target
regression. :issue:`9257` by :user:`Kumar Ashutosh <thechargedneutron>`.
Datasets

- Added :func:`datasets.fetch_openml` to fetch any dataset from `OpenML <http://openml.org>`_.
  OpenML is a free, open data sharing platform and will replace mldata, which
  is no longer maintained. :issue:`9908` by `Andreas Müller`_.

Enhancements
............
1 change: 1 addition & 0 deletions setup.cfg
@@ -7,6 +7,7 @@ test = pytest
addopts =
--doctest-modules
--disable-pytest-warnings
doctest_optionflags = NORMALIZE_WHITESPACE ALLOW_UNICODE

[wheelhouse_uploader]
artifact_indexes=
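The ``NORMALIZE_WHITESPACE`` flag added to ``setup.cfg`` above makes doctest treat any run of whitespace in expected and actual output as equivalent. A small sketch of the effect, using the standard library's ``doctest.OutputChecker``:

```python
import doctest

# Sketch: NORMALIZE_WHITESPACE makes doctest compare output with any run
# of whitespace treated as a single separator.
checker = doctest.OutputChecker()
want, got = "a b\n", "a   b\n"
print(checker.check_output(want, got, 0))                             # -> False
print(checker.check_output(want, got, doctest.NORMALIZE_WHITESPACE))  # -> True
```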
2 changes: 2 additions & 0 deletions sklearn/datasets/__init__.py
@@ -23,6 +23,7 @@
from .twenty_newsgroups import fetch_20newsgroups
from .twenty_newsgroups import fetch_20newsgroups_vectorized
from .mldata import fetch_mldata, mldata_filename
from .openml import fetch_openml
from .samples_generator import make_classification
from .samples_generator import make_multilabel_classification
from .samples_generator import make_hastie_10_2
@@ -65,6 +66,7 @@
'fetch_covtype',
'fetch_rcv1',
'fetch_kddcup99',
'fetch_openml',
'get_data_home',
'load_boston',
'load_diabetes',