diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst
index 1316d596f50f1..1dcdec8550f9a 100644
--- a/doc/datasets/index.rst
+++ b/doc/datasets/index.rst
@@ -318,6 +318,7 @@ writing data in that format.
    olivetti_faces
    twenty_newsgroups
    mldata
+   openml
    labeled_faces
    covtype
    rcv1
@@ -328,6 +329,8 @@ writing data in that format.
 
 .. include:: twenty_newsgroups.rst
 
+.. include:: openml.rst
+
 .. include:: mldata.rst
 
 .. include:: labeled_faces.rst
diff --git a/doc/datasets/mldata.rst b/doc/datasets/mldata.rst
index b94dfd7620a24..60546bfcfd363 100644
--- a/doc/datasets/mldata.rst
+++ b/doc/datasets/mldata.rst
@@ -16,9 +16,10 @@ Downloading datasets from the mldata.org repository
 
 `mldata.org <http://mldata.org>`_ is a public repository for machine learning
 data, supported by the `PASCAL network <http://www.pascal-network.org>`_ .
+It is no longer actively maintained, and it is recommended to use :ref:`openml` instead.
 
-The ``sklearn.datasets`` package is able to directly download data
-sets from the repository using the function
+The ``sklearn.datasets`` package is able to directly download datasets
+from the repository using the function
 :func:`sklearn.datasets.fetch_mldata`.
 
 For example, to download the MNIST digit recognition database::
diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst
new file mode 100644
index 0000000000000..60fc090cdecfc
--- /dev/null
+++ b/doc/datasets/openml.rst
@@ -0,0 +1,146 @@
+..
+    For doctests:
+
+    >>> import numpy as np
+    >>> import os
+    >>> import tempfile
+    >>> # Create a temporary folder for the data fetcher
+    >>> custom_data_home = tempfile.mkdtemp()
+    >>> os.makedirs(os.path.join(custom_data_home, 'openml'))
+
+
+.. _openml:
+
+Downloading datasets from the openml.org repository
+====================================================
+
+`openml.org <https://openml.org>`_ is a public repository for machine
+learning data and experiments, which allows everybody to upload open
+datasets.
+
+The ``sklearn.datasets`` package is able to directly download datasets
+from the repository using the function
+:func:`sklearn.datasets.fetch_openml`.
+
+For example, to download a dataset of gene expressions in mice brains::
+
+  >>> from sklearn.datasets import fetch_openml
+  >>> mice = fetch_openml('miceprotein', version=4, data_home=custom_data_home)
+
+A dataset is fully specified by a name and a version, though the version
+is optional (see :ref:`openml_versions` below).
+The dataset contains a total of 1080 examples belonging to 8 different
+classes::
+
+  >>> mice.data.shape
+  (1080, 81)
+  >>> mice.target.shape
+  (1080,)
+  >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
+  array([b"'c-CS-m'", b"'c-CS-s'", b"'c-SC-m'", b"'c-SC-s'", b"'t-CS-m'",
+         b"'t-CS-s'", b"'t-SC-m'", b"'t-SC-s'"], dtype='|S8')
+
+You can get more information on the dataset by looking at the ``DESCR``
+and ``details`` attributes::
+
+  >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
+  **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios
+  **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015
+  **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing
+  Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down
+  Syndrome. PLoS ONE 10(6): e0129126...
+
+  >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
+  {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
+  'creator': ...,
+  'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url':
+  'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id':
+  '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C,
+  Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins
+  Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6):
+  e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14',
+  'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum':
+  '3c479a6885bfa0438971388283a1ce32'}
+
+
+The ``DESCR`` contains a free-text description of the data, while ``details``
+contains a dictionary of meta-data stored by OpenML, like the dataset id.
+The id of the mice protein dataset is 40966, and you can use this (or the
+name) to get more information on the dataset on the openml website::
+
+  >>> print(mice.url)
+  https://www.openml.org/d/40966
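+
+The ``data`` and ``target`` attributes are plain numpy arrays, so they can
+be passed directly to a scikit-learn estimator. The following is only a
+minimal sketch (this dataset contains missing values, which we impute
+first; the resulting model is merely illustrative)::
+
+  >>> from sklearn.pipeline import make_pipeline
+  >>> from sklearn.preprocessing import Imputer
+  >>> from sklearn.tree import DecisionTreeClassifier
+  >>> model = make_pipeline(Imputer(), DecisionTreeClassifier())
+  >>> model.fit(mice.data, mice.target) # doctest: +SKIP
+  Pipeline(...)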
+
+The id is also the best way to specify how to fetch a dataset from OpenML::
+
+  >>> mice = fetch_openml(40966, data_home=custom_data_home)
+  >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
+  {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
+  'creator': ...,
+  'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url':
+  'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id':
+  '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C,
+  Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins
+  Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6):
+  e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14',
+  'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum':
+  '3c479a6885bfa0438971388283a1ce32'}
+
+.. _openml_versions:
+
+Dataset Versions
+----------------
+
+A dataset is uniquely specified by its id, but not necessarily by its name.
+Several "versions" of a dataset with the same name can exist, and they can
+contain entirely different data.
+If a particular version of a dataset has been found to contain significant
+issues, it might be deactivated. Using a name to specify a dataset will yield
+the earliest version of the dataset that is still active. That means that
+``fetch_openml("miceprotein")`` can yield different results at different
+times if earlier versions become inactive.
+You can see that the dataset with id 40966 that we fetched above is version 1
+of the "miceprotein" dataset::
+
+  >>> mice.details['version'] #doctest: +SKIP
+  '1'
+
+In fact, this dataset only has one version. The iris dataset, on the other
+hand, has multiple versions::
+
+  >>> iris = fetch_openml("iris", data_home=custom_data_home)
+  >>> iris.details['version'] #doctest: +SKIP
+  '1'
+  >>> iris.details['id'] #doctest: +SKIP
+  '61'
+
+  >>> iris_61 = fetch_openml(61, data_home=custom_data_home)
+  >>> iris_61.details['version'] #doctest: +SKIP
+  '1'
+  >>> iris_61.details['id'] #doctest: +SKIP
+  '61'
+
+  >>> iris_969 = fetch_openml(969, data_home=custom_data_home)
+  >>> iris_969.details['version'] #doctest: +SKIP
+  '3'
+  >>> iris_969.details['id'] #doctest: +SKIP
+  '969'
+
+Specifying the dataset by the name "iris" yields the lowest version, version 1,
+with the id 61. To make sure you always get this exact dataset, it is safest
+to specify it by the dataset id.
+The other dataset, with id 969, is version 3 (version 2 has become inactive),
+and contains a binarized version of the data::
+
+  >>> np.unique(iris_969.target) #doctest: +SKIP
+  array([b'N', b'P'],
+        dtype='|S1')
+
+You can also specify both the name and the version, which also uniquely
+identifies the dataset::
+
+  >>> iris_version_3 = fetch_openml("iris", version=3, data_home=custom_data_home)
+  >>> iris_version_3.details['version']
+  '3'
+  >>> iris_version_3.details['id']
+  '969'
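+
+Independently of the version, the ``target_column`` parameter controls which
+column is returned as ``target``. By default the column marked as the default
+target on the server is used; passing ``target_column=None`` instead returns
+all columns (including the class column) as ``data``. A minimal sketch of
+that behaviour (the shape below assumes the iris dataset fetched above)::
+
+  >>> iris_all = fetch_openml(61, target_column=None, data_home=custom_data_home) # doctest: +SKIP
+  >>> iris_all.data.shape # doctest: +SKIP
+  (150, 5)
+  >>> iris_all.target is None # doctest: +SKIP
+  True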
+
+
+..
+    >>> import shutil
+    >>> shutil.rmtree(custom_data_home)
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 2792ba8484664..2aad173950838 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -228,6 +228,7 @@ Loaders
    datasets.fetch_lfw_people
    datasets.fetch_mldata
    datasets.fetch_olivetti_faces
+   datasets.fetch_openml
    datasets.fetch_rcv1
    datasets.fetch_species_distributions
    datasets.get_data_home
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index b43f0ccdbbbdd..89b380622bc48 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -49,6 +49,9 @@ Classifiers and regressors
   Naive Bayes classifier described in Rennie et al. (2003).
   :issue:`8190` by :user:`Michael A. Alcorn `.
 
+- Added :class:`multioutput.RegressorChain` for multi-target
+  regression. :issue:`9257` by :user:`Kumar Ashutosh `.
+
 Preprocessing
 
 - Added :class:`preprocessing.CategoricalEncoder`, which allows to encode
@@ -74,8 +77,11 @@ Model evaluation
   ``'balanced_accuracy'`` scorer for binary classification. :issue:`8066`
   by :user:`xyguo` and :user:`Aman Dalmia `.
 
-- Added :class:`multioutput.RegressorChain` for multi-target
-  regression. :issue:`9257` by :user:`Kumar Ashutosh `.
+Datasets
+
+- Added :func:`datasets.fetch_openml` to fetch any dataset from
+  `OpenML <https://openml.org>`_. OpenML is a free, open data sharing platform
+  and will replace mldata, which is no longer maintained. :issue:`9908` by
+  `Andreas Müller`_.
 
 Enhancements
 ............
diff --git a/setup.cfg b/setup.cfg
index f96e9cf9f85ab..0ca865a1e4648 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -7,6 +7,7 @@ test = pytest
 addopts =
     --doctest-modules
     --disable-pytest-warnings
+doctest_optionflags = NORMALIZE_WHITESPACE ALLOW_UNICODE
 
 [wheelhouse_uploader]
 artifact_indexes=
diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py
index c43c0c4758b10..c7d78e633493d 100644
--- a/sklearn/datasets/__init__.py
+++ b/sklearn/datasets/__init__.py
@@ -23,6 +23,7 @@
 from .twenty_newsgroups import fetch_20newsgroups
 from .twenty_newsgroups import fetch_20newsgroups_vectorized
 from .mldata import fetch_mldata, mldata_filename
+from .openml import fetch_openml
 from .samples_generator import make_classification
 from .samples_generator import make_multilabel_classification
 from .samples_generator import make_hastie_10_2
@@ -65,6 +66,7 @@
     'fetch_covtype',
     'fetch_rcv1',
     'fetch_kddcup99',
+    'fetch_openml',
     'get_data_home',
     'load_boston',
     'load_diabetes',
diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
new file mode 100644
index 0000000000000..002935518c378
--- /dev/null
+++ b/sklearn/datasets/openml.py
@@ -0,0 +1,232 @@
+import json
+import numbers
+import sys
+import os
+from os.path import join, exists
+from warnings import warn
+
+try:
+    # Python 2
+    from urllib2 import urlopen
+except ImportError:
+    # Python 3+
+    from urllib.request import urlopen
+
+
+from scipy.io.arff import loadarff
+import numpy as np
+
+from .base import get_data_home
+from ..externals.joblib import Memory
+from ..externals.six import StringIO, string_types
+from ..externals.six.moves.urllib.error import HTTPError
+from ..utils import Bunch
+
+_SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1"
+_DATA_INFO = "https://openml.org/api/v1/json/data/{}"
+_DATA_FEATURES = "https://openml.org/api/v1/json/data/features/{}"
+
+
+def _get_data_info_by_name(name, version):
+    data_found = True
+    try:
+        if version == "active":
+            json_string = urlopen(_SEARCH_NAME.format(name +
+                                                      "/status/active/"))
+        else:
+            json_string = urlopen(_SEARCH_NAME.format(name) +
+                                  "/data_version/{}".format(version))
+    except HTTPError as error:
+        if error.code == 412:
+            data_found = False
+        else:
+            raise error
+
+    if not data_found and version != "active":
+        # might have been deactivated; will warn later
+        data_found = True
+        try:
+            json_string = urlopen(_SEARCH_NAME.format(name) +
+                                  "/data_version/{}/status/deactivated".format(
+                                      version))
+        except HTTPError as error:
+            if error.code == 412:
+                data_found = False
+            else:
+                raise error
+
+    if not data_found:
+        # not in except for nicer traceback
+        if version == "active":
+            raise ValueError("No active dataset {} found.".format(name))
+        raise ValueError("Dataset {} with version {}"
+                         " not found.".format(name, version))
+
+    json_data = json.loads(json_string.read().decode("utf-8"))
+    return json_data['data']['dataset'][0]
+
+
+def _get_data_description_by_id(data_id):
+    data_found = True
+    try:
+        json_string = urlopen(_DATA_INFO.format(data_id))
+    except HTTPError as error:
+        if error.code == 412:
+            data_found = False
+        else:
+            raise error
+    if not data_found:
+        # not in except for nicer traceback
+        raise ValueError("Dataset with id {} "
+                         "not found.".format(data_id))
+    json_data = json.loads(json_string.read().decode("utf-8"))
+    return json_data['data_set_description']
+
+
+def _get_data_features(data_id):
+    data_found = True
+    try:
+        json_string = urlopen(_DATA_FEATURES.format(data_id))
+    except HTTPError as error:
+        if error.code == 412:
+            data_found = False
+        else:
+            raise error
+    if not data_found:
+        # not in except for nicer traceback
+        raise ValueError("Dataset with id {} "
+                         "not found.".format(data_id))
+    json_data = json.loads(json_string.read().decode("utf-8"))
+    return json_data['data_features']['feature']
+
+
+def _download_data(url):
+    response = urlopen(url)
+    if sys.version_info[0] == 2:
+        # Python 2.7 numpy can't handle unicode
+        arff = loadarff(StringIO(response.read()))
+    else:
+        arff = loadarff(StringIO(response.read().decode('utf-8')))
+
+    response.close()
+    return arff
+
+
+def _download_data_csv(file_id):
+    response = urlopen("https://openml.org/data/v1/get_csv/{}".format(file_id))
+    data = np.genfromtxt(response, names=True, dtype=None, delimiter=',',
+                         missing_values='?')
+    response.close()
+    return data
+
+
+def fetch_openml(name_or_id=None, version='active', data_home=None,
+                 target_column='default-target', memory=True):
+    """Fetch dataset from openml by name or dataset id.
+
+    Datasets are uniquely identified by either an integer ID or by a
+    combination of name and version (i.e. there might be multiple
+    versions of the 'iris' dataset).
+
+    Parameters
+    ----------
+    name_or_id : string or integer
+        Identifier of the dataset. If integer, assumed to be the id of the
+        dataset on OpenML, if string, assumed to be the name of the dataset.
+
+    version : integer or 'active', default='active'
+        Version of the dataset. Only used if ``name_or_id`` is a string.
+        If 'active' the oldest version that's still active is used.
+
+    data_home : string or None, default None
+        Specify another download and cache folder for the data sets. By
+        default all scikit-learn data is stored in '~/scikit_learn_data'
+        subfolders.
+
+    target_column : string or None, default 'default-target'
+        Specify the column name in the data to use as target. If
+        'default-target', the standard target column as stored on the server
+        is used. If ``None``, all columns are returned as data and the
+        target is ``None``.
+
+    memory : boolean, default=True
+        Whether to cache downloaded datasets using joblib.
+
+    Returns
+    -------
+
+    data : Bunch
+        Dictionary-like object, the interesting attributes are:
+        'data', the data to learn, 'target', the regression target or
+        classification labels, 'DESCR', the full description of the dataset,
+        'feature_names', the original names of the dataset columns, and
+        'details' which provide more information on the openml meta-data.
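+
+    Examples
+    --------
+    A minimal usage sketch; it requires network access and is therefore
+    skipped in doctests, and the exact output may vary with server state.
+
+    >>> from sklearn.datasets import fetch_openml
+    >>> iris = fetch_openml('iris', version=1)  # doctest: +SKIP
+    >>> iris.data.shape  # doctest: +SKIP
+    (150, 4)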
+    """
+    data_home = get_data_home(data_home=data_home)
+    data_home = join(data_home, 'openml')
+    if memory:
+        mem = Memory(join(data_home, 'cache'), verbose=0).cache
+    else:
+        def mem(func):
+            return func
+    _get_data_info_by_name_ = mem(_get_data_info_by_name)
+    _get_data_description_by_id_ = mem(_get_data_description_by_id)
+    _get_data_features_ = mem(_get_data_features)
+    _download_data_csv_ = mem(_download_data_csv)
+
+    if not exists(data_home):
+        os.makedirs(data_home)
+
+    # check if dataset id is known
+    if isinstance(name_or_id, numbers.Integral):
+        if version != "active":
+            raise ValueError(
+                "Dataset id={} and version={} passed, but you can only "
+                "specify a numeric id or a version, not both.".format(
+                    name_or_id, version))
+        data_id = name_or_id
+    elif isinstance(name_or_id, string_types):
+        data_info = _get_data_info_by_name_(name_or_id, version)
+        data_id = data_info['did']
+    else:
+        raise TypeError(
+            "Invalid name_or_id {}, should be string or integer.".format(
+                name_or_id))
+
+    data_description = _get_data_description_by_id_(data_id)
+    if data_description['status'] != "active":
+        warn("Version {} of dataset {} is inactive, meaning that issues have"
+             " been found in the dataset. Try using a newer version.".format(
+                 data_description['version'], data_description['name']))
+    if target_column == "default-target":
+        target_column = data_description.get('default_target_attribute', None)
+
+    # download actual data
+    features = _get_data_features_(data_id)
+    # TODO: stacking the content of the structured array
+    # this results in a copy. If the data was homogeneous
+    # and target at start or end, we could use a view instead.
+    data_columns = []
+    for feature in features:
+        if (feature['name'] != target_column and feature['is_ignore'] ==
+                'false' and feature['is_row_identifier'] == 'false'):
+            data_columns.append(feature['name'])
+
+    data = _download_data_csv_(data_description['file_id'])
+    if target_column is not None:
+        y = data[target_column]
+    else:
+        y = None
+
+    if all([feature['data_type'] == "numeric" for feature in features
+            if feature['name'] in data_columns]):
+        dtype = None
+    else:
+        dtype = object
+    X = np.array([data[c] for c in data_columns], dtype=dtype).T
+
+    description = u"{}\n\nDownloaded from openml.org.".format(
+        data_description.pop('description'))
+
+    bunch = Bunch(
+        data=X, target=y, feature_names=data_columns,
+        DESCR=description, details=data_description, features=features,
+        url="https://www.openml.org/d/{}".format(data_id))
+
+    return bunch
diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
new file mode 100644
index 0000000000000..671c53a93349c
--- /dev/null
+++ b/sklearn/datasets/tests/test_openml.py
@@ -0,0 +1,36 @@
+"""Test the openml loader.
+
+These tests require network access and are meant to be skipped on travis.
+"""
+""" + +from sklearn.datasets import fetch_openml +from sklearn.utils.testing import (assert_warns_message, + assert_raise_message) + + +def test_fetch_openml(): + # check_skip_travis() + # fetch with version + iris_1 = fetch_openml("iris", version=1) + assert iris_1.details['id'] == '61' + # fetch without version + iris_1 = fetch_openml("iris") + assert iris_1.details['id'] == '61' + # fetch with dataset id + iris_by_id = fetch_openml(61) + assert iris_by_id.details['name'] == "iris" + assert iris_by_id.data.shape == (150, 4) + assert iris_by_id.target.shape == (150,) + # fetch inactive dataset by id + glas2 = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + 40675) + # fetch inactive dataset by name and version + assert glas2.data.shape == (163, 9) + glas2_by_version = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + "glass2", 1) + # there is no active version of glass2 + assert glas2_by_version.details['id'] == '40675' + assert_raise_message(ValueError, "No active dataset glass2 found", + fetch_openml, 'glass2')