From 268f533dd0f761c6a135021ff40542beda3aac23 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 10 Oct 2017 13:50:01 +0200 Subject: [PATCH 01/28] start on openml dataset loader --- sklearn/datasets/openml.py | 78 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 sklearn/datasets/openml.py diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py new file mode 100644 index 0000000000000..fefeb4aa62203 --- /dev/null +++ b/sklearn/datasets/openml.py @@ -0,0 +1,78 @@ +import json +import warnings +import numpy as np +import numbers + +try: + # Python 2 + from urllib2 import urlopen +except ImportError: + # Python 3+ + from urllib.request import urlopen + +from scipy.io.arff import loadarff + +_SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" + +jsons = "https://openml.org/api/v1/json/data/list/data_name/{}" +data_dl = "https://www.openml.org/data/download/{}" + + +def _get_data_info_by_name(name): + url_path = urlopen(_SEARCH_NAME.format(name)) + json_data = json.load(url_path) + return json_data['data']['dataset'][0] + + +def fetch_openml(name_or_id=None, version=1, json_loc=jsons, + data_loc=data_dl): + """Fetch dataset from openml by name or dataset id. + + Parameters + ---------- + + Returns + ------- + """ + if isinstance(name_or_id, numbers.Integral): + if version != 1: + raise ValueError( + "Dataset id={} and version={} passed, but you can only " + "specify a numeric id or a version, not both.".format( + name_or_id, version)) + data_id = name_or_id + elif isinstance(name_or_id, str): + name = name_or_id + + else: + raise TypeError( + "Invalid name_or_id {}, should be string or integer.".format( + name_or_id)) + + json_dl = urlretrieve(json_loc.format(name))[0] + # get the json file + with open(json_dl, 'r') as tmp: + json_data = json.load(tmp)['data']['dataset'] + vers = [(idx, val) for idx, item in enumerate(json_data) + for key, val in item.items() if key == "version"] + # tell user there are more versions if they dont specify number + if len(vers) > 1 and name_vers is None: + msg = ("dataset: {} has versions {}, " + "default is {}").format(name, + [i[1] for i in vers], + min([i[1] for i in vers])) + warnings.warn(msg) + # check if the version specified (if it is) is in the ones gotten + use = 1 if name_vers is None else name_vers + for v in vers: + if v[1] == use: + to_get = json_data[v[0]]['file_id'] + # download data + data_tmp = urlretrieve(data_loc.format(to_get))[0] + # load the data + data = loadarff(data_tmp) + data_fmt = np.zeros((data[0].shape[0], len(data[0][0])), dtype=object) + # scipy returns a tuple so try to put it in the right format + for idx, row in enumerate(data[0]): + data_fmt[idx, :] = [val for val in row] + return data_fmt From 4f3e93e1cbed32acc7fb3f9921a65fa414fc8f6b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 11 Oct 2017 14:18:02 +0200 Subject: [PATCH 02/28] working on stuff --- sklearn/datasets/openml.py | 43 +++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index fefeb4aa62203..a4e78fa352caf 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -2,6 +2,8 @@ import warnings import numpy as np import numbers +import os +from os.path import join, exists try: # Python 2 @@ -11,29 +13,51 @@ from urllib.request import urlopen from scipy.io.arff import loadarff +from .base import get_data_home _SEARCH_NAME = 
"https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" - -jsons = "https://openml.org/api/v1/json/data/list/data_name/{}" -data_dl = "https://www.openml.org/data/download/{}" +_DATA_INFO = "https://openml.org/api/v1/json/data/{}" +_DATA_DOWNLOAD = "https://www.openml.org/data/download/{}" def _get_data_info_by_name(name): - url_path = urlopen(_SEARCH_NAME.format(name)) + url_path = urlopen(_SEARCH_NAME.format(name)) json_data = json.load(url_path) return json_data['data']['dataset'][0] -def fetch_openml(name_or_id=None, version=1, json_loc=jsons, - data_loc=data_dl): +def _get_data_description_by_id(data_id): + url_path = urlopen(_DATA_INFO.format(data_id)) + json_data = json.load(url_path) + return json_data['data_set_description'] + + +def fetch_openml(name_or_id=None, version=1, data_home=None): """Fetch dataset from openml by name or dataset id. Parameters ---------- + data_home : optional, default: None + Specify another download and cache folder for the data sets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + Returns ------- + + data : Bunch + Dictionary-like object, the interesting attributes are: + 'data', the data to learn, 'target', the classification labels, + 'DESCR', the full description of the dataset, and + 'COL_NAMES', the original names of the dataset columns. """ + data_home = get_data_home(data_home=data_home) + data_home = join(data_home, 'openml') + if not exists(data_home): + os.makedirs(data_home) + + # check if dataset id is known + if isinstance(name_or_id, numbers.Integral): if version != 1: raise ValueError( @@ -42,13 +66,18 @@ def fetch_openml(name_or_id=None, version=1, json_loc=jsons, name_or_id, version)) data_id = name_or_id elif isinstance(name_or_id, str): - name = name_or_id + data_info = _get_data_info_by_name(name_or_id) + data_id = data_info['did'] else: raise TypeError( "Invalid name_or_id {}, should be string or integer.".format( name_or_id)) + + data_description = _get_data_description_by_id(data_id) + + # download actual data json_dl = urlretrieve(json_loc.format(name))[0] # get the json file with open(json_dl, 'r') as tmp: From fabaa90a563ea578d68425ddaef06559815e3db7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 09:47:13 +0200 Subject: [PATCH 03/28] first version working --- sklearn/datasets/openml.py | 76 ++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index a4e78fa352caf..99865d4dfa8b7 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -1,9 +1,8 @@ import json -import warnings -import numpy as np import numbers import os from os.path import join, exists +from http.client import IncompleteRead try: # Python 2 @@ -13,7 +12,12 @@ from urllib.request import urlopen from scipy.io.arff import loadarff +import numpy as np + from .base import get_data_home +# from ..externals.joblib import Memory +from ..externals.six import StringIO +from ..utils import Bunch _SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" _DATA_INFO = "https://openml.org/api/v1/json/data/{}" @@ -21,18 +25,18 @@ def _get_data_info_by_name(name): - url_path = urlopen(_SEARCH_NAME.format(name)) - json_data = json.load(url_path) + json_string = urlopen(_SEARCH_NAME.format(name)) + json_data = json.load(json_string) return json_data['data']['dataset'][0] def _get_data_description_by_id(data_id): - url_path = urlopen(_DATA_INFO.format(data_id)) - json_data 
= json.load(url_path) + json_string = urlopen(_DATA_INFO.format(data_id)) + json_data = json.load(json_string) return json_data['data_set_description'] -def fetch_openml(name_or_id=None, version=1, data_home=None): +def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): """Fetch dataset from openml by name or dataset id. Parameters @@ -53,6 +57,11 @@ def fetch_openml(name_or_id=None, version=1, data_home=None): """ data_home = get_data_home(data_home=data_home) data_home = join(data_home, 'openml') + # if memory: + # mem = Memory(join(data_home, 'cache')) + # _get_data_info_by_name = mem(_get_data_info_by_name) + # _get_data_description_by_id = mem(_get_data_description_by_id) + # _download_data = mem(_download_data) if not exists(data_home): os.makedirs(data_home) @@ -74,34 +83,31 @@ def fetch_openml(name_or_id=None, version=1, data_home=None): "Invalid name_or_id {}, should be string or integer.".format( name_or_id)) - data_description = _get_data_description_by_id(data_id) + target_name = data_description['default_target_attribute'] # download actual data - json_dl = urlretrieve(json_loc.format(name))[0] - # get the json file - with open(json_dl, 'r') as tmp: - json_data = json.load(tmp)['data']['dataset'] - vers = [(idx, val) for idx, item in enumerate(json_data) - for key, val in item.items() if key == "version"] - # tell user there are more versions if they dont specify number - if len(vers) > 1 and name_vers is None: - msg = ("dataset: {} has versions {}, " - "default is {}").format(name, - [i[1] for i in vers], - min([i[1] for i in vers])) - warnings.warn(msg) - # check if the version specified (if it is) is in the ones gotten - use = 1 if name_vers is None else name_vers - for v in vers: - if v[1] == use: - to_get = json_data[v[0]]['file_id'] - # download data - data_tmp = urlretrieve(data_loc.format(to_get))[0] - # load the data - data = loadarff(data_tmp) - data_fmt = np.zeros((data[0].shape[0], len(data[0][0])), dtype=object) - # scipy returns a tuple so try to put it in the right format - for idx, row in enumerate(data[0]): - data_fmt[idx, :] = [val for val in row] - return data_fmt + response = urlopen(_DATA_DOWNLOAD.format(data_id)) + # we need to catch IncompleteRead which is likely a server-side issue + try: + data_arff = response.read() + except IncompleteRead as e: + data_arff = e.partial + # getting structured array and metadata + data, meta = loadarff(StringIO(data_arff.decode("utf-8"))) + columns = np.array(meta.names()) + data_columns = columns[columns != target_name] + # TODO: stacking the content of the structured array + # this results in a copy. If the data was homogeneous + # we could use a view instead. 
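+    # (each data[c] below is a 1-d field of the structured array returned
+    # by scipy's loadarff, so column-stacking the fields yields an array
+    # of shape (n_samples, n_features))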
+ X = np.column_stack(data[c] for c in data_columns) + y = data[target_name] + + description = "{}\n\nDownloaded from openml.org.".format( + data_description['description']) + + bunch = Bunch( + data=X, target=y, feature_names=data_columns, + DESCR=description) + + return bunch From 1804b14d0b2e7a6365a8e8c40d18eef8ba965eb0 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 10:35:28 +0200 Subject: [PATCH 04/28] docstrings, use version="active" as default --- sklearn/datasets/openml.py | 46 +++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 99865d4dfa8b7..6ab2a22678191 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -2,7 +2,7 @@ import numbers import os from os.path import join, exists -from http.client import IncompleteRead +from warnings import warn try: # Python 2 @@ -10,6 +10,7 @@ except ImportError: # Python 3+ from urllib.request import urlopen + from http.client import IncompleteRead from scipy.io.arff import loadarff import numpy as np @@ -24,8 +25,11 @@ _DATA_DOWNLOAD = "https://www.openml.org/data/download/{}" -def _get_data_info_by_name(name): - json_string = urlopen(_SEARCH_NAME.format(name)) +def _get_data_info_by_name(name, version): + if version == "active": + json_string = urlopen(_SEARCH_NAME.format(name + "/status/active/")) + else: + json_string = urlopen(_SEARCH_NAME.format(name)) json_data = json.load(json_string) return json_data['data']['dataset'][0] @@ -36,24 +40,41 @@ def _get_data_description_by_id(data_id): return json_data['data_set_description'] -def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): +def fetch_openml(name_or_id=None, version='active', data_home=None, + memory=True): """Fetch dataset from openml by name or dataset id. + Datasets are uniquely identified by either an integer ID or by a + combination of name and version (i.e. there might be multiple + versions of the 'iris' dataset). Newer versions are assumed to fix + issues in earlier versions. + Parameters ---------- + name_or_id : string or integer + Identifier of the dataset. If integer, assumed to be the id of the + dataset on OpenML, if string, assumed to be the name of the dataset. + + version : integer or 'active', default='active' + Version of the dataset. Only used if ``name_or_id`` is a string. + If 'active' the oldest version that's still active is used. data_home : optional, default: None Specify another download and cache folder for the data sets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + memory : boolean, default=True + Whether to store downloaded datasets using joblib. + Returns ------- data : Bunch Dictionary-like object, the interesting attributes are: - 'data', the data to learn, 'target', the classification labels, - 'DESCR', the full description of the dataset, and - 'COL_NAMES', the original names of the dataset columns. + 'data', the data to learn, 'target', the regression target or + classification labels, 'DESCR', the full description of the dataset, + 'feature_names', the original names of the dataset columns, and + 'details' which provide more information on the openml meta-data. 
""" data_home = get_data_home(data_home=data_home) data_home = join(data_home, 'openml') @@ -66,7 +87,6 @@ def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): os.makedirs(data_home) # check if dataset id is known - if isinstance(name_or_id, numbers.Integral): if version != 1: raise ValueError( @@ -75,7 +95,7 @@ def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): name_or_id, version)) data_id = name_or_id elif isinstance(name_or_id, str): - data_info = _get_data_info_by_name(name_or_id) + data_info = _get_data_info_by_name(name_or_id, version) data_id = data_info['did'] else: @@ -84,6 +104,10 @@ def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): name_or_id)) data_description = _get_data_description_by_id(data_id) + if data_description['status'] != "active": + warn("Version {} of dataset {} is inactive, meaning that issues have" + " been found in the dataset. Try using a newer version.".format( + data_description['name'], data_description['version'])) target_name = data_description['default_target_attribute'] # download actual data @@ -104,10 +128,10 @@ def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): y = data[target_name] description = "{}\n\nDownloaded from openml.org.".format( - data_description['description']) + data_description.pop('description')) bunch = Bunch( data=X, target=y, feature_names=data_columns, - DESCR=description) + DESCR=description, details=data_description) return bunch From ffd43359ce73ec70172a39f83c1d4686c23bd1d9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 10:43:19 +0200 Subject: [PATCH 05/28] add caching to openml loader --- sklearn/datasets/openml.py | 39 ++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 6ab2a22678191..608c6e68b7147 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -16,7 +16,7 @@ import numpy as np from .base import get_data_home -# from ..externals.joblib import Memory +from ..externals.joblib import Memory from ..externals.six import StringIO from ..utils import Bunch @@ -40,6 +40,17 @@ def _get_data_description_by_id(data_id): return json_data['data_set_description'] +def _download_data(data_id): + response = urlopen(_DATA_DOWNLOAD.format(data_id)) + # we need to catch IncompleteRead which is likely a server-side issue + try: + data_arff = response.read() + except IncompleteRead as e: + data_arff = e.partial + # getting structured array and metadata + return loadarff(StringIO(data_arff.decode("utf-8"))) + + def fetch_openml(name_or_id=None, version='active', data_home=None, memory=True): """Fetch dataset from openml by name or dataset id. 
@@ -78,11 +89,14 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, """ data_home = get_data_home(data_home=data_home) data_home = join(data_home, 'openml') - # if memory: - # mem = Memory(join(data_home, 'cache')) - # _get_data_info_by_name = mem(_get_data_info_by_name) - # _get_data_description_by_id = mem(_get_data_description_by_id) - # _download_data = mem(_download_data) + if memory: + mem = Memory(join(data_home, 'cache'), verbose=0).cache + else: + mem = lambda x: x + _get_data_info_by_name_ = mem(_get_data_info_by_name) + _get_data_description_by_id_ = mem(_get_data_description_by_id) + _download_data_ = mem(_download_data) + if not exists(data_home): os.makedirs(data_home) @@ -95,7 +109,7 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, name_or_id, version)) data_id = name_or_id elif isinstance(name_or_id, str): - data_info = _get_data_info_by_name(name_or_id, version) + data_info = _get_data_info_by_name_(name_or_id, version) data_id = data_info['did'] else: @@ -103,7 +117,7 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, "Invalid name_or_id {}, should be string or integer.".format( name_or_id)) - data_description = _get_data_description_by_id(data_id) + data_description = _get_data_description_by_id_(data_id) if data_description['status'] != "active": warn("Version {} of dataset {} is inactive, meaning that issues have" " been found in the dataset. Try using a newer version.".format( @@ -111,14 +125,7 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, target_name = data_description['default_target_attribute'] # download actual data - response = urlopen(_DATA_DOWNLOAD.format(data_id)) - # we need to catch IncompleteRead which is likely a server-side issue - try: - data_arff = response.read() - except IncompleteRead as e: - data_arff = e.partial - # getting structured array and metadata - data, meta = loadarff(StringIO(data_arff.decode("utf-8"))) + data, meta = _download_data_(data_id) columns = np.array(meta.names()) data_columns = columns[columns != target_name] # TODO: stacking the content of the structured array From fe0904bcc15084bf35583957672135333007a037 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 11:40:55 +0200 Subject: [PATCH 06/28] pep8 annoyance --- sklearn/datasets/openml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 608c6e68b7147..364ef90f83dd1 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -92,7 +92,8 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, if memory: mem = Memory(join(data_home, 'cache'), verbose=0).cache else: - mem = lambda x: x + def mem(func): + return func _get_data_info_by_name_ = mem(_get_data_info_by_name) _get_data_description_by_id_ = mem(_get_data_description_by_id) _download_data_ = mem(_download_data) From eea026e5d60ffa672eb58a54ea71407aae603a71 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 14:03:32 +0200 Subject: [PATCH 07/28] fix download url, allow datasets without target --- sklearn/datasets/openml.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 364ef90f83dd1..c1fc2f7af9b60 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -10,7 +10,7 @@ except ImportError: # Python 3+ from urllib.request import urlopen - from http.client import 
IncompleteRead + # from http.client import IncompleteRead from scipy.io.arff import loadarff import numpy as np @@ -22,13 +22,13 @@ _SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" _DATA_INFO = "https://openml.org/api/v1/json/data/{}" -_DATA_DOWNLOAD = "https://www.openml.org/data/download/{}" def _get_data_info_by_name(name, version): if version == "active": json_string = urlopen(_SEARCH_NAME.format(name + "/status/active/")) else: + # FIXME waiting for new filter mechanism json_string = urlopen(_SEARCH_NAME.format(name)) json_data = json.load(json_string) return json_data['data']['dataset'][0] @@ -40,15 +40,11 @@ def _get_data_description_by_id(data_id): return json_data['data_set_description'] -def _download_data(data_id): - response = urlopen(_DATA_DOWNLOAD.format(data_id)) - # we need to catch IncompleteRead which is likely a server-side issue - try: - data_arff = response.read() - except IncompleteRead as e: - data_arff = e.partial - # getting structured array and metadata - return loadarff(StringIO(data_arff.decode("utf-8"))) +def _download_data(url): + response = urlopen(url) + arff = loadarff(StringIO(response.read().decode('utf-8'))) + response.close() + return arff def fetch_openml(name_or_id=None, version='active', data_home=None, @@ -103,7 +99,7 @@ def mem(func): # check if dataset id is known if isinstance(name_or_id, numbers.Integral): - if version != 1: + if version != "active": raise ValueError( "Dataset id={} and version={} passed, but you can only " "specify a numeric id or a version, not both.".format( @@ -123,17 +119,20 @@ def mem(func): warn("Version {} of dataset {} is inactive, meaning that issues have" " been found in the dataset. Try using a newer version.".format( data_description['name'], data_description['version'])) - target_name = data_description['default_target_attribute'] + target_name = data_description.get('default_target_attribute', None) # download actual data - data, meta = _download_data_(data_id) + data, meta = _download_data_(data_description['url']) columns = np.array(meta.names()) data_columns = columns[columns != target_name] # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous # we could use a view instead. X = np.column_stack(data[c] for c in data_columns) - y = data[target_name] + if target_name is not None: + y = data[target_name] + else: + y = None description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) From bca12e9dd77d8c7451a9a7b431b853f1e6571615 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 14:25:14 +0200 Subject: [PATCH 08/28] allow specifying the target column, starting on docs --- doc/datasets/openml.rst | 68 ++++++++++++++++++++++++++++++++++++++ sklearn/datasets/openml.py | 19 +++++++---- 2 files changed, 81 insertions(+), 6 deletions(-) create mode 100644 doc/datasets/openml.rst diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst new file mode 100644 index 0000000000000..f91226b454b72 --- /dev/null +++ b/doc/datasets/openml.rst @@ -0,0 +1,68 @@ +.. + For doctests: + + >>> import numpy as np + >>> import os + >>> import tempfile + >>> # Create a temporary folder for the data fetcher + >>> custom_data_home = tempfile.mkdtemp() + >>> os.makedirs(os.path.join(custom_data_home, 'mldata')) + + +.. 
_mldata:

Downloading datasets from the openml.org repository
===================================================

`openml.org `_ is a public repository for machine learning
data and experiments.

The ``sklearn.datasets`` package is able to directly download data
sets from the repository using the function
:func:`sklearn.datasets.fetch_openml`.

For example, to download a dataset of gene expressions in mice brains:

    >>> from sklearn.datasets import fetch_mldata
    >>> mnist = fetch_mldata('miceprotein', data_home=custom_data_home)

The MNIST database contains a total of 70000 examples of handwritten digits
of size 28x28 pixels, labeled from 0 to 9::

    >>> mnist.data.shape
    (70000, 784)
    >>> mnist.target.shape
    (70000,)
    >>> np.unique(mnist.target)
    array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

After the first download, the dataset is cached locally in the path
specified by the ``data_home`` keyword argument, which defaults to
``~/scikit_learn_data/``::

    >>> os.listdir(os.path.join(custom_data_home, 'mldata'))
    ['mnist-original.mat']

Data sets in `mldata.org `_ do not adhere to a strict
naming or formatting convention. :func:`sklearn.datasets.fetch_mldata` is
able to make sense of the most common cases, but allows to tailor the
defaults to individual datasets:

* For datasets with multiple columns, :func:`sklearn.datasets.fetch_mldata`
  tries to identify the target and data columns and rename them to ``target``
  and ``data``. This is done by looking for arrays named ``label`` and
  ``data`` in the dataset, and failing that by choosing the first array to be
  ``target`` and the second to be ``data``. This behavior can be changed with
  the ``target_name`` and ``data_name`` keywords, setting them to a specific
  name or index number (the name and order of the columns in the datasets
  can be found at its `mldata.org `_ under the tab "Data"::

    >>> iris2 = fetch_mldata('datasets-UCI iris', target_name=1, data_name=0,
    ...                      data_home=custom_data_home)
    >>> iris3 = fetch_mldata('datasets-UCI iris', target_name='class',
    ...                      data_name='double0', data_home=custom_data_home)


..
    >>> import shutil
    >>> shutil.rmtree(custom_data_home)
diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index c1fc2f7af9b60..bc57e6b2feadc 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -48,7 +48,7 @@ def _download_data(url):
 
 
 def fetch_openml(name_or_id=None, version='active', data_home=None,
-                 memory=True):
+                 target_column='default-target', memory=True):
     """Fetch dataset from openml by name or dataset id.
 
     Datasets are uniquely identified by either an integer ID or by a
@@ -66,10 +66,16 @@ def fetch_openml(name_or_id=None, version='active', data_home=None,
         Version of the dataset. Only used if ``name_or_id`` is a string.
         If 'active' the oldest version that's still active is used.
 
-    data_home : optional, default: None
+    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
 
+    target_column : string or None, default 'default-target'
+        Specify the column name in the data to use as target. If
+        'default-target', the standard target column as stored on the server
+        is used. If ``None``, all columns are returned as data and the
+        target is ``None``.
+
     memory : boolean, default=True
         Whether to store downloaded datasets using joblib.
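+
+    Examples
+    --------
+    A sketch of the intended behaviour (the shapes are those of the
+    mice-protein dataset; passing ``target_column=None`` returns every
+    column, including the usual target column, as data)::
+
+        >>> mice = fetch_openml('miceprotein')  # doctest: +SKIP
+        >>> mice.data.shape, mice.target.shape  # doctest: +SKIP
+        ((1080, 81), (1080,))
+        >>> raw = fetch_openml('miceprotein', target_column=None)  # doctest: +SKIP
+        >>> raw.data.shape, raw.target is None  # doctest: +SKIP
+        ((1080, 82), True)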
@@ -119,18 +125,19 @@ def mem(func): warn("Version {} of dataset {} is inactive, meaning that issues have" " been found in the dataset. Try using a newer version.".format( data_description['name'], data_description['version'])) - target_name = data_description.get('default_target_attribute', None) + if target_column == "default-target": + target_column = data_description.get('default_target_attribute', None) # download actual data data, meta = _download_data_(data_description['url']) columns = np.array(meta.names()) - data_columns = columns[columns != target_name] + data_columns = columns[columns != target_column] # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous # we could use a view instead. X = np.column_stack(data[c] for c in data_columns) - if target_name is not None: - y = data[target_name] + if target_column is not None: + y = data[target_column] else: y = None From f59ce8b47dc429af6dd57f25520c06ad2bbe209b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 15:42:05 +0200 Subject: [PATCH 09/28] add openml to the narrative docs --- doc/datasets/index.rst | 3 ++ doc/datasets/openml.rst | 79 +++++++++++++++++++++++++++----------- sklearn/datasets/openml.py | 3 +- 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index f91163fc235c5..580d9e3950561 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -318,6 +318,7 @@ writing data in that format. olivetti_faces twenty_newsgroups mldata + openml labeled_faces covtype rcv1 @@ -327,6 +328,8 @@ writing data in that format. .. include:: twenty_newsgroups.rst +.. include:: openml.rst + .. include:: mldata.rst .. include:: labeled_faces.rst diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index f91226b454b72..84669cedf694b 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -21,47 +21,80 @@ The ``sklearn.datasets`` package is able to directly download data sets from the repository using the function :func:`sklearn.datasets.fetch_openml`. -For example, to download a dataset of gene expressions in mice brains: +For example, to download a dataset of gene expressions in mice brains:: >>> from sklearn.datasets import fetch_mldata - >>> mnist = fetch_mldata('miceprotein', data_home=custom_data_home) + >>> mice = fetch_mldata('miceprotein', data_home=custom_data_home) -The MNIST database contains a total of 70000 examples of handwritten digits +The dataset contains a total of 70000 examples of handwritten digits of size 28x28 pixels, labeled from 0 to 9:: - >>> mnist.data.shape + >>> mice.data.shape (70000, 784) - >>> mnist.target.shape + >>> mice.target.shape (70000,) - >>> np.unique(mnist.target) + >>> np.unique(mice.target) array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]) -After the first download, the dataset is cached locally in the path -specified by the ``data_home`` keyword argument, which defaults to -``~/scikit_learn_data/``:: +You can get more information on the dataset by looking at the ``DESCR`` +and ``details`` attributes:: - >>> os.listdir(os.path.join(custom_data_home, 'mldata')) - ['mnist-original.mat'] + >>> print(mice.DESCR) + something + >>> mice.details + +The ``DESCR`` contains a free-text description of the data, while ``details`` +contains a dictionary of meta-data stored by openml, like the dataset id. 
+The id of the mice protein dataset is 4550, and you can use this (or the name) +to get more information on the dataset on the openml website: https://www.openml.org/d/4550. Data sets in `mldata.org `_ do not adhere to a strict naming or formatting convention. :func:`sklearn.datasets.fetch_mldata` is able to make sense of the most common cases, but allows to tailor the defaults to individual datasets: -* For datasets with multiple columns, :func:`sklearn.datasets.fetch_mldata` - tries to identify the target and data columns and rename them to ``target`` - and ``data``. This is done by looking for arrays named ``label`` and - ``data`` in the dataset, and failing that by choosing the first array to be - ``target`` and the second to be ``data``. This behavior can be changed with - the ``target_name`` and ``data_name`` keywords, setting them to a specific - name or index number (the name and order of the columns in the datasets - can be found at its `mldata.org `_ under the tab "Data":: +The id is also the best way to specify how to fetch a dataset from OpenML:: + + >>> mice = fetch_mldata(4550, data_home=custom_data_home) + >>> mice.details + +Dataset Versions +---------------- + +A dataset is uniquely specified by its id, but not necessarily by its name. +Several different "versions" of a dataset with the same name can exist. +If a particular version of a dataset has been found to contain significant +issues, it might be inactivated. Using a name to specify a dataset will yield +the earliest version of a dataset that is still active. That means that +``fetch_mldata("miceprotein")`` can yield different results at differnt times +if earlier versions become inactive. +You can see that the dataset with id 4550 that we fetched above is the version 1 +of the "miceprotein" dataset:: + + >>> mice.details['version'] + 1 + +In fact, this dataset only has one version. The iris dataset on the other hand +has multiple versions:: + + >>> iris = fetch_mldata("iris", data_home=custom_data_home) + >>> iris.details['version'] + >>> iris.details['id'] + + >>> iris_61 = fetch_mldata(61, data_home=custom_data_home) + >>> iris_61.details['version'] + >>> iris_61.details['id'] + + >>> iris_969 = fetch_mldata(969, data_home=custom_data_home) + >>> iris_969.details['version'] + >>> iris_969.details['id'] - >>> iris2 = fetch_mldata('datasets-UCI iris', target_name=1, data_name=0, - ... data_home=custom_data_home) - >>> iris3 = fetch_mldata('datasets-UCI iris', target_name='class', - ... data_name='double0', data_home=custom_data_home) +Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. +To make sure you always get this exact dataset, it is safest to specify it by the dataset id. +The other dataset, with id 969, is version 3 (version 2 has become inactive), and contains +a binarized version of the data:: + >>> np.bincount(iris_969.target) .. >>> import shutil diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index bc57e6b2feadc..01037e25f8693 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -53,8 +53,7 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, Datasets are uniquely identified by either an integer ID or by a combination of name and version (i.e. there might be multiple - versions of the 'iris' dataset). Newer versions are assumed to fix - issues in earlier versions. + versions of the 'iris' dataset). 
Parameters ---------- From d7dee6dc0bb5a5af106bbb2b373ae7fa1846fc9d Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 15:43:05 +0200 Subject: [PATCH 10/28] get more people to upload stuff to openml. --- doc/datasets/openml.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index 84669cedf694b..40c0e81afd9c6 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -15,7 +15,7 @@ Downloading datasets from the openml.org repository =================================================== `openml.org `_ is a public repository for machine learning -data and experiments. +data and experiments, that allows everybody to upload open datasets. The ``sklearn.datasets`` package is able to directly download data sets from the repository using the function From 4c19ad9a300f19f33927e1ade6fe981457720743 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 15:48:56 +0200 Subject: [PATCH 11/28] store metadata, convert to dtype object if there is nominal data. --- sklearn/datasets/openml.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 01037e25f8693..4bb51e05c5759 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -134,7 +134,8 @@ def mem(func): # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous # we could use a view instead. - X = np.column_stack(data[c] for c in data_columns) + dtype = object if "nominal" in meta.types() else None + X = np.array([data[c] for c in data_columns], dtype=dtype).T if target_column is not None: y = data[target_column] else: @@ -145,6 +146,6 @@ def mem(func): bunch = Bunch( data=X, target=y, feature_names=data_columns, - DESCR=description, details=data_description) + DESCR=description, details=data_description, meta=meta) return bunch From 16b7fed08477d165715ef2a40830912cdd38bddd Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 16:05:36 +0200 Subject: [PATCH 12/28] fix doctests, add fetch_openml to __init__ --- doc/datasets/mldata.rst | 1 + doc/datasets/openml.rst | 80 +++++++++++++++++++++++++----------- doc/modules/classes.rst | 1 + sklearn/datasets/__init__.py | 2 + 4 files changed, 59 insertions(+), 25 deletions(-) diff --git a/doc/datasets/mldata.rst b/doc/datasets/mldata.rst index b94dfd7620a24..cb076e1e75bb7 100644 --- a/doc/datasets/mldata.rst +++ b/doc/datasets/mldata.rst @@ -16,6 +16,7 @@ Downloading datasets from the mldata.org repository `mldata.org `_ is a public repository for machine learning data, supported by the `PASCAL network `_ . +It is no longer actively maintained, and it's suggested to use :ref:openml instead. The ``sklearn.datasets`` package is able to directly download data sets from the repository using the function diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index 40c0e81afd9c6..8a080fffc1ed8 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -6,10 +6,10 @@ >>> import tempfile >>> # Create a temporary folder for the data fetcher >>> custom_data_home = tempfile.mkdtemp() - >>> os.makedirs(os.path.join(custom_data_home, 'mldata')) + >>> os.makedirs(os.path.join(custom_data_home, 'openml')) -.. _mldata: +.. 
_openml: Downloading datasets from the openml.org repository =================================================== @@ -23,40 +23,62 @@ sets from the repository using the function For example, to download a dataset of gene expressions in mice brains:: - >>> from sklearn.datasets import fetch_mldata - >>> mice = fetch_mldata('miceprotein', data_home=custom_data_home) + >>> from sklearn.datasets import fetch_openml + >>> mice = fetch_openml('miceprotein', data_home=custom_data_home) The dataset contains a total of 70000 examples of handwritten digits of size 28x28 pixels, labeled from 0 to 9:: >>> mice.data.shape - (70000, 784) + (1080, 81) >>> mice.target.shape - (70000,) - >>> np.unique(mice.target) - array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]) + (1080,) + >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + array([b"'c-CS-m'", b"'c-CS-s'", b"'c-SC-m'", b"'c-SC-s'", b"'t-CS-m'", + b"'t-CS-s'", b"'t-SC-m'", b"'t-SC-s'"], dtype='|S8') You can get more information on the dataset by looking at the ``DESCR`` and ``details`` attributes:: - >>> print(mice.DESCR) - something - >>> mice.details + >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios + **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 + **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing + Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down + Syndrome. PLoS ONE 10(6): e0129126... + + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', + 'creator': ..., + 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': + 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': + '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, + Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins + Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): + e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', + 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': + '3c479a6885bfa0438971388283a1ce32'} + The ``DESCR`` contains a free-text description of the data, while ``details`` contains a dictionary of meta-data stored by openml, like the dataset id. The id of the mice protein dataset is 4550, and you can use this (or the name) to get more information on the dataset on the openml website: https://www.openml.org/d/4550. -Data sets in `mldata.org `_ do not adhere to a strict -naming or formatting convention. 
:func:`sklearn.datasets.fetch_mldata` is -able to make sense of the most common cases, but allows to tailor the -defaults to individual datasets: - The id is also the best way to specify how to fetch a dataset from OpenML:: - >>> mice = fetch_mldata(4550, data_home=custom_data_home) - >>> mice.details + >>> mice = fetch_openml(4550, data_home=custom_data_home) + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', + 'creator': ..., + 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': + 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': + '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, + Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins + Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): + e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', + 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': + '3c479a6885bfa0438971388283a1ce32'} Dataset Versions ---------------- @@ -66,35 +88,43 @@ Several different "versions" of a dataset with the same name can exist. If a particular version of a dataset has been found to contain significant issues, it might be inactivated. Using a name to specify a dataset will yield the earliest version of a dataset that is still active. That means that -``fetch_mldata("miceprotein")`` can yield different results at differnt times +``fetch_openml("miceprotein")`` can yield different results at differnt times if earlier versions become inactive. You can see that the dataset with id 4550 that we fetched above is the version 1 of the "miceprotein" dataset:: >>> mice.details['version'] - 1 + '1' In fact, this dataset only has one version. The iris dataset on the other hand has multiple versions:: - >>> iris = fetch_mldata("iris", data_home=custom_data_home) + >>> iris = fetch_openml("iris", data_home=custom_data_home) >>> iris.details['version'] + '1' >>> iris.details['id'] + '61' - >>> iris_61 = fetch_mldata(61, data_home=custom_data_home) + >>> iris_61 = fetch_openml(61, data_home=custom_data_home) >>> iris_61.details['version'] + '1' >>> iris_61.details['id'] + '61' - >>> iris_969 = fetch_mldata(969, data_home=custom_data_home) + >>> iris_969 = fetch_openml(969, data_home=custom_data_home) >>> iris_969.details['version'] + '3' >>> iris_969.details['id'] + '969' -Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. +'Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. To make sure you always get this exact dataset, it is safest to specify it by the dataset id. The other dataset, with id 969, is version 3 (version 2 has become inactive), and contains a binarized version of the data:: - >>> np.bincount(iris_969.target) + >>> np.unique(iris_969.target) + array([b'N', b'P'], + dtype='|S1') .. 
>>> import shutil diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index cfe2fd11c9ac4..b6cbb05a01f55 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -224,6 +224,7 @@ Loaders datasets.fetch_lfw_people datasets.fetch_mldata datasets.fetch_olivetti_faces + datasets.fetch_openml datasets.fetch_rcv1 datasets.fetch_species_distributions datasets.get_data_home diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index c43c0c4758b10..c7d78e633493d 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -23,6 +23,7 @@ from .twenty_newsgroups import fetch_20newsgroups from .twenty_newsgroups import fetch_20newsgroups_vectorized from .mldata import fetch_mldata, mldata_filename +from .openml import fetch_openml from .samples_generator import make_classification from .samples_generator import make_multilabel_classification from .samples_generator import make_hastie_10_2 @@ -65,6 +66,7 @@ 'fetch_covtype', 'fetch_rcv1', 'fetch_kddcup99', + 'fetch_openml', 'get_data_home', 'load_boston', 'load_diabetes', From b3f6c3690a1fe8c04b5e4fb8f2e00ad432510f85 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 17 Oct 2017 17:47:54 -0400 Subject: [PATCH 13/28] make arff reading work in python2.7 --- sklearn/datasets/openml.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 4bb51e05c5759..d4ed8163be725 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -1,5 +1,6 @@ import json import numbers +import sys import os from os.path import join, exists from warnings import warn @@ -42,7 +43,12 @@ def _get_data_description_by_id(data_id): def _download_data(url): response = urlopen(url) - arff = loadarff(StringIO(response.read().decode('utf-8'))) + if sys.version_info[0] == 2: + # Python2.7 numpy can't handle unicode? + arff = loadarff(StringIO(response.read())) + else: + arff = loadarff(StringIO(response.read().decode('utf-8'))) + response.close() return arff @@ -141,7 +147,7 @@ def mem(func): else: y = None - description = "{}\n\nDownloaded from openml.org.".format( + description = u"{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) bunch = Bunch( From dc401f26b5177300f3ea003c75342baef74ae013 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 24 Oct 2017 16:41:36 -0400 Subject: [PATCH 14/28] ignore doctests for now because of unicode issues --- doc/datasets/openml.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index 8a080fffc1ed8..d82f12fa6e60c 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -33,21 +33,21 @@ of size 28x28 pixels, labeled from 0 to 9:: (1080, 81) >>> mice.target.shape (1080,) - >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP array([b"'c-CS-m'", b"'c-CS-s'", b"'c-SC-m'", b"'c-SC-s'", b"'t-CS-m'", b"'t-CS-s'", b"'t-SC-m'", b"'t-SC-s'"], dtype='|S8') You can get more information on the dataset by looking at the ``DESCR`` and ``details`` attributes:: - >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. 
Cios **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): e0129126... - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', 'creator': ..., 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': @@ -68,7 +68,7 @@ to get more information on the dataset on the openml website: https://www.openml The id is also the best way to specify how to fetch a dataset from OpenML:: >>> mice = fetch_openml(4550, data_home=custom_data_home) - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', 'creator': ..., 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': @@ -93,28 +93,28 @@ if earlier versions become inactive. You can see that the dataset with id 4550 that we fetched above is the version 1 of the "miceprotein" dataset:: - >>> mice.details['version'] + >>> mice.details['version'] #doctest: +SKIP '1' In fact, this dataset only has one version. The iris dataset on the other hand has multiple versions:: >>> iris = fetch_openml("iris", data_home=custom_data_home) - >>> iris.details['version'] + >>> iris.details['version'] #doctest: +SKIP '1' - >>> iris.details['id'] + >>> iris.details['id'] #doctest: +SKIP '61' >>> iris_61 = fetch_openml(61, data_home=custom_data_home) - >>> iris_61.details['version'] + >>> iris_61.details['version'] #doctest: +SKIP '1' - >>> iris_61.details['id'] + >>> iris_61.details['id'] #doctest: +SKIP '61' >>> iris_969 = fetch_openml(969, data_home=custom_data_home) - >>> iris_969.details['version'] + >>> iris_969.details['version'] #doctest: +SKIP '3' - >>> iris_969.details['id'] + >>> iris_969.details['id'] #doctest: +SKIP '969' 'Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. @@ -122,7 +122,7 @@ To make sure you always get this exact dataset, it is safest to specify it by th The other dataset, with id 969, is version 3 (version 2 has become inactive), and contains a binarized version of the data:: - >>> np.unique(iris_969.target) + >>> np.unique(iris_969.target) #doctest: +SKIP array([b'N', b'P'], dtype='|S1') From d8cfd379e3870343bcb227788f5a23c316929c9a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 24 Oct 2017 16:51:43 -0400 Subject: [PATCH 15/28] add version filter. --- doc/datasets/openml.rst | 10 +++++++++- sklearn/datasets/openml.py | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index d82f12fa6e60c..b36a933b13406 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -117,7 +117,7 @@ has multiple versions:: >>> iris_969.details['id'] #doctest: +SKIP '969' -'Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. +Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. To make sure you always get this exact dataset, it is safest to specify it by the dataset id. 
The other dataset, with id 969, is version 3 (version 2 has become inactive),
and contains a binarized version of the data::

    >>> np.unique(iris_969.target) #doctest: +SKIP
    array([b'N', b'P'],
          dtype='|S1')

+You can also specify both the name and the version, which also uniquely identifies the dataset::
+
+    >>> iris_version_3 = fetch_openml("iris", version=3, data_home=custom_data_home)
+    >>> iris_version_3.details['version']
+    '3'
+    >>> iris_version_3.details['id']
+    '969'
+

..
    >>> import shutil
    >>> shutil.rmtree(custom_data_home)
diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index d4ed8163be725..f523f7cd5d145 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -29,8 +29,8 @@ def _get_data_info_by_name(name, version):
     if version == "active":
         json_string = urlopen(_SEARCH_NAME.format(name + "/status/active/"))
     else:
-        # FIXME waiting for new filter mechanism
-        json_string = urlopen(_SEARCH_NAME.format(name))
+        json_string = urlopen(_SEARCH_NAME.format(name)
+                              + "/data_version/{}".format(version))
     json_data = json.load(json_string)
     return json_data['data']['dataset'][0]

From eea026e5d60ffa672eb58a54ea71407aae603a71 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Tue, 14 Nov 2017 18:23:45 -0500
Subject: [PATCH 16/28] some typos, addressing joel's comments, working on
 better errors

---
 doc/datasets/mldata.rst    |  6 +++---
 doc/datasets/openml.rst    | 12 ++++++++----
 sklearn/datasets/openml.py | 27 +++++++++++++++++++++------
 3 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/doc/datasets/mldata.rst b/doc/datasets/mldata.rst
index cb076e1e75bb7..60546bfcfd363 100644
--- a/doc/datasets/mldata.rst
+++ b/doc/datasets/mldata.rst
@@ -16,10 +16,10 @@ Downloading datasets from the mldata.org repository
 
 `mldata.org <http://mldata.org>`_ is a public repository for machine learning
 data, supported by the `PASCAL network <http://www.pascal-network.org>`_ .
-It is no longer actively maintained, and it's suggested to use :ref:openml instead.
+It is no longer actively maintained, and it's suggested to use :ref:`openml` instead.
 
-The ``sklearn.datasets`` package is able to directly download data
-sets from the repository using the function
+The ``sklearn.datasets`` package is able to directly download datasets
+from the repository using the function
 :func:`sklearn.datasets.fetch_mldata`.
 
 For example, to download the MNIST digit recognition database::
diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst
index b36a933b13406..087fcf2dee4e9 100644
--- a/doc/datasets/openml.rst
+++ b/doc/datasets/openml.rst
@@ -24,10 +24,11 @@ sets from the repository using the function
 For example, to download a dataset of gene expressions in mice brains::
 
     >>> from sklearn.datasets import fetch_openml
-    >>> mice = fetch_openml('miceprotein', data_home=custom_data_home)
+    >>> mice = fetch_openml('miceprotein', version=2, data_home=custom_data_home)
 
-The dataset contains a total of 70000 examples of handwritten digits
-of size 28x28 pixels, labeled from 0 to 9::
+To fully specify a dataset, you need to provide a name and a version, though the
+version is optional, see :ref:`openml_versions` below.
+The dataset contains a total of 1080 examples belonging to 8 different classes::
 
     >>> mice.data.shape
     (1080, 81)
     >>> mice.target.shape
     (1080,)
@@ -80,11 +81,14 @@ The id is also the best way to specify how to fetch a dataset from OpenML::
    'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum':
    '3c479a6885bfa0438971388283a1ce32'}
 
+.. 
_openml_versions: + Dataset Versions ---------------- A dataset is uniquely specified by its id, but not necessarily by its name. -Several different "versions" of a dataset with the same name can exist. +Several different "versions" of a dataset with the same name can exist which can contain +entirely different datasets. If a particular version of a dataset has been found to contain significant issues, it might be inactivated. Using a name to specify a dataset will yield the earliest version of a dataset that is still active. That means that diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index f523f7cd5d145..d4ef3b3588e9d 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -11,7 +11,7 @@ except ImportError: # Python 3+ from urllib.request import urlopen - # from http.client import IncompleteRead + from scipy.io.arff import loadarff import numpy as np @@ -19,6 +19,7 @@ from .base import get_data_home from ..externals.joblib import Memory from ..externals.six import StringIO +from ..externals.six.moves.urllib.error import HTTPError from ..utils import Bunch _SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" @@ -26,11 +27,25 @@ def _get_data_info_by_name(name, version): - if version == "active": - json_string = urlopen(_SEARCH_NAME.format(name + "/status/active/")) - else: - json_string = urlopen(_SEARCH_NAME.format(name) - + "/data_version/{}".format(version)) + data_found = True + try: + if version == "active": + json_string = urlopen(_SEARCH_NAME.format(name + + "/status/active/")) + else: + json_string = urlopen(_SEARCH_NAME.format(name) + + "/data_version/{}".format(version)) + except HTTPError as error: + if error.code == 412: + data_found = False + + if not data_found: + # not in except for nicer traceback + if version == "active": + raise ValueError("No active dataset {} found.".format(name)) + raise ValueError("Dataset {} with version {}" + " not found.".format(name, version)) + json_data = json.load(json_string) return json_data['data']['dataset'][0] From b5c72d9c49453b0ba5902f14798aa9da69f822e2 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 15 Nov 2017 15:08:39 -0500 Subject: [PATCH 17/28] nicer error message on non-existing ID --- sklearn/datasets/openml.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index d4ef3b3588e9d..3658a3ddf435d 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -51,7 +51,16 @@ def _get_data_info_by_name(name, version): def _get_data_description_by_id(data_id): - json_string = urlopen(_DATA_INFO.format(data_id)) + data_found = True + try: + json_string = urlopen(_DATA_INFO.format(data_id)) + except HTTPError as error: + if error.code == 412: + data_found = False + if not data_found: + # not in except for nicer traceback + raise ValueError("Dataset with id {} " + "not found.".format(data_id)) json_data = json.load(json_string) return json_data['data_set_description'] From 64483f8ec76e5193dea318565f326c4e849a0c96 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 15 Nov 2017 15:17:11 -0500 Subject: [PATCH 18/28] minor improvements to data wrangling --- sklearn/datasets/openml.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 3658a3ddf435d..9503ea9d29793 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -159,17 +159,20 @@ def mem(func): # 
download actual data data, meta = _download_data_(data_description['url']) - columns = np.array(meta.names()) - data_columns = columns[columns != target_column] # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous - # we could use a view instead. - dtype = object if "nominal" in meta.types() else None - X = np.array([data[c] for c in data_columns], dtype=dtype).T + # and target at start or end, we could use a view instead. if target_column is not None: y = data[target_column] + data_columns = meta.names().remove(target_column) else: y = None + data_columns = meta.names() + if all([x == "numeric" for x in meta.types()]): + dtype = None + else: + dtype = object + X = np.array([data[c] for c in data_columns], dtype=dtype).T description = u"{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) From 26aaff2c0a58d4efe5b163c47866763045f848dc Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 15 Nov 2017 15:39:14 -0500 Subject: [PATCH 19/28] allow downloading inactive datasets if specified by name and version --- doc/datasets/openml.rst | 10 +++++++--- sklearn/datasets/openml.py | 22 +++++++++++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index 087fcf2dee4e9..3738b720bc5c0 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -17,8 +17,8 @@ Downloading datasets from the openml.org repository `openml.org `_ is a public repository for machine learning data and experiments, that allows everybody to upload open datasets. -The ``sklearn.datasets`` package is able to directly download data -sets from the repository using the function +The ``sklearn.datasets`` package is able to directly download datasets +from the repository using the function :func:`sklearn.datasets.fetch_openml`. For example, to download a dataset of gene expressions in mice brains:: @@ -64,7 +64,11 @@ and ``details`` attributes:: The ``DESCR`` contains a free-text description of the data, while ``details`` contains a dictionary of meta-data stored by openml, like the dataset id. The id of the mice protein dataset is 4550, and you can use this (or the name) -to get more information on the dataset on the openml website: https://www.openml.org/d/4550. +to get more information on the dataset on the openml website:: + + >>> print(mice.url) + + https://www.openml.org/d/4550 The id is also the best way to specify how to fetch a dataset from OpenML:: diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 9503ea9d29793..1ee4d12fb8836 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -38,6 +38,21 @@ def _get_data_info_by_name(name, version): except HTTPError as error: if error.code == 412: data_found = False + else: + raise error + + if not data_found and version != "active": + # might have been deactivated. will warn later + data_found = True + try: + json_string = urlopen(_SEARCH_NAME.format(name) + + "/data_version/{}/status/deactivated".format( + version)) + except HTTPError as error: + if error.code == 412: + data_found = False + else: + raise error if not data_found: # not in except for nicer traceback @@ -162,12 +177,12 @@ def mem(func): # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous # and target at start or end, we could use a view instead. 
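+    # (note that list.remove() mutates in place and returns None, which is
+    # why the column list is built first and the target removed separately)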
From b3b927637c5cf7d37f454b5bc4eb536976aa00fd Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 15 Nov 2017 15:43:07 -0500
Subject: [PATCH 20/28] update mice version 4 dataset id

---
 doc/datasets/openml.rst | 15 ++++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst
index 3738b720bc5c0..60fc090cdecfc 100644
--- a/doc/datasets/openml.rst
+++ b/doc/datasets/openml.rst
@@ -24,7 +24,7 @@ from the repository using the function
 For example, to download a dataset of gene expressions in mice brains::
 
     >>> from sklearn.datasets import fetch_openml
-    >>> mice = fetch_openml('miceprotein', version=2, data_home=custom_data_home)
+    >>> mice = fetch_openml('miceprotein', version=4, data_home=custom_data_home)
 
 To fully specify a dataset, you need to provide a name and a version, though
 the version is optional, see :ref:`openml_versions`_ below.
@@ -63,16 +63,15 @@ and ``details`` attributes::
 
 The ``DESCR`` contains a free-text description of the data, while ``details``
 contains a dictionary of meta-data stored by openml, like the dataset id.
-The id of the mice protein dataset is 4550, and you can use this (or the name)
+The id of the mice protein dataset is 40966, and you can use this (or the name)
 to get more information on the dataset on the openml website::
 
     >>> print(mice.url)
-
-    https://www.openml.org/d/4550
+    https://www.openml.org/d/40966
 
 The id is also the best way to specify how to fetch a dataset from OpenML::
 
-    >>> mice = fetch_openml(4550, data_home=custom_data_home)
+    >>> mice = fetch_openml(40966, data_home=custom_data_home)
     >>> mice.details  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
-    {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
+    {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF',
     'creator': ...,
@@ -96,9 +95,9 @@ entirely different datasets.
 If a particular version of a dataset has been found to contain significant
 issues, it might be inactivated. Using a name to specify a dataset will yield
 the earliest version of a dataset that is still active. That means that
-``fetch_openml("miceprotein")`` can yield different results at differnt times
+``fetch_openml("miceprotein")`` can yield different results at different times
 if earlier versions become inactive.
 
-You can see that the dataset with id 4550 that we fetched above is the version 1
+You can see that the dataset with id 40966 that we fetched above is version 4
 of the "miceprotein" dataset::
 
     >>> mice.details['version']  #doctest: +SKIP

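After this change, name plus version and the bare id point at the same dataset,
so both spellings of the fetch should agree on the metadata. A doctest-style
sanity check (network access required, ids as in the docs above)::

    >>> by_name = fetch_openml('miceprotein', version=4)
    >>> by_id = fetch_openml(40966)
    >>> by_name.details['id'] == by_id.details['id'] == '40966'
    True
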
From 7e91c7140b4d1542a5f0eb946cc32778dd65b3a3 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 15 Nov 2017 15:48:11 -0500
Subject: [PATCH 21/28] add whatsnew entry

---
 doc/whats_new/v0.20.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index 58506cf8aa99b..5de8cf36e8066 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -51,6 +51,14 @@ Model evaluation
   ``'balanced_accuracy'`` scorer for binary classification.
   :issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia `.
 
+Datasets
+........
+
+- Added :func:`sklearn.datasets.fetch_openml` to fetch any dataset from
+  `OpenML <https://openml.org>`_. OpenML is a free, open data sharing platform
+  and will replace mldata, which is no longer maintained. :issue:`9908` by
+  `Andreas Müller`_.
+
 Enhancements
 ............

From 11909d54fbba7d873154ea301c48dcb9941c0a5c Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 15 Nov 2017 16:29:51 -0500
Subject: [PATCH 22/28] add unicode and normalize whitespace flags to pytest
 config

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index 02b3015e87f2e..3b82e8eaf996c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -28,6 +28,7 @@ ignore-files=^setup\.py$
 addopts =
     --doctest-modules
     --disable-pytest-warnings
+doctest_optionflags = NORMALIZE_WHITESPACE ALLOW_UNICODE
 
 [wheelhouse_uploader]
 artifact_indexes=

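The two flags apply to every doctest pytest collects. ``NORMALIZE_WHITESPACE``
lets any run of whitespace match any other run, which is what allows the
wrapped ``mice.details`` output above to pass, and ``ALLOW_UNICODE`` (a
pytest-provided flag) ignores the ``u`` prefix Python 2 puts in front of
unicode reprs, so one expected output serves both interpreters. For example,
this doctest only passes because of ``NORMALIZE_WHITESPACE``::

    >>> print("a    b")  # doctest: +NORMALIZE_WHITESPACE
    a b
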
+""" + +from sklearn.datasets import fetch_openml +from sklearn.utils.testing import check_skip_travis, assert_warns, assert_raises + + +def test_fetch_openml(): + # check_skip_travis() + # fetch with version + iris_1 = fetch_openml("iris", version=1) + assert iris_1.details['id'] == '61' + # fetch without version + iris_1 = fetch_openml("iris") + assert iris_1.details['id'] == '61' + # fetch with dataset id + iris_by_id = fetch_openml(61) + assert iris_by_id.details['name'] == "iris" + assert iris_by_id.data.shape == (150, 4) + assert iris_by_id.target.shape == (150,) + # fetch inactive dataset by id + glas2 = assert_warns(UserWarning, fetch_openml, 40675) + # fetch inactive dataset by name and version + assert glas2.data.shape == (163, 9) + glas2_by_version = assert_warns(UserWarning, fetch_openml, 'glass2', 1) + # there is no active version of glass2 + assert glas2_by_version.details['id'] == '40675' + assert_raises(ValueError, fetch_openml, 'glass2') From 8dcb26bb16a0af4eb4685ba8803c5324d6601747 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 22 Nov 2017 12:28:57 -0500 Subject: [PATCH 24/28] test error messages --- sklearn/datasets/openml.py | 2 +- sklearn/datasets/tests/test_openml.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 1ee4d12fb8836..473c03475fe4f 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -168,7 +168,7 @@ def mem(func): if data_description['status'] != "active": warn("Version {} of dataset {} is inactive, meaning that issues have" " been found in the dataset. Try using a newer version.".format( - data_description['name'], data_description['version'])) + data_description['version'], data_description['name'])) if target_column == "default-target": target_column = data_description.get('default_target_attribute', None) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index b8f912c4c91e4..c7b66b6045783 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -4,7 +4,8 @@ """ from sklearn.datasets import fetch_openml -from sklearn.utils.testing import check_skip_travis, assert_warns, assert_raises +from sklearn.utils.testing import (check_skip_travis, assert_warns_message, + assert_raise_message) def test_fetch_openml(): @@ -21,10 +22,15 @@ def test_fetch_openml(): assert iris_by_id.data.shape == (150, 4) assert iris_by_id.target.shape == (150,) # fetch inactive dataset by id - glas2 = assert_warns(UserWarning, fetch_openml, 40675) + glas2 = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + 40675) # fetch inactive dataset by name and version assert glas2.data.shape == (163, 9) - glas2_by_version = assert_warns(UserWarning, fetch_openml, 'glass2', 1) + glas2_by_version = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + "glass2", 1) # there is no active version of glass2 assert glas2_by_version.details['id'] == '40675' - assert_raises(ValueError, fetch_openml, 'glass2') + assert_raise_message(ValueError, "No active dataset glass2 found", + fetch_openml, 'glass2') From 0d562b62df3402c47ae864a4e04222cb105d5bdf Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 22 Nov 2017 12:35:31 -0500 Subject: [PATCH 25/28] fix command for make test-coverage --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 
From 0d562b62df3402c47ae864a4e04222cb105d5bdf Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 22 Nov 2017 12:35:31 -0500
Subject: [PATCH 25/28] fix command for make test-coverage

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 6725a7441f75a..6f2115820308c 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ endif
 
 test-coverage:
 	rm -rf coverage .coverage
-	$(PYTEST) sklearn --show-locals -v --with-cov sklearn
+	$(PYTEST) sklearn --showlocals -v --cov=sklearn
 
 test: test-code test-sphinxext test-doc

From e274ad3acfd4272943d460f925ef50fdda115fe8 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 22 Nov 2017 12:48:12 -0500
Subject: [PATCH 26/28] make flake8 green

---
 sklearn/datasets/tests/test_openml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
index c7b66b6045783..671c53a93349c 100644
--- a/sklearn/datasets/tests/test_openml.py
+++ b/sklearn/datasets/tests/test_openml.py
@@ -4,7 +4,7 @@
 """
 
 from sklearn.datasets import fetch_openml
-from sklearn.utils.testing import (check_skip_travis, assert_warns_message,
+from sklearn.utils.testing import (assert_warns_message,
                                    assert_raise_message)
 
 

From eb39a01fdbd316633a9e9c972c779e6d617cbaef Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 22 Nov 2017 15:06:44 -0500
Subject: [PATCH 27/28] py35 compatibility

---
 sklearn/datasets/openml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index 473c03475fe4f..e610f27256a1f 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -61,7 +61,7 @@ def _get_data_info_by_name(name, version):
         raise ValueError("Dataset {} with version {}"
                          " not found.".format(name, version))
 
-    json_data = json.load(json_string)
+    json_data = json.loads(json_string.read().decode("utf-8"))
     return json_data['data']['dataset'][0]
 
 
@@ -76,7 +76,7 @@
         # not in except for nicer traceback
         raise ValueError("Dataset with id {} "
                          "not found.".format(data_id))
-    json_data = json.load(json_string)
+    json_data = json.loads(json_string.read().decode("utf-8"))
     return json_data['data_set_description']

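The ``.read().decode("utf-8")`` step is what makes this run on Python 3.5:
``json.loads`` only accepts ``bytes`` from Python 3.6 onwards, and ``json.load``
on the binary HTTP response hits the same limitation. A minimal illustration::

    import json

    payload = b'{"data_set_description": {"id": "61"}}'
    # decoding to text first works on 2.7 and on every 3.x
    print(json.loads(payload.decode("utf-8"))['data_set_description']['id'])
    # json.loads(payload) would raise TypeError on Python 3.5
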
From 67825e85c3be2fb92da49a697ca4ae89b461737e Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Thu, 21 Dec 2017 17:45:41 -0500
Subject: [PATCH 28/28] trying to use CSV interface

---
 sklearn/datasets/openml.py | 46 +++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index e610f27256a1f..002935518c378 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -24,6 +24,7 @@
 
 _SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1"
 _DATA_INFO = "https://openml.org/api/v1/json/data/{}"
+_DATA_FEATURES = "https://openml.org/api/v1/json/data/features/{}"
 
 
 def _get_data_info_by_name(name, version):
@@ -80,6 +81,23 @@ def _get_data_info_by_name(name, version):
     return json_data['data_set_description']
 
 
+def _get_data_features(data_id):
+    data_found = True
+    try:
+        json_string = urlopen(_DATA_FEATURES.format(data_id))
+    except HTTPError as error:
+        if error.code == 412:
+            data_found = False
+        else:
+            raise error
+    if not data_found:
+        # not in except for nicer traceback
+        raise ValueError("Dataset with id {} "
+                         "not found.".format(data_id))
+    json_data = json.loads(json_string.read().decode("utf-8"))
+    return json_data['data_features']['feature']
+
+
 def _download_data(url):
     response = urlopen(url)
     if sys.version_info[0] == 2:
@@ -92,6 +110,14 @@ def _download_data(url):
     return arff
 
 
+def _download_data_csv(file_id):
+    response = urlopen("https://openml.org/data/v1/get_csv/{}".format(file_id))
+    data = np.genfromtxt(response, names=True, dtype=None, delimiter=',',
+                         missing_values='?')
+    response.close()
+    return data
+
+
 def fetch_openml(name_or_id=None, version='active', data_home=None,
                  target_column='default-target', memory=True):
     """Fetch dataset from openml by name or dataset id.
@@ -142,7 +168,8 @@ def mem(func):
         return func
     _get_data_info_by_name_ = mem(_get_data_info_by_name)
     _get_data_description_by_id_ = mem(_get_data_description_by_id)
-    _download_data_ = mem(_download_data)
+    _get_data_features_ = mem(_get_data_features)
+    _download_data_csv_ = mem(_download_data_csv)
 
     if not exists(data_home):
         os.makedirs(data_home)
@@ -173,17 +200,24 @@ def mem(func):
         target_column = data_description.get('default_target_attribute', None)
 
     # download actual data
-    data, meta = _download_data_(data_description['url'])
+    features = _get_data_features_(data_id)
     # TODO: stacking the content of the structured array
     # this results in a copy. If the data was homogeneous
     # and target at start or end, we could use a view instead.
-    data_columns = meta.names()
+    data_columns = []
+    for feature in features:
+        if (feature['name'] != target_column and feature['is_ignore'] ==
+                'false' and feature['is_row_identifier'] == 'false'):
+            data_columns.append(feature['name'])
+
+    data = _download_data_csv_(data_description['file_id'])
     if target_column is not None:
         y = data[target_column]
     else:
         y = None
-    if all([x == "numeric" for x in meta.types()]):
+
+    if all([feature['data_type'] == "numeric" for feature in features
+            if feature['name'] in data_columns]):
         dtype = None
     else:
         dtype = object
@@ -194,7 +228,7 @@ def mem(func):
 
     bunch = Bunch(
         data=X, target=y, feature_names=data_columns,
-        DESCR=description, details=data_description, meta=meta,
+        DESCR=description, details=data_description, features=features,
         url="https://www.openml.org/d/{}".format(data_id))
 
     return bunch
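With the CSV endpoint the loader sidesteps ``scipy.io.arff`` entirely: the
``/data/features/`` listing decides which columns go into ``data`` and which
one becomes ``target``, and ``genfromtxt`` parses the download. Roughly what a
caller now sees (network access required, shapes as asserted in the tests
above)::

    from sklearn.datasets import fetch_openml

    iris = fetch_openml('iris')   # the name resolves to dataset id 61
    print(iris.data.shape)        # (150, 4)
    print(iris.feature_names)     # columns minus target/ignored/row-id ones
    print(iris.url)               # https://www.openml.org/d/61

One caveat of the ``genfromtxt`` route: with ``names=True`` numpy sanitizes the
column names it reads (characters outside letters, digits and underscores are
dropped by default), so ``data[target_column]`` can fail for OpenML feature
names that contain such characters.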