From 268f533dd0f761c6a135021ff40542beda3aac23 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 10 Oct 2017 13:50:01 +0200 Subject: [PATCH 01/28] start on openml dataset loader --- sklearn/datasets/openml.py | 78 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 sklearn/datasets/openml.py diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py new file mode 100644 index 0000000000000..fefeb4aa62203 --- /dev/null +++ b/sklearn/datasets/openml.py @@ -0,0 +1,78 @@ +import json +import warnings +import numpy as np +import numbers + +try: + # Python 2 + from urllib2 import urlopen +except ImportError: + # Python 3+ + from urllib.request import urlopen + +from scipy.io.arff import loadarff + +_SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" + +jsons = "https://openml.org/api/v1/json/data/list/data_name/{}" +data_dl = "https://www.openml.org/data/download/{}" + + +def _get_data_info_by_name(name): + url_path = urlopen(_SEARCH_NAME.format(name)) + json_data = json.load(url_path) + return json_data['data']['dataset'][0] + + +def fetch_openml(name_or_id=None, version=1, json_loc=jsons, + data_loc=data_dl): + """Fetch dataset from openml by name or dataset id. + + Parameters + ---------- + + Returns + ------- + """ + if isinstance(name_or_id, numbers.Integral): + if version != 1: + raise ValueError( + "Dataset id={} and version={} passed, but you can only " + "specify a numeric id or a version, not both.".format( + name_or_id, version)) + data_id = name_or_id + elif isinstance(name_or_id, str): + name = name_or_id + + else: + raise TypeError( + "Invalid name_or_id {}, should be string or integer.".format( + name_or_id)) + + json_dl = urlretrieve(json_loc.format(name))[0] + # get the json file + with open(json_dl, 'r') as tmp: + json_data = json.load(tmp)['data']['dataset'] + vers = [(idx, val) for idx, item in enumerate(json_data) + for key, val in item.items() if key == "version"] + # tell user there are more versions if they dont specify number + if len(vers) > 1 and name_vers is None: + msg = ("dataset: {} has versions {}, " + "default is {}").format(name, + [i[1] for i in vers], + min([i[1] for i in vers])) + warnings.warn(msg) + # check if the version specified (if it is) is in the ones gotten + use = 1 if name_vers is None else name_vers + for v in vers: + if v[1] == use: + to_get = json_data[v[0]]['file_id'] + # download data + data_tmp = urlretrieve(data_loc.format(to_get))[0] + # load the data + data = loadarff(data_tmp) + data_fmt = np.zeros((data[0].shape[0], len(data[0][0])), dtype=object) + # scipy returns a tuple so try to put it in the right format + for idx, row in enumerate(data[0]): + data_fmt[idx, :] = [val for val in row] + return data_fmt From 4f3e93e1cbed32acc7fb3f9921a65fa414fc8f6b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 11 Oct 2017 14:18:02 +0200 Subject: [PATCH 02/28] working on stuff --- sklearn/datasets/openml.py | 43 +++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index fefeb4aa62203..a4e78fa352caf 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -2,6 +2,8 @@ import warnings import numpy as np import numbers +import os +from os.path import join, exists try: # Python 2 @@ -11,29 +13,51 @@ from urllib.request import urlopen from scipy.io.arff import loadarff +from .base import get_data_home _SEARCH_NAME = 
"https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" - -jsons = "https://openml.org/api/v1/json/data/list/data_name/{}" -data_dl = "https://www.openml.org/data/download/{}" +_DATA_INFO = "https://openml.org/api/v1/json/data/{}" +_DATA_DOWNLOAD = "https://www.openml.org/data/download/{}" def _get_data_info_by_name(name): - url_path = urlopen(_SEARCH_NAME.format(name)) + url_path = urlopen(_SEARCH_NAME.format(name)) json_data = json.load(url_path) return json_data['data']['dataset'][0] -def fetch_openml(name_or_id=None, version=1, json_loc=jsons, - data_loc=data_dl): +def _get_data_description_by_id(data_id): + url_path = urlopen(_DATA_INFO.format(data_id)) + json_data = json.load(url_path) + return json_data['data_set_description'] + + +def fetch_openml(name_or_id=None, version=1, data_home=None): """Fetch dataset from openml by name or dataset id. Parameters ---------- + data_home : optional, default: None + Specify another download and cache folder for the data sets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + Returns ------- + + data : Bunch + Dictionary-like object, the interesting attributes are: + 'data', the data to learn, 'target', the classification labels, + 'DESCR', the full description of the dataset, and + 'COL_NAMES', the original names of the dataset columns. """ + data_home = get_data_home(data_home=data_home) + data_home = join(data_home, 'openml') + if not exists(data_home): + os.makedirs(data_home) + + # check if dataset id is known + if isinstance(name_or_id, numbers.Integral): if version != 1: raise ValueError( @@ -42,13 +66,18 @@ def fetch_openml(name_or_id=None, version=1, json_loc=jsons, name_or_id, version)) data_id = name_or_id elif isinstance(name_or_id, str): - name = name_or_id + data_info = _get_data_info_by_name(name_or_id) + data_id = data_info['did'] else: raise TypeError( "Invalid name_or_id {}, should be string or integer.".format( name_or_id)) + + data_description = _get_data_description_by_id(data_id) + + # download actual data json_dl = urlretrieve(json_loc.format(name))[0] # get the json file with open(json_dl, 'r') as tmp: From fabaa90a563ea578d68425ddaef06559815e3db7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 09:47:13 +0200 Subject: [PATCH 03/28] first version working --- sklearn/datasets/openml.py | 76 ++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index a4e78fa352caf..99865d4dfa8b7 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -1,9 +1,8 @@ import json -import warnings -import numpy as np import numbers import os from os.path import join, exists +from http.client import IncompleteRead try: # Python 2 @@ -13,7 +12,12 @@ from urllib.request import urlopen from scipy.io.arff import loadarff +import numpy as np + from .base import get_data_home +# from ..externals.joblib import Memory +from ..externals.six import StringIO +from ..utils import Bunch _SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" _DATA_INFO = "https://openml.org/api/v1/json/data/{}" @@ -21,18 +25,18 @@ def _get_data_info_by_name(name): - url_path = urlopen(_SEARCH_NAME.format(name)) - json_data = json.load(url_path) + json_string = urlopen(_SEARCH_NAME.format(name)) + json_data = json.load(json_string) return json_data['data']['dataset'][0] def _get_data_description_by_id(data_id): - url_path = urlopen(_DATA_INFO.format(data_id)) - json_data 
= json.load(url_path) + json_string = urlopen(_DATA_INFO.format(data_id)) + json_data = json.load(json_string) return json_data['data_set_description'] -def fetch_openml(name_or_id=None, version=1, data_home=None): +def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): """Fetch dataset from openml by name or dataset id. Parameters @@ -53,6 +57,11 @@ def fetch_openml(name_or_id=None, version=1, data_home=None): """ data_home = get_data_home(data_home=data_home) data_home = join(data_home, 'openml') + # if memory: + # mem = Memory(join(data_home, 'cache')) + # _get_data_info_by_name = mem(_get_data_info_by_name) + # _get_data_description_by_id = mem(_get_data_description_by_id) + # _download_data = mem(_download_data) if not exists(data_home): os.makedirs(data_home) @@ -74,34 +83,31 @@ def fetch_openml(name_or_id=None, version=1, data_home=None): "Invalid name_or_id {}, should be string or integer.".format( name_or_id)) - data_description = _get_data_description_by_id(data_id) + target_name = data_description['default_target_attribute'] # download actual data - json_dl = urlretrieve(json_loc.format(name))[0] - # get the json file - with open(json_dl, 'r') as tmp: - json_data = json.load(tmp)['data']['dataset'] - vers = [(idx, val) for idx, item in enumerate(json_data) - for key, val in item.items() if key == "version"] - # tell user there are more versions if they dont specify number - if len(vers) > 1 and name_vers is None: - msg = ("dataset: {} has versions {}, " - "default is {}").format(name, - [i[1] for i in vers], - min([i[1] for i in vers])) - warnings.warn(msg) - # check if the version specified (if it is) is in the ones gotten - use = 1 if name_vers is None else name_vers - for v in vers: - if v[1] == use: - to_get = json_data[v[0]]['file_id'] - # download data - data_tmp = urlretrieve(data_loc.format(to_get))[0] - # load the data - data = loadarff(data_tmp) - data_fmt = np.zeros((data[0].shape[0], len(data[0][0])), dtype=object) - # scipy returns a tuple so try to put it in the right format - for idx, row in enumerate(data[0]): - data_fmt[idx, :] = [val for val in row] - return data_fmt + response = urlopen(_DATA_DOWNLOAD.format(data_id)) + # we need to catch IncompleteRead which is likely a server-side issue + try: + data_arff = response.read() + except IncompleteRead as e: + data_arff = e.partial + # getting structured array and metadata + data, meta = loadarff(StringIO(data_arff.decode("utf-8"))) + columns = np.array(meta.names()) + data_columns = columns[columns != target_name] + # TODO: stacking the content of the structured array + # this results in a copy. If the data was homogeneous + # we could use a view instead. 
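+    # (each data[c] below is a 1-d field of the structured array returned
+    # by scipy's loadarff, so column-stacking the fields yields an array
+    # of shape (n_samples, n_features))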
+ X = np.column_stack(data[c] for c in data_columns) + y = data[target_name] + + description = "{}\n\nDownloaded from openml.org.".format( + data_description['description']) + + bunch = Bunch( + data=X, target=y, feature_names=data_columns, + DESCR=description) + + return bunch From 1804b14d0b2e7a6365a8e8c40d18eef8ba965eb0 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 10:35:28 +0200 Subject: [PATCH 04/28] docstrings, use version="active" as default --- sklearn/datasets/openml.py | 46 +++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 99865d4dfa8b7..6ab2a22678191 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -2,7 +2,7 @@ import numbers import os from os.path import join, exists -from http.client import IncompleteRead +from warnings import warn try: # Python 2 @@ -10,6 +10,7 @@ except ImportError: # Python 3+ from urllib.request import urlopen + from http.client import IncompleteRead from scipy.io.arff import loadarff import numpy as np @@ -24,8 +25,11 @@ _DATA_DOWNLOAD = "https://www.openml.org/data/download/{}" -def _get_data_info_by_name(name): - json_string = urlopen(_SEARCH_NAME.format(name)) +def _get_data_info_by_name(name, version): + if version == "active": + json_string = urlopen(_SEARCH_NAME.format(name + "/status/active/")) + else: + json_string = urlopen(_SEARCH_NAME.format(name)) json_data = json.load(json_string) return json_data['data']['dataset'][0] @@ -36,24 +40,41 @@ def _get_data_description_by_id(data_id): return json_data['data_set_description'] -def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): +def fetch_openml(name_or_id=None, version='active', data_home=None, + memory=True): """Fetch dataset from openml by name or dataset id. + Datasets are uniquely identified by either an integer ID or by a + combination of name and version (i.e. there might be multiple + versions of the 'iris' dataset). Newer versions are assumed to fix + issues in earlier versions. + Parameters ---------- + name_or_id : string or integer + Identifier of the dataset. If integer, assumed to be the id of the + dataset on OpenML, if string, assumed to be the name of the dataset. + + version : integer or 'active', default='active' + Version of the dataset. Only used if ``name_or_id`` is a string. + If 'active' the oldest version that's still active is used. data_home : optional, default: None Specify another download and cache folder for the data sets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + memory : boolean, default=True + Whether to store downloaded datasets using joblib. + Returns ------- data : Bunch Dictionary-like object, the interesting attributes are: - 'data', the data to learn, 'target', the classification labels, - 'DESCR', the full description of the dataset, and - 'COL_NAMES', the original names of the dataset columns. + 'data', the data to learn, 'target', the regression target or + classification labels, 'DESCR', the full description of the dataset, + 'feature_names', the original names of the dataset columns, and + 'details' which provide more information on the openml meta-data. 
""" data_home = get_data_home(data_home=data_home) data_home = join(data_home, 'openml') @@ -66,7 +87,6 @@ def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): os.makedirs(data_home) # check if dataset id is known - if isinstance(name_or_id, numbers.Integral): if version != 1: raise ValueError( @@ -75,7 +95,7 @@ def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): name_or_id, version)) data_id = name_or_id elif isinstance(name_or_id, str): - data_info = _get_data_info_by_name(name_or_id) + data_info = _get_data_info_by_name(name_or_id, version) data_id = data_info['did'] else: @@ -84,6 +104,10 @@ def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): name_or_id)) data_description = _get_data_description_by_id(data_id) + if data_description['status'] != "active": + warn("Version {} of dataset {} is inactive, meaning that issues have" + " been found in the dataset. Try using a newer version.".format( + data_description['name'], data_description['version'])) target_name = data_description['default_target_attribute'] # download actual data @@ -104,10 +128,10 @@ def fetch_openml(name_or_id=None, version=1, data_home=None, memory=True): y = data[target_name] description = "{}\n\nDownloaded from openml.org.".format( - data_description['description']) + data_description.pop('description')) bunch = Bunch( data=X, target=y, feature_names=data_columns, - DESCR=description) + DESCR=description, details=data_description) return bunch From ffd43359ce73ec70172a39f83c1d4686c23bd1d9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 10:43:19 +0200 Subject: [PATCH 05/28] add caching to openml loader --- sklearn/datasets/openml.py | 39 ++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 6ab2a22678191..608c6e68b7147 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -16,7 +16,7 @@ import numpy as np from .base import get_data_home -# from ..externals.joblib import Memory +from ..externals.joblib import Memory from ..externals.six import StringIO from ..utils import Bunch @@ -40,6 +40,17 @@ def _get_data_description_by_id(data_id): return json_data['data_set_description'] +def _download_data(data_id): + response = urlopen(_DATA_DOWNLOAD.format(data_id)) + # we need to catch IncompleteRead which is likely a server-side issue + try: + data_arff = response.read() + except IncompleteRead as e: + data_arff = e.partial + # getting structured array and metadata + return loadarff(StringIO(data_arff.decode("utf-8"))) + + def fetch_openml(name_or_id=None, version='active', data_home=None, memory=True): """Fetch dataset from openml by name or dataset id. 
@@ -78,11 +89,14 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, """ data_home = get_data_home(data_home=data_home) data_home = join(data_home, 'openml') - # if memory: - # mem = Memory(join(data_home, 'cache')) - # _get_data_info_by_name = mem(_get_data_info_by_name) - # _get_data_description_by_id = mem(_get_data_description_by_id) - # _download_data = mem(_download_data) + if memory: + mem = Memory(join(data_home, 'cache'), verbose=0).cache + else: + mem = lambda x: x + _get_data_info_by_name_ = mem(_get_data_info_by_name) + _get_data_description_by_id_ = mem(_get_data_description_by_id) + _download_data_ = mem(_download_data) + if not exists(data_home): os.makedirs(data_home) @@ -95,7 +109,7 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, name_or_id, version)) data_id = name_or_id elif isinstance(name_or_id, str): - data_info = _get_data_info_by_name(name_or_id, version) + data_info = _get_data_info_by_name_(name_or_id, version) data_id = data_info['did'] else: @@ -103,7 +117,7 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, "Invalid name_or_id {}, should be string or integer.".format( name_or_id)) - data_description = _get_data_description_by_id(data_id) + data_description = _get_data_description_by_id_(data_id) if data_description['status'] != "active": warn("Version {} of dataset {} is inactive, meaning that issues have" " been found in the dataset. Try using a newer version.".format( @@ -111,14 +125,7 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, target_name = data_description['default_target_attribute'] # download actual data - response = urlopen(_DATA_DOWNLOAD.format(data_id)) - # we need to catch IncompleteRead which is likely a server-side issue - try: - data_arff = response.read() - except IncompleteRead as e: - data_arff = e.partial - # getting structured array and metadata - data, meta = loadarff(StringIO(data_arff.decode("utf-8"))) + data, meta = _download_data_(data_id) columns = np.array(meta.names()) data_columns = columns[columns != target_name] # TODO: stacking the content of the structured array From fe0904bcc15084bf35583957672135333007a037 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 11:40:55 +0200 Subject: [PATCH 06/28] pep8 annoyance --- sklearn/datasets/openml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 608c6e68b7147..364ef90f83dd1 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -92,7 +92,8 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, if memory: mem = Memory(join(data_home, 'cache'), verbose=0).cache else: - mem = lambda x: x + def mem(func): + return func _get_data_info_by_name_ = mem(_get_data_info_by_name) _get_data_description_by_id_ = mem(_get_data_description_by_id) _download_data_ = mem(_download_data) From eea026e5d60ffa672eb58a54ea71407aae603a71 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 14:03:32 +0200 Subject: [PATCH 07/28] fix download url, allow datasets without target --- sklearn/datasets/openml.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 364ef90f83dd1..c1fc2f7af9b60 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -10,7 +10,7 @@ except ImportError: # Python 3+ from urllib.request import urlopen - from http.client import 
IncompleteRead + # from http.client import IncompleteRead from scipy.io.arff import loadarff import numpy as np @@ -22,13 +22,13 @@ _SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" _DATA_INFO = "https://openml.org/api/v1/json/data/{}" -_DATA_DOWNLOAD = "https://www.openml.org/data/download/{}" def _get_data_info_by_name(name, version): if version == "active": json_string = urlopen(_SEARCH_NAME.format(name + "/status/active/")) else: + # FIXME waiting for new filter mechanism json_string = urlopen(_SEARCH_NAME.format(name)) json_data = json.load(json_string) return json_data['data']['dataset'][0] @@ -40,15 +40,11 @@ def _get_data_description_by_id(data_id): return json_data['data_set_description'] -def _download_data(data_id): - response = urlopen(_DATA_DOWNLOAD.format(data_id)) - # we need to catch IncompleteRead which is likely a server-side issue - try: - data_arff = response.read() - except IncompleteRead as e: - data_arff = e.partial - # getting structured array and metadata - return loadarff(StringIO(data_arff.decode("utf-8"))) +def _download_data(url): + response = urlopen(url) + arff = loadarff(StringIO(response.read().decode('utf-8'))) + response.close() + return arff def fetch_openml(name_or_id=None, version='active', data_home=None, @@ -103,7 +99,7 @@ def mem(func): # check if dataset id is known if isinstance(name_or_id, numbers.Integral): - if version != 1: + if version != "active": raise ValueError( "Dataset id={} and version={} passed, but you can only " "specify a numeric id or a version, not both.".format( @@ -123,17 +119,20 @@ def mem(func): warn("Version {} of dataset {} is inactive, meaning that issues have" " been found in the dataset. Try using a newer version.".format( data_description['name'], data_description['version'])) - target_name = data_description['default_target_attribute'] + target_name = data_description.get('default_target_attribute', None) # download actual data - data, meta = _download_data_(data_id) + data, meta = _download_data_(data_description['url']) columns = np.array(meta.names()) data_columns = columns[columns != target_name] # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous # we could use a view instead. X = np.column_stack(data[c] for c in data_columns) - y = data[target_name] + if target_name is not None: + y = data[target_name] + else: + y = None description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) From bca12e9dd77d8c7451a9a7b431b853f1e6571615 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 14:25:14 +0200 Subject: [PATCH 08/28] allow specifying the target column, starting on docs --- doc/datasets/openml.rst | 68 ++++++++++++++++++++++++++++++++++++++ sklearn/datasets/openml.py | 19 +++++++---- 2 files changed, 81 insertions(+), 6 deletions(-) create mode 100644 doc/datasets/openml.rst diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst new file mode 100644 index 0000000000000..f91226b454b72 --- /dev/null +++ b/doc/datasets/openml.rst @@ -0,0 +1,68 @@ +.. + For doctests: + + >>> import numpy as np + >>> import os + >>> import tempfile + >>> # Create a temporary folder for the data fetcher + >>> custom_data_home = tempfile.mkdtemp() + >>> os.makedirs(os.path.join(custom_data_home, 'mldata')) + + +.. 
_mldata:

Downloading datasets from the openml.org repository
===================================================

`openml.org `_ is a public repository for machine learning
data and experiments.

The ``sklearn.datasets`` package is able to directly download data
sets from the repository using the function
:func:`sklearn.datasets.fetch_openml`.

For example, to download a dataset of gene expressions in mice brains:

    >>> from sklearn.datasets import fetch_mldata
    >>> mnist = fetch_mldata('miceprotein', data_home=custom_data_home)

The MNIST database contains a total of 70000 examples of handwritten digits
of size 28x28 pixels, labeled from 0 to 9::

    >>> mnist.data.shape
    (70000, 784)
    >>> mnist.target.shape
    (70000,)
    >>> np.unique(mnist.target)
    array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

After the first download, the dataset is cached locally in the path
specified by the ``data_home`` keyword argument, which defaults to
``~/scikit_learn_data/``::

    >>> os.listdir(os.path.join(custom_data_home, 'mldata'))
    ['mnist-original.mat']

Data sets in `mldata.org `_ do not adhere to a strict
naming or formatting convention. :func:`sklearn.datasets.fetch_mldata` is
able to make sense of the most common cases, but allows to tailor the
defaults to individual datasets:

* For datasets with multiple columns, :func:`sklearn.datasets.fetch_mldata`
  tries to identify the target and data columns and rename them to ``target``
  and ``data``. This is done by looking for arrays named ``label`` and
  ``data`` in the dataset, and failing that by choosing the first array to be
  ``target`` and the second to be ``data``. This behavior can be changed with
  the ``target_name`` and ``data_name`` keywords, setting them to a specific
  name or index number (the name and order of the columns in the datasets
  can be found at its `mldata.org `_ under the tab "Data"::

    >>> iris2 = fetch_mldata('datasets-UCI iris', target_name=1, data_name=0,
    ...                      data_home=custom_data_home)
    >>> iris3 = fetch_mldata('datasets-UCI iris', target_name='class',
    ...                      data_name='double0', data_home=custom_data_home)


..
    >>> import shutil
    >>> shutil.rmtree(custom_data_home)
diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index c1fc2f7af9b60..bc57e6b2feadc 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -48,7 +48,7 @@ def _download_data(url):
 
 
 def fetch_openml(name_or_id=None, version='active', data_home=None,
-                 memory=True):
+                 target_column='default-target', memory=True):
     """Fetch dataset from openml by name or dataset id.
 
     Datasets are uniquely identified by either an integer ID or by a
@@ -66,10 +66,16 @@ def fetch_openml(name_or_id=None, version='active', data_home=None,
         Version of the dataset. Only used if ``name_or_id`` is a string.
         If 'active' the oldest version that's still active is used.
 
-    data_home : optional, default: None
+    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
 
+    target_column : string or None, default 'default-target'
+        Specify the column name in the data to use as target. If
+        'default-target', the standard target column as stored on the server
+        is used. If ``None``, all columns are returned as data and the
+        target is ``None``.
+
     memory : boolean, default=True
         Whether to store downloaded datasets using joblib.
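+
+    Examples
+    --------
+    A sketch of the intended behaviour (the shapes are those of the
+    mice-protein dataset; passing ``target_column=None`` returns every
+    column, including the usual target column, as data)::
+
+        >>> mice = fetch_openml('miceprotein')  # doctest: +SKIP
+        >>> mice.data.shape, mice.target.shape  # doctest: +SKIP
+        ((1080, 81), (1080,))
+        >>> raw = fetch_openml('miceprotein', target_column=None)  # doctest: +SKIP
+        >>> raw.data.shape, raw.target is None  # doctest: +SKIP
+        ((1080, 82), True)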
@@ -119,18 +125,19 @@ def mem(func): warn("Version {} of dataset {} is inactive, meaning that issues have" " been found in the dataset. Try using a newer version.".format( data_description['name'], data_description['version'])) - target_name = data_description.get('default_target_attribute', None) + if target_column == "default-target": + target_column = data_description.get('default_target_attribute', None) # download actual data data, meta = _download_data_(data_description['url']) columns = np.array(meta.names()) - data_columns = columns[columns != target_name] + data_columns = columns[columns != target_column] # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous # we could use a view instead. X = np.column_stack(data[c] for c in data_columns) - if target_name is not None: - y = data[target_name] + if target_column is not None: + y = data[target_column] else: y = None From f59ce8b47dc429af6dd57f25520c06ad2bbe209b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 15:42:05 +0200 Subject: [PATCH 09/28] add openml to the narrative docs --- doc/datasets/index.rst | 3 ++ doc/datasets/openml.rst | 79 +++++++++++++++++++++++++++----------- sklearn/datasets/openml.py | 3 +- 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index f91163fc235c5..580d9e3950561 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -318,6 +318,7 @@ writing data in that format. olivetti_faces twenty_newsgroups mldata + openml labeled_faces covtype rcv1 @@ -327,6 +328,8 @@ writing data in that format. .. include:: twenty_newsgroups.rst +.. include:: openml.rst + .. include:: mldata.rst .. include:: labeled_faces.rst diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index f91226b454b72..84669cedf694b 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -21,47 +21,80 @@ The ``sklearn.datasets`` package is able to directly download data sets from the repository using the function :func:`sklearn.datasets.fetch_openml`. -For example, to download a dataset of gene expressions in mice brains: +For example, to download a dataset of gene expressions in mice brains:: >>> from sklearn.datasets import fetch_mldata - >>> mnist = fetch_mldata('miceprotein', data_home=custom_data_home) + >>> mice = fetch_mldata('miceprotein', data_home=custom_data_home) -The MNIST database contains a total of 70000 examples of handwritten digits +The dataset contains a total of 70000 examples of handwritten digits of size 28x28 pixels, labeled from 0 to 9:: - >>> mnist.data.shape + >>> mice.data.shape (70000, 784) - >>> mnist.target.shape + >>> mice.target.shape (70000,) - >>> np.unique(mnist.target) + >>> np.unique(mice.target) array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]) -After the first download, the dataset is cached locally in the path -specified by the ``data_home`` keyword argument, which defaults to -``~/scikit_learn_data/``:: +You can get more information on the dataset by looking at the ``DESCR`` +and ``details`` attributes:: - >>> os.listdir(os.path.join(custom_data_home, 'mldata')) - ['mnist-original.mat'] + >>> print(mice.DESCR) + something + >>> mice.details + +The ``DESCR`` contains a free-text description of the data, while ``details`` +contains a dictionary of meta-data stored by openml, like the dataset id. 
+The id of the mice protein dataset is 4550, and you can use this (or the name) +to get more information on the dataset on the openml website: https://www.openml.org/d/4550. Data sets in `mldata.org `_ do not adhere to a strict naming or formatting convention. :func:`sklearn.datasets.fetch_mldata` is able to make sense of the most common cases, but allows to tailor the defaults to individual datasets: -* For datasets with multiple columns, :func:`sklearn.datasets.fetch_mldata` - tries to identify the target and data columns and rename them to ``target`` - and ``data``. This is done by looking for arrays named ``label`` and - ``data`` in the dataset, and failing that by choosing the first array to be - ``target`` and the second to be ``data``. This behavior can be changed with - the ``target_name`` and ``data_name`` keywords, setting them to a specific - name or index number (the name and order of the columns in the datasets - can be found at its `mldata.org `_ under the tab "Data":: +The id is also the best way to specify how to fetch a dataset from OpenML:: + + >>> mice = fetch_mldata(4550, data_home=custom_data_home) + >>> mice.details + +Dataset Versions +---------------- + +A dataset is uniquely specified by its id, but not necessarily by its name. +Several different "versions" of a dataset with the same name can exist. +If a particular version of a dataset has been found to contain significant +issues, it might be inactivated. Using a name to specify a dataset will yield +the earliest version of a dataset that is still active. That means that +``fetch_mldata("miceprotein")`` can yield different results at differnt times +if earlier versions become inactive. +You can see that the dataset with id 4550 that we fetched above is the version 1 +of the "miceprotein" dataset:: + + >>> mice.details['version'] + 1 + +In fact, this dataset only has one version. The iris dataset on the other hand +has multiple versions:: + + >>> iris = fetch_mldata("iris", data_home=custom_data_home) + >>> iris.details['version'] + >>> iris.details['id'] + + >>> iris_61 = fetch_mldata(61, data_home=custom_data_home) + >>> iris_61.details['version'] + >>> iris_61.details['id'] + + >>> iris_969 = fetch_mldata(969, data_home=custom_data_home) + >>> iris_969.details['version'] + >>> iris_969.details['id'] - >>> iris2 = fetch_mldata('datasets-UCI iris', target_name=1, data_name=0, - ... data_home=custom_data_home) - >>> iris3 = fetch_mldata('datasets-UCI iris', target_name='class', - ... data_name='double0', data_home=custom_data_home) +Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. +To make sure you always get this exact dataset, it is safest to specify it by the dataset id. +The other dataset, with id 969, is version 3 (version 2 has become inactive), and contains +a binarized version of the data:: + >>> np.bincount(iris_969.target) .. >>> import shutil diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index bc57e6b2feadc..01037e25f8693 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -53,8 +53,7 @@ def fetch_openml(name_or_id=None, version='active', data_home=None, Datasets are uniquely identified by either an integer ID or by a combination of name and version (i.e. there might be multiple - versions of the 'iris' dataset). Newer versions are assumed to fix - issues in earlier versions. + versions of the 'iris' dataset). 
Parameters ---------- From d7dee6dc0bb5a5af106bbb2b373ae7fa1846fc9d Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 15:43:05 +0200 Subject: [PATCH 10/28] get more people to upload stuff to openml. --- doc/datasets/openml.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index 84669cedf694b..40c0e81afd9c6 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -15,7 +15,7 @@ Downloading datasets from the openml.org repository =================================================== `openml.org `_ is a public repository for machine learning -data and experiments. +data and experiments, that allows everybody to upload open datasets. The ``sklearn.datasets`` package is able to directly download data sets from the repository using the function From 4c19ad9a300f19f33927e1ade6fe981457720743 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 15:48:56 +0200 Subject: [PATCH 11/28] store metadata, convert to dtype object if there is nominal data. --- sklearn/datasets/openml.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 01037e25f8693..4bb51e05c5759 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -134,7 +134,8 @@ def mem(func): # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous # we could use a view instead. - X = np.column_stack(data[c] for c in data_columns) + dtype = object if "nominal" in meta.types() else None + X = np.array([data[c] for c in data_columns], dtype=dtype).T if target_column is not None: y = data[target_column] else: @@ -145,6 +146,6 @@ def mem(func): bunch = Bunch( data=X, target=y, feature_names=data_columns, - DESCR=description, details=data_description) + DESCR=description, details=data_description, meta=meta) return bunch From 16b7fed08477d165715ef2a40830912cdd38bddd Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 12 Oct 2017 16:05:36 +0200 Subject: [PATCH 12/28] fix doctests, add fetch_openml to __init__ --- doc/datasets/mldata.rst | 1 + doc/datasets/openml.rst | 80 +++++++++++++++++++++++++----------- doc/modules/classes.rst | 1 + sklearn/datasets/__init__.py | 2 + 4 files changed, 59 insertions(+), 25 deletions(-) diff --git a/doc/datasets/mldata.rst b/doc/datasets/mldata.rst index b94dfd7620a24..cb076e1e75bb7 100644 --- a/doc/datasets/mldata.rst +++ b/doc/datasets/mldata.rst @@ -16,6 +16,7 @@ Downloading datasets from the mldata.org repository `mldata.org `_ is a public repository for machine learning data, supported by the `PASCAL network `_ . +It is no longer actively maintained, and it's suggested to use :ref:openml instead. The ``sklearn.datasets`` package is able to directly download data sets from the repository using the function diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index 40c0e81afd9c6..8a080fffc1ed8 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -6,10 +6,10 @@ >>> import tempfile >>> # Create a temporary folder for the data fetcher >>> custom_data_home = tempfile.mkdtemp() - >>> os.makedirs(os.path.join(custom_data_home, 'mldata')) + >>> os.makedirs(os.path.join(custom_data_home, 'openml')) -.. _mldata: +.. 
_openml: Downloading datasets from the openml.org repository =================================================== @@ -23,40 +23,62 @@ sets from the repository using the function For example, to download a dataset of gene expressions in mice brains:: - >>> from sklearn.datasets import fetch_mldata - >>> mice = fetch_mldata('miceprotein', data_home=custom_data_home) + >>> from sklearn.datasets import fetch_openml + >>> mice = fetch_openml('miceprotein', data_home=custom_data_home) The dataset contains a total of 70000 examples of handwritten digits of size 28x28 pixels, labeled from 0 to 9:: >>> mice.data.shape - (70000, 784) + (1080, 81) >>> mice.target.shape - (70000,) - >>> np.unique(mice.target) - array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]) + (1080,) + >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + array([b"'c-CS-m'", b"'c-CS-s'", b"'c-SC-m'", b"'c-SC-s'", b"'t-CS-m'", + b"'t-CS-s'", b"'t-SC-m'", b"'t-SC-s'"], dtype='|S8') You can get more information on the dataset by looking at the ``DESCR`` and ``details`` attributes:: - >>> print(mice.DESCR) - something - >>> mice.details + >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios + **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 + **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing + Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down + Syndrome. PLoS ONE 10(6): e0129126... + + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', + 'creator': ..., + 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': + 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': + '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, + Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins + Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): + e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', + 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': + '3c479a6885bfa0438971388283a1ce32'} + The ``DESCR`` contains a free-text description of the data, while ``details`` contains a dictionary of meta-data stored by openml, like the dataset id. The id of the mice protein dataset is 4550, and you can use this (or the name) to get more information on the dataset on the openml website: https://www.openml.org/d/4550. -Data sets in `mldata.org `_ do not adhere to a strict -naming or formatting convention. 
:func:`sklearn.datasets.fetch_mldata` is -able to make sense of the most common cases, but allows to tailor the -defaults to individual datasets: - The id is also the best way to specify how to fetch a dataset from OpenML:: - >>> mice = fetch_mldata(4550, data_home=custom_data_home) - >>> mice.details + >>> mice = fetch_openml(4550, data_home=custom_data_home) + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', + 'creator': ..., + 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': + 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': + '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, + Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins + Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): + e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', + 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': + '3c479a6885bfa0438971388283a1ce32'} Dataset Versions ---------------- @@ -66,35 +88,43 @@ Several different "versions" of a dataset with the same name can exist. If a particular version of a dataset has been found to contain significant issues, it might be inactivated. Using a name to specify a dataset will yield the earliest version of a dataset that is still active. That means that -``fetch_mldata("miceprotein")`` can yield different results at differnt times +``fetch_openml("miceprotein")`` can yield different results at differnt times if earlier versions become inactive. You can see that the dataset with id 4550 that we fetched above is the version 1 of the "miceprotein" dataset:: >>> mice.details['version'] - 1 + '1' In fact, this dataset only has one version. The iris dataset on the other hand has multiple versions:: - >>> iris = fetch_mldata("iris", data_home=custom_data_home) + >>> iris = fetch_openml("iris", data_home=custom_data_home) >>> iris.details['version'] + '1' >>> iris.details['id'] + '61' - >>> iris_61 = fetch_mldata(61, data_home=custom_data_home) + >>> iris_61 = fetch_openml(61, data_home=custom_data_home) >>> iris_61.details['version'] + '1' >>> iris_61.details['id'] + '61' - >>> iris_969 = fetch_mldata(969, data_home=custom_data_home) + >>> iris_969 = fetch_openml(969, data_home=custom_data_home) >>> iris_969.details['version'] + '3' >>> iris_969.details['id'] + '969' -Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. +'Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. To make sure you always get this exact dataset, it is safest to specify it by the dataset id. The other dataset, with id 969, is version 3 (version 2 has become inactive), and contains a binarized version of the data:: - >>> np.bincount(iris_969.target) + >>> np.unique(iris_969.target) + array([b'N', b'P'], + dtype='|S1') .. 
>>> import shutil diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index cfe2fd11c9ac4..b6cbb05a01f55 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -224,6 +224,7 @@ Loaders datasets.fetch_lfw_people datasets.fetch_mldata datasets.fetch_olivetti_faces + datasets.fetch_openml datasets.fetch_rcv1 datasets.fetch_species_distributions datasets.get_data_home diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index c43c0c4758b10..c7d78e633493d 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -23,6 +23,7 @@ from .twenty_newsgroups import fetch_20newsgroups from .twenty_newsgroups import fetch_20newsgroups_vectorized from .mldata import fetch_mldata, mldata_filename +from .openml import fetch_openml from .samples_generator import make_classification from .samples_generator import make_multilabel_classification from .samples_generator import make_hastie_10_2 @@ -65,6 +66,7 @@ 'fetch_covtype', 'fetch_rcv1', 'fetch_kddcup99', + 'fetch_openml', 'get_data_home', 'load_boston', 'load_diabetes', From b3f6c3690a1fe8c04b5e4fb8f2e00ad432510f85 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 17 Oct 2017 17:47:54 -0400 Subject: [PATCH 13/28] make arff reading work in python2.7 --- sklearn/datasets/openml.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 4bb51e05c5759..d4ed8163be725 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -1,5 +1,6 @@ import json import numbers +import sys import os from os.path import join, exists from warnings import warn @@ -42,7 +43,12 @@ def _get_data_description_by_id(data_id): def _download_data(url): response = urlopen(url) - arff = loadarff(StringIO(response.read().decode('utf-8'))) + if sys.version_info[0] == 2: + # Python2.7 numpy can't handle unicode? + arff = loadarff(StringIO(response.read())) + else: + arff = loadarff(StringIO(response.read().decode('utf-8'))) + response.close() return arff @@ -141,7 +147,7 @@ def mem(func): else: y = None - description = "{}\n\nDownloaded from openml.org.".format( + description = u"{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) bunch = Bunch( From dc401f26b5177300f3ea003c75342baef74ae013 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 24 Oct 2017 16:41:36 -0400 Subject: [PATCH 14/28] ignore doctests for now because of unicode issues --- doc/datasets/openml.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index 8a080fffc1ed8..d82f12fa6e60c 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -33,21 +33,21 @@ of size 28x28 pixels, labeled from 0 to 9:: (1080, 81) >>> mice.target.shape (1080,) - >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP array([b"'c-CS-m'", b"'c-CS-s'", b"'c-SC-m'", b"'c-SC-s'", b"'t-CS-m'", b"'t-CS-s'", b"'t-SC-m'", b"'t-SC-s'"], dtype='|S8') You can get more information on the dataset by looking at the ``DESCR`` and ``details`` attributes:: - >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. 
Cios **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): e0129126... - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', 'creator': ..., 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': @@ -68,7 +68,7 @@ to get more information on the dataset on the openml website: https://www.openml The id is also the best way to specify how to fetch a dataset from OpenML:: >>> mice = fetch_openml(4550, data_home=custom_data_home) - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', 'creator': ..., 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': @@ -93,28 +93,28 @@ if earlier versions become inactive. You can see that the dataset with id 4550 that we fetched above is the version 1 of the "miceprotein" dataset:: - >>> mice.details['version'] + >>> mice.details['version'] #doctest: +SKIP '1' In fact, this dataset only has one version. The iris dataset on the other hand has multiple versions:: >>> iris = fetch_openml("iris", data_home=custom_data_home) - >>> iris.details['version'] + >>> iris.details['version'] #doctest: +SKIP '1' - >>> iris.details['id'] + >>> iris.details['id'] #doctest: +SKIP '61' >>> iris_61 = fetch_openml(61, data_home=custom_data_home) - >>> iris_61.details['version'] + >>> iris_61.details['version'] #doctest: +SKIP '1' - >>> iris_61.details['id'] + >>> iris_61.details['id'] #doctest: +SKIP '61' >>> iris_969 = fetch_openml(969, data_home=custom_data_home) - >>> iris_969.details['version'] + >>> iris_969.details['version'] #doctest: +SKIP '3' - >>> iris_969.details['id'] + >>> iris_969.details['id'] #doctest: +SKIP '969' 'Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. @@ -122,7 +122,7 @@ To make sure you always get this exact dataset, it is safest to specify it by th The other dataset, with id 969, is version 3 (version 2 has become inactive), and contains a binarized version of the data:: - >>> np.unique(iris_969.target) + >>> np.unique(iris_969.target) #doctest: +SKIP array([b'N', b'P'], dtype='|S1') From d8cfd379e3870343bcb227788f5a23c316929c9a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 24 Oct 2017 16:51:43 -0400 Subject: [PATCH 15/28] add version filter. --- doc/datasets/openml.rst | 10 +++++++++- sklearn/datasets/openml.py | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index d82f12fa6e60c..b36a933b13406 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -117,7 +117,7 @@ has multiple versions:: >>> iris_969.details['id'] #doctest: +SKIP '969' -'Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. +Specifying the dataset by the name "iris" yields the lowest version, version 1, with the id 61. To make sure you always get this exact dataset, it is safest to specify it by the dataset id. 
The other dataset, with id 969, is version 3 (version 2 has become inactive),
and contains a binarized version of the data::

    >>> np.unique(iris_969.target) #doctest: +SKIP
    array([b'N', b'P'],
          dtype='|S1')

+You can also specify both the name and the version, which also uniquely identifies the dataset::
+
+    >>> iris_version_3 = fetch_openml("iris", version=3, data_home=custom_data_home)
+    >>> iris_version_3.details['version']
+    '3'
+    >>> iris_version_3.details['id']
+    '969'
+

..
    >>> import shutil
    >>> shutil.rmtree(custom_data_home)
diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index d4ed8163be725..f523f7cd5d145 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -29,8 +29,8 @@ def _get_data_info_by_name(name, version):
     if version == "active":
         json_string = urlopen(_SEARCH_NAME.format(name + "/status/active/"))
     else:
-        # FIXME waiting for new filter mechanism
-        json_string = urlopen(_SEARCH_NAME.format(name))
+        json_string = urlopen(_SEARCH_NAME.format(name)
+                              + "/data_version/{}".format(version))
     json_data = json.load(json_string)
     return json_data['data']['dataset'][0]

From eea026e5d60ffa672eb58a54ea71407aae603a71 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Tue, 14 Nov 2017 18:23:45 -0500
Subject: [PATCH 16/28] some typos, addressing joel's comments, working on
 better errors

---
 doc/datasets/mldata.rst    |  6 +++---
 doc/datasets/openml.rst    | 12 ++++++++----
 sklearn/datasets/openml.py | 27 +++++++++++++++++++++------
 3 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/doc/datasets/mldata.rst b/doc/datasets/mldata.rst
index cb076e1e75bb7..60546bfcfd363 100644
--- a/doc/datasets/mldata.rst
+++ b/doc/datasets/mldata.rst
@@ -16,10 +16,10 @@ Downloading datasets from the mldata.org repository
 
 `mldata.org <http://mldata.org>`_ is a public repository for machine learning
 data, supported by the `PASCAL network <http://www.pascal-network.org>`_ .
-It is no longer actively maintained, and it's suggested to use :ref:openml instead.
+It is no longer actively maintained, and it's suggested to use :ref:`openml` instead.
 
-The ``sklearn.datasets`` package is able to directly download data
-sets from the repository using the function
+The ``sklearn.datasets`` package is able to directly download datasets
+from the repository using the function
 :func:`sklearn.datasets.fetch_mldata`.
 
 For example, to download the MNIST digit recognition database::
diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst
index b36a933b13406..087fcf2dee4e9 100644
--- a/doc/datasets/openml.rst
+++ b/doc/datasets/openml.rst
@@ -24,10 +24,11 @@ sets from the repository using the function
 For example, to download a dataset of gene expressions in mice brains::
 
     >>> from sklearn.datasets import fetch_openml
-    >>> mice = fetch_openml('miceprotein', data_home=custom_data_home)
+    >>> mice = fetch_openml('miceprotein', version=2, data_home=custom_data_home)
 
-The dataset contains a total of 70000 examples of handwritten digits
-of size 28x28 pixels, labeled from 0 to 9::
+To fully specify a dataset, you need to provide a name and a version, though the
+version is optional, see :ref:`openml_versions` below.
+The dataset contains a total of 1080 examples belonging to 8 different classes::
 
     >>> mice.data.shape
     (1080, 81)
     >>> mice.target.shape
     (1080,)
@@ -80,11 +81,14 @@ The id is also the best way to specify how to fetch a dataset from OpenML::
    'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum':
    '3c479a6885bfa0438971388283a1ce32'}
 
+.. 
_openml_versions: + Dataset Versions ---------------- A dataset is uniquely specified by its id, but not necessarily by its name. -Several different "versions" of a dataset with the same name can exist. +Several different "versions" of a dataset with the same name can exist which can contain +entirely different datasets. If a particular version of a dataset has been found to contain significant issues, it might be inactivated. Using a name to specify a dataset will yield the earliest version of a dataset that is still active. That means that diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index f523f7cd5d145..d4ef3b3588e9d 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -11,7 +11,7 @@ except ImportError: # Python 3+ from urllib.request import urlopen - # from http.client import IncompleteRead + from scipy.io.arff import loadarff import numpy as np @@ -19,6 +19,7 @@ from .base import get_data_home from ..externals.joblib import Memory from ..externals.six import StringIO +from ..externals.six.moves.urllib.error import HTTPError from ..utils import Bunch _SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1" @@ -26,11 +27,25 @@ def _get_data_info_by_name(name, version): - if version == "active": - json_string = urlopen(_SEARCH_NAME.format(name + "/status/active/")) - else: - json_string = urlopen(_SEARCH_NAME.format(name) - + "/data_version/{}".format(version)) + data_found = True + try: + if version == "active": + json_string = urlopen(_SEARCH_NAME.format(name + + "/status/active/")) + else: + json_string = urlopen(_SEARCH_NAME.format(name) + + "/data_version/{}".format(version)) + except HTTPError as error: + if error.code == 412: + data_found = False + + if not data_found: + # not in except for nicer traceback + if version == "active": + raise ValueError("No active dataset {} found.".format(name)) + raise ValueError("Dataset {} with version {}" + " not found.".format(name, version)) + json_data = json.load(json_string) return json_data['data']['dataset'][0] From b5c72d9c49453b0ba5902f14798aa9da69f822e2 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 15 Nov 2017 15:08:39 -0500 Subject: [PATCH 17/28] nicer error message on non-existing ID --- sklearn/datasets/openml.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index d4ef3b3588e9d..3658a3ddf435d 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -51,7 +51,16 @@ def _get_data_info_by_name(name, version): def _get_data_description_by_id(data_id): - json_string = urlopen(_DATA_INFO.format(data_id)) + data_found = True + try: + json_string = urlopen(_DATA_INFO.format(data_id)) + except HTTPError as error: + if error.code == 412: + data_found = False + if not data_found: + # not in except for nicer traceback + raise ValueError("Dataset with id {} " + "not found.".format(data_id)) json_data = json.load(json_string) return json_data['data_set_description'] From 64483f8ec76e5193dea318565f326c4e849a0c96 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 15 Nov 2017 15:17:11 -0500 Subject: [PATCH 18/28] minor improvements to data wrangling --- sklearn/datasets/openml.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 3658a3ddf435d..9503ea9d29793 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -159,17 +159,20 @@ def mem(func): # 
download actual data data, meta = _download_data_(data_description['url']) - columns = np.array(meta.names()) - data_columns = columns[columns != target_column] # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous - # we could use a view instead. - dtype = object if "nominal" in meta.types() else None - X = np.array([data[c] for c in data_columns], dtype=dtype).T + # and target at start or end, we could use a view instead. if target_column is not None: y = data[target_column] + data_columns = meta.names().remove(target_column) else: y = None + data_columns = meta.names() + if all([x == "numeric" for x in meta.types()]): + dtype = None + else: + dtype = object + X = np.array([data[c] for c in data_columns], dtype=dtype).T description = u"{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) From 26aaff2c0a58d4efe5b163c47866763045f848dc Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 15 Nov 2017 15:39:14 -0500 Subject: [PATCH 19/28] allow downloading inactive datasets if specified by name and version --- doc/datasets/openml.rst | 10 +++++++--- sklearn/datasets/openml.py | 22 +++++++++++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst index 087fcf2dee4e9..3738b720bc5c0 100644 --- a/doc/datasets/openml.rst +++ b/doc/datasets/openml.rst @@ -17,8 +17,8 @@ Downloading datasets from the openml.org repository `openml.org `_ is a public repository for machine learning data and experiments, that allows everybody to upload open datasets. -The ``sklearn.datasets`` package is able to directly download data -sets from the repository using the function +The ``sklearn.datasets`` package is able to directly download datasets +from the repository using the function :func:`sklearn.datasets.fetch_openml`. For example, to download a dataset of gene expressions in mice brains:: @@ -64,7 +64,11 @@ and ``details`` attributes:: The ``DESCR`` contains a free-text description of the data, while ``details`` contains a dictionary of meta-data stored by openml, like the dataset id. The id of the mice protein dataset is 4550, and you can use this (or the name) -to get more information on the dataset on the openml website: https://www.openml.org/d/4550. +to get more information on the dataset on the openml website:: + + >>> print(mice.url) + + https://www.openml.org/d/4550 The id is also the best way to specify how to fetch a dataset from OpenML:: diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 9503ea9d29793..1ee4d12fb8836 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -38,6 +38,21 @@ def _get_data_info_by_name(name, version): except HTTPError as error: if error.code == 412: data_found = False + else: + raise error + + if not data_found and version != "active": + # might have been deactivated. will warn later + data_found = True + try: + json_string = urlopen(_SEARCH_NAME.format(name) + + "/data_version/{}/status/deactivated".format( + version)) + except HTTPError as error: + if error.code == 412: + data_found = False + else: + raise error if not data_found: # not in except for nicer traceback @@ -162,12 +177,12 @@ def mem(func): # TODO: stacking the content of the structured array # this results in a copy. If the data was homogeneous # and target at start or end, we could use a view instead. 
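+    # (note that list.remove() mutates in place and returns None, which is
+    # why the column list is built first and the target removed separately)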
From b3b927637c5cf7d37f454b5bc4eb536976aa00fd Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 15 Nov 2017 15:43:07 -0500
Subject: [PATCH 20/28] update mice version 4 dataset id

---
 doc/datasets/openml.rst | 15 ++++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst
index 3738b720bc5c0..60fc090cdecfc 100644
--- a/doc/datasets/openml.rst
+++ b/doc/datasets/openml.rst
@@ -24,7 +24,7 @@ from the repository using the function
 For example, to download a dataset of gene expressions in mice brains::
 
     >>> from sklearn.datasets import fetch_openml
-    >>> mice = fetch_openml('miceprotein', version=2, data_home=custom_data_home)
+    >>> mice = fetch_openml('miceprotein', version=4, data_home=custom_data_home)
 
 To fully specify a dataset, you need to provide a name and a version, though
 the version is optional, see :ref:`openml_versions`_ below.
@@ -63,16 +63,15 @@ and ``details`` attributes::
 
 The ``DESCR`` contains a free-text description of the data, while ``details``
 contains a dictionary of meta-data stored by openml, like the dataset id.
-The id of the mice protein dataset is 4550, and you can use this (or the name)
+The id of the mice protein dataset is 40966, and you can use this (or the name)
 to get more information on the dataset on the openml website::
 
     >>> print(mice.url)
-
-    https://www.openml.org/d/4550
+    https://www.openml.org/d/40966
 
 The id is also the best way to specify how to fetch a dataset from OpenML::
 
-    >>> mice = fetch_openml(4550, data_home=custom_data_home)
+    >>> mice = fetch_openml(40966, data_home=custom_data_home)
     >>> mice.details  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP
-    {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
+    {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF',
     'creator': ...,
@@ -96,9 +95,9 @@ entirely different datasets.
 If a particular version of a dataset has been found to contain significant
 issues, it might be inactivated. Using a name to specify a dataset will yield
 the earliest version of a dataset that is still active. That means that
-``fetch_openml("miceprotein")`` can yield different results at differnt times
+``fetch_openml("miceprotein")`` can yield different results at different times
 if earlier versions become inactive.
 
-You can see that the dataset with id 4550 that we fetched above is the version 1
+You can see that the dataset with id 40966 that we fetched above is version 4
 of the "miceprotein" dataset::
 
     >>> mice.details['version']  #doctest: +SKIP

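After this change, name plus version and the bare id point at the same dataset,
so both spellings of the fetch should agree on the metadata. A doctest-style
sanity check (network access required, ids as in the docs above)::

    >>> by_name = fetch_openml('miceprotein', version=4)
    >>> by_id = fetch_openml(40966)
    >>> by_name.details['id'] == by_id.details['id'] == '40966'
    True
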
From 7e91c7140b4d1542a5f0eb946cc32778dd65b3a3 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 15 Nov 2017 15:48:11 -0500
Subject: [PATCH 21/28] add whatsnew entry

---
 doc/whats_new/v0.20.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index 58506cf8aa99b..5de8cf36e8066 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -51,6 +51,14 @@ Model evaluation
   ``'balanced_accuracy'`` scorer for binary classification.
   :issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia `.
 
+Datasets
+........
+
+- Added :func:`sklearn.datasets.fetch_openml` to fetch any dataset from
+  `OpenML <https://openml.org>`_. OpenML is a free, open data sharing platform
+  and will replace mldata, which is no longer maintained. :issue:`9908` by
+  `Andreas Müller`_.
+
 Enhancements
 ............

From 11909d54fbba7d873154ea301c48dcb9941c0a5c Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 15 Nov 2017 16:29:51 -0500
Subject: [PATCH 22/28] add unicode and normalize whitespace flags to pytest
 config

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index 02b3015e87f2e..3b82e8eaf996c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -28,6 +28,7 @@ ignore-files=^setup\.py$
 addopts =
     --doctest-modules
     --disable-pytest-warnings
+doctest_optionflags = NORMALIZE_WHITESPACE ALLOW_UNICODE
 
 [wheelhouse_uploader]
 artifact_indexes=

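The two flags apply to every doctest pytest collects. ``NORMALIZE_WHITESPACE``
lets any run of whitespace match any other run, which is what allows the
wrapped ``mice.details`` output above to pass, and ``ALLOW_UNICODE`` (a
pytest-provided flag) ignores the ``u`` prefix Python 2 puts in front of
unicode reprs, so one expected output serves both interpreters. For example,
this doctest only passes because of ``NORMALIZE_WHITESPACE``::

    >>> print("a    b")  # doctest: +NORMALIZE_WHITESPACE
    a b
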
+""" + +from sklearn.datasets import fetch_openml +from sklearn.utils.testing import check_skip_travis, assert_warns, assert_raises + + +def test_fetch_openml(): + # check_skip_travis() + # fetch with version + iris_1 = fetch_openml("iris", version=1) + assert iris_1.details['id'] == '61' + # fetch without version + iris_1 = fetch_openml("iris") + assert iris_1.details['id'] == '61' + # fetch with dataset id + iris_by_id = fetch_openml(61) + assert iris_by_id.details['name'] == "iris" + assert iris_by_id.data.shape == (150, 4) + assert iris_by_id.target.shape == (150,) + # fetch inactive dataset by id + glas2 = assert_warns(UserWarning, fetch_openml, 40675) + # fetch inactive dataset by name and version + assert glas2.data.shape == (163, 9) + glas2_by_version = assert_warns(UserWarning, fetch_openml, 'glass2', 1) + # there is no active version of glass2 + assert glas2_by_version.details['id'] == '40675' + assert_raises(ValueError, fetch_openml, 'glass2') From 8dcb26bb16a0af4eb4685ba8803c5324d6601747 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 22 Nov 2017 12:28:57 -0500 Subject: [PATCH 24/28] test error messages --- sklearn/datasets/openml.py | 2 +- sklearn/datasets/tests/test_openml.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 1ee4d12fb8836..473c03475fe4f 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -168,7 +168,7 @@ def mem(func): if data_description['status'] != "active": warn("Version {} of dataset {} is inactive, meaning that issues have" " been found in the dataset. Try using a newer version.".format( - data_description['name'], data_description['version'])) + data_description['version'], data_description['name'])) if target_column == "default-target": target_column = data_description.get('default_target_attribute', None) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index b8f912c4c91e4..c7b66b6045783 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -4,7 +4,8 @@ """ from sklearn.datasets import fetch_openml -from sklearn.utils.testing import check_skip_travis, assert_warns, assert_raises +from sklearn.utils.testing import (check_skip_travis, assert_warns_message, + assert_raise_message) def test_fetch_openml(): @@ -21,10 +22,15 @@ def test_fetch_openml(): assert iris_by_id.data.shape == (150, 4) assert iris_by_id.target.shape == (150,) # fetch inactive dataset by id - glas2 = assert_warns(UserWarning, fetch_openml, 40675) + glas2 = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + 40675) # fetch inactive dataset by name and version assert glas2.data.shape == (163, 9) - glas2_by_version = assert_warns(UserWarning, fetch_openml, 'glass2', 1) + glas2_by_version = assert_warns_message( + UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, + "glass2", 1) # there is no active version of glass2 assert glas2_by_version.details['id'] == '40675' - assert_raises(ValueError, fetch_openml, 'glass2') + assert_raise_message(ValueError, "No active dataset glass2 found", + fetch_openml, 'glass2') From 0d562b62df3402c47ae864a4e04222cb105d5bdf Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 22 Nov 2017 12:35:31 -0500 Subject: [PATCH 25/28] fix command for make test-coverage --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 
From 0d562b62df3402c47ae864a4e04222cb105d5bdf Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 22 Nov 2017 12:35:31 -0500
Subject: [PATCH 25/28] fix command for make test-coverage

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 6725a7441f75a..6f2115820308c 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ endif
 
 test-coverage:
 	rm -rf coverage .coverage
-	$(PYTEST) sklearn --show-locals -v --with-cov sklearn
+	$(PYTEST) sklearn --showlocals -v --cov=sklearn
 
 test: test-code test-sphinxext test-doc

From e274ad3acfd4272943d460f925ef50fdda115fe8 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 22 Nov 2017 12:48:12 -0500
Subject: [PATCH 26/28] make flake8 green

---
 sklearn/datasets/tests/test_openml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
index c7b66b6045783..671c53a93349c 100644
--- a/sklearn/datasets/tests/test_openml.py
+++ b/sklearn/datasets/tests/test_openml.py
@@ -4,7 +4,7 @@
 """
 
 from sklearn.datasets import fetch_openml
-from sklearn.utils.testing import (check_skip_travis, assert_warns_message,
+from sklearn.utils.testing import (assert_warns_message,
                                    assert_raise_message)
 
 

From eb39a01fdbd316633a9e9c972c779e6d617cbaef Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 22 Nov 2017 15:06:44 -0500
Subject: [PATCH 27/28] py35 compatibility

---
 sklearn/datasets/openml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index 473c03475fe4f..e610f27256a1f 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -61,7 +61,7 @@ def _get_data_info_by_name(name, version):
         raise ValueError("Dataset {} with version {}"
                          " not found.".format(name, version))
 
-    json_data = json.load(json_string)
+    json_data = json.loads(json_string.read().decode("utf-8"))
     return json_data['data']['dataset'][0]
 
 
@@ -76,7 +76,7 @@
         # not in except for nicer traceback
         raise ValueError("Dataset with id {} "
                          "not found.".format(data_id))
-    json_data = json.load(json_string)
+    json_data = json.loads(json_string.read().decode("utf-8"))
     return json_data['data_set_description']

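The ``.read().decode("utf-8")`` step is what makes this run on Python 3.5:
``json.loads`` only accepts ``bytes`` from Python 3.6 onwards, and ``json.load``
on the binary HTTP response hits the same limitation. A minimal illustration::

    import json

    payload = b'{"data_set_description": {"id": "61"}}'
    # decoding to text first works on 2.7 and on every 3.x
    print(json.loads(payload.decode("utf-8"))['data_set_description']['id'])
    # json.loads(payload) would raise TypeError on Python 3.5
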
From 67825e85c3be2fb92da49a697ca4ae89b461737e Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Thu, 21 Dec 2017 17:45:41 -0500
Subject: [PATCH 28/28] trying to use CSV interface

---
 sklearn/datasets/openml.py | 46 +++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index e610f27256a1f..002935518c378 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -24,6 +24,7 @@
 
 _SEARCH_NAME = "https://openml.org/api/v1/json/data/list/data_name/{}/limit/1"
 _DATA_INFO = "https://openml.org/api/v1/json/data/{}"
+_DATA_FEATURES = "https://openml.org/api/v1/json/data/features/{}"
 
 
 def _get_data_info_by_name(name, version):
@@ -80,6 +81,23 @@ def _get_data_info_by_name(name, version):
     return json_data['data_set_description']
 
 
+def _get_data_features(data_id):
+    data_found = True
+    try:
+        json_string = urlopen(_DATA_FEATURES.format(data_id))
+    except HTTPError as error:
+        if error.code == 412:
+            data_found = False
+        else:
+            raise error
+    if not data_found:
+        # not in except for nicer traceback
+        raise ValueError("Dataset with id {} "
+                         "not found.".format(data_id))
+    json_data = json.loads(json_string.read().decode("utf-8"))
+    return json_data['data_features']['feature']
+
+
 def _download_data(url):
     response = urlopen(url)
     if sys.version_info[0] == 2:
@@ -92,6 +110,14 @@ def _download_data(url):
     return arff
 
 
+def _download_data_csv(file_id):
+    response = urlopen("https://openml.org/data/v1/get_csv/{}".format(file_id))
+    data = np.genfromtxt(response, names=True, dtype=None, delimiter=',',
+                         missing_values='?')
+    response.close()
+    return data
+
+
 def fetch_openml(name_or_id=None, version='active', data_home=None,
                  target_column='default-target', memory=True):
     """Fetch dataset from openml by name or dataset id.
@@ -142,7 +168,8 @@ def mem(func):
         return func
     _get_data_info_by_name_ = mem(_get_data_info_by_name)
     _get_data_description_by_id_ = mem(_get_data_description_by_id)
-    _download_data_ = mem(_download_data)
+    _get_data_features_ = mem(_get_data_features)
+    _download_data_csv_ = mem(_download_data_csv)
 
     if not exists(data_home):
         os.makedirs(data_home)
@@ -173,17 +200,24 @@ def mem(func):
         target_column = data_description.get('default_target_attribute', None)
 
     # download actual data
-    data, meta = _download_data_(data_description['url'])
+    features = _get_data_features_(data_id)
     # TODO: stacking the content of the structured array
     # this results in a copy. If the data was homogeneous
     # and target at start or end, we could use a view instead.
-    data_columns = meta.names()
+    data_columns = []
+    for feature in features:
+        if (feature['name'] != target_column and feature['is_ignore'] ==
+                'false' and feature['is_row_identifier'] == 'false'):
+            data_columns.append(feature['name'])
+
+    data = _download_data_csv_(data_description['file_id'])
     if target_column is not None:
         y = data[target_column]
     else:
         y = None
-    if all([x == "numeric" for x in meta.types()]):
+
+    if all([feature['data_type'] == "numeric" for feature in features
+            if feature['name'] in data_columns]):
         dtype = None
     else:
         dtype = object
@@ -194,7 +228,7 @@ def mem(func):
 
     bunch = Bunch(
         data=X, target=y, feature_names=data_columns,
-        DESCR=description, details=data_description, meta=meta,
+        DESCR=description, details=data_description, features=features,
         url="https://www.openml.org/d/{}".format(data_id))
 
     return bunch
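With the CSV endpoint the loader sidesteps ``scipy.io.arff`` entirely: the
``/data/features/`` listing decides which columns go into ``data`` and which
one becomes ``target``, and ``genfromtxt`` parses the download. Roughly what a
caller now sees (network access required, shapes as asserted in the tests
above)::

    from sklearn.datasets import fetch_openml

    iris = fetch_openml('iris')   # the name resolves to dataset id 61
    print(iris.data.shape)        # (150, 4)
    print(iris.feature_names)     # columns minus target/ignored/row-id ones
    print(iris.url)               # https://www.openml.org/d/61

One caveat of the ``genfromtxt`` route: with ``names=True`` numpy sanitizes the
column names it reads (characters outside letters, digits and underscores are
dropped by default), so ``data[target_column]`` can fail for OpenML feature
names that contain such characters.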