
Commit 958b647

ENH: adds the ability to load OpenML datasets containing string attributes by providing an option to ignore those attributes.

Right now, an error is raised when a dataset containing string attributes (e.g., the Titanic dataset) is fetched from OpenML. This commit allows users to specify whether they are okay with loading only a subset of the data. Closes #11819.
1 parent b8d1226 commit 958b647
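
For orientation, here is a minimal usage sketch of the option this commit introduces (a hypothetical session, assuming the code in this commit; data_id 40945 is the Titanic dataset used in the tests below):

    from sklearn.datasets import fetch_openml

    # Before this change, fetching a dataset with STRING attributes raised
    # ValueError('STRING attributes are not yet supported'). With
    # ignore_strings=True the string columns are dropped with a warning and
    # only the remaining columns are returned.
    titanic = fetch_openml(data_id=40945, ignore_strings=True)
    print(titanic.data.shape)  # feature matrix without the string columns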

8 files changed, +95 −19 lines changed

sklearn/datasets/openml.py

Lines changed: 25 additions & 5 deletions
@@ -235,7 +235,7 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
     y : np.array
     """
     if isinstance(arff_data, list):
-        data = np.array(arff_data, dtype=np.float64)
+        data = np.array(arff_data)
         X = np.array(data[:, col_slice_x], dtype=np.float64)
         y = np.array(data[:, col_slice_y], dtype=np.float64)
         return X, y
@@ -278,7 +278,7 @@ def _get_data_info_by_name(name, version, data_home):
     Returns
     -------
     first_dataset : json
-        json representation of the first dataset object that adhired to the
+        json representation of the first dataset object that adhered to the
         search criteria

     """
@@ -399,7 +399,8 @@ def _valid_data_column_names(features_list, target_columns):


 def fetch_openml(name=None, version='active', data_id=None, data_home=None,
-                 target_column='default-target', cache=True, return_X_y=False):
+                 ignore_strings=False, target_column='default-target',
+                 cache=True, return_X_y=False):
     """Fetch dataset from openml by name or dataset id.

     Datasets are uniquely identified by either an integer ID or by a
@@ -438,6 +439,9 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
         Specify another download and cache folder for the data sets. By default
         all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

+    ignore_strings : boolean, default=False
+        Whether to ignore string attributes when loading a dataset.
+
     target_column : string, list or None, default 'default-target'
         Specify the column name in the data to use as target. If
         'default-target', the standard target column a stored on the server
@@ -536,11 +540,27 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
     # download data features, meta-info about column types
     features_list = _get_data_features(data_id, data_home)

+    if ignore_strings:
+        string_features = list(filter(lambda f: f['data_type'] == 'string',
+                                      features_list))
+        if string_features:
+            string_feature_names = list(map(lambda f: f['name'],
+                                            string_features))
+            warn("Found STRING attributes, which are not yet supported. "
+                 "Therefore, the following column(s) will not be returned: {}"
+                 .format(",".join(string_feature_names)))
+            features_list = list(filter(lambda f: f['name'] not
+                                        in string_feature_names,
+                                        features_list))
+
     for feature in features_list:
         if 'true' in (feature['is_ignore'], feature['is_row_identifier']):
             continue
-        if feature['data_type'] == 'string':
-            raise ValueError('STRING attributes are not yet supported')
+        if feature['data_type'] == 'string' and not ignore_strings:
+            raise ValueError('STRING attributes are not yet supported. '
+                             'If you would like to return the data '
+                             'without STRING attributes, try using '
+                             'ignore_strings=True')

     if target_column == "default-target":
         # determines the default target based on the data feature results
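
To make the new filtering step above easier to follow, here is an illustrative sketch of the same logic on a toy features_list (the dicts mirror the OpenML feature metadata used in openml.py; the entries themselves are made up for illustration):

    # toy stand-in for the metadata returned by _get_data_features
    features_list = [
        {'name': 'age', 'data_type': 'numeric'},
        {'name': 'name', 'data_type': 'string'},
        {'name': 'sex', 'data_type': 'nominal'},
    ]
    string_feature_names = [f['name'] for f in features_list
                            if f['data_type'] == 'string']
    features_list = [f for f in features_list
                     if f['name'] not in string_feature_names]
    # features_list now keeps only 'age' and 'sex'; the string column 'name'
    # is reported in the warning and excluded from the returned data.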
4 binary files not shown.

sklearn/datasets/tests/test_openml.py

Lines changed: 70 additions & 14 deletions
@@ -65,7 +65,7 @@ def decode_column(data_bunch, col_idx):


 def _fetch_dataset_from_openml(data_id, data_name, data_version,
-                               target_column,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                expected_data_dtype, expected_target_dtype,
@@ -75,17 +75,18 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
     # result. Note that this function can be mocked (by invoking
     # _monkey_patch_webbased_functions before invoking this function)
     data_by_name_id = fetch_openml(name=data_name, version=data_version,
-                                   cache=False)
+                                   ignore_strings=ignore_strings, cache=False)
     assert int(data_by_name_id.details['id']) == data_id

     # Please note that cache=False is crucial, as the monkey patched files are
     # not consistent with reality
-    fetch_openml(name=data_name, cache=False)
+    fetch_openml(name=data_name, ignore_strings=ignore_strings, cache=False)
     # without specifying the version, there is no guarantee that the data id
     # will be the same

     # fetch with dataset id
     data_by_id = fetch_openml(data_id=data_id, cache=False,
+                              ignore_strings=ignore_strings,
                               target_column=target_column)
     assert data_by_id.details['name'] == data_name
     assert data_by_id.data.shape == (expected_observations, expected_features)
@@ -111,7 +112,9 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,

     if compare_default_target:
         # check whether the data by id and data by id target are equal
-        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
+        data_by_id_default = fetch_openml(data_id=data_id,
+                                          ignore_strings=ignore_strings,
+                                          cache=False)
         if data_by_id.data.dtype == np.float64:
             np.testing.assert_allclose(data_by_id.data,
                                        data_by_id_default.data)
@@ -132,8 +135,9 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
                                          expected_missing)

     # test return_X_y option
-    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
-                         target_column=target_column)
+    fetch_func = partial(fetch_openml, data_id=data_id,
+                         ignore_strings=ignore_strings,
+                         cache=False, target_column=target_column)
     check_return_X_y(data_by_id, fetch_func)
     return data_by_id

@@ -260,6 +264,7 @@ def test_fetch_openml_iris(monkeypatch, gzip_response):
     data_id = 61
     data_name = 'iris'
     data_version = 1
+    ignore_strings = False
     target_column = 'class'
     expected_observations = 150
     expected_features = 4
@@ -274,6 +279,7 @@ def test_fetch_openml_iris(monkeypatch, gzip_response):
         _fetch_dataset_from_openml,
         **{'data_id': data_id, 'data_name': data_name,
            'data_version': data_version,
+           'ignore_strings': ignore_strings,
            'target_column': target_column,
            'expected_observations': expected_observations,
            'expected_features': expected_features,
@@ -297,13 +303,15 @@ def test_fetch_openml_iris_multitarget(monkeypatch, gzip_response):
     data_id = 61
     data_name = 'iris'
     data_version = 1
+    ignore_strings = False
     target_column = ['sepallength', 'sepalwidth']
     expected_observations = 150
     expected_features = 3
     expected_missing = 0

     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                object, np.float64, expect_sparse=False,
@@ -316,13 +324,15 @@ def test_fetch_openml_anneal(monkeypatch, gzip_response):
     data_id = 2
     data_name = 'anneal'
     data_version = 1
+    ignore_strings = False
     target_column = 'class'
     # Not all original instances included for space reasons
     expected_observations = 11
     expected_features = 38
     expected_missing = 267
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                object, object, expect_sparse=False,
@@ -341,13 +351,15 @@ def test_fetch_openml_anneal_multitarget(monkeypatch, gzip_response):
     data_id = 2
     data_name = 'anneal'
     data_version = 1
+    ignore_strings = False
     target_column = ['class', 'product-type', 'shape']
     # Not all original instances included for space reasons
     expected_observations = 11
     expected_features = 36
     expected_missing = 267
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                object, object, expect_sparse=False,
@@ -360,12 +372,14 @@ def test_fetch_openml_cpu(monkeypatch, gzip_response):
     data_id = 561
     data_name = 'cpu'
     data_version = 1
+    ignore_strings = False
     target_column = 'class'
     expected_observations = 209
     expected_features = 7
     expected_missing = 0
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                object, np.float64, expect_sparse=False,
@@ -387,6 +401,7 @@ def test_fetch_openml_australian(monkeypatch, gzip_response):
     data_id = 292
     data_name = 'Australian'
     data_version = 1
+    ignore_strings = False
     target_column = 'Y'
     # Not all original instances included for space reasons
     expected_observations = 85
@@ -399,6 +414,7 @@ def test_fetch_openml_australian(monkeypatch, gzip_response):
         _fetch_dataset_from_openml,
         **{'data_id': data_id, 'data_name': data_name,
            'data_version': data_version,
+           'ignore_strings': ignore_strings,
            'target_column': target_column,
            'expected_observations': expected_observations,
            'expected_features': expected_features,
@@ -416,13 +432,15 @@ def test_fetch_openml_adultcensus(monkeypatch, gzip_response):
     data_id = 1119
     data_name = 'adult-census'
     data_version = 1
+    ignore_strings = False
     target_column = 'class'
     # Not all original instances included for space reasons
     expected_observations = 10
     expected_features = 14
     expected_missing = 0
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                np.float64, object, expect_sparse=False,
@@ -438,13 +456,15 @@ def test_fetch_openml_miceprotein(monkeypatch, gzip_response):
     data_id = 40966
     data_name = 'MiceProtein'
     data_version = 4
+    ignore_strings = False
     target_column = 'class'
     # Not all original instances included for space reasons
     expected_observations = 7
     expected_features = 77
     expected_missing = 7
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                np.float64, object, expect_sparse=False,
@@ -457,14 +477,16 @@ def test_fetch_openml_emotions(monkeypatch, gzip_response):
     data_id = 40589
     data_name = 'emotions'
     data_version = 3
+    ignore_strings = False
     target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm',
                      'quiet.still', 'sad.lonely', 'angry.aggresive']
     expected_observations = 13
     expected_features = 72
     expected_missing = 0
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                np.float64, object, expect_sparse=False,
@@ -477,6 +499,27 @@ def test_decode_emotions(monkeypatch):
     _test_features_list(data_id)


+@pytest.mark.parametrize('gzip_response', [True, False])
+def test_fetch_titanic(monkeypatch, gzip_response):
+    # check because of the string attributes
+    data_id = 40945
+    data_name = 'Titanic'
+    data_version = 1
+    ignore_strings = True
+    target_column = 'survived'
+    # Not all original features included because five are strings
+    expected_observations = 1309
+    expected_features = 8
+    expected_missing = 1454
+    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
+                               expected_observations, expected_features,
+                               expected_missing,
+                               np.float64, object, expect_sparse=False,
+                               compare_default_target=True)
+
+
 @pytest.mark.parametrize('gzip_response', [True, False])
 def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
     data_id = 61
@@ -659,14 +702,27 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response):
                          cache=False)


+@pytest.mark.parametrize('gzip_response', [True, False])
+def test_ignore_strings(monkeypatch, gzip_response):
+    data_id = 40945
+    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
+    assert_warns_message(
+        UserWarning,
+        "Found STRING attributes, which are not yet supported. "
+        "Therefore, the following column(s) will not be returned:",
+        fetch_openml, data_id=data_id, ignore_strings=True, cache=False
+    )
+
+
 @pytest.mark.parametrize('gzip_response', [True, False])
 def test_string_attribute(monkeypatch, gzip_response):
     data_id = 40945
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
     # single column test
     assert_raise_message(ValueError,
                          'STRING attributes are not yet supported',
-                         fetch_openml, data_id=data_id, cache=False)
+                         fetch_openml, data_id=data_id, ignore_strings=False,
+                         cache=False)


 @pytest.mark.parametrize('gzip_response', [True, False])
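
Outside the mocked test harness, the behaviour exercised by the new test_ignore_strings can be reproduced roughly as follows (a sketch only; it performs a live download from OpenML instead of using the monkey-patched responses):

    import warnings
    from sklearn.datasets import fetch_openml

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        fetch_openml(data_id=40945, ignore_strings=True, cache=False)
    # the dropped string columns are listed in the emitted UserWarning
    assert any("will not be returned" in str(w.message) for w in caught)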
