From f4754a937548518d63f6fa3320f94ce951a078af Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 17 May 2019 15:21:27 -0400 Subject: [PATCH 01/39] TST Adds tests --- sklearn/datasets/openml.py | 33 +++-- sklearn/datasets/tests/test_openml.py | 197 +++++++++++++++++++++++++- 2 files changed, 218 insertions(+), 12 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 6f76ee15e2e40..379fc2f5e93e5 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -436,7 +436,8 @@ def _valid_data_column_names(features_list, target_columns): def fetch_openml(name=None, version='active', data_id=None, data_home=None, - target_column='default-target', cache=True, return_X_y=False): + target_column='default-target', cache=True, return_X_y=False, + return_frame=False): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -489,24 +490,33 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, If True, returns ``(data, target)`` instead of a Bunch object. See below for more information about the `data` and `target` objects. + return_frame : boolean, default=False + If True, returns a Bunch where the data attribute is a pandas + DataFrame. + Returns ------- data : Bunch Dictionary-like object, with attributes: - data : np.array or scipy.sparse.csr_matrix of floats + data : np.array, scipy.sparse.csr_matrix of floats, or pandas Dataframe The feature matrix. Categorical features are encoded as ordinals. - target : np.array + If ``return_frame`` is True, this is a pandas DataFrame. + target : np.array or None The regression target or classification labels, if applicable. Dtype is float if numeric, and object if categorical. + If ``return_frame`` is True, this is None. 
DESCR : str The full description of the dataset feature_names : list The names of the dataset columns - categories : dict + target_names : list + The names of the target columns + categories : dict or None Maps each categorical feature name to a list of values, such - that the value encoded as i is ith in the list. + that the value encoded as i is ith in the list. If ``return_frame`` + is True, this is None. details : dict More metadata from OpenML @@ -571,11 +581,14 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, # download data features, meta-info about column types features_list = _get_data_features(data_id, data_home) - for feature in features_list: - if 'true' in (feature['is_ignore'], feature['is_row_identifier']): - continue - if feature['data_type'] == 'string': - raise ValueError('STRING attributes are not yet supported') + if not return_frame: + for feature in features_list: + if 'true' in (feature['is_ignore'], feature['is_row_identifier']): + continue + if feature['data_type'] == 'string': + raise ValueError('STRING attributes are not supported for ' + 'arrays as a return value. 
Try ' + 'return_frame=True') if target_column == "default-target": # determines the default target based on the data feature results diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 9c8200731aa6d..a3b7be2604250 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -255,6 +255,198 @@ def _mock_urlopen(request): context.setattr(sklearn.datasets.openml, 'urlopen', _mock_urlopen) +def test_fetch_openml_iris_pandas(monkeypatch): + # classification dataset with numeric only columns + pd = pytest.importorskip('pandas') + data_id = 61 + expected_shape = (150, 5) + + cat_dtype = pd.CategoricalDtype(['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica']) + expected_dtypes = [np.float64] * 4 + [cat_dtype] + expected_feature_names = ['sepallength', 'sepalwidth', 'petallength', + 'petalwidth'] + expected_target_names = ['class'] + expected_columns = expected_feature_names + expected_target_names + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + + bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) + df = bunch.data + + assert np.all(df.dtypes == expected_dtypes) + assert df.shape == expected_shape + assert np.all(df.columns == expected_columns) + assert np.all(bunch.feature_names == expected_feature_names) + assert np.all(bunch.target_names == expected_target_names) + + +def test_fetch_openml_anneal_pandas(monkeypatch): + # classification dataset with numeric and categorical columns + pd = pytest.importorskip('pandas') + data_id = 2 + target_column = 'class' + expected_shape = (11, 39) + expected_categories = 33 + expected_floats = 6 + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + + bunch = fetch_openml(data_id=data_id, return_frame=True, + target_column=target_column, cache=False) + + df = bunch.data + assert df.shape == expected_shape + + n_categories = len([dtype for dtype in df.dtypes + if isinstance(dtype, pd.CategoricalDtype)]) 
+ n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) + assert expected_categories == n_categories + assert expected_floats == n_floats + assert np.all(bunch.target_names == target_column) + + +def test_fetch_openml_cpu_pandas(monkeypatch): + # regression dataset with numeric and categorical columns + pd = pytest.importorskip('pandas') + data_id = 561 + expected_shape = (209, 8) + + cat_dtype = pd.CategoricalDtype(['adviser', 'amdahl', 'apollo', 'basf', + 'bti', 'burroughs', 'c.r.d', 'cdc', + 'cambex', 'dec', 'dg', 'formation', + 'four-phase', 'gould', 'hp', 'harris', + 'honeywell', 'ibm', 'ipl', 'magnuson', + 'microdata', 'nas', 'ncr', 'nixdorf', + 'perkin-elmer', 'prime', 'siemens', + 'sperry', 'sratus', 'wang']) + expected_dtypes = [cat_dtype] + [np.float64] * 7 + expected_feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', + 'CHMAX'] + expected_target_names = ['class'] + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) + df = bunch.data + + assert df.shape == expected_shape + assert np.all(df.dtypes == expected_dtypes) + assert np.all(df.columns == expected_feature_names + expected_target_names) + assert np.all(bunch.feature_names == expected_feature_names) + assert np.all(bunch.target_names == expected_target_names) + + +def test_fetch_openml_australian_pandas_error_sparse(monkeypatch): + data_id = 292 + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + + msg = ('Cannot return dataframe with sparse data') + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=data_id, return_frame=True, cache=False) + + +def test_fetch_openml_adultcensus_pandas(monkeypatch): + pd = pytest.importorskip('pandas') + # Check because of the numeric row attribute (issue #12329) + data_id = 1119 + expected_shape = (10, 14) + expected_categories = 9 + expected_floats = 7 + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + 
bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) + + df = bunch.data + assert df.shape == expected_shape + + n_categories = len([dtype for dtype in df.dtypes + if isinstance(dtype, pd.CategoricalDtype)]) + n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) + assert expected_categories == n_categories + assert expected_floats == n_floats + + +def test_fetch_openml_miceprotein_pandas(monkeypatch): + # JvR: very important check, as this dataset defined several row ids + # and ignore attributes. Note that data_features json has 82 attributes, + # and row id (1), ignore attributes (3) have been removed. + pd = pytest.importorskip('pandas') + data_id = 40966 + expected_shape = (7, 78) + expected_floats = 77 + expected_categories = 5 + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) + + df = bunch.data + assert df.shape == expected_shape + + n_categories = len([dtype for dtype in df.dtypes + if isinstance(dtype, pd.CategoricalDtype)]) + n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) + assert expected_categories == n_categories + assert expected_floats == n_floats + + +def test_fetch_openml_emotions_pandas(monkeypatch): + # classification dataset with multiple targets (natively) + pd = pytest.importorskip('pandas') + + data_id = 40589 + target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm', + 'quiet.still', 'sad.lonely', 'angry.aggresive'] + expected_shape = (13, 78) + expected_categories = 6 + expected_floats = 72 + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False, + target_column=target_column) + + df = bunch.data + assert df.shape == expected_shape + + n_categories = len([dtype for dtype in df.dtypes + if isinstance(dtype, pd.CategoricalDtype)]) + n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) + assert 
expected_categories == n_categories + assert expected_floats == n_floats + assert np.all(bunch.target_column == target_column) + + +def test_fetch_openml_titanic_pandas(monkeypatch): + # dataset with strings + pd = pytest.importorskip('pandas') + + data_id = 40945 + expected_shape = (1309, 14) + expected_dtypes = [np.float64, pd.CategoricalDtype(['0', '1']), + object, pd.CategoricalDtype(['female', 'male']), + np.float64, np.float64, np.float64, object, + np.float64, object, + pd.CategoricalDtype(['C', 'Q', 'S']), object, + np.float64, object] + expected_columns = ['pclass', 'survived', 'name', 'sex', 'age', + 'sibsp', 'parch', 'ticket', 'fare', 'cabin', + 'embarked', 'boat', 'body', 'home.dest'] + expected_feature_names = ['pclass', 'name', 'sex', 'age', + 'sibsp', 'parch', 'ticket', 'fare', 'cabin', + 'embarked', 'boat', 'body', 'home.dest'] + expected_target_names = ['survived'] + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) + + df = bunch.data + assert df.shape == expected_shape + assert np.all(df.dtypes == expected_dtypes) + assert np.all(df.columns == expected_columns) + assert np.all(bunch.feature_names == expected_feature_names) + assert np.all(bunch.target_names == expected_target_names) + + @pytest.mark.parametrize('gzip_response', [True, False]) def test_fetch_openml_iris(monkeypatch, gzip_response): # classification dataset with numeric only columns @@ -661,12 +853,13 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response): @pytest.mark.parametrize('gzip_response', [True, False]) -def test_string_attribute(monkeypatch, gzip_response): +def test_string_attribute_without_dataframe(monkeypatch, gzip_response): data_id = 40945 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # single column test assert_raise_message(ValueError, - 'STRING attributes are not yet supported', + ('STRING attributes are not supported for arrays as ' + 'a return 
value. Try return_frame=True'), fetch_openml, data_id=data_id, cache=False) From e67182e51019501b20196bdb816e84ff05908614 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 17 May 2019 18:06:22 -0400 Subject: [PATCH 02/39] ENH Adds support for dataframes in open_ml --- .../plot_column_transformer_mixed_types.py | 7 +- sklearn/datasets/openml.py | 85 ++++++++++++++-- sklearn/datasets/tests/test_openml.py | 96 ++++++++++++++----- sklearn/utils/__init__.py | 21 ++++ 4 files changed, 170 insertions(+), 39 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 264ae7495296c..19651cd7cf622 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -24,10 +24,10 @@ # # License: BSD 3 clause -import pandas as pd import numpy as np from sklearn.compose import ColumnTransformer +from sklearn.datasets import fetch_openml from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler, OneHotEncoder @@ -37,9 +37,8 @@ np.random.seed(0) # Read data from Titanic dataset. 
-titanic_url = ('https://raw.githubusercontent.com/amueller/' - 'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv') -data = pd.read_csv(titanic_url) +titantic = fetch_openml(data_id=40945, return_frame=False) +data = titantic.data # We will train our classifier with the following features: # Numeric Features: diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 379fc2f5e93e5..44a0b89c188d4 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -8,6 +8,7 @@ from functools import wraps import itertools from collections.abc import Generator +from collections import OrderedDict from urllib.request import urlopen, Request @@ -18,6 +19,7 @@ from .base import get_data_home from urllib.error import HTTPError from ..utils import Bunch +from ..utils import check_pandas_support # noqa __all__ = ['fetch_openml'] @@ -263,6 +265,58 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): raise ValueError('Unexpected Data Type obtained from arff.') +def _feature_to_dtype(feature): + """Map feature to dtype for pandas DataFrame + """ + if feature["data_type"] == "string": + return object + elif feature["data_type"] == "nominal": + return 'category' + # only numeric, integer, real are left + elif (feature["number_of_missing_values"] != "0" or + feature["data_type"] in ["numeric", "real"]): + return np.float64 + elif feature["data_type"] == "integer": + return np.int64 + raise ValueError("Unsupported feature: {}".format(feature)) + + +def _convert_arff_data_dataframe(arrf_data, all_columns, features_dict): + """Convert the ARFF object into a pandas DataFrame. + + Parameters + ---------- + arff_data : list or dict + as obtained from liac-arff object + + all_columns : list + columns to return + + features_dict : OrderedDict + map from feature to feature info from openml. This includes + columns that are not ignored. 
+ + Returns + ------- + df : pd.DataFrame + """ + check_pandas_support('fetch_openml with return_frame=True') + import pandas as pd + + df = pd.DataFrame(arrf_data['data'], columns=list(features_dict.keys()), + dtype=object) + df = df[all_columns].copy() + + dtypes = {} + for column in all_columns: + dtype = _feature_to_dtype(features_dict[column]) + if dtype == object: + continue + dtypes[column] = dtype + + return df.astype(dtypes) + + def _get_data_info_by_name(name, version, data_home): """ Utilizes the openml dataset listing api to find a dataset by @@ -578,6 +632,13 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, warn("OpenML raised a warning on the dataset. It might be " "unusable. Warning: {}".format(data_description['warning'])) + return_sparse = False + if data_description['format'].lower() == 'sparse_arff': + return_sparse = True + + if return_sparse and return_frame: + raise ValueError('Cannot return dataframe with sparse data') + # download data features, meta-info about column types features_list = _get_data_features(data_id, data_home) @@ -609,7 +670,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, target_column) # prepare which columns and data types should be returned for the X and y - features_dict = {feature['name']: feature for feature in features_list} + features_dict = OrderedDict([(feature['name'], feature) + for feature in features_list]) # XXX: col_slice_y should be all nominal or all numeric _verify_target_data_type(features_dict, target_column) @@ -628,10 +690,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, 'columns. 
'.format(feat['name'], nr_missing)) # determine arff encoding to return - return_sparse = False - if data_description['format'].lower() == 'sparse_arff': - return_sparse = True - if not return_sparse: data_qualities = _get_data_qualities(data_id, data_home) shape = _get_data_shape(data_qualities) @@ -644,7 +702,18 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, # obtain the data arff = _download_data_arff(data_description['file_id'], return_sparse, - data_home) + data_home, encode_nominal=not return_frame) + + description = "{}\n\nDownloaded from openml.org.".format( + data_description.pop('description')) + + if return_frame: + all_columns = data_columns + target_column + df = _convert_arff_data_dataframe(arff, all_columns, features_dict) + return Bunch(data=df, target=None, feature_names=data_columns, + target_names=target_column, DESCR=description, + details=data_description, categories=None, + url="https://www.openml.org/d/{}".format(data_id)) # nominal attributes is a dict mapping from the attribute name to the # possible values. 
Includes also the target column (which will be popped @@ -669,9 +738,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, raise ValueError('Mix of nominal and non-nominal targets is not ' 'currently supported') - description = "{}\n\nDownloaded from openml.org.".format( - data_description.pop('description')) - # reshape y back to 1-D array, if there is only 1 target column; back # to None if there are not target columns if y.shape[1] == 1: @@ -684,6 +750,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, bunch = Bunch( data=X, target=y, feature_names=data_columns, + target_names=target_column, DESCR=description, details=data_description, categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index a3b7be2604250..034d78f144753 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -14,7 +14,8 @@ _get_data_description_by_id, _download_data_arff, _get_local_path, - _retry_with_clean_cache) + _retry_with_clean_cache, + _feature_to_dtype) from sklearn.utils.testing import (assert_warns_message, assert_raise_message) from sklearn.utils import is_scalar_nan @@ -93,10 +94,12 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, if isinstance(target_column, str): # single target, so target is vector assert data_by_id.target.shape == (expected_observations, ) + assert data_by_id.target_names[0] == target_column elif isinstance(target_column, list): # multi target, so target is array assert data_by_id.target.shape == (expected_observations, len(target_column)) + assert np.all(data_by_id.target_names == target_column) assert data_by_id.data.dtype == np.float64 assert data_by_id.target.dtype == expected_target_dtype assert len(data_by_id.feature_names) == expected_features @@ -255,6 +258,31 @@ def _mock_urlopen(request): 
context.setattr(sklearn.datasets.openml, 'urlopen', _mock_urlopen) +@pytest.mark.parametrize('feature, expected_dtype', [ + ({'data_type': 'string', 'number_of_missing_values': '0'}, object), + ({'data_type': 'string', 'number_of_missing_values': '1'}, object), + ({'data_type': 'numeric', 'number_of_missing_values': '0'}, np.float64), + ({'data_type': 'numeric', 'number_of_missing_values': '1'}, np.float64), + ({'data_type': 'real', 'number_of_missing_values': '0'}, np.float64), + ({'data_type': 'real', 'number_of_missing_values': '1'}, np.float64), + ({'data_type': 'integer', 'number_of_missing_values': '0'}, np.int64), + ({'data_type': 'integer', 'number_of_missing_values': '1'}, np.float64), + ({'data_type': 'nominal', 'number_of_missing_values': '0'}, 'category'), + ({'data_type': 'nominal', 'number_of_missing_values': '1'}, 'category'), +]) +def test_feature_to_dtype(feature, expected_dtype): + assert _feature_to_dtype(feature) == expected_dtype + + +@pytest.mark.parametrize('feature', [ + {'data_type': 'datatime', 'number_of_missing_values': '0'} +]) +def test_feature_to_dtype_error(feature): + msg = 'Unsupported feature: {}'.format(feature) + with pytest.raises(ValueError, match=msg): + _feature_to_dtype(feature) + + def test_fetch_openml_iris_pandas(monkeypatch): # classification dataset with numeric only columns pd = pytest.importorskip('pandas') @@ -274,6 +302,7 @@ def test_fetch_openml_iris_pandas(monkeypatch): bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) df = bunch.data + assert isinstance(df, pd.DataFrame) assert np.all(df.dtypes == expected_dtypes) assert df.shape == expected_shape assert np.all(df.columns == expected_columns) @@ -294,16 +323,16 @@ def test_fetch_openml_anneal_pandas(monkeypatch): bunch = fetch_openml(data_id=data_id, return_frame=True, target_column=target_column, cache=False) - df = bunch.data - assert df.shape == expected_shape + assert isinstance(df, pd.DataFrame) + assert df.shape == expected_shape 
n_categories = len([dtype for dtype in df.dtypes if isinstance(dtype, pd.CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats - assert np.all(bunch.target_names == target_column) + assert np.all(bunch.target_names == [target_column]) def test_fetch_openml_cpu_pandas(monkeypatch): @@ -329,6 +358,7 @@ def test_fetch_openml_cpu_pandas(monkeypatch): bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) df = bunch.data + assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape assert np.all(df.dtypes == expected_dtypes) assert np.all(df.columns == expected_feature_names + expected_target_names) @@ -341,7 +371,7 @@ def test_fetch_openml_australian_pandas_error_sparse(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) - msg = ('Cannot return dataframe with sparse data') + msg = 'Cannot return dataframe with sparse data' with pytest.raises(ValueError, match=msg): fetch_openml(data_id=data_id, return_frame=True, cache=False) @@ -350,16 +380,16 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): pd = pytest.importorskip('pandas') # Check because of the numeric row attribute (issue #12329) data_id = 1119 - expected_shape = (10, 14) + expected_shape = (10, 15) expected_categories = 9 - expected_floats = 7 + expected_floats = 6 _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.data - assert df.shape == expected_shape + assert isinstance(df, pd.DataFrame) + assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes if isinstance(dtype, pd.CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) @@ -375,14 +405,15 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): data_id = 40966 expected_shape = (7, 78) expected_floats = 77 - expected_categories = 5 + 
expected_categories = 1 _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) df = bunch.data - assert df.shape == expected_shape + assert isinstance(df, pd.DataFrame) + assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes if isinstance(dtype, pd.CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) @@ -406,14 +437,15 @@ def test_fetch_openml_emotions_pandas(monkeypatch): target_column=target_column) df = bunch.data - assert df.shape == expected_shape + assert isinstance(df, pd.DataFrame) + assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes if isinstance(dtype, pd.CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats - assert np.all(bunch.target_column == target_column) + assert np.all(bunch.target_names == target_column) def test_fetch_openml_titanic_pandas(monkeypatch): @@ -422,24 +454,36 @@ def test_fetch_openml_titanic_pandas(monkeypatch): data_id = 40945 expected_shape = (1309, 14) - expected_dtypes = [np.float64, pd.CategoricalDtype(['0', '1']), - object, pd.CategoricalDtype(['female', 'male']), - np.float64, np.float64, np.float64, object, - np.float64, object, - pd.CategoricalDtype(['C', 'Q', 'S']), object, - np.float64, object] - expected_columns = ['pclass', 'survived', 'name', 'sex', 'age', - 'sibsp', 'parch', 'ticket', 'fare', 'cabin', - 'embarked', 'boat', 'body', 'home.dest'] - expected_feature_names = ['pclass', 'name', 'sex', 'age', - 'sibsp', 'parch', 'ticket', 'fare', 'cabin', - 'embarked', 'boat', 'body', 'home.dest'] + name_to_dtype = { + 'pclass': np.float64, + 'name': object, + 'sex': pd.CategoricalDtype(['female', 'male']), + 'age': np.float64, + 'sibsp': np.float64, + 'parch': np.float64, + 'ticket': object, + 'fare': np.float64, + 'cabin': object, + 'embarked': 
pd.CategoricalDtype(['C', 'Q', 'S']), + 'boat': object, + 'body': np.float64, + 'home.dest': object, + 'survived': pd.CategoricalDtype(['0', '1']) + } + expected_columns = ['pclass', 'name', 'sex', 'age', 'sibsp', + 'parch', 'ticket', 'fare', 'cabin', 'embarked', + 'boat', 'body', 'home.dest', 'survived'] + expected_dtypes = [name_to_dtype[col] for col in expected_columns] + expected_feature_names = ['pclass', 'name', 'sex', 'age', 'sibsp', + 'parch', 'ticket', 'fare', 'cabin', 'embarked', + 'boat', 'body', 'home.dest'] expected_target_names = ['survived'] _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.data + + assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape assert np.all(df.dtypes == expected_dtypes) assert np.all(df.columns == expected_columns) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index ea56498cac7c5..8d9e55f5e6df1 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -824,3 +824,24 @@ def check_matplotlib_support(caller_name): "{} requires matplotlib. You can install matplotlib with " "`pip install matplotlib`".format(caller_name) ) from e + + +def check_pandas_support(caller_name): + """Raise ImportError with detailed error message if pandas is not + installed. + + Utilities like :func:`fetch_openml` should lazily import + pandas and call this helper before any computation. + + Parameters + ---------- + caller_name : str + The name of the caller that requires pandas. + """ + try: + import pandas # noqa + except ImportError as e: + raise ImportError( + "{} requires pandas. 
You can install pandas with " + "`pip install pandas`".format(caller_name) + ) from e From 98bfa765f52b1a1a4415264386d65578ec33e07e Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 17 May 2019 23:56:13 -0400 Subject: [PATCH 03/39] BUG Add datafiles --- .../api-v1-json-data-qualities-40945.json.gz | Bin 0 -> 1042 bytes .../40945/data-v1-download-16826755.arff.gz | Bin 0 -> 32243 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 sklearn/datasets/tests/data/openml/40945/api-v1-json-data-qualities-40945.json.gz create mode 100644 sklearn/datasets/tests/data/openml/40945/data-v1-download-16826755.arff.gz diff --git a/sklearn/datasets/tests/data/openml/40945/api-v1-json-data-qualities-40945.json.gz b/sklearn/datasets/tests/data/openml/40945/api-v1-json-data-qualities-40945.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..279a0bd82ad663e5c09d2fa7ef448472ee82b205 GIT binary patch literal 1042 zcmV+t1nv7DiwFq*Sb1Ck17UD!Ep{<2YIARHEo5PIVJ&fWVQgu1X=QUQG%z_dH7;s% zZ*BmcSKDseMil%PJ}1Pv?A32+oB}~%r>%@xCXFgmya+M` z1BQ2Ich8(NJ9wuS`NbahdYd)Po3gE|zaIynXrmEfPr*rGdx*I#c6r%g zmoKL{L=lsVmV;+!y|?BN_U1!ZU}avm*WK*98FPw^>G6&qd_Ocs^g)Q`byLNO?mU5KL5 z+UQ1S2g9OL>eJvT1QAEJKgAJxQ7UFj&e~wCkc2Vzx+f1=BeAv15`o-x{+r!1VHWHY_=ZP~P0c{^OI^yu9fljp3i2szFxR9F$Y#w(v)pimwdKvx~c|i%34ff#K?0?TuoK~7~D;zS+fuuKi ziwA={-g39f^HP2>d$5Vf(BG%E=bB?*Xsg!c`~OrPZ3aIz;@QwHv;CMlM+4$$`;x4{cqsJs7cumtR%HlLdY!ZZLbb5f79dZEra6*JZ9`W#F>c6`Ngl~)c{Pz6( M3AdqM{SglU0Nlj@%K!iX literal 0 HcmV?d00001 diff --git a/sklearn/datasets/tests/data/openml/40945/data-v1-download-16826755.arff.gz b/sklearn/datasets/tests/data/openml/40945/data-v1-download-16826755.arff.gz new file mode 100644 index 0000000000000000000000000000000000000000..824fd370dd582102afba8a87ac98e40e6ae96cb9 GIT binary patch literal 32243 zcmV(vK&zg=s6;E_| z?EJ5H>i?hhXBFAdW2b@NS1bF6=r{QL#}9NvN_u>H%>QNoELqV53XA(@JN`#o!WQfC zKWdA`BW*_y^FY`ERSR-oRHHvbH}`}+@>?HI;R2uG*WKU-_ca+;cwaAf;|kaHhPq4I 
z=%4%Lk0oizFaD{2dURfvv^sUJi>7g%d08~%p0;b}D=FK$IyyZP<~%wL@>37~ISL{_ z@!+?ynmga2xstGX*15c%n-S5Y)IDj`ivnd@A1OZdI>h0&*) z!A}Yeo}2qg3O^8>9{GNlX8zHsANy_$N~S^JOoK#f@_nxAx~^JANqOqre0F|!rfZ7g zp9;9qsq^V0YWGa-{E8P8?h8j`q^PKQzc(m8`w^}Bz8~sl-4SQXXt7!g>k8eW_a^0j z82^C!M(cN`&dG!}O+oQ}m$fq^RYA+kEhPZzqL?CpBwe2f7Yy`WkH-TYg#YsrXa?JlJJz91gz(Q z;52pfI15jYW~uTfkE;qRlD7S`%$LuEE#VPL>YPm1gq3v*0eeL@a1irN;t*bht4fm`e(SqYn!_7Seb1Md znDIMI$gU}tVE)e?rB!2IhP7lxp6ISEh`7-$*%&UE`rQwNlIpe4%P*oy)bJ4N9^ zAfB8)frk3j`7hS3I`B&a^IF@bfh|8(Gx%`lVCx2wZ#`gXx1Hb>jdXD*?w! zq9dus52EoW7YexHv;ybknk*rrA(ovlq}-6@iaPIVwxFi{{OoPokD`MP*FkEAr@V$# z>ReGU18Te>T{*}ikzvLt1x-org0oG+SgOdmiW|Z(MZROCT2QHDLfAcJIu&D{I{ru@ zlIgLjB2ht;q5L#+vzRAd_&W)Ay7#(4hBli59m zC+7+2=sXCeZ%sseK;pp2M$0tv)hDFekV?ijf0bD;^31SA=x3N&+&oZ6JsUGLm-wya zMgW1IyJEG1C;>lR_F}n(eJMANQEr59>19}oGG~I-Nx0G=NA+e_8qkvfOf=33FACI| z(JRs|Dda>(A)k?L*IHrdy-~GzhjHcKzeCPdk9=MN7BG`l7^tL@`C$Guaf94XA-sL% zKbd-BG!}(2Ur?Q)%qI<%pM*R;rLha8J4KI|lFs=dzZO+hZ0QPwsFXdSi^u0eInHgd zSYw((uZMqY`54kZPf9Ulp*Vo#4e8=&7AimxABrYZL9?g9Ik_X~^bXi>x24t2e1qs` zw63LN{Ryeoxu~BZwzOX#)>fSr>gJx~&@mM>h`~7;#|bo--ro zsF*F-(nfGU~yrJ6~)%i6F5@tyBk{7a?wLY*0jW1z>r_1%LdYtW}cg zQ35!u9$J{!W1s+wQHZ57^eyYZ$H%)ZpfH;f>#7n5<>sFfRu3v^C`Cgs24H4CiQFtn zFn?LDe)MlYx8&10iNbtYSau5nQ3QR&(~vnh zVh@$glAZ&KT;BXvSHIm+qOGS)h6)*3mUJg9_MX-Zu-nP8{0{)E>yA0_cXr^6$G;A7 z1?1@{NR=5S{t@8ns&1U~Y6V%NQK`!l+IGh4Oekv^+35H2dx!913=9tg<2>+<=eYqY z&1#VH)sPAVr;sdRzA1D?QS3&*g&eDu=SDz2;HUf^q;T}`X+zdvqmT1mDt!izn;~u! 
zb^x^dXjb>$3PG4+waQ(c!SSH$qC{Z4T_fDT>C-pkPi{aT;G~cx5Js~VVYGF=)Bs7W zifcmJ1P>1ND;1ze;b!sdGg>_%vYLJxQ-?Oc?ijo$kR2KCz;&uo!jr$1@X!*_Ap-OM zirwN-#y+YAoC}G41BDr|b?cnWHya}I0pw7&G?gZOGJ6qw9kgx7s%9_aILff5MU|%| zd6HmZiO|#LkwTB5S;M;)Z%3fGpf4=e#XT^0ajj~#HvKm?{MYOA%OuPU2Og~(Q&Nbd zbDBWEp+~cQPXkBGc#78uUY5Qo9Eq|uB}*9wg4o_r$YEUBUJ{i2DB;HpGVCMGgXlE` zH^q}tn>|Z(Fohri zb9t$?___hdac6*OAWxi0E+bi1@d?}EOFuK)En3|}xat-KJo!ysKANyO`Zp99x0thv zHmw;0ugRajL4Gq-;G7uuF+KPV3J+X}zj-4GG!Q~eD7^M9&GkWxA$}?RpXg zg=^Q0kWS!eKCbZOSP~T2E>E6VoRqJr5hhkP0?(#xz3C;vBO{?gG9%x4RW% zt>G!azKZmq_Jor%UghBh6mJ_Qi+_Qn@rc5{Oevd4xAhX|i;oIVPvNZFVvLq}*q-yu zYzfvv@u zKX*m7*zLc77bW=eaLjW*GIH8oTX%f?s7MnEuFIZ?EATF;uJ&FDe&Up~?%0h5YL88N zA!8IgY0`t|db4eH6yFg>oLi`di-py0%SBsPf>7|#46+8F)6M&Co-eHo0QMQOs%j^B zrV^9FIA6#P%8fO&OAqVeZMWWo21=}(ux3OHQs|wemX}CDn%Z`w1c`KT;EwUjcjW4s z!E06gNsmD79qHlL(xGisMvRI2A z+U3VyYTev}$4$w$?S1aOxJ)lN^~7s^o&SD=l``gY%A<*HnFUD>xVtEyN%x3l`KxO1 zD*1Bg&kJ+w+zx9;a9FrZ$1e&uc<6ie5L41Zk-_GMZUO*O0Zd4i19y;a6bJD}N2`~FVV91U# ztNLfJYd$<@osm+ep&N?%c?S4Lw@_oVMHd?j1FAm9j_d#y8|0#tb-i3%bWk;nhmHiM zT6&=AUFN9<;F-CJaDa=;EebQmnH^I_bjK6CIk1@#V2?d#R^(7WsxM6Y&YilBAEemj z!Tdf+P0Ttnbu{HRTCj8S334^!=$*X+BA0Lu>XGIc(*)Q+WnpgR^Cha-y8?H~vJRV)#PST!Mf?M5%? 
zLw^M%QIf_2Y%_qArB30oXRcl#CilI>jri(CmZ(IO-~^Y_?HV~GPNH$2h%O0pZpjv0 z7Yc{IW3RKp{tuOalsCF?me0$u9O#Q#|DJFE+zj77fpu-vz9uDcW?efNRz}(DhLA=0 zt(NE>9-Q}3b1<;4fXTSBLsH`M0s!+CVuUz%9o8d++rK&|$8cun8gL^KBbDLlaI7B7 zaRT5*buQoFJ#ZjSQN~VjSAZi->t<8q+C~QmA2pf^`>u*MMucS{PTZWg9M1#q6{YBS zNVt{X$p!-yWxS})=b6>NUj?R}Lwa%TtS=^dQi4mB7WXyBOM|7R=+#0y@4!9A#uZ)D za2#Rf2r-9^L{}!@W7t{$PO7(7*N=EogI3I+h%pl3oE1>4eG8Mp$JqV|eK$#kCzzCO z1ev}={tG!yFdk`WESV-9O@+83Eq076(=E1CeDly z=gdTIBUT4J8*yN6l9Aa^-{p&N@CYo<&7|OAE8|_cvo^0T*>eHzd3?!$2ds=>vrNpZ zA>gJ^&Rp!R8S#iaa|`kv06sAk21OHE{fEE}Al*QEYKhTv(yfz_Q-5qw@_D-NzYI4) zLEBY&69eh-=)^vVPD9@2&2ur|!r%%*XBx!tYp~5ZGCv(2+eNH zSK71-B=Ncj2MPF_K7R*swnqFhQ`i#SoAeap;5^k7kJq?t2wC9t z(L`{OYUcuKiH+g^te;oEQGPQxKTzcoyFnMUfI~wE}+y<7p{D*xs&l|`L^L#)b%Kl zoeb22RWx_`o|^v?dKy!aUf@_0XAYJGvlqk>e|rz;{op~k+wZKc(0yMIJOtGq-*GWE zn^f3zEl!1~T1i~|PY0>%f)bwTjUJui;h~xzOntSFEV`JBqU}iAqYdAd-3){SGA>F; zGGyo6P)1)uR%tAE5Q#DCxsc`@JRX_1XvsI00~wVJ=XpA zD@<2>lqm1?o)#;kb-WoYmxg-hBgV7iS2?`sh88%Jh~a&4Z_HC>H3DXg#;QQDGmHvf zZWm>^j^joOf*iy)y>v8wzsf6DR%KBOyNhchLE# zA9_RS@-MS-q8ngX#(6YMW0iC!hgWYvI8@XjUfx?u^v!Q^Lb1F(iR{ma-ZM(8XP(!l zUzA~#cx@BW&eh4t+>VK}0NZz=8zr1(<$d4Hhln2oAX-6VZGCLExZ5n}cpth7mz@tT z_lmX3htKhQDK``!Nco;K^Odt5j(ba!h<%559Gw$f z9EP$5b%@nnL&t*u2NmPvbt{9kX>y`NI60IL0%wv%FT3>xE?6?3 zQ83o^`2rhXlj_-s?|Z5`UXZ3C_tGY0vmwU!SE(6P_M7#S+R#897v%Y|(Nk;LFSU2$ z)k7tDmWk{=i|ictXTi7pvB%gkMt8k|cgU*-+#c~OU7JL;&bB9w;t=6OLZ$@;73*m6=yBV)) z^T_&kmKDPtg^OZ~RdTR;Xjki4;^72gMvRlvFdsWN1mQ-mPs@iWwp)S)RJhUy3Y6+S zmdb8v;%z@sh}93Vg&(JGE;pUR ziUxV!-U9v)Dy>)azOEGj;eEm>b~Pa#%g>uFk5bdQ;iuB-^$&h4eNmTB#%JDoIpPz5 zTFbc<*Gh4BCJy?13_xo%gPMYyJ;}0pbj~@(m-@LH45Q-%`s60psKQNOCGlf*F~OL_ zZ=R?*!qC*qkAlS5df=TRQH3w7C5~XWK;9d*`@mJ~TD88c##cb)IfgQUr`(3jBZ!k6 zojE_8|G}wo6LQ0M5PE#8HK~9B|61=a_sjh`j}N_*TyMqu2L}jBbTf7f=i~!^*n#X8 zzsA1bjXuf`-wf1WE{OhAl7GE7HXA`+j3p|=N*9PX@4u0T|99CK>;g_Z{-v1N8;o&! 
z00buW#h}F;QP@mu#AcVxhV-8x@$YUXtXmixf+mfTU6dHGY8=5YwH7j!8xC@rPQLVq zx}t+BHD$ChKN(VRILYlrW^SDRQ;k#SFXcUaW4AJq5x%8Swqv8`9|@uV-`kfp$898A z{}qH@#2X>g*wOvO;3iQLDRI~uDQ4b4Hb_E^1~@>2bnDlDm04TXf)**+-;KCqTXtsw zg{oXno_uwhhmOV-@;)+eP?c*S@+G(curd>=3)L8(Dg1;dqBlf$HGu zHH@2cg_^AHT7L$>znsuwKUuI|Uzf5c6nC8+%?9~Ee2Te%_*u7AN79`TaYZ)}t87Xi z9Pz=Pw4sn1XMZ+W5#TSZ37@GcuXQ8?&{vPgAI3W6&LU=-bHD8Rjp*OJlCRNMBeT1@ z_i1#n zn>`E%9v24W)y&#M?(7fvJ1Mzd@u$5N=9y~&_Fvqw*nk2JVztXf_R57OI=$6$i`iP( z%PqZgU<_dH6J**K^<7^BKz1-Q8ky2yaRP{Ot3^Yit|u3)Xe6_RE`sh6O||WP3PThl zP9p_~uPl0@s|{RUaBfxxv|8c?ZGToOm??Mm92yPo3`@-% zQLKY_>fJMV_%z!$PL_%_d>5GAIYK@FtXuT4Gox`B0=g*8B}QpR=uN~Wz9^TqZg%I% zB3o&fS6IC^X|bcKl5Xa#1p?r^>S& zoLSnZK6HCBd>jny;;`6XmNPHws_!dX|A0@RdM}RC!U+F;r3Y@Kr_b)q zJ7}hm)l6u@!QOer9wbkAQF-JlX3_GFf`08J{LV}o=b{Q#(W5*In37=SOCfN>Qv)g-tVec!ojOrAf4_ZUVqqvSJ%WB; zenb&XCRq%6fj#v;!U9ed;7D}5R)487$KizZ{yXzn4A&?5Md2Y4uAvRzs%ubvW`G@g z#HSit3xt6a&pvc2>%#k$B-Ix+YID8>gom+En*i56NmaMU)JgortR*>&;vK9u^*oj7 zki@*K);!0t01Lp^|aC82|5SE+5Q}T zB8SZl#0n;pUcpb9%$8ms;lYT$jQO0C>J?xL4|aMnAbr@7ekcx)zbfxqrUm@a53Y;z zNG6-r!4a^7z5}WQw!oXL1x=X}Si48Bj->gq-XYCF-Eg*f%*iDL@IM(lHRafBNbcJ(O`-3U6f#Lq%e$8vE~tNW^w!8MQLwI`yvk_%hST;V$` zp_K8dlHESCLFWn(GJ^?ZQAIT97pnYi6M0x@UNQJWhuXG4O~zw)ub`${x3z@SwHuP` zF?cWyLd656@?)7rWKq<$D)w41Kl+d!LQRqw;jx&`O@m*JlzEu>GMLVTiyMejOtej! zOeIU?O_)rUTy@oTqjE^38SiksW14>)DCJK=(RbNABusIKhH7@H83oCN<-a1Nga`1! 
zF*B6Sx@;?&7Ny0@*()E3p6~NwLpmokzq<#n)0@tdUPz@?Ee7 zbJ8CwV6759G+K6FDc0o9wA(Tk)suqENkBjVb#;<&Nm3#ywG8$oG<$JBZYocc?>Bk^ z4Rp&-kmeRHM*+_@+F(%wUTcHaRJef;5Wn}f7iW}~WpUQe1nP0GaG9+coFA2S?-CC{ zilIQHG^CJI>P?-D&4?uQpXkF&UKc}bu*WO;7f3E@oS*rwb>#8!EXB@yJq~{yH6sGf zuzlh)5Uy>&Jh8Z4zlw4kV(@)9WA|Y!RRaGsN96znCvNZ9qUj!N#zxtJKyoD{Je#RC zOez}q`bT}eDIr=>7_-HiPsWf&5Yh*C0HWKvF94@Ewymq*STELU0s$|qK*r(b-j&IM ziB#j(UL7is4?j8h<2M16jDz8_Fz4;BB1J+^Nb$4S;$vKLeHxRpUHh0 zSg_ryMhN`Cm+V!B>ydn4xl3^rgWjCQf+OH4H7i^f%(<;398zBewZLGn5bVMH6X+kQ z9;?=hz-5a27gv3#mdlEGaDo;{uzP5?L1dv#uI3~)F5j2&uWH$$_e0gMT7qi!&*h3T zHKh_d{(B1y&I-CEbW>sumVH^ak)4@OOT%AWKE&lgvzw|##<%*S(nOVwW+-SC#h11w zYYgrpJZ)A5r{f(=ZJ8=pPOU1Ay?5PBd0#P&^T$P5-C=s>gm?C80~Twotmxv8RQ6|= zR))RXC=@3o@aHrqhUcxn<&(03lA&rMr$##HNpSoQ2Hpq_1>BqL10Do2 z(lKd#swRp(^QF^5UUi9W&!vw=hmfj?usu>&sE(Gu{vNlDKa< zz123uN|pu3nL4I(f2u?aAEfkPPt4B=pd*5LM&?T-&t7uEr9e_93*JlP!%&<62r?4D zIbx+KQ%U4BhXVjAS0K{#!F#H2T{!#)NX_Wc3g0U1?t_Ge!wsb{DESU7sHe>Qh3(-` zD@GY$`Z8_4sG)`_&NP?O4)A7<@|R) z-rSGA-xFs#{37qQh9vVg`@*16s>aY5y3y$jzguM6&kQH%)Hk6&n2Tp$%eD2nzJOX|Or-)LV@ z+fL8)GSKV?E|C2j>IMA#yz@#z@(ter$&)PWO50N|(Vj+q8jn6~@s|Wk`0G@ zbg6_Eo3X?}K}}Nv=-YnPA1VrIT~uowN2$GqakKB@Ji2xw*fwGOjY>^)I&jJ(%ld zkusc&1`Z{Y@vPNGMq%@>P8+ydB4ko-ydj+f82Nvw51O=hw(xkj@W>E8SkJEKfAF7L zBS&|>j<9i4YqB$}wV@hI@yaO5=**neAoFS#2;jJxy$rcz;7&F}$dj=o;|{FGkpf{} zkpD9RADDPj8UZp4Q7M;g*IRF**OO}C6FLR7kRJbJO@U!}&Z^`s|2RjyiG(mW*lp4W zZ<(*qJ?Yb-Q+gTfTf!6QlpfzdID%tKOgyALBtUu!cA91$oTg3}7RcIDcPzqu^PNYvm{fU75Sr`b(5v1IGq(xjEEBYIgCL_%$6?>rF0|(c%!=w zmYuy*kqB1fB$`8}6*Ur-JytEu!KjhN_K+PDNGg{ik@10fdQf5kf6ZBuJ403;Oirpj z-ap(Ak`x<2qR?Cu(wk4%hAQ2aGd<+uy;|u`63;2mQ}35dVaNHb$X>lM2R$*Djr$Gi zM({5$CDPO+A<2IYM(}lTc-I0b$H~acsY}k9vTEd87G7kyzb2C`sTCLBJWbNv{^sAX z=dZi&R#i*(|ER3h=nwGTwt^VK{oeBkr=M7#`?6|(lnBt3AAx=8!&To2!z~wCU&F)z z3#eo@%Sdo4|I|^Kcs}*JvFcl(DCVEwh)l&fT$V`GV2<68S|pxj8L~*CmZN=9N63Q0 z#2h|iv@K?BA`Q!%;O)732}-C%vOETrjjW`>JN&u${N?i}IJEi9eUr;(yYA4go&YKG z2C}y9sXZz;oPYPKdQ?RUb4fWCVX^-iMoL8ynuu6p;zSQq`sS7I#UdOdE{nBxN1w=w z#&Zr$Q0G&WgS#H9zjl+ODV5A>|C_+Y$Y%EC 
zHt#gdCa0tNRLr3-#2fq%<0Om#F_5sx;;mKgea9TTkqJQp5S(&^jvjL=uRHj2{&neU z$_#d`i+@Z1u@!MLfRFSdig&G<{NkFN!37)~WKr*VeGdva!BIJ`!QR*mE^vF3zwO zae?AMGq+YzZcDpjXnLtAnOHz*n2LU}3En^A1S+kNk4u0VY^Yx8Hk6f1HaLuQjGWY* zfanbEWlo!Q-Iv1OqG=c*{;6t5;eakWg!kzDj&K)QG?OLH^Xn}RalZ1+N~U2r5P(dZ zmj*8m6G#+Uav)j3!B}s=;yhMAIDQh`j|=$;#fV}u1)iDhx?v`8X`t8i;V6Kb4i3j# zHBl|HW)+w%6z)PC=X%O4ufYqB&Dk-iL7ch5&bmA;g5arq;4DWhbOFl|9Nsbu1V`S$ z){2c2vzHaBU4CQxYe;Tld)0#%ElFOHAnC0&DYR41NqTF|>P;se=*8QB&$PW)eR3z| z-5UB_tG3JflvEf@k{hxc?n`cw(lQDC z<)p6e#EcjIJ|o*uagVR6yBf!wysgaUpG+!>G@qG{=e&5s`Dq5=Hx^ZBE_jP0JkYHX zr{$`H=@SiS4;@@}!OVB$=pj=hZz*fX-bX$?!=Tkw(qQ9bTbjp_f$7$wAyW7Rhrs6> zykncLA2QsZC5R%9g_4+IQN_ zn^zm`<$}G%wt0lfh-HN)wPY@;(~=z)YTNXYqO-p5oC z7bUysU0dByPf^Y;HB6qJ7>ScP-ba5QpODjE*R3^wM!SdF<%_6C18fRk)vY=~W zri=YyTW+y83Fme>5oIc)*p&GNv8_}L!z!1+`)4mf)Ar96cwas!uSIpDn1GU)7*UMH zNJJDR+-kLd(N9K&@|FriqHKg;9RGQE7O?OkDU63i*ofZg#Y4Bks6?4bvcN!BV8)pkFO_DkDYd{)mW5aAm~pF@GB2SFoOlBIa(HsT?f(f{CyfCjVh8k0J8rm<7C=h2 zg@VjqF4kb%ortwHidvZWsM}S|V)wemH#;2eH|6!%lyAh$UcQ+y&UGVF zwTqg?RNLTKRJQa|(xaV)fc7IVBisb#&BDx<;Aq=RiX^qh)(BmZf*^Q(&ODvk===~M z6p_`L$9&jjoLOJ%2nG#544V$l0eT9scDY3^BkX`>2mAdA*~~4e);!Ybml3}-Q!5_4 za5>|3`$4W3?BOx#kZTF?Y+p)qP}I*J z7QXx?bBG6K<;%7%)%Nz+;s|LN-I?U9X1noN8*gLV!mmrYE`}v;mC|+M^JY z89jhW1nM2eSK9~B=a!s78z9zqrCXxboV4&VSq!O2Ja#kS=_(R)$IiFyl11gi#!jf~ z1qSntNZiP?>3CSMc<%t|;=ko}#nTv^e5U15xsiNkWOfo$doDIiDYQoK0dD3Jhv;rP z>Dvqy`=9cEW364ugakghV+{c10TZ*(yp7o7#DpF7<&V0NAFL{GWWkHyGsjykp7FsB zrIf4fIaE-;~XkJHhoWpwvsT z`^IVmU5wz4A{^cvVqoN2F|l zdwv8Nr-7(P?g!6e9McqQA4t@Ou*L16gW|usxhJY%$rF;CYlWsOxXbPcJ@87@56j0^ zX_3(3-Hf6)UC(s2_IYBl$v1%YM&=6#h`rZ?7l)C#$DJ`Wp>VDex9xmRuz8h^%KoNm zI-oc~su5JFlv8JNWnM4cSnNrmyY}oi)@GtknYCD>H(?-%*z#MFmy@aWc_GS$-ZV6>!Ct}F9-R5;K6 zc|fk~u74`GKt$KIElt?JEm_QhL~{fHrO&j?jLtdNy%HSHa=I-p$q9aSQ#h7Ne*9qm zGBxYK70)id3wl(6rMdR2NHDqc$7kR5x)+YIB5k$9xQG;IgD8st(L04D@RwuXc7F-$ zx8a!0v(d}<7_+BM%v31*XMe%xZQl*Ddjz(rWoEkTmceVBrd4T2lzNBh&J@+#u~mpd z9-168PjhY12`9vbagWP~vi*Knld760NO-$tAI&7Rce$hCX>$`%ksvjkWU%+tOG0dk 
z@t#CE7l9qT+;RJ9QVr9G$4vPF8Tw~O>f*j!_9ba3RS1xm3ywf=Kh#TexU6js(UVJN zv+$K$2sEM6;#r3k$pMQQec~YJlbd_+jIDi~EP~3KAGk9f%)(FH83A_p`!f>N&E2K2 zz^1&3%cteZcmvT$RH2HG6=5|ln!Llu<$7rVUWiPM;jnNIRDAl&;;+hv^0bl)Vw_va z&lu_?h|u|1wSz>i8Cke@DoRD~%idijK1pV|sdysn8*Hk_%0%@kvDNTUTgWIY%_`{! z;#d?v$!=J5F+%F29s^XiL+}xN7xrp|Y)YjJvM{&cN=cYW#g6WaJ*UylD1!{M0vyfV zbn=RLXa{;Wz=ffsc?Vs^b z)N&+j$rM_K?Ewx2bNmA7xCJbTk#KPJwe=+>r_}x^FtSt^SlQwK(n0U3%yL9z1{^Uw zAF5%1yx=Fc-!v;I_|xz@6v)6MHKL$QsM(Yg(@Zh88?{9;~6#Q^nLsD7;<%?NAbg3Hy9%kA`W=@v# zBlBPDt|{*~r0nIy-*8A+$ZX`h0A~1_;|mQWS16E;bEIKbi77cmZ#svsS0K1hd-oD8 zeYfYhQ@XfPJQ;LRZEk?c%MCKq7<<(t%eFmsBH?413B7_r2gRds_;My|AQu0VU)!nW zo%DivP32Pbb_mJT^)p`}0;+#EKX}$j;I-g*pdJhN)$<~BD)6)deABbtL~>bGb(T^G zB~~+2h^Bq;5WLw4YdM*2+qELThvd*aTwnK?^mo%lznbwq>xQls*?H!Zme=FDrfEqT z?iX{jV!@yMOSjsxAAcwxO1H0Mw$+ws{4|-RSkf?pehLIJAFA8is#o)Op?Ma@K-w)r!Ycnp zA;w%t;GtM}WMp09_-|MTX1yDlEI4M7W%oqwKFO0LQ53ny$6LU(vXyWQs{Tp2X@_vt z7Lqavxnh9#cH~{67PFSP8vt!ODq$ExmU6&WF$YW)bkIz)BjSsGESchy_-_{aX+@N1 zOc|oKjcjrP{PV+2_k<18Bzlji$U(?tFN2{DY$Xs}`RahqGV&1n*TF~c^{})LqALx? 
zG1q+sN~^W#XeHEQ7a&1pQ5_TlyqIW}L3P3|M2U@Q7Th6@bDUuhlm*cH$;a>gOY3hQ zUV>}_aQLWud-#MYPkuHjFv*2-55znio><3cYZ?jF(2Z?xOR$pEt&)_|ah%|s6q4Su z@kGAUQu9eMB9fQj0^x&ezd@>!Xr3jB`^>N2HRW)}KWf?D;bA$IOSHOuRkl%iGwa#T z%=$+NPFli@QgXy#z2g7SnnPD*<1Vi_%!=5?jYYs|qv}3~eIpg2QW@vq`%Ywog2lGE zeXPubpA&yNvaO}VhG(sT`)?-~lkitWuCIwvu%=eY4Nt?EkEq-)LSEK0kKcLOL&Zd1 z4yQFL;=v_U36U-!<->Qv@YDJo+c zsgS=Gqo@#H%lm~8p|*9<56Xn6aV{HKUF@B;2Aq%TaVwM5P~$Cc{x^~urVReb7s<^?$%PNazvb2`2dov zWIlgc!vL11d$PJmc0I$vPoE%!<^;u>g1Ias<|cToKQE!3R~n30r-foz1HLd4Bl~UY zMcI^?qu~1ra{^p@R}|QQm+glsu;Q-5$ zQtXE?5K`Xf(9jZ|)2WtGT`$y-a;ClAiyZ7)fxZaoblHQ6mvt)?;+@Zr{Q1Jqdb{?n zE+5OAn|nDTrRvMuy6yP9>CXNaAid(xXW%9CwN>@PnisV{TNK%}Msve=%^EFa=URu}kt%u5)wTmvT^poF>#S zsZ*s>8FA#4QejxbV!K3S0=x?|6AG~avSq1LB-i+cxx14jY2j{93E)6Jo=6D%s1)C2 z1t4i`UejBm*atkIH3tj=2NV*PKr*h=rDAko@U`yP*`>Bq&ZmJi8o2QDeY?=1(@_w( zNZ<177td;_k1RmW1yh2M!ZK7~^mnY<^}EifY@%9xy}DfR^*=4|YP?dHCZ@OjKSjnSH4KlCtYY5B0$ui$>+?Pp)t^4r(_p z5WfFVwfE&6c-x{5sl)@bkO@Y=;FEuq^dh8YnJMiZ92r6l-9Y3D0Y$*R$)8=X>$ak0 zB;>9Fi)>0+N*%i&G{K_ct|&`fW)X9&mXOz%PgrL@QTY^Y$Yqgv9%6FCn9_-wcJqCpgP~FFaS7}xyh}-L@TXEt-vHB&^DfEvLKtYIEIgmL z7Sfr(Y2xj5)lvs77I3~y1qFxfa2gD)QM!8(kBP!H9{X}rlYZLMVxP zMOMx)JEUo?AW5g~JG{rq*y9qaUVm9Gdl*~E9NtP6VX#fWnKVez-Q-INms$V>rV-tj zrDW!}@R16|ou+Z&0n?n7%;Cn{!aX^B=mvD18rCW5XqIO#KRSKz!hxkfc%WXj}bw;$&yH*(K(dW!SuTK5puY4{^8H zDn-*rm?nuQeUIMKZVVu1Io1_OGcQRIi!Q zx6QM2#%wL&fNPH=HYgBc32|aNvRhwQ;ATFm(xAN<0FQKL9~`QI|Cz`N%KzSUupELT z+r?5GcG@{K9V7A_aLops%V`n9$pM`HH$B+-nsqy@R3|*l6)+NXDuPUNlbDHLUCzHK zpOh)?LAmADl+d;tZ)sY>0d(_mCs7C~a7TGKrR#{w91Nkz zM}mYTvWRXqh9|z24yKpc0x%oQFQAm#o3d|9yD?3L*v!y)uCJJN&9|Z^Nl>_A23X6{ z_leGiJ};vv&xH;-1m0v(8#5;aK~N;oVc_n}NNe(LP?dKfnL2^NrcQkGn@bSXh!|lh z;A0L@B>CmVmcz1abZu|Eq3T0o#FSr}&Afi8MOF+S35r^qjo`$9qz9B}p2~4m0diek zJmgG}UVtF}Fq@6Ew-JV^bNPhzz8dc|>JWCyHq{g{0p2|!!jfC>z^@xg*syQ9R?Nk+ zcD=Z{U)M{S7&o(8Ixd{$nZvw5MUOFbKWl z=@prx#8Hr{YI(lhHg&++#LPBTIbjF=AIc^s+7n-C$f~DIjtj|XT4a02B`r3`hSB#i z8btOzbBPSt8mm=DAh`Bm3<}5eK?C(mc!{+a-#ZM&6?h6}ST4Ip^V($cA|(Zj$gJ=m 
z^Xg->30OFTcb7zEi>BOebTa&`Za5*(*7x{HI*VpzjZta|gMgerbW^Qm>%9c?nhoa) zAv23ZyZA}ATt2B7%t)DB?WTKKAW5P$nmJ}m#1#wX18+Sr;IneOtlV2E(hreatk~Vj z3h-~$aW5A7M#E1Mxp07xYZg6Oj0KR`2(2mxQ(DlE*r9Ms?Hu+D>EHyJlE|32p3~w< zm0m0qHZw$|5kPnduinaJSv*hGWb5k2Dj7;&q$S@(D=ck2&Egw%?d2eK`UsjU#FICFYH2*Q1hT z+|l$8LwR%0BBzZan*1{M4-#@oi%kyuxSZr0A)|}wscKK&jAFI$!7Dx3JL&pva}TuY zZSbvQiWdUi|1)3p_e(q~HT=PGFlZ~nboI+BI>Hh&p<@B!6^j*F)eaD1ZaCO8!60&dV zWMp#=2`MZEbjS|$vO=7qIbn*fp*c)Q^8ED?Dv{T~i-C!Px^c$P{tZu1U0-WgfyKim zRkK(V;~Q4Qf-Tz6D6v(#)^vXQqg3_q&<5|kbwW`k%&l7A#UJrMD z_h@7$J@KSkV^E6#OlZSw=~5&Gal-Q@GgIEi6#D`6(R=n2@S>4qz=t+=_9(-~Q=Mah zUqV}hqMwF+QaVTIb;dVJe_JZTdN^oABqv@UvVeviy%QMg^`);K%VA}vtudz`|KhTP zzF0tx+nYBl+@D9-10L(VT_(oy*+e}@Q1Wt>y|s#(;)GqkBkl6derugjt%DWWr>)pW znvU8j|KM3KA;v*S>5AbKI8WH0SlPATeWClRTq-TfhfwHV_nhX};xsEfgcLF|r&s2b zg^Ewu$945yFH|XBA@psa*diQBr{F^JP1#&QH(e`KH!S3}rfliYA>DH*afMDW;SO<< zGCh;KaDKK^27&0jNQ`h$j?@IGj3ZjkhawRWy=96Shk4PduR-8H9dJU^od$Zw@(w0L zq1NJ>p2_$Riv*MoLG(R>_jvI_FdV$L5>=Ykz{%2!#5U$B53jEk!bh<2Xy=OB$t=S` z(dMA510|%}5V$1YUpo+rME01UzZ|wY4y|5!9C8WtnJ_;N-~w>u%r&s+J`4a%ws}mV zoD}zcD@5~fa*c?tG4Ot299IX$6r{K{E^D@PDKU9%*HQrH(h>AW=7P069Y8%5Nl(Kf zeVBYv=(t0M1cKq}9A;2KO%c1K4&7DQwta5OabRBs*aQbpXpO7HVwG2KgLk>C>+`p{ zta_ZgC##vzNG_J?OWxMT!4W=e}0Z3R~Ruv4r|W|djV&a_)?Zl z-EOM;WjW}kXk)gMevEppJXsF@$N8eb<<8!I1CxbEqo5~4yFA0=9h32>$2?gHj4Y(j zh@Zw~d9Mm&BlBi|1fS|=aH?g}&kKqWuucU%j71Eyt;!ziRH7rsq(zb|tp^=RIC;<{ zjb@N$or$8Yz_293PE@v`;6m>k6~*zMo<07JN=RZ3OQeYn&dYwV$jKxzT9Ks)-AYY! 
zC|=`mRq@Q_C;4So#K9725-iT!QgKE^X#6LnrmpEDFrAEpk%jzrWmTV^OfR!KIa9487W`#870oP`8@Ne4X(Dme)8YxP?f&el}a2e)lK0+6>3o$KNjDavhUj#eqb!ps%;R3n$WVcSFimtqHdgJyuCnMw*&eH z+`0%QvZ#S^5(`*R$6~QJ!FZ)OLJY1%2(xOF$nJ9&dWz8Ce0t?B#WQ!;*Jbb?%AtU{ zs+Y8okpG?QAfy1B93`SgBMW|QnaH_$qkABg+z>2$)E>VPEm#etW+u zOMTlX4;@+fJ&Sv|J5VaN38N@8({DM)&yr^5?SgUx3d}2{C^hk!0 zKa72y|BauIokw;!;2aLlY`N&~S%>!_v47+yTi;!{NZIe74^?YrZZ+Pmz1QEM`@2UP zR*E~7$!fT{34W?MFO(=>(Q|QC`>;jxKnz$|WS$RiCZ=4ugSjuz(}ou{_{cx{2Kbu$ z?yfg|fM4FWP)u0<5`UA6SvMCGvujU`jYGu>%1|q#Qs$^OvBxE?(7e?!aVZn7V#-QbkHYE_iLyn|1k55 zO2-*+T#azjYZ)Gz5z@EPfPsaySUxkYC!qI{9mr(nea^sYL77;gDS7&y4_$%zN%|Pg zpSzZ`RODF-k3Ms=zki1@;q_m7qjoK$9N>wSd@uDDV~5Yf4cWJ7=PJ*C89CJw!Cmi| zm(-6-Bj5OUy^H+(_!>Z?33)H^nG8Aw47alBH;dAE$DxROvH-AxETs2!NA@#PvSL%#!K5TLC8c&g zi>+dyt9(MmFv(3xFd2XgH$6P@V1P`F(*0CJ_`PDl&2FlicF8O3Nj^`X^>X;RiGyKw zIjge=Msnsek{|nW)%Dw@PLTi=ri`YzqJ|H??Z#_enP|JwbZ#XcAfHWSd3X^1pKfgL zfO1q6J@6fl%A1=`aU9>?8Z%?nejs(3wI2scz=}}We}qe6HE_tu)98H{9FIJk^uBjG z-;uT8V64dbkiZ4U3R_9Ej#Z0DCZFfv#ZS{PUtL8YkQ_F$oFUXWI0F|iQqT8e(p^G7 z(9gx6!Gx$vR;T`{RXCA|n(1m-iN9eDOkc?vvghTdzAb})jqHu=R((D!`j^&P5;j(j zcy&52Wb)zg&olrGmzic1O9=Xq#fo{_Rf+j$6$P?YHWH=7j0ZXJ3b>KS$&QQDtm+$K zFkz#;UT`||c){nv~BIVy?J7*1#V^3gWzdW!CqO{25eDHm&xamP^~6K z1OE<=dem~UZ9~rpMr6EWp?+XR$f|U zl9HH(J##ltwj)t>R6*yMnS`^t_as6&iIX|hM<=8!fy_=A;b+Udt<|RMJqM`>xdpO9 z129125=rl+m)@$b2hsT-RM?iVnoEI`%LX;&he7ELsbmVtLl}VLX2`q zAZGb~O58}UD$ct04x;iq@7J&URmCAL;=0Om3V=3@dX@07k|>^YAIth?M?K^%1~vZ0 z-pcRn@AVhsVWU0mO6oq$M`2iJMHHLvW_8~+x_D!+(n0cJxvgXz|8A_>&CR_(l#kOF zhVn<<*wFKExFQjeq(sYw>5eTRD_0gDv0GKE3Mob0uL`RF-X$c7)xIh=--IjdBM{^e2Jv;7vSqe#WoA4wb)9liq{63Hb&aAY-&YTOE zezR&S=FeJ_Iqa}V7TxkRgEF3N?x9}J=OVCAkKiFO=fHFWMp#UpxQ`yMT_fldW>dH? 
z6weaP(8&H2ka5=Oy6XNC^Acs|L9f+26r{>-u^>Li4?Pnj8eev%ea4e0o2swYb$Rdl zOnDZAdyrc?%#klLM0|Dq%|>P?sg`uEtTLJG{kTJIyOV6GMN{MLn0}Yz@&=uLu~55o zn!(ZbXCcJ$v;91eJ;@MDNMC>VbpWB1^ygA=TkpEyU4K`)9s(0GkDevud|Z(vGWq>% z?)m+zg@;o$-3E zam*YUST@-hohMX8w!_-GH04sZt^@#~wUg=;@(=g_%=Q6|=&BU;n7vHOIZUpC4}UiI zCDGgzm=Z-?7oLN`qP6M1yj*tURPPq`9n`<{Tg!6>nmYDr>XY(r0BDua*lz*KK&oBl zDwxx?dr4BYM9LGCwxM)}J_4YQ&-s=)JSJE-zEwER0(*yOpNs;;lqrv4Zn`9AcVdYK2qDaaruTY*df1*_^Mg4&0n52@`&h z5@I6B1@$DHm|rB?WZ5!g1o-cUjT~dL)>nT`xcDV~4Zgk&lGB&{zVF)Zfy60?4?r%} z(5!y0?q=xCzW_`Sc&#QRo>XfVebYLDEPFmvVplcyf}`gI6W|?lYoP(JT~_3Kn}97{ zH7J~_ff}eSKTvlsegAi3Kiqfh{=FNA*N(jdZ<04L(qy19D!~V2JnqP_L`U<>b6P|Z zS0J%{-OlSZ;^Xv7pvg{lyY~AIgJPn*PUapI(?-?)WDytLah4+Q&@LGQ z6d?lV4Q0%|Ss|P}>JE9zD`Wm+!Ewm3=|6hJNp)qPs3KGzJ$t--ZoN1QhV++FV|0Py zQU~e@J@3i1`jB%UFf-~-96(${=u7V)U90x*q@4GUl$W~DQsVgwq@35|vU;pZMr%%< zCwi8oR;F!?k+i0c`B#AE%L|E=#s$yPxY>|=)hRdu=he{Qu*3>`h_$XUSo*}g-fVv^ zm5tfE<~_OQYBIh&?{4m){}%sAFPHpX%NbM+yv;XweZkv(p&P4Y2i8c{fKrNob!GfW zf9Yhr5p8BF>`R&B=Qo*CXI|fWsWq8nL->RaiMOx4_$H*xXPAbVLm@S_; zidFP3p2-dC>uXi7JugR?0b-3D#Hb`flaS=Gk!Vs>`5uIOKq)}PYw@*Y1>mgQV9G#W z^mKBigBL6_HsiW`)bZjl5u$~@P)}UFKbTv52XPgbc~ag>|B)HHCH2R2duKkM+3C}{ ztbd)THieHIZ^Ry&@Qn!(MFt`aYtAtiiccD<5_ixw`RPPIS|jGv;zxW}^~*B&chlD^ zUmCB-$2Lxy%%X8SW>E?0pqp=2o;;FgU6~|#*3!!=xTv6=sjL)x>pPsEHm;rS zPGo=xcRV!q_j9YYgrpK*_c)Ex&1ctz^eqW1~8$zT8*cvLkan zl(J8=0-C8P(LZlLJF1$ydR&|2Ja`zEz);W(bgS5{>{{YH_{bGpQ_n4`b$w;HHhS={(>@yb|A*1!aH#XA;aZU^vLo959P zVa}{;Zgltm^75ftp1h^m5E$pX7dm*!OZJ0aW)DV&^De8+ret9fi=8Li@PF?O8=Y~# zs+Qk7ujbEup`lcpJZmHZdu9o!_j5qPxhV(~%3wdzy#$+$R|4-Yp0+<-*YY9K12vj3 z?P8Ew5mTYZI(UZCW9$SV2d)ff4qI*taZ{w~+AzCRxvrg4p^`pU63%&nfmEnOw((=# ztjil-2!B?+>V$c`a(DZc$LWU;B|Q`U{%yHtTKj$BImi9naAr9d0XG{?#mxl+vy|vgR~1J z}j7O4m<_*9p7z!zg~nQ28)HjR!??GpKP@~2mAx}X|ottFOf)3qzDU6^*E;+D<;PyyXLXB&aguN($dd!S7-k{Z|~C;J=q`hy@V!kx4sQsQ$Cp8m+lq}-c<9{ zDZ$twn(w@BN1s2$o<8*u7D13A@dtpoE3?+`r5IXab9>pYX{D6Z7*4JFXh6RnTm7@#p?*9W7MNU z!tyvy(XX6|2wS;FnG!JlQ0hqrfp{TVRg06a&{jK{hpy#+^&ZAbA*Ic~7ZNxl@)1&# 
zeXaYux`iV7<{sS&jj#W?8unm6VAu_ zuD0LbsTjTSf$zkmMd*g50MD^t(zl zo^@?!GYWnNwIlra_eD&j50&NT=J4k=5y&~H@NfpT4QC^a? z0aMdZO(NBHo7Gcgagt5C6Y@xOoh-{UAOFr4;w{TdlHpx-`L}o$`6mO%ux@qVKO#zp ztwDeaF;^_<&+{A(UXt!7rn0{30J#KVKU75{6PsjC9Ei&J4xQOkii3!0oaK*HZnMP z&HmP`1SxkllXgPGGhPHegJp#|(NNmMyV9H!E;k{`^1=jdNbL)--3Ez!UQ*jN$|MBe)o3%s%kkn;F9){gQCI%+$`=ny{8>ZtFYT8l4v`W=5^ zPurWaQL}U|(0ztR1{P`=!!c)mjS=R%Az`>g2i%_en0rE~@qi-A!Q{|sa8Tu!DUOR&5*v-+&Q!OWyP zs2I8Nma_t18C#Vt5pr|@lxxy#=n9AHcCU1NBUfOF;6$MMIlBwS?<&(M*OI!*{8 zE^+dh(QHP9C_2nzdWnu~^n5FwsI7_@wqXZ!>H**8W(V(_d6qlmFYPQ8p z$k{R_yB_XJqeoQ2xIvcNVw2DHCzZkm57uthHsO~0~$Kd7heS8b&y zY;bS)^1iFC=@k@uTgBh=DfVzCZ>?%9f;f7Qw;!L^5YZ6kK?Ugs@QxAj@*1BOxgp!> z*phh~rHJN}Q;cneFboo%aMnRqfL(%j!6)hn%9G~M`qJA~5xn6UV1LWtHM4;RszTYf zFex%C(o-a;C~T>wjw2mcaM&9iI;z}UU^^kLbdVc}lH~oXpPBGGrhCgQWWV|8YpLum zbe-l-*Qt7xdbO&_sJ8Rc95^Raq^5?M#jcrCk5{);;J2z>HXcsZ#3z>*P+YnAq|Emc zRVH&bo40Mf$WqMe&2tjr8N0~*hn0p4zrSn0mp4?A>$<874P*w52x5ANQjx+n2Qr^W zYm71>wBS{I({2-a>f_q@33*m$ah^_AWY6mUkxt)(jT-ylls?NF4!FiW=OgT5hvCf? 
zhvia9haK#(PjXiVr*Mpy|6^~vaf+kQS_57hc{~(2Q5Iv>S`L;13Eg2f+J5TW!>-w= zW-9Kq*=e_#l;h=F)-T_+tCH;HsXdTUjWagyM>2gsSe#JmM{`qmpoiQxA@w{@rB10| zFkjvWXpq_M{Qw?l9?g8jQSy1l^|hkj=CyF?EPbkPI@b$Rs2G@55P7B*Fk8=j3HiWV zr8I?Yar2BtXc4Cy5BQ!>6$lph<&)%+`VSg~TY`fKr9qhaN~ls)Q>&!se;R+vGu{CQ z1Cn-j9PYIH46MrM^`qI{E@v}YUfG0h^U6j4FIaT9rgF=?f}?4{j|5izaVpCj`kIl% z_xn(?kN;RoEos1y3Hm<`twgb@Lq3$P)xi9@=?Iqv@vRbv0+SLNe*i`kf;C~`jfEjcOe056^=2iU8Y*qT#s^z`(o(0*bn zYQv5s*zl@fZl`H*f@h12D#yLKo?WcZ0$;S^OoJ(=vGjR%Bt6q`;T0xx+s7qT z`zr%RwDYNqO#(lRv-*C+41vI^iCh>|*?|+!8u}?ai=MZ28~TA2I}o3t#``xbI%h00 z>f8lp1ALwtkuBy_$cmp}oB&n`v+(yz!g+$QubzT;zxR}a-5wo^Qh!AA@-pk}W2RF_!m_JQP8b+LBKtDi-`mm%7%|Y$lnfk5Lmy4Bl*GKUx=}Qp zAE8EXxyxdHQ5q#Uzi8(3S}ZXT;suA6oJh$j{Bj~H2b6ftp2fvOb5|-*j=PmHHh1b{ z$@hRuSw?XHYTn<)@d~eHyVs2i=DlHD*ZXUbdN4hvmIgd!HudgAjFJ(x3 z!QS0vjz({73WXk_1VT1*%C^0S-qOc%s3~0~@z}D&_Sg)YZwdqo)Nzn$S!~7hY1A~| zk*+f(;)&(u9kYopt|3;aiJ1lv%hYgd6Q8rc=UWwzK>OPD}{J5SxKB^a}%_xLjc(8iZECpTARQ|(9^|Y zgeS!}j}wzg$V~7Vf;}{p!efpnJmyeKa6pbps_>)$HRSOY<<=;|pUOy8mjd4(PazIp zg6WT?f;)P#lEutLNXgX|Wk2}%Kv!#5O5!~C(&ab;1}P!f`)_%VJPeSld%c5pYq#VY z*>=2d#$%$qh+}`|#Qr%Sf#(%4iD3O$H+KNsHXBYqc=ng;t|IYELCQsC*sR>8s3c~B z8qMEtBxJxPuiKl7s2djIXbDr4coCQI$5{oPRvHL(jzh-Z^%XQTwU#1`h0p|ZWud;B z@vCXEs3OLO-=4kRke~d5VPBRufeTZ@d25 zXtu!B*)0I=nC+)tu!EK*bKHip{sj>(zvC~75kpgcM~()J?g_R=o4-7LhP1lv6cU74 zrA;ledp`>==hJo5^J>kGRrk;}-6$#55H@h`@gkX{o?vD)A!YAqo@-&kQStk-+}nXd zQ{=Fd+Y$hH#?Gqa2mb8mh!>S*Znj$_H6fJ1Wcuw~(p9^DwCXdSq$!ay>gus-<#^|6m62VZ(FtbHm^&i&y3vJf2~|YI zBut>Et``+aymnWj>SPltNzLA%`!47FVYd@}IXA*G#SN}Jfl1e{n8Y&+jLx!k?uu16Ok_=tTVO>H5} z@@Fq`v)ZbzLlb1I@&bH1irjHgFiEtCN8OLfnc|v;Sw=Kv)Fi6zOi4_UOc7I@kBR|= zC+ysuCQ!oJS2aNa&b0AA&z&|5^qA{l$8$;P_q#1Fa zg*4K7{e^we%5raLBAAn0WqSLq_uRIdQm3ThI84x;_{d6xkc;x>V=3|v0u(*bABiox zXR z5Ch$%nGw#1QqIc1~}Txn-jiZVUc?6sR!DRFh(2X@_~#;sXVD}<>qWWJz*)$lSHh?`KDf0*d?1t;4tBx(f=U(c zeM4;uXVRc?*e?KYG36p9r2jjeW%l0Wk$#M}tvlz0dcZi47s{=?kU3@j0l8h zDj?1>BeI5Uz7-cUQG|1-q?6kyD{eFu<_i&HOLU$uPtQKhj$rr#655cYkt6lqmYaI* zIl>anW{>?;kHTVl^*S7tLCt$2(dL70ND&;>6Y^fsr$2V!M7_ 
z%0VqMDQJ%@5GSBOmP~)d43czO`>sW=S3gm#B;;67j6G+_k-}`Br7>P8Mj$;*TmEbr zCsn`hG)|f6vAgANJtmkIW_GisJP0?ZkMg;u{JI=j5Is=|KGHkpV)WC2k$w={H6#xr z1U+f(fmH}zdoUolBO0~f$zg&Hs%!ROcK729lM?D%|#Ozjjhp;fJ^p@cE?TmIaCyFf}+woim% z3MYWDSmrZ3k*%Ep2u%nXFg6vFwymmG;)TWK-{UnX9OhXCSKU)nZ9OX_X@WB+ro?s% zy3(~jv=^3T%J=*&YJzIOz6WKR%Qes7kA zR?KS|+`5syvL?@z>RF#B&byFbw<!pjxqe2Z!56v63+TF1k|6y9#Smbv*9=0hP7zIPWrDRREQuU3 zRZu#(rsoO^xiHfF1c?oMNRf1E)BPYzv&ik+RL|9Goekuc7Jqm*Enp#n?2-8Ss^lTNcHNP zZcu#6$Te*u9Hk5xI%+GLBjdb`*USfB>EmY&h(CBE*tYV>9#W~@POqS|vZ>d>yRPN1 z3YkjbVeaey`<)e#c9)0Cd!N@F%dm!e-)r3k;hnVk^CgBs4LH64+HD~F=L0=GW{G{E z9SoC9+NKnVkShT5*N~*CT}*!gxr@o0Gs0LYg>~P>`a$?i?3pZdVC^*wbI@J5uxZUo$~zsCp|fA%>}tskiJO?%kn>_>vC5QFzozXt=KSD^h_f8SH69{4tK!VK9%2VHQ)k@pjGgZ-Bp00z=rwUbat1DsVEdBu4>YG13ZCy0BPZw zNsSl~_Yw{m+c-(*GbIhf{CnC)7)s^wTnT(2`ZC!2R@N(=qQIUhxL^u+>G<0}6^#4e z-jrIQOD$(jOG=D;kR+4T^PftC|9B#$80}zC{!|WZTN?(##!l9rt5JgP2PaFVC8n3jn zGu{{wmh?H|;00@-`Kw59+-^$Lezz9(Ru;^$qUj+#Py$y?scY{xLAB~nP4Xz3QlbFv zB#i-})JxPZ{=uhEbhlb7jg}Tm+?hbj`(9Evi^%>onuJe`jdN!}zOA57i2{QOVm6I- z^b&ol`$v&beqRseYE?sp8ERvPHDNHfc$8w1QjXn}@+-+Vu^5+}I+W9z;&u$OyAh$e z_hdM#KO;MvTZOkd-H2{@>O_(a90WdP8B%SP8*Cl^nL=zmXqY%8^bq+CbC3GZEs6Q7Cu{Q9V` zr6le#yH0yGWZ$M2sPa|ddwE(4K4vKKtp%ghK@ldqY(yS(k~6vne2 z5$40Gf}Q{o0m+vAn5m>U_OgR;Ih~@<8+$&d%5pfPa2j)LikPGh%nPd>j*8F~Km?rI z{lvl?vH%DB5n`W0{QUZ)YrDJq5hBo?xhyC;_(%BScROL$hw1*GC&d7-Y>hIa>X1o# z+UiUKPJ4x6a67UHoZ;0|X##Pg=*!8=4Zps z0${{IV-i~!?&yd8Y`1xErTY#N9!B+@a4V$w6u>qz1&Rhbn?H&i)52m`GC(p+)MI@| zba*c)sXUZAoH@(X3bQ$rF*KbbY^tGkXlU#aT32RlclNT?_e`Hw4n_9C906 zo=&D>E~{q4!aGUUEqj+`yRlQ`vn5Y6%v@IeN@vOevuUIW?J;D2mh=eaP{IuuIf}Dv z>7%J(JEDJag0Ko<^Q8F5fg)B}jhAO;=vpVDEM_Ozq#%_#utH!*2B5J3F^G~*Jd%0? 
zd9>oHB?Np0I2s0K?sD~Dt3I?@xMx{>6+x<`*bnR*G z2h5oFcox&_ZbFa|8$r<@!Q*RYu4&Qn>1 zh&i*^kqo)gWL%Z?y{GvKH&zqRT)5(XvCMhWg6sSGu^Vwp zDi1No0W-+p^0%AqrtE{)3&2Sm^M=%H&7b-DS9QB7weNx`dwn+j4hwGxaqn3K;<|pR zqkcQx?X#ZgoZE&%aELg#)T??>$#`DoVyf8F0F&e?I*$H;LQ5Hx7-#VE^CZEjrNL5k zq)9ci`i2TiP!!EwQ3m(tVX*hEFK;SDPt}_(64eu9d}_!t=ZS8u-HG| z|MPG^-N#Aildkhsw_aD9$7(5JyN~U-g!G9g1^^^cGHI%>x`B4-Er9!ngY68rMs)TU zaE!4vDfX*(B@HYn_72JdnO1MJmkS$|v_1Z8GNW0yZfqU@Om@U4GU<2j%X>UG4tSMC zPu7fAT1&p_#-TE+fYxM$P{##HHlH>J<8Uv}F$T6m1sVq=Qe%qYaD>@xK2=V>+Df#F zze5cm|G8CHFz|y~z*J>oH&$;^%4rUVYd5hT=S4JacL-`0UIr#Zy5M{F&?^N>~ ziA7m$?u~vV_P>wJO)y4cr~#b0U^r~Vh(@xJNEmYIW{ksL6jF|etfwgl;HUI(VO`Tqzpog=vUwy zWLNp0Cy);oB~?fhxhvzdBF>TYFY(zOCS~=#YWSQQO76ueU1(ZEM-t40hXRW3u|iCw zrfJ|Y`?soZNz9iRP$p+u43V@hmE|zdTB03_y;+QC$Zq7Og2~a;)p^kiJV@|UV`RNlw?kGy3QqGwn6o$*luxROUQe^KTNGkVCzr;v*l7TosY2r7rUdX8Ney^%0=%wS_`=4;a(kl^ zX> z-Ggb7IWxV>yV}$}lcdSB)vdByQ|UaCiWOaI-e&E^J(NDhPRS8!6YZo$I-lrH0YUYe zc}9aa7fv|E{i1*I{x(0KAj;`W=JTqa}XZZQd+ENgPY9vk>EQB;{tPb67fTjQjl4j9fAWG=kYhwuf{1+tjclE|Q zts;JjFs@>aFNgC4(`O5y88G~2(BXWKa|OykQp-BCnSL(6Vedt|7B)A2p!*a)9A$Jj z-J_bRyTImo?3TQ7c>v4{JlUJISOegpC8WUa<%_Z~*-w9$GpJWJezcCxp=>lc)&F36ad{S+e|4;25ejMZtO-{Fm)|F#e>lGhy;9))k`~$i&r+_XJQvrCG8= zn@}hI3#d3>wfK#o%`QUV5skwA6oe@+-Xl*ab}$VSa?u7$9YkyDp9J6tyGqZc^_D%Izi$qj_;LZmRoI!tRCIAgvRb=P?U+ z7PXWnh_1*}>N=Q$1{4x}?>&L8h~T*DDM?i%GXz;OGl8;}4KPHRGlb;BzziXtf%NuZ zePNNP>g{|tw!m-yeyC|=qk=yWl~|(K0|mx@1C8!A^VF!$2yEP^?xRj(C)*trxq3SM zIjLO^L%H-6t3ysBA94&zjCJLV63get6AGaurnofnM(Dyr$Gk-;=@sFNCgObd%wK;2 z4(-u#A$U#LmwBeUVhy1Cf*r_J@cIkf`qFH~%q2p{vpcD^J7iLf6b?P@_280SF@qgW z1}a~8_;%BGx5`uLZ|cXgv9}vpPf{P95jzROtzyV_XxsGS55@PAalc9me5KQUWQC zv{GwPI>vmNFt{WqW&gyy_C3h4R#IjF&>9Cp9{)@pq!BlMGs-qdfsIVmN+YWG&Z^~8Z6J;EL`h05mUHtsJmMI0I7~h1L*~#S zHw}j00aKSV>>>7I4*Vj&l?_B{X1O-n&vwd;CS<}L8le~v95vg9&e!9&I-ZmmL*T$= zeq}P>^9k6B2o4DHj4u+t5!14=0g+eOO@^ zNEy8A9!nOgI-I*dUj_E_qH0#vb+wWWcK3Btc{m+WQZYe{{1r?jboecGJ&MmQvQm{g zD$ELqxpqkV2E33$K{3tE9PHz5U;7%y;mLN&YN2{44IA^stnuPh&@9;{h&@9;ZU+U8 
z<(|Jjc7+xMxjj!-NQk*)(Zfm?a0TJBJ^v$`32r@*m*PMFRG?eBU{PJ7q$JOl0{e8j zWG7)sWJUQeJkg-0JnfyXv5kkTc>E979ka-l3Gq)NiJhHz;s{n@hFqtiaXh&v4V}#_ z1`pd?{|a-T8y!E`!vJJH8M^fI>CVKjl{t$VM|B*sXb%1YLv$^LqJxt(+$FYq)CGcK=6C72|T|KUYKau|z6JnpwUP?mih(va>1J7rilf_qe{#IRI`)T5x3@J8@Oy=3XRqYRXCeCZf z1?!?x67Q2zV3C>XBA(CW0sVGgUMucHRp3XX75hdC*!S#MphptVl!pJx;+;?Y7`yeOPetVxu5h4~-= z5fzm?qSQTSr(speP>(w#d!nXRRm;JLFoc*Iflt17Y#Mn)4$B_M>tC}^5PYlP&wYIW zyrNckVRtCaNHaPoPSLs-)3om5z)PGoPV`1hFMv?{-kqn0%`<5}eSQsr=he{EureLMYvpcuPH@p>@#&#jG_7s{L-V?0p*L&no4TcpkrMw#0p%w%d2L$#*kx|;go z!Kqwu5|I=;eWxB}v#B20m%o!Y{1_N=h+5Q+JiFghqaglM4%d||KI0s!n!c1|*qPh& z%Zx~c2$9pQYNi~3D53VVyhYixX!>*jx0InXvfU9Io@SaWC7#(i78aIMDYX#tt^DDQ z5et8gM*MpEA#@W%A>KpvGZ7j8Ew8I(LR6t`Jh(S5ZadFJj%63_gSM4re^lKAD2 z=UJ6{($z)zz#P)0IUzjPo0~b%$s|nuqCT7CA%)5Xgp>RG-x{WGt1H3WHT`>qV#>mq Sn6O};;r|EmyHt!9%>e)hl&`b^ literal 0 HcmV?d00001 From f162cd449bc8cacc64d3dc236ca6ddaa2530ac46 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 22 May 2019 11:49:12 -0400 Subject: [PATCH 04/39] ENH Uses categories from arrf file --- .../compose/plot_column_transformer_mixed_types.py | 2 +- sklearn/datasets/openml.py | 12 +++++++----- sklearn/utils/__init__.py | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 19651cd7cf622..9233e298d9fce 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -37,7 +37,7 @@ np.random.seed(0) # Read data from Titanic dataset. 
-titantic = fetch_openml(data_id=40945, return_frame=False) +titantic = fetch_openml(data_id=40945, return_frame=True) data = titantic.data # We will train our classifier with the following features: diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 44a0b89c188d4..d8991012d7a66 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -300,18 +300,20 @@ def _convert_arff_data_dataframe(arrf_data, all_columns, features_dict): ------- df : pd.DataFrame """ - check_pandas_support('fetch_openml with return_frame=True') - import pandas as pd - - df = pd.DataFrame(arrf_data['data'], columns=list(features_dict.keys()), - dtype=object) + pd = check_pandas_support('fetch_openml with return_frame=True') + df = pd.DataFrame.from_records(arrf_data['data'], + columns=list(features_dict.keys())) df = df[all_columns].copy() + attributes = dict(arrf_data['attributes']) + dtypes = {} for column in all_columns: dtype = _feature_to_dtype(features_dict[column]) if dtype == object: continue + if dtype == 'category': + dtype = pd.CategoricalDtype(attributes[column]) dtypes[column] = dtype return df.astype(dtypes) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8d9e55f5e6df1..3b94bd85f08d0 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -840,6 +840,7 @@ def check_pandas_support(caller_name): """ try: import pandas # noqa + return pandas except ImportError as e: raise ImportError( "{} requires pandas. 
You can install pandas with " From f3818f12b8f9430d07b4a95d8c981b8a86c178bf Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 22 May 2019 15:05:56 -0400 Subject: [PATCH 05/39] STY Fix --- sklearn/datasets/tests/test_openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 034d78f144753..76f368676b598 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -350,8 +350,8 @@ def test_fetch_openml_cpu_pandas(monkeypatch): 'perkin-elmer', 'prime', 'siemens', 'sperry', 'sratus', 'wang']) expected_dtypes = [cat_dtype] + [np.float64] * 7 - expected_feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', - 'CHMAX'] + expected_feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', + 'CHMIN', 'CHMAX'] expected_target_names = ['class'] _monkey_patch_webbased_functions(monkeypatch, data_id, True) From 61c3dea5b601d11ec129bbceb4a384ca014e84e3 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 24 May 2019 13:35:03 -0400 Subject: [PATCH 06/39] ENH Adds nrows for chunking --- sklearn/datasets/openml.py | 122 +++++++++++++++++++------- sklearn/datasets/tests/test_openml.py | 31 ++++--- 2 files changed, 109 insertions(+), 44 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index d8991012d7a66..6745fcf7e44b3 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -9,6 +9,7 @@ import itertools from collections.abc import Generator from collections import OrderedDict +from itertools import zip_longest from urllib.request import urlopen, Request @@ -268,55 +269,96 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): def _feature_to_dtype(feature): """Map feature to dtype for pandas DataFrame """ - if feature["data_type"] == "string": + if feature['data_type'] == 'string': return object - elif feature["data_type"] == "nominal": + elif 
feature['data_type'] == 'nominal': return 'category' # only numeric, integer, real are left - elif (feature["number_of_missing_values"] != "0" or - feature["data_type"] in ["numeric", "real"]): + elif (feature['number_of_missing_values'] != '0' or + feature['data_type'] in ['numeric', 'real']): + # cast to floats when there are any missing values return np.float64 - elif feature["data_type"] == "integer": + elif feature['data_type'] == 'integer': return np.int64 - raise ValueError("Unsupported feature: {}".format(feature)) + raise ValueError('Unsupported feature: {}'.format(feature)) -def _convert_arff_data_dataframe(arrf_data, all_columns, features_dict): +def _chunk_iterable(seq, chunksize): + + pad_value = '__PADDING__' + + args = [iter(seq)] * chunksize + it = zip_longest(*args, fillvalue=pad_value) + try: + prev = next(it) + except StopIteration: + # Nothing to iterate + return + + # yield everything except the final value + for item in it: + yield prev + prev = item + + # handle final value + if prev[-1] is pad_value: + # uses binary search to find the final index + lo, hi = 0, chunksize + while lo < hi: + mid = (lo + hi) // 2 + if prev[mid] is pad_value: + hi = mid + else: + lo = mid + 1 + yield prev[:lo] + else: + # no padding needed + yield prev + + +def _convert_arff_data_dataframe(arrf, columns, features_dict, nrows): """Convert the ARFF object into a pandas DataFrame. Parameters ---------- - arff_data : list or dict - as obtained from liac-arff object + arrf : dict + As obtained from liac-arff object. - all_columns : list - columns to return + columns : list + Columns to return. features_dict : OrderedDict - map from feature to feature info from openml. This includes - columns that are not ignored. + Maps feature name to feature info from openml. + + nrows : int + Number of rows to read at a time. 
Returns ------- - df : pd.DataFrame + dataframe : pandas DataFrame """ pd = check_pandas_support('fetch_openml with return_frame=True') - df = pd.DataFrame.from_records(arrf_data['data'], - columns=list(features_dict.keys())) - df = df[all_columns].copy() - attributes = dict(arrf_data['attributes']) + attributes = dict(arrf['attributes']) + arrf_columns = list(attributes) + + arrf_data_gen = _chunk_iterable(arrf['data'], nrows) + dfs = [pd.DataFrame(list(data), columns=arrf_columns) + for data in arrf_data_gen] + df = pd.concat(dfs, copy=False) + + columns_to_keep = [col for col in arrf_columns if col in columns] - dtypes = {} - for column in all_columns: + # copy dataframe when there are columns that needs to be removed + if len(columns_to_keep) != len(arrf_columns): + df = df[columns_to_keep].copy() + + for column in columns_to_keep: dtype = _feature_to_dtype(features_dict[column]) - if dtype == object: - continue if dtype == 'category': dtype = pd.CategoricalDtype(attributes[column]) - dtypes[column] = dtype - - return df.astype(dtypes) + df[column] = df[column].astype(dtype, copy=False) + return df def _get_data_info_by_name(name, version, data_home): @@ -493,7 +535,7 @@ def _valid_data_column_names(features_list, target_columns): def fetch_openml(name=None, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, - return_frame=False): + return_frame=False, nrows=10000): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -550,19 +592,27 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, If True, returns a Bunch where the data attribute is a pandas DataFrame. + nrows : int, default=10000 + Number of rows to read at a time when constructing a dataframe. + Only used when ``return_frame`` is True. 
+ Returns ------- data : Bunch Dictionary-like object, with attributes: - data : np.array, scipy.sparse.csr_matrix of floats, or pandas Dataframe + data : np.array, scipy.sparse.csr_matrix of floats, or None The feature matrix. Categorical features are encoded as ordinals. - If ``return_frame`` is True, this is a pandas DataFrame. + If ``return_frame`` is True, this is None. target : np.array or None The regression target or classification labels, if applicable. Dtype is float if numeric, and object if categorical. If ``return_frame`` is True, this is None. + dataframe : pandas DataFrame + The pandas DataFrame that includes the data and the target. + Use ``feature_names`` and ``target_names`` to seperate the target + from the features. If ``return_frame`` is False, this is None. DESCR : str The full description of the dataset feature_names : list @@ -638,8 +688,12 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, if data_description['format'].lower() == 'sparse_arff': return_sparse = True - if return_sparse and return_frame: - raise ValueError('Cannot return dataframe with sparse data') + if return_frame: + if return_sparse: + raise ValueError('Cannot return dataframe with sparse data') + if return_X_y: + raise ValueError('return_X_y=True can not be set when ' + 'return_frame=True') # download data features, meta-info about column types features_list = _get_data_features(data_id, data_home) @@ -710,9 +764,11 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data_description.pop('description')) if return_frame: - all_columns = data_columns + target_column - df = _convert_arff_data_dataframe(arff, all_columns, features_dict) - return Bunch(data=df, target=None, feature_names=data_columns, + columns = data_columns + target_column + df = _convert_arff_data_dataframe(arff, columns, features_dict, nrows) + + return Bunch(dataframe=df, data=None, target=None, + feature_names=data_columns, 
target_names=target_column, DESCR=description, details=data_description, categories=None, url="https://www.openml.org/d/{}".format(data_id)) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 76f368676b598..fcf6c851cf582 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -300,7 +300,7 @@ def test_fetch_openml_iris_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.data + df = bunch.dataframe assert isinstance(df, pd.DataFrame) assert np.all(df.dtypes == expected_dtypes) @@ -323,7 +323,7 @@ def test_fetch_openml_anneal_pandas(monkeypatch): bunch = fetch_openml(data_id=data_id, return_frame=True, target_column=target_column, cache=False) - df = bunch.data + df = bunch.dataframe assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape @@ -356,7 +356,7 @@ def test_fetch_openml_cpu_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.data + df = bunch.dataframe assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape @@ -376,6 +376,17 @@ def test_fetch_openml_australian_pandas_error_sparse(monkeypatch): fetch_openml(data_id=data_id, return_frame=True, cache=False) +def test_fetch_openml_adultcensus_pandas_return_X_y_errors(monkeypatch): + data_id = 1119 + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + + msg = 'return_X_y=True can not be set when return_frame=True' + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=data_id, return_frame=True, cache=False, + return_X_y=True) + + def test_fetch_openml_adultcensus_pandas(monkeypatch): pd = pytest.importorskip('pandas') # Check because of the numeric row attribute (issue #12329) @@ -386,7 +397,7 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): 
_monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.data + df = bunch.dataframe assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape @@ -409,8 +420,7 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - - df = bunch.data + df = bunch.dataframe assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape @@ -435,8 +445,7 @@ def test_fetch_openml_emotions_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False, target_column=target_column) - - df = bunch.data + df = bunch.dataframe assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape @@ -470,9 +479,9 @@ def test_fetch_openml_titanic_pandas(monkeypatch): 'home.dest': object, 'survived': pd.CategoricalDtype(['0', '1']) } - expected_columns = ['pclass', 'name', 'sex', 'age', 'sibsp', + expected_columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', - 'boat', 'body', 'home.dest', 'survived'] + 'boat', 'body', 'home.dest'] expected_dtypes = [name_to_dtype[col] for col in expected_columns] expected_feature_names = ['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', @@ -481,7 +490,7 @@ def test_fetch_openml_titanic_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.data + df = bunch.dataframe assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape From 052491f9cfb4a0bc3af660e81ed04cd1b91cd6cf Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 24 May 2019 13:38:55 -0400 Subject: [PATCH 07/39] STY Fix --- sklearn/datasets/tests/test_openml.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index fcf6c851cf582..9e67cb5e8f503 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -377,7 +377,7 @@ def test_fetch_openml_australian_pandas_error_sparse(monkeypatch): def test_fetch_openml_adultcensus_pandas_return_X_y_errors(monkeypatch): - data_id = 1119 + data_id = 1119 _monkey_patch_webbased_functions(monkeypatch, data_id, True) From e7a6f9c094a73d2019af17ec08782e7baa2d14b5 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 24 May 2019 13:59:15 -0400 Subject: [PATCH 08/39] DOC Adds more comments --- sklearn/datasets/openml.py | 4 +++- sklearn/utils/__init__.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 6745fcf7e44b3..b242310484ade 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -284,6 +284,8 @@ def _feature_to_dtype(feature): def _chunk_iterable(seq, chunksize): + """Chunk ``seq`` into tuples of length ``chunksize``. The last chunk may + have a length less than ``chunksize``.""" pad_value = '__PADDING__' @@ -325,7 +327,7 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, nrows): As obtained from liac-arff object. columns : list - Columns to return. + Columns from dataframe to return. features_dict : OrderedDict Maps feature name to feature info from openml. diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3b94bd85f08d0..b1b246b2b22b5 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -843,6 +843,5 @@ def check_pandas_support(caller_name): return pandas except ImportError as e: raise ImportError( - "{} requires pandas. 
You can install pandas with " - "`pip install pandas`".format(caller_name) + "{} requires pandas.".format(caller_name) ) from e From 95b4153763348f449aedb8556b2f68a9d4225cd2 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 24 May 2019 14:00:00 -0400 Subject: [PATCH 09/39] DOC Fixes example --- examples/compose/plot_column_transformer_mixed_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 9233e298d9fce..87a71e51f822b 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -38,7 +38,7 @@ # Read data from Titanic dataset. titantic = fetch_openml(data_id=40945, return_frame=True) -data = titantic.data +data = titantic.dataframe # We will train our classifier with the following features: # Numeric Features: From 6c8c7097326b39c53791c376db1b1fa8536d4bc7 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 24 May 2019 15:13:09 -0400 Subject: [PATCH 10/39] ENH Uses object types when loading into dataframe --- sklearn/datasets/openml.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index b242310484ade..7272d6072e2ce 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -345,7 +345,7 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, nrows): arrf_columns = list(attributes) arrf_data_gen = _chunk_iterable(arrf['data'], nrows) - dfs = [pd.DataFrame(list(data), columns=arrf_columns) + dfs = [pd.DataFrame(list(data), columns=arrf_columns, dtype=object) for data in arrf_data_gen] df = pd.concat(dfs, copy=False) @@ -594,7 +594,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, If True, returns a Bunch where the data attribute is a pandas DataFrame. 
- nrows : int, default=10000 + nrows : int, default=5000 Number of rows to read at a time when constructing a dataframe. Only used when ``return_frame`` is True. @@ -728,8 +728,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, target_column) # prepare which columns and data types should be returned for the X and y - features_dict = OrderedDict([(feature['name'], feature) - for feature in features_list]) + features_dict = {feature['name']: feature for feature in features_list} # XXX: col_slice_y should be all nominal or all numeric _verify_target_data_type(features_dict, target_column) From 26b03b219b80df27de74c153265f35a766093374 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 24 May 2019 15:40:22 -0400 Subject: [PATCH 11/39] CLN Address comments --- sklearn/datasets/openml.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 7272d6072e2ce..9af5d68cadcaa 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -318,7 +318,7 @@ def _chunk_iterable(seq, chunksize): yield prev -def _convert_arff_data_dataframe(arrf, columns, features_dict, nrows): +def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): """Convert the ARFF object into a pandas DataFrame. Parameters @@ -332,7 +332,7 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, nrows): features_dict : OrderedDict Maps feature name to feature info from openml. - nrows : int + chunksize : int Number of rows to read at a time. 
Returns @@ -344,10 +344,10 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, nrows): attributes = dict(arrf['attributes']) arrf_columns = list(attributes) - arrf_data_gen = _chunk_iterable(arrf['data'], nrows) - dfs = [pd.DataFrame(list(data), columns=arrf_columns, dtype=object) + arrf_data_gen = _chunk_iterable(arrf['data'], chunksize) + dfs = [pd.DataFrame(list(data), columns=arrf_columns) for data in arrf_data_gen] - df = pd.concat(dfs, copy=False) + df = pd.concat(dfs) columns_to_keep = [col for col in arrf_columns if col in columns] @@ -537,7 +537,7 @@ def _valid_data_column_names(features_list, target_columns): def fetch_openml(name=None, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, - return_frame=False, nrows=10000): + return_frame=False, chunksize=5000): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -594,7 +594,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, If True, returns a Bunch where the data attribute is a pandas DataFrame. - nrows : int, default=5000 + chunksize : int, default=5000 Number of rows to read at a time when constructing a dataframe. Only used when ``return_frame`` is True. 
@@ -766,7 +766,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, if return_frame: columns = data_columns + target_column - df = _convert_arff_data_dataframe(arff, columns, features_dict, nrows) + df = _convert_arff_data_dataframe(arff, columns, features_dict, + chunksize) return Bunch(dataframe=df, data=None, target=None, feature_names=data_columns, From f5a60bd7251b1c32f6fadde0392264bf22d6678d Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 24 May 2019 17:31:59 -0400 Subject: [PATCH 12/39] STY Fix --- sklearn/datasets/openml.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 9af5d68cadcaa..58d4ba79a49be 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -8,7 +8,6 @@ from functools import wraps import itertools from collections.abc import Generator -from collections import OrderedDict from itertools import zip_longest from urllib.request import urlopen, Request @@ -329,7 +328,7 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): columns : list Columns from dataframe to return. - features_dict : OrderedDict + features_dict : dict Maps feature name to feature info from openml. 
chunksize : int From 599666fd4c147e686fe6c62e41fc50e847c21baa Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 25 May 2019 16:39:20 -0400 Subject: [PATCH 13/39] TST Fix pandas test --- sklearn/datasets/openml.py | 5 +-- sklearn/datasets/tests/test_openml.py | 44 ++++++++++++++++----------- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 58d4ba79a49be..5b9eea939e0b3 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -9,6 +9,7 @@ import itertools from collections.abc import Generator from itertools import zip_longest +from collections import OrderedDict from urllib.request import urlopen, Request @@ -340,7 +341,7 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): """ pd = check_pandas_support('fetch_openml with return_frame=True') - attributes = dict(arrf['attributes']) + attributes = OrderedDict(arrf['attributes']) arrf_columns = list(attributes) arrf_data_gen = _chunk_iterable(arrf['data'], chunksize) @@ -357,7 +358,7 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): for column in columns_to_keep: dtype = _feature_to_dtype(features_dict[column]) if dtype == 'category': - dtype = pd.CategoricalDtype(attributes[column]) + dtype = pd.api.types.CategoricalDtype(attributes[column]) df[column] = df[column].astype(dtype, copy=False) return df diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 9e67cb5e8f503..990dd08eda2b2 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -286,11 +286,12 @@ def test_feature_to_dtype_error(feature): def test_fetch_openml_iris_pandas(monkeypatch): # classification dataset with numeric only columns pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 61 expected_shape = (150, 5) - cat_dtype = pd.CategoricalDtype(['Iris-setosa', 'Iris-versicolor', - 
'Iris-virginica']) + cat_dtype = CategoricalDtype(['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica']) expected_dtypes = [np.float64] * 4 + [cat_dtype] expected_feature_names = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'] @@ -313,6 +314,8 @@ def test_fetch_openml_iris_pandas(monkeypatch): def test_fetch_openml_anneal_pandas(monkeypatch): # classification dataset with numeric and categorical columns pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype + data_id = 2 target_column = 'class' expected_shape = (11, 39) @@ -328,7 +331,7 @@ def test_fetch_openml_anneal_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, pd.CategoricalDtype)]) + if isinstance(dtype, CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -338,17 +341,18 @@ def test_fetch_openml_anneal_pandas(monkeypatch): def test_fetch_openml_cpu_pandas(monkeypatch): # regression dataset with numeric and categorical columns pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 561 expected_shape = (209, 8) - cat_dtype = pd.CategoricalDtype(['adviser', 'amdahl', 'apollo', 'basf', - 'bti', 'burroughs', 'c.r.d', 'cdc', - 'cambex', 'dec', 'dg', 'formation', - 'four-phase', 'gould', 'hp', 'harris', - 'honeywell', 'ibm', 'ipl', 'magnuson', - 'microdata', 'nas', 'ncr', 'nixdorf', - 'perkin-elmer', 'prime', 'siemens', - 'sperry', 'sratus', 'wang']) + cat_dtype = CategoricalDtype(['adviser', 'amdahl', 'apollo', 'basf', + 'bti', 'burroughs', 'c.r.d', 'cdc', + 'cambex', 'dec', 'dg', 'formation', + 'four-phase', 'gould', 'hp', 'harris', + 'honeywell', 'ibm', 'ipl', 'magnuson', + 'microdata', 'nas', 'ncr', 'nixdorf', + 'perkin-elmer', 'prime', 'siemens', + 'sperry', 'sratus', 'wang']) expected_dtypes = 
[cat_dtype] + [np.float64] * 7 expected_feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX'] @@ -389,6 +393,8 @@ def test_fetch_openml_adultcensus_pandas_return_X_y_errors(monkeypatch): def test_fetch_openml_adultcensus_pandas(monkeypatch): pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype + # Check because of the numeric row attribute (issue #12329) data_id = 1119 expected_shape = (10, 15) @@ -402,7 +408,7 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, pd.CategoricalDtype)]) + if isinstance(dtype, CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -413,6 +419,8 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): # and ignore attributes. Note that data_features json has 82 attributes, # and row id (1), ignore attributes (3) have been removed. 
pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype + data_id = 40966 expected_shape = (7, 78) expected_floats = 77 @@ -425,7 +433,7 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, pd.CategoricalDtype)]) + if isinstance(dtype, CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -434,6 +442,7 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): def test_fetch_openml_emotions_pandas(monkeypatch): # classification dataset with multiple targets (natively) pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 40589 target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm', @@ -450,7 +459,7 @@ def test_fetch_openml_emotions_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, pd.CategoricalDtype)]) + if isinstance(dtype, CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -460,24 +469,25 @@ def test_fetch_openml_emotions_pandas(monkeypatch): def test_fetch_openml_titanic_pandas(monkeypatch): # dataset with strings pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 40945 expected_shape = (1309, 14) name_to_dtype = { 'pclass': np.float64, 'name': object, - 'sex': pd.CategoricalDtype(['female', 'male']), + 'sex': CategoricalDtype(['female', 'male']), 'age': np.float64, 'sibsp': np.float64, 'parch': np.float64, 'ticket': object, 'fare': np.float64, 'cabin': object, - 'embarked': pd.CategoricalDtype(['C', 'Q', 'S']), + 'embarked': CategoricalDtype(['C', 
'Q', 'S']), 'boat': object, 'body': np.float64, 'home.dest': object, - 'survived': pd.CategoricalDtype(['0', '1']) + 'survived': CategoricalDtype(['0', '1']) } expected_columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', From b8011a6c1a2d1300b0ffcb135d228138176ff56b Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 25 May 2019 16:54:20 -0400 Subject: [PATCH 14/39] TST Adds small chunksize for testing --- sklearn/datasets/tests/test_openml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 990dd08eda2b2..c8f54e1b4b031 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -283,7 +283,8 @@ def test_feature_to_dtype_error(feature): _feature_to_dtype(feature) -def test_fetch_openml_iris_pandas(monkeypatch): +@pytest.mark.parametrize('chunksize', [10, 1000]) +def test_fetch_openml_iris_pandas(monkeypatch, chunksize): # classification dataset with numeric only columns pd = pytest.importorskip('pandas') CategoricalDtype = pd.api.types.CategoricalDtype @@ -300,7 +301,8 @@ def test_fetch_openml_iris_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) + bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False, + chunksize=chunksize) df = bunch.dataframe assert isinstance(df, pd.DataFrame) From f71aeb6bf4ea660cedba8c3360e6ca8ec006454a Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 25 May 2019 17:24:15 -0400 Subject: [PATCH 15/39] TST Uses cats directly --- .circleci/config.yml | 2 +- sklearn/datasets/openml.py | 2 +- sklearn/datasets/tests/test_openml.py | 41 +++++++++++---------------- 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f279f577a4641..46f6ca6341204 100644 --- 
a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,7 +10,7 @@ jobs: - PYTHON_VERSION: 3.5 - NUMPY_VERSION: 1.11.0 - SCIPY_VERSION: 0.17.0 - - PANDAS_VERSION: 0.18.0 + - PANDAS_VERSION: 0.21.0 - MATPLOTLIB_VERSION: 1.5.1 - SCIKIT_IMAGE_VERSION: 0.12.3 steps: diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 5b9eea939e0b3..aadc6edd9b430 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -358,7 +358,7 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): for column in columns_to_keep: dtype = _feature_to_dtype(features_dict[column]) if dtype == 'category': - dtype = pd.api.types.CategoricalDtype(attributes[column]) + dtype = pd.CategoricalDtype(attributes[column]) df[column] = df[column].astype(dtype, copy=False) return df diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index c8f54e1b4b031..f841d5205fe09 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -287,12 +287,11 @@ def test_feature_to_dtype_error(feature): def test_fetch_openml_iris_pandas(monkeypatch, chunksize): # classification dataset with numeric only columns pd = pytest.importorskip('pandas') - CategoricalDtype = pd.api.types.CategoricalDtype data_id = 61 expected_shape = (150, 5) - cat_dtype = CategoricalDtype(['Iris-setosa', 'Iris-versicolor', - 'Iris-virginica']) + cat_dtype = pd.CategoricalDtype(['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica']) expected_dtypes = [np.float64] * 4 + [cat_dtype] expected_feature_names = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'] @@ -316,7 +315,6 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): def test_fetch_openml_anneal_pandas(monkeypatch): # classification dataset with numeric and categorical columns pd = pytest.importorskip('pandas') - CategoricalDtype = pd.api.types.CategoricalDtype data_id = 2 target_column = 'class' @@ -333,7 +331,7 @@ def 
test_fetch_openml_anneal_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, CategoricalDtype)]) + if isinstance(dtype, pd.CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -343,18 +341,17 @@ def test_fetch_openml_anneal_pandas(monkeypatch): def test_fetch_openml_cpu_pandas(monkeypatch): # regression dataset with numeric and categorical columns pd = pytest.importorskip('pandas') - CategoricalDtype = pd.api.types.CategoricalDtype data_id = 561 expected_shape = (209, 8) - cat_dtype = CategoricalDtype(['adviser', 'amdahl', 'apollo', 'basf', - 'bti', 'burroughs', 'c.r.d', 'cdc', - 'cambex', 'dec', 'dg', 'formation', - 'four-phase', 'gould', 'hp', 'harris', - 'honeywell', 'ibm', 'ipl', 'magnuson', - 'microdata', 'nas', 'ncr', 'nixdorf', - 'perkin-elmer', 'prime', 'siemens', - 'sperry', 'sratus', 'wang']) + cat_dtype = pd.CategoricalDtype(['adviser', 'amdahl', 'apollo', 'basf', + 'bti', 'burroughs', 'c.r.d', 'cdc', + 'cambex', 'dec', 'dg', 'formation', + 'four-phase', 'gould', 'hp', 'harris', + 'honeywell', 'ibm', 'ipl', 'magnuson', + 'microdata', 'nas', 'ncr', 'nixdorf', + 'perkin-elmer', 'prime', 'siemens', + 'sperry', 'sratus', 'wang']) expected_dtypes = [cat_dtype] + [np.float64] * 7 expected_feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX'] @@ -395,7 +392,6 @@ def test_fetch_openml_adultcensus_pandas_return_X_y_errors(monkeypatch): def test_fetch_openml_adultcensus_pandas(monkeypatch): pd = pytest.importorskip('pandas') - CategoricalDtype = pd.api.types.CategoricalDtype # Check because of the numeric row attribute (issue #12329) data_id = 1119 @@ -410,7 +406,7 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype 
for dtype in df.dtypes - if isinstance(dtype, CategoricalDtype)]) + if isinstance(dtype, pd.CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -421,7 +417,6 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): # and ignore attributes. Note that data_features json has 82 attributes, # and row id (1), ignore attributes (3) have been removed. pd = pytest.importorskip('pandas') - CategoricalDtype = pd.api.types.CategoricalDtype data_id = 40966 expected_shape = (7, 78) @@ -435,7 +430,7 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, CategoricalDtype)]) + if isinstance(dtype, pd.CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -444,7 +439,6 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): def test_fetch_openml_emotions_pandas(monkeypatch): # classification dataset with multiple targets (natively) pd = pytest.importorskip('pandas') - CategoricalDtype = pd.api.types.CategoricalDtype data_id = 40589 target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm', @@ -461,7 +455,7 @@ def test_fetch_openml_emotions_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, CategoricalDtype)]) + if isinstance(dtype, pd.CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -471,25 +465,24 @@ def test_fetch_openml_emotions_pandas(monkeypatch): def test_fetch_openml_titanic_pandas(monkeypatch): # dataset with strings pd = pytest.importorskip('pandas') - 
CategoricalDtype = pd.api.types.CategoricalDtype data_id = 40945 expected_shape = (1309, 14) name_to_dtype = { 'pclass': np.float64, 'name': object, - 'sex': CategoricalDtype(['female', 'male']), + 'sex': pd.CategoricalDtype(['female', 'male']), 'age': np.float64, 'sibsp': np.float64, 'parch': np.float64, 'ticket': object, 'fare': np.float64, 'cabin': object, - 'embarked': CategoricalDtype(['C', 'Q', 'S']), + 'embarked': pd.CategoricalDtype(['C', 'Q', 'S']), 'boat': object, 'body': np.float64, 'home.dest': object, - 'survived': CategoricalDtype(['0', '1']) + 'survived': pd.CategoricalDtype(['0', '1']) } expected_columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', From bd912624dc4c76ac9f6afb2dbabfdfa103dac8c9 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 28 May 2019 11:45:52 -0400 Subject: [PATCH 16/39] ENH Adds support for pandas 0.23 --- .circleci/config.yml | 1 - azure-pipelines.yml | 1 + sklearn/datasets/openml.py | 2 +- sklearn/datasets/tests/test_openml.py | 41 ++++++++++++++++----------- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 46f6ca6341204..a162c05db2a81 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,7 +10,6 @@ jobs: - PYTHON_VERSION: 3.5 - NUMPY_VERSION: 1.11.0 - SCIPY_VERSION: 0.17.0 - - PANDAS_VERSION: 0.21.0 - MATPLOTLIB_VERSION: 1.5.1 - SCIKIT_IMAGE_VERSION: 0.12.3 steps: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c31385dd3e48d..c76a97c6664eb 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -20,6 +20,7 @@ jobs: INSTALL_MKL: 'false' NUMPY_VERSION: '1.11.0' SCIPY_VERSION: '0.17.0' + PANDAS_VERSION: '*' CYTHON_VERSION: '*' PILLOW_VERSION: '4.0.0' MATPLOTLIB_VERSION: '1.5.1' diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index aadc6edd9b430..5b9eea939e0b3 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -358,7 +358,7 @@ 
def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): for column in columns_to_keep: dtype = _feature_to_dtype(features_dict[column]) if dtype == 'category': - dtype = pd.CategoricalDtype(attributes[column]) + dtype = pd.api.types.CategoricalDtype(attributes[column]) df[column] = df[column].astype(dtype, copy=False) return df diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index f841d5205fe09..c8f54e1b4b031 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -287,11 +287,12 @@ def test_feature_to_dtype_error(feature): def test_fetch_openml_iris_pandas(monkeypatch, chunksize): # classification dataset with numeric only columns pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 61 expected_shape = (150, 5) - cat_dtype = pd.CategoricalDtype(['Iris-setosa', 'Iris-versicolor', - 'Iris-virginica']) + cat_dtype = CategoricalDtype(['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica']) expected_dtypes = [np.float64] * 4 + [cat_dtype] expected_feature_names = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'] @@ -315,6 +316,7 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): def test_fetch_openml_anneal_pandas(monkeypatch): # classification dataset with numeric and categorical columns pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 2 target_column = 'class' @@ -331,7 +333,7 @@ def test_fetch_openml_anneal_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, pd.CategoricalDtype)]) + if isinstance(dtype, CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -341,17 +343,18 @@ def test_fetch_openml_anneal_pandas(monkeypatch): def 
test_fetch_openml_cpu_pandas(monkeypatch): # regression dataset with numeric and categorical columns pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 561 expected_shape = (209, 8) - cat_dtype = pd.CategoricalDtype(['adviser', 'amdahl', 'apollo', 'basf', - 'bti', 'burroughs', 'c.r.d', 'cdc', - 'cambex', 'dec', 'dg', 'formation', - 'four-phase', 'gould', 'hp', 'harris', - 'honeywell', 'ibm', 'ipl', 'magnuson', - 'microdata', 'nas', 'ncr', 'nixdorf', - 'perkin-elmer', 'prime', 'siemens', - 'sperry', 'sratus', 'wang']) + cat_dtype = CategoricalDtype(['adviser', 'amdahl', 'apollo', 'basf', + 'bti', 'burroughs', 'c.r.d', 'cdc', + 'cambex', 'dec', 'dg', 'formation', + 'four-phase', 'gould', 'hp', 'harris', + 'honeywell', 'ibm', 'ipl', 'magnuson', + 'microdata', 'nas', 'ncr', 'nixdorf', + 'perkin-elmer', 'prime', 'siemens', + 'sperry', 'sratus', 'wang']) expected_dtypes = [cat_dtype] + [np.float64] * 7 expected_feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX'] @@ -392,6 +395,7 @@ def test_fetch_openml_adultcensus_pandas_return_X_y_errors(monkeypatch): def test_fetch_openml_adultcensus_pandas(monkeypatch): pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype # Check because of the numeric row attribute (issue #12329) data_id = 1119 @@ -406,7 +410,7 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, pd.CategoricalDtype)]) + if isinstance(dtype, CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -417,6 +421,7 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): # and ignore attributes. Note that data_features json has 82 attributes, # and row id (1), ignore attributes (3) have been removed. 
pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 40966 expected_shape = (7, 78) @@ -430,7 +435,7 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, pd.CategoricalDtype)]) + if isinstance(dtype, CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -439,6 +444,7 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): def test_fetch_openml_emotions_pandas(monkeypatch): # classification dataset with multiple targets (natively) pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 40589 target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm', @@ -455,7 +461,7 @@ def test_fetch_openml_emotions_pandas(monkeypatch): assert isinstance(df, pd.DataFrame) assert df.shape == expected_shape n_categories = len([dtype for dtype in df.dtypes - if isinstance(dtype, pd.CategoricalDtype)]) + if isinstance(dtype, CategoricalDtype)]) n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) assert expected_categories == n_categories assert expected_floats == n_floats @@ -465,24 +471,25 @@ def test_fetch_openml_emotions_pandas(monkeypatch): def test_fetch_openml_titanic_pandas(monkeypatch): # dataset with strings pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype data_id = 40945 expected_shape = (1309, 14) name_to_dtype = { 'pclass': np.float64, 'name': object, - 'sex': pd.CategoricalDtype(['female', 'male']), + 'sex': CategoricalDtype(['female', 'male']), 'age': np.float64, 'sibsp': np.float64, 'parch': np.float64, 'ticket': object, 'fare': np.float64, 'cabin': object, - 'embarked': pd.CategoricalDtype(['C', 'Q', 'S']), + 'embarked': CategoricalDtype(['C', 'Q', 'S']), 'boat': 
object, 'body': np.float64, 'home.dest': object, - 'survived': pd.CategoricalDtype(['0', '1']) + 'survived': CategoricalDtype(['0', '1']) } expected_columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', From 22a76ff974346803018af05dc5c75c2d14c63828 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 28 May 2019 12:06:00 -0400 Subject: [PATCH 17/39] DOC Better comments --- sklearn/datasets/openml.py | 61 +++++++++++--------------------------- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 5b9eea939e0b3..ff1a50766bec9 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -8,6 +8,7 @@ from functools import wraps import itertools from collections.abc import Generator +from itertools import islice from itertools import zip_longest from collections import OrderedDict @@ -283,39 +284,15 @@ def _feature_to_dtype(feature): raise ValueError('Unsupported feature: {}'.format(feature)) -def _chunk_iterable(seq, chunksize): - """Chunk ``seq`` into tuples of length ``chunksize``. The last chunk may - have a length less than ``chunksize``.""" - - pad_value = '__PADDING__' - - args = [iter(seq)] * chunksize - it = zip_longest(*args, fillvalue=pad_value) - try: - prev = next(it) - except StopIteration: - # Nothing to iterate - return - - # yield everything except the final value - for item in it: - yield prev - prev = item - - # handle final value - if prev[-1] is pad_value: - # uses binary search to find the final index - lo, hi = 0, chunksize - while lo < hi: - mid = (lo + hi) // 2 - if prev[mid] is pad_value: - hi = mid - else: - lo = mid + 1 - yield prev[:lo] - else: - # no padding needed - yield prev +def _chunk_generator(gen, chunksize): + """Chunk generator, ``gen`` into tuples of length ``chunksize``. 
The last + chunk may have a length less than ``chunksize``.""" + while True: + chunk = tuple(islice(gen, chunksize)) + if chunk: + yield chunk + else: + return def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): @@ -344,7 +321,7 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): attributes = OrderedDict(arrf['attributes']) arrf_columns = list(attributes) - arrf_data_gen = _chunk_iterable(arrf['data'], chunksize) + arrf_data_gen = _chunk_generator(arrf['data'], chunksize) dfs = [pd.DataFrame(list(data), columns=arrf_columns) for data in arrf_data_gen] df = pd.concat(dfs) @@ -595,7 +572,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, DataFrame. chunksize : int, default=5000 - Number of rows to read at a time when constructing a dataframe. + Number of rows of arrf file to read at a time. Higher values leads to + more memory usage. Only used when ``return_frame`` is True. Returns @@ -604,17 +582,12 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data : Bunch Dictionary-like object, with attributes: - data : np.array, scipy.sparse.csr_matrix of floats, or None + data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame The feature matrix. Categorical features are encoded as ordinals. - If ``return_frame`` is True, this is None. - target : np.array or None + target : np.array, pandas Series or DataFrame The regression target or classification labels, if applicable. - Dtype is float if numeric, and object if categorical. - If ``return_frame`` is True, this is None. - dataframe : pandas DataFrame - The pandas DataFrame that includes the data and the target. - Use ``feature_names`` and ``target_names`` to seperate the target - from the features. If ``return_frame`` is False, this is None. + Dtype is float if numeric, and object if categorical. If + ``return_frame`` is True, ``target`` is a pandas object. 
DESCR : str The full description of the dataset feature_names : list From 1712492866ef3abfe8ed201bd5dd0ca1c72265dd Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 28 May 2019 12:40:18 -0400 Subject: [PATCH 18/39] CLN Minor --- sklearn/datasets/openml.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index ff1a50766bec9..dab5a9556f081 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -9,7 +9,6 @@ import itertools from collections.abc import Generator from itertools import islice -from itertools import zip_longest from collections import OrderedDict from urllib.request import urlopen, Request @@ -285,10 +284,10 @@ def _feature_to_dtype(feature): def _chunk_generator(gen, chunksize): - """Chunk generator, ``gen`` into tuples of length ``chunksize``. The last + """Chunk generator, ``gen`` into lists of length ``chunksize``. The last chunk may have a length less than ``chunksize``.""" while True: - chunk = tuple(islice(gen, chunksize)) + chunk = list(islice(gen, chunksize)) if chunk: yield chunk else: @@ -322,7 +321,7 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): arrf_columns = list(attributes) arrf_data_gen = _chunk_generator(arrf['data'], chunksize) - dfs = [pd.DataFrame(list(data), columns=arrf_columns) + dfs = [pd.DataFrame(data, columns=arrf_columns) for data in arrf_data_gen] df = pd.concat(dfs) From 58c5c2db7edd85a5fac6285846333c61919c5f84 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 29 May 2019 21:20:02 -0400 Subject: [PATCH 19/39] WIP --- sklearn/datasets/openml.py | 30 ++- sklearn/datasets/tests/test_openml.py | 319 ++++++++++++++++++-------- 2 files changed, 234 insertions(+), 115 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index dab5a9556f081..cbf738410d6af 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -315,7 +315,7 @@ def 
_convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): ------- dataframe : pandas DataFrame """ - pd = check_pandas_support('fetch_openml with return_frame=True') + pd = check_pandas_support('fetch_openml with as_frame=True') attributes = OrderedDict(arrf['attributes']) arrf_columns = list(attributes) @@ -513,7 +513,7 @@ def _valid_data_column_names(features_list, target_columns): def fetch_openml(name=None, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, - return_frame=False, chunksize=5000): + as_frame=False, chunksize=5000): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -566,14 +566,14 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, If True, returns ``(data, target)`` instead of a Bunch object. See below for more information about the `data` and `target` objects. - return_frame : boolean, default=False + as_frame : boolean, default=False If True, returns a Bunch where the data attribute is a pandas DataFrame. chunksize : int, default=5000 Number of rows of arrf file to read at a time. Higher values leads to more memory usage. - Only used when ``return_frame`` is True. + Only used when ``as_frame`` is True. Returns ------- @@ -586,16 +586,14 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, target : np.array, pandas Series or DataFrame The regression target or classification labels, if applicable. Dtype is float if numeric, and object if categorical. If - ``return_frame`` is True, ``target`` is a pandas object. + ``as_frame`` is True, ``target`` is a pandas object. DESCR : str The full description of the dataset feature_names : list The names of the dataset columns - target_names : list - The names of the target columns categories : dict or None Maps each categorical feature name to a list of values, such - that the value encoded as i is ith in the list. 
If ``return_frame`` + that the value encoded as i is ith in the list. If ``as_frame`` is True, this is None. details : dict More metadata from OpenML @@ -662,24 +660,24 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, if data_description['format'].lower() == 'sparse_arff': return_sparse = True - if return_frame: + if as_frame: if return_sparse: raise ValueError('Cannot return dataframe with sparse data') if return_X_y: raise ValueError('return_X_y=True can not be set when ' - 'return_frame=True') + 'as_frame=True') # download data features, meta-info about column types features_list = _get_data_features(data_id, data_home) - if not return_frame: + if not as_frame: for feature in features_list: if 'true' in (feature['is_ignore'], feature['is_row_identifier']): continue if feature['data_type'] == 'string': raise ValueError('STRING attributes are not supported for ' 'arrays as a return value. Try ' - 'return_frame=True') + 'as_frame=True') if target_column == "default-target": # determines the default target based on the data feature results @@ -731,19 +729,18 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, # obtain the data arff = _download_data_arff(data_description['file_id'], return_sparse, - data_home, encode_nominal=not return_frame) + data_home, encode_nominal=not as_frame) description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) - if return_frame: + if as_frame: columns = data_columns + target_column df = _convert_arff_data_dataframe(arff, columns, features_dict, chunksize) return Bunch(dataframe=df, data=None, target=None, - feature_names=data_columns, - target_names=target_column, DESCR=description, + feature_names=data_columns, DESCR=description, details=data_description, categories=None, url="https://www.openml.org/d/{}".format(data_id)) @@ -782,7 +779,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, bunch = Bunch( data=X, 
target=y, feature_names=data_columns, - target_names=target_column, DESCR=description, details=data_description, categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index c8f54e1b4b031..07549c71f60f2 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -289,28 +289,80 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): pd = pytest.importorskip('pandas') CategoricalDtype = pd.api.types.CategoricalDtype data_id = 61 - expected_shape = (150, 5) + data_shape = (150, 4) + target_shape = (150, ) + frame_shape = (150, 5) - cat_dtype = CategoricalDtype(['Iris-setosa', 'Iris-versicolor', - 'Iris-virginica']) - expected_dtypes = [np.float64] * 4 + [cat_dtype] - expected_feature_names = ['sepallength', 'sepalwidth', 'petallength', - 'petalwidth'] - expected_target_names = ['class'] - expected_columns = expected_feature_names + expected_target_names + target_dtype = CategoricalDtype(['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica']) + data_dtypes = [np.float64] * 4 + data_names = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'] + target_names = 'class' + columns = data_names + [target_names] _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False, + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, chunksize=chunksize) - df = bunch.dataframe + data = bunch.data + target = bunch.target + frame = bunch.frame - assert isinstance(df, pd.DataFrame) - assert np.all(df.dtypes == expected_dtypes) - assert df.shape == expected_shape - assert np.all(df.columns == expected_columns) - assert np.all(bunch.feature_names == expected_feature_names) - assert np.all(bunch.target_names == expected_target_names) + assert isinstance(data, pd.DataFrame) + assert np.all(data.dtypes == data_dtypes) + assert data.shape == 
data_shape + assert np.all(data.columns == columns) + assert np.all(bunch.feature_names == data_names) + + assert isinstance(target, pd.Series) + assert target.dtype == target_dtype + assert target.shape == target_shape + + assert isinstance(frame, pd.DataFrame) + assert frame.shape == frame_shape + assert np.all(frame.dtype == data_dtypes + [target_dtype]) + + +@pytest.mark.parametrize('chunksize', [10, 1000]) +def test_fetch_openml_iris_multitarget_pandas(monkeypatch, chunksize): + # classification dataset with numeric only columns + pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype + data_id = 61 + data_shape = (150, 3) + target_shape = (150, 2) + frame_shape = (150, 5) + target_column = ['petalwidth', 'petallength'] + + target_dtype = [CategoricalDtype(['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica']), np.float64] + data_dtypes = [np.float64] * 3 + data_names = ['sepallength', 'sepalwidth', 'class'] + target_names = ['petalwidth', 'petallength'] + columns = data_names + target_names + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, + chunksize=chunksize, target_column=target_column) + data = bunch.data + target = bunch.target + frame = bunch.frame + + assert isinstance(data, pd.DataFrame) + assert np.all(data.dtypes == data_dtypes) + assert data.shape == data_shape + assert np.all(data.columns == columns) + assert np.all(bunch.feature_names == data_names) + + assert isinstance(target, pd.DataFrame) + assert np.all(target.dtypes == target_dtype) + assert target.shape == target_shape + assert np.all(target.columns == target_column) + + assert isinstance(frame, pd.DataFrame) + assert frame.shape == frame_shape + assert np.all(frame.dtype == data_dtypes + target_dtype) def test_fetch_openml_anneal_pandas(monkeypatch): @@ -320,24 +372,34 @@ def test_fetch_openml_anneal_pandas(monkeypatch): data_id = 2 target_column = 'class' - expected_shape 
= (11, 39) - expected_categories = 33 - expected_floats = 6 + data_shape = (11, 38) + target_shape = (11,) + frame_shape = (11, 39) + expected_data_categories = 32 + expected_data_floats = 6 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, return_frame=True, + bunch = fetch_openml(data_id=data_id, as_frame=True, target_column=target_column, cache=False) - df = bunch.dataframe + data = bunch.data + target = bunch.target + frame = bunch.frame - assert isinstance(df, pd.DataFrame) - assert df.shape == expected_shape - n_categories = len([dtype for dtype in df.dtypes + assert isinstance(data, pd.DataFrame) + assert data.shape == data_shape + n_categories = len([dtype for dtype in data.dtypes if isinstance(dtype, CategoricalDtype)]) - n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) - assert expected_categories == n_categories - assert expected_floats == n_floats - assert np.all(bunch.target_names == [target_column]) + n_floats = len([dtype for dtype in data.dtypes if dtype.kind == 'f']) + assert expected_data_categories == n_categories + assert expected_data_floats == n_floats + + assert isinstance(target, pd.Series) + assert target.shape == target_shape + assert isinstance(target.dtype, CategoricalDtype) + + assert isinstance(frame, pd.DataFrame) + assert frame.shape == frame_shape def test_fetch_openml_cpu_pandas(monkeypatch): @@ -345,7 +407,9 @@ def test_fetch_openml_cpu_pandas(monkeypatch): pd = pytest.importorskip('pandas') CategoricalDtype = pd.api.types.CategoricalDtype data_id = 561 - expected_shape = (209, 8) + data_shape = (209, 7) + target_shape = (209, ) + frame_shape = (209, 8) cat_dtype = CategoricalDtype(['adviser', 'amdahl', 'apollo', 'basf', 'bti', 'burroughs', 'c.r.d', 'cdc', @@ -355,21 +419,30 @@ def test_fetch_openml_cpu_pandas(monkeypatch): 'microdata', 'nas', 'ncr', 'nixdorf', 'perkin-elmer', 'prime', 'siemens', 'sperry', 'sratus', 'wang']) - expected_dtypes = [cat_dtype] + 
[np.float64] * 7 - expected_feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', - 'CHMIN', 'CHMAX'] - expected_target_names = ['class'] + data_dtypes = [cat_dtype] + [np.float64] * 6 + feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', + 'CHMIN', 'CHMAX'] + target_name = 'class' _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.dataframe + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) + data = bunch.data + target = bunch.target + frame = bunch.frame - assert isinstance(df, pd.DataFrame) - assert df.shape == expected_shape - assert np.all(df.dtypes == expected_dtypes) - assert np.all(df.columns == expected_feature_names + expected_target_names) - assert np.all(bunch.feature_names == expected_feature_names) - assert np.all(bunch.target_names == expected_target_names) + assert isinstance(data, pd.DataFrame) + assert data.shape == data_shape + assert np.all(data.dtypes == data_dtypes) + assert np.all(data.columns == feature_names) + assert np.all(bunch.feature_names == feature_names) + + assert isinstance(target, pd.Series) + assert target.shape == target_shape + assert target.dtype == np.float64 + assert target.name == target_name + + assert isinstance(frame, pd.DataFrame) + assert frame.shape == frame_shape def test_fetch_openml_australian_pandas_error_sparse(monkeypatch): @@ -379,7 +452,7 @@ def test_fetch_openml_australian_pandas_error_sparse(monkeypatch): msg = 'Cannot return dataframe with sparse data' with pytest.raises(ValueError, match=msg): - fetch_openml(data_id=data_id, return_frame=True, cache=False) + fetch_openml(data_id=data_id, as_frame=True, cache=False) def test_fetch_openml_adultcensus_pandas_return_X_y_errors(monkeypatch): @@ -387,9 +460,9 @@ def test_fetch_openml_adultcensus_pandas_return_X_y_errors(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) - msg = 'return_X_y=True can not be set when 
return_frame=True' + msg = 'return_X_y=True can not be set when as_frame=True' with pytest.raises(ValueError, match=msg): - fetch_openml(data_id=data_id, return_frame=True, cache=False, + fetch_openml(data_id=data_id, as_frame=True, cache=False, return_X_y=True) @@ -399,21 +472,34 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): # Check because of the numeric row attribute (issue #12329) data_id = 1119 - expected_shape = (10, 15) - expected_categories = 9 - expected_floats = 6 + data_shape = (10, 14) + target_shape = (10, ) + frame_shape = (10, 15) - _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.dataframe + expected_data_categories = 8 + expected_data_floats = 6 + target_column = 'class' - assert isinstance(df, pd.DataFrame) - assert df.shape == expected_shape - n_categories = len([dtype for dtype in df.dtypes + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) + data = bunch.data + target = bunch.target + frame = bunch.frame + + assert isinstance(data, pd.DataFrame) + assert data.shape == data_shape + n_categories = len([dtype for dtype in data.dtypes if isinstance(dtype, CategoricalDtype)]) - n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) - assert expected_categories == n_categories - assert expected_floats == n_floats + n_floats = len([dtype for dtype in data.dtypes if dtype.kind == 'f']) + assert expected_data_categories == n_categories + assert expected_data_floats == n_floats + + assert isinstance(target, pd.Series) + assert target.shape == target_shape + assert target.name == target_column + + assert isinstance(frame, pd.DataFrame) + assert frame.shape == frame_shape def test_fetch_openml_miceprotein_pandas(monkeypatch): @@ -424,21 +510,36 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): CategoricalDtype = pd.api.types.CategoricalDtype data_id 
= 40966 - expected_shape = (7, 78) - expected_floats = 77 - expected_categories = 1 + data_shape = (7, 77) + target_shape = (7, ) + frame_shape = (7, 78) - _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.dataframe + target_column = 'class' + frame_n_categories = 1 + frame_n_floats = 77 - assert isinstance(df, pd.DataFrame) - assert df.shape == expected_shape - n_categories = len([dtype for dtype in df.dtypes + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) + data = bunch.data + target = bunch.target + frame = bunch.frame + + assert isinstance(data, pd.DataFrame) + assert data.shape == data_shape + assert np.all(data.dtypes == np.float64) + + assert isinstance(target, pd.Series) + assert isinstance(target.dtype, CategoricalDtype) + assert target.shape == target_shape + assert target.name == target_column + + assert isinstance(frame, pd.DataFrame) + assert frame.shape == frame_shape + n_categories = len([dtype for dtype in data.dtypes if isinstance(dtype, CategoricalDtype)]) - n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) - assert expected_categories == n_categories - assert expected_floats == n_floats + n_floats = len([dtype for dtype in data.dtypes if dtype.kind == 'f']) + assert frame_n_categories == n_categories + assert frame_n_floats == n_floats def test_fetch_openml_emotions_pandas(monkeypatch): @@ -449,23 +550,34 @@ def test_fetch_openml_emotions_pandas(monkeypatch): data_id = 40589 target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm', 'quiet.still', 'sad.lonely', 'angry.aggresive'] - expected_shape = (13, 78) - expected_categories = 6 - expected_floats = 72 + data_shape = (13, 72) + target_shape = (13, 6) + frame_shape = (13, 78) + + expected_frame_categories = 6 + expected_frame_floats = 72 _monkey_patch_webbased_functions(monkeypatch, 
data_id, True) - bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False, + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, target_column=target_column) - df = bunch.dataframe + data = bunch.data + target = bunch.target + frame = bunch.frame + + assert isinstance(data, pd.DataFrame) + assert data.shape == data_shape - assert isinstance(df, pd.DataFrame) - assert df.shape == expected_shape - n_categories = len([dtype for dtype in df.dtypes + assert isinstance(target, pd.DataFrame) + assert target.shape == target_shape + assert np.all(target.columns == target_column) + + assert isinstance(frame, pd.DataFrame) + assert frame.shape == frame_shape + n_categories = len([dtype for dtype in frame.dtypes if isinstance(dtype, CategoricalDtype)]) - n_floats = len([dtype for dtype in df.dtypes if dtype.kind == 'f']) - assert expected_categories == n_categories - assert expected_floats == n_floats - assert np.all(bunch.target_names == target_column) + n_floats = len([dtype for dtype in frame.dtypes if dtype.kind == 'f']) + assert expected_frame_categories == n_categories + assert expected_frame_floats == n_floats def test_fetch_openml_titanic_pandas(monkeypatch): @@ -474,7 +586,9 @@ def test_fetch_openml_titanic_pandas(monkeypatch): CategoricalDtype = pd.api.types.CategoricalDtype data_id = 40945 - expected_shape = (1309, 14) + data_shape = (1309, 13) + target_shape = (1309, ) + frame_shape = (1309, 14) name_to_dtype = { 'pclass': np.float64, 'name': object, @@ -491,25 +605,34 @@ def test_fetch_openml_titanic_pandas(monkeypatch): 'home.dest': object, 'survived': CategoricalDtype(['0', '1']) } - expected_columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', - 'parch', 'ticket', 'fare', 'cabin', 'embarked', - 'boat', 'body', 'home.dest'] - expected_dtypes = [name_to_dtype[col] for col in expected_columns] - expected_feature_names = ['pclass', 'name', 'sex', 'age', 'sibsp', - 'parch', 'ticket', 'fare', 'cabin', 'embarked', - 'boat', 'body', 
'home.dest'] - expected_target_names = ['survived'] + + frame_columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', + 'parch', 'ticket', 'fare', 'cabin', 'embarked', + 'boat', 'body', 'home.dest'] + frame_dtypes = [name_to_dtype[col] for col in frame_columns] + feature_names = ['pclass', 'name', 'sex', 'age', 'sibsp', + 'parch', 'ticket', 'fare', 'cabin', 'embarked', + 'boat', 'body', 'home.dest'] + target_name = 'survived' _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, return_frame=True, cache=False) - df = bunch.dataframe + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) + data = bunch.data + target = bunch.target + frame = bunch.frame + + assert isinstance(data, pd.DataFrame) + assert data.shape == data_shape + assert np.all(data.columns == feature_names) + + assert isinstance(target, pd.Series) + assert target.shape == target_shape + assert target.name == target_name + assert target.dtype == name_to_dtype[target_name] - assert isinstance(df, pd.DataFrame) - assert df.shape == expected_shape - assert np.all(df.dtypes == expected_dtypes) - assert np.all(df.columns == expected_columns) - assert np.all(bunch.feature_names == expected_feature_names) - assert np.all(bunch.target_names == expected_target_names) + assert isinstance(frame, pd.DataFrame) + assert frame.shape == frame_shape + assert np.all(data.dtypes == frame_dtypes) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -924,7 +1047,7 @@ def test_string_attribute_without_dataframe(monkeypatch, gzip_response): # single column test assert_raise_message(ValueError, ('STRING attributes are not supported for arrays as ' - 'a return value. Try return_frame=True'), + 'a return value. 
Try as_frame=True'), fetch_openml, data_id=data_id, cache=False) From 8780000ab0e903c093ad44f03944c0113610b821 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 4 Jun 2019 14:53:04 -0400 Subject: [PATCH 20/39] ENH Return data and target --- sklearn/datasets/openml.py | 10 ++++++++-- sklearn/datasets/tests/test_openml.py | 27 +++++++++++++-------------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index cbf738410d6af..65e0a0bdf6a6d 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -738,8 +738,14 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, columns = data_columns + target_column df = _convert_arff_data_dataframe(arff, columns, features_dict, chunksize) - - return Bunch(dataframe=df, data=None, target=None, + X = df[data_columns] + if len(target_column) >= 2: + y = df[target_column] + elif len(target_column) == 1: + y = df[target_column[0]] + else: + y = None + return Bunch(frame=df, data=X, target=y, feature_names=data_columns, DESCR=description, details=data_description, categories=None, url="https://www.openml.org/d/{}".format(data_id)) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 07549c71f60f2..f13a223e1033c 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -298,7 +298,6 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): data_dtypes = [np.float64] * 4 data_names = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'] target_names = 'class' - columns = data_names + [target_names] _monkey_patch_webbased_functions(monkeypatch, data_id, True) @@ -311,7 +310,7 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): assert isinstance(data, pd.DataFrame) assert np.all(data.dtypes == data_dtypes) assert data.shape == data_shape - assert np.all(data.columns == columns) + assert np.all(data.columns == data_names) 
assert np.all(bunch.feature_names == data_names) assert isinstance(target, pd.Series) @@ -320,7 +319,7 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): assert isinstance(frame, pd.DataFrame) assert frame.shape == frame_shape - assert np.all(frame.dtype == data_dtypes + [target_dtype]) + assert np.all(frame.dtypes == data_dtypes + [target_dtype]) @pytest.mark.parametrize('chunksize', [10, 1000]) @@ -334,12 +333,12 @@ def test_fetch_openml_iris_multitarget_pandas(monkeypatch, chunksize): frame_shape = (150, 5) target_column = ['petalwidth', 'petallength'] - target_dtype = [CategoricalDtype(['Iris-setosa', 'Iris-versicolor', - 'Iris-virginica']), np.float64] - data_dtypes = [np.float64] * 3 + cat_dtype = CategoricalDtype(['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica']) + data_dtypes = [np.float64, np.float64] + [cat_dtype] data_names = ['sepallength', 'sepalwidth', 'class'] + target_dtypes = [np.float64, np.float64] target_names = ['petalwidth', 'petallength'] - columns = data_names + target_names _monkey_patch_webbased_functions(monkeypatch, data_id, True) @@ -352,17 +351,17 @@ def test_fetch_openml_iris_multitarget_pandas(monkeypatch, chunksize): assert isinstance(data, pd.DataFrame) assert np.all(data.dtypes == data_dtypes) assert data.shape == data_shape - assert np.all(data.columns == columns) + assert np.all(data.columns == data_names) assert np.all(bunch.feature_names == data_names) assert isinstance(target, pd.DataFrame) - assert np.all(target.dtypes == target_dtype) + assert np.all(target.dtypes == target_dtypes) assert target.shape == target_shape - assert np.all(target.columns == target_column) + assert np.all(target.columns == target_names) assert isinstance(frame, pd.DataFrame) assert frame.shape == frame_shape - assert np.all(frame.dtype == data_dtypes + target_dtype) + assert np.all(frame.dtypes == [np.float64] * 4 + [cat_dtype]) def test_fetch_openml_anneal_pandas(monkeypatch): @@ -535,9 +534,9 @@ def 
test_fetch_openml_miceprotein_pandas(monkeypatch): assert isinstance(frame, pd.DataFrame) assert frame.shape == frame_shape - n_categories = len([dtype for dtype in data.dtypes + n_categories = len([dtype for dtype in frame.dtypes if isinstance(dtype, CategoricalDtype)]) - n_floats = len([dtype for dtype in data.dtypes if dtype.kind == 'f']) + n_floats = len([dtype for dtype in frame.dtypes if dtype.kind == 'f']) assert frame_n_categories == n_categories assert frame_n_floats == n_floats @@ -632,7 +631,7 @@ def test_fetch_openml_titanic_pandas(monkeypatch): assert isinstance(frame, pd.DataFrame) assert frame.shape == frame_shape - assert np.all(data.dtypes == frame_dtypes) + assert np.all(frame.dtypes == frame_dtypes) @pytest.mark.parametrize('gzip_response', [True, False]) From a7519cd30a291e518892093209406c530337f532 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 4 Jun 2019 14:57:23 -0400 Subject: [PATCH 21/39] ENH Adds support for return_X_y --- sklearn/datasets/openml.py | 5 ++--- sklearn/datasets/tests/test_openml.py | 28 ++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 65e0a0bdf6a6d..0699d040bc241 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -663,9 +663,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, if as_frame: if return_sparse: raise ValueError('Cannot return dataframe with sparse data') - if return_X_y: - raise ValueError('return_X_y=True can not be set when ' - 'as_frame=True') # download data features, meta-info about column types features_list = _get_data_features(data_id, data_home) @@ -745,6 +742,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, y = df[target_column[0]] else: y = None + if return_X_y: + return X, y return Bunch(frame=df, data=X, target=y, feature_names=data_columns, DESCR=description, details=data_description, categories=None, diff 
--git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index f13a223e1033c..4760bfc7ef336 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -454,15 +454,33 @@ def test_fetch_openml_australian_pandas_error_sparse(monkeypatch): fetch_openml(data_id=data_id, as_frame=True, cache=False) -def test_fetch_openml_adultcensus_pandas_return_X_y_errors(monkeypatch): +def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): + pd = pytest.importorskip('pandas') + CategoricalDtype = pd.api.types.CategoricalDtype + + # Check because of the numeric row attribute (issue #12329) data_id = 1119 + data_shape = (10, 14) + target_shape = (10, ) + + expected_data_categories = 8 + expected_data_floats = 6 + target_column = 'class' _monkey_patch_webbased_functions(monkeypatch, data_id, True) + X, y = fetch_openml(data_id=data_id, as_frame=True, cache=False, + return_X_y=True) + assert isinstance(X, pd.DataFrame) + assert X.shape == data_shape + n_categories = len([dtype for dtype in X.dtypes + if isinstance(dtype, CategoricalDtype)]) + n_floats = len([dtype for dtype in X.dtypes if dtype.kind == 'f']) + assert expected_data_categories == n_categories + assert expected_data_floats == n_floats - msg = 'return_X_y=True can not be set when as_frame=True' - with pytest.raises(ValueError, match=msg): - fetch_openml(data_id=data_id, as_frame=True, cache=False, - return_X_y=True) + assert isinstance(y, pd.Series) + assert y.shape == target_shape + assert y.name == target_column def test_fetch_openml_adultcensus_pandas(monkeypatch): From 5396d8da6ac568cac27aedd9758cd7d98c3e9ae5 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 4 Jun 2019 14:58:28 -0400 Subject: [PATCH 22/39] ENH Update example --- examples/compose/plot_column_transformer_mixed_types.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py 
b/examples/compose/plot_column_transformer_mixed_types.py index 87a71e51f822b..162d31832417c 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -37,8 +37,9 @@ np.random.seed(0) # Read data from Titanic dataset. -titantic = fetch_openml(data_id=40945, return_frame=True) -data = titantic.dataframe +titantic = fetch_openml(data_id=40945, as_frame=True) +X = titantic.data +y = titantic.target # We will train our classifier with the following features: # Numeric Features: @@ -70,9 +71,6 @@ clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())]) -X = data.drop('survived', axis=1) -y = data['survived'] - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) clf.fit(X_train, y_train) From b33fcf9c27e4201902711f51baac2b881781de35 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 4 Jun 2019 14:59:44 -0400 Subject: [PATCH 23/39] STY Lint --- sklearn/datasets/tests/test_openml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 4760bfc7ef336..b1796ac2e4f25 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -297,7 +297,7 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): 'Iris-virginica']) data_dtypes = [np.float64] * 4 data_names = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'] - target_names = 'class' + target_name = 'class' _monkey_patch_webbased_functions(monkeypatch, data_id, True) @@ -316,6 +316,7 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): assert isinstance(target, pd.Series) assert target.dtype == target_dtype assert target.shape == target_shape + assert target.name == target_name assert isinstance(frame, pd.DataFrame) assert frame.shape == frame_shape From 706a2545e6fd46f26ad1356e718c83eac8003b79 Mon Sep 17 00:00:00 2001 From: Thomas Fan 
Date: Tue, 4 Jun 2019 15:41:18 -0400 Subject: [PATCH 24/39] BUG Removes target_column --- sklearn/datasets/tests/test_openml.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index b1796ac2e4f25..e945920a5801a 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -94,12 +94,10 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, if isinstance(target_column, str): # single target, so target is vector assert data_by_id.target.shape == (expected_observations, ) - assert data_by_id.target_names[0] == target_column elif isinstance(target_column, list): # multi target, so target is array assert data_by_id.target.shape == (expected_observations, len(target_column)) - assert np.all(data_by_id.target_names == target_column) assert data_by_id.data.dtype == np.float64 assert data_by_id.target.dtype == expected_target_dtype assert len(data_by_id.feature_names) == expected_features From 7a7de894e6754f3f9b71b7c9fe4e01fd64433e1b Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 4 Jun 2019 21:59:20 -0400 Subject: [PATCH 25/39] BLD Trigger CI From 9568d4cbfc310f837c3000227a985de0621a7537 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 5 Jun 2019 10:52:10 -0400 Subject: [PATCH 26/39] ENH Uses working_memory to calculate chunksize --- sklearn/datasets/openml.py | 40 ++++++++++++++++----------- sklearn/datasets/tests/test_openml.py | 25 +++++++++++------ 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 0699d040bc241..9b93a9dd32e40 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -10,6 +10,7 @@ from collections.abc import Generator from itertools import islice from collections import OrderedDict +import warnings from urllib.request import urlopen, Request @@ -20,6 +21,7 @@ from .base import get_data_home from urllib.error import HTTPError
from ..utils import Bunch +from .. import get_config from ..utils import check_pandas_support # noqa __all__ = ['fetch_openml'] @@ -294,7 +296,7 @@ def _chunk_generator(gen, chunksize): return -def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): +def _convert_arff_data_dataframe(arrf, columns, features_dict): """Convert the ARFF object into a pandas DataFrame. Parameters @@ -308,9 +310,6 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): features_dict : dict Maps feature name to feature info from openml. - chunksize : int - Number of rows to read at a time. - Returns ------- dataframe : pandas DataFrame @@ -320,9 +319,25 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict, chunksize): attributes = OrderedDict(arrf['attributes']) arrf_columns = list(attributes) - arrf_data_gen = _chunk_generator(arrf['data'], chunksize) - dfs = [pd.DataFrame(data, columns=arrf_columns) - for data in arrf_data_gen] + # calculate chunksize + working_memory = get_config()['working_memory'] + first_row = next(arrf['data']) + first_df = pd.DataFrame([first_row], columns=arrf_columns) + + row_bytes = first_df.memory_usage(deep=True).sum() + chunksize = int(working_memory * (2 ** 20) // row_bytes) + + if chunksize < 1: + warnings.warn('Could not adhere to working_memory config. ' + 'Currently %.0fMiB, %.0fMiB required.' % + (working_memory, np.ceil(row_bytes * 2 ** -20))) + chunksize = 1 + + # read arrf data with chunks + dfs = [] + dfs.append(first_df) + for data in _chunk_generator(arrf['data'], chunksize): + dfs.append(pd.DataFrame(data, columns=arrf_columns)) df = pd.concat(dfs) columns_to_keep = [col for col in arrf_columns if col in columns] @@ -570,11 +585,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, If True, returns a Bunch where the data attribute is a pandas DataFrame. - chunksize : int, default=5000 - Number of rows of arrf file to read at a time. 
Higher values leads to - more memory usage. - Only used when ``as_frame`` is True. - Returns ------- @@ -673,8 +683,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, continue if feature['data_type'] == 'string': raise ValueError('STRING attributes are not supported for ' - 'arrays as a return value. Try ' - 'as_frame=True') + 'array representation. Try as_frame=True') if target_column == "default-target": # determines the default target based on the data feature results @@ -733,8 +742,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, if as_frame: columns = data_columns + target_column - df = _convert_arff_data_dataframe(arff, columns, features_dict, - chunksize) + df = _convert_arff_data_dataframe(arff, columns, features_dict) X = df[data_columns] if len(target_column) >= 2: y = df[target_column] diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index e945920a5801a..4e8db429e5d64 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -9,6 +9,7 @@ import sklearn import pytest +from sklearn import config_context from sklearn.datasets import fetch_openml from sklearn.datasets.openml import (_open_openml_url, _get_data_description_by_id, @@ -281,8 +282,7 @@ def test_feature_to_dtype_error(feature): _feature_to_dtype(feature) -@pytest.mark.parametrize('chunksize', [10, 1000]) -def test_fetch_openml_iris_pandas(monkeypatch, chunksize): +def test_fetch_openml_iris_pandas(monkeypatch): # classification dataset with numeric only columns pd = pytest.importorskip('pandas') CategoricalDtype = pd.api.types.CategoricalDtype @@ -299,8 +299,7 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - chunksize=chunksize) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) data = bunch.data 
target = bunch.target frame = bunch.frame @@ -321,8 +320,7 @@ def test_fetch_openml_iris_pandas(monkeypatch, chunksize): assert np.all(frame.dtypes == data_dtypes + [target_dtype]) -@pytest.mark.parametrize('chunksize', [10, 1000]) -def test_fetch_openml_iris_multitarget_pandas(monkeypatch, chunksize): +def test_fetch_openml_iris_multitarget_pandas(monkeypatch): # classification dataset with numeric only columns pd = pytest.importorskip('pandas') CategoricalDtype = pd.api.types.CategoricalDtype @@ -342,7 +340,7 @@ def test_fetch_openml_iris_multitarget_pandas(monkeypatch, chunksize): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - chunksize=chunksize, target_column=target_column) + target_column=target_column) data = bunch.data target = bunch.target frame = bunch.frame @@ -453,11 +451,22 @@ def test_fetch_openml_australian_pandas_error_sparse(monkeypatch): fetch_openml(data_id=data_id, as_frame=True, cache=False) +def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch): + pytest.importorskip('pandas') + + data_id = 1119 + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + + msg = 'Could not adhere to working_memory config.' 
+ with pytest.warns(UserWarning, match=msg): + with config_context(working_memory=1e-6): + fetch_openml(data_id=data_id, as_frame=True, cache=False) + + def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): pd = pytest.importorskip('pandas') CategoricalDtype = pd.api.types.CategoricalDtype - # Check because of the numeric row attribute (issue #12329) data_id = 1119 data_shape = (10, 14) target_shape = (10, ) From 36d11e32551917f5720177e425a0ca0b73415ddb Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 5 Jun 2019 15:56:57 -0400 Subject: [PATCH 27/39] TST Fix error message --- sklearn/datasets/tests/test_openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 4e8db429e5d64..361d59991ce0a 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1071,8 +1071,8 @@ def test_string_attribute_without_dataframe(monkeypatch, gzip_response): _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # single column test assert_raise_message(ValueError, - ('STRING attributes are not supported for arrays as ' - 'a return value. Try as_frame=True'), + ('STRING attributes are not supported for ' + 'array representation. 
Try as_frame=True'), fetch_openml, data_id=data_id, cache=False) From 7edb62cc3a477d28541afe2bd75a2557d3ffe9f7 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 5 Jun 2019 16:44:25 -0400 Subject: [PATCH 28/39] DOC Adds frame to docs --- sklearn/datasets/openml.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 9b93a9dd32e40..9cc1eca8e5d9d 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -528,7 +528,7 @@ def _valid_data_column_names(features_list, target_columns): def fetch_openml(name=None, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, - as_frame=False, chunksize=5000): + as_frame=False): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -607,6 +607,9 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, is True, this is None. details : dict More metadata from OpenML + frame : pandas DataFrame + DataFrame with ``data`` and ``target``. This is set when + ``as_frame`` is True. 
(data, target) : tuple if ``return_X_y`` is True From fb10fd110458619744cf122ccb4c9c6c644a9a3f Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 5 Jun 2019 16:49:09 -0400 Subject: [PATCH 29/39] BLD Trigger CI From 87cc0b08acbd47d437c307a4724918404926661d Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Jun 2019 14:21:10 -0400 Subject: [PATCH 30/39] CLN Only create bunch once --- sklearn/datasets/openml.py | 80 +++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 9cc1eca8e5d9d..3d899e1e4c08e 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -743,58 +743,56 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) + nominal_attributes = None + frame = None if as_frame: columns = data_columns + target_column - df = _convert_arff_data_dataframe(arff, columns, features_dict) - X = df[data_columns] + frame = _convert_arff_data_dataframe(arff, columns, features_dict) + X = frame[data_columns] if len(target_column) >= 2: - y = df[target_column] + y = frame[target_column] elif len(target_column) == 1: - y = df[target_column[0]] + y = frame[target_column[0]] else: y = None - if return_X_y: - return X, y - return Bunch(frame=df, data=X, target=y, - feature_names=data_columns, DESCR=description, - details=data_description, categories=None, - url="https://www.openml.org/d/{}".format(data_id)) - - # nominal attributes is a dict mapping from the attribute name to the - # possible values. 
Includes also the target column (which will be popped - # off below, before it will be packed in the Bunch object) - nominal_attributes = {k: v for k, v in arff['attributes'] - if isinstance(v, list) and - k in data_columns + target_column} - - X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape) - - is_classification = {col_name in nominal_attributes - for col_name in target_column} - if not is_classification: - # No target - pass - elif all(is_classification): - y = np.hstack([np.take(np.asarray(nominal_attributes.pop(col_name), - dtype='O'), - y[:, i:i+1].astype(int, copy=False)) - for i, col_name in enumerate(target_column)]) - elif any(is_classification): - raise ValueError('Mix of nominal and non-nominal targets is not ' - 'currently supported') - - # reshape y back to 1-D array, if there is only 1 target column; back - # to None if there are not target columns - if y.shape[1] == 1: - y = y.reshape((-1,)) - elif y.shape[1] == 0: - y = None + else: + # nominal attributes is a dict mapping from the attribute name to the + # possible values. 
Includes also the target column (which will be popped + # off below, before it will be packed in the Bunch object) + nominal_attributes = {k: v for k, v in arff['attributes'] + if isinstance(v, list) and + k in data_columns + target_column} + + X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape) + + is_classification = {col_name in nominal_attributes + for col_name in target_column} + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack([ + np.take( + np.asarray(nominal_attributes.pop(col_name), dtype='O'), + y[:, i:i + 1].astype(int, copy=False)) + for i, col_name in enumerate(target_column) + ]) + elif any(is_classification): + raise ValueError('Mix of nominal and non-nominal targets is not ' + 'currently supported') + + # reshape y back to 1-D array, if there is only 1 target column; back + # to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None if return_X_y: return X, y bunch = Bunch( - data=X, target=y, feature_names=data_columns, + data=X, target=y, frame=frame, feature_names=data_columns, DESCR=description, details=data_description, categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) From ebd12f546799a13e1613d6a84654718382bc46a7 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Jun 2019 15:58:38 -0400 Subject: [PATCH 31/39] CLN Uses target_columns (plural) --- sklearn/datasets/openml.py | 56 ++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 3d899e1e4c08e..d9393e578f568 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -21,6 +21,7 @@ from .base import get_data_home from urllib.error import HTTPError from ..utils import Bunch +from ..utils import get_chunk_n_rows from .. 
import get_config from ..utils import check_pandas_support # noqa @@ -320,18 +321,11 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict): arrf_columns = list(attributes) # calculate chunksize - working_memory = get_config()['working_memory'] first_row = next(arrf['data']) first_df = pd.DataFrame([first_row], columns=arrf_columns) row_bytes = first_df.memory_usage(deep=True).sum() - chunksize = int(working_memory * (2 ** 20) // row_bytes) - - if chunksize < 1: - warnings.warn('Could not adhere to working_memory config. ' - 'Currently %.0fMiB, %.0fMiB required.' % - (working_memory, np.ceil(row_bytes * 2 ** -20))) - chunksize = 1 + chunksize = get_chunk_n_rows(row_bytes) # read arrf data with chunks dfs = [] @@ -582,8 +576,11 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, below for more information about the `data` and `target` objects. as_frame : boolean, default=False - If True, returns a Bunch where the data attribute is a pandas - DataFrame. + If True, where the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string or categorical). The target is + a pandas DataFrame or Series depending on the number of target_columns. + If ``return_X_y`` is True, then ``(data, target)`` will be pandas + DataFrames or Series as describe above. 
Returns ------- @@ -673,9 +670,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, if data_description['format'].lower() == 'sparse_arff': return_sparse = True - if as_frame: - if return_sparse: - raise ValueError('Cannot return dataframe with sparse data') + if as_frame and return_sparse: + raise ValueError('Cannot return dataframe with sparse data') # download data features, meta-info about column types features_list = _get_data_features(data_id, data_home) @@ -692,28 +688,30 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, # determines the default target based on the data feature results # (which is currently more reliable than the data description; # see issue: https://github.com/openml/OpenML/issues/768) - target_column = [feature['name'] for feature in features_list - if feature['is_target'] == 'true'] + target_columns = [feature['name'] for feature in features_list + if feature['is_target'] == 'true'] elif isinstance(target_column, str): # for code-simplicity, make target_column by default a list - target_column = [target_column] + target_columns = [target_column] elif target_column is None: - target_column = [] - elif not isinstance(target_column, list): + target_columns = [] + elif isinstance(target_column, list): + target_columns = target_column + else: raise TypeError("Did not recognize type of target_column" "Should be str, list or None. 
Got: " "{}".format(type(target_column))) data_columns = _valid_data_column_names(features_list, - target_column) + target_columns) # prepare which columns and data types should be returned for the X and y features_dict = {feature['name']: feature for feature in features_list} # XXX: col_slice_y should be all nominal or all numeric - _verify_target_data_type(features_dict, target_column) + _verify_target_data_type(features_dict, target_columns) col_slice_y = [int(features_dict[col_name]['index']) - for col_name in target_column] + for col_name in target_columns] col_slice_x = [int(features_dict[col_name]['index']) for col_name in data_columns] @@ -746,13 +744,13 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, nominal_attributes = None frame = None if as_frame: - columns = data_columns + target_column + columns = data_columns + target_columns frame = _convert_arff_data_dataframe(arff, columns, features_dict) X = frame[data_columns] - if len(target_column) >= 2: - y = frame[target_column] - elif len(target_column) == 1: - y = frame[target_column[0]] + if len(target_columns) >= 2: + y = frame[target_columns] + elif len(target_columns) == 1: + y = frame[target_columns[0]] else: y = None else: @@ -761,12 +759,12 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, # off below, before it will be packed in the Bunch object) nominal_attributes = {k: v for k, v in arff['attributes'] if isinstance(v, list) and - k in data_columns + target_column} + k in data_columns + target_columns} X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape) is_classification = {col_name in nominal_attributes - for col_name in target_column} + for col_name in target_columns} if not is_classification: # No target pass @@ -775,7 +773,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, np.take( np.asarray(nominal_attributes.pop(col_name), dtype='O'), y[:, i:i + 1].astype(int, copy=False)) - for i, 
col_name in enumerate(target_column) + for i, col_name in enumerate(target_columns) ]) elif any(is_classification): raise ValueError('Mix of nominal and non-nominal targets is not ' From 7cd6f30dd69ddc9ba34a03c697670116a8e37347 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Jun 2019 16:07:54 -0400 Subject: [PATCH 32/39] CLN Fliter columns sooner --- sklearn/datasets/openml.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index d9393e578f568..de31977d0dede 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -319,10 +319,11 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict): attributes = OrderedDict(arrf['attributes']) arrf_columns = list(attributes) + columns_to_keep = [col for col in arrf_columns if col in columns] # calculate chunksize first_row = next(arrf['data']) - first_df = pd.DataFrame([first_row], columns=arrf_columns) + first_df = pd.DataFrame([first_row], columns=arrf_columns)[columns_to_keep] row_bytes = first_df.memory_usage(deep=True).sum() chunksize = get_chunk_n_rows(row_bytes) @@ -331,15 +332,9 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict): dfs = [] dfs.append(first_df) for data in _chunk_generator(arrf['data'], chunksize): - dfs.append(pd.DataFrame(data, columns=arrf_columns)) + dfs.append(pd.DataFrame(data, columns=arrf_columns)[columns_to_keep]) df = pd.concat(dfs) - columns_to_keep = [col for col in arrf_columns if col in columns] - - # copy dataframe when there are columns that needs to be removed - if len(columns_to_keep) != len(arrf_columns): - df = df[columns_to_keep].copy() - for column in columns_to_keep: dtype = _feature_to_dtype(features_dict[column]) if dtype == 'category': From 00274d755764fb56d6acf45b06db2c44e9d69658 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Jun 2019 16:13:01 -0400 Subject: [PATCH 33/39] CLN Filter earlier --- sklearn/datasets/openml.py | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index de31977d0dede..ea441d4421411 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -319,18 +319,18 @@ def _convert_arff_data_dataframe(arrf, columns, features_dict): attributes = OrderedDict(arrf['attributes']) arrf_columns = list(attributes) - columns_to_keep = [col for col in arrf_columns if col in columns] # calculate chunksize first_row = next(arrf['data']) - first_df = pd.DataFrame([first_row], columns=arrf_columns)[columns_to_keep] + first_df = pd.DataFrame([first_row], columns=arrf_columns) row_bytes = first_df.memory_usage(deep=True).sum() chunksize = get_chunk_n_rows(row_bytes) # read arrf data with chunks + columns_to_keep = [col for col in arrf_columns if col in columns] dfs = [] - dfs.append(first_df) + dfs.append(first_df[columns_to_keep]) for data in _chunk_generator(arrf['data'], chunksize): dfs.append(pd.DataFrame(data, columns=arrf_columns)[columns_to_keep]) df = pd.concat(dfs) From 65c575c607705e5d25e42168ee53deb47aa1d2ea Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Jun 2019 16:18:12 -0400 Subject: [PATCH 34/39] STY Flake8 --- sklearn/datasets/openml.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index ea441d4421411..63139a6cce37f 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -10,7 +10,6 @@ from collections.abc import Generator from itertools import islice from collections import OrderedDict -import warnings from urllib.request import urlopen, Request @@ -22,7 +21,6 @@ from urllib.error import HTTPError from ..utils import Bunch from ..utils import get_chunk_n_rows -from .. 
import get_config from ..utils import check_pandas_support # noqa __all__ = ['fetch_openml'] @@ -750,13 +748,14 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, y = None else: # nominal attributes is a dict mapping from the attribute name to the - # possible values. Includes also the target column (which will be popped - # off below, before it will be packed in the Bunch object) + # possible values. Includes also the target column (which will be + # popped off below, before it will be packed in the Bunch object) nominal_attributes = {k: v for k, v in arff['attributes'] if isinstance(v, list) and k in data_columns + target_columns} - X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape) + X, y = _convert_arff_data(arff['data'], col_slice_x, + col_slice_y, shape) is_classification = {col_name in nominal_attributes for col_name in target_columns} From 52211bb19ff07c41fc6abe08c57ab9d1d8e9c1e4 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 25 Jun 2019 09:28:41 -0400 Subject: [PATCH 35/39] TST Adds check for all numerical data --- sklearn/datasets/tests/test_openml.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 361d59991ce0a..de13f96675f16 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -20,6 +20,7 @@ from sklearn.utils.testing import (assert_warns_message, assert_raise_message) from sklearn.utils import is_scalar_nan +from sklearn.utils.testing import assert_allclose, assert_array_equal from urllib.error import HTTPError from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial @@ -320,6 +321,25 @@ def test_fetch_openml_iris_pandas(monkeypatch): assert np.all(frame.dtypes == data_dtypes + [target_dtype]) +def test_fetch_openml_iris_pandas_equal_to_no_frame(monkeypatch): + # as_frame = True returns the same underlying data 
as as_frame = False + pytest.importorskip('pandas') + data_id = 61 + + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + + frame_bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) + frame_data = frame_bunch.data + frame_target = frame_bunch.target + + norm_bunch = fetch_openml(data_id=data_id, as_frame=False, cache=False) + norm_data = norm_bunch.data + norm_target = norm_bunch.target + + assert_allclose(norm_data, frame_data) + assert_array_equal(norm_target, frame_target) + + def test_fetch_openml_iris_multitarget_pandas(monkeypatch): # classification dataset with numeric only columns pd = pytest.importorskip('pandas') From 8b5610bf9f654faf8cecd68f9a82823f806df936 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 2 Jul 2019 11:22:06 -0400 Subject: [PATCH 36/39] CLN Moves _chunk_generator to utils --- sklearn/datasets/openml.py | 13 +------------ sklearn/utils/__init__.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 63139a6cce37f..413091ec192fa 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -8,7 +8,6 @@ from functools import wraps import itertools from collections.abc import Generator -from itertools import islice from collections import OrderedDict from urllib.request import urlopen, Request @@ -21,6 +20,7 @@ from urllib.error import HTTPError from ..utils import Bunch from ..utils import get_chunk_n_rows +from ..utils import _chunk_generator from ..utils import check_pandas_support # noqa __all__ = ['fetch_openml'] @@ -284,17 +284,6 @@ def _feature_to_dtype(feature): raise ValueError('Unsupported feature: {}'.format(feature)) -def _chunk_generator(gen, chunksize): - """Chunk generator, ``gen`` into lists of length ``chunksize``. 
The last - chunk may have a length less than ``chunksize``.""" - while True: - chunk = list(islice(gen, chunksize)) - if chunk: - yield chunk - else: - return - - def _convert_arff_data_dataframe(arrf, columns, features_dict): """Convert the ARFF object into a pandas DataFrame. diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8216ff49ba5c5..4528c2ba0caeb 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -3,6 +3,7 @@ """ from collections.abc import Sequence from contextlib import contextmanager +from itertools import islice import numbers import platform import struct @@ -477,6 +478,17 @@ def safe_sqr(X, copy=True): return X +def _chunk_generator(gen, chunksize): + """Chunk generator, ``gen`` into lists of length ``chunksize``. The last + chunk may have a length less than ``chunksize``.""" + while True: + chunk = list(islice(gen, chunksize)) + if chunk: + yield chunk + else: + return + + def gen_batches(n, batch_size, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. From 3873332fd4fcdbe6ef16004084aca5aba30b12d1 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 2 Jul 2019 11:27:01 -0400 Subject: [PATCH 37/39] DOC Adds whats_new --- doc/whats_new/v0.22.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e998294e6d255..336a13e7c3bea 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -39,6 +39,12 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. +:mod:`sklearn.datasets` +....................... + +- |Feature| :func:`fetch_openml` now supports heterogeneous data using pandas + by setting `as_frame=True`. :pr:`13902` by `Thomas Fan`_. + :mod:`sklearn.ensemble` ....................... 
From e14742038077ded71bbff083024048a5cc8702e5 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 2 Jul 2019 11:58:57 -0400 Subject: [PATCH 38/39] CLN: Update doc/whats_new/v0.22.rst Co-Authored-By: Guillaume Lemaitre --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e3bf4f8a1ca30..00d816d30c023 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -42,7 +42,7 @@ Changelog :mod:`sklearn.datasets` ....................... -- |Feature| :func:`fetch_openml` now supports heterogeneous data using pandas +- |Feature| :func:`datasets.fetch_openml` now supports heterogeneous data using pandas by setting `as_frame=True`. :pr:`13902` by `Thomas Fan`_. :mod:`sklearn.decomposition` From c34707eb533860fcd4a6d0a7a4afb126e5e3e92c Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 2 Jul 2019 13:23:11 -0400 Subject: [PATCH 39/39] CLN Address comments --- examples/compose/plot_column_transformer_mixed_types.py | 4 ++++ sklearn/datasets/openml.py | 9 +++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 162d31832417c..0f6c5d3c222c6 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -41,6 +41,10 @@ X = titantic.data y = titantic.target +# Alternatively X and y can be obtained directly from the frame attribute: +# X = titantic.frame.drop('survived', axis=1) +# y = titantic.frame['survived'] + # We will train our classifier with the following features: # Numeric Features: # - age: float. 
diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 413091ec192fa..3d82027e29118 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -558,10 +558,11 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, below for more information about the `data` and `target` objects. as_frame : boolean, default=False - If True, where the data is a pandas DataFrame including columns with + If True, the data is a pandas DataFrame including columns with appropriate dtypes (numeric, string or categorical). The target is a pandas DataFrame or Series depending on the number of target_columns. - If ``return_X_y`` is True, then ``(data, target)`` will be pandas + The Bunch will contain a ``frame`` attribute with the target and the + data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas DataFrames or Series as describe above. Returns @@ -587,8 +588,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, details : dict More metadata from OpenML frame : pandas DataFrame - DataFrame with ``data`` and ``target``. This is set when - ``as_frame`` is True. + Only present when `as_frame=True`. DataFrame with ``data`` and + ``target``. (data, target) : tuple if ``return_X_y`` is True