-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
[MRG] Adds fetch_openml pandas dataframe support #13902
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f4754a9
e67182e
98bfa76
f162cd4
f3818f1
61c3dea
d7651cc
052491f
e7a6f9c
95b4153
6c8c709
26b03b2
f5a60bd
599666f
b8011a6
f71aeb6
bd91262
22a76ff
1712492
58c5c2d
8780000
a7519cd
5396d8d
b33fcf9
706a254
7a7de89
9568d4c
36d11e3
7edb62c
fb10fd1
556a779
87cc0b0
ebd12f5
7cd6f30
00274d7
65c575c
52211bb
8b5610b
3873332
729791a
e147420
83dcdc9
c34707e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,10 +24,10 @@ | |
# | ||
# License: BSD 3 clause | ||
|
||
import pandas as pd | ||
import numpy as np | ||
|
||
from sklearn.compose import ColumnTransformer | ||
from sklearn.datasets import fetch_openml | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.preprocessing import StandardScaler, OneHotEncoder | ||
|
@@ -37,9 +37,13 @@ | |
np.random.seed(0) | ||
|
||
# Read data from Titanic dataset. | ||
titanic_url = ('https://raw.githubusercontent.com/amueller/' | ||
'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv') | ||
data = pd.read_csv(titanic_url) | ||
titantic = fetch_openml(data_id=40945, as_frame=True) | ||
X = titantic.data | ||
y = titantic.target | ||
|
||
# Alternatively X and y can be obtained directly from the frame attribute: | ||
# X = titantic.frame.drop('survived', axis=1) | ||
# y = titantic.frame['survived'] | ||
|
||
# We will train our classifier with the following features: | ||
# Numeric Features: | ||
|
@@ -71,9 +75,6 @@ | |
clf = Pipeline(steps=[('preprocessor', preprocessor), | ||
('classifier', LogisticRegression())]) | ||
|
||
X = data.drop('survived', axis=1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would like to document this idiom somewhere. It's sooo common and I'm not sure if it's anywhere else in the docs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a comment using the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I personally prefer |
||
y = data['survived'] | ||
|
||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) | ||
|
||
clf.fit(X_train, y_train) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
from functools import wraps | ||
import itertools | ||
from collections.abc import Generator | ||
from collections import OrderedDict | ||
|
||
from urllib.request import urlopen, Request | ||
|
||
|
@@ -18,6 +19,9 @@ | |
from .base import get_data_home | ||
from urllib.error import HTTPError | ||
from ..utils import Bunch | ||
from ..utils import get_chunk_n_rows | ||
from ..utils import _chunk_generator | ||
from ..utils import check_pandas_support # noqa | ||
|
||
__all__ = ['fetch_openml'] | ||
|
||
|
@@ -263,6 +267,69 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): | |
raise ValueError('Unexpected Data Type obtained from arff.') | ||
|
||
|
||
def _feature_to_dtype(feature): | ||
"""Map feature to dtype for pandas DataFrame | ||
""" | ||
if feature['data_type'] == 'string': | ||
return object | ||
elif feature['data_type'] == 'nominal': | ||
return 'category' | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# only numeric, integer, real are left | ||
elif (feature['number_of_missing_values'] != '0' or | ||
feature['data_type'] in ['numeric', 'real']): | ||
# cast to floats when there are any missing values | ||
return np.float64 | ||
elif feature['data_type'] == 'integer': | ||
return np.int64 | ||
raise ValueError('Unsupported feature: {}'.format(feature)) | ||
|
||
|
||
def _convert_arff_data_dataframe(arrf, columns, features_dict):
    """Convert the ARFF object into a pandas DataFrame.

    Parameters
    ----------
    arrf : dict
        ARFF object as obtained from liac-arff (the parameter name keeps its
        historical spelling so existing callers are unaffected).
        ``arrf['attributes']`` is turned into an ordered name -> attribute
        mapping and ``arrf['data']`` must be an iterator over rows.

    columns : list
        Columns from dataframe to return.

    features_dict : dict
        Maps feature name to feature info from openml.

    Returns
    -------
    dataframe : pandas DataFrame
        Holds the requested columns in ARFF attribute order, with a fresh
        0..n-1 row index and one dtype per column as chosen by
        ``_feature_to_dtype``.
    """
    pd = check_pandas_support('fetch_openml with as_frame=True')

    attributes = OrderedDict(arrf['attributes'])
    arff_columns = list(attributes)

    # calculate chunksize from the footprint of the first row
    first_row = next(arrf['data'])
    first_df = pd.DataFrame([first_row], columns=arff_columns)

    # NOTE(review): this measures the DataFrame footprint of a single row.
    # The temporary memory used while parsing may instead be dominated by
    # the ARFF reader's per-row strings and lists, so treat this as a
    # rough estimate rather than an exact bound.
    row_bytes = first_df.memory_usage(deep=True).sum()
    chunksize = get_chunk_n_rows(row_bytes)

    # read arff data with chunks, keeping the ARFF attribute order
    requested = set(columns)
    columns_to_keep = [col for col in arff_columns if col in requested]
    dfs = [first_df[columns_to_keep]]
    for data in _chunk_generator(arrf['data'], chunksize):
        chunk_df = pd.DataFrame(data, columns=arff_columns)
        dfs.append(chunk_df[columns_to_keep])
    # Every chunk carries its own 0-based index; reset it so that the
    # resulting frame does not end up with duplicated row labels.
    df = pd.concat(dfs, ignore_index=True)

    for column in columns_to_keep:
        dtype = _feature_to_dtype(features_dict[column])
        if dtype == 'category':
            # use the categories declared in the ARFF header
            dtype = pd.api.types.CategoricalDtype(attributes[column])
        df[column] = df[column].astype(dtype, copy=False)
    return df
|
||
|
||
def _get_data_info_by_name(name, version, data_home): | ||
""" | ||
Utilizes the openml dataset listing api to find a dataset by | ||
|
@@ -436,7 +503,8 @@ def _valid_data_column_names(features_list, target_columns): | |
|
||
|
||
def fetch_openml(name=None, version='active', data_id=None, data_home=None, | ||
target_column='default-target', cache=True, return_X_y=False): | ||
target_column='default-target', cache=True, return_X_y=False, | ||
as_frame=False): | ||
"""Fetch dataset from openml by name or dataset id. | ||
|
||
Datasets are uniquely identified by either an integer ID or by a | ||
|
@@ -489,26 +557,39 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, | |
If True, returns ``(data, target)`` instead of a Bunch object. See | ||
below for more information about the `data` and `target` objects. | ||
|
||
as_frame : boolean, default=False | ||
If True, the data is a pandas DataFrame including columns with | ||
appropriate dtypes (numeric, string or categorical). The target is | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't mention the |
||
a pandas DataFrame or Series depending on the number of target_columns. | ||
The Bunch will contain a ``frame`` attribute with the target and the | ||
data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas | ||
DataFrames or Series as described above. | |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. *described |
||
|
||
Returns | ||
------- | ||
|
||
data : Bunch | ||
Dictionary-like object, with attributes: | ||
|
||
data : np.array or scipy.sparse.csr_matrix of floats | ||
data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame | ||
The feature matrix. Categorical features are encoded as ordinals. | ||
target : np.array | ||
target : np.array, pandas Series or DataFrame | ||
The regression target or classification labels, if applicable. | ||
Dtype is float if numeric, and object if categorical. | ||
Dtype is float if numeric, and object if categorical. If | ||
``as_frame`` is True, ``target`` is a pandas object. | ||
DESCR : str | ||
The full description of the dataset | ||
feature_names : list | ||
The names of the dataset columns | ||
categories : dict | ||
categories : dict or None | ||
Maps each categorical feature name to a list of values, such | ||
that the value encoded as i is ith in the list. | ||
that the value encoded as i is ith in the list. If ``as_frame`` | ||
is True, this is None. | ||
details : dict | ||
More metadata from OpenML | ||
frame : pandas DataFrame | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "Only present when |
||
Only present when `as_frame=True`. DataFrame with ``data`` and | ||
``target``. | ||
|
||
(data, target) : tuple if ``return_X_y`` is True | ||
|
||
|
@@ -568,41 +649,52 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, | |
warn("OpenML raised a warning on the dataset. It might be " | ||
"unusable. Warning: {}".format(data_description['warning'])) | ||
|
||
return_sparse = False | ||
if data_description['format'].lower() == 'sparse_arff': | ||
return_sparse = True | ||
|
||
if as_frame and return_sparse: | ||
raise ValueError('Cannot return dataframe with sparse data') | ||
|
||
# download data features, meta-info about column types | ||
features_list = _get_data_features(data_id, data_home) | ||
|
||
for feature in features_list: | ||
if 'true' in (feature['is_ignore'], feature['is_row_identifier']): | ||
continue | ||
if feature['data_type'] == 'string': | ||
raise ValueError('STRING attributes are not yet supported') | ||
if not as_frame: | ||
for feature in features_list: | ||
if 'true' in (feature['is_ignore'], feature['is_row_identifier']): | ||
continue | ||
if feature['data_type'] == 'string': | ||
raise ValueError('STRING attributes are not supported for ' | ||
'array representation. Try as_frame=True') | ||
|
||
if target_column == "default-target": | ||
# determines the default target based on the data feature results | ||
# (which is currently more reliable than the data description; | ||
# see issue: https://github.com/openml/OpenML/issues/768) | ||
target_column = [feature['name'] for feature in features_list | ||
if feature['is_target'] == 'true'] | ||
target_columns = [feature['name'] for feature in features_list | ||
if feature['is_target'] == 'true'] | ||
elif isinstance(target_column, str): | ||
# for code-simplicity, make target_column by default a list | ||
target_column = [target_column] | ||
target_columns = [target_column] | ||
elif target_column is None: | ||
target_column = [] | ||
elif not isinstance(target_column, list): | ||
target_columns = [] | ||
elif isinstance(target_column, list): | ||
target_columns = target_column | ||
else: | ||
raise TypeError("Did not recognize type of target_column" | ||
"Should be str, list or None. Got: " | ||
"{}".format(type(target_column))) | ||
data_columns = _valid_data_column_names(features_list, | ||
target_column) | ||
target_columns) | ||
|
||
# prepare which columns and data types should be returned for the X and y | ||
features_dict = {feature['name']: feature for feature in features_list} | ||
|
||
# XXX: col_slice_y should be all nominal or all numeric | ||
_verify_target_data_type(features_dict, target_column) | ||
_verify_target_data_type(features_dict, target_columns) | ||
|
||
col_slice_y = [int(features_dict[col_name]['index']) | ||
for col_name in target_column] | ||
for col_name in target_columns] | ||
|
||
col_slice_x = [int(features_dict[col_name]['index']) | ||
for col_name in data_columns] | ||
|
@@ -615,10 +707,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, | |
'columns. '.format(feat['name'], nr_missing)) | ||
|
||
# determine arff encoding to return | ||
return_sparse = False | ||
if data_description['format'].lower() == 'sparse_arff': | ||
return_sparse = True | ||
|
||
if not return_sparse: | ||
data_qualities = _get_data_qualities(data_id, data_home) | ||
shape = _get_data_shape(data_qualities) | ||
|
@@ -631,46 +719,62 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, | |
|
||
# obtain the data | ||
arff = _download_data_arff(data_description['file_id'], return_sparse, | ||
data_home) | ||
|
||
# nominal attributes is a dict mapping from the attribute name to the | ||
# possible values. Includes also the target column (which will be popped | ||
# off below, before it will be packed in the Bunch object) | ||
nominal_attributes = {k: v for k, v in arff['attributes'] | ||
if isinstance(v, list) and | ||
k in data_columns + target_column} | ||
|
||
X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape) | ||
|
||
is_classification = {col_name in nominal_attributes | ||
for col_name in target_column} | ||
if not is_classification: | ||
# No target | ||
pass | ||
elif all(is_classification): | ||
y = np.hstack([np.take(np.asarray(nominal_attributes.pop(col_name), | ||
dtype='O'), | ||
y[:, i:i+1].astype(int, copy=False)) | ||
for i, col_name in enumerate(target_column)]) | ||
elif any(is_classification): | ||
raise ValueError('Mix of nominal and non-nominal targets is not ' | ||
'currently supported') | ||
data_home, encode_nominal=not as_frame) | ||
|
||
description = "{}\n\nDownloaded from openml.org.".format( | ||
data_description.pop('description')) | ||
|
||
# reshape y back to 1-D array, if there is only 1 target column; back | ||
# to None if there are not target columns | ||
if y.shape[1] == 1: | ||
y = y.reshape((-1,)) | ||
elif y.shape[1] == 0: | ||
y = None | ||
nominal_attributes = None | ||
frame = None | ||
if as_frame: | ||
columns = data_columns + target_columns | ||
frame = _convert_arff_data_dataframe(arff, columns, features_dict) | ||
X = frame[data_columns] | ||
if len(target_columns) >= 2: | ||
y = frame[target_columns] | ||
elif len(target_columns) == 1: | ||
y = frame[target_columns[0]] | ||
else: | ||
y = None | ||
else: | ||
# nominal attributes is a dict mapping from the attribute name to the | ||
# possible values. Includes also the target column (which will be | ||
# popped off below, before it will be packed in the Bunch object) | ||
nominal_attributes = {k: v for k, v in arff['attributes'] | ||
if isinstance(v, list) and | ||
k in data_columns + target_columns} | ||
|
||
X, y = _convert_arff_data(arff['data'], col_slice_x, | ||
col_slice_y, shape) | ||
|
||
is_classification = {col_name in nominal_attributes | ||
for col_name in target_columns} | ||
if not is_classification: | ||
# No target | ||
pass | ||
elif all(is_classification): | ||
y = np.hstack([ | ||
np.take( | ||
np.asarray(nominal_attributes.pop(col_name), dtype='O'), | ||
y[:, i:i + 1].astype(int, copy=False)) | ||
for i, col_name in enumerate(target_columns) | ||
]) | ||
elif any(is_classification): | ||
raise ValueError('Mix of nominal and non-nominal targets is not ' | ||
'currently supported') | ||
|
||
# reshape y back to 1-D array, if there is only 1 target column; back | ||
# to None if there are no target columns | |
if y.shape[1] == 1: | ||
y = y.reshape((-1,)) | ||
elif y.shape[1] == 0: | ||
y = None | ||
|
||
if return_X_y: | ||
return X, y | ||
|
||
bunch = Bunch( | ||
data=X, target=y, feature_names=data_columns, | ||
data=X, target=y, frame=frame, feature_names=data_columns, | ||
DESCR=description, details=data_description, | ||
categories=nominal_attributes, | ||
url="https://www.openml.org/d/{}".format(data_id)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we want to use return_X_y here also?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I usually prefer to show case the Bunch object.