Skip to content

[MRG] adding as_frame functionality for california housing dataset loader #15486

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion sklearn/datasets/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import hashlib

from ..utils import Bunch
from ..utils import check_random_state
from ..utils import check_random_state, check_pandas_support

import numpy as np

Expand Down Expand Up @@ -67,6 +67,17 @@ def clear_data_home(data_home=None):
shutil.rmtree(data_home)


def _convert_data_dataframe(caller_name, data, target,
                            feature_names, target_names):
    """Assemble pandas objects for a dataset loader called with as_frame=True.

    Parameters
    ----------
    caller_name : str
        Name of the calling loader; interpolated into the message handed to
        ``check_pandas_support``.
    data : object accepted by ``pd.DataFrame``
        Feature values, one column per entry of ``feature_names``.
    target : object accepted by ``pd.DataFrame``
        Target values, one column per entry of ``target_names``.
    feature_names : list
        Column labels for ``data``.
    target_names : list
        Column labels for ``target``.

    Returns
    -------
    frame : pandas DataFrame
        Features and targets concatenated column-wise.
    X : pandas object
        ``frame[feature_names]``.
    y : pandas object
        ``frame[target_names]``.
    """
    # pandas is obtained through the helper rather than imported at module
    # level (presumably so it stays an optional dependency -- confirm
    # against ``check_pandas_support``).
    pd = check_pandas_support('{} with as_frame=True'.format(caller_name))
    features = pd.DataFrame(data, columns=feature_names)
    targets = pd.DataFrame(target, columns=target_names)
    frame = pd.concat([features, targets], axis=1)
    return frame, frame[feature_names], frame[target_names]


def load_files(container_path, description=None, categories=None,
load_content=True, shuffle=True, encoding=None,
decode_error='strict', random_state=0):
Expand Down
34 changes: 30 additions & 4 deletions sklearn/datasets/_california_housing.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import joblib

from . import get_data_home
from ._base import _convert_data_dataframe
from ._base import _fetch_remote
from ._base import _pkl_filepath
from ._base import RemoteFileMetadata
Expand All @@ -49,7 +50,7 @@


def fetch_california_housing(data_home=None, download_if_missing=True,
return_X_y=False):
return_X_y=False, as_frame=False):
"""Load the California housing dataset (regression).

============== ==============
Expand Down Expand Up @@ -78,15 +79,22 @@ def fetch_california_housing(data_home=None, download_if_missing=True,

.. versionadded:: 0.20

as_frame : boolean, default=False
If True, the data is a pandas DataFrame including columns with
appropriate dtypes (numeric, string or categorical). The target is
a pandas DataFrame or Series depending on the number of target_columns.

Returns
-------
dataset : dict-like object with the following attributes:

dataset.data : ndarray, shape [20640, 8]
Each row corresponding to the 8 feature values in order.
If ``as_frame`` is True, ``data`` is a pandas object.

dataset.target : numpy array of shape (20640,)
Each value corresponds to the average house value in units of 100,000.
If ``as_frame`` is True, ``target`` is a pandas object.

dataset.feature_names : array of length 8
Array of ordered feature names used in the dataset.
Expand All @@ -98,6 +106,10 @@ def fetch_california_housing(data_home=None, download_if_missing=True,

.. versionadded:: 0.20

frame : pandas DataFrame
Only present when `as_frame=True`. DataFrame with ``data`` and
``target``.

Notes
-----

Expand Down Expand Up @@ -155,10 +167,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
with open(join(module_path, 'descr', 'california_housing.rst')) as dfile:
descr = dfile.read()

X = data
y = target

frame = None
target_names = ["MedHouseVal", ]
if as_frame:
frame, X, y = _convert_data_dataframe("fetch_california_housing",
data,
target,
feature_names,
target_names)

if return_X_y:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved this to better handle the case where both return_X_y and as_frame are True. If they are both true, this will return X and y as pandas objects.

Copy link
Contributor Author

@gitsteph gitsteph Nov 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TY to @wconnell for this comment => #15486 (comment)

return data, target
return X, y

return Bunch(data=data,
target=target,
return Bunch(data=X,
target=y,
frame=frame,
target_names=target_names,
feature_names=feature_names,
DESCR=descr)
31 changes: 28 additions & 3 deletions sklearn/datasets/tests/test_california_housing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
Skipped if california_housing is not already downloaded to data_home.
"""

import pytest

from sklearn.datasets import fetch_california_housing
from sklearn.utils._testing import SkipTest
from sklearn.datasets.tests.test_common import check_return_X_y
Expand All @@ -13,14 +15,37 @@ def fetch(*args, **kwargs):
return fetch_california_housing(*args, download_if_missing=False, **kwargs)


def test_fetch():
def _is_california_housing_dataset_not_available():
    """Tell whether the California housing dataset fails to load locally.

    The result is used as the condition of ``pytest.mark.skipif``, which is
    evaluated while the test module is imported. The function therefore has
    to return a boolean and must not raise a skip exception itself.

    Returns
    -------
    bool
        True if ``fetch_california_housing(download_if_missing=False)``
        raises ``IOError``, False if the call succeeds.
    """
    try:
        fetch_california_housing(download_if_missing=False)
    except IOError:
        return True
    return False


@pytest.mark.skipif(
    _is_california_housing_dataset_not_available(),
    reason='Download California Housing dataset to run this test'
)
def test_fetch():
    """Check the shapes of the default Bunch and the return_X_y variant."""
    data = fetch()
    assert data.data.shape == (20640, 8)
    assert data.target.shape == (20640, )

    # check_return_X_y calls the loader itself, so it is given a callable
    # rather than a second Bunch.
    check_return_X_y(data, partial(fetch))


@pytest.mark.skipif(
    _is_california_housing_dataset_not_available(),
    reason='Download California Housing dataset to run this test'
)
def test_fetch_asframe():
    """Check that as_frame=True yields pandas objects and a combined frame."""
    pd = pytest.importorskip('pandas')
    bunch = fetch(as_frame=True)

    # Assert presence before touching the attribute; checking afterwards
    # could never fail because the access itself would already have raised.
    assert hasattr(bunch, 'frame')
    frame = bunch.frame

    # 8 feature columns plus the single target column.
    assert frame.shape == (20640, 9)
    assert isinstance(bunch.data, pd.DataFrame)
    assert isinstance(bunch.target, pd.DataFrame)