Skip to content

ENH adding as_frame functionality for CA housing dataset loader #15950

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Dec 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
78a196f
adding as_frame functionality for california housing dataset loader
gitsteph Nov 2, 2019
d604f4b
fixes for as_frame to fetch_california_housing
gitsteph Nov 2, 2019
0a01ecf
minor change
gitsteph Nov 2, 2019
e00f4f4
forgot to rename feature_names to columns here
gitsteph Nov 2, 2019
3bda9b5
changes to use pandas concat and make conversion to df more generaliz…
gitsteph Nov 2, 2019
0dba66a
moved _convert_data_dataframe to _base.py
gitsteph Nov 2, 2019
df9f086
breaking at 79 chars
gitsteph Nov 2, 2019
5019c9e
added test_fetch_asframe
gitsteph Nov 2, 2019
587be05
moved test to test_california_housing.py
gitsteph Nov 2, 2019
f8eeba9
moved return_X_y handling below as_frame check
gitsteph Nov 2, 2019
ee19ac7
fixed linting issues caught by flake8
gitsteph Nov 3, 2019
19ed2f3
using pytest.importorskip to handle test environments without pandas …
gitsteph Nov 3, 2019
45940b2
skip test if dataset is not available in test env
gitsteph Nov 3, 2019
a6fcb50
Update sklearn/datasets/tests/test_california_housing.py
gitsteph Nov 4, 2019
e3db481
Update sklearn/datasets/tests/test_california_housing.py
gitsteph Nov 4, 2019
4c946e0
Merge remote-tracking branch 'upstream/master' into pr/15486
thomasjpfan Dec 11, 2019
20daa13
Merge remote-tracking branch 'upstream/master' into pr/15486
thomasjpfan Dec 13, 2019
815b419
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
Dec 22, 2019
c41c9a4
removing SkipTest import; flake8 error it is unused
Dec 22, 2019
7d311d1
move imports to 2 separate lines
Dec 22, 2019
5062770
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
Dec 23, 2019
8e118a0
adding test for ImportError; version added
Dec 24, 2019
a963f0d
updated test code for pandas check
Dec 24, 2019
2083f6b
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
Dec 24, 2019
167dd9e
added check pandas test
Dec 24, 2019
59d0e54
added noqa comment to unused pandas import'
Dec 24, 2019
3f65f08
DOC add what's new entry
rth Dec 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats_new/v0.23.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ Changelog
:func:`datasets.make_moons` now accept two-element tuple.
:pr:`15707` by :user:`Maciej J Mikulski <mjmikulski>`.

- |Feature| :func:`datasets.fetch_california_housing` now supports
heterogeneous data using pandas by setting `as_frame=True`. :pr:`15950`
by :user:`Stephanie Andrews <gitsteph>` and
:user:`Reshama Shaikh <reshamas>`.

:mod:`sklearn.feature_extraction`
.................................

Expand Down
12 changes: 12 additions & 0 deletions sklearn/datasets/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from ..utils import Bunch
from ..utils import check_random_state
from ..utils import check_pandas_support

import numpy as np

Expand Down Expand Up @@ -67,6 +68,17 @@ def clear_data_home(data_home=None):
shutil.rmtree(data_home)


def _convert_data_dataframe(caller_name, data, target,
feature_names, target_names):
pd = check_pandas_support('{} with as_frame=True'.format(caller_name))
data_df = pd.DataFrame(data, columns=feature_names)
target_df = pd.DataFrame(target, columns=target_names)
combined_df = pd.concat([data_df, target_df], axis=1)
X = combined_df[feature_names]
y = combined_df[target_names]
return combined_df, X, y


def load_files(container_path, description=None, categories=None,
load_content=True, shuffle=True, encoding=None,
decode_error='strict', random_state=0):
Expand Down
38 changes: 34 additions & 4 deletions sklearn/datasets/_california_housing.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import joblib

from . import get_data_home
from ._base import _convert_data_dataframe
from ._base import _fetch_remote
from ._base import _pkl_filepath
from ._base import RemoteFileMetadata
Expand All @@ -49,7 +50,7 @@


def fetch_california_housing(data_home=None, download_if_missing=True,
return_X_y=False):
return_X_y=False, as_frame=False):
"""Load the California housing dataset (regression).

============== ==============
Expand Down Expand Up @@ -78,15 +79,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True,

.. versionadded:: 0.20

as_frame : boolean, default=False
If True, the data is a pandas DataFrame including columns with
appropriate dtypes (numeric, string or categorical). The target is
a pandas DataFrame or Series depending on the number of target_columns.

.. versionadded:: 0.23

Returns
-------
dataset : dict-like object with the following attributes:

dataset.data : ndarray, shape [20640, 8]
Each row corresponding to the 8 feature values in order.
If ``as_frame`` is True, ``data`` is a pandas object.

dataset.target : numpy array of shape (20640,)
Each value corresponds to the average house value in units of 100,000.
If ``as_frame`` is True, ``target`` is a pandas object.

dataset.feature_names : array of length 8
Array of ordered feature names used in the dataset.
Expand All @@ -98,6 +108,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True,

.. versionadded:: 0.20

frame : pandas DataFrame
Only present when `as_frame=True`. DataFrame with ``data`` and
``target``.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pleased add the directive:

.. versionadded:: 0.23

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the right way to do it?

def _pandas_is_missing():
    try:
        import pandas
    except ImportError:
        raise SkipTest('fetch_california_housing with as_frame=True'
                       ' requires pandas')


@pytest.mark.skipif(
    _pandas_is_missing(),
    reason='Import pandas library to run this test'
)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In #15950 (comment) I mentioned to add a new test to check that the warning is raised without skipping the test when pandas is missing. Something like the following (untested):

@pytest.mark.skipif(
    _is_california_housing_dataset_not_available(),
    reason='Download California Housing dataset to run this test'
)
def test_pandas_dependency_message()
    try:
        import pandas
        pytest.skip("This test requires pandas to be not installed")
    except ImportError:
        # Check that pandas is imported lazily and that an informative error message is
        # raised when pandas is missing:
        expected_msg = "fetch_california_housing with as_frame=True requires pandas"
        with pytest.raises(ImportError, match=expected_msg):
            fetch_california_housing(as_frame=True)

.. versionadded:: 0.23

Notes
-----

Expand Down Expand Up @@ -155,10 +171,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
with open(join(module_path, 'descr', 'california_housing.rst')) as dfile:
descr = dfile.read()

X = data
y = target

frame = None
target_names = ["MedHouseVal", ]
if as_frame:
frame, X, y = _convert_data_dataframe("fetch_california_housing",
data,
target,
feature_names,
target_names)

if return_X_y:
return data, target
return X, y

return Bunch(data=data,
target=target,
return Bunch(data=X,
target=y,
frame=frame,
target_names=target_names,
feature_names=feature_names,
DESCR=descr)
49 changes: 45 additions & 4 deletions sklearn/datasets/tests/test_california_housing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
Skipped if california_housing is not already downloaded to data_home.
"""

import pytest

from sklearn.datasets import fetch_california_housing
from sklearn.utils._testing import SkipTest
from sklearn.datasets.tests.test_common import check_return_X_y
from functools import partial

Expand All @@ -13,14 +14,54 @@ def fetch(*args, **kwargs):
return fetch_california_housing(*args, download_if_missing=False, **kwargs)


def test_fetch():
def _is_california_housing_dataset_not_available():
try:
data = fetch()
fetch_california_housing(download_if_missing=False)
return False
except IOError:
raise SkipTest("California housing dataset can not be loaded.")
return True


@pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_fetch():
data = fetch()
assert((20640, 8) == data.data.shape)
assert((20640, ) == data.target.shape)

# test return_X_y option
fetch_func = partial(fetch)
check_return_X_y(data, fetch_func)


@pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_fetch_asframe():
pd = pytest.importorskip('pandas')
bunch = fetch(as_frame=True)
frame = bunch.frame
assert hasattr(bunch, 'frame') is True
assert frame.shape == (20640, 9)
assert isinstance(bunch.data, pd.DataFrame)
assert isinstance(bunch.target, pd.DataFrame)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please add a test that checks that ImportError is raised if pandas is missing and that the message is " fetch_california_housing with as_frame=True requires pandas."?



@pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_pandas_dependency_message():
try:
import pandas # noqa
pytest.skip("This test requires pandas to be not installed")
except ImportError:
# Check that pandas is imported lazily and that an informative error
# message is raised when pandas is missing:
expected_msg = ('fetch_california_housing with as_frame=True'
' requires pandas')
with pytest.raises(ImportError, match=expected_msg):
fetch_california_housing(as_frame=True)