def _convert_data_dataframe(caller_name, data, target,
                            feature_names, target_names):
    """Assemble pandas objects from raw feature and target data.

    Parameters
    ----------
    caller_name : str
        Name of the calling loader. It is interpolated into the string
        ``'<caller_name> with as_frame=True'`` that is handed to
        ``check_pandas_support``.
    data : object accepted by ``pandas.DataFrame``
        Feature values; labelled with ``feature_names``.
    target : object accepted by ``pandas.DataFrame``
        Target values; labelled with ``target_names``.
    feature_names : list
        Column labels for ``data``.
    target_names : list
        Column labels for ``target``.

    Returns
    -------
    frame : pandas.DataFrame
        Feature columns followed by target columns.
    X : pandas.DataFrame
        ``frame`` restricted to ``feature_names``.
    y : pandas.DataFrame
        ``frame`` restricted to ``target_names``. Because the selection
        uses a list of labels, this is a DataFrame even when there is a
        single target column.
    """
    # pandas is obtained through the support helper rather than a
    # module-level import.
    purpose = '{} with as_frame=True'.format(caller_name)
    pd = check_pandas_support(purpose)

    features = pd.DataFrame(data, columns=feature_names)
    targets = pd.DataFrame(target, columns=target_names)

    # Column-wise concatenation: features first, targets last.
    frame = pd.concat([features, targets], axis=1)
    return frame, frame[feature_names], frame[target_names]
import get_data_home +from ._base import _convert_data_dataframe from ._base import _fetch_remote from ._base import _pkl_filepath from ._base import RemoteFileMetadata @@ -49,7 +50,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True, - return_X_y=False): + return_X_y=False, as_frame=False): """Load the California housing dataset (regression). ============== ============== @@ -78,15 +79,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True, .. versionadded:: 0.20 + as_frame : boolean, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string or categorical). The target is + a pandas DataFrame or Series depending on the number of target_columns. + + .. versionadded:: 0.23 + Returns ------- dataset : dict-like object with the following attributes: dataset.data : ndarray, shape [20640, 8] Each row corresponding to the 8 feature values in order. + If ``as_frame`` is True, ``data`` is a pandas object. dataset.target : numpy array of shape (20640,) Each value corresponds to the average house value in units of 100,000. + If ``as_frame`` is True, ``target`` is a pandas object. dataset.feature_names : array of length 8 Array of ordered feature names used in the dataset. @@ -98,6 +108,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True, .. versionadded:: 0.20 + frame : pandas DataFrame + Only present when `as_frame=True`. DataFrame with ``data`` and + ``target``. + + .. 
versionadded:: 0.23 + Notes ----- @@ -155,10 +171,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True, with open(join(module_path, 'descr', 'california_housing.rst')) as dfile: descr = dfile.read() + X = data + y = target + + frame = None + target_names = ["MedHouseVal", ] + if as_frame: + frame, X, y = _convert_data_dataframe("fetch_california_housing", + data, + target, + feature_names, + target_names) + if return_X_y: - return data, target + return X, y - return Bunch(data=data, - target=target, + return Bunch(data=X, + target=y, + frame=frame, + target_names=target_names, feature_names=feature_names, DESCR=descr) diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index ef45226c01f02..56cd62ef8bc35 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -3,8 +3,9 @@ Skipped if california_housing is not already downloaded to data_home. 
""" +import pytest + from sklearn.datasets import fetch_california_housing -from sklearn.utils._testing import SkipTest from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial @@ -13,14 +14,54 @@ def fetch(*args, **kwargs): return fetch_california_housing(*args, download_if_missing=False, **kwargs) -def test_fetch(): +def _is_california_housing_dataset_not_available(): try: - data = fetch() + fetch_california_housing(download_if_missing=False) + return False except IOError: - raise SkipTest("California housing dataset can not be loaded.") + return True + + +@pytest.mark.skipif( + _is_california_housing_dataset_not_available(), + reason='Download California Housing dataset to run this test' +) +def test_fetch(): + data = fetch() assert((20640, 8) == data.data.shape) assert((20640, ) == data.target.shape) # test return_X_y option fetch_func = partial(fetch) check_return_X_y(data, fetch_func) + + +@pytest.mark.skipif( + _is_california_housing_dataset_not_available(), + reason='Download California Housing dataset to run this test' +) +def test_fetch_asframe(): + pd = pytest.importorskip('pandas') + bunch = fetch(as_frame=True) + frame = bunch.frame + assert hasattr(bunch, 'frame') is True + assert frame.shape == (20640, 9) + assert isinstance(bunch.data, pd.DataFrame) + assert isinstance(bunch.target, pd.DataFrame) + + +@pytest.mark.skipif( + _is_california_housing_dataset_not_available(), + reason='Download California Housing dataset to run this test' +) +def test_pandas_dependency_message(): + try: + import pandas # noqa + pytest.skip("This test requires pandas to be not installed") + except ImportError: + # Check that pandas is imported lazily and that an informative error + # message is raised when pandas is missing: + expected_msg = ('fetch_california_housing with as_frame=True' + ' requires pandas') + with pytest.raises(ImportError, match=expected_msg): + fetch_california_housing(as_frame=True)