scikit-learn · rth · Dec 26, 2019 · Nov 2, 2019 · Nov 2, 2019 · Nov 2, 2019
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
@@ -58,6 +58,11 @@ Changelog
   :func:`datasets.make_moons` now accept two-element tuple.
   :pr:`15707` by :user:`Maciej J Mikulski <mjmikulski>`.
 
+- |Feature| :func:`datasets.fetch_california_housing` can now return the
+  data and target as pandas objects by setting `as_frame=True`. :pr:`15950`
+  by :user:`Stephanie Andrews <gitsteph>` and
+  :user:`Reshama Shaikh <reshamas>`.
+
 :mod:`sklearn.feature_extraction`
 .................................
 

diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py
@@ -17,6 +17,7 @@
 
 from ..utils import Bunch
 from ..utils import check_random_state
+from ..utils import check_pandas_support
 
 import numpy as np
 
@@ -67,6 +68,17 @@ def clear_data_home(data_home=None):
     shutil.rmtree(data_home)
 
 
+def _convert_data_dataframe(caller_name, data, target,
+                            feature_names, target_names):
+    """Return (frame, X, y) pandas objects built from data and target."""
+    pd = check_pandas_support('{} with as_frame=True'.format(caller_name))
+    features = pd.DataFrame(data, columns=feature_names)
+    targets = pd.DataFrame(target, columns=target_names)
+    frame = pd.concat([features, targets], axis=1)
+    # X and y are column selections of the combined frame.
+    return frame, frame[feature_names], frame[target_names]
+
+
 def load_files(container_path, description=None, categories=None,
                load_content=True, shuffle=True, encoding=None,
                decode_error='strict', random_state=0):

diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py
@@ -31,6 +31,7 @@
 import joblib
 
 from . import get_data_home
+from ._base import _convert_data_dataframe
 from ._base import _fetch_remote
 from ._base import _pkl_filepath
 from ._base import RemoteFileMetadata
@@ -49,7 +50,7 @@
 
 
 def fetch_california_housing(data_home=None, download_if_missing=True,
-                             return_X_y=False):
+                             return_X_y=False, as_frame=False):
     """Load the California housing dataset (regression).
 
     ==============   ==============
@@ -78,15 +79,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
 
         .. versionadded:: 0.20
 
+    as_frame : boolean, default=False
+        If True, the data is a pandas DataFrame including columns with
+        appropriate dtypes (numeric, string or categorical). The target is
+        a pandas DataFrame or Series depending on the number of target columns.
+
+        .. versionadded:: 0.23
+
     Returns
     -------
     dataset : dict-like object with the following attributes:
 
     dataset.data : ndarray, shape [20640, 8]
         Each row corresponding to the 8 feature values in order.
+        If ``as_frame`` is True, ``data`` is a pandas object.
 
     dataset.target : numpy array of shape (20640,)
         Each value corresponds to the average house value in units of 100,000.
+        If ``as_frame`` is True, ``target`` is a pandas object.
 
     dataset.feature_names : array of length 8
         Array of ordered feature names used in the dataset.
@@ -98,6 +108,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
 
         .. versionadded:: 0.20
 
+    dataset.frame : pandas DataFrame or None
+        DataFrame combining ``data`` and ``target`` when ``as_frame=True``;
+        ``None`` otherwise.
+
+        .. versionadded:: 0.23
+
     Notes
     -----
 
@@ -155,10 +171,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
     with open(join(module_path, 'descr', 'california_housing.rst')) as dfile:
         descr = dfile.read()
 
+    X = data
+    y = target
+
+    frame = None
+    target_names = ["MedHouseVal", ]
+    if as_frame:
+        frame, X, y = _convert_data_dataframe("fetch_california_housing",
+                                              data,
+                                              target,
+                                              feature_names,
+                                              target_names)
+
     if return_X_y:
-        return data, target
+        return X, y
 
-    return Bunch(data=data,
-                 target=target,
+    return Bunch(data=X,
+                 target=y,
+                 frame=frame,
+                 target_names=target_names,
                  feature_names=feature_names,
                  DESCR=descr)
diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py
@@ -3,8 +3,9 @@
 Skipped if california_housing is not already downloaded to data_home.
 """
 
+import pytest
+
 from sklearn.datasets import fetch_california_housing
-from sklearn.utils._testing import SkipTest
 from sklearn.datasets.tests.test_common import check_return_X_y
 from functools import partial
 
@@ -13,14 +14,54 @@ def fetch(*args, **kwargs):
     return fetch_california_housing(*args, download_if_missing=False, **kwargs)
 
 
-def test_fetch():
+def _is_california_housing_dataset_not_available():
     try:
-        data = fetch()
+        fetch_california_housing(download_if_missing=False)
+        return False
     except IOError:
-        raise SkipTest("California housing dataset can not be loaded.")
+        return True
+
+
+@pytest.mark.skipif(
+    _is_california_housing_dataset_not_available(),
+    reason='Download California Housing dataset to run this test'
+)
+def test_fetch():
+    data = fetch()
     assert((20640, 8) == data.data.shape)
     assert((20640, ) == data.target.shape)
 
     # test return_X_y option
     fetch_func = partial(fetch)
     check_return_X_y(data, fetch_func)
+
+
+@pytest.mark.skipif(
+    _is_california_housing_dataset_not_available(),
+    reason='Download California Housing dataset to run this test'
+)
+def test_fetch_asframe():
+    """as_frame=True yields pandas objects for frame, data and target."""
+    pd = pytest.importorskip('pandas')
+    bunch = fetch(as_frame=True)
+    assert isinstance(bunch.frame, pd.DataFrame)
+    assert bunch.frame.shape == (20640, 9)
+    assert isinstance(bunch.data, pd.DataFrame)
+    assert isinstance(bunch.target, pd.DataFrame)
+
+
+@pytest.mark.skipif(
+    _is_california_housing_dataset_not_available(),
+    reason='Download California Housing dataset to run this test'
+)
+def test_pandas_dependency_message():
+    try:
+        import pandas  # noqa
+        pytest.skip("This test requires pandas to be not installed")
+    except ImportError:
+        # Check that pandas is imported lazily and that an informative error
+        # message is raised when pandas is missing. Go through ``fetch`` so
+        expected_msg = ('fetch_california_housing with as_frame=True'
+                        ' requires pandas')  # the test never downloads data.
+        with pytest.raises(ImportError, match=expected_msg):
+            fetch(as_frame=True)