scikit-learn · gitsteph · Nov 2, 2019 · Nov 2, 2019 · Nov 2, 2019 · Nov 2, 2019
diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py
@@ -16,7 +16,7 @@
 import hashlib
 
 from ..utils import Bunch
-from ..utils import check_random_state
+from ..utils import check_random_state, check_pandas_support
 
 import numpy as np
 
@@ -67,6 +67,17 @@ def clear_data_home(data_home=None):
     shutil.rmtree(data_home)
 
 
+def _convert_data_dataframe(caller_name, data, target,
+                            feature_names, target_names):
+    """Build a combined frame plus its feature (X) and target (y) slices."""
+    pd = check_pandas_support('{} with as_frame=True'.format(caller_name))
+    frame = pd.concat([pd.DataFrame(data, columns=feature_names),
+                       pd.DataFrame(target, columns=target_names)],
+                      axis=1)
+    features, targets = frame[feature_names], frame[target_names]
+    return frame, features, targets
+
+
 def load_files(container_path, description=None, categories=None,
                load_content=True, shuffle=True, encoding=None,
                decode_error='strict', random_state=0):

diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py
@@ -31,6 +31,7 @@
 import joblib
 
 from . import get_data_home
+from ._base import _convert_data_dataframe
 from ._base import _fetch_remote
 from ._base import _pkl_filepath
 from ._base import RemoteFileMetadata
@@ -49,7 +50,7 @@
 
 
 def fetch_california_housing(data_home=None, download_if_missing=True,
-                             return_X_y=False):
+                             return_X_y=False, as_frame=False):
     """Load the California housing dataset (regression).
 
     ==============   ==============
@@ -78,15 +79,22 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
 
         .. versionadded:: 0.20
 
+    as_frame : boolean, default=False
+        If True, the data is a pandas DataFrame including columns with
+        appropriate dtypes (numeric, string or categorical). The target is
+        a pandas DataFrame with a single ``MedHouseVal`` column.
+
     Returns
     -------
     dataset : dict-like object with the following attributes:
 
     dataset.data : ndarray, shape [20640, 8]
         Each row corresponding to the 8 feature values in order.
+        If ``as_frame`` is True, ``data`` is a pandas DataFrame.
 
     dataset.target : numpy array of shape (20640,)
         Each value corresponds to the average house value in units of 100,000.
+        If ``as_frame`` is True, ``target`` is a pandas DataFrame.
 
     dataset.feature_names : array of length 8
         Array of ordered feature names used in the dataset.
@@ -98,6 +106,10 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
 
         .. versionadded:: 0.20
 
+    dataset.frame : pandas DataFrame or None
+        DataFrame combining ``data`` and ``target`` when ``as_frame`` is
+        True; ``None`` otherwise.
+
     Notes
     -----
 
@@ -155,10 +167,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
     with open(join(module_path, 'descr', 'california_housing.rst')) as dfile:
         descr = dfile.read()
 
+    X = data
+    y = target
+
+    frame = None
+    target_names = ["MedHouseVal", ]
+    if as_frame:
+        frame, X, y = _convert_data_dataframe("fetch_california_housing",
+                                              data,
+                                              target,
+                                              feature_names,
+                                              target_names)
+
     if return_X_y:
-        return data, target
+        return X, y
 
-    return Bunch(data=data,
-                 target=target,
+    return Bunch(data=X,
+                 target=y,
+                 frame=frame,
+                 target_names=target_names,
                  feature_names=feature_names,
                  DESCR=descr)
diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py
@@ -3,6 +3,8 @@
 Skipped if california_housing is not already downloaded to data_home.
 """
 
+import pytest
+
 from sklearn.datasets import fetch_california_housing
 from sklearn.utils._testing import SkipTest
 from sklearn.datasets.tests.test_common import check_return_X_y
@@ -13,14 +15,37 @@ def fetch(*args, **kwargs):
     return fetch_california_housing(*args, download_if_missing=False, **kwargs)
 
 
-def test_fetch():
+def _is_california_housing_dataset_not_available():
     try:
-        data = fetch()
+        fetch_california_housing(download_if_missing=False)  # no download
+        return False  # load succeeded, so the dataset is available
     except IOError:
-        raise SkipTest("California housing dataset can not be loaded.")
+        return True  # load raised IOError: treat dataset as unavailable
+
+
+@pytest.mark.skipif(
+    _is_california_housing_dataset_not_available(),
+    reason='Download California Housing dataset to run this test'
+)
+def test_fetch():
+    data = fetch()  # local wrapper: passes download_if_missing=False
     assert((20640, 8) == data.data.shape)
     assert((20640, ) == data.target.shape)
 
     # test return_X_y option
     fetch_func = partial(fetch)
     check_return_X_y(data, fetch_func)
+
+
+@pytest.mark.skipif(
+    _is_california_housing_dataset_not_available(),
+    reason='Download California Housing dataset to run this test'
+)
+def test_fetch_asframe():
+    pd = pytest.importorskip('pandas')
+    bunch = fetch(as_frame=True)
+    # Check the attribute exists before reading it; 8 features + 1 target.
+    assert hasattr(bunch, 'frame')
+    assert bunch.frame.shape == (20640, 9)
+    assert isinstance(bunch.data, pd.DataFrame)
+    assert isinstance(bunch.target, pd.DataFrame)