From 6078cc6297bf25c655143516e75ef417c3d6cd7d Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Mon, 1 Jun 2020 11:51:10 -0400
Subject: [PATCH 1/2] WIP: check dataframe column names between fit and predict

---
 sklearn/base.py                               | 22 +++++++++-
 .../gradient_boosting.py                      |  4 +-
 .../tests/test_gradient_boosting.py           |  7 ++++
 sklearn/utils/estimator_checks.py             | 42 +++++++++++++++++++
 sklearn/utils/tests/test_validation.py        | 13 ++++++
 sklearn/utils/validation.py                   |  5 +++
 6 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index a4cc26acc0c9a..5bf9aa4cc15ac 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -17,6 +17,7 @@
 from .utils import _IS_32BIT
 from .utils.validation import check_X_y
 from .utils.validation import check_array
+from .utils.validation import _is_dataframe
 from .utils._estimator_html_repr import estimator_html_repr
 from .utils.validation import _deprecate_positional_args
 
@@ -376,6 +377,21 @@ def _check_n_features(self, X, reset):
                                        self.n_features_in_)
                 )
 
+    def _check_feature_names(self, df, reset):
+        """Set `_feature_names_in` from `df.columns` or check against it.
+
+        Raises ValueError if `reset` is False and the names differ."""
+        if reset:
+            self._feature_names_in = df.columns.values
+        elif (hasattr(self, '_feature_names_in') and not np.array_equal(
+                df.columns.values, self._feature_names_in)):
+            raise ValueError(
+                "The column names of the dataframe must match those "
+                "that were passed during fit(), in the same order. "
+                f"Got ({df.columns.values}), expected "
+                f"({self._feature_names_in})."
+            )
+
     def _validate_data(self, X, y=None, reset=True,
                        validate_separately=False, **check_params):
         """Validate input data and set or check the `n_features_in_` attribute.
@@ -406,9 +422,11 @@ def _validate_data(self, X, y=None, reset=True,
         out : {ndarray, sparse matrix} or tuple of these
             The validated input. A tuple is returned if `y` is not None.
         """
+        is_df = _is_dataframe(X)
+        X_orig = X
 
         if y is None:
-            if self._get_tags()['requires_y']:
+            if reset and self._get_tags()['requires_y']:
                 raise ValueError(
                     f"This {self.__class__.__name__} estimator "
                     f"requires y to be passed, but the target y is None."
@@ -430,6 +448,8 @@ def _validate_data(self, X, y=None, reset=True,
 
         if check_params.get('ensure_2d', True):
             self._check_n_features(X, reset=reset)
+            if is_df:
+                self._check_feature_names(X_orig, reset=reset)
 
         return out
 
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index a057d4f274b50..2c0dd3d66e5e1 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -640,8 +640,8 @@ def _raw_predict(self, X):
         raw_predictions : array, shape (n_trees_per_iteration, n_samples)
             The raw predicted values.
         """
-        X = check_array(X, dtype=[X_DTYPE, X_BINNED_DTYPE],
-                        force_all_finite=False)
+        X = self._validate_data(X, dtype=[X_DTYPE, X_BINNED_DTYPE],
+                                force_all_finite=False, reset=False)
         check_is_fitted(self)
         if X.shape[1] != self.n_features_:
             raise ValueError(
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
index 4a424bac47d63..8742de9331a9d 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -794,3 +794,10 @@ def test_staged_predict(HistGradientBoosting, X, y):
 
             assert_allclose(staged_predictions, pred_aux)
             assert staged_predictions.shape == pred_aux.shape
+
+
+def test_df_column_names():
+    # Run the common column-name consistency check on the HGBDT regressor.
+    from sklearn.utils import estimator_checks
+    check = estimator_checks.check_dataframe_column_names_consistency
+    check('hgbdt', HistGradientBoostingRegressor())
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index bbde6264a1c77..ed137f14c5427 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -2991,3 +2991,45 @@ def check_requires_y_none(name, estimator_orig):
     except ValueError as ve:
         if not any(msg in str(ve) for msg in expected_err_msgs):
             warnings.warn(warning_msg, FutureWarning)
+
+
+def check_dataframe_column_names_consistency(name, estimator_orig):
+    """Check that reordered dataframe columns are rejected after fit."""
+    try:
+        import pandas as pd
+    except ImportError:
+        raise SkipTest(
+            "pandas is not installed: not checking column names consistency"
+        )
+
+    rng = np.random.RandomState(0)
+
+    estimator = clone(estimator_orig)
+    set_random_state(estimator)
+    if 'warm_start' in estimator.get_params():
+        estimator.set_params(warm_start=False)
+
+    n_samples = 100
+    X = rng.normal(loc=100, size=(n_samples, 2))
+    X = _pairwise_estimator_convert_X(X, estimator)
+    X = pd.DataFrame(X)
+
+    if is_regressor(estimator_orig):
+        y = rng.normal(size=n_samples)
+    else:
+        y = rng.randint(low=0, high=2, size=n_samples)
+    y = _enforce_estimator_tags_y(estimator, y)
+
+    estimator.fit(X, y)
+    if hasattr(estimator, '_feature_names_in'):
+        assert_array_equal(estimator._feature_names_in, X.columns.values)
+
+    bad_X = X[X.columns[::-1]]  # same names, reversed order
+    msg = ("column names of the dataframe must match those that were "
+           "passed during fit")
+    for method in ("predict", "transform", "decision_function",
+                   "predict_proba"):
+        if hasattr(estimator, method):
+            getattr(estimator, method)(X)  # original order must pass
+            assert_raises_regex(ValueError, msg, getattr(estimator, method),
+                                bad_X)
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index bcfd8fcd8d50e..5d4d5f37f471e 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -43,6 +43,7 @@
     _deprecate_positional_args,
     _check_sample_weight,
     _allclose_dense_sparse,
+    _is_dataframe,
     FLOAT_DTYPES)
 from sklearn.utils.validation import _check_fit_params
 
@@ -1213,3 +1214,15 @@ def test_check_sparse_pandas_sp_format(sp_format):
     assert sp.issparse(result)
     assert result.format == sp_format
     assert_allclose_dense_sparse(sp_mat, result)
+
+
+def test_is_dataframe():
+    # _is_dataframe accepts pandas DataFrame/Series and nothing else.
+    pd = pytest.importorskip('pandas')
+    data = np.arange(10)
+
+    for pandas_obj in (pd.DataFrame(data), pd.Series(data)):
+        assert _is_dataframe(pandas_obj)
+
+    for other in (data, list(range(10)), 1234, 'still not a df'):
+        assert not _is_dataframe(other)
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 140b8d48a1bcd..d0ff001c34b42 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1378,3 +1378,8 @@ def _check_fit_params(X, fit_params, indices=None):
             )
 
     return fit_params_validated
+
+
+def _is_dataframe(X):
+    # Duck-typed check: True if X exposes `iloc` (pandas DataFrame or Series)
+    return hasattr(X, 'iloc')

From 1c9827b7faa324ddb2b09b0d1de952d0b0ebe83b Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Mon, 1 Jun 2020 12:14:22 -0400
Subject: [PATCH 2/2] maybe fix: call check_is_fitted before _validate_data

---
 sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index 2c0dd3d66e5e1..a5926fe46b90d 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -640,9 +640,9 @@ def _raw_predict(self, X):
         raw_predictions : array, shape (n_trees_per_iteration, n_samples)
             The raw predicted values.
         """
+        check_is_fitted(self)
         X = self._validate_data(X, dtype=[X_DTYPE, X_BINNED_DTYPE],
                                 force_all_finite=False, reset=False)
-        check_is_fitted(self)
         if X.shape[1] != self.n_features_:
             raise ValueError(
                 'X has {} features but this estimator was trained with '