scikit-learn · NicolasHug · Jul 17, 2018 · Jul 17, 2018
diff --git a/sklearn/base.py b/sklearn/base.py
@@ -223,6 +223,54 @@ def set_params(self, **params):
 
         return self
 
+    def _check_column_names(self, X, set_names=False):
+        """Check that the columns of a pandas dataframe X are as expected.
+
+        Inputs that do not look like a pandas dataframe (i.e. that lack an
+        ``iloc`` or a ``columns`` attribute, such as numpy arrays, lists or
+        pandas Series) carry no column names and are never checked.
+
+        This function is aimed to be called in fit() with set_names=True
+        (before check_array()), and then in subsequent methods like predict()
+        or transform() with set_names=False (default).
+
+        Parameters
+        ----------
+        X : {pandas dataframe, object}
+            The input to check. If X is not a pandas dataframe, the function
+            silently exits.
+        set_names : bool, optional (default=False)
+            If True, the column names of X are recorded and become those
+            expected in the subsequent calls to _check_column_names. If X
+            is not a dataframe, any previously recorded names are discarded
+            so that later dataframes are not compared against stale names.
+
+        Raises
+        ------
+        RuntimeError
+            When X is a dataframe and _check_column_names has never been
+            called with set_names to True before.
+        ValueError
+            When the column names of X are not as expected.
+        """
+
+        # 'iloc' alone is not enough: a pandas Series has it too but has no
+        # 'columns', which would make the accesses below fail.
+        is_dataframe = hasattr(X, 'iloc') and hasattr(X, 'columns')
+
+        if set_names:
+            # None records "fitted on an input without column names".
+            self._column_names = list(X.columns) if is_dataframe else None
+            return
+
+        if not is_dataframe:  # nothing to compare, ignore
+            return
+
+        if not hasattr(self, '_column_names'):
+            raise RuntimeError(
+                "You should first call _check_column_names() with "
+                "set_names=True (most likely in fit())."
+            )
+
+        expected_names = self._column_names
+        if expected_names is None:
+            # The estimator was fitted on an input without column names, so
+            # there is no reference to validate this dataframe against.
+            return
+
+        actual_names = list(X.columns)
+        if expected_names != actual_names:
+            raise ValueError(
+                "This estimator was fitted with a pandas dataframe with "
+                "different columns: expected column names are {} but got "
+                "{}. If you are sure your dataframe is correct, you may "
+                "instead pass 'df.values' to explicitly convert the "
+                "dataframe to a numpy array.".format(expected_names,
+                                                     actual_names)
+            )
+
+
     def __repr__(self):
         class_name = self.__class__.__name__
         return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False),

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
@@ -635,6 +635,7 @@ def partial_fit(self, X, y=None):
         y
             Ignored
         """
+        self._check_column_names(X, set_names=True)
         X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
                         warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES,
                         force_all_finite='allow-nan')
@@ -735,6 +736,7 @@ def transform(self, X, y='deprecated', copy=None):
                           "deprecated since 0.19 and will be removed in 0.21",
                           DeprecationWarning)
 
+        self._check_column_names(X)
         check_is_fitted(self, 'scale_')
 
         copy = copy if copy is not None else self.copy
@@ -771,6 +773,7 @@ def inverse_transform(self, X, copy=None):
         X_tr : array-like, shape [n_samples, n_features]
             Transformed array.
         """
+        self._check_column_names(X)
         check_is_fitted(self, 'scale_')
 
         copy = copy if copy is not None else self.copy

diff --git a/sklearn/tests/test_dataframe_column_names.py b/sklearn/tests/test_dataframe_column_names.py
@@ -0,0 +1,74 @@
+# Author: Nicolas Hug
+# License: BSD 3 clause
+
+import numpy as np
+import pytest
+
+from sklearn.base import BaseEstimator
+from sklearn.preprocessing import StandardScaler
+from sklearn.utils.testing import SkipTest
+try:
+    import pandas as pd
+except ImportError:
+    raise SkipTest("Pandas not found")
+
+
+def test_check_column_names():
+    # Checking column names in transform()/predict() must raise a
+    # RuntimeError when fit() never recorded them.
+
+    class ForgetfulEstimator(BaseEstimator):
+        # fit() deliberately omits _check_column_names(X, set_names=True)
+
+        def fit(self, X, y=None):
+            return self
+
+        def transform(self, X):
+            self._check_column_names(X)
+            return X
+
+        def predict(self, X):
+            self._check_column_names(X)
+            return np.zeros(shape=X.shape[0])
+
+    values = np.arange(-1, 1, .1)
+    frame = pd.DataFrame({'a': values, 'b': values})
+
+    estimator = ForgetfulEstimator()
+    estimator.fit(frame)
+    for method in (estimator.transform, estimator.predict):
+        with pytest.raises(RuntimeError):
+            method(frame)
+
+
+def test_column_names_standard_scaler():
+    # StandardScaler must reject dataframes whose columns differ from the
+    # ones seen in fit() / partial_fit().
+    df = pd.DataFrame({'a': np.arange(-1, 1, .1),
+                       'b': np.arange(-1, 1, .1)})
+    df2 = pd.DataFrame({'c': np.arange(-1, 1, .1),
+                        'd': np.arange(-1, 1, .1)})
+    # axis=1 appends the columns of df2 to those of df. The default
+    # (axis=0) would stack rows instead, padding with NaN and triggering a
+    # pandas sort warning for the non-aligned columns.
+    df_extra_columns = pd.concat([df, df2], axis=1)
+
+    ss = StandardScaler()
+
+    for function in (ss.fit, ss.partial_fit):
+        function(df)
+        ss.transform(df)  # all is fine
+        ss.inverse_transform(df)  # all is fine
+
+        # different column order
+        with pytest.raises(ValueError):
+            ss.transform(df[['b', 'a']])
+        with pytest.raises(ValueError):
+            ss.inverse_transform(df[['b', 'a']])
+
+        # completely different names
+        with pytest.raises(ValueError):
+            ss.transform(df2)
+        with pytest.raises(ValueError):
+            ss.inverse_transform(df2)
+
+        # column order OK but unknown columns
+        with pytest.raises(ValueError):
+            ss.transform(df_extra_columns)
+        with pytest.raises(ValueError):
+            ss.inverse_transform(df_extra_columns)