From 779385f2fa1d8497d19988667560015aad1475b7 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Sun, 26 Jul 2020 21:56:51 -0400
Subject: [PATCH 01/52] ENH Adds column name consistency

---
 sklearn/base.py                            | 42 +++++++++++-
 sklearn/linear_model/_base.py              |  6 +-
 sklearn/tests/test_base.py                 | 25 ++++++++
 sklearn/tests/test_common.py               | 19 ++++++
 sklearn/tests/test_docstring_parameters.py |  2 +-
 sklearn/tests/test_extarray.py             | 27 ++++++++
 sklearn/utils/_extarray.py                 | 16 +++++
 sklearn/utils/estimator_checks.py          | 75 ++++++++++++++++++++++
 8 files changed, 207 insertions(+), 5 deletions(-)
 create mode 100644 sklearn/tests/test_extarray.py
 create mode 100644 sklearn/utils/_extarray.py

diff --git a/sklearn/base.py b/sklearn/base.py
index 46398baabfd3a..2961a5a50707f 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -19,6 +19,7 @@
 from .utils.validation import check_array
 from .utils._estimator_html_repr import estimator_html_repr
 from .utils.validation import _deprecate_positional_args
+from .utils._extarray import _get_feature_names
 
 _DEFAULT_TAGS = {
     'non_deterministic': False,
@@ -376,6 +377,44 @@ def _check_n_features(self, X, reset):
                                        self.n_features_in_)
                 )
 
+    def _check_feature_names(self, X, reset=True):
+        """Validate feature names and set or check the `feature_names_in_`
+        attribute.
+
+        Parameters
+        ----------
+        X : {dataframe-like} of shape (n_samples, n_features)
+            The input samples.
+        reset : bool, default=True
+            Whether to reset the `feature_names_in_` attribute.
+            If False, the Input will be checked for consistency with data
+            provided when reset was last True.
+        """
+
+        if reset:
+            self.feature_names_in_ = _get_feature_names(X)
+            return
+
+        fitted_feature_names = getattr(self, "feature_names_in_", None)
+        if fitted_feature_names is None:
+            # no feature names to check
+            return
+
+        feature_names_in = _get_feature_names(X)
+        if feature_names_in is None:
+            # X does not have feature names but estimator was fitted with
+            # data with feature names
+            return
+
+        # valid the `feature_names_in_` attribute
+        if (len(fitted_feature_names) != len(feature_names_in) or
+                np.any(fitted_feature_names != feature_names_in)):
+            warnings.warn("The column names should match those that were "
+                          "passed during fit(), in the same order. Got "
+                          f"({feature_names_in}) expected "
+                          f"({fitted_feature_names}). Starting version 0.26, "
+                          "an error will be raised", FutureWarning)
+
     def _validate_data(self, X, y=None, reset=True,
                        validate_separately=False, **check_params):
         """Validate input data and set or check the `n_features_in_` attribute.
@@ -406,9 +445,10 @@ def _validate_data(self, X, y=None, reset=True,
         out : {ndarray, sparse matrix} or tuple of these
             The validated input. A tuple is returned if `y` is not None.
         """
+        self._check_feature_names(X, reset=reset)
 
         if y is None:
-            if self._get_tags()['requires_y']:
+            if reset and self._get_tags()['requires_y']:
                 raise ValueError(
                     f"This {self.__class__.__name__} estimator "
                     f"requires y to be passed, but the target y is None."
diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index 4ab797578dbde..c922480875f9d 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -214,8 +214,8 @@ def fit(self, X, y):
 
     def _decision_function(self, X):
         check_is_fitted(self)
-
-        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
+        X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
+                                reset=False)
         return safe_sparse_dot(X, self.coef_.T,
                                dense_output=True) + self.intercept_
 
@@ -279,7 +279,7 @@ class would be predicted.
         """
         check_is_fitted(self)
 
-        X = check_array(X, accept_sparse='csr')
+        X = self._validate_data(X, accept_sparse='csr', reset=False)
 
         n_features = self.coef_.shape[1]
         if X.shape[1] != n_features:
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index db5c88051346a..51dfbd163a312 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -16,6 +16,8 @@
 from sklearn.svm import SVC
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import GridSearchCV
+from sklearn.linear_model import LinearRegression
+from sklearn.datasets import make_regression
 
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.tree import DecisionTreeRegressor
@@ -537,3 +539,26 @@ def test_repr_html_wraps():
     with config_context(display='diagram'):
         output = tree._repr_html_()
         assert "<style>" in output
+
+
+def test_check_feature_names():
+    # check cases when feature names are not avaliable
+    pd = pytest.importorskip("pandas")
+    X, y = make_regression(random_state=42)
+    est = LinearRegression()
+    est.fit(X, y)
+
+    assert est.feature_names_in_ is None
+    with pytest.warns(None) as record:
+        est.predict(X)
+    assert not record
+
+    # does not check names
+    names = [f"col_{i}" for i in range(X.shape[1])]
+    df = pd.DataFrame(X, columns=names)
+    est.fit(df, y)
+
+    assert_array_equal(est.feature_names_in_, names)
+    with pytest.warns(None) as record:
+        est.predict(X)
+    assert not record
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index b9f50a76f7b30..d2aa30c57637a 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -27,12 +27,15 @@
 
 from sklearn.linear_model._base import LinearClassifierMixin
 from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LinearRegression
 from sklearn.utils import IS_PYPY
 from sklearn.utils._testing import SkipTest
 from sklearn.utils.estimator_checks import (
     _construct_instance,
     _set_checking_parameters,
     _set_check_estimator_ids,
+    check_dataframe_column_names_consistency,
+    check_dataarray_column_name_consistency,
     check_class_weight_balanced_linear_classifier,
     parametrize_with_checks)
 
@@ -204,3 +207,19 @@ def test_class_support_removed():
 
     with pytest.raises(TypeError, match=msg):
         parametrize_with_checks([LogisticRegression])
+
+
+@pytest.mark.parametrize('estimator', [LogisticRegression(),
+                                       LinearRegression()],
+                         ids=_set_check_estimator_ids)
+def test_pandas_column_name_consistency(estimator):
+    name = estimator.__class__.__name__
+    check_dataframe_column_names_consistency(name, estimator)
+
+
+@pytest.mark.parametrize('estimator', [LogisticRegression(),
+                                       LinearRegression()],
+                         ids=_set_check_estimator_ids)
+def test_xarray_column_name_consistency(estimator):
+    name = estimator.__class__.__name__
+    check_dataarray_column_name_consistency(name, estimator)
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
index a48af83b15a7a..94d3958f50540 100644
--- a/sklearn/tests/test_docstring_parameters.py
+++ b/sklearn/tests/test_docstring_parameters.py
@@ -217,7 +217,7 @@ def test_fit_docstring_attributes(name, Estimator):
     else:
         est.fit(X, y)
 
-    skipped_attributes = {'n_features_in_'}
+    skipped_attributes = {'n_features_in_', 'feature_names_in_'}
 
     for attr in attributes:
         if attr.name in skipped_attributes:
diff --git a/sklearn/tests/test_extarray.py b/sklearn/tests/test_extarray.py
new file mode 100644
index 0000000000000..ef148c08f89a2
--- /dev/null
+++ b/sklearn/tests/test_extarray.py
@@ -0,0 +1,27 @@
+import pytest
+
+import numpy as np
+from numpy.testing import assert_array_equal
+
+from sklearn.utils._extarray import _get_feature_names
+
+
+def _construct_array(array_type, column_names):
+    X = np.array([[1, 2, 3], [4, 5, 6]], dtype=float)
+
+    if array_type == "dataframe":
+        pd = pytest.importorskip("pandas")
+        return pd.DataFrame(X, columns=column_names)
+    elif array_type == "dataarray":
+        xr = pytest.importorskip("xarray")
+        return xr.DataArray(X, dims=('index', 'columns'),
+                            coords={'columns': column_names})
+
+
+@pytest.mark.parametrize("array_type", ["dataframe", "dataarray"])
+def test_pandas_get_feature_names(array_type):
+    column_names = [f'col_{i}' for i in range(3)]
+    X = _construct_array(array_type, column_names)
+    names = _get_feature_names(X)
+
+    assert_array_equal(names, column_names)
diff --git a/sklearn/utils/_extarray.py b/sklearn/utils/_extarray.py
new file mode 100644
index 0000000000000..2ea523db74471
--- /dev/null
+++ b/sklearn/utils/_extarray.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+
+def _get_feature_names(X):
+    """Get feature names from X.
+
+    Supports:
+       - pandas DataFrame
+       - xarray DataArray
+       - Return None for unrecognized array containers
+    """
+    if hasattr(X, "columns"):  # pandas
+        return np.array(X.columns, dtype=object)
+    elif hasattr(X, "dims") and isinstance(X.dims, tuple) and len(X.dims) == 2:
+        # xarray DataArray
+        return np.array(X.coords[X.dims[1]], dtype=object)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index ea85566f68988..01b6f5c63514b 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -2988,3 +2988,78 @@ def check_requires_y_none(name, estimator_orig):
     except ValueError as ve:
         if not any(msg in str(ve) for msg in expected_err_msgs):
             warnings.warn(warning_msg, FutureWarning)
+
+
+def check_dataframe_column_names_consistency(name, estimator_orig):
+    try:
+        import pandas as pd
+    except ImportError:
+        raise SkipTest("pandas is not installed: not checking "
+                       "column name consistency for pandas")
+
+    def _construct_dataframe(X, columns):
+        return pd.DataFrame(X, columns=columns)
+    _check_column_name_consistency(name, estimator_orig, _construct_dataframe)
+
+
+def check_dataarray_column_name_consistency(name, estimator_orig):
+    try:
+        import xarray as xr
+    except ImportError:
+        raise SkipTest("xarray is not installed: not checking "
+                       "column name consistency for xarray")
+
+    def _construct_xarray(X, columns):
+        return xr.DataArray(X, dims=('index', 'columns'),
+                            coords={'columns': columns})
+    _check_column_name_consistency(name, estimator_orig, _construct_xarray)
+
+
+def _check_column_name_consistency(name, estimator_orig, construct_X):
+    estimator = clone(estimator_orig)
+
+    tags = estimator._get_tags()
+    if "2darray" not in tags["X_types"]:
+        warnings.warn("Can't test estimator {} which requires input "
+                      " of type {}".format(name, tags["X_types"]),
+                      SkipTestWarning)
+        return
+
+    set_random_state(estimator)
+    if 'warm_start' in estimator.get_params():
+        estimator.set_params(warm_start=False)
+
+    X_orig, _ = make_regression(random_state=0, n_features=10)
+    X_orig = _enforce_estimator_tags_x(estimator, X_orig)
+    X_orig = _pairwise_estimator_convert_X(X_orig, estimator)
+
+    n_samples, n_features = X_orig.shape
+    names = np.array([f"col_{i}" for i in range(n_features)])
+    X = construct_X(X_orig, names)
+
+    rng = np.random.RandomState(0)
+    if is_regressor(estimator):
+        y = rng.normal(size=n_samples)
+    else:
+        y = rng.randint(low=0, high=2, size=n_samples)
+    y = _enforce_estimator_tags_y(estimator, y)
+
+    estimator.fit(X, y)
+    if not hasattr(estimator, 'feature_names_in_'):
+        return
+
+    assert_array_equal(estimator.feature_names_in_, names)
+    bad_names = names[::-1]
+    X_bad = construct_X(X, bad_names)
+
+    expected_msg = ("The column names should match those that were passed "
+                    f"during fit(), in the same order. Got ({bad_names}) "
+                    f"expected ({names}). Starting version 0.26, an error "
+                    "will be raised")
+    for method in ("predict", "transform", "decision_function",
+                   "predict_proba"):
+        func = getattr(estimator, method, None)
+        if func is None:
+            continue
+        # TODO In 0.26, this will be an error.
+        assert_warns_message(FutureWarning, expected_msg, func, X_bad)

From a579392af5e54863f959b5083fc5fd64ab2313b0 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Sun, 26 Jul 2020 22:20:01 -0400
Subject: [PATCH 02/52] BUG Sets n_features_in_ in _log_reg_scoring_path

---
 sklearn/linear_model/_logistic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py
index da86a1755c2f1..047ab2e56e97d 100644
--- a/sklearn/linear_model/_logistic.py
+++ b/sklearn/linear_model/_logistic.py
@@ -1000,6 +1000,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
             log_reg.coef_ = w
             log_reg.intercept_ = 0.
 
+        log_reg.n_features_in_ = X_train.shape[1]
         if scoring is None:
             scores.append(log_reg.score(X_test, y_test))
         else:

From 74368fd00dd1269750d6983801bf6aa356ec5e61 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Sun, 26 Jul 2020 23:23:03 -0400
Subject: [PATCH 03/52] BUG Sets n_features_in_ in SGD scorer and _partial_fit

---
 sklearn/ensemble/tests/test_stacking.py      | 5 ++---
 sklearn/linear_model/_stochastic_gradient.py | 9 +++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py
index 88dd76cb0b49d..6a0ac0889f6ec 100644
--- a/sklearn/ensemble/tests/test_stacking.py
+++ b/sklearn/ensemble/tests/test_stacking.py
@@ -505,9 +505,8 @@ def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator):
 
     class MyEstimator(Estimator):
         """Estimator without n_features_in_"""
-        def fit(self, X, y):
-            super().fit(X, y)
-            del self.n_features_in_
+        def _check_n_features(self, X, reset):
+            pass
 
     X, y = make_dataset(random_state=0, n_samples=100)
     stacker = Stacking(estimators=[('lr', MyEstimator())])
diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py
index 4c772c0ff79a3..6ef46734ca397 100644
--- a/sklearn/linear_model/_stochastic_gradient.py
+++ b/sklearn/linear_model/_stochastic_gradient.py
@@ -64,6 +64,7 @@ def __call__(self, coef, intercept):
         est = self.estimator
         est.coef_ = coef.reshape(1, -1)
         est.intercept_ = np.atleast_1d(intercept)
+        est.n_features_in_ = coef.size
         return est.score(self.X_val, self.y_val, self.sample_weight_val)
 
 
@@ -487,13 +488,13 @@ def _partial_fit(self, X, y, alpha, C,
                      loss, learning_rate, max_iter,
                      classes, sample_weight,
                      coef_init, intercept_init):
-        X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
-                         order="C", accept_large_sparse=False)
+        first_call = _check_partial_fit_first_call(self, classes)
+        X, y = self._validate_data(
+            X, y, accept_sparse='csr', dtype=np.float64,
+            order="C", accept_large_sparse=False, reset=first_call)
 
         n_samples, n_features = X.shape
 
-        _check_partial_fit_first_call(self, classes)
-
         n_classes = self.classes_.shape[0]
 
         # Allocate datastructures from input arguments

From bce5d0f6e8b451bc89c20a2d95ae5b2be0f92d7d Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Sun, 26 Jul 2020 23:37:09 -0400
Subject: [PATCH 04/52] STY Flake8

---
 sklearn/linear_model/_stochastic_gradient.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py
index 6ef46734ca397..417aeeffa5857 100644
--- a/sklearn/linear_model/_stochastic_gradient.py
+++ b/sklearn/linear_model/_stochastic_gradient.py
@@ -15,7 +15,7 @@
 from ._base import LinearClassifierMixin, SparseCoefMixin
 from ._base import make_dataset
 from ..base import BaseEstimator, RegressorMixin
-from ..utils import check_array, check_random_state, check_X_y
+from ..utils import check_array, check_random_state
 from ..utils.extmath import safe_sparse_dot
 from ..utils.multiclass import _check_partial_fit_first_call
 from ..utils.validation import check_is_fitted, _check_sample_weight

From 2ca4dbfdc1565fb3b0bffc6229a5f1a9148a2021 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 27 Jul 2020 09:42:57 -0400
Subject: [PATCH 05/52] MNT Adds xarray

---
 build_tools/azure/install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index 2a9a4544aca9a..25ca9f43b27c6 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -63,7 +63,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
     make_conda "python=$PYTHON_VERSION"
     python -m pip install -U pip
 
-    python -m pip install pandas matplotlib pyamg scikit-image
+    python -m pip install pandas matplotlib pyamg scikit-image xarray
     # do not install dependencies for lightgbm since it requires scikit-learn
     python -m pip install lightgbm --no-deps
 elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then

From 7465ec2f8016f8a4ff2090286278c6b644754d91 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 3 Sep 2020 23:18:56 -0400
Subject: [PATCH 06/52] CLN Smaller diff

---
 sklearn/base.py                               |  2 +-
 sklearn/dummy.py                              |  8 ++-
 sklearn/ensemble/tests/test_stacking.py       |  5 +-
 sklearn/impute/_base.py                       |  2 +-
 sklearn/impute/_iterative.py                  | 11 ++-
 sklearn/impute/_knn.py                        | 10 +--
 sklearn/linear_model/_logistic.py             |  1 -
 sklearn/linear_model/_stochastic_gradient.py  | 11 ++-
 sklearn/tests/test_base.py                    | 25 -------
 sklearn/tests/test_common.py                  | 67 ++++++++++++++-----
 sklearn/tests/test_extarray.py                |  2 +-
 sklearn/utils/{_extarray.py => _array_out.py} |  0
 sklearn/utils/estimator_checks.py             | 22 +++---
 13 files changed, 94 insertions(+), 72 deletions(-)
 rename sklearn/utils/{_extarray.py => _array_out.py} (100%)

diff --git a/sklearn/base.py b/sklearn/base.py
index c65a90c063ef7..95f8d124692e8 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -19,7 +19,7 @@
 from .utils.validation import check_array
 from .utils._estimator_html_repr import estimator_html_repr
 from .utils.validation import _deprecate_positional_args
-from .utils._extarray import _get_feature_names
+from .utils._array_out import _get_feature_names
 
 _DEFAULT_TAGS = {
     'non_deterministic': False,
diff --git a/sklearn/dummy.py b/sklearn/dummy.py
index 4773ccbd1f92f..9d656ccbc6282 100644
--- a/sklearn/dummy.py
+++ b/sklearn/dummy.py
@@ -147,7 +147,9 @@ def fit(self, X, y, sample_weight=None):
 
         self.n_outputs_ = y.shape[1]
 
-        self.n_features_in_ = None  # No input validation is done for X
+        # No input validation is done for X
+        self.n_features_in_ = None
+        self.feature_names_in_ = None
 
         check_consistent_length(X, y)
 
@@ -472,7 +474,9 @@ def fit(self, X, y, sample_weight=None):
                              % (self.strategy, allowed_strategies))
 
         y = check_array(y, ensure_2d=False)
-        self.n_features_in_ = None  # No input validation is done for X
+        # No input validation is done for X
+        self.n_features_in_ = None
+        self.feature_names_in_ = None
         if len(y) == 0:
             raise ValueError("y must not be empty.")
 
diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py
index 6a0ac0889f6ec..88dd76cb0b49d 100644
--- a/sklearn/ensemble/tests/test_stacking.py
+++ b/sklearn/ensemble/tests/test_stacking.py
@@ -505,8 +505,9 @@ def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator):
 
     class MyEstimator(Estimator):
         """Estimator without n_features_in_"""
-        def _check_n_features(self, X, reset):
-            pass
+        def fit(self, X, y):
+            super().fit(X, y)
+            del self.n_features_in_
 
     X, y = make_dataset(random_state=0, n_samples=100)
     stacker = Stacking(estimators=[('lr', MyEstimator())])
diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py
index e1334d1980fa0..25f2580b12af3 100644
--- a/sklearn/impute/_base.py
+++ b/sklearn/impute/_base.py
@@ -793,7 +793,7 @@ def transform(self, X):
         # Need not validate X again as it would have already been validated
         # in the Imputer calling MissingIndicator
         if not self._precomputed:
-            X = self._validate_input(X, in_fit=True)
+            X = self._validate_input(X, in_fit=False)
         else:
             if not (hasattr(X, 'dtype') and X.dtype.kind == 'b'):
                 raise ValueError("precomputed is True but the input data is "
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py
index 46c0dea06cbcd..0f85e7853fad5 100644
--- a/sklearn/impute/_iterative.py
+++ b/sklearn/impute/_iterative.py
@@ -468,7 +468,7 @@ def _get_abs_corr_mat(self, X_filled, tolerance=1e-6):
         abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False)
         return abs_corr_mat
 
-    def _initial_imputation(self, X):
+    def _initial_imputation(self, X, in_fit=True):
         """Perform initial imputation for input X.
 
         Parameters
@@ -477,6 +477,9 @@ def _initial_imputation(self, X):
             Input data, where "n_samples" is the number of samples and
             "n_features" is the number of features.
 
+        in_fit : bool, default=True
+            Whether the imputation is done in fit.
+
         Returns
         -------
         Xt : ndarray, shape (n_samples, n_features)
@@ -501,7 +504,8 @@ def _initial_imputation(self, X):
             force_all_finite = True
 
         X = self._validate_data(X, dtype=FLOAT_DTYPES, order="F",
-                                force_all_finite=force_all_finite)
+                                force_all_finite=force_all_finite,
+                                reset=in_fit)
         _check_inputs_dtype(X, self.missing_values)
 
         X_missing_mask = _get_mask(X, self.missing_values)
@@ -695,7 +699,8 @@ def transform(self, X):
         """
         check_is_fitted(self)
 
-        X, Xt, mask_missing_values, complete_mask = self._initial_imputation(X)
+        X, Xt, mask_missing_values, complete_mask = \
+            self._initial_imputation(X, in_fit=False)
 
         X_indicator = super()._transform_indicator(complete_mask)
 
diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py
index df66e4a20aff6..c4b407fdd66e7 100644
--- a/sklearn/impute/_knn.py
+++ b/sklearn/impute/_knn.py
@@ -10,7 +10,6 @@
 from ..metrics.pairwise import _NAN_METRICS
 from ..neighbors._base import _get_weights
 from ..neighbors._base import _check_weights
-from ..utils import check_array
 from ..utils import is_scalar_nan
 from ..utils._mask import _get_mask
 from ..utils.validation import check_is_fitted
@@ -213,12 +212,9 @@ def transform(self, X):
             force_all_finite = True
         else:
             force_all_finite = "allow-nan"
-        X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
-                        force_all_finite=force_all_finite, copy=self.copy)
-
-        if X.shape[1] != self._fit_X.shape[1]:
-            raise ValueError("Incompatible dimension between the fitted "
-                             "dataset and the one to be transformed")
+        X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES,
+                                force_all_finite=force_all_finite,
+                                copy=self.copy, reset=False)
 
         mask = _get_mask(X, self.missing_values)
         mask_fit_X = self._mask_fit_X
diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py
index 4c22ea6d4396e..1370c8f32cf2f 100644
--- a/sklearn/linear_model/_logistic.py
+++ b/sklearn/linear_model/_logistic.py
@@ -1000,7 +1000,6 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
             log_reg.coef_ = w
             log_reg.intercept_ = 0.
 
-        log_reg.n_features_in_ = X_train.shape[1]
         if scoring is None:
             scores.append(log_reg.score(X_test, y_test))
         else:
diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py
index 958ffb2e0128d..f48f7e8b1514e 100644
--- a/sklearn/linear_model/_stochastic_gradient.py
+++ b/sklearn/linear_model/_stochastic_gradient.py
@@ -15,7 +15,7 @@
 from ._base import LinearClassifierMixin, SparseCoefMixin
 from ._base import make_dataset
 from ..base import BaseEstimator, RegressorMixin
-from ..utils import check_array, check_random_state
+from ..utils import check_array, check_random_state, check_X_y
 from ..utils.extmath import safe_sparse_dot
 from ..utils.multiclass import _check_partial_fit_first_call
 from ..utils.validation import check_is_fitted, _check_sample_weight
@@ -64,7 +64,6 @@ def __call__(self, coef, intercept):
         est = self.estimator
         est.coef_ = coef.reshape(1, -1)
         est.intercept_ = np.atleast_1d(intercept)
-        est.n_features_in_ = coef.size
         return est.score(self.X_val, self.y_val, self.sample_weight_val)
 
 
@@ -488,13 +487,13 @@ def _partial_fit(self, X, y, alpha, C,
                      loss, learning_rate, max_iter,
                      classes, sample_weight,
                      coef_init, intercept_init):
-        first_call = _check_partial_fit_first_call(self, classes)
-        X, y = self._validate_data(
-            X, y, accept_sparse='csr', dtype=np.float64,
-            order="C", accept_large_sparse=False, reset=first_call)
+        X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
+                         order="C", accept_large_sparse=False)
 
         n_samples, n_features = X.shape
 
+        _check_partial_fit_first_call(self, classes)
+
         n_classes = self.classes_.shape[0]
 
         # Allocate datastructures from input arguments
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 51dfbd163a312..db5c88051346a 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -16,8 +16,6 @@
 from sklearn.svm import SVC
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import GridSearchCV
-from sklearn.linear_model import LinearRegression
-from sklearn.datasets import make_regression
 
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.tree import DecisionTreeRegressor
@@ -539,26 +537,3 @@ def test_repr_html_wraps():
     with config_context(display='diagram'):
         output = tree._repr_html_()
         assert "<style>" in output
-
-
-def test_check_feature_names():
-    # check cases when feature names are not avaliable
-    pd = pytest.importorskip("pandas")
-    X, y = make_regression(random_state=42)
-    est = LinearRegression()
-    est.fit(X, y)
-
-    assert est.feature_names_in_ is None
-    with pytest.warns(None) as record:
-        est.predict(X)
-    assert not record
-
-    # does not check names
-    names = [f"col_{i}" for i in range(X.shape[1])]
-    df = pd.DataFrame(X, columns=names)
-    est.fit(df, y)
-
-    assert_array_equal(est.feature_names_in_, names)
-    with pytest.warns(None) as record:
-        est.predict(X)
-    assert not record
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index f618dd650b314..4b7fa058bb023 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -29,7 +29,6 @@
 from sklearn.utils.validation import check_non_negative, check_array
 from sklearn.linear_model._base import LinearClassifierMixin
 from sklearn.linear_model import LogisticRegression
-from sklearn.linear_model import LinearRegression
 from sklearn.svm import NuSVC
 from sklearn.utils import IS_PYPY
 from sklearn.utils._testing import SkipTest
@@ -42,6 +41,7 @@
     check_dataframe_column_names_consistency,
     check_dataarray_column_name_consistency)
 
+
 def test_all_estimator_no_base_class():
     # test that all_estimators doesn't find abstract classes.
     for name, Estimator in all_estimators():
@@ -272,17 +272,54 @@ def test_strict_mode_parametrize_with_checks(estimator, check):
     check(estimator)
 
 
-@pytest.mark.parametrize('estimator', [LogisticRegression(),
-                                       LinearRegression()],
-                         ids=_set_check_estimator_ids)
-def test_pandas_column_name_consistency(estimator):
-    name = estimator.__class__.__name__
-    check_dataframe_column_names_consistency(name, estimator)
-
-
-@pytest.mark.parametrize('estimator', [LogisticRegression(),
-                                       LinearRegression()],
-                         ids=_set_check_estimator_ids)
-def test_xarray_column_name_consistency(estimator):
-    name = estimator.__class__.__name__
-    check_dataarray_column_name_consistency(name, estimator)
+column_name_modules_to_ignore = {
+    'calibration',
+    'cluster',
+    'compose',
+    'covariance',
+    'cross_decomposition',
+    'decomposition',
+    'discriminant_analysis',
+    'ensemble',
+    'feature_extraction',
+    'feature_selection',
+    'gaussian_process',
+    'isotonic',
+    'kernel_approximation',
+    'kernel_ridge',
+    'linear_model',
+    'manifold',
+    'mixture',
+    'model_selection',
+    'multiclass',
+    'multioutput',
+    'naive_bayes',
+    'neighbors',
+    'neural_network',
+    'pipeline',
+    'preprocessing',
+    'random_projection',
+    'semi_supervised',
+    'svm',
+    'tree',
+}
+
+column_name_estimators = [
+    est for _, est in all_estimators()
+    if est.__module__.split('.')[1] not in column_name_modules_to_ignore]
+
+
+@pytest.mark.parametrize('Estimator', column_name_estimators,
+                         ids=_get_check_estimator_ids)
+def test_pandas_column_name_consistency(Estimator):
+    estimator = _construct_instance(Estimator)
+    _set_checking_parameters(estimator)
+    check_dataframe_column_names_consistency(Estimator.__name__, estimator)
+
+
+@pytest.mark.parametrize('Estimator', column_name_estimators,
+                         ids=_get_check_estimator_ids)
+def test_xarray_column_name_consistency(Estimator):
+    estimator = _construct_instance(Estimator)
+    _set_checking_parameters(estimator)
+    check_dataarray_column_name_consistency(Estimator.__name__, estimator)
diff --git a/sklearn/tests/test_extarray.py b/sklearn/tests/test_extarray.py
index ef148c08f89a2..76a6a0a03a539 100644
--- a/sklearn/tests/test_extarray.py
+++ b/sklearn/tests/test_extarray.py
@@ -3,7 +3,7 @@
 import numpy as np
 from numpy.testing import assert_array_equal
 
-from sklearn.utils._extarray import _get_feature_names
+from sklearn.utils._array_out import _get_feature_names
 
 
 def _construct_array(array_type, column_names):
diff --git a/sklearn/utils/_extarray.py b/sklearn/utils/_array_out.py
similarity index 100%
rename from sklearn/utils/_extarray.py
rename to sklearn/utils/_array_out.py
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index f2aa4cc699747..d5dc583c06a17 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3098,7 +3098,8 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
 
     def _construct_dataframe(X, columns):
         return pd.DataFrame(X, columns=columns)
-    _check_column_name_consistency(name, estimator_orig, _construct_dataframe)
+    _check_column_name_consistency(name, estimator_orig, _construct_dataframe,
+                                   "dataframe")
 
 
 def check_dataarray_column_name_consistency(name, estimator_orig):
@@ -3111,10 +3112,12 @@ def check_dataarray_column_name_consistency(name, estimator_orig):
     def _construct_xarray(X, columns):
         return xr.DataArray(X, dims=('index', 'columns'),
                             coords={'columns': columns})
-    _check_column_name_consistency(name, estimator_orig, _construct_xarray)
+    _check_column_name_consistency(name, estimator_orig, _construct_xarray,
+                                   "xarray")
 
 
-def _check_column_name_consistency(name, estimator_orig, construct_X):
+def _check_column_name_consistency(name, estimator_orig, construct_X,
+                                   array_name):
     estimator = clone(estimator_orig)
 
     tags = estimator._get_tags()
@@ -3124,10 +3127,6 @@ def _check_column_name_consistency(name, estimator_orig, construct_X):
                       SkipTestWarning)
         return
 
-    set_random_state(estimator)
-    if 'warm_start' in estimator.get_params():
-        estimator.set_params(warm_start=False)
-
     X_orig, _ = make_regression(random_state=0, n_features=10)
     X_orig = _enforce_estimator_tags_x(estimator, X_orig)
     X_orig = _pairwise_estimator_convert_X(X_orig, estimator)
@@ -3144,7 +3143,13 @@ def _check_column_name_consistency(name, estimator_orig, construct_X):
     y = _enforce_estimator_tags_y(estimator, y)
 
     estimator.fit(X, y)
-    if not hasattr(estimator, 'feature_names_in_'):
+
+    if not hasattr(estimator, "feature_names_in_"):
+        raise ValueError("Estimator does not have a feature_names_in_ "
+                         f"attribute after fitting with a {array_name}")
+
+    if estimator.feature_names_in_ is None:
+        # no names to check
         return
 
     assert_array_equal(estimator.feature_names_in_, names)
@@ -3163,6 +3168,7 @@ def _check_column_name_consistency(name, estimator_orig, construct_X):
         # TODO In 0.26, this will be an error.
         assert_warns_message(FutureWarning, expected_msg, func, X_bad)
 
+
 # set of checks that are completely strict, i.e. they have no non-strict part
 _FULLY_STRICT_CHECKS = set([
     'check_n_features_in',

From 19583bc37f5148313a9571ec40156d51347a7d0f Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 4 Sep 2020 11:03:12 -0400
Subject: [PATCH 07/52] CLN Smaller diff

---
 sklearn/linear_model/_base.py                         | 6 +++---
 sklearn/tests/{test_extarray.py => test_array_out.py} | 0
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename sklearn/tests/{test_extarray.py => test_array_out.py} (100%)

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index 2f7c2a128ed78..9d165829c5e7e 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -215,8 +215,8 @@ def fit(self, X, y):
 
     def _decision_function(self, X):
         check_is_fitted(self)
-        X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
-                                reset=False)
+
+        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
         return safe_sparse_dot(X, self.coef_.T,
                                dense_output=True) + self.intercept_
 
@@ -280,7 +280,7 @@ class would be predicted.
         """
         check_is_fitted(self)
 
-        X = self._validate_data(X, accept_sparse='csr', reset=False)
+        X = check_array(X, accept_sparse='csr')
 
         n_features = self.coef_.shape[1]
         if X.shape[1] != n_features:
diff --git a/sklearn/tests/test_extarray.py b/sklearn/tests/test_array_out.py
similarity index 100%
rename from sklearn/tests/test_extarray.py
rename to sklearn/tests/test_array_out.py

From 4d0840a2836d611a4267cd8009bc7c9d560eb824 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 4 Sep 2020 14:33:54 -0400
Subject: [PATCH 08/52] TST Adds test for feature_names_in

---
 sklearn/base.py            |  2 +-
 sklearn/tests/test_base.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 95f8d124692e8..f4007239d9e26 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -411,7 +411,7 @@ def _check_feature_names(self, X, reset=True):
         if (len(fitted_feature_names) != len(feature_names_in) or
                 np.any(fitted_feature_names != feature_names_in)):
             warnings.warn("The column names should match those that were "
-                          "passed during fit(), in the same order. Got "
+                          "passed during fit in the same order. Got "
                           f"({feature_names_in}) expected "
                           f"({fitted_feature_names}). Starting version 0.26, "
                           "an error will be raised", FutureWarning)
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index db5c88051346a..3b994b623e852 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -537,3 +537,33 @@ def test_repr_html_wraps():
     with config_context(display='diagram'):
         output = tree._repr_html_()
         assert "<style>" in output
+
+
+def test_feature_names_in():
+    # Simple checks for feature_names_in
+    pd = pytest.importorskip("pandas")
+    iris = datasets.load_iris()
+    df = pd.DataFrame(iris.data, columns=iris.feature_names)
+    y = iris.target
+
+    class NoOpTransformer(TransformerMixin, BaseEstimator):
+        def fit(self, X, y):
+            self._validate_data(X)
+            return self
+
+        def transform(self, X):
+            self._validate_data(X, reset=False)
+            return X
+
+    trans = NoOpTransformer().fit(df, y)
+    assert_array_equal(trans.feature_names_in_, df.columns)
+
+    msg = "The column names should match those that were passed"
+    df_bad = pd.DataFrame(iris.data, columns=iris.feature_names[::-1])
+    with pytest.warns(FutureWarning, match=msg):
+        trans.transform(df_bad)
+
+    # does not warn when transforming on numpy array
+    with pytest.warns(None) as record:
+        trans.transform(iris.data)
+    assert not record

From 53270fee19b91ed3552afe704370b813e0a2d10f Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 4 Sep 2020 14:48:42 -0400
Subject: [PATCH 09/52] ENH Adds tests for coverage

---
 sklearn/tests/test_common.py                 |  4 ++--
 sklearn/utils/estimator_checks.py            |  9 +--------
 sklearn/utils/tests/test_estimator_checks.py | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 4b7fa058bb023..b30bd9dab777e 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -39,7 +39,7 @@
     check_class_weight_balanced_linear_classifier,
     parametrize_with_checks,
     check_dataframe_column_names_consistency,
-    check_dataarray_column_name_consistency)
+    check_dataarray_column_names_consistency)
 
 
 def test_all_estimator_no_base_class():
@@ -322,4 +322,4 @@ def test_pandas_column_name_consistency(Estimator):
 def test_xarray_column_name_consistency(Estimator):
     estimator = _construct_instance(Estimator)
     _set_checking_parameters(estimator)
-    check_dataarray_column_name_consistency(Estimator.__name__, estimator)
+    check_dataarray_column_names_consistency(Estimator.__name__, estimator)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 5c38aabf0579f..06db263b3bb15 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3102,7 +3102,7 @@ def _construct_dataframe(X, columns):
                                    "dataframe")
 
 
-def check_dataarray_column_name_consistency(name, estimator_orig):
+def check_dataarray_column_names_consistency(name, estimator_orig):
     try:
         import xarray as xr
     except ImportError:
@@ -3120,13 +3120,6 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
                                    array_name):
     estimator = clone(estimator_orig)
 
-    tags = estimator._get_tags()
-    if "2darray" not in tags["X_types"]:
-        warnings.warn("Can't test estimator {} which requires input "
-                      " of type {}".format(name, tags["X_types"]),
-                      SkipTestWarning)
-        return
-
     X_orig, _ = make_regression(random_state=0, n_features=10)
     X_orig = _enforce_estimator_tags_x(estimator, X_orig)
     X_orig = _pairwise_estimator_convert_X(X_orig, estimator)
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index fc42329c94933..44d97ee268883 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -23,6 +23,10 @@
 from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
 from sklearn.utils.estimator_checks import check_classifier_data_not_an_array
 from sklearn.utils.estimator_checks import check_regressor_data_not_an_array
+from sklearn.utils.estimator_checks import \
+    check_dataframe_column_names_consistency
+from sklearn.utils.estimator_checks import \
+    check_dataarray_column_names_consistency
 from sklearn.utils.validation import check_is_fitted
 from sklearn.utils.estimator_checks import check_outlier_corruption
 from sklearn.utils.fixes import np_version, parse_version
@@ -622,6 +626,20 @@ def test_check_regressor_data_not_an_array():
                         EstimatorInconsistentForPandas())
 
 
+def test_check_dataframe_column_names_consistency():
+    assert_raises_regex(ValueError,
+                        "Estimator does not have a feature_names_in_",
+                        check_dataframe_column_names_consistency,
+                        'estimator_name', BaseBadClassifier())
+
+
+def test_check_dataarray_column_name_consistency():
+    assert_raises_regex(ValueError,
+                        "Estimator does not have a feature_names_in_",
+                        check_dataarray_column_names_consistency,
+                        'estimator_name', BaseBadClassifier())
+
+
 def run_tests_without_pytest():
     """Runs the tests in this file without using pytest.
     """

From 4f7c5e25fef34046bd974f150e10743df384da85 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 4 Sep 2020 15:39:48 -0400
Subject: [PATCH 10/52] TST Fixes warning message

---
 sklearn/base.py                   | 8 ++++----
 sklearn/utils/estimator_checks.py | 5 ++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index f4007239d9e26..05736bb9999de 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -411,10 +411,10 @@ def _check_feature_names(self, X, reset=True):
         if (len(fitted_feature_names) != len(feature_names_in) or
                 np.any(fitted_feature_names != feature_names_in)):
             warnings.warn("The column names should match those that were "
-                          "passed during fit in the same order. Got "
-                          f"({feature_names_in}) expected "
-                          f"({fitted_feature_names}). Starting version 0.26, "
-                          "an error will be raised", FutureWarning)
+                          f"passed during fit. Got ({feature_names_in}) "
+                          f"expected ({fitted_feature_names}). Starting "
+                          "version 0.26, an error will be raised",
+                          FutureWarning)
 
     def _validate_data(self, X, y=None, reset=True,
                        validate_separately=False, **check_params):
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 06db263b3bb15..a0565781cbbeb 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3150,9 +3150,8 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
     X_bad = construct_X(X, bad_names)
 
     expected_msg = ("The column names should match those that were passed "
-                    f"during fit(), in the same order. Got ({bad_names}) "
-                    f"expected ({names}). Starting version 0.26, an error "
-                    "will be raised")
+                    f"during fit. Got ({bad_names}) expected ({names}). "
+                    "Starting version 0.26, an error will be raised")
     for method in ("predict", "transform", "decision_function",
                    "predict_proba"):
         func = getattr(estimator, method, None)

From 37117ebf51b4b57fd013e8d27289812a533e34e4 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 4 Sep 2020 16:20:26 -0400
Subject: [PATCH 11/52] ENH Adds xarray

---
 build_tools/azure/install.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index 94e696c3b75c1..608402f12471f 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -37,6 +37,7 @@ if [[ "$DISTRIB" == "conda" ]]; then
     TO_INSTALL="$TO_INSTALL $(get_dep pyamg $PYAMG_VERSION)"
     TO_INSTALL="$TO_INSTALL $(get_dep Pillow $PILLOW_VERSION)"
     TO_INSTALL="$TO_INSTALL $(get_dep matplotlib $MATPLOTLIB_VERSION)"
+    TO_INSTALL="$TO_INSTALL xarray"
 
     if [[ "$UNAMESTR" == "Darwin" ]]; then
         if [[ "$SKLEARN_TEST_NO_OPENMP" != "true" ]]; then

From 5ed789bb76b879aa447768b625aaaaa850587188 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 4 Sep 2020 18:15:30 -0400
Subject: [PATCH 12/52] BLD Force build on ci


From 1b81d1257a018c394129f69686f4e280f4035322 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 1 Oct 2020 10:29:13 -0400
Subject: [PATCH 13/52] CLN Changes naming

---
 sklearn/base.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 0c80f2f0dbd81..1ceb7c98d5e7c 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -389,7 +389,7 @@ def _check_feature_names(self, X, reset=True):
             The input samples.
         reset : bool, default=True
             Whether to reset the `feature_names_in_` attribute.
-            If False, the Input will be checked for consistency with data
+            If False, the input will be checked for consistency with data
             provided when reset was last True.
         """
 
@@ -402,17 +402,17 @@ def _check_feature_names(self, X, reset=True):
             # no feature names to check
             return
 
-        feature_names_in = _get_feature_names(X)
-        if feature_names_in is None:
+        new_feature_names_in = _get_feature_names(X)
+        if new_feature_names_in is None:
             # X does not have feature names but estimator was fitted with
             # data with feature names
             return
 
         # valid the `feature_names_in_` attribute
-        if (len(fitted_feature_names) != len(feature_names_in) or
-                np.any(fitted_feature_names != feature_names_in)):
+        if (len(fitted_feature_names) != len(new_feature_names_in) or
+                np.any(fitted_feature_names != new_feature_names_in)):
             warnings.warn("The column names should match those that were "
-                          f"passed during fit. Got ({feature_names_in}) "
+                          f"passed during fit. Got ({new_feature_names_in}) "
                           f"expected ({fitted_feature_names}). Starting "
                           "version 0.26, an error will be raised",
                           FutureWarning)

From 3522f37bbd046d134ddcc3e418df2043a5edd948 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 1 Oct 2020 13:14:06 -0400
Subject: [PATCH 14/52] TST Adds more testing

---
 sklearn/tests/test_common.py      | 26 +++++++++++++++-----------
 sklearn/utils/estimator_checks.py | 11 +++++++++++
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 0868b9c0f2d2b..0db977e0050ca 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -274,7 +274,10 @@ def test_strict_mode_parametrize_with_checks(estimator, check):
     check(estimator)
 
 
-column_name_modules_to_ignore = {
+# TODO: When more modules get added, we can remove it from this list to make
+# sure it gets tested. After we finish each module we can move the checks
+# into check_estimator
+COLUMN_NAME_MODULES_TO_IGNORE = {
     'calibration',
     'cluster',
     'compose',
@@ -287,6 +290,7 @@ def test_strict_mode_parametrize_with_checks(estimator, check):
     'feature_selection',
     'gaussian_process',
     'isotonic',
+    'impute',
     'kernel_approximation',
     'kernel_ridge',
     'linear_model',
@@ -307,21 +311,21 @@ def test_strict_mode_parametrize_with_checks(estimator, check):
 }
 
 column_name_estimators = [
-    est for _, est in all_estimators()
-    if est.__module__.split('.')[1] not in column_name_modules_to_ignore]
+    est for est in _tested_estimators()
+    if est.__module__.split('.')[1] not in COLUMN_NAME_MODULES_TO_IGNORE]
 
 
-@pytest.mark.parametrize('Estimator', column_name_estimators,
+@pytest.mark.parametrize('estimator', column_name_estimators,
                          ids=_get_check_estimator_ids)
-def test_pandas_column_name_consistency(Estimator):
-    estimator = _construct_instance(Estimator)
+def test_pandas_column_name_consistency(estimator):
     _set_checking_parameters(estimator)
-    check_dataframe_column_names_consistency(Estimator.__name__, estimator)
+    check_dataframe_column_names_consistency(type(estimator).__name__,
+                                             estimator)
 
 
-@pytest.mark.parametrize('Estimator', column_name_estimators,
+@pytest.mark.parametrize('estimator', column_name_estimators,
                          ids=_get_check_estimator_ids)
-def test_xarray_column_name_consistency(Estimator):
-    estimator = _construct_instance(Estimator)
+def test_xarray_column_name_consistency(estimator):
     _set_checking_parameters(estimator)
-    check_dataarray_column_names_consistency(Estimator.__name__, estimator)
+    check_dataarray_column_names_consistency(type(estimator).__name__,
+                                             estimator)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index d4ebd4f8fb07d..dac9a1b8f1a20 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3151,6 +3151,9 @@ def _construct_xarray(X, columns):
 def _check_column_name_consistency(name, estimator_orig, construct_X,
                                    array_name):
     estimator = clone(estimator_orig)
+    tags = estimator._get_tags()
+    if "2darray" not in tags["X_types"] or tags["no_validation"]:
+        return
 
     X_orig, _ = make_regression(random_state=0, n_features=10)
     X_orig = _enforce_estimator_tags_x(estimator, X_orig)
@@ -3192,6 +3195,14 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
         # TODO In 0.26, this will be an error.
         assert_warns_message(FutureWarning, expected_msg, func, X_bad)
 
+    # partial_fit checks on second call
+    if not hasattr(estimator, "partial_fit"):
+        return  #
+
+    estimator = clone(estimator_orig)
+    estimator.partial_fit(X, y)
+    assert_warns_message(FutureWarning, expected_msg, func, X_bad, y)
+
 
 # set of checks that are completely strict, i.e. they have no non-strict part
 _FULLY_STRICT_CHECKS = set([

From a81e2a3bd1bc672422d093ff2b8551710d6e09f5 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 1 Oct 2020 14:01:03 -0400
Subject: [PATCH 15/52] WIP Address comments

---
 sklearn/base.py                                    | 14 +++++++++++---
 sklearn/dummy.py                                   |  8 ++------
 sklearn/impute/_base.py                            |  2 +-
 sklearn/impute/_iterative.py                       | 11 +++--------
 sklearn/impute/_knn.py                             |  9 ++++++---
 sklearn/tests/test_array_out.py                    |  2 +-
 sklearn/tests/test_base.py                         | 14 ++++++++++----
 sklearn/utils/{_array_out.py => _feature_names.py} |  0
 sklearn/utils/estimator_checks.py                  |  5 +----
 9 files changed, 35 insertions(+), 30 deletions(-)
 rename sklearn/utils/{_array_out.py => _feature_names.py} (100%)

diff --git a/sklearn/base.py b/sklearn/base.py
index 1ceb7c98d5e7c..1b39173ee9a20 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -19,7 +19,7 @@
 from .utils.validation import check_array
 from .utils._estimator_html_repr import estimator_html_repr
 from .utils.validation import _deprecate_positional_args
-from .utils._array_out import _get_feature_names
+from .utils._feature_names import _get_feature_names
 
 _DEFAULT_TAGS = {
     'non_deterministic': False,
@@ -389,8 +389,12 @@ def _check_feature_names(self, X, reset=True):
             The input samples.
         reset : bool, default=True
             Whether to reset the `feature_names_in_` attribute.
-            If False, the input will be checked for consistency with data
-            provided when reset was last True.
+            If False, the input will be checked for consistency with
+            feature names of data provided when reset was last True.
+            .. note::
+               It is recommended to call reset=True in `fit` and in the first
+               call to `partial_fit`. All other methods that validates `X`
+               should set `reset=False`.
         """
 
         if reset:
@@ -433,6 +437,10 @@ def _validate_data(self, X, y=None, reset=True,
             Whether to reset the `n_features_in_` attribute.
             If False, the input will be checked for consistency with data
             provided when reset was last True.
+            .. note::
+               It is recommended to call reset=True in `fit` and in the first
+               call to `partial_fit`. All other methods that validates `X`
+               should set `reset=False`.
         validate_separately : False or tuple of dicts, default=False
             Only used if y is not None.
             If False, call validate_X_y(). Else, it must be a tuple of kwargs
diff --git a/sklearn/dummy.py b/sklearn/dummy.py
index 9d656ccbc6282..4773ccbd1f92f 100644
--- a/sklearn/dummy.py
+++ b/sklearn/dummy.py
@@ -147,9 +147,7 @@ def fit(self, X, y, sample_weight=None):
 
         self.n_outputs_ = y.shape[1]
 
-        # No input validation is done for X
-        self.n_features_in_ = None
-        self.feature_names_in_ = None
+        self.n_features_in_ = None  # No input validation is done for X
 
         check_consistent_length(X, y)
 
@@ -474,9 +472,7 @@ def fit(self, X, y, sample_weight=None):
                              % (self.strategy, allowed_strategies))
 
         y = check_array(y, ensure_2d=False)
-        # No input validation is done for X
-        self.n_features_in_ = None
-        self.feature_names_in_ = None
+        self.n_features_in_ = None  # No input validation is done for X
         if len(y) == 0:
             raise ValueError("y must not be empty.")
 
diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py
index 9830bed7273e1..20b22224d53c7 100644
--- a/sklearn/impute/_base.py
+++ b/sklearn/impute/_base.py
@@ -793,7 +793,7 @@ def transform(self, X):
         # Need not validate X again as it would have already been validated
         # in the Imputer calling MissingIndicator
         if not self._precomputed:
-            X = self._validate_input(X, in_fit=False)
+            X = self._validate_input(X, in_fit=True)
         else:
             if not (hasattr(X, 'dtype') and X.dtype.kind == 'b'):
                 raise ValueError("precomputed is True but the input data is "
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py
index 6afa8ec6db208..325a484244143 100644
--- a/sklearn/impute/_iterative.py
+++ b/sklearn/impute/_iterative.py
@@ -468,7 +468,7 @@ def _get_abs_corr_mat(self, X_filled, tolerance=1e-6):
         abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False)
         return abs_corr_mat
 
-    def _initial_imputation(self, X, in_fit=True):
+    def _initial_imputation(self, X):
         """Perform initial imputation for input X.
 
         Parameters
@@ -477,9 +477,6 @@ def _initial_imputation(self, X, in_fit=True):
             Input data, where "n_samples" is the number of samples and
             "n_features" is the number of features.
 
-        in_fit : bool, default=True
-            Whether the imputation is done in fit.
-
         Returns
         -------
         Xt : ndarray, shape (n_samples, n_features)
@@ -504,8 +501,7 @@ def _initial_imputation(self, X, in_fit=True):
             force_all_finite = True
 
         X = self._validate_data(X, dtype=FLOAT_DTYPES, order="F",
-                                force_all_finite=force_all_finite,
-                                reset=in_fit)
+                                force_all_finite=force_all_finite)
         _check_inputs_dtype(X, self.missing_values)
 
         X_missing_mask = _get_mask(X, self.missing_values)
@@ -699,8 +695,7 @@ def transform(self, X):
         """
         check_is_fitted(self)
 
-        X, Xt, mask_missing_values, complete_mask = \
-            self._initial_imputation(X, in_fit=False)
+        X, Xt, mask_missing_values, complete_mask = self._initial_imputation(X)
 
         X_indicator = super()._transform_indicator(complete_mask)
 
diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py
index c4b407fdd66e7..b7e9f66a321b7 100644
--- a/sklearn/impute/_knn.py
+++ b/sklearn/impute/_knn.py
@@ -212,9 +212,12 @@ def transform(self, X):
             force_all_finite = True
         else:
             force_all_finite = "allow-nan"
-        X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES,
-                                force_all_finite=force_all_finite,
-                                copy=self.copy, reset=False)
+        X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
+                        force_all_finite=force_all_finite, copy=self.copy)
+
+        if X.shape[1] != self._fit_X.shape[1]:
+            raise ValueError("Incompatible dimension between the fitted "
+                             "dataset and the one to be transformed")
 
         mask = _get_mask(X, self.missing_values)
         mask_fit_X = self._mask_fit_X
diff --git a/sklearn/tests/test_array_out.py b/sklearn/tests/test_array_out.py
index 76a6a0a03a539..d1be1f6225b88 100644
--- a/sklearn/tests/test_array_out.py
+++ b/sklearn/tests/test_array_out.py
@@ -3,7 +3,7 @@
 import numpy as np
 from numpy.testing import assert_array_equal
 
-from sklearn.utils._array_out import _get_feature_names
+from sklearn.utils._feature_names import _get_feature_names
 
 
 def _construct_array(array_type, column_names):
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 3b994b623e852..1ba8dc87a20ed 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -543,8 +543,8 @@ def test_feature_names_in():
     # Simple checks for feature_names_in
     pd = pytest.importorskip("pandas")
     iris = datasets.load_iris()
-    df = pd.DataFrame(iris.data, columns=iris.feature_names)
-    y = iris.target
+    X_np, y = iris.data, iris.target
+    df = pd.DataFrame(X_np, columns=iris.feature_names)
 
     class NoOpTransformer(TransformerMixin, BaseEstimator):
         def fit(self, X, y):
@@ -559,11 +559,17 @@ def transform(self, X):
     assert_array_equal(trans.feature_names_in_, df.columns)
 
     msg = "The column names should match those that were passed"
-    df_bad = pd.DataFrame(iris.data, columns=iris.feature_names[::-1])
+    df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1])
     with pytest.warns(FutureWarning, match=msg):
         trans.transform(df_bad)
 
     # does not warn when transforming on numpy array
     with pytest.warns(None) as record:
-        trans.transform(iris.data)
+        trans.transform(X_np)
+    assert not record
+
+    # fitted on numpy array and transformed on pandas array does not warn
+    trans = NoOpTransformer().fit(X_np, y)
+    with pytest.warns(None) as record:
+        trans.transform(df)
     assert not record
diff --git a/sklearn/utils/_array_out.py b/sklearn/utils/_feature_names.py
similarity index 100%
rename from sklearn/utils/_array_out.py
rename to sklearn/utils/_feature_names.py
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index dac9a1b8f1a20..294b8209f26c8 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3164,10 +3164,7 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
     X = construct_X(X_orig, names)
 
     rng = np.random.RandomState(0)
-    if is_regressor(estimator):
-        y = rng.normal(size=n_samples)
-    else:
-        y = rng.randint(low=0, high=2, size=n_samples)
+    y = rng.randint(low=0, high=2, size=n_samples)
     y = _enforce_estimator_tags_y(estimator, y)
 
     estimator.fit(X, y)

From 2c45b6590dd7113c35412efa77ff2a1211f31d61 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 1 Oct 2020 17:13:54 -0400
Subject: [PATCH 16/52] ENH Only update cross_decomposition

---
 sklearn/cross_decomposition/_pls.py | 6 +++---
 sklearn/tests/test_common.py        | 1 -
 sklearn/utils/estimator_checks.py   | 9 +++------
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py
index 9d8df42bf1a46..ada69e070a8f0 100644
--- a/sklearn/cross_decomposition/_pls.py
+++ b/sklearn/cross_decomposition/_pls.py
@@ -316,7 +316,7 @@ def transform(self, X, Y=None, copy=True):
         `x_scores` if `Y` is not given, `(x_scores, y_scores)` otherwise.
         """
         check_is_fitted(self)
-        X = check_array(X, copy=copy, dtype=FLOAT_DTYPES)
+        X = self._validate_data(X, copy=copy, dtype=FLOAT_DTYPES, reset=False)
         # Normalize
         X -= self.x_mean_
         X /= self.x_std_
@@ -378,7 +378,7 @@ def predict(self, X, copy=True):
         space.
         """
         check_is_fitted(self)
-        X = check_array(X, copy=copy, dtype=FLOAT_DTYPES)
+        X = self._validate_data(X, copy=copy, dtype=FLOAT_DTYPES, reset=False)
         # Normalize
         X -= self.x_mean_
         X /= self.x_std_
@@ -925,7 +925,7 @@ def transform(self, X, Y=None):
             `(X_transformed, Y_transformed)` otherwise.
         """
         check_is_fitted(self)
-        X = check_array(X, dtype=np.float64)
+        X = self._validate_data(X, dtype=np.float64, reset=False)
         Xr = (X - self.x_mean_) / self.x_std_
         x_scores = np.dot(Xr, self.x_weights_)
         if Y is not None:
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 0db977e0050ca..ed41cabb2b97f 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -282,7 +282,6 @@ def test_strict_mode_parametrize_with_checks(estimator, check):
     'cluster',
     'compose',
     'covariance',
-    'cross_decomposition',
     'decomposition',
     'discriminant_analysis',
     'ensemble',
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 294b8209f26c8..2d3a94f0e3110 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3173,10 +3173,6 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
         raise ValueError("Estimator does not have a feature_names_in_ "
                          f"attribute after fitting with a {array_name}")
 
-    if estimator.feature_names_in_ is None:
-        # no names to check
-        return
-
     assert_array_equal(estimator.feature_names_in_, names)
     bad_names = names[::-1]
     X_bad = construct_X(X, bad_names)
@@ -3194,11 +3190,12 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
 
     # partial_fit checks on second call
     if not hasattr(estimator, "partial_fit"):
-        return  #
+        return  # partial_fit is not defined
 
     estimator = clone(estimator_orig)
     estimator.partial_fit(X, y)
-    assert_warns_message(FutureWarning, expected_msg, func, X_bad, y)
+    assert_warns_message(FutureWarning, expected_msg, estimator.partial_fit,
+                         X_bad, y)
 
 
 # set of checks that are completely strict, i.e. they have no non-strict part

From f43356b6d21beba4ee3067534fff93d4da59a693 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 1 Oct 2020 18:00:17 -0400
Subject: [PATCH 17/52] TST Adds test for partial_fit

---
 sklearn/utils/estimator_checks.py            |  2 +-
 sklearn/utils/tests/test_estimator_checks.py | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 2d3a94f0e3110..c22d1d7ec2718 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3155,7 +3155,7 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
     if "2darray" not in tags["X_types"] or tags["no_validation"]:
         return
 
-    X_orig, _ = make_regression(random_state=0, n_features=10)
+    X_orig, _ = make_regression(random_state=0, n_features=5)
     X_orig = _enforce_estimator_tags_x(estimator, X_orig)
     X_orig = _pairwise_estimator_convert_X(X_orig, estimator)
 
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index fe3da3d0b6de6..9d3784b8e81e8 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -371,6 +371,18 @@ def _more_tags(self):
         return {"poor_score": True}
 
 
+class PartialFitChecksName(BaseEstimator):
+    def fit(self, X, y):
+        self._validate_data(X, y)
+        return self
+
+    def partial_fit(self, X, y):
+        reset = not hasattr(self, '_fitted')
+        self._validate_data(X, y, reset=reset)
+        self._fitted = True
+        return self
+
+
 def test_not_an_array_array_function():
     if np_version < parse_version('1.17'):
         raise SkipTest("array_function protocol not supported in numpy <1.17")
@@ -619,6 +631,8 @@ def test_check_dataframe_column_names_consistency():
                         "Estimator does not have a feature_names_in_",
                         check_dataframe_column_names_consistency,
                         'estimator_name', BaseBadClassifier())
+    check_dataframe_column_names_consistency('estimator_name',
+                                             PartialFitChecksName())
 
 
 def test_check_dataarray_column_name_consistency():
@@ -626,6 +640,8 @@ def test_check_dataarray_column_name_consistency():
                         "Estimator does not have a feature_names_in_",
                         check_dataarray_column_names_consistency,
                         'estimator_name', BaseBadClassifier())
+    check_dataarray_column_names_consistency('estimator_name',
+                                             PartialFitChecksName())
 
 
 def run_tests_without_pytest():

From 46f332da3a38dd2717a4b3c858ee3127d02cd1e6 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 6 Oct 2020 12:55:38 -0400
Subject: [PATCH 18/52] TST Fixes

---
 sklearn/impute/_knn.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py
index b7e9f66a321b7..ace683bff931c 100644
--- a/sklearn/impute/_knn.py
+++ b/sklearn/impute/_knn.py
@@ -212,8 +212,9 @@ def transform(self, X):
             force_all_finite = True
         else:
             force_all_finite = "allow-nan"
-        X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
-                        force_all_finite=force_all_finite, copy=self.copy)
+        X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES,
+                                force_all_finite=force_all_finite,
+                                copy=self.copy)
 
         if X.shape[1] != self._fit_X.shape[1]:
             raise ValueError("Incompatible dimension between the fitted "

From c93bd9d26424a1bdf6af2e1418bd73fd8a7f724f Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 6 Oct 2020 13:09:59 -0400
Subject: [PATCH 19/52] DOC More cleanup

---
 sklearn/base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 1b39173ee9a20..98db4b4a762a5 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -387,12 +387,13 @@ def _check_feature_names(self, X, reset=True):
         ----------
         X : {dataframe-like} of shape (n_samples, n_features)
             The input samples.
+
         reset : bool, default=True
             Whether to reset the `feature_names_in_` attribute.
             If False, the input will be checked for consistency with
             feature names of data provided when reset was last True.
             .. note::
-               It is recommended to call reset=True in `fit` and in the first
+               It is recommended to call `reset=True` in `fit` and in the first
                call to `partial_fit`. All other methods that validates `X`
                should set `reset=False`.
         """
@@ -438,7 +439,7 @@ def _validate_data(self, X, y=None, reset=True,
             If False, the input will be checked for consistency with data
             provided when reset was last True.
             .. note::
-               It is recommended to call reset=True in `fit` and in the first
+               It is recommended to call `reset=True` in `fit` and in the first
                call to `partial_fit`. All other methods that validates `X`
                should set `reset=False`.
         validate_separately : False or tuple of dicts, default=False

From 5039f5a240047d42ac9608099c01602db1617bb1 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 6 Oct 2020 13:30:02 -0400
Subject: [PATCH 20/52] DOC Adds docstring

---
 sklearn/utils/_feature_names.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py
index 2ea523db74471..51c7b21d93e80 100644
--- a/sklearn/utils/_feature_names.py
+++ b/sklearn/utils/_feature_names.py
@@ -4,10 +4,21 @@
 def _get_feature_names(X):
     """Get feature names from X.
 
-    Supports:
-       - pandas DataFrame
-       - xarray DataArray
-       - Return None for unrecognized array containers
+    Parameters
+    ----------
+    X : {dataframe, dataarray} of shape (n_samples, n_features)
+        Array container to extract feature names.
+
+        - pandas DataFrame : The columns will be considered to be feature
+          names.
+        - xarray DataArray : The coords of the second dimension will be
+          considered to be feature names.
+        - All other array containers will return `None`.
+
+    Returns
+    -------
+    names: array-like of shape (n_features,) or None
+        Column names of `X`. Unrecognized array containers will return `None`.
     """
     if hasattr(X, "columns"):  # pandas
         return np.array(X.columns, dtype=object)

From ee03ab7439ba5ca035ee614e1626dfa6baf75357 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 6 Oct 2020 13:41:31 -0400
Subject: [PATCH 21/52] TST Adds smoke test

---
 sklearn/tests/test_common.py      |  2 +-
 sklearn/utils/estimator_checks.py | 17 +++++++++++------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index ed41cabb2b97f..1af074ba07581 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -276,7 +276,7 @@ def test_strict_mode_parametrize_with_checks(estimator, check):
 
 # TODO: When more modules get added, we can remove it from this list to make
 # sure it gets tested. After we finish each module we can move the checks
-# into check_estimator
+# into check_estimator.
 COLUMN_NAME_MODULES_TO_IGNORE = {
     'calibration',
     'cluster',
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index d6d26439cd9d5..e826a33516371 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3142,6 +3142,8 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
                                    array_name):
     estimator = clone(estimator_orig)
     tags = estimator._get_tags()
+
+    # should be
     if "2darray" not in tags["X_types"] or tags["no_validation"]:
         return
 
@@ -3163,6 +3165,13 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
         raise ValueError("Estimator does not have a feature_names_in_ "
                          f"attribute after fitting with a {array_name}")
 
+    check_funcs = [
+        getattr(estimator, func) for func in
+        ("predict", "transform", "decision_function", "predict_proba")
+        if hasattr(estimator, func)]
+    for func in check_funcs:
+        func(X)  # works
+
     assert_array_equal(estimator.feature_names_in_, names)
     bad_names = names[::-1]
     X_bad = construct_X(X, bad_names)
@@ -3170,13 +3179,9 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
     expected_msg = ("The column names should match those that were passed "
                     f"during fit. Got ({bad_names}) expected ({names}). "
                     "Starting version 0.26, an error will be raised")
-    for method in ("predict", "transform", "decision_function",
-                   "predict_proba"):
-        func = getattr(estimator, method, None)
-        if func is None:
-            continue
+    for method in check_funcs:
         # TODO In 0.26, this will be an error.
-        assert_warns_message(FutureWarning, expected_msg, func, X_bad)
+        assert_warns_message(FutureWarning, expected_msg, method, X_bad)
 
     # partial_fit checks on second call
     if not hasattr(estimator, "partial_fit"):

From be480b61eae2c9a16ef116a9dba300e2ee10d84a Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 6 Oct 2020 16:55:52 -0400
Subject: [PATCH 22/52] ENH Restricts column names to string

---
 ...est_array_out.py => test_feature_names.py} | 18 ++++++++++++---
 sklearn/utils/_feature_names.py               | 23 +++++++++++++++----
 2 files changed, 34 insertions(+), 7 deletions(-)
 rename sklearn/tests/{test_array_out.py => test_feature_names.py} (61%)

diff --git a/sklearn/tests/test_array_out.py b/sklearn/tests/test_feature_names.py
similarity index 61%
rename from sklearn/tests/test_array_out.py
rename to sklearn/tests/test_feature_names.py
index d1be1f6225b88..f165dede90835 100644
--- a/sklearn/tests/test_array_out.py
+++ b/sklearn/tests/test_feature_names.py
@@ -12,16 +12,28 @@ def _construct_array(array_type, column_names):
     if array_type == "dataframe":
         pd = pytest.importorskip("pandas")
         return pd.DataFrame(X, columns=column_names)
-    elif array_type == "dataarray":
+    else:
         xr = pytest.importorskip("xarray")
         return xr.DataArray(X, dims=('index', 'columns'),
                             coords={'columns': column_names})
 
 
 @pytest.mark.parametrize("array_type", ["dataframe", "dataarray"])
-def test_pandas_get_feature_names(array_type):
+def test_get_feature_names(array_type):
     column_names = [f'col_{i}' for i in range(3)]
     X = _construct_array(array_type, column_names)
-    names = _get_feature_names(X)
 
+    names = _get_feature_names(X)
     assert_array_equal(names, column_names)
+
+
+@pytest.mark.parametrize("array_type", ["dataframe", "dataarray"])
+@pytest.mark.parametrize("column_names", [
+    np.array(["one", 2, "tree"], dtype=object),
+    np.array([1, 2, 3], dtype=object)
+])
+def test_get_feature_names_non_str(array_type, column_names):
+    X = _construct_array(array_type, column_names)
+    msg = "X contains non-string feature names"
+    with pytest.raises(ValueError, match=msg):
+        _get_feature_names(X)
diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py
index 51c7b21d93e80..668abb3b59ff9 100644
--- a/sklearn/utils/_feature_names.py
+++ b/sklearn/utils/_feature_names.py
@@ -17,11 +17,26 @@ def _get_feature_names(X):
 
     Returns
     -------
-    names: array-like of shape (n_features,) or None
+    names: ndarray of shape (n_features,) or None
         Column names of `X`. Unrecognized array containers will return `None`.
+
+    Raises
+    ------
+    ValueError
+        If column names consist of a non-string data type.
     """
-    if hasattr(X, "columns"):  # pandas
-        return np.array(X.columns, dtype=object)
+    if hasattr(X, "columns"):
+        # pandas
+        out = np.array(X.columns, dtype=object)
     elif hasattr(X, "dims") and isinstance(X.dims, tuple) and len(X.dims) == 2:
         # xarray DataArray
-        return np.array(X.coords[X.dims[1]], dtype=object)
+        out = np.array(X.coords[X.dims[1]], dtype=object)
+    else:
+        # unrecognized array container
+        return None
+
+    # check out for strings
+    if any(not isinstance(item, str) for item in out):
+        raise ValueError("X contains non-string feature names")
+
+    return out

From 670996a499a483e86bdc8839ee1f830e1df7e062 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Wed, 7 Oct 2020 14:55:59 -0400
Subject: [PATCH 23/52] CLN Removes reset

---
 sklearn/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 3a7c8f6a965d0..7f2b82b435e36 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -472,7 +472,7 @@ def _validate_data(self, X, y='no_validation', reset=True,
         self._check_feature_names(X, reset=reset)
 
         if y is None:
-            if reset and self._get_tags()['requires_y']:
+            if self._get_tags()['requires_y']:
                 raise ValueError(
                     f"This {self.__class__.__name__} estimator "
                     f"requires y to be passed, but the target y is None."

From a4af9c05df2f2e9e08491004d4439a973b3c5a4e Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Wed, 7 Oct 2020 14:57:02 -0400
Subject: [PATCH 24/52] CLN Reduce diffs

---
 sklearn/tests/test_common.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 174a7d60af1e2..9e54fe0860a76 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -277,7 +277,14 @@ def test_strict_mode_parametrize_with_checks(estimator, check):
 
 # TODO: When more modules get added, we can remove it from this list to make
 # sure it gets tested. After we finish each module we can move the checks
-# into check_estimator.
+# into sklearn.utils.estimator_checks.check_n_features_in.
+#
+# check_estimators_partial_fit_n_features can either be removed or updated
+# with the two more assertions:
+# 1. `n_features_in_` is set during the first call to `partial_fit`.
+# 2. More strict when it comes to the error message.
+#
+# check_classifiers_train would need to be updated with the error message
 N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = {
     'calibration',
     'cluster',
@@ -315,6 +322,7 @@ def test_strict_mode_parametrize_with_checks(estimator, check):
     N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE
 ]
 
+
 @pytest.mark.parametrize("estimator", N_FEATURES_IN_AFTER_FIT_ESTIMATORS,
                          ids=_get_check_estimator_ids)
 def test_check_n_features_in_after_fitting(estimator):

From f853336b9e446d3a7b800e84eae0e75eaf218a2e Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 28 Jun 2021 14:26:39 -0400
Subject: [PATCH 25/52] STY Runs black

---
 sklearn/tests/test_common.py        | 80 +++++++++++++++--------------
 sklearn/tests/test_feature_names.py |  7 +--
 sklearn/utils/estimator_checks.py   | 53 ++++++++++---------
 3 files changed, 75 insertions(+), 65 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index c1742dea64e7d..209c2d50fe5b7 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -315,54 +315,56 @@ def test_check_n_features_in_after_fitting(estimator):
 # sure it gets tested. After we finish each module we can move the checks
 # into check_estimator.
 COLUMN_NAME_MODULES_TO_IGNORE = {
-    'calibration',
-    'cluster',
-    'compose',
-    'covariance',
-    'decomposition',
-    'discriminant_analysis',
-    'ensemble',
-    'feature_extraction',
-    'feature_selection',
-    'gaussian_process',
-    'isotonic',
-    'impute',
-    'kernel_approximation',
-    'kernel_ridge',
-    'linear_model',
-    'manifold',
-    'mixture',
-    'model_selection',
-    'multiclass',
-    'multioutput',
-    'naive_bayes',
-    'neighbors',
-    'neural_network',
-    'pipeline',
-    'preprocessing',
-    'random_projection',
-    'semi_supervised',
-    'svm',
-    'tree',
+    "calibration",
+    "cluster",
+    "compose",
+    "covariance",
+    "decomposition",
+    "discriminant_analysis",
+    "ensemble",
+    "feature_extraction",
+    "feature_selection",
+    "gaussian_process",
+    "isotonic",
+    "impute",
+    "kernel_approximation",
+    "kernel_ridge",
+    "linear_model",
+    "manifold",
+    "mixture",
+    "model_selection",
+    "multiclass",
+    "multioutput",
+    "naive_bayes",
+    "neighbors",
+    "neural_network",
+    "pipeline",
+    "preprocessing",
+    "random_projection",
+    "semi_supervised",
+    "svm",
+    "tree",
 }
 
 
 column_name_estimators = [
-    est for est in _tested_estimators()
-    if est.__module__.split('.')[1] not in COLUMN_NAME_MODULES_TO_IGNORE]
+    est
+    for est in _tested_estimators()
+    if est.__module__.split(".")[1] not in COLUMN_NAME_MODULES_TO_IGNORE
+]
 
 
-@pytest.mark.parametrize('estimator', column_name_estimators,
-                         ids=_get_check_estimator_ids)
+@pytest.mark.parametrize(
+    "estimator", column_name_estimators, ids=_get_check_estimator_ids
+)
 def test_pandas_column_name_consistency(estimator):
     _set_checking_parameters(estimator)
-    check_dataframe_column_names_consistency(type(estimator).__name__,
-                                             estimator)
+    check_dataframe_column_names_consistency(type(estimator).__name__, estimator)
 
 
-@pytest.mark.parametrize('estimator', column_name_estimators,
-                         ids=_get_check_estimator_ids)
+@pytest.mark.parametrize(
+    "estimator", column_name_estimators, ids=_get_check_estimator_ids
+)
 def test_xarray_column_name_consistency(estimator):
     _set_checking_parameters(estimator)
-    check_dataarray_column_names_consistency(type(estimator).__name__,
-                                             estimator)
+    check_dataarray_column_names_consistency(type(estimator).__name__, estimator)
diff --git a/sklearn/tests/test_feature_names.py b/sklearn/tests/test_feature_names.py
index 54f78e7f9523e..b0597bae063ef 100644
--- a/sklearn/tests/test_feature_names.py
+++ b/sklearn/tests/test_feature_names.py
@@ -14,13 +14,14 @@ def _construct_array(array_type, column_names):
         return pd.DataFrame(X, columns=column_names)
     else:
         xr = pytest.importorskip("xarray")
-        return xr.DataArray(X, dims=('index', 'columns'),
-                            coords={'columns': column_names})
+        return xr.DataArray(
+            X, dims=("index", "columns"), coords={"columns": column_names}
+        )
 
 
 @pytest.mark.parametrize("array_type", ["dataframe", "dataarray"])
 def test_get_feature_names(array_type):
-    column_names = [f'col_{i}' for i in range(3)]
+    column_names = [f"col_{i}" for i in range(3)]
     X = _construct_array(array_type, column_names)
 
     names = _get_feature_names(X)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index ed5fd979d205e..341757050a34a 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3428,31 +3428,35 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
     try:
         import pandas as pd
     except ImportError:
-        raise SkipTest("pandas is not installed: not checking "
-                       "column name consistency for pandas")
+        raise SkipTest(
+            "pandas is not installed: not checking "
+            "column name consistency for pandas"
+        )
 
     def _construct_dataframe(X, columns):
         return pd.DataFrame(X, columns=columns)
-    _check_column_name_consistency(name, estimator_orig, _construct_dataframe,
-                                   "dataframe")
+
+    _check_column_name_consistency(
+        name, estimator_orig, _construct_dataframe, "dataframe"
+    )
 
 
 def check_dataarray_column_names_consistency(name, estimator_orig):
     try:
         import xarray as xr
     except ImportError:
-        raise SkipTest("xarray is not installed: not checking "
-                       "column name consistency for xarray")
+        raise SkipTest(
+            "xarray is not installed: not checking "
+            "column name consistency for xarray"
+        )
 
     def _construct_xarray(X, columns):
-        return xr.DataArray(X, dims=('index', 'columns'),
-                            coords={'columns': columns})
-    _check_column_name_consistency(name, estimator_orig, _construct_xarray,
-                                   "xarray")
+        return xr.DataArray(X, dims=("index", "columns"), coords={"columns": columns})
 
+    _check_column_name_consistency(name, estimator_orig, _construct_xarray, "xarray")
 
-def _check_column_name_consistency(name, estimator_orig, construct_X,
-                                   array_name):
+
+def _check_column_name_consistency(name, estimator_orig, construct_X, array_name):
     estimator = clone(estimator_orig)
     tags = estimator._get_tags()
 
@@ -3475,13 +3479,16 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
     estimator.fit(X, y)
 
     if not hasattr(estimator, "feature_names_in_"):
-        raise ValueError("Estimator does not have a feature_names_in_ "
-                         f"attribute after fitting with a {array_name}")
+        raise ValueError(
+            "Estimator does not have a feature_names_in_ "
+            f"attribute after fitting with a {array_name}"
+        )
 
     check_funcs = [
-        getattr(estimator, func) for func in
-        ("predict", "transform", "decision_function", "predict_proba")
-        if hasattr(estimator, func)]
+        getattr(estimator, func)
+        for func in ("predict", "transform", "decision_function", "predict_proba")
+        if hasattr(estimator, func)
+    ]
     for func in check_funcs:
         func(X)  # works
 
@@ -3489,9 +3496,11 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
     bad_names = names[::-1]
     X_bad = construct_X(X, bad_names)
 
-    expected_msg = ("The column names should match those that were passed "
-                    f"during fit. Got ({bad_names}) expected ({names}). "
-                    "Starting version 0.26, an error will be raised")
+    expected_msg = (
+        "The column names should match those that were passed "
+        f"during fit. Got ({bad_names}) expected ({names}). "
+        "Starting version 0.26, an error will be raised"
+    )
     for method in check_funcs:
         # TODO In 0.26, this will be an error.
         assert_warns_message(FutureWarning, expected_msg, method, X_bad)
@@ -3502,6 +3511,4 @@ def _check_column_name_consistency(name, estimator_orig, construct_X,
 
     estimator = clone(estimator_orig)
     estimator.partial_fit(X, y)
-    assert_warns_message(FutureWarning, expected_msg, estimator.partial_fit,
-                         X_bad, y)
-
+    assert_warns_message(FutureWarning, expected_msg, estimator.partial_fit, X_bad, y)

From aeb220ee82553412f6c95a0556dbba7086883521 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 28 Jun 2021 15:00:22 -0400
Subject: [PATCH 26/52] CLN Slightly nicer tests

---
 sklearn/base.py                              |  4 +---
 sklearn/tests/test_common.py                 | 14 ++++--------
 sklearn/tests/test_feature_names.py          | 24 ++++++++------------
 sklearn/utils/_feature_names.py              | 12 +---------
 sklearn/utils/estimator_checks.py            | 18 +--------------
 sklearn/utils/tests/test_estimator_checks.py | 22 +++---------------
 6 files changed, 19 insertions(+), 75 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index a522e4958e1c0..003eece7b5a73 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -433,9 +433,7 @@ def _check_feature_names(self, X, reset=True):
         ):
             warnings.warn(
                 "The column names should match those that were "
-                f"passed during fit. Got ({new_feature_names_in}) "
-                f"expected ({fitted_feature_names}). Starting "
-                "version 0.26, an error will be raised",
+                "passed during fit. Starting version 1.2, an error will be raised",
                 FutureWarning,
             )
 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 209c2d50fe5b7..717e8d763902f 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -48,7 +48,6 @@
     check_class_weight_balanced_linear_classifier,
     parametrize_with_checks,
     check_dataframe_column_names_consistency,
-    check_dataarray_column_names_consistency,
     check_n_features_in_after_fitting,
 )
 
@@ -314,6 +313,9 @@ def test_check_n_features_in_after_fitting(estimator):
 # TODO: When more modules get added, we can remove it from this list to make
 # sure it gets tested. After we finish each module we can move the checks
 # into check_estimator.
+# NOTE: Metaestimators that delegates validation to the inner estimator is
+# is actually checking that the inner estimator checks for column name
+# consistency
 COLUMN_NAME_MODULES_TO_IGNORE = {
     "calibration",
     "cluster",
@@ -359,12 +361,4 @@ def test_check_n_features_in_after_fitting(estimator):
 )
 def test_pandas_column_name_consistency(estimator):
     _set_checking_parameters(estimator)
-    check_dataframe_column_names_consistency(type(estimator).__name__, estimator)
-
-
-@pytest.mark.parametrize(
-    "estimator", column_name_estimators, ids=_get_check_estimator_ids
-)
-def test_xarray_column_name_consistency(estimator):
-    _set_checking_parameters(estimator)
-    check_dataarray_column_names_consistency(type(estimator).__name__, estimator)
+    check_dataframe_column_names_consistency(estimator.__class__.__name__, estimator)
diff --git a/sklearn/tests/test_feature_names.py b/sklearn/tests/test_feature_names.py
index b0597bae063ef..a27691981b51f 100644
--- a/sklearn/tests/test_feature_names.py
+++ b/sklearn/tests/test_feature_names.py
@@ -6,23 +6,17 @@
 from sklearn.utils._feature_names import _get_feature_names
 
 
-def _construct_array(array_type, column_names):
+def test_get_feature_names_pandas():
+    pd = pytest.importorskip("pandas")
+    column_names = [f"col_{i}" for i in range(3)]
     X = np.array([[1, 2, 3], [4, 5, 6]], dtype=float)
+    X = pd.DataFrame(X, columns=column_names)
 
-    if array_type == "dataframe":
-        pd = pytest.importorskip("pandas")
-        return pd.DataFrame(X, columns=column_names)
-    else:
-        xr = pytest.importorskip("xarray")
-        return xr.DataArray(
-            X, dims=("index", "columns"), coords={"columns": column_names}
-        )
+    names = _get_feature_names(X)
+    assert_array_equal(names, column_names)
 
 
-@pytest.mark.parametrize("array_type", ["dataframe", "dataarray"])
-def test_get_feature_names(array_type):
-    column_names = [f"col_{i}" for i in range(3)]
-    X = _construct_array(array_type, column_names)
-
+def test_get_feature_names_numpy():
+    X = np.array([[1, 2, 3], [4, 5, 6]], dtype=float)
     names = _get_feature_names(X)
-    assert_array_equal(names, column_names)
+    assert names is None
diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py
index b5f2607f72ef0..9711ee5d69174 100644
--- a/sklearn/utils/_feature_names.py
+++ b/sklearn/utils/_feature_names.py
@@ -6,28 +6,18 @@ def _get_feature_names(X):
 
     Parameters
     ----------
-    X : {dataframe, dataarray} of shape (n_samples, n_features)
+    X : {dataframe} of shape (n_samples, n_features)
         Array container to extract feature names.
 
         - pandas DataFrame : The columns will be considered to be feature
           names.
-        - xarray DataArray : The coords of the second dimension will be
-          considered to be feature names.
         - All other array containers will return `None`.
 
     Returns
     -------
     names: ndarray of shape (n_features,) or None
         Column names of `X`. Unrecognized array containers will return `None`.
-
-    Raises
-    ------
-    ValueError
-        If column names consist of a non-string data type.
     """
     if hasattr(X, "columns"):
         # pandas
         return np.array(X.columns, dtype=object)
-    elif hasattr(X, "dims") and isinstance(X.dims, tuple) and len(X.dims) == 2:
-        # xarray DataArray
-        return np.array(X.coords[X.dims[1]], dtype=object)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 341757050a34a..23816c9af28f0 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3441,21 +3441,6 @@ def _construct_dataframe(X, columns):
     )
 
 
-def check_dataarray_column_names_consistency(name, estimator_orig):
-    try:
-        import xarray as xr
-    except ImportError:
-        raise SkipTest(
-            "xarray is not installed: not checking "
-            "column name consistency for xarray"
-        )
-
-    def _construct_xarray(X, columns):
-        return xr.DataArray(X, dims=("index", "columns"), coords={"columns": columns})
-
-    _check_column_name_consistency(name, estimator_orig, _construct_xarray, "xarray")
-
-
 def _check_column_name_consistency(name, estimator_orig, construct_X, array_name):
     estimator = clone(estimator_orig)
     tags = estimator._get_tags()
@@ -3498,8 +3483,7 @@ def _check_column_name_consistency(name, estimator_orig, construct_X, array_name
 
     expected_msg = (
         "The column names should match those that were passed "
-        f"during fit. Got ({bad_names}) expected ({names}). "
-        "Starting version 0.26, an error will be raised"
+        "during fit. Starting version 1.2, an error will be raised"
     )
     for method in check_funcs:
         # TODO In 0.26, this will be an error.
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index e9e6c5e82679b..d113f41498485 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -31,7 +31,6 @@
 from sklearn.utils.estimator_checks import check_regressor_data_not_an_array
 from sklearn.utils.estimator_checks import check_estimator_get_tags_default_keys
 from sklearn.utils.estimator_checks import check_dataframe_column_names_consistency
-from sklearn.utils.estimator_checks import check_dataarray_column_names_consistency
 from sklearn.utils.validation import check_is_fitted
 from sklearn.utils.estimator_checks import check_outlier_corruption
 from sklearn.utils.fixes import np_version, parse_version
@@ -701,27 +700,12 @@ def test_check_estimator_get_tags_default_keys():
 
 
 def test_check_dataframe_column_names_consistency():
-    assert_raises_regex(
-        ValueError,
-        "Estimator does not have a feature_names_in_",
-        check_dataframe_column_names_consistency,
-        "estimator_name",
-        BaseBadClassifier(),
-    )
+    err_msg = "Estimator does not have a feature_names_in_"
+    with raises(ValueError, match=err_msg):
+        check_dataframe_column_names_consistency("estimator_name", BaseBadClassifier())
     check_dataframe_column_names_consistency("estimator_name", PartialFitChecksName())
 
 
-def test_check_dataarray_column_name_consistency():
-    assert_raises_regex(
-        ValueError,
-        "Estimator does not have a feature_names_in_",
-        check_dataarray_column_names_consistency,
-        "estimator_name",
-        BaseBadClassifier(),
-    )
-    check_dataarray_column_names_consistency("estimator_name", PartialFitChecksName())
-
-
 def run_tests_without_pytest():
     """Runs the tests in this file without using pytest."""
     main_module = sys.modules["__main__"]

From a2ce8b2b1a50872fd6e9185a8f19458123d195e4 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 28 Jun 2021 15:15:19 -0400
Subject: [PATCH 27/52] CLN Does not define feature_names_in_ when they do not
 exist

---
 sklearn/base.py            | 4 +++-
 sklearn/tests/test_base.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 003eece7b5a73..ff3b08db05889 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -413,7 +413,9 @@ def _check_feature_names(self, X, reset=True):
         """
 
         if reset:
-            self.feature_names_in_ = _get_feature_names(X)
+            feature_names_in = _get_feature_names(X)
+            if feature_names_in is not None:
+                self.feature_names_in_ = feature_names_in
             return
 
         fitted_feature_names = getattr(self, "feature_names_in_", None)
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index ac180b1b31563..594a195c691d9 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -618,7 +618,7 @@ def test_n_features_in_no_validation():
 
 
 def test_feature_names_in():
-    # Simple checks for feature_names_in
+    """Check that feature_name_in are recorded by `_validate_data`"""
     pd = pytest.importorskip("pandas")
     iris = datasets.load_iris()
     X_np, y = iris.data, iris.target

From f34435351d4d8ae543c28a267b8d778e69931564 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 28 Jun 2021 15:31:56 -0400
Subject: [PATCH 28/52] CLN Be more strict about dataframes

---
 sklearn/base.py                        |  2 +-
 sklearn/tests/test_feature_names.py    | 22 ------------------
 sklearn/utils/_feature_names.py        | 23 ------------------
 sklearn/utils/tests/test_validation.py | 32 ++++++++++++++++++++++++++
 sklearn/utils/validation.py            | 28 ++++++++++++++++++++++
 5 files changed, 61 insertions(+), 46 deletions(-)
 delete mode 100644 sklearn/tests/test_feature_names.py
 delete mode 100644 sklearn/utils/_feature_names.py

diff --git a/sklearn/base.py b/sklearn/base.py
index ff3b08db05889..fbb19ba540e99 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -24,7 +24,7 @@
 from .utils.validation import _check_y
 from .utils.validation import _num_features
 from .utils._estimator_html_repr import estimator_html_repr
-from .utils._feature_names import _get_feature_names
+from .utils.validation import _get_feature_names
 
 
 def clone(estimator, *, safe=True):
diff --git a/sklearn/tests/test_feature_names.py b/sklearn/tests/test_feature_names.py
deleted file mode 100644
index a27691981b51f..0000000000000
--- a/sklearn/tests/test_feature_names.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import pytest
-
-import numpy as np
-from numpy.testing import assert_array_equal
-
-from sklearn.utils._feature_names import _get_feature_names
-
-
-def test_get_feature_names_pandas():
-    pd = pytest.importorskip("pandas")
-    column_names = [f"col_{i}" for i in range(3)]
-    X = np.array([[1, 2, 3], [4, 5, 6]], dtype=float)
-    X = pd.DataFrame(X, columns=column_names)
-
-    names = _get_feature_names(X)
-    assert_array_equal(names, column_names)
-
-
-def test_get_feature_names_numpy():
-    X = np.array([[1, 2, 3], [4, 5, 6]], dtype=float)
-    names = _get_feature_names(X)
-    assert names is None
diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py
deleted file mode 100644
index 9711ee5d69174..0000000000000
--- a/sklearn/utils/_feature_names.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import numpy as np
-
-
-def _get_feature_names(X):
-    """Get feature names from X.
-
-    Parameters
-    ----------
-    X : {dataframe} of shape (n_samples, n_features)
-        Array container to extract feature names.
-
-        - pandas DataFrame : The columns will be considered to be feature
-          names.
-        - All other array containers will return `None`.
-
-    Returns
-    -------
-    names: ndarray of shape (n_features,) or None
-        Column names of `X`. Unrecognized array containers will return `None`.
-    """
-    if hasattr(X, "columns"):
-        # pandas
-        return np.array(X.columns, dtype=object)
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index ac376dbb077ed..b509c321576f0 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -48,6 +48,7 @@
     _check_sample_weight,
     _allclose_dense_sparse,
     _num_features,
+    _get_feature_names,
     FLOAT_DTYPES,
 )
 from sklearn.utils.validation import _check_fit_params
@@ -1451,3 +1452,34 @@ def test_check_array_deprecated_matrix():
     )
     with pytest.warns(FutureWarning, match=msg):
         check_array(X)
+
+
+def test_get_feature_names_pandas():
+    """Get feature names with pandas dataframes."""
+    pd = pytest.importorskip("pandas")
+    column_names = [f"col_{i}" for i in range(3)]
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    X = pd.DataFrame(X, columns=column_names)
+
+    names = _get_feature_names(X)
+    assert_array_equal(names, column_names)
+
+
+def test_get_feature_names_numpy():
+    """Get feature names return None for numpy arrays"""
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    names = _get_feature_names(X)
+    assert names is None
+
+
+def test_ducktype_no_feature_names():
+    """Ducktyped dataframe has no feature names"""
+
+    class NotADataFrame:
+        def __init__(self):
+            self.columns = ["col1", "col2", "col3"]
+            self.iloc = "my_iloc"
+
+    X = NotADataFrame()
+    names = _get_feature_names(X)
+    assert names is None
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index bb699ffefd709..2ab38e169d8a5 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1588,3 +1588,31 @@ def _check_fit_params(X, fit_params, indices=None):
             )
 
     return fit_params_validated
+
+
+def _get_feature_names(X):
+    """Get feature names from X.
+
+    Support for other array containers should place its implementation here.
+
+    Parameters
+    ----------
+    X : dataframe of shape (n_samples, n_features)
+        Array container to extract feature names.
+
+        - pandas DataFrame : The columns will be considered to be feature
+          names.
+        - All other array containers will return `None`.
+
+    Returns
+    -------
+    names: Sequence of str or None
+        Column names of `X`. Unrecognized array containers will return `None`.
+    """
+    if hasattr(X, "iloc"):
+        with suppress(ImportError):
+            # check explicitly for pandas
+            import pandas as pd
+
+            if isinstance(X, pd.DataFrame):
+                return X.columns

From f114e98c814b04b0b75645bedb3d0105b43939f8 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 28 Jun 2021 16:12:37 -0400
Subject: [PATCH 29/52] CLN Sync with data generation in
 check_n_features_in_after_fitting

---
 sklearn/tests/test_base.py        |  9 +++++----
 sklearn/utils/estimator_checks.py | 31 ++++++++++++++++++++-----------
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 594a195c691d9..1a34579ebcecc 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -621,11 +621,11 @@ def test_feature_names_in():
     """Check that feature_name_in are recorded by `_validate_data`"""
     pd = pytest.importorskip("pandas")
     iris = datasets.load_iris()
-    X_np, y = iris.data, iris.target
+    X_np = iris.data
     df = pd.DataFrame(X_np, columns=iris.feature_names)
 
     class NoOpTransformer(TransformerMixin, BaseEstimator):
-        def fit(self, X, y):
+        def fit(self, X, y=None):
             self._validate_data(X)
             return self
 
@@ -633,7 +633,8 @@ def transform(self, X):
             self._validate_data(X, reset=False)
             return X
 
-    trans = NoOpTransformer().fit(df, y)
+    # fit on dataframe saves the feature names
+    trans = NoOpTransformer().fit(df)
     assert_array_equal(trans.feature_names_in_, df.columns)
 
     msg = "The column names should match those that were passed"
@@ -647,6 +648,6 @@ def transform(self, X):
     assert not record
 
     # fitted on numpy array and transformed on pandas array does not warn
-    trans = NoOpTransformer().fit(X_np, y)
+    trans = NoOpTransformer().fit(X_np)
     with pytest.warns(None) as record:
         trans.transform(df)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 23816c9af28f0..75c8622e2f78a 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3442,27 +3442,36 @@ def _construct_dataframe(X, columns):
 
 
 def _check_column_name_consistency(name, estimator_orig, construct_X, array_name):
-    estimator = clone(estimator_orig)
-    tags = estimator._get_tags()
+    tags = _safe_tags(estimator_orig)
 
-    # should be
-    if "2darray" not in tags["X_types"] or tags["no_validation"]:
+    if (
+        "2darray" not in tags["X_types"]
+        and "sparse" not in tags["X_types"]
+        or tags["no_validation"]
+    ):
         return
 
-    X_orig, _ = make_regression(random_state=0, n_features=5)
+    rng = np.random.RandomState(0)
+
+    estimator = clone(estimator_orig)
+    set_random_state(estimator)
+    if "warm_start" in estimator.get_params():
+        estimator.set_params(warm_start=False)
+
+    X_orig = rng.normal(size=(150, 8))
     X_orig = _enforce_estimator_tags_x(estimator, X_orig)
     X_orig = _pairwise_estimator_convert_X(X_orig, estimator)
-
     n_samples, n_features = X_orig.shape
+
     names = np.array([f"col_{i}" for i in range(n_features)])
     X = construct_X(X_orig, names)
 
-    rng = np.random.RandomState(0)
-    y = rng.randint(low=0, high=2, size=n_samples)
+    if is_regressor(estimator):
+        y = rng.normal(size=n_samples)
+    else:
+        y = rng.randint(low=0, high=2, size=n_samples)
     y = _enforce_estimator_tags_y(estimator, y)
 
-    estimator.fit(X, y)
-
     if not hasattr(estimator, "feature_names_in_"):
         raise ValueError(
             "Estimator does not have a feature_names_in_ "
@@ -3486,7 +3495,7 @@ def _check_column_name_consistency(name, estimator_orig, construct_X, array_name
         "during fit. Starting version 1.2, an error will be raised"
     )
     for method in check_funcs:
-        # TODO In 0.26, this will be an error.
+        # TODO In 1.2, this will be an error.
         assert_warns_message(FutureWarning, expected_msg, method, X_bad)
 
     # partial_fit checks on second call

From 396d3eaf5599a23c9d9d235f321a2d8ff332df25 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 28 Jun 2021 16:50:46 -0400
Subject: [PATCH 30/52] ENH Removes estimators from ignore list

---
 sklearn/base.py                          |  1 +
 sklearn/calibration.py                   |  2 ++
 sklearn/feature_selection/_from_model.py |  2 ++
 sklearn/kernel_approximation.py          |  1 +
 sklearn/linear_model/_ransac.py          |  1 +
 sklearn/linear_model/_ridge.py           |  2 ++
 sklearn/manifold/_isomap.py              |  2 ++
 sklearn/manifold/_locally_linear.py      |  2 +-
 sklearn/tests/test_common.py             | 23 +----------------------
 sklearn/utils/estimator_checks.py        | 20 ++++++++++++++++----
 10 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index fbb19ba540e99..8fc5cf8d7e002 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -497,6 +497,7 @@ def _validate_data(
             validated.
         """
         self._check_feature_names(X, reset=reset)
+
         if y is None and self._get_tags()["requires_y"]:
             raise ValueError(
                 f"This {self.__class__.__name__} estimator "
diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 05e02475bbec9..1faef5701c515 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -368,6 +368,8 @@ def fit(self, X, y, sample_weight=None):
         first_clf = self.calibrated_classifiers_[0].base_estimator
         if hasattr(first_clf, "n_features_in_"):
             self.n_features_in_ = first_clf.n_features_in_
+        if hasattr(first_clf, "n_features_in_"):
+            self.feature_names_in_ = first_clf.feature_names_in_
         return self
 
     def predict_proba(self, X):
diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py
index 2814a5a1a0fb9..7bf9ca00e5cf9 100644
--- a/sklearn/feature_selection/_from_model.py
+++ b/sklearn/feature_selection/_from_model.py
@@ -256,6 +256,8 @@ def fit(self, X, y=None, **fit_params):
             raise NotFittedError("Since 'prefit=True', call transform directly")
         self.estimator_ = clone(self.estimator)
         self.estimator_.fit(X, y, **fit_params)
+        if hasattr(self.estimator_, "feature_names_in_"):
+            self.feature_names_in_ = self.estimator_.feature_names_in_
         return self
 
     @property
diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py
index 725e60b97cb1f..a706685ec2cf8 100644
--- a/sklearn/kernel_approximation.py
+++ b/sklearn/kernel_approximation.py
@@ -436,6 +436,7 @@ def transform(self, X):
         """
         check_is_fitted(self)
 
+        self._check_feature_names(X, reset=False)
         X = as_float_array(X, copy=True)
         X = self._validate_data(X, copy=False, reset=False)
         if (X <= -self.skewedness).any():
diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py
index dd600363b3d8d..c00afc3c34d0d 100644
--- a/sklearn/linear_model/_ransac.py
+++ b/sklearn/linear_model/_ransac.py
@@ -539,6 +539,7 @@ def predict(self, X):
             Returns predicted values.
         """
         check_is_fitted(self)
+        self._check_feature_names(X, reset=False)
 
         return self.estimator_.predict(X)
 
diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
index 512b2bec61d95..60c40919d1173 100644
--- a/sklearn/linear_model/_ridge.py
+++ b/sklearn/linear_model/_ridge.py
@@ -1848,6 +1848,8 @@ def fit(self, X, y, sample_weight=None):
         self.coef_ = estimator.coef_
         self.intercept_ = estimator.intercept_
         self.n_features_in_ = estimator.n_features_in_
+        if hasattr(estimator, "feature_names_in_"):
+            self.feature_names_in_ = estimator.feature_names_in_
 
         return self
 
diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py
index 341061bb34ec2..91cd4d2483666 100644
--- a/sklearn/manifold/_isomap.py
+++ b/sklearn/manifold/_isomap.py
@@ -166,6 +166,8 @@ def _fit_transform(self, X):
         )
         self.nbrs_.fit(X)
         self.n_features_in_ = self.nbrs_.n_features_in_
+        if hasattr(self.nbrs_, "feature_names_in_"):
+            self.feature_names_in_ = self.nbrs_.feature_names_in_
 
         self.kernel_pca_ = KernelPCA(
             n_components=self.n_components,
diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py
index 64cc5c087052b..29e29647d1e85 100644
--- a/sklearn/manifold/_locally_linear.py
+++ b/sklearn/manifold/_locally_linear.py
@@ -768,7 +768,7 @@ def transform(self, X):
         """
         check_is_fitted(self)
 
-        X = check_array(X)
+        X = self._validate_data(X, reset=False)
         ind = self.nbrs_.kneighbors(
             X, n_neighbors=self.n_neighbors, return_distance=False
         )
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 717e8d763902f..1909fd23eb013 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -317,35 +317,14 @@ def test_check_n_features_in_after_fitting(estimator):
 # is actually checking that the inner estimator checks for column name
 # consistency
 COLUMN_NAME_MODULES_TO_IGNORE = {
-    "calibration",
-    "cluster",
     "compose",
-    "covariance",
-    "decomposition",
-    "discriminant_analysis",
     "ensemble",
     "feature_extraction",
-    "feature_selection",
-    "gaussian_process",
-    "isotonic",
-    "impute",
     "kernel_approximation",
-    "kernel_ridge",
-    "linear_model",
-    "manifold",
-    "mixture",
     "model_selection",
     "multiclass",
     "multioutput",
-    "naive_bayes",
-    "neighbors",
-    "neural_network",
-    "pipeline",
-    "preprocessing",
-    "random_projection",
-    "semi_supervised",
-    "svm",
-    "tree",
+    "semi_supervised" "pipeline",
 }
 
 
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 75c8622e2f78a..28630dee680b1 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -19,7 +19,6 @@
 from ._testing import assert_array_almost_equal
 from ._testing import assert_allclose
 from ._testing import assert_allclose_dense_sparse
-from ._testing import assert_warns_message
 from ._testing import set_random_state
 from ._testing import SkipTest
 from ._testing import ignore_warnings
@@ -3472,6 +3471,8 @@ def _check_column_name_consistency(name, estimator_orig, construct_X, array_name
         y = rng.randint(low=0, high=2, size=n_samples)
     y = _enforce_estimator_tags_y(estimator, y)
 
+    estimator.fit(X, y)
+
     if not hasattr(estimator, "feature_names_in_"):
         raise ValueError(
             "Estimator does not have a feature_names_in_ "
@@ -3496,12 +3497,23 @@ def _check_column_name_consistency(name, estimator_orig, construct_X, array_name
     )
     for method in check_funcs:
         # TODO In 1.2, this will be an error.
-        assert_warns_message(FutureWarning, expected_msg, method, X_bad)
+        with warnings.catch_warnings():
+            warnings.filterwarnings("error", category=FutureWarning, module="sklearn")
+            with raises(FutureWarning, match=expected_msg):
+                method(X_bad)
 
     # partial_fit checks on second call
     if not hasattr(estimator, "partial_fit"):
         return  # partial_fit is not defined
 
     estimator = clone(estimator_orig)
-    estimator.partial_fit(X, y)
-    assert_warns_message(FutureWarning, expected_msg, estimator.partial_fit, X_bad, y)
+    if is_classifier(estimator):
+        classes = np.unique(y)
+        estimator.partial_fit(X, y, classes=classes)
+    else:
+        estimator.partial_fit(X, y)
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("error", category=FutureWarning, module="sklearn")
+        with raises(FutureWarning, match=expected_msg):
+            estimator.partial_fit(X_bad, y)

From 8372a6e1d3d3a7fc45b5e3cbe0c44f95cb48bc94 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 28 Jun 2021 16:51:47 -0400
Subject: [PATCH 31/52] ENH Removes estimators from ignore list

---
 sklearn/tests/test_common.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 1909fd23eb013..76dccd34ec0b0 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -324,7 +324,8 @@ def test_check_n_features_in_after_fitting(estimator):
     "model_selection",
     "multiclass",
     "multioutput",
-    "semi_supervised" "pipeline",
+    "pipeline",
+    "semi_supervised",
 }
 
 
From dd3612050cfc94ec13fa94a1f08964f404241052 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 28 Jun 2021 16:55:45 -0400
Subject: [PATCH 32/52] DOC Adds whats new

---
 doc/whats_new/v1.0.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 9689cd8789a7a..2a2bcd31a8be5 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -130,6 +130,12 @@ Changelog
 - |API| `np.matrix` usage is deprecated in 1.0 and will raise a `TypeError` in
   1.2. :pr:`20165` by `Thomas Fan`_.
 
+- |API| All estimators store `feature_names_in_` when fitted on pandas Dataframes.
+  These feature names are compared to names seen in `non-fit` methods,
+  `i.e.` `transform` and will raise a `FutureWarning` if they are not consistent.
+  These `FutureWarning`s will become `ValueError`s in 1.2.
+  :pr:`18010` by `Thomas Fan`_.
+
 :mod:`sklearn.base`
 ...................
 

From bacbec139b854f9d9b8149ed2bac964dee66a476 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 28 Jun 2021 17:26:09 -0400
Subject: [PATCH 33/52] BUG Fix typo bug

---
 sklearn/calibration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 1faef5701c515..edc86435dcd5e 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -368,7 +368,7 @@ def fit(self, X, y, sample_weight=None):
         first_clf = self.calibrated_classifiers_[0].base_estimator
         if hasattr(first_clf, "n_features_in_"):
             self.n_features_in_ = first_clf.n_features_in_
-        if hasattr(first_clf, "n_features_in_"):
+        if hasattr(first_clf, "feature_names_in_"):
             self.feature_names_in_ = first_clf.feature_names_in_
         return self
 

From d86e70eddd3c3a9ab3df155f4f3135cb5e04b7e4 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 9 Jul 2021 16:34:47 -0400
Subject: [PATCH 34/52] CLN Address comments

---
 sklearn/base.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 8fc5cf8d7e002..45fc219527036 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -393,7 +393,7 @@ def _check_n_features(self, X, reset):
                 f"is expecting {self.n_features_in_} features as input."
             )
 
-    def _check_feature_names(self, X, reset=True):
+    def _check_feature_names(self, X, *, reset):
         """Validate feature names and set or check the `feature_names_in_`
         attribute.
 
@@ -402,7 +402,7 @@ def _check_feature_names(self, X, reset=True):
         X : {dataframe-like} of shape (n_samples, n_features)
             The input samples.
 
-        reset : bool, default=True
+        reset : bool
             Whether to reset the `feature_names_in_` attribute.
             If False, the input will be checked for consistency with
             feature names of data provided when reset was last True.
@@ -478,8 +478,8 @@ def _validate_data(
             If False, the input will be checked for consistency with data
             provided when reset was last True.
             .. note::
-               It is recommended to call `reset=True` in `fit` and in the first
-               call to `partial_fit`. All other methods that validates `X`
+               It is recommended to call reset=True in `fit` and in the first
+               call to `partial_fit`. All other methods that validate `X`
                should set `reset=False`.
         validate_separately : False or tuple of dicts, default=False
             Only used if y is not None.

From 8a4212f46b422eeda466aa45a34942198aa393d7 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 9 Jul 2021 17:23:39 -0400
Subject: [PATCH 35/52] CLN Address comments

---
 sklearn/base.py                        |   2 +-
 sklearn/linear_model/_ransac.py        |   1 +
 sklearn/tests/test_common.py           |   7 +-
 sklearn/utils/estimator_checks.py      | 109 ++++++++++++++-----------
 sklearn/utils/tests/test_validation.py |  13 ---
 sklearn/utils/validation.py            |  11 +--
 6 files changed, 70 insertions(+), 73 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 45fc219527036..f9aea9791e02f 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -399,7 +399,7 @@ def _check_feature_names(self, X, *, reset):
 
         Parameters
         ----------
-        X : {dataframe-like} of shape (n_samples, n_features)
+        X : {ndarray, dataframe} of shape (n_samples, n_features)
             The input samples.
 
         reset : bool
diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py
index c00afc3c34d0d..52d57334acc04 100644
--- a/sklearn/linear_model/_ransac.py
+++ b/sklearn/linear_model/_ransac.py
@@ -562,6 +562,7 @@ def score(self, X, y):
             Score of the prediction.
         """
         check_is_fitted(self)
+        self._check_feature_names(X, reset=False)
 
         return self.estimator_.score(X, y)
 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 76dccd34ec0b0..ec1b9e13ef688 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -313,9 +313,10 @@ def test_check_n_features_in_after_fitting(estimator):
 # TODO: When more modules get added, we can remove it from this list to make
 # sure it gets tested. After we finish each module we can move the checks
 # into check_estimator.
-# NOTE: Metaestimators that delegates validation to the inner estimator is
-# is actually checking that the inner estimator checks for column name
-# consistency
+# NOTE: When running `check_dataframe_column_names_consistency` on a meta-estimator that
+# delegates validation to a base estimator, the check is testing that the base estimator
+# is checking for column name consistency.
+
 COLUMN_NAME_MODULES_TO_IGNORE = {
     "compose",
     "ensemble",
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 28630dee680b1..35d2cc5d54d13 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3428,19 +3428,10 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
         import pandas as pd
     except ImportError:
         raise SkipTest(
-            "pandas is not installed: not checking "
-            "column name consistency for pandas"
+            "pandas is not installed: not checking column name consistency "
+            "for pandas"
         )
 
-    def _construct_dataframe(X, columns):
-        return pd.DataFrame(X, columns=columns)
-
-    _check_column_name_consistency(
-        name, estimator_orig, _construct_dataframe, "dataframe"
-    )
-
-
-def _check_column_name_consistency(name, estimator_orig, construct_X, array_name):
     tags = _safe_tags(estimator_orig)
 
     if (
@@ -3454,8 +3445,6 @@ def _check_column_name_consistency(name, estimator_orig, construct_X, array_name
 
     estimator = clone(estimator_orig)
     set_random_state(estimator)
-    if "warm_start" in estimator.get_params():
-        estimator.set_params(warm_start=False)
 
     X_orig = rng.normal(size=(150, 8))
     X_orig = _enforce_estimator_tags_x(estimator, X_orig)
@@ -3463,57 +3452,81 @@ def _check_column_name_consistency(name, estimator_orig, construct_X, array_name
     n_samples, n_features = X_orig.shape
 
     names = np.array([f"col_{i}" for i in range(n_features)])
-    X = construct_X(X_orig, names)
+    X = pd.DataFrame(X_orig, columns=names)
 
     if is_regressor(estimator):
         y = rng.normal(size=n_samples)
     else:
         y = rng.randint(low=0, high=2, size=n_samples)
     y = _enforce_estimator_tags_y(estimator, y)
-
     estimator.fit(X, y)
 
     if not hasattr(estimator, "feature_names_in_"):
         raise ValueError(
             "Estimator does not have a feature_names_in_ "
-            f"attribute after fitting with a {array_name}"
+            "attribute after fitting with a dataframe"
         )
 
-    check_funcs = [
-        getattr(estimator, func)
-        for func in ("predict", "transform", "decision_function", "predict_proba")
-        if hasattr(estimator, func)
-    ]
-    for func in check_funcs:
-        func(X)  # works
+    check_methods = []
+    for method in (
+        "predict",
+        "transform",
+        "decision_function",
+        "predict_proba",
+        "score",
+        "predict_log_proba",
+    ):
+        if not hasattr(estimator, method):
+            continue
+
+        callable_method = getattr(estimator, method)
+        if method == "score":
+            callable_method = partial(callable_method, y=y)
+        check_methods.append((method, callable_method))
+
+    for _, method in check_methods:
+        method(X)  # works
 
     assert_array_equal(estimator.feature_names_in_, names)
-    bad_names = names[::-1]
-    X_bad = construct_X(X, bad_names)
 
-    expected_msg = (
-        "The column names should match those that were passed "
-        "during fit. Starting version 1.2, an error will be raised"
-    )
-    for method in check_funcs:
-        # TODO In 1.2, this will be an error.
-        with warnings.catch_warnings():
-            warnings.filterwarnings("error", category=FutureWarning, module="sklearn")
-            with raises(FutureWarning, match=expected_msg):
-                method(X_bad)
+    invalid_names = [
+        names[::-1],
+        [f"another_prefix_{i}" for i in range(n_features)],
+        names[:3],
+    ]
 
-    # partial_fit checks on second call
-    if not hasattr(estimator, "partial_fit"):
-        return  # partial_fit is not defined
+    for invalid_name in invalid_names:
+        X_bad = pd.DataFrame(X, columns=invalid_name)
 
-    estimator = clone(estimator_orig)
-    if is_classifier(estimator):
-        classes = np.unique(y)
-        estimator.partial_fit(X, y, classes=classes)
-    else:
-        estimator.partial_fit(X, y)
+        expected_msg = (
+            "The column names should match those that were passed "
+            "during fit. Starting version 1.2, an error will be raised"
+        )
+        for name, method in check_methods:
+            # TODO In 1.2, this will be an error.
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "error",
+                    category=FutureWarning,
+                    module="sklearn",
+                )
+                with raises(
+                    FutureWarning, match=expected_msg, err_msg=f"{name} did not raise"
+                ):
+                    method(X_bad)
 
-    with warnings.catch_warnings():
-        warnings.filterwarnings("error", category=FutureWarning, module="sklearn")
-        with raises(FutureWarning, match=expected_msg):
-            estimator.partial_fit(X_bad, y)
+        # partial_fit checks on second call
+        if not hasattr(estimator, "partial_fit"):
+            continue
+
+        estimator = clone(estimator_orig)
+        if is_classifier(estimator):
+            classes = np.unique(y)
+            estimator.partial_fit(X, y, classes=classes)
+        else:
+            estimator.partial_fit(X, y)
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings("error", category=FutureWarning, module="sklearn")
+            with raises(FutureWarning, match=expected_msg):
+                estimator.partial_fit(X_bad, y)
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index b509c321576f0..9bd4557ae4c57 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1470,16 +1470,3 @@ def test_get_feature_names_numpy():
     X = np.array([[1, 2, 3], [4, 5, 6]])
     names = _get_feature_names(X)
     assert names is None
-
-
-def test_ducktype_no_feature_names():
-    """Ducktyped dataframe has no feature names"""
-
-    class NotADataFrame:
-        def __init__(self):
-            self.columns = ["col1", "col2", "col3"]
-            self.iloc = "my_iloc"
-
-    X = NotADataFrame()
-    names = _get_feature_names(X)
-    assert names is None
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 2ab38e169d8a5..538823ee0b3e2 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1597,7 +1597,7 @@ def _get_feature_names(X):
 
     Parameters
     ----------
-    X : dataframe of shape (n_samples, n_features)
+    X : {ndarray, dataframe} of shape (n_samples, n_features)
         Array container to extract feature names.
 
         - pandas DataFrame : The columns will be considered to be feature
@@ -1609,10 +1609,5 @@ def _get_feature_names(X):
     names: Sequence of str or None
         Column names of `X`. Unrecognized array containers will return `None`.
     """
-    if hasattr(X, "iloc"):
-        with suppress(ImportError):
-            # check explicitly for pandas
-            import pandas as pd
-
-            if isinstance(X, pd.DataFrame):
-                return X.columns
+    if hasattr(X, "columns"):
+        return X.columns

From ee95642faeba76c1466d7f83cb948e96872f880a Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 9 Jul 2021 17:28:57 -0400
Subject: [PATCH 36/52] DOC grammar

---
 sklearn/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index f9aea9791e02f..9f41a9f17d56a 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -408,7 +408,7 @@ def _check_feature_names(self, X, *, reset):
             feature names of data provided when reset was last True.
             .. note::
                It is recommended to call `reset=True` in `fit` and in the first
-               call to `partial_fit`. All other methods that validates `X`
+               call to `partial_fit`. All other methods that validate `X`
                should set `reset=False`.
         """
 

From 7aeae36d0df096cd3e40bb538413efc009869785 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 9 Jul 2021 21:08:19 -0400
Subject: [PATCH 37/52] TST Adds test and remove support for multiindex

---
 sklearn/utils/tests/test_validation.py | 22 ++++++++++++++++------
 sklearn/utils/validation.py            | 10 +++++++---
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 9bd4557ae4c57..2a3360ea88144 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1454,19 +1454,29 @@ def test_check_array_deprecated_matrix():
         check_array(X)
 
 
-def test_get_feature_names_pandas():
+@pytest.mark.parametrize("names", [[f"col_{i}" for i in range(3)], list(range(3))])
+def test_get_feature_names_pandas(names):
     """Get feature names with pandas dataframes."""
     pd = pytest.importorskip("pandas")
-    column_names = [f"col_{i}" for i in range(3)]
-    X = np.array([[1, 2, 3], [4, 5, 6]])
-    X = pd.DataFrame(X, columns=column_names)
+    X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=names)
 
     names = _get_feature_names(X)
-    assert_array_equal(names, column_names)
+    assert_array_equal(names, names)
 
 
 def test_get_feature_names_numpy():
-    """Get feature names return None for numpy arrays"""
+    """Get feature names return None for numpy arrays."""
     X = np.array([[1, 2, 3], [4, 5, 6]])
     names = _get_feature_names(X)
     assert names is None
+
+
+def test_get_feature_names_error_multi_index_fail():
+    """Check that MultiIndex raises an error."""
+    pd = pytest.importorskip("pandas")
+    names = pd.MultiIndex.from_product([["one", "two"], ["A", "B"]])
+    X = pd.DataFrame([[1, 2, 3, 4]], columns=names)
+
+    msg = "Pandas MultiIndex is not supported as feature names"
+    with pytest.raises(ValueError, match=msg):
+        _get_feature_names(X)
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 538823ee0b3e2..e52623628d8a9 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1606,8 +1606,12 @@ def _get_feature_names(X):
 
     Returns
     -------
-    names: Sequence of str or None
-        Column names of `X`. Unrecognized array containers will return `None`.
+    names: ndarray or None
+        Feature names of `X`. Unrecognized array containers will return `None`.
     """
     if hasattr(X, "columns"):
-        return X.columns
+        feature_names = np.array(X.columns)
+        # multiple index
+        if isinstance(feature_names[0], tuple):
+            raise ValueError("Pandas MultiIndex is not supported as feature names")
+        return feature_names

From 2e4e422f46d63cd38d4116696dbb07e9e1bd7dd1 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 9 Jul 2021 21:32:56 -0400
Subject: [PATCH 38/52] STY Run black

---
 sklearn/utils/estimator_checks.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 7b3361c244c5f..f26b1adfaab9c 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3442,8 +3442,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
         import pandas as pd
     except ImportError:
         raise SkipTest(
-            "pandas is not installed: not checking column name consistency "
-            "for pandas"
+            "pandas is not installed: not checking column name consistency for pandas"
         )
 
     tags = _safe_tags(estimator_orig)

From d112c617d59de655d92bc04e108eb453db83033b Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 9 Jul 2021 22:02:39 -0400
Subject: [PATCH 39/52] ENH Adds warnings to _check_feature_names

---
 sklearn/base.py                   | 26 ++++++++++++++++++--------
 sklearn/tests/test_base.py        | 14 +++++++++-----
 sklearn/utils/estimator_checks.py |  3 +--
 3 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 89a4d00f4d094..15c3414b42c67 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -422,19 +422,29 @@ def _check_feature_names(self, X, *, reset):
             return
 
         fitted_feature_names = getattr(self, "feature_names_in_", None)
-        if fitted_feature_names is None:
-            # no feature names to check
+        X_feature_names = _get_feature_names(X)
+
+        if fitted_feature_names is None and X_feature_names is None:
+            # no feature names seen in fit and in X
             return
 
-        new_feature_names_in = _get_feature_names(X)
-        if new_feature_names_in is None:
-            # X does not have feature names but estimator was fitted with
-            # data with feature names
+        if X_feature_names is not None and fitted_feature_names is None:
+            warnings.warn(
+                f"X has feature names, but {self.__class__.__name__} was fitted without"
+                " feature names"
+            )
+            return
+
+        if X_feature_names is None and fitted_feature_names is not None:
+            warnings.warn(
+                f"X does not have any feature names, but {self.__class__.__name__} was "
+                "fitted with feature names"
+            )
             return
 
         # valid the `feature_names_in_` attribute
-        if len(fitted_feature_names) != len(new_feature_names_in) or np.any(
-            fitted_feature_names != new_feature_names_in
+        if len(fitted_feature_names) != len(X_feature_names) or np.any(
+            fitted_feature_names != X_feature_names
         ):
             warnings.warn(
                 "The column names should match those that were "
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index fc5d2507c41af..87006259ba523 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -642,12 +642,16 @@ def transform(self, X):
     with pytest.warns(FutureWarning, match=msg):
         trans.transform(df_bad)
 
-    # does not warn when transforming on numpy array
-    with pytest.warns(None) as record:
+    # warns when fitted on dataframe and transforming a ndarray
+    msg = (
+        "X does not have any feature names, but NoOpTransformer was "
+        "fitted with feature names"
+    )
+    with pytest.warns(UserWarning, match=msg):
         trans.transform(X_np)
-    assert not record
 
-    # fitted on numpy array and transformed on pandas array does not warn
+    # warns when fitted on a ndarray and transforming dataframe
+    msg = "X has feature names, but NoOpTransformer was fitted without feature names"
     trans = NoOpTransformer().fit(X_np)
-    with pytest.warns(None) as record:
+    with pytest.warns(UserWarning, match=msg):
         trans.transform(df)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index f26b1adfaab9c..7a26088b20cab 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3479,6 +3479,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
             "Estimator does not have a feature_names_in_ "
             "attribute after fitting with a dataframe"
         )
+    assert_array_equal(estimator.feature_names_in_, names)
 
     check_methods = []
     for method in (
@@ -3500,8 +3501,6 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
     for _, method in check_methods:
         method(X)  # works
 
-    assert_array_equal(estimator.feature_names_in_, names)
-
     invalid_names = [
         names[::-1],
         [f"another_prefix_{i}" for i in range(n_features)],

From f70a56e952d5af58d27abda813d5973cfaf99cdf Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 23 Jul 2021 15:52:42 -0400
Subject: [PATCH 40/52] CLN Address comments

---
 sklearn/base.py                        | 9 +++++----
 sklearn/kernel_approximation.py        | 9 ++++-----
 sklearn/neural_network/_rbm.py         | 3 +--
 sklearn/utils/estimator_checks.py      | 3 ++-
 sklearn/utils/tests/test_validation.py | 2 +-
 sklearn/utils/validation.py            | 8 ++++----
 6 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 15c3414b42c67..c840d193cc317 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -397,8 +397,9 @@ def _check_n_features(self, X, reset):
             )
 
     def _check_feature_names(self, X, *, reset):
-        """Validate feature names and set or check the `feature_names_in_`
-        attribute.
+        """Set or check the `feature_names_in_` attribute.
+
+        .. versionadded:: 1.0
 
         Parameters
         ----------
@@ -442,12 +443,12 @@ def _check_feature_names(self, X, *, reset):
             )
             return
 
-        # valid the `feature_names_in_` attribute
+        # validate the feature names against the `feature_names_in_` attribute
         if len(fitted_feature_names) != len(X_feature_names) or np.any(
             fitted_feature_names != X_feature_names
         ):
             warnings.warn(
-                "The column names should match those that were "
+                "The feature names should match those that were "
                 "passed during fit. Starting version 1.2, an error will be raised",
                 FutureWarning,
             )
diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py
index ef111512faed7..d0e63f31c6320 100644
--- a/sklearn/kernel_approximation.py
+++ b/sklearn/kernel_approximation.py
@@ -21,7 +21,7 @@
 
 from .base import BaseEstimator
 from .base import TransformerMixin
-from .utils import check_random_state, as_float_array
+from .utils import check_random_state
 from .utils.extmath import safe_sparse_dot
 from .utils.validation import check_is_fitted
 from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS
@@ -450,10 +450,9 @@ def transform(self, X):
             Projected array.
         """
         check_is_fitted(self)
-
-        self._check_feature_names(X, reset=False)
-        X = as_float_array(X, copy=True)
-        X = self._validate_data(X, copy=False, reset=False)
+        X = self._validate_data(
+            X, copy=True, dtype=[np.float64, np.float32], reset=False
+        )
         if (X <= -self.skewedness).any():
             raise ValueError("X may not contain entries smaller than -skewedness.")
 
diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py
index 3d8647e3960f6..6d9afeb10de63 100644
--- a/sklearn/neural_network/_rbm.py
+++ b/sklearn/neural_network/_rbm.py
@@ -15,7 +15,6 @@
 
 from ..base import BaseEstimator
 from ..base import TransformerMixin
-from ..utils import check_array
 from ..utils import check_random_state
 from ..utils import gen_even_slices
 from ..utils.extmath import safe_sparse_dot
@@ -333,7 +332,7 @@ def score_samples(self, X):
         """
         check_is_fitted(self)
 
-        v = check_array(X, accept_sparse="csr")
+        v = self._validate_data(X, accept_sparse="csr", reset=False)
         rng = check_random_state(self.random_state)
 
         # Randomly corrupt one feature in each sample in v.
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index f6bb66dc65a0e..75d91556bac8b 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3522,6 +3522,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
         "decision_function",
         "predict_proba",
         "score",
+        "score_samples",
         "predict_log_proba",
     ):
         if not hasattr(estimator, method):
@@ -3545,7 +3546,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
         X_bad = pd.DataFrame(X, columns=invalid_name)
 
         expected_msg = (
-            "The column names should match those that were passed "
+            "The feature names should match those that were passed "
             "during fit. Starting version 1.2, an error will be raised"
         )
         for name, method in check_methods:
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index b8823d103b3f3..612523ada41ae 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1449,7 +1449,7 @@ def test_check_array_deprecated_matrix():
 
 
 @pytest.mark.parametrize("names", [[f"col_{i}" for i in range(3)], list(range(3))])
-def test_get_feature_names_pandas(names):
+def test_get_feature_names_pandas_error(names):
     """Get feature names with pandas dataframes."""
     pd = pytest.importorskip("pandas")
     X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=names)
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 4a1a347e7d9e6..9eaa8c2d795fa 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1609,8 +1609,8 @@ def _get_feature_names(X):
         Feature names of `X`. Unrecognized array containers will return `None`.
     """
     if hasattr(X, "columns"):
-        feature_names = np.array(X.columns)
-        # multiple index
-        if isinstance(feature_names[0], tuple):
+        feature_names = X.columns
+        # Only strings are supported
+        if not isinstance(feature_names[0], str):
             raise ValueError("Pandas MultiIndex is not supported as feature names")
-        return feature_names
+        return np.asarray(feature_names)

From 62bb28b5cae614ebb8bbc2b5af499f2527126e51 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 23 Jul 2021 18:53:32 -0400
Subject: [PATCH 41/52] ENH Adds better errors

---
 sklearn/base.py                   | 35 ++++++++++++++++++++++++++++---
 sklearn/utils/estimator_checks.py | 20 ++++++++++++------
 sklearn/utils/validation.py       |  4 ++--
 3 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index c840d193cc317..b57868fd73153 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -447,11 +447,40 @@ def _check_feature_names(self, X, *, reset):
         if len(fitted_feature_names) != len(X_feature_names) or np.any(
             fitted_feature_names != X_feature_names
         ):
-            warnings.warn(
+            message = (
                 "The feature names should match those that were "
-                "passed during fit. Starting version 1.2, an error will be raised",
-                FutureWarning,
+                "passed during fit. Starting version 1.2, an error will be raised.\n"
             )
+            fitted_feature_names_set = set(fitted_feature_names)
+            X_feature_names_set = set(X_feature_names)
+
+            unexpected_names = sorted(X_feature_names_set - fitted_feature_names_set)
+            missing_names = sorted(fitted_feature_names_set - X_feature_names_set)
+
+            def add_names(names):
+                output = ""
+                max_n_names = 5
+                for i, name in enumerate(names):
+                    if i >= max_n_names:
+                        output += "- ...\n"
+                        break
+                    output += f"- {name}\n"
+                return output
+
+            if unexpected_names:
+                message += "Feature names unseen at fit time:\n"
+                message += add_names(unexpected_names)
+
+            if missing_names:
+                message += "Feature names seen at fit time, yet now missing:\n"
+                message += add_names(missing_names)
+
+            if not missing_names and not missing_names:
+                message += (
+                    "Feature names must be in the same order as they were in fit.\n"
+                )
+
+            warnings.warn(message, FutureWarning)
 
     def _validate_data(
         self,
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 75d91556bac8b..0162f954efd40 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3537,17 +3537,25 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
         method(X)  # works
 
     invalid_names = [
-        names[::-1],
-        [f"another_prefix_{i}" for i in range(n_features)],
-        names[:3],
+        (names[::-1], "Feature names must be in the same order as they were in fit."),
+        (
+            [f"another_prefix_{i}" for i in range(n_features)],
+            "Feature names unseen at fit time:\n- another_prefix_0\n-"
+            " another_prefix_1\n",
+        ),
+        (
+            names[:3],
+            f"Feature names seen at fit time, yet now missing:\n- {min(names[3:])}\n",
+        ),
     ]
 
-    for invalid_name in invalid_names:
+    for invalid_name, additional_message in invalid_names:
         X_bad = pd.DataFrame(X, columns=invalid_name)
 
-        expected_msg = (
+        expected_msg = re.escape(
             "The feature names should match those that were passed "
-            "during fit. Starting version 1.2, an error will be raised"
+            "during fit. Starting version 1.2, an error will be raised.\n"
+            f"{additional_message}"
         )
         for name, method in check_methods:
             # TODO In 1.2, this will be an error.
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 9eaa8c2d795fa..75569c07ccacd 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1609,8 +1609,8 @@ def _get_feature_names(X):
         Feature names of `X`. Unrecognized array containers will return `None`.
     """
     if hasattr(X, "columns"):
-        feature_names = X.columns
+        feature_names = np.asarray(X.columns)
         # Only strings are supported
         if not isinstance(feature_names[0], str):
-            raise ValueError("Pandas MultiIndex is not supported as feature names")
+            raise ValueError("Only strings ")
         return np.asarray(feature_names)

From a0d4d12a8001260c80d10a667482c0bf721901bf Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 23 Jul 2021 22:06:25 -0400
Subject: [PATCH 42/52] ENH Warn for non-string columns

---
 sklearn/base.py                           |  2 +-
 sklearn/utils/_feature_names.py           | 38 ++++++++++++++++++++
 sklearn/utils/tests/test_feature_names.py | 42 +++++++++++++++++++++++
 sklearn/utils/tests/test_validation.py    | 29 ----------------
 sklearn/utils/validation.py               | 27 ---------------
 5 files changed, 81 insertions(+), 57 deletions(-)
 create mode 100644 sklearn/utils/_feature_names.py
 create mode 100644 sklearn/utils/tests/test_feature_names.py

diff --git a/sklearn/base.py b/sklearn/base.py
index b57868fd73153..33d8668e20c53 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -24,7 +24,7 @@
 from .utils.validation import _check_y
 from .utils.validation import _num_features
 from .utils._estimator_html_repr import estimator_html_repr
-from .utils.validation import _get_feature_names
+from .utils._feature_names import _get_feature_names
 
 
 def clone(estimator, *, safe=True):
diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py
new file mode 100644
index 0000000000000..1d40e80154b9e
--- /dev/null
+++ b/sklearn/utils/_feature_names.py
@@ -0,0 +1,38 @@
+"""Module for working with feature names."""
+import numpy as np
+
+import warnings
+
+
+def _get_feature_names(X):
+    """Get feature names from X.
+
+    Support for other array containers should place its implementation here.
+
+    Parameters
+    ----------
+    X : {ndarray, dataframe} of shape (n_samples, n_features)
+        Array container to extract feature names.
+
+        - pandas dataframe : The columns will be considered to be feature
+          names. If the dataframe contains non-string feature names, `None` is
+          returned.
+        - All other array containers will return `None`.
+
+    Returns
+    -------
+    names: ndarray or None
+        Feature names of `X`. Unrecognized array containers will return `None`.
+    """
+    if hasattr(X, "columns"):
+        feature_names = np.asarray(X.columns)
+        # Only strings are supported
+        if not all(isinstance(item, str) for item in feature_names):
+            warnings.warn(
+                "Feature name support requires all feature names to be strings. "
+                "Passing non-str feature names will raise an error in 1.2",
+                FutureWarning,
+            )
+
+            return
+        return feature_names
diff --git a/sklearn/utils/tests/test_feature_names.py b/sklearn/utils/tests/test_feature_names.py
new file mode 100644
index 0000000000000..4b1f8a08337f9
--- /dev/null
+++ b/sklearn/utils/tests/test_feature_names.py
@@ -0,0 +1,42 @@
+import pytest
+import numpy as np
+from numpy.testing import assert_array_equal
+
+from sklearn.utils._feature_names import _get_feature_names
+
+
+@pytest.mark.parametrize(
+    "names",
+    [
+        list(range(2)),
+        range(2),
+        [["a", "b"], ["c", "d"]],
+    ],
+    ids=["list-int", "range", "multi-index"],
+)
+def test_get_feature_names_pandas_warns(names):
+    """Get feature names with pandas dataframes with warnings."""
+    pd = pytest.importorskip("pandas")
+    X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names)
+
+    msg = "Feature name support requires all feature names to be strings"
+    with pytest.warns(FutureWarning, match=msg):
+        names = _get_feature_names(X)
+    assert names is None
+
+
+def test_get_feature_names_pandas():
+    """Get feature names with pandas dataframes."""
+    pd = pytest.importorskip("pandas")
+    columns = [f"col_{i}" for i in range(3)]
+    X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=columns)
+    feature_names = _get_feature_names(X)
+
+    assert_array_equal(feature_names, columns)
+
+
+def test_get_feature_names_numpy():
+    """Get feature names return None for numpy arrays."""
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    names = _get_feature_names(X)
+    assert names is None
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 612523ada41ae..1a1449ecc209f 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -48,7 +48,6 @@
     _check_sample_weight,
     _allclose_dense_sparse,
     _num_features,
-    _get_feature_names,
     FLOAT_DTYPES,
 )
 from sklearn.utils.validation import _check_fit_params
@@ -1446,31 +1445,3 @@ def test_check_array_deprecated_matrix():
     )
     with pytest.warns(FutureWarning, match=msg):
         check_array(X)
-
-
-@pytest.mark.parametrize("names", [[f"col_{i}" for i in range(3)], list(range(3))])
-def test_get_feature_names_pandas_error(names):
-    """Get feature names with pandas dataframes."""
-    pd = pytest.importorskip("pandas")
-    X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=names)
-
-    names = _get_feature_names(X)
-    assert_array_equal(names, names)
-
-
-def test_get_feature_names_numpy():
-    """Get feature names return None for numpy arrays."""
-    X = np.array([[1, 2, 3], [4, 5, 6]])
-    names = _get_feature_names(X)
-    assert names is None
-
-
-def test_get_feature_names_error_multi_index_fail():
-    """Check that MultiIndex raises an error."""
-    pd = pytest.importorskip("pandas")
-    names = pd.MultiIndex.from_product([["one", "two"], ["A", "B"]])
-    X = pd.DataFrame([[1, 2, 3, 4]], columns=names)
-
-    msg = "Pandas MultiIndex is not supported as feature names"
-    with pytest.raises(ValueError, match=msg):
-        _get_feature_names(X)
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 75569c07ccacd..98bf6ac8bdb6a 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1587,30 +1587,3 @@ def _check_fit_params(X, fit_params, indices=None):
             )
 
     return fit_params_validated
-
-
-def _get_feature_names(X):
-    """Get feature names from X.
-
-    Support for other array containers should place its implementation here.
-
-    Parameters
-    ----------
-    X : {ndarray, dataframe} of shape (n_samples, n_features)
-        Array container to extract feature names.
-
-        - pandas DataFrame : The columns will be considered to be feature
-          names.
-        - All other array containers will return `None`.
-
-    Returns
-    -------
-    names: ndarray or None
-        Feature names of `X`. Unrecognized array containers will return `None`.
-    """
-    if hasattr(X, "columns"):
-        feature_names = np.asarray(X.columns)
-        # Only strings are supported
-        if not isinstance(feature_names[0], str):
-            raise ValueError("Only strings ")
-        return np.asarray(feature_names)

From 7bc3d8bdd93c0374b75d30d96cff7f5d3ab372df Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 23 Jul 2021 22:22:32 -0400
Subject: [PATCH 43/52] CLN Move back to validation

---
 sklearn/base.py                           |  2 +-
 sklearn/utils/_feature_names.py           | 38 --------------------
 sklearn/utils/tests/test_feature_names.py | 42 -----------------------
 sklearn/utils/tests/test_validation.py    | 38 ++++++++++++++++++++
 sklearn/utils/validation.py               | 34 ++++++++++++++++++
 5 files changed, 73 insertions(+), 81 deletions(-)
 delete mode 100644 sklearn/utils/_feature_names.py
 delete mode 100644 sklearn/utils/tests/test_feature_names.py

diff --git a/sklearn/base.py b/sklearn/base.py
index 33d8668e20c53..b57868fd73153 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -24,7 +24,7 @@
 from .utils.validation import _check_y
 from .utils.validation import _num_features
 from .utils._estimator_html_repr import estimator_html_repr
-from .utils._feature_names import _get_feature_names
+from .utils.validation import _get_feature_names
 
 
 def clone(estimator, *, safe=True):
diff --git a/sklearn/utils/_feature_names.py b/sklearn/utils/_feature_names.py
deleted file mode 100644
index 1d40e80154b9e..0000000000000
--- a/sklearn/utils/_feature_names.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""Module for working with feature names."""
-import numpy as np
-
-import warnings
-
-
-def _get_feature_names(X):
-    """Get feature names from X.
-
-    Support for other array containers should place its implementation here.
-
-    Parameters
-    ----------
-    X : {ndarray, dataframe} of shape (n_samples, n_features)
-        Array container to extract feature names.
-
-        - pandas dataframe : The columns will be considered to be feature
-          names. If the dataframe contains non-string feature names, `None` is
-          returned.
-        - All other array containers will return `None`.
-
-    Returns
-    -------
-    names: ndarray or None
-        Feature names of `X`. Unrecognized array containers will return `None`.
-    """
-    if hasattr(X, "columns"):
-        feature_names = np.asarray(X.columns)
-        # Only strings are supported
-        if not all(isinstance(item, str) for item in feature_names):
-            warnings.warn(
-                "Feature name support requires all feature names to be strings. "
-                "Passing non-str feature names will raise an error in 1.2",
-                FutureWarning,
-            )
-
-            return
-        return feature_names
diff --git a/sklearn/utils/tests/test_feature_names.py b/sklearn/utils/tests/test_feature_names.py
deleted file mode 100644
index 4b1f8a08337f9..0000000000000
--- a/sklearn/utils/tests/test_feature_names.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import pytest
-import numpy as np
-from numpy.testing import assert_array_equal
-
-from sklearn.utils._feature_names import _get_feature_names
-
-
-@pytest.mark.parametrize(
-    "names",
-    [
-        list(range(2)),
-        range(2),
-        [["a", "b"], ["c", "d"]],
-    ],
-    ids=["list-int", "range", "multi-index"],
-)
-def test_get_feature_names_pandas_warns(names):
-    """Get feature names with pandas dataframes with warnings."""
-    pd = pytest.importorskip("pandas")
-    X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names)
-
-    msg = "Feature name support requires all feature names to be strings"
-    with pytest.warns(FutureWarning, match=msg):
-        names = _get_feature_names(X)
-    assert names is None
-
-
-def test_get_feature_names_pandas():
-    """Get feature names with pandas dataframes."""
-    pd = pytest.importorskip("pandas")
-    columns = [f"col_{i}" for i in range(3)]
-    X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=columns)
-    feature_names = _get_feature_names(X)
-
-    assert_array_equal(feature_names, columns)
-
-
-def test_get_feature_names_numpy():
-    """Get feature names return None for numpy arrays."""
-    X = np.array([[1, 2, 3], [4, 5, 6]])
-    names = _get_feature_names(X)
-    assert names is None
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 1a1449ecc209f..78bf652962a81 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -49,6 +49,7 @@
     _allclose_dense_sparse,
     _num_features,
     FLOAT_DTYPES,
+    _get_feature_names,
 )
 from sklearn.utils.validation import _check_fit_params
 
@@ -1445,3 +1446,40 @@ def test_check_array_deprecated_matrix():
     )
     with pytest.warns(FutureWarning, match=msg):
         check_array(X)
+
+
+@pytest.mark.parametrize(
+    "names",
+    [
+        list(range(2)),
+        range(2),
+        [["a", "b"], ["c", "d"]],
+    ],
+    ids=["list-int", "range", "multi-index"],
+)
+def test_get_feature_names_pandas_warns(names):
+    """Get feature names with pandas dataframes with warnings."""
+    pd = pytest.importorskip("pandas")
+    X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names)
+
+    msg = "Feature name support requires all feature names to be strings"
+    with pytest.warns(FutureWarning, match=msg):
+        names = _get_feature_names(X)
+    assert names is None
+
+
+def test_get_feature_names_pandas():
+    """Get feature names with pandas dataframes."""
+    pd = pytest.importorskip("pandas")
+    columns = [f"col_{i}" for i in range(3)]
+    X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=columns)
+    feature_names = _get_feature_names(X)
+
+    assert_array_equal(feature_names, columns)
+
+
+def test_get_feature_names_numpy():
+    """Get feature names return None for numpy arrays."""
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    names = _get_feature_names(X)
+    assert names is None
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 98bf6ac8bdb6a..c517d5d810351 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1587,3 +1587,37 @@ def _check_fit_params(X, fit_params, indices=None):
             )
 
     return fit_params_validated
+
+
+def _get_feature_names(X):
+    """Get feature names from X.
+
+    Support for other array containers should place its implementation here.
+
+    Parameters
+    ----------
+    X : {ndarray, dataframe} of shape (n_samples, n_features)
+        Array container to extract feature names.
+
+        - pandas dataframe : The columns will be considered to be feature
+          names. If the dataframe contains non-string feature names, `None` is
+          returned.
+        - All other array containers will return `None`.
+
+    Returns
+    -------
+    names: ndarray or None
+        Feature names of `X`. Unrecognized array containers will return `None`.
+    """
+    if hasattr(X, "columns"):
+        feature_names = np.asarray(X.columns)
+        # Only strings are supported
+        if not all(isinstance(item, str) for item in feature_names):
+            warnings.warn(
+                "Feature name support requires all feature names to be strings. "
+                "Passing non-str feature names will raise an error in 1.2",
+                FutureWarning,
+            )
+
+            return
+        return feature_names

From 6e42e0f8914ff6d7e169ee4322d25ccddebb71ac Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Sun, 25 Jul 2021 08:22:26 -0400
Subject: [PATCH 44/52] TST Fix test error message

---
 sklearn/tests/test_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 87006259ba523..52b9c0595adbb 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -637,7 +637,7 @@ def transform(self, X):
     trans = NoOpTransformer().fit(df)
     assert_array_equal(trans.feature_names_in_, df.columns)
 
-    msg = "The column names should match those that were passed"
+    msg = "The feature names should match those that were passed"
     df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1])
     with pytest.warns(FutureWarning, match=msg):
         trans.transform(df_bad)

From a171666d312b3004e4ab4c2f9e72c42d5a1f8f7f Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Sun, 25 Jul 2021 08:31:44 -0400
Subject: [PATCH 45/52] ENH Delegates validation to base estimator in Ransac

---
 sklearn/linear_model/_ransac.py | 45 +++++++++++++++------------------
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py
index 47ffee936bec2..f424ff31d0d4a 100644
--- a/sklearn/linear_model/_ransac.py
+++ b/sklearn/linear_model/_ransac.py
@@ -9,9 +9,14 @@
 
 from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone
 from ..base import MultiOutputMixin
-from ..utils import check_random_state, check_consistent_length
+from ..utils import check_random_state, _safe_indexing
 from ..utils.random import sample_without_replacement
-from ..utils.validation import check_is_fitted, _check_sample_weight
+from ..utils.validation import (
+    check_is_fitted,
+    _check_sample_weight,
+    _num_samples,
+    _num_features,
+)
 from ._base import LinearRegression
 from ..utils.validation import has_fit_parameter
 from ..exceptions import ConvergenceWarning
@@ -273,25 +278,19 @@ def fit(self, X, y, sample_weight=None):
             `max_trials` randomly chosen sub-samples.
 
         """
-        # Need to validate separately here.
-        # We can't pass multi_ouput=True because that would allow y to be csr.
-        check_X_params = dict(accept_sparse="csr")
-        check_y_params = dict(ensure_2d=False)
-        X, y = self._validate_data(
-            X, y, validate_separately=(check_X_params, check_y_params)
-        )
-        check_consistent_length(X, y)
-
         if self.base_estimator is not None:
             base_estimator = clone(self.base_estimator)
         else:
             base_estimator = LinearRegression()
 
+        n_samples = _num_samples(X)
+        n_features = _num_features(X)
+
         if self.min_samples is None:
             # assume linear model by default
-            min_samples = X.shape[1] + 1
+            min_samples = n_features + 1
         elif 0 < self.min_samples < 1:
-            min_samples = np.ceil(self.min_samples * X.shape[0])
+            min_samples = np.ceil(self.min_samples * n_samples)
         elif self.min_samples >= 1:
             if self.min_samples % 1 != 0:
                 raise ValueError("Absolute number of samples must be an integer value.")
@@ -301,7 +300,7 @@ def fit(self, X, y, sample_weight=None):
         if min_samples > X.shape[0]:
             raise ValueError(
                 "`min_samples` may not be larger than number "
-                "of samples: n_samples = %d." % (X.shape[0])
+                "of samples: n_samples = %d." % n_samples
             )
 
         if self.stop_probability < 0 or self.stop_probability > 1:
@@ -384,8 +383,6 @@ def fit(self, X, y, sample_weight=None):
         self.n_skips_invalid_data_ = 0
         self.n_skips_invalid_model_ = 0
 
-        # number of data samples
-        n_samples = X.shape[0]
         sample_idxs = np.arange(n_samples)
 
         self.n_trials_ = 0
@@ -404,8 +401,8 @@ def fit(self, X, y, sample_weight=None):
             subset_idxs = sample_without_replacement(
                 n_samples, min_samples, random_state=random_state
             )
-            X_subset = X[subset_idxs]
-            y_subset = y[subset_idxs]
+            X_subset = _safe_indexing(X, subset_idxs, axis=0)
+            y_subset = _safe_indexing(y, subset_idxs, axis=0)
 
             # check if random sample set is valid
             if self.is_data_valid is not None and not self.is_data_valid(
@@ -444,8 +441,8 @@ def fit(self, X, y, sample_weight=None):
 
             # extract inlier data set
             inlier_idxs_subset = sample_idxs[inlier_mask_subset]
-            X_inlier_subset = X[inlier_idxs_subset]
-            y_inlier_subset = y[inlier_idxs_subset]
+            X_inlier_subset = _safe_indexing(X, inlier_idxs_subset, axis=0)
+            y_inlier_subset = _safe_indexing(y, inlier_idxs_subset, axis=0)
 
             # score of inlier data set
             score_subset = base_estimator.score(X_inlier_subset, y_inlier_subset)
@@ -521,6 +518,10 @@ def fit(self, X, y, sample_weight=None):
 
         self.estimator_ = base_estimator
         self.inlier_mask_ = inlier_mask_best
+        if hasattr(self.estimator_, "n_features_in_"):
+            self.n_features_in_ = self.estimator_.n_features_in_
+        if hasattr(self.estimator_, "feature_names_in_"):
+            self.feature_names_in_ = self.estimator_.feature_names_in_
         return self
 
     def predict(self, X):
@@ -538,8 +539,6 @@ def predict(self, X):
             Returns predicted values.
         """
         check_is_fitted(self)
-        self._check_feature_names(X, reset=False)
-
         return self.estimator_.predict(X)
 
     def score(self, X, y):
@@ -561,8 +560,6 @@ def score(self, X, y):
             Score of the prediction.
         """
         check_is_fitted(self)
-        self._check_feature_names(X, reset=False)
-
         return self.estimator_.score(X, y)
 
     def _more_tags(self):

From 681d04540719276b3632efe4d6d089a406e130e6 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 26 Jul 2021 12:16:47 -0400
Subject: [PATCH 46/52] Revert "ENH Delegates validation to base estimator in
 Ransac"

This reverts commit a171666d312b3004e4ab4c2f9e72c42d5a1f8f7f.
---
 sklearn/linear_model/_ransac.py | 45 ++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py
index f424ff31d0d4a..47ffee936bec2 100644
--- a/sklearn/linear_model/_ransac.py
+++ b/sklearn/linear_model/_ransac.py
@@ -9,14 +9,9 @@
 
 from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone
 from ..base import MultiOutputMixin
-from ..utils import check_random_state, _safe_indexing
+from ..utils import check_random_state, check_consistent_length
 from ..utils.random import sample_without_replacement
-from ..utils.validation import (
-    check_is_fitted,
-    _check_sample_weight,
-    _num_samples,
-    _num_features,
-)
+from ..utils.validation import check_is_fitted, _check_sample_weight
 from ._base import LinearRegression
 from ..utils.validation import has_fit_parameter
 from ..exceptions import ConvergenceWarning
@@ -278,19 +273,25 @@ def fit(self, X, y, sample_weight=None):
             `max_trials` randomly chosen sub-samples.
 
         """
+        # Need to validate separately here.
+        # We can't pass multi_ouput=True because that would allow y to be csr.
+        check_X_params = dict(accept_sparse="csr")
+        check_y_params = dict(ensure_2d=False)
+        X, y = self._validate_data(
+            X, y, validate_separately=(check_X_params, check_y_params)
+        )
+        check_consistent_length(X, y)
+
         if self.base_estimator is not None:
             base_estimator = clone(self.base_estimator)
         else:
             base_estimator = LinearRegression()
 
-        n_samples = _num_samples(X)
-        n_features = _num_features(X)
-
         if self.min_samples is None:
             # assume linear model by default
-            min_samples = n_features + 1
+            min_samples = X.shape[1] + 1
         elif 0 < self.min_samples < 1:
-            min_samples = np.ceil(self.min_samples * n_samples)
+            min_samples = np.ceil(self.min_samples * X.shape[0])
         elif self.min_samples >= 1:
             if self.min_samples % 1 != 0:
                 raise ValueError("Absolute number of samples must be an integer value.")
@@ -300,7 +301,7 @@ def fit(self, X, y, sample_weight=None):
         if min_samples > X.shape[0]:
             raise ValueError(
                 "`min_samples` may not be larger than number "
-                "of samples: n_samples = %d." % n_samples
+                "of samples: n_samples = %d." % (X.shape[0])
             )
 
         if self.stop_probability < 0 or self.stop_probability > 1:
@@ -383,6 +384,8 @@ def fit(self, X, y, sample_weight=None):
         self.n_skips_invalid_data_ = 0
         self.n_skips_invalid_model_ = 0
 
+        # number of data samples
+        n_samples = X.shape[0]
         sample_idxs = np.arange(n_samples)
 
         self.n_trials_ = 0
@@ -401,8 +404,8 @@ def fit(self, X, y, sample_weight=None):
             subset_idxs = sample_without_replacement(
                 n_samples, min_samples, random_state=random_state
             )
-            X_subset = _safe_indexing(X, subset_idxs, axis=0)
-            y_subset = _safe_indexing(y, subset_idxs, axis=0)
+            X_subset = X[subset_idxs]
+            y_subset = y[subset_idxs]
 
             # check if random sample set is valid
             if self.is_data_valid is not None and not self.is_data_valid(
@@ -441,8 +444,8 @@ def fit(self, X, y, sample_weight=None):
 
             # extract inlier data set
             inlier_idxs_subset = sample_idxs[inlier_mask_subset]
-            X_inlier_subset = _safe_indexing(X, inlier_idxs_subset, axis=0)
-            y_inlier_subset = _safe_indexing(y, inlier_idxs_subset, axis=0)
+            X_inlier_subset = X[inlier_idxs_subset]
+            y_inlier_subset = y[inlier_idxs_subset]
 
             # score of inlier data set
             score_subset = base_estimator.score(X_inlier_subset, y_inlier_subset)
@@ -518,10 +521,6 @@ def fit(self, X, y, sample_weight=None):
 
         self.estimator_ = base_estimator
         self.inlier_mask_ = inlier_mask_best
-        if hasattr(self.estimator_, "n_features_in_"):
-            self.n_features_in_ = self.estimator_.n_features_in_
-        if hasattr(self.estimator_, "feature_names_in_"):
-            self.feature_names_in_ = self.estimator_.feature_names_in_
         return self
 
     def predict(self, X):
@@ -539,6 +538,8 @@ def predict(self, X):
             Returns predicted values.
         """
         check_is_fitted(self)
+        self._check_feature_names(X, reset=False)
+
         return self.estimator_.predict(X)
 
     def score(self, X, y):
@@ -560,6 +561,8 @@ def score(self, X, y):
             Score of the prediction.
         """
         check_is_fitted(self)
+        self._check_feature_names(X, reset=False)
+
         return self.estimator_.score(X, y)
 
     def _more_tags(self):

From 673811a9583c8efb360161bfa63be555ab456987 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 26 Jul 2021 13:17:02 -0400
Subject: [PATCH 47/52] ENH Only warn in fit

---
 sklearn/base.py                        |  8 ++++----
 sklearn/tests/test_base.py             | 17 ++++++++++++++++-
 sklearn/utils/tests/test_validation.py |  2 +-
 sklearn/utils/validation.py            | 16 +++++++++-------
 4 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index b57868fd73153..6e39fb36bb6ab 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -417,13 +417,13 @@ def _check_feature_names(self, X, *, reset):
         """
 
         if reset:
-            feature_names_in = _get_feature_names(X)
+            feature_names_in = _get_feature_names(X, warn_on_invalid=True)
             if feature_names_in is not None:
                 self.feature_names_in_ = feature_names_in
             return
 
         fitted_feature_names = getattr(self, "feature_names_in_", None)
-        X_feature_names = _get_feature_names(X)
+        X_feature_names = _get_feature_names(X, warn_on_invalid=False)
 
         if fitted_feature_names is None and X_feature_names is None:
             # no feature names seen in fit and in X
@@ -438,8 +438,8 @@ def _check_feature_names(self, X, *, reset):
 
         if X_feature_names is None and fitted_feature_names is not None:
             warnings.warn(
-                f"X does not have any feature names, but {self.__class__.__name__} was "
-                "fitted with feature names"
+                "X does not have valid feature names, but"
+                f" {self.__class__.__name__} was fitted with feature names"
             )
             return
 
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 52b9c0595adbb..23cc350ce6880 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -644,7 +644,7 @@ def transform(self, X):
 
     # warns when fitted on dataframe and transforming a ndarray
     msg = (
-        "X does not have any feature names, but NoOpTransformer was "
+        "X does not have valid feature names, but NoOpTransformer was "
         "fitted with feature names"
     )
     with pytest.warns(UserWarning, match=msg):
@@ -655,3 +655,18 @@ def transform(self, X):
     trans = NoOpTransformer().fit(X_np)
     with pytest.warns(UserWarning, match=msg):
         trans.transform(df)
+
+    # fit on dataframe with invalid feature names warns on fit
+    df_int_names = pd.DataFrame(X_np)
+    trans = NoOpTransformer()
+
+    msg = "Feature name support requires all feature names to be strings"
+    with pytest.warns(FutureWarning, match=msg):
+        trans.fit(df_int_names)
+
+    # fit on dataframe with invalid feature names -> do not warn on transform
+    Xs = [X_np, df_int_names]
+    for X in Xs:
+        with pytest.warns(None) as record:
+            trans.transform(X)
+        assert not record
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 78bf652962a81..24c400b5baa7e 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1464,7 +1464,7 @@ def test_get_feature_names_pandas_warns(names):
 
     msg = "Feature name support requires all feature names to be strings"
     with pytest.warns(FutureWarning, match=msg):
-        names = _get_feature_names(X)
+        names = _get_feature_names(X, warn_on_invalid=True)
     assert names is None
 
 
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index c517d5d810351..8b2009c7c69ae 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1589,7 +1589,7 @@ def _check_fit_params(X, fit_params, indices=None):
     return fit_params_validated
 
 
-def _get_feature_names(X):
+def _get_feature_names(X, *, warn_on_invalid=False):
     """Get feature names from X.
 
     Support for other array containers should place its implementation here.
@@ -1604,6 +1604,9 @@ def _get_feature_names(X):
           returned.
         - All other array containers will return `None`.
 
+    warn_on_invalid: bool, default=False
+        Warn when column names are invalid.
+
     Returns
     -------
     names: ndarray or None
@@ -1613,11 +1616,10 @@ def _get_feature_names(X):
         feature_names = np.asarray(X.columns)
         # Only strings are supported
         if not all(isinstance(item, str) for item in feature_names):
-            warnings.warn(
-                "Feature name support requires all feature names to be strings. "
-                "Passing non-str feature names will raise an error in 1.2",
-                FutureWarning,
-            )
-
+            if warn_on_invalid:
+                warnings.warn(
+                    "Feature name support requires all feature names to be strings. ",
+                    FutureWarning,
+                )
             return
         return feature_names

From 1a2bf2549b3a977d4a7712b7a527a7f965163369 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Sun, 1 Aug 2021 08:49:00 -0400
Subject: [PATCH 48/52] TST Makes test pass

---
 sklearn/base.py                        |  4 ++--
 sklearn/tests/test_base.py             |  7 ++-----
 sklearn/tests/test_common.py           |  5 ++++-
 sklearn/utils/tests/test_validation.py |  4 +---
 sklearn/utils/validation.py            | 10 +---------
 5 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 6e39fb36bb6ab..a585b2b06c394 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -417,13 +417,13 @@ def _check_feature_names(self, X, *, reset):
         """
 
         if reset:
-            feature_names_in = _get_feature_names(X, warn_on_invalid=True)
+            feature_names_in = _get_feature_names(X)
             if feature_names_in is not None:
                 self.feature_names_in_ = feature_names_in
             return
 
         fitted_feature_names = getattr(self, "feature_names_in_", None)
-        X_feature_names = _get_feature_names(X, warn_on_invalid=False)
+        X_feature_names = _get_feature_names(X)
 
         if fitted_feature_names is None and X_feature_names is None:
             # no feature names seen in fit and in X
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 23cc350ce6880..9d4b780dc2121 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -656,13 +656,10 @@ def transform(self, X):
     with pytest.warns(UserWarning, match=msg):
         trans.transform(df)
 
-    # fit on dataframe with invalid feature names warns on fit
+    # fit on dataframe with invalid feature names works
     df_int_names = pd.DataFrame(X_np)
     trans = NoOpTransformer()
-
-    msg = "Feature name support requires all feature names to be strings"
-    with pytest.warns(FutureWarning, match=msg):
-        trans.fit(df_int_names)
+    trans.fit(df_int_names)
 
     # fit on dataframe with invalid feature names -> do not warn on transform
     Xs = [X_np, df_int_names]
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index ef34d0efe6a63..11fa576b88d98 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -342,4 +342,7 @@ def test_check_n_features_in_after_fitting(estimator):
 )
 def test_pandas_column_name_consistency(estimator):
     _set_checking_parameters(estimator)
-    check_dataframe_column_names_consistency(estimator.__class__.__name__, estimator)
+    with ignore_warnings(category=(FutureWarning)):
+        check_dataframe_column_names_consistency(
+            estimator.__class__.__name__, estimator
+        )
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 24c400b5baa7e..acb2bbfcb53f8 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1462,9 +1462,7 @@ def test_get_feature_names_pandas_warns(names):
     pd = pytest.importorskip("pandas")
     X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names)
 
-    msg = "Feature name support requires all feature names to be strings"
-    with pytest.warns(FutureWarning, match=msg):
-        names = _get_feature_names(X, warn_on_invalid=True)
+    names = _get_feature_names(X)
     assert names is None
 
 
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 8b2009c7c69ae..ef52bccb983e1 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1589,7 +1589,7 @@ def _check_fit_params(X, fit_params, indices=None):
     return fit_params_validated
 
 
-def _get_feature_names(X, *, warn_on_invalid=False):
+def _get_feature_names(X):
     """Get feature names from X.
 
     Support for other array containers should place its implementation here.
@@ -1604,9 +1604,6 @@ def _get_feature_names(X, *, warn_on_invalid=False):
           returned.
         - All other array containers will return `None`.
 
-    warn_on_invalid: bool, default=False
-        Warn when column names are invalid.
-
     Returns
     -------
     names: ndarray or None
@@ -1616,10 +1613,5 @@ def _get_feature_names(X, *, warn_on_invalid=False):
         feature_names = np.asarray(X.columns)
         # Only strings are supported
         if not all(isinstance(item, str) for item in feature_names):
-            if warn_on_invalid:
-                warnings.warn(
-                    "Feature name support requires all feature names to be strings. ",
-                    FutureWarning,
-                )
             return
         return feature_names

From decb9671cb7caef55696182cd5a88ffd3a6de967 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 13 Aug 2021 10:16:59 -0400
Subject: [PATCH 49/52] CLN Address comments

---
 sklearn/tests/test_base.py             | 24 +++++++++++++++++++++---
 sklearn/utils/tests/test_validation.py | 16 +++++++++++++++-
 sklearn/utils/validation.py            | 21 ++++++++++++++++++---
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 9d4b780dc2121..b6f8fd12bfec2 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -1,6 +1,7 @@
 # Author: Gael Varoquaux
 # License: BSD 3 clause
 
+import re
 import numpy as np
 import scipy.sparse as sp
 import pytest
@@ -656,14 +657,31 @@ def transform(self, X):
     with pytest.warns(UserWarning, match=msg):
         trans.transform(df)
 
-    # fit on dataframe with invalid feature names works
+    # fit on dataframe with all integer feature names works without warning
     df_int_names = pd.DataFrame(X_np)
     trans = NoOpTransformer()
-    trans.fit(df_int_names)
+    with pytest.warns(None) as record:
+        trans.fit(df_int_names)
+    assert not record
 
-    # fit on dataframe with invalid feature names -> do not warn on transform
+    # fit on dataframe with no feature names or all integer feature names
+    # -> do not warn on transform
     Xs = [X_np, df_int_names]
     for X in Xs:
         with pytest.warns(None) as record:
             trans.transform(X)
         assert not record
+
+    # fit on dataframe with feature names that are mixed warns:
+    df_mixed = pd.DataFrame(X_np, columns=["a", "b", 1, 2])
+    trans = NoOpTransformer()
+    msg = re.escape(
+        "Feature names only support names that are all strings. "
+        "Got feature names with dtypes: ['int', 'str']"
+    )
+    with pytest.warns(UserWarning, match=msg) as record:
+        trans.fit(df_mixed)
+
+    # transform on feature names that are mixed also warns:
+    with pytest.warns(UserWarning, match=msg) as record:
+        trans.transform(df_mixed)
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index acb2bbfcb53f8..85e0532996dc6 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1458,7 +1458,7 @@ def test_check_array_deprecated_matrix():
     ids=["list-int", "range", "multi-index"],
 )
 def test_get_feature_names_pandas_warns(names):
-    """Get feature names with pandas dataframes with warnings."""
+    """Get feature names with pandas dataframes without warning."""
     pd = pytest.importorskip("pandas")
     X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names)
 
@@ -1481,3 +1481,17 @@ def test_get_feature_names_numpy():
     X = np.array([[1, 2, 3], [4, 5, 6]])
     names = _get_feature_names(X)
     assert names is None
+
+
+def test_get_feature_names_mixed_type_warns():
+    """Get feature names warns when the feature names have mixed dtypes"""
+    pd = pytest.importorskip("pandas")
+    X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=["a", 1])
+
+    msg = re.escape(
+        "Feature names only support names that are all strings. "
+        "Got feature names with dtypes: ['int', 'str']"
+    )
+    with pytest.warns(UserWarning, match=msg):
+        names = _get_feature_names(X)
+    assert names is None
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index ef52bccb983e1..1e999f0e0d223 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1609,9 +1609,24 @@ def _get_feature_names(X):
     names: ndarray or None
         Feature names of `X`. Unrecognized array containers will return `None`.
     """
+    feature_names = None
+
+    # extract feature names for support array containers
     if hasattr(X, "columns"):
         feature_names = np.asarray(X.columns)
-        # Only strings are supported
-        if not all(isinstance(item, str) for item in feature_names):
-            return
+
+    if feature_names is None:
+        return
+
+    types = sorted(t.__qualname__ for t in set(type(v) for v in feature_names))
+
+    # Warn when types are mixed
+    if len(types) > 1:
+        warnings.warn(
+            "Feature names only support names that are all strings. "
+            f"Got feature names with dtypes: {types}"
+        )
+
+    # Only feature names of all strings are supported
+    if all(t == "str" for t in types):
         return feature_names

From 5e89f6c707cd1e11f34938854b795b0486f462fb Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 13 Aug 2021 10:21:06 -0400
Subject: [PATCH 50/52] ENH Include 1.2 error in warning

---
 sklearn/tests/test_base.py             | 1 +
 sklearn/utils/tests/test_validation.py | 3 ++-
 sklearn/utils/validation.py            | 4 +++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index b6f8fd12bfec2..6bf73a9ae8e81 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -672,6 +672,7 @@ def transform(self, X):
             trans.transform(X)
         assert not record
 
+    # TODO: Convert to an error in 1.2
     # fit on dataframe with feature names that are mixed warns:
     df_mixed = pd.DataFrame(X_np, columns=["a", "b", 1, 2])
     trans = NoOpTransformer()
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 85e0532996dc6..ebd2af9a1dd2d 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1483,6 +1483,7 @@ def test_get_feature_names_numpy():
     assert names is None
 
 
+# TODO: Convert to a error in 1.2
 def test_get_feature_names_mixed_type_warns():
     """Get feature names warns when the feature names have mixed dtypes"""
     pd = pytest.importorskip("pandas")
@@ -1490,7 +1491,7 @@ def test_get_feature_names_mixed_type_warns():
 
     msg = re.escape(
         "Feature names only support names that are all strings. "
-        "Got feature names with dtypes: ['int', 'str']"
+        "Got feature names with dtypes: ['int', 'str']. An error will be raised"
     )
     with pytest.warns(UserWarning, match=msg):
         names = _get_feature_names(X)
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 1e999f0e0d223..9dd9f8242a991 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1622,9 +1622,11 @@ def _get_feature_names(X):
 
     # Warn when types are mixed
     if len(types) > 1:
+        # TODO: Convert to a error in 1.2
         warnings.warn(
             "Feature names only support names that are all strings. "
-            f"Got feature names with dtypes: {types}"
+            f"Got feature names with dtypes: {types}. An error will be raised "
+            "in 1.2."
         )
 
     # Only feature names of all strings are supported

From 77221fd7b1ffeb9ee845844b5a1eb284c58e3a11 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 13 Aug 2021 10:22:10 -0400
Subject: [PATCH 51/52] CLN Better checking

---
 sklearn/utils/validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 9dd9f8242a991..2387e504a67e6 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1630,5 +1630,5 @@ def _get_feature_names(X):
         )
 
     # Only feature names of all strings are supported
-    if all(t == "str" for t in types):
+    if types[0] == "str":
         return feature_names

From 19de717390ae46bed213ca6acf9f45103014a22e Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Fri, 13 Aug 2021 13:43:59 -0400
Subject: [PATCH 52/52] ENH Super restrictive on supporting ints and strs

---
 sklearn/tests/test_base.py             |  4 ++--
 sklearn/utils/tests/test_validation.py | 32 +++++++++++++++-----------
 sklearn/utils/validation.py            | 13 +++++++----
 3 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 6bf73a9ae8e81..36dc6064e4a46 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -680,9 +680,9 @@ def transform(self, X):
         "Feature names only support names that are all strings. "
         "Got feature names with dtypes: ['int', 'str']"
     )
-    with pytest.warns(UserWarning, match=msg) as record:
+    with pytest.warns(FutureWarning, match=msg) as record:
         trans.fit(df_mixed)
 
     # transform on feature names that are mixed also warns:
-    with pytest.warns(UserWarning, match=msg) as record:
+    with pytest.warns(FutureWarning, match=msg) as record:
         trans.transform(df_mixed)
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index ebd2af9a1dd2d..9d88a06149e61 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1450,19 +1450,17 @@ def test_check_array_deprecated_matrix():
 
 @pytest.mark.parametrize(
     "names",
-    [
-        list(range(2)),
-        range(2),
-        [["a", "b"], ["c", "d"]],
-    ],
-    ids=["list-int", "range", "multi-index"],
+    [list(range(2)), range(2), None],
+    ids=["list-int", "range", "default"],
 )
-def test_get_feature_names_pandas_warns(names):
-    """Get feature names with pandas dataframes without warning."""
+def test_get_feature_names_pandas_with_ints_no_warning(names):
+    """Return no feature names and emit no warning for integer column names."""
     pd = pytest.importorskip("pandas")
     X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names)
 
-    names = _get_feature_names(X)
+    with pytest.warns(None) as record:
+        names = _get_feature_names(X)
+    assert not record
     assert names is None
 
 
@@ -1484,15 +1482,23 @@ def test_get_feature_names_numpy():
 
 
 # TODO: Convert to a error in 1.2
-def test_get_feature_names_mixed_type_warns():
+@pytest.mark.parametrize(
+    "names, dtypes",
+    [
+        ([["a", "b"], ["c", "d"]], "['tuple']"),
+        (["a", 1], "['int', 'str']"),
+    ],
+    ids=["multi-index", "mixed"],
+)
+def test_get_feature_names_invalid_dtypes_warns(names, dtypes):
     """Get feature names warns when the feature names have mixed dtypes"""
     pd = pytest.importorskip("pandas")
-    X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=["a", 1])
+    X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names)
 
     msg = re.escape(
         "Feature names only support names that are all strings. "
-        "Got feature names with dtypes: ['int', 'str']. An error will be raised"
+        f"Got feature names with dtypes: {dtypes}. An error will be raised"
     )
-    with pytest.warns(UserWarning, match=msg):
+    with pytest.warns(FutureWarning, match=msg):
         names = _get_feature_names(X)
     assert names is None
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 2387e504a67e6..b5b485c5837ab 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1615,19 +1615,22 @@ def _get_feature_names(X):
     if hasattr(X, "columns"):
         feature_names = np.asarray(X.columns)
 
-    if feature_names is None:
+    if feature_names is None or len(feature_names) == 0:
         return
 
     types = sorted(t.__qualname__ for t in set(type(v) for v in feature_names))
 
-    # Warn when types are mixed
-    if len(types) > 1:
-        # TODO: Convert to a error in 1.2
+    # Warn when names have mixed types or a single type other than int or str.
+    # Homogeneous int or str names do not warn.
+    if len(types) > 1 or not (types[0].startswith("int") or types[0] == "str"):
+        # TODO: Convert to an error in 1.2
         warnings.warn(
             "Feature names only support names that are all strings. "
             f"Got feature names with dtypes: {types}. An error will be raised "
-            "in 1.2."
+            "in 1.2.",
+            FutureWarning,
         )
+        return
 
     # Only feature names of all strings are supported
     if types[0] == "str":