From 779385f2fa1d8497d19988667560015aad1475b7 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 26 Jul 2020 21:56:51 -0400 Subject: [PATCH 01/52] ENH Adds column name consistency --- sklearn/base.py | 42 +++++++++++- sklearn/linear_model/_base.py | 6 +- sklearn/tests/test_base.py | 25 ++++++++ sklearn/tests/test_common.py | 19 ++++++ sklearn/tests/test_docstring_parameters.py | 2 +- sklearn/tests/test_extarray.py | 27 ++++++++ sklearn/utils/_extarray.py | 16 +++++ sklearn/utils/estimator_checks.py | 75 ++++++++++++++++++++++ 8 files changed, 207 insertions(+), 5 deletions(-) create mode 100644 sklearn/tests/test_extarray.py create mode 100644 sklearn/utils/_extarray.py diff --git a/sklearn/base.py b/sklearn/base.py index 46398baabfd3a..2961a5a50707f 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -19,6 +19,7 @@ from .utils.validation import check_array from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args +from .utils._extarray import _get_feature_names _DEFAULT_TAGS = { 'non_deterministic': False, @@ -376,6 +377,44 @@ def _check_n_features(self, X, reset): self.n_features_in_) ) + def _check_feature_names(self, X, reset=True): + """Validate feature names and set or check the `feature_names_in_` + attribute. + + Parameters + ---------- + X : {dataframe-like} of shape (n_samples, n_features) + The input samples. + reset : bool, default=True + Whether to reset the `feature_names_in_` attribute. + If False, the input will be checked for consistency with data + provided when reset was last True. 
+ """ + + if reset: + self.feature_names_in_ = _get_feature_names(X) + return + + fitted_feature_names = getattr(self, "feature_names_in_", None) + if fitted_feature_names is None: + # no feature names to check + return + + feature_names_in = _get_feature_names(X) + if feature_names_in is None: + # X does not have feature names but estimator was fitted with + # data with feature names + return + + # validate the `feature_names_in_` attribute + if (len(fitted_feature_names) != len(feature_names_in) or + np.any(fitted_feature_names != feature_names_in)): + warnings.warn("The column names should match those that were " + "passed during fit(), in the same order. Got " + f"({feature_names_in}) expected " + f"({fitted_feature_names}). Starting version 0.26, " + "an error will be raised", FutureWarning) + def _validate_data(self, X, y=None, reset=True, validate_separately=False, **check_params): """Validate input data and set or check the `n_features_in_` attribute. @@ -406,9 +445,10 @@ def _validate_data(self, X, y=None, reset=True, out : {ndarray, sparse matrix} or tuple of these The validated input. A tuple is returned if `y` is not None. """ + self._check_feature_names(X, reset=reset) if y is None: - if self._get_tags()['requires_y']: + if reset and self._get_tags()['requires_y']: raise ValueError( f"This {self.__class__.__name__} estimator " f"requires y to be passed, but the target y is None." diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 4ab797578dbde..c922480875f9d 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -214,8 +214,8 @@ def fit(self, X, y): def _decision_function(self, X): check_is_fitted(self) - - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + reset=False) return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ @@ -279,7 +279,7 @@ class would be predicted. 
""" check_is_fitted(self) - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr', reset=False) n_features = self.coef_.shape[1] if X.shape[1] != n_features: diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index db5c88051346a..51dfbd163a312 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -16,6 +16,8 @@ from sklearn.svm import SVC from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV +from sklearn.linear_model import LinearRegression +from sklearn.datasets import make_regression from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeRegressor @@ -537,3 +539,26 @@ def test_repr_html_wraps(): with config_context(display='diagram'): output = tree._repr_html_() assert "