From 6078cc6297bf25c655143516e75ef417c3d6cd7d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 1 Jun 2020 11:51:10 -0400 Subject: [PATCH 1/2] WIP --- sklearn/base.py | 22 +++++++++- .../gradient_boosting.py | 4 +- .../tests/test_gradient_boosting.py | 7 ++++ sklearn/utils/estimator_checks.py | 42 +++++++++++++++++++ sklearn/utils/tests/test_validation.py | 13 ++++++ sklearn/utils/validation.py | 5 +++ 6 files changed, 90 insertions(+), 3 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index a4cc26acc0c9a..5bf9aa4cc15ac 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -17,6 +17,7 @@ from .utils import _IS_32BIT from .utils.validation import check_X_y from .utils.validation import check_array +from .utils.validation import _is_dataframe from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args @@ -376,6 +377,21 @@ def _check_n_features(self, X, reset): self.n_features_in_) ) + def _check_feature_names(self, df, reset): + # set _feature_names_in attribute or check against it + + if reset: + self._feature_names_in = df.columns.values + elif hasattr(self, '_feature_names_in'): + feature_names = df.columns.values + if np.any(feature_names != self._feature_names_in): + raise ValueError( + "The column names of the dataframe must match those " + "that were passed during fit(), in the same order. " + f"Got ({feature_names}), expected " + f"({self._feature_names_in})." + ) + def _validate_data(self, X, y=None, reset=True, validate_separately=False, **check_params): """Validate input data and set or check the `n_features_in_` attribute. @@ -406,9 +422,11 @@ def _validate_data(self, X, y=None, reset=True, out : {ndarray, sparse matrix} or tuple of these The validated input. A tuple is returned if `y` is not None. """ + is_df = _is_dataframe(X) + X_orig = X if y is None: - if self._get_tags()['requires_y']: + if reset and self._get_tags()['requires_y']: raise ValueError( f"This {self.__class__.__name__} estimator " f"requires y to be passed, but the target y is None." @@ -430,6 +448,8 @@ def _validate_data(self, X, y=None, reset=True, if check_params.get('ensure_2d', True): self._check_n_features(X, reset=reset) + if is_df: + self._check_feature_names(X_orig, reset=reset) return out diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index a057d4f274b50..2c0dd3d66e5e1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -640,8 +640,8 @@ def _raw_predict(self, X): raw_predictions : array, shape (n_trees_per_iteration, n_samples) The raw predicted values. """ - X = check_array(X, dtype=[X_DTYPE, X_BINNED_DTYPE], - force_all_finite=False) + X = self._validate_data(X, dtype=[X_DTYPE, X_BINNED_DTYPE], + force_all_finite=False, reset=False) check_is_fitted(self) if X.shape[1] != self.n_features_: raise ValueError( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 4a424bac47d63..8742de9331a9d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -794,3 +794,10 @@ def test_staged_predict(HistGradientBoosting, X, y): assert_allclose(staged_predictions, pred_aux) assert staged_predictions.shape == pred_aux.shape + + +def test_df_column_names(): + from sklearn.utils.estimator_checks import check_dataframe_column_names_consistency # noqa + + check_dataframe_column_names_consistency('hgbdt', + HistGradientBoostingRegressor()) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index bbde6264a1c77..ed137f14c5427 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2991,3 +2991,45 @@ def check_requires_y_none(name, estimator_orig): except ValueError as ve: if not any(msg in str(ve) for msg in expected_err_msgs): warnings.warn(warning_msg, FutureWarning) + + +def check_dataframe_column_names_consistency(name, estimator_orig): + try: + import pandas as pd + except ImportError: + raise SkipTest( + "pandas is not installed: not checking column names consistency" + ) + + rng = np.random.RandomState(0) + + estimator = clone(estimator_orig) + set_random_state(estimator) + if 'warm_start' in estimator.get_params(): + estimator.set_params(warm_start=False) + + n_samples = 100 + X = rng.normal(loc=100, size=(n_samples, 2)) + X = _pairwise_estimator_convert_X(X, estimator) + X = pd.DataFrame(X) + + if is_regressor(estimator_orig): + y = rng.normal(size=n_samples) + else: + y = rng.randint(low=0, high=2, size=n_samples) + y = _enforce_estimator_tags_y(estimator, y) + + estimator.fit(X, y) + if hasattr(estimator, '_feature_names_in'): + assert_array_equal(estimator._feature_names_in, X.columns.values) + + bad_X = X[X.columns[::-1]] # reverse column order + + for method in ["predict", "transform", "decision_function", + "predict_proba"]: + if hasattr(estimator, method): + estimator.predict(X) + msg = ("column names of the dataframe must match those that were " + "passed during fit") + assert_raises_regex(ValueError, msg, getattr(estimator, method), + bad_X) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index bcfd8fcd8d50e..5d4d5f37f471e 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -43,6 +43,7 @@ _deprecate_positional_args, _check_sample_weight, _allclose_dense_sparse, + _is_dataframe, FLOAT_DTYPES) from sklearn.utils.validation import _check_fit_params @@ -1213,3 +1214,15 @@ def test_check_sparse_pandas_sp_format(sp_format): assert sp.issparse(result) assert result.format == sp_format assert_allclose_dense_sparse(sp_mat, result) + + +def test_is_dataframe(): + pd = pytest.importorskip('pandas') + + assert _is_dataframe(pd.DataFrame(np.arange(10))) + assert _is_dataframe(pd.Series(np.arange(10))) + + assert not _is_dataframe(np.arange(10)) + assert not _is_dataframe(list(range(10))) + assert not _is_dataframe(1234) + assert not _is_dataframe('still not a df') diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 140b8d48a1bcd..d0ff001c34b42 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1378,3 +1378,8 @@ def _check_fit_params(X, fit_params, indices=None): ) return fit_params_validated + + +def _is_dataframe(X): + # Return True if X is a pandas dataframe (or a Series) + return hasattr(X, 'iloc') From 1c9827b7faa324ddb2b09b0d1de952d0b0ebe83b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 1 Jun 2020 12:14:22 -0400 Subject: [PATCH 2/2] maybe fix --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 2c0dd3d66e5e1..a5926fe46b90d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -640,9 +640,9 @@ def _raw_predict(self, X): raw_predictions : array, shape (n_trees_per_iteration, n_samples) The raw predicted values. """ + check_is_fitted(self) X = self._validate_data(X, dtype=[X_DTYPE, X_BINNED_DTYPE], force_all_finite=False, reset=False) - check_is_fitted(self) if X.shape[1] != self.n_features_: raise ValueError( 'X has {} features but this estimator was trained with '