TST move check_n_features_in_after_fitting to common tests (scikit-learn#29844)

adrinjalali · thomasjpfan · web-flow · commit 74a33757c8a8 · 2024-09-23T17:48:47.000Z
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py
@@ -411,6 +411,14 @@ def predict_proba(self, X):
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.classifier_tags.multi_label = True
+        tags.input_tags.pairwise = self.metric == "precomputed"
+        if tags.input_tags.pairwise:
+            tags._xfail_checks.update(
+                {
+                    "check_n_features_in_after_fitting": "FIXME",
+                    "check_dataframe_column_names_consistency": "FIXME",
+                }
+            )
         return tags
 
 
diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py
@@ -195,6 +195,13 @@ def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         # For cross-validation routines to split data correctly
         tags.input_tags.pairwise = self.metric == "precomputed"
+        if tags.input_tags.pairwise:
+            tags._xfail_checks.update(
+                {
+                    "check_n_features_in_after_fitting": "FIXME",
+                    "check_dataframe_column_names_consistency": "FIXME",
+                }
+            )
         return tags
 
     @_fit_context(
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
@@ -51,7 +51,6 @@
     check_global_output_transform_pandas,
     check_global_set_output_transform_polars,
     check_inplace_ensure_writeable,
-    check_n_features_in_after_fitting,
     check_param_validation,
     check_set_output_transform,
     check_set_output_transform_pandas,
@@ -243,13 +242,6 @@ def check_field_types(tags, defaults):
     check_field_types(tags.transformer_tags, defaults.transformer_tags)
 
 
-@pytest.mark.parametrize(
-    "estimator", _tested_estimators(), ids=_get_check_estimator_ids
-)
-def test_check_n_features_in_after_fitting(estimator):
-    check_n_features_in_after_fitting(estimator.__class__.__name__, estimator)
-
-
 def _estimators_that_predict_in_fit():
     for estimator in _tested_estimators():
         est_params = set(estimator.get_params())
@@ -286,6 +278,11 @@ def _estimators_that_predict_in_fit():
 def test_pandas_column_name_consistency(estimator):
     if isinstance(estimator, ColumnTransformer):
         pytest.skip("ColumnTransformer is not tested here")
+    tags = get_tags(estimator)
+    if "check_dataframe_column_names_consistency" in tags._xfail_checks:
+        pytest.skip(
+            "Estimator does not support check_dataframe_column_names_consistency"
+        )
     with ignore_warnings(category=(FutureWarning)):
         with warnings.catch_warnings(record=True) as record:
             check_dataframe_column_names_consistency(
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
@@ -140,7 +140,11 @@
     MultiOutputRegressor,
     RegressorChain,
 )
-from sklearn.neighbors import NeighborhoodComponentsAnalysis
+from sklearn.neighbors import (
+    KNeighborsClassifier,
+    KNeighborsRegressor,
+    NeighborhoodComponentsAnalysis,
+)
 from sklearn.neural_network import BernoulliRBM, MLPClassifier, MLPRegressor
 from sklearn.pipeline import FeatureUnion, Pipeline
 from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder
@@ -308,6 +312,8 @@
     IncrementalPCA: dict(batch_size=10),
     IsolationForest: dict(n_estimators=5),
     KMeans: dict(n_init=2, n_clusters=2, max_iter=5),
+    KNeighborsClassifier: [dict(n_neighbors=2), dict(metric="precomputed")],
+    KNeighborsRegressor: [dict(n_neighbors=2), dict(metric="precomputed")],
     LabelPropagation: dict(max_iter=5),
     LabelSpreading: dict(max_iter=5),
     LarsCV: dict(max_iter=5, cv=3),
@@ -448,8 +454,8 @@
         ],
         cv=3,
     ),
-    SVC: dict(max_iter=-1),
-    SVR: dict(max_iter=-1),
+    SVC: [dict(max_iter=-1), dict(kernel="precomputed")],
+    SVR: [dict(max_iter=-1), dict(kernel="precomputed")],
     TargetEncoder: dict(cv=3),
     TheilSenRegressor: dict(max_iter=5, max_subpopulation=100),
     # TruncatedSVD doesn't run with n_components = n_features
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -5,6 +5,7 @@
 
 import pickle
 import re
+import textwrap
 import warnings
 from contextlib import nullcontext
 from copy import deepcopy
@@ -95,6 +96,7 @@ def _yield_api_checks(estimator):
     if tags.requires_fit:
         yield check_estimators_unfitted
     yield check_do_not_raise_errors_in_init_or_set_params
+    yield check_n_features_in_after_fitting
 
 
 def _yield_checks(estimator):
@@ -441,7 +443,7 @@ def _should_be_skipped_or_marked(estimator, check):
     return False, "placeholder reason that will never be used"
 
 
-def parametrize_with_checks(estimators, *, legacy=True):
+def parametrize_with_checks(estimators, *, legacy: bool = True):
     """Pytest specific decorator for parametrizing estimator checks.
 
     Checks are categorised into the following groups:
@@ -468,6 +470,7 @@ def parametrize_with_checks(estimators, *, legacy=True):
 
         .. versionadded:: 0.24
 
+
     legacy : bool, default=True
         Whether to include legacy checks. Over time we remove checks from this category
         and move them into their specific category.
@@ -520,7 +523,7 @@ def checks_generator():
     )
 
 
-def check_estimator(estimator=None, generate_only=False, *, legacy=True):
+def check_estimator(estimator=None, generate_only=False, *, legacy: bool = True):
     """Check if estimator adheres to scikit-learn conventions.
 
     This function will run an extensive test-suite for input validation,
@@ -2009,13 +2012,14 @@ def check_estimators_partial_fit_n_features(name, estimator_orig):
 
 
 @ignore_warnings(category=FutureWarning)
-def check_classifier_multioutput(name, estimator):
+def check_classifier_multioutput(name, estimator_orig):
     n_samples, n_labels, n_classes = 42, 5, 3
-    tags = get_tags(estimator)
-    estimator = clone(estimator)
+    tags = get_tags(estimator_orig)
+    estimator = clone(estimator_orig)
     X, y = make_multilabel_classification(
         random_state=42, n_samples=n_samples, n_labels=n_labels, n_classes=n_classes
     )
+    X = _enforce_estimator_tags_X(estimator, X)
     estimator.fit(X, y)
     y_pred = estimator.predict(X)
 
@@ -2174,13 +2178,14 @@ def check_clusterer_compute_labels_predict(name, clusterer_orig):
 def check_classifiers_one_label(name, classifier_orig):
     error_string_fit = "Classifier can't train when only one class is present."
     error_string_predict = "Classifier can't predict when only one class is present."
+    classifier = clone(classifier_orig)
     rnd = np.random.RandomState(0)
     X_train = rnd.uniform(size=(10, 3))
     X_test = rnd.uniform(size=(10, 3))
+    X_train, X_test = _enforce_estimator_tags_X(classifier, X_train, X_test=X_test)
     y = np.ones(10)
     # catch deprecation warnings
     with ignore_warnings(category=FutureWarning):
-        classifier = clone(classifier_orig)
         with raises(
             ValueError, match="class", may_pass=True, err_msg=error_string_fit
         ) as cm:
@@ -2505,6 +2510,7 @@ def check_classifiers_multilabel_representation_invariance(name, classifier_orig
 
     X_train, y_train = X[:80], y[:80]
     X_test = X[80:]
+    X_train, X_test = _enforce_estimator_tags_X(classifier_orig, X_train, X_test=X_test)
 
     y_train_list_of_lists = y_train.tolist()
     y_train_list_of_arrays = list(y_train)
@@ -2552,6 +2558,7 @@ def check_classifiers_multilabel_output_format_predict(name, classifier_orig):
 
     X_train, X_test = X[:-test_size], X[-test_size:]
     y_train, y_test = y[:-test_size], y[-test_size:]
+    X_train, X_test = _enforce_estimator_tags_X(classifier_orig, X_train, X_test=X_test)
     classifier.fit(X_train, y_train)
 
     response_method_name = "predict"
@@ -2597,6 +2604,7 @@ def check_classifiers_multilabel_output_format_predict_proba(name, classifier_or
 
     X_train, X_test = X[:-test_size], X[-test_size:]
     y_train = y[:-test_size]
+    X_train, X_test = _enforce_estimator_tags_X(classifier_orig, X_train, X_test=X_test)
     classifier.fit(X_train, y_train)
 
     response_method_name = "predict_proba"
@@ -2681,6 +2689,7 @@ def check_classifiers_multilabel_output_format_decision_function(name, classifie
 
     X_train, X_test = X[:-test_size], X[-test_size:]
     y_train = y[:-test_size]
+    X_train, X_test = _enforce_estimator_tags_X(classifier_orig, X_train, X_test=X_test)
     classifier.fit(X_train, y_train)
 
     response_method_name = "decision_function"
@@ -3474,30 +3483,48 @@ def _enforce_estimator_tags_y(estimator, y):
     return y
 
 
-def _enforce_estimator_tags_X(estimator, X, kernel=linear_kernel):
+def _enforce_estimator_tags_X(estimator, X, X_test=None, kernel=linear_kernel):
     # Estimators with `1darray` in `X_types` tag only accept
     # X of shape (`n_samples`,)
     if get_tags(estimator).input_tags.one_d_array:
         X = X[:, 0]
+        if X_test is not None:
+            X_test = X_test[:, 0]  # pragma: no cover
     # Estimators with a `requires_positive_X` tag only accept
     # strictly positive data
     if get_tags(estimator).input_tags.positive_only:
         X = X - X.min()
+        if X_test is not None:
+            X_test = X_test - X_test.min()  # pragma: no cover
     if get_tags(estimator).input_tags.categorical:
         dtype = np.float64 if get_tags(estimator).input_tags.allow_nan else np.int32
         X = np.round((X - X.min())).astype(dtype)
+        if X_test is not None:
+            X_test = np.round((X_test - X_test.min())).astype(dtype)  # pragma: no cover
 
     if estimator.__class__.__name__ == "SkewedChi2Sampler":
         # SkewedChi2Sampler requires X > -skewdness in transform
         X = X - X.min()
+        if X_test is not None:
+            X_test = X_test - X_test.min()  # pragma: no cover
+
+    X_res = X
 
     # Pairwise estimators only accept
     # X of shape (`n_samples`, `n_samples`)
     if _is_pairwise_metric(estimator):
-        X = pairwise_distances(X, metric="euclidean")
+        X_res = pairwise_distances(X, metric="euclidean")
+        if X_test is not None:
+            X_test = pairwise_distances(
+                X_test, X, metric="euclidean"
+            )  # pragma: no cover
     elif get_tags(estimator).input_tags.pairwise:
-        X = kernel(X, X)
-    return X
+        X_res = kernel(X, X)
+        if X_test is not None:
+            X_test = kernel(X_test, X)  # pragma: no cover
+    if X_test is not None:
+        return X_res, X_test
+    return X_res
 
 
 @ignore_warnings(category=FutureWarning)
@@ -3913,8 +3940,16 @@ def check_n_features_in_after_fitting(name, estimator_orig):
         y = rng.randint(low=0, high=2, size=n_samples)
     y = _enforce_estimator_tags_y(estimator, y)
 
+    err_msg = (
+        "`{name}.fit()` does not set the `n_features_in_` attribute. "
+        "You might want to use `sklearn.utils.validation.validate_data` instead "
+        "of `check_array` in `{name}.fit()` which takes care of setting the "
+        "attribute.".format(name=name)
+    )
+
     estimator.fit(X, y)
-    assert estimator.n_features_in_ == X.shape[1]
+    assert hasattr(estimator, "n_features_in_"), err_msg
+    assert estimator.n_features_in_ == X.shape[1], err_msg
 
     # check methods will check n_features_in_
     check_methods = [
@@ -3926,6 +3961,28 @@ def check_n_features_in_after_fitting(name, estimator_orig):
     ]
     X_bad = X[:, [1]]
 
+    err_msg = """\
+        `{name}.{method}()` does not check for consistency between input number
+        of features with {name}.fit(), via the `n_features_in_` attribute.
+        You might want to use `sklearn.utils.validation.validate_data` instead
+        of `check_array` in `{name}.fit()` and `{name}.{method}()`. This can be done
+        like the following:
+        from sklearn.utils.validation import validate_data
+        ...
+        class MyEstimator(BaseEstimator):
+            ...
+            def fit(self, X, y):
+                X, y = validate_data(self, X, y, ...)
+                ...
+                return self
+            ...
+            def {method}(self, X):
+                X = validate_data(self, X, ..., reset=False)
+                ...
+                return X
+    """
+    err_msg = textwrap.dedent(err_msg)
+
     msg = f"X has 1 features, but \\w+ is expecting {X.shape[1]} features as input"
     for method in check_methods:
         if not hasattr(estimator, method):
@@ -3935,7 +3992,9 @@ def check_n_features_in_after_fitting(name, estimator_orig):
         if method == "score":
             callable_method = partial(callable_method, y=y)
 
-        with raises(ValueError, match=msg):
+        with raises(
+            ValueError, match=msg, err_msg=err_msg.format(name=name, method=method)
+        ):
             callable_method(X_bad)
 
     # partial_fit will check in the second call
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
@@ -304,7 +304,8 @@ def fit(self, X, y=None):
         return self
 
     def transform(self, X):
-        X = check_array(X)
+        check_is_fitted(self)
+        X = validate_data(self, X, reset=False)
         return X
 
 
@@ -422,16 +423,15 @@ def __init__(self, sparse_container=None):
         self.sparse_container = sparse_container
 
     def fit(self, X, y=None):
-        self.X_shape_ = validate_data(self, X).shape
+        validate_data(self, X)
         return self
 
     def fit_transform(self, X, y=None):
         return self.fit(X, y).transform(X)
 
     def transform(self, X):
-        X = check_array(X)
-        if X.shape[1] != self.X_shape_[1]:
-            raise ValueError("Bad number of features")
+        check_is_fitted(self)
+        X = validate_data(self, X, accept_sparse=True, reset=False)
         return self.sparse_container(X)