FIX Non-fit methods no longer raise UserWarning for valid dataframes (#21199)

thomasjpfan · ogrisel · glemaitre · commit c00a9e96f3ce · 2021-10-25T11:37:00.000+02:00
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
@@ -5,13 +5,23 @@
 .. _changes_1_0_1:
 
 Version 1.0.1
-=============
 
 **In Development**
 
 Changelog
 ---------
 
+Fixed models
+------------
+
+- |Fix| Non-fit methods in the following classes do not raise a UserWarning
+  when fitted on DataFrames with valid feature names:
+  :class:`covariance.EllipticEnvelope`, :class:`ensemble.IsolationForest`,
+  :class:`ensemble.AdaBoostClassifier`, :class:`neighbors.KNeighborsClassifier`,
+  :class:`neighbors.KNeighborsRegressor`,
+  :class:`neighbors.RadiusNeighborsClassifier`,
+  :class:`neighbors.RadiusNeighborsRegressor`. :pr:`21199` by `Thomas Fan`_.
+
 :mod:`sklearn.calibration`
 ..........................
 
@@ -25,6 +35,17 @@ Changelog
   the Bayesian priors.
   :pr:`21179` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+:mod:`sklearn.neighbors`
+........................
+
+- |Fix| :class:`neighbors.KNeighborsClassifier`,
+  :class:`neighbors.KNeighborsRegressor`,
+  :class:`neighbors.RadiusNeighborsClassifier`,
+  :class:`neighbors.RadiusNeighborsRegressor` with `metric="precomputed"` raise
+  an error for `bsr` and `dok` sparse matrices in methods: `fit`, `kneighbors`
+  and `radius_neighbors`, due to handling of explicit zeros in `bsr` and `dok`
+  :term:`sparse graph` formats. :pr:`21199` by `Thomas Fan`_.
+
 .. _changes_1_0:
 
 Version 1.0.0
diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py
@@ -215,7 +215,6 @@ def score_samples(self, X):
             Opposite of the Mahalanobis distances.
         """
         check_is_fitted(self)
-        X = self._validate_data(X, reset=False)
         return -self.mahalanobis(X)
 
     def predict(self, X):
diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py
@@ -337,9 +337,9 @@ def predict(self, X):
             be considered as an inlier according to the fitted model.
         """
         check_is_fitted(self)
-        X = self._validate_data(X, accept_sparse="csr", reset=False)
-        is_inlier = np.ones(X.shape[0], dtype=int)
-        is_inlier[self.decision_function(X) < 0] = -1
+        decision_func = self.decision_function(X)
+        is_inlier = np.ones_like(decision_func, dtype=int)
+        is_inlier[decision_func < 0] = -1
         return is_inlier
 
     def decision_function(self, X):
diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py
@@ -676,8 +676,6 @@ def predict(self, X):
         y : ndarray of shape (n_samples,)
             The predicted classes.
         """
-        X = self._check_X(X)
-
         pred = self.decision_function(X)
 
         if self.n_classes_ == 2:
@@ -852,8 +850,6 @@ def predict_proba(self, X):
             outputs is the same of that of the :term:`classes_` attribute.
         """
         check_is_fitted(self)
-        X = self._check_X(X)
-
         n_classes = self.n_classes_
 
         if n_classes == 1:
@@ -886,7 +882,6 @@ def staged_predict_proba(self, X):
             The class probabilities of the input samples. The order of
             outputs is the same of that of the :term:`classes_` attribute.
         """
-        X = self._check_X(X)
 
         n_classes = self.n_classes_
 
@@ -912,7 +907,6 @@ def predict_log_proba(self, X):
             The class probabilities of the input samples. The order of
             outputs is the same of that of the :term:`classes_` attribute.
         """
-        X = self._check_X(X)
         return np.log(self.predict_proba(X))
 
 
diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py
@@ -211,8 +211,6 @@ def predict(self, X):
         y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
             Class labels for each data sample.
         """
-        X = self._validate_data(X, accept_sparse="csr", reset=False)
-
         neigh_dist, neigh_ind = self.kneighbors(X)
         classes_ = self.classes_
         _y = self._y
@@ -255,8 +253,6 @@ def predict_proba(self, X):
             The class probabilities of the input samples. Classes are ordered
             by lexicographic order.
         """
-        X = self._validate_data(X, accept_sparse="csr", reset=False)
-
         neigh_dist, neigh_ind = self.kneighbors(X)
 
         classes_ = self.classes_
@@ -271,7 +267,7 @@ def predict_proba(self, X):
         if weights is None:
             weights = np.ones_like(neigh_ind)
 
-        all_rows = np.arange(X.shape[0])
+        all_rows = np.arange(n_queries)
         probabilities = []
         for k, classes_k in enumerate(classes_):
             pred_labels = _y[:, k][neigh_ind]
@@ -614,7 +610,6 @@ def predict_proba(self, X):
             by lexicographic order.
         """
 
-        X = self._validate_data(X, accept_sparse="csr", reset=False)
         n_queries = _num_samples(X)
 
         neigh_dist, neigh_ind = self.radius_neighbors(X)
diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py
@@ -226,8 +226,6 @@ def predict(self, X):
         y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int
             Target values.
         """
-        X = self._validate_data(X, accept_sparse="csr", reset=False)
-
         neigh_dist, neigh_ind = self.kneighbors(X)
 
         weights = _get_weights(neigh_dist, self.weights)
@@ -436,8 +434,6 @@ def predict(self, X):
                 dtype=double
             Target values.
         """
-        X = self._validate_data(X, accept_sparse="csr", reset=False)
-
         neigh_dist, neigh_ind = self.radius_neighbors(X)
 
         weights = _get_weights(neigh_dist, self.weights)
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
@@ -1088,7 +1088,12 @@ def test_kneighbors_regressor_sparse(
             assert np.mean(knn.predict(X2).round() == y) > 0.95
 
             X2_pre = sparsev(pairwise_distances(X, metric="euclidean"))
-            assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95
+            if sparsev in {dok_matrix, bsr_matrix}:
+                msg = "not supported due to its handling of explicit zeros"
+                with pytest.raises(TypeError, match=msg):
+                    knn_pre.predict(X2_pre)
+            else:
+                assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95
 
 
 def test_neighbors_iris():
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -3779,7 +3779,14 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
         check_methods.append((method, callable_method))
 
     for _, method in check_methods:
-        method(X)  # works
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "error",
+                message="X does not have valid feature names",
+                category=UserWarning,
+                module="sklearn",
+            )
+            method(X)  # works without UserWarning for valid features
 
     invalid_names = [
         (names[::-1], "Feature names must be in the same order as they were in fit."),