scikit-learn · adrinjalali · May 4, 2023 · Apr 29, 2023 · May 2, 2023 · May 3, 2023
diff --git a/sklearn/base.py b/sklearn/base.py
@@ -548,7 +548,7 @@ def _validate_data(
 
         cast_to_ndarray : bool, default=True
             Cast `X` and `y` to ndarray with checks in `check_params`. If
-            `False`, `X` and `y` are unchanged and only `feature_names` and
+            `False`, `X` and `y` are unchanged and only `feature_names_in_` and
             `n_features_in_` are checked.
 
         **check_params : kwargs
@@ -576,21 +576,25 @@ def _validate_data(
         no_val_X = isinstance(X, str) and X == "no_validation"
         no_val_y = y is None or isinstance(y, str) and y == "no_validation"
 
+        if no_val_X and no_val_y:
+            raise ValueError("Validation should be done on X, y or both.")
+
         default_check_params = {"estimator": self}
         check_params = {**default_check_params, **check_params}
 
-        if no_val_X and no_val_y:
-            raise ValueError("Validation should be done on X, y or both.")
+        if not cast_to_ndarray:
+            if not no_val_X and no_val_y:
+                out = X
+            elif no_val_X and not no_val_y:
+                out = y
+            else:
+                out = X, y
         elif not no_val_X and no_val_y:
-            if cast_to_ndarray:
-                X = check_array(X, input_name="X", **check_params)
-            out = X
+            out = check_array(X, input_name="X", **check_params)
         elif no_val_X and not no_val_y:
-            if cast_to_ndarray:
-                y = _check_y(y, **check_params) if cast_to_ndarray else y
-            out = y
+            out = _check_y(y, **check_params)
         else:
-            if validate_separately and cast_to_ndarray:
+            if validate_separately:
                 # We need this because some estimators validate X and y
                 # separately, and in general, separately calling check_array()
                 # on X and y isn't equivalent to just calling check_X_y()

diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
@@ -719,6 +719,47 @@ def transform(self, X):
         trans.transform(df_mixed)
 
 
+def test_validate_data_cast_to_ndarray():
+    """Check cast_to_ndarray option of _validate_data for X, y and both."""
+
+    pd = pytest.importorskip("pandas")  # skip when pandas is not installed
+    iris = datasets.load_iris()
+    df = pd.DataFrame(iris.data, columns=iris.feature_names)
+    y = pd.Series(iris.target)
+
+    class NoOpTransformer(TransformerMixin, BaseEstimator):
+        pass  # defines nothing itself; only inherited helpers are exercised
+
+    no_op = NoOpTransformer()
+    X_np_out = no_op._validate_data(df, cast_to_ndarray=True)
+    assert isinstance(X_np_out, np.ndarray)  # DataFrame was converted
+    assert_allclose(X_np_out, df.to_numpy())
+
+    X_df_out = no_op._validate_data(df, cast_to_ndarray=False)
+    assert X_df_out is df  # same object back: no conversion, no copy
+
+    y_np_out = no_op._validate_data(y=y, cast_to_ndarray=True)
+    assert isinstance(y_np_out, np.ndarray)  # Series was converted
+    assert_allclose(y_np_out, y.to_numpy())
+
+    y_series_out = no_op._validate_data(y=y, cast_to_ndarray=False)
+    assert y_series_out is y  # same object back: no conversion, no copy
+
+    X_np_out, y_np_out = no_op._validate_data(df, y, cast_to_ndarray=True)
+    assert isinstance(X_np_out, np.ndarray)
+    assert_allclose(X_np_out, df.to_numpy())
+    assert isinstance(y_np_out, np.ndarray)
+    assert_allclose(y_np_out, y.to_numpy())
+
+    X_df_out, y_series_out = no_op._validate_data(df, y, cast_to_ndarray=False)
+    assert X_df_out is df  # both inputs come back as the identical objects
+    assert y_series_out is y
+
+    msg = "Validation should be done on X, y or both."
+    with pytest.raises(ValueError, match=msg):
+        no_op._validate_data()  # neither X nor y is passed
+
+
 def test_clone_keeps_output_config():
     """Check that clone keeps the set_output config."""