scikit-learn · jovan-stojanovic · Jun 21, 2023 · Jun 21, 2023 · Jun 21, 2023 · Jun 21, 2023
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py
@@ -611,11 +611,16 @@ def test_pls_feature_names_out(Klass):
     assert_array_equal(names_out, expected_names_out)
 
 
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
 @pytest.mark.parametrize("Klass", [CCA, PLSSVD, PLSRegression, PLSCanonical])
-def test_pls_set_output(Klass):
+def test_pls_set_output(use_pyarrow_dtypes, Klass):
     """Check `set_output` in cross_decomposition module."""
     pd = pytest.importorskip("pandas")
     X, Y = load_linnerud(return_X_y=True, as_frame=True)
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+        X = X.convert_dtypes(dtype_backend="pyarrow")  # not in-place
+        Y = Y.convert_dtypes(dtype_backend="pyarrow")
 
     est = Klass().set_output(transform="pandas").fit(X, Y)
     X_trans, y_trans = est.transform(X, Y)

diff --git a/sklearn/datasets/tests/test_arff_parser.py b/sklearn/datasets/tests/test_arff_parser.py
@@ -11,6 +11,7 @@
 )
 
 
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
 @pytest.mark.parametrize(
     "feature_names, target_names",
     [
@@ -43,7 +44,7 @@
         ),
     ],
 )
-def test_post_process_frame(feature_names, target_names):
+def test_post_process_frame(use_pyarrow_dtypes, feature_names, target_names):
     """Check the behaviour of the post-processing function for splitting a dataframe."""
     pd = pytest.importorskip("pandas")
 
@@ -57,6 +58,9 @@ def test_post_process_frame(feature_names, target_names):
             "col_string": ["a", "b", "c"],
         }
     )
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+        X_original = X_original.convert_dtypes(dtype_backend="pyarrow")  # not in-place
 
     X, y = _post_process_frame(X_original, feature_names, target_names)
     assert isinstance(X, pd.DataFrame)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1113,10 +1113,11 @@ def test_categorical_spec_errors(
         est.fit(X, y)
 
 
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
 @pytest.mark.parametrize(
     "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor)
 )
-def test_categorical_spec_errors_with_feature_names(Est):
+def test_categorical_spec_errors_with_feature_names(use_pyarrow_dtypes, Est):
     pd = pytest.importorskip("pandas")
     n_samples = 10
     X = pd.DataFrame(
@@ -1127,6 +1128,9 @@ def test_categorical_spec_errors_with_feature_names(Est):
         }
     )
     y = [0, 1] * (n_samples // 2)
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+        X = X.convert_dtypes(dtype_backend="pyarrow")  # not in-place
 
     est = Est(categorical_features=["f0", "f1", "f3"])
     expected_msg = re.escape(
@@ -1162,20 +1166,26 @@ def test_categorical_spec_no_categories(Est, categorical_features, as_array):
     assert est.is_categorical_ is None
 
 
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
 @pytest.mark.parametrize(
     "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor)
 )
 @pytest.mark.parametrize(
     "use_pandas, feature_name", [(False, "at index 0"), (True, "'f0'")]
 )
-def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name):
+def test_categorical_bad_encoding_errors(
+    use_pyarrow_dtypes, Est, use_pandas, feature_name
+):
     # Test errors when categories are encoded incorrectly
 
     gb = Est(categorical_features=[True], max_bins=2)
 
     if use_pandas:
         pd = pytest.importorskip("pandas")
         X = pd.DataFrame({"f0": [0, 1, 2]})
+        if use_pyarrow_dtypes:
+            pytest.importorskip("pyarrow")
+            X = X.convert_dtypes(dtype_backend="pyarrow")  # not in-place
     else:
         X = np.array([[0, 1, 2]]).T
     y = np.arange(3)
@@ -1188,6 +1198,8 @@ def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name):
 
     if use_pandas:
         X = pd.DataFrame({"f0": [0, 2]})
+        if use_pyarrow_dtypes:
+            X = X.convert_dtypes(dtype_backend="pyarrow")
     else:
         X = np.array([[0, 2]]).T
     y = np.arange(2)
@@ -1293,14 +1305,18 @@ def test_interaction_cst_numerically():
     )
 
 
-def test_no_user_warning_with_scoring():
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
+def test_no_user_warning_with_scoring(use_pyarrow_dtypes):
     """Check that no UserWarning is raised when scoring is set.
 
     Non-regression test for #22907.
     """
     pd = pytest.importorskip("pandas")
     X, y = make_regression(n_samples=50, random_state=0)
     X_df = pd.DataFrame(X, columns=[f"col{i}" for i in range(X.shape[1])])
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+        X_df = X_df.convert_dtypes(dtype_backend="pyarrow")  # not in-place
 
     est = HistGradientBoostingRegressor(
         random_state=0, scoring="neg_mean_absolute_error", early_stopping=True

diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py
@@ -293,10 +293,14 @@ def test_input_error():
         gbdt.fit(X, y)
 
 
-def test_input_error_related_to_feature_names():
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
+def test_input_error_related_to_feature_names(use_pyarrow_dtypes):
     pd = pytest.importorskip("pandas")
     X = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]})
     y = np.array([0, 1, 0])
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+        X = X.convert_dtypes(dtype_backend="pyarrow")  # not in-place
 
     monotonic_cst = {"d": 1, "a": 1, "c": -1}
     gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
@@ -341,7 +341,8 @@ def test_base_estimator_property_deprecated():
         model.base_estimator_
 
 
-def test_iforest_preserve_feature_names():
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
+def test_iforest_preserve_feature_names(use_pyarrow_dtypes):
     """Check that feature names are preserved when contamination is not "auto".
 
     Feature names are required for consistency checks during scoring.
@@ -352,6 +353,10 @@ def test_iforest_preserve_feature_names():
     rng = np.random.RandomState(0)
 
     X = pd.DataFrame(data=rng.randn(4), columns=["a"])
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+        X = X.convert_dtypes(dtype_backend="pyarrow")  # not in-place
+
     model = IsolationForest(random_state=0, contamination=0.05)
 
     with warnings.catch_warnings():

diff --git a/sklearn/feature_selection/tests/test_base.py b/sklearn/feature_selection/tests/test_base.py
@@ -118,7 +118,8 @@ def test_get_support():
     assert_array_equal(support_inds, sel.get_support(indices=True))
 
 
-def test_output_dataframe():
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
+def test_output_dataframe(use_pyarrow_dtypes):
     """Check output dtypes for dataframes is consistent with the input dtypes."""
     pd = pytest.importorskip("pandas")
 
@@ -130,6 +131,9 @@ def test_output_dataframe():
             "d": pd.Series([3.0, 2.4, 1.2], dtype=np.float64),
         }
     )
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+        X = X.convert_dtypes(dtype_backend="pyarrow")  # not in-place
 
     for step in [2, 3]:
         sel = StepSelector(step=step).set_output(transform="pandas")

diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py
@@ -946,12 +946,15 @@ def test_mutual_info_regression():
     assert_array_equal(support, gtruth)
 
 
-def test_dataframe_output_dtypes():
-    """Check that the output datafarme dtypes are the same as the input.
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
+def test_dataframe_output_dtypes(use_pyarrow_dtypes):
+    """Check that the output dataframe dtypes are the same as the input.
 
     Non-regression test for gh-24860.
     """
     pd = pytest.importorskip("pandas")
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
 
     X, y = load_iris(return_X_y=True, as_frame=True)
     X = X.astype(
@@ -960,6 +963,9 @@ def test_dataframe_output_dtypes():
             "petal width (cm)": np.float64,
         }
     )
+    if use_pyarrow_dtypes:
+        X = X.convert_dtypes(dtype_backend="pyarrow")  # not in-place
+
     X["petal_width_binned"] = pd.cut(X["petal width (cm)"], bins=10)
 
     column_order = X.columns

diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py
@@ -597,13 +597,21 @@ def test_select_from_model_pls(PLSEstimator):
     assert model.score(X, y) > 0.5
 
 
-def test_estimator_does_not_support_feature_names():
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
+def test_estimator_does_not_support_feature_names(use_pyarrow_dtypes):
     """SelectFromModel works with estimators that do not support feature_names_in_.
 
     Non-regression test for #21949.
     """
     pytest.importorskip("pandas")
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+
     X, y = datasets.load_iris(as_frame=True, return_X_y=True)
+    if use_pyarrow_dtypes:
+        X = X.convert_dtypes(dtype_backend="pyarrow")  # not in-place
+        y = y.convert_dtypes(dtype_backend="pyarrow")
+
     all_feature_names = set(X.columns)
 
     def importance_getter(estimator):
@@ -646,11 +654,17 @@ def test_partial_fit_validate_max_features(error, err_msg, max_features):
         ).partial_fit(X, y, classes=[0, 1])
 
 
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
 @pytest.mark.parametrize("as_frame", [True, False])
-def test_partial_fit_validate_feature_names(as_frame):
+def test_partial_fit_validate_feature_names(use_pyarrow_dtypes, as_frame):
     """Test that partial_fit from SelectFromModel validates `feature_names_in_`."""
     pytest.importorskip("pandas")
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
     X, y = datasets.load_iris(as_frame=as_frame, return_X_y=True)
+    if use_pyarrow_dtypes and as_frame is True:
+        X = X.convert_dtypes(dtype_backend="pyarrow")  # not in-place
+        y = y.convert_dtypes(dtype_backend="pyarrow")
 
     selector = SelectFromModel(estimator=SGDClassifier(), max_features=4).partial_fit(
         X, y, classes=[0, 1, 2]

diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py
@@ -102,11 +102,17 @@ def test_imputers_add_indicator_sparse(imputer, marker):
 
 # ConvergenceWarning will be raised by the IterativeImputer
 @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
 @pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
 @pytest.mark.parametrize("add_indicator", [True, False])
-def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
+def test_imputers_pandas_na_integer_array_support(
+    use_pyarrow_dtypes, imputer, add_indicator
+):
     # Test pandas IntegerArray with pd.NA
     pd = pytest.importorskip("pandas")
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+
     marker = np.nan
     imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)
 
@@ -123,18 +129,24 @@ def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
 
     # Creates dataframe with IntegerArrays with pd.NA
     X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"])
+    if use_pyarrow_dtypes:
+        X_df = X_df.convert_dtypes(dtype_backend="pyarrow")  # not in-place
 
     # fit on pandas dataframe with IntegerArrays
     X_trans = imputer.fit_transform(X_df)
 
     assert_allclose(X_trans_expected, X_trans)
 
 
+@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
 @pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
 @pytest.mark.parametrize("add_indicator", [True, False])
-def test_imputers_feature_names_out_pandas(imputer, add_indicator):
+def test_imputers_feature_names_out_pandas(use_pyarrow_dtypes, imputer, add_indicator):
     """Check feature names out for imputers."""
     pd = pytest.importorskip("pandas")
+    if use_pyarrow_dtypes:
+        pytest.importorskip("pyarrow")
+
     marker = np.nan
     imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)
 
@@ -147,6 +159,8 @@ def test_imputers_feature_names_out_pandas(imputer, add_indicator):
         ]
     )
     X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"])
+    if use_pyarrow_dtypes:
+        X_df = X_df.convert_dtypes(dtype_backend="pyarrow")  # not in-place
     imputer.fit(X_df)
 
     names = imputer.get_feature_names_out()