Skip to content

[WIP] FIX Add tests for pyarrow dtypes in pandas Dataframes #26651

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 164 additions & 29 deletions sklearn/compose/tests/test_column_transformer.py

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion sklearn/cross_decomposition/tests/test_pls.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,11 +611,16 @@ def test_pls_feature_names_out(Klass):
assert_array_equal(names_out, expected_names_out)


@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
@pytest.mark.parametrize("Klass", [CCA, PLSSVD, PLSRegression, PLSCanonical])
def test_pls_set_output(Klass):
def test_pls_set_output(use_pyarrow_dtypes, Klass):
"""Check `set_output` in cross_decomposition module."""
pd = pytest.importorskip("pandas")
X, Y = load_linnerud(return_X_y=True, as_frame=True)
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")
X = X.convert_dtypes(dtype_backend="pyarrow")
Y = Y.convert_dtypes(dtype_backend="pyarrow")

est = Klass().set_output(transform="pandas").fit(X, Y)
X_trans, y_trans = est.transform(X, Y)
Expand Down
6 changes: 5 additions & 1 deletion sklearn/datasets/tests/test_arff_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)


@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
@pytest.mark.parametrize(
"feature_names, target_names",
[
Expand Down Expand Up @@ -43,7 +44,7 @@
),
],
)
def test_post_process_frame(feature_names, target_names):
def test_post_process_frame(use_pyarrow_dtypes, feature_names, target_names):
"""Check the behaviour of the post-processing function for splitting a dataframe."""
pd = pytest.importorskip("pandas")

Expand All @@ -57,6 +58,9 @@ def test_post_process_frame(feature_names, target_names):
"col_string": ["a", "b", "c"],
}
)
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")
X_original = X_original.convert_dtypes(dtype_backend="pyarrow")

X, y = _post_process_frame(X_original, feature_names, target_names)
assert isinstance(X, pd.DataFrame)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1113,10 +1113,11 @@ def test_categorical_spec_errors(
est.fit(X, y)


@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
@pytest.mark.parametrize(
"Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor)
)
def test_categorical_spec_errors_with_feature_names(Est):
def test_categorical_spec_errors_with_feature_names(use_pyarrow_dtypes, Est):
pd = pytest.importorskip("pandas")
n_samples = 10
X = pd.DataFrame(
Expand All @@ -1127,6 +1128,9 @@ def test_categorical_spec_errors_with_feature_names(Est):
}
)
y = [0, 1] * (n_samples // 2)
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")
X = X.convert_dtypes(dtype_backend="pyarrow")

est = Est(categorical_features=["f0", "f1", "f3"])
expected_msg = re.escape(
Expand Down Expand Up @@ -1162,20 +1166,26 @@ def test_categorical_spec_no_categories(Est, categorical_features, as_array):
assert est.is_categorical_ is None


@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
@pytest.mark.parametrize(
"Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor)
)
@pytest.mark.parametrize(
"use_pandas, feature_name", [(False, "at index 0"), (True, "'f0'")]
)
def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name):
def test_categorical_bad_encoding_errors(
use_pyarrow_dtypes, Est, use_pandas, feature_name
):
# Test errors when categories are encoded incorrectly

gb = Est(categorical_features=[True], max_bins=2)

if use_pandas:
pd = pytest.importorskip("pandas")
X = pd.DataFrame({"f0": [0, 1, 2]})
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")
X = X.convert_dtypes(dtype_backend="pyarrow")
else:
X = np.array([[0, 1, 2]]).T
y = np.arange(3)
Expand All @@ -1188,6 +1198,8 @@ def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name):

if use_pandas:
X = pd.DataFrame({"f0": [0, 2]})
if use_pyarrow_dtypes:
X = X.convert_dtypes(dtype_backend="pyarrow")
else:
X = np.array([[0, 2]]).T
y = np.arange(2)
Expand Down Expand Up @@ -1293,14 +1305,18 @@ def test_interaction_cst_numerically():
)


def test_no_user_warning_with_scoring():
@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
def test_no_user_warning_with_scoring(use_pyarrow_dtypes):
"""Check that no UserWarning is raised when scoring is set.

Non-regression test for #22907.
"""
pd = pytest.importorskip("pandas")
X, y = make_regression(n_samples=50, random_state=0)
X_df = pd.DataFrame(X, columns=[f"col{i}" for i in range(X.shape[1])])
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")
X_df = X_df.convert_dtypes(dtype_backend="pyarrow")

est = HistGradientBoostingRegressor(
random_state=0, scoring="neg_mean_absolute_error", early_stopping=True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -293,10 +293,14 @@ def test_input_error():
gbdt.fit(X, y)


def test_input_error_related_to_feature_names():
@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
def test_input_error_related_to_feature_names(use_pyarrow_dtypes):
pd = pytest.importorskip("pandas")
X = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]})
y = np.array([0, 1, 0])
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")
X = X.convert_dtypes(dtype_backend="pyarrow")

monotonic_cst = {"d": 1, "a": 1, "c": -1}
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
Expand Down
7 changes: 6 additions & 1 deletion sklearn/ensemble/tests/test_iforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,8 @@ def test_base_estimator_property_deprecated():
model.base_estimator_


def test_iforest_preserve_feature_names():
@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
def test_iforest_preserve_feature_names(use_pyarrow_dtypes):
"""Check that feature names are preserved when contamination is not "auto".

Feature names are required for consistency checks during scoring.
Expand All @@ -352,6 +353,10 @@ def test_iforest_preserve_feature_names():
rng = np.random.RandomState(0)

X = pd.DataFrame(data=rng.randn(4), columns=["a"])
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")
X = X.convert_dtypes(dtype_backend="pyarrow")

model = IsolationForest(random_state=0, contamination=0.05)

with warnings.catch_warnings():
Expand Down
6 changes: 5 additions & 1 deletion sklearn/feature_selection/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ def test_get_support():
assert_array_equal(support_inds, sel.get_support(indices=True))


def test_output_dataframe():
@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
def test_output_dataframe(use_pyarrow_dtypes):
"""Check output dtypes for dataframes is consistent with the input dtypes."""
pd = pytest.importorskip("pandas")

Expand All @@ -130,6 +131,9 @@ def test_output_dataframe():
"d": pd.Series([3.0, 2.4, 1.2], dtype=np.float64),
}
)
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")
X = X.convert_dtypes(dtype_backend="pyarrow")

for step in [2, 3]:
sel = StepSelector(step=step).set_output(transform="pandas")
Expand Down
10 changes: 8 additions & 2 deletions sklearn/feature_selection/tests/test_feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,12 +946,15 @@ def test_mutual_info_regression():
assert_array_equal(support, gtruth)


def test_dataframe_output_dtypes():
"""Check that the output datafarme dtypes are the same as the input.
@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
def test_dataframe_output_dtypes(use_pyarrow_dtypes):
"""Check that the output dataframe dtypes are the same as the input.

Non-regression test for gh-24860.
"""
pd = pytest.importorskip("pandas")
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")

X, y = load_iris(return_X_y=True, as_frame=True)
X = X.astype(
Expand All @@ -960,6 +963,9 @@ def test_dataframe_output_dtypes():
"petal width (cm)": np.float64,
}
)
if use_pyarrow_dtypes:
X = X.convert_dtypes(dtype_backend="pyarrow")

X["petal_width_binned"] = pd.cut(X["petal width (cm)"], bins=10)

column_order = X.columns
Expand Down
18 changes: 16 additions & 2 deletions sklearn/feature_selection/tests/test_from_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,13 +597,21 @@ def test_select_from_model_pls(PLSEstimator):
assert model.score(X, y) > 0.5


def test_estimator_does_not_support_feature_names():
@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
def test_estimator_does_not_support_feature_names(use_pyarrow_dtypes):
"""SelectFromModel works with estimators that do not support feature_names_in_.

Non-regression test for #21949.
"""
pytest.importorskip("pandas")
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")

X, y = datasets.load_iris(as_frame=True, return_X_y=True)
if use_pyarrow_dtypes:
X = X.convert_dtypes(dtype_backend="pyarrow")
y = y.convert_dtypes(dtype_backend="pyarrow")

all_feature_names = set(X.columns)

def importance_getter(estimator):
Expand Down Expand Up @@ -646,11 +654,17 @@ def test_partial_fit_validate_max_features(error, err_msg, max_features):
).partial_fit(X, y, classes=[0, 1])


@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
@pytest.mark.parametrize("as_frame", [True, False])
def test_partial_fit_validate_feature_names(as_frame):
def test_partial_fit_validate_feature_names(use_pyarrow_dtypes, as_frame):
"""Test that partial_fit from SelectFromModel validates `feature_names_in_`."""
pytest.importorskip("pandas")
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")
X, y = datasets.load_iris(as_frame=as_frame, return_X_y=True)
if use_pyarrow_dtypes and as_frame is True:
X = X.convert_dtypes(dtype_backend="pyarrow")
y = y.convert_dtypes(dtype_backend="pyarrow")

selector = SelectFromModel(estimator=SGDClassifier(), max_features=4).partial_fit(
X, y, classes=[0, 1, 2]
Expand Down
18 changes: 16 additions & 2 deletions sklearn/impute/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,17 @@ def test_imputers_add_indicator_sparse(imputer, marker):

# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
def test_imputers_pandas_na_integer_array_support(
use_pyarrow_dtypes, imputer, add_indicator
):
# Test pandas IntegerArray with pd.NA
pd = pytest.importorskip("pandas")
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")

marker = np.nan
imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)

Expand All @@ -123,18 +129,24 @@ def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):

# Creates dataframe with IntegerArrays with pd.NA
X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"])
if use_pyarrow_dtypes:
X_df = X_df.convert_dtypes(dtype_backend="pyarrow")

# fit on pandas dataframe with IntegerArrays
X_trans = imputer.fit_transform(X_df)

assert_allclose(X_trans_expected, X_trans)


@pytest.mark.parametrize("use_pyarrow_dtypes", [True, False])
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_feature_names_out_pandas(imputer, add_indicator):
def test_imputers_feature_names_out_pandas(use_pyarrow_dtypes, imputer, add_indicator):
"""Check feature names out for imputers."""
pd = pytest.importorskip("pandas")
if use_pyarrow_dtypes:
pytest.importorskip("pyarrow")

marker = np.nan
imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)

Expand All @@ -147,6 +159,8 @@ def test_imputers_feature_names_out_pandas(imputer, add_indicator):
]
)
X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"])
if use_pyarrow_dtypes:
X_df = X_df.convert_dtypes(dtype_backend="pyarrow")
imputer.fit(X_df)

names = imputer.get_feature_names_out()
Expand Down
Loading