From 92383cda36bd5de516c905fba96f732a046bc8b3 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 Oct 2022 17:50:21 -0400 Subject: [PATCH 1/7] ENH Improve set_output compatibility in ColumnTransformer --- doc/whats_new/v1.2.rst | 2 +- sklearn/compose/_column_transformer.py | 38 +++++++++- .../compose/tests/test_column_transformer.py | 76 +++++++++++++++++++ sklearn/utils/_set_output.py | 13 +++- sklearn/utils/tests/test_set_output.py | 13 ++++ 5 files changed, 136 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 7b7c33cb71cfd..47ae277954758 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -62,7 +62,7 @@ Changes impacting all modules or :class:`compose.ColumnTransformer` also define a `set_output`. For details, see `SLEP018 `__. - :pr:`23734` by `Thomas Fan`_. + :pr:`23734` and :pr:`xxxxx` by `Thomas Fan`_. - |Enhancement| Finiteness checks (detection of NaN and infinite values) in all estimators are now significantly more efficient for float32 data by leveraging diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index db7a7016c83ab..f33a443ab0d9e 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -514,6 +514,23 @@ def get_feature_names_out(self, input_features=None): # No feature names return np.array([], dtype=object) + return self._add_prefix_for_feature_names_out( + transformer_with_feature_names_out + ) + + def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out): + """Add prefix for feature names out that includes the transformer names. + + Parameters + ---------- + transformer_with_feature_names_out : list of tuples of (str, array-like of str) + The tuple consists of the transformer's name and its feature names out. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ if self.verbose_feature_names_out: # Prefix the feature names out with the transformers name names = list( @@ -821,7 +838,26 @@ def _hstack(self, Xs): config = _get_output_config("transform", self) if config["dense"] == "pandas" and all(hasattr(X, "iloc") for X in Xs): pd = check_pandas_support("transform") - return pd.concat(Xs, axis=1) + output = pd.concat(Xs, axis=1) + + # If all transformers define `get_feature_names_out`, then transform + # will adjust the column names to be consistent with + # verbose_feature_names_out. Here we prefix the feature names if + # verbose_feature_names_out=True. + + if not self.verbose_feature_names_out: + return output + + transformer_names = [ + t[0] for t in self._iter(fitted=True, replace_strings=True) + ] + feature_names_outs = [X.columns for X in Xs] + names_out = self._add_prefix_for_feature_names_out( + list(zip(transformer_names, feature_names_outs)) + ) + output.columns = names_out + return output + return np.hstack(Xs) def _sk_visual_block_(self): diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index bade093b89c1f..e03722e5b480d 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2053,3 +2053,79 @@ def test_column_transform_set_output_after_fitting(remainder): } for col, dtype in X_trans_df.dtypes.items(): assert dtype == expected_dtypes[col] + + +# Transformer that does not define get_feature_names_out. This transformer +# always expects the input to be a DataFrame. 
+class PandasOutTransformer(BaseEstimator): + def __init__(self, offset=1.0): + self.offset = offset + + def fit(self, X, y=None): + pd = pytest.importorskip("pandas") + assert isinstance(X, pd.DataFrame) + return self + + def transform(self, X, y=None): + pd = pytest.importorskip("pandas") + assert isinstance(X, pd.DataFrame) + return X - self.offset + + def set_output(self, transform=None): + # This transformer will always output a DataFrame regardless of the + # configuration. + return self + + +@pytest.mark.parametrize( + "trans_1, expected_verbose_names, expected_non_verbose_names", + [ + ( + PandasOutTransformer(offset=2.0), + ["trans_0__feat1", "trans_1__feat0"], + ["feat1", "feat0"], + ), + ( + "drop", + ["trans_0__feat1"], + ["feat1"], + ), + ( + "passthrough", + ["trans_0__feat1", "trans_1__feat0"], + ["feat1", "feat0"], + ), + ], +) +def test_transformers_with_pandas_out_but_not_feature_names_out( + trans_1, expected_verbose_names, expected_non_verbose_names +): + """Check that set_output(transform="pandas") is compatible with more transformers. + + Specifically, if a transformer returns a DataFrame, but does not define + `get_feature_names_out`. 
+ """ + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]}) + ct = ColumnTransformer( + [ + ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]), + ("trans_1", trans_1, ["feat0"]), + ] + ) + X_trans_np = ct.fit_transform(X_df) + assert isinstance(X_trans_np, np.ndarray) + + # `ct` does not have `get_feature_names_out` + with pytest.raises(AttributeError, match="not provide get_feature_names_out"): + ct.get_feature_names_out() + + # The feature names are prefixed because verbose_feature_names_out=True is default + ct.set_output(transform="pandas") + X_trans_df0 = ct.fit_transform(X_df) + assert_array_equal(X_trans_df0.columns, expected_verbose_names) + + ct.set_params(verbose_feature_names_out=False) + X_trans_df1 = ct.fit_transform(X_df) + assert_array_equal(X_trans_df1.columns, expected_non_verbose_names) diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py index 525c6e0fe0118..be97c3734927f 100644 --- a/sklearn/utils/_set_output.py +++ b/sklearn/utils/_set_output.py @@ -27,9 +27,11 @@ def _wrap_in_pandas_container( columns : callable, ndarray, or None The column names or a callable that returns the column names. The callable is useful if the column names require some computation. - If `None` and `data_to_wrap` is already a dataframe, then the column - names are not changed. If `None` and `data_to_wrap` is **not** a - dataframe, then columns are `range(n_features)`. + If `columns` is a callable that raises an error, `columns` will have + the same semantics as `None`. If `None` and `data_to_wrap` is already a + dataframe, then the column names are not changed. If `None` and + `data_to_wrap` is **not** a dataframe, then columns are + `range(n_features)`. index : array-like, default=None Index for data. 
@@ -43,7 +45,10 @@ def _wrap_in_pandas_container( raise ValueError("Pandas output does not support sparse data.") if callable(columns): - columns = columns() + try: + columns = columns() + except Exception: + columns = None pd = check_pandas_support("Setting output container to 'pandas'") diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index d20a4634f885d..1158d3ad47707 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -199,3 +199,16 @@ def test_auto_wrap_output_keys_errors_with_incorrect_input(): class BadEstimator(_SetOutputMixin, auto_wrap_output_keys="bad_parameter"): pass + + +def test__wrap_in_pandas_container_column_errors(): + """If a callable `columns` errors, it has the same semantics as columns=None.""" + pd = pytest.importorskip("pandas") + + def get_columns(): + raise ValueError("No feature names defined") + + X_df = pd.DataFrame({"feat1": [1, 2, 3], "feat2": [3, 4, 5]}) + + X_wrapped = _wrap_in_pandas_container(X_df, columns=get_columns) + assert_array_equal(X_wrapped.columns, X_df.columns) From 1ac26b2e851594be9d33c80c0d2f6eddb4451e91 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 Oct 2022 18:02:38 -0400 Subject: [PATCH 2/7] DOC Adds PR number --- doc/whats_new/v1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 47ae277954758..1da5fa9d63756 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -62,7 +62,7 @@ Changes impacting all modules or :class:`compose.ColumnTransformer` also define a `set_output`. For details, see `SLEP018 `__. - :pr:`23734` and :pr:`xxxxx` by `Thomas Fan`_. + :pr:`23734` and :pr:`24699` by `Thomas Fan`_. 
- |Enhancement| Finiteness checks (detection of NaN and infinite values) in all estimators are now significantly more efficient for float32 data by leveraging From b16559088eb1d7d7244943f152d9e0e3a48db3f2 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 19 Oct 2022 14:37:55 -0400 Subject: [PATCH 3/7] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/compose/_column_transformer.py | 2 +- sklearn/compose/tests/test_column_transformer.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index f33a443ab0d9e..78c012ceec66a 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -528,7 +528,7 @@ def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out): Returns ------- - feature_names_out : ndarray of str objects + feature_names_out : ndarray of shape (n_features,), dtype=str Transformed feature names. """ if self.verbose_feature_names_out: diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index e03722e5b480d..74be853adf280 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2117,7 +2117,8 @@ def test_transformers_with_pandas_out_but_not_feature_names_out( X_trans_np = ct.fit_transform(X_df) assert isinstance(X_trans_np, np.ndarray) - # `ct` does not have `get_feature_names_out` + # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does define + # the method. with pytest.raises(AttributeError, match="not provide get_feature_names_out"): ct.get_feature_names_out() From 0fc62e6a4550632e8b23d2b910b501f175a18b35 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 19 Oct 2022 14:39:41 -0400 Subject: [PATCH 4/7] TST Adds test for NumPy arrays --- sklearn/utils/tests/test_set_output.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index 1158d3ad47707..86b6ccc57f1c3 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -212,3 +212,7 @@ def get_columns(): X_wrapped = _wrap_in_pandas_container(X_df, columns=get_columns) assert_array_equal(X_wrapped.columns, X_df.columns) + + X_np = np.asarray([[1, 3], [2, 4], [3, 5]]) + X_wrapped = _wrap_in_pandas_container(X_np, columns=get_columns) + assert_array_equal(X_wrapped.columns, [0, 1]) From 9317c08ec8383f20cfeec5e31151952a58f454cc Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 19 Oct 2022 14:40:19 -0400 Subject: [PATCH 5/7] STY Fixes style --- sklearn/compose/tests/test_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 74be853adf280..e0daca0b9c4fb 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2117,8 +2117,8 @@ def test_transformers_with_pandas_out_but_not_feature_names_out( X_trans_np = ct.fit_transform(X_df) assert isinstance(X_trans_np, np.ndarray) - # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does define - # the method. + # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does + # not define the method. with pytest.raises(AttributeError, match="not provide get_feature_names_out"): ct.get_feature_names_out() From abf9b5ef4c729e0f44b90d815b33cd423570ec9e Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 19 Oct 2022 14:41:09 -0400 Subject: [PATCH 6/7] DOC Minor rewording --- sklearn/compose/tests/test_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index e0daca0b9c4fb..9b8bcdb80fabe 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2055,8 +2055,8 @@ def test_column_transform_set_output_after_fitting(remainder): assert dtype == expected_dtypes[col] -# Transformer that does not define get_feature_names_out. This transformer -# always expects the input to be a DataFrame. +# PandasOutTransformer that does not define get_feature_names_out and always expects +# the input to be a DataFrame. class PandasOutTransformer(BaseEstimator): def __init__(self, offset=1.0): self.offset = offset From 2fb935f83649f7b385f8ebade15836a464127f51 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 19 Oct 2022 14:42:24 -0400 Subject: [PATCH 7/7] TST Have test depend on X_np directly --- sklearn/utils/tests/test_set_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index 86b6ccc57f1c3..cbc8ed8364865 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -215,4 +215,4 @@ def get_columns(): X_np = np.asarray([[1, 3], [2, 4], [3, 5]]) X_wrapped = _wrap_in_pandas_container(X_np, columns=get_columns) - assert_array_equal(X_wrapped.columns, [0, 1]) + assert_array_equal(X_wrapped.columns, range(X_np.shape[1]))