From 696fe53028ff498cf5270f29803359057f4ad8b2 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 May 2021 22:50:19 -0400 Subject: [PATCH 1/7] ENH ColumnTransformer.transform returns dataframes when transformers output them --- doc/whats_new/v1.0.rst | 3 + sklearn/compose/_column_transformer.py | 17 ++++ .../compose/tests/test_column_transformer.py | 90 ++++++++++++++++--- 3 files changed, 98 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 87b0441bade5f..c5d5e26aefdbf 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -174,6 +174,9 @@ Changelog :mod:`sklearn.compose` ...................... +- |Enhancement| :class:`compose.ColumnTransformer` returns a pandas DataFrame + when every transformer outputs a DataFrame in :meth:`transform`. + - |Enhancement| :class:`compose.ColumnTransformer` now records the output of each transformer in `output_indices_`. :pr:`18393` by :user:`Luca Bittarello `. diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 6c15b81be98c2..d4ffe48412681 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -7,6 +7,7 @@ # Joris Van den Bossche # License: BSD from itertools import chain +import warnings import numpy as np from scipy import sparse @@ -636,6 +637,22 @@ def _hstack(self, Xs): ---------- Xs : list of {array-like, sparse matrix, dataframe} """ + if all(hasattr(X, "iloc") for X in Xs): + # All output from transformers are pandas DataFrames + import pandas as pd + + if any(not Xs[0].index.equals(X.index) for X in Xs[1:]): + warnings.warn( + "The DataFrame's indexes for each transformer do not " + "match. A numpy array or a sparse matrix will be " + "returned.") + return self._hstack_np(Xs) + return pd.concat(Xs, axis=1) + + return self._hstack_np(Xs) + + def _hstack_np(self, Xs): + """Stack Xs horizontally.""" if self.sparse_output_: try: # since all columns should be numeric before stacking them diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 9278d67296ec5..e244e463fb9f9 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -15,6 +15,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.base import BaseEstimator +from sklearn.base import TransformerMixin from sklearn.compose import ( ColumnTransformer, make_column_transformer, make_column_selector ) @@ -374,12 +375,12 @@ def test_column_transformer_output_indices_df(): assert ct.output_indices_ == {'trans1': slice(0, 1), 'trans2': slice(1, 2), 'remainder': slice(0, 0)} - assert_array_equal(X_trans[:, [0]], - X_trans[:, ct.output_indices_['trans1']]) - assert_array_equal(X_trans[:, [1]], - X_trans[:, ct.output_indices_['trans2']]) - assert_array_equal(X_trans[:, []], - X_trans[:, ct.output_indices_['remainder']]) + assert_array_equal(X_trans.iloc[:, [0]], + X_trans.iloc[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans.iloc[:, [1]], + X_trans.iloc[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans.iloc[:, []], + X_trans.iloc[:, ct.output_indices_['remainder']]) ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) @@ -387,12 +388,12 @@ def test_column_transformer_output_indices_df(): assert ct.output_indices_ == {'trans1': slice(0, 1), 'trans2': slice(1, 2), 'remainder': slice(0, 0)} - assert_array_equal(X_trans[:, [0]], - X_trans[:, ct.output_indices_['trans1']]) - assert_array_equal(X_trans[:, [1]], - X_trans[:, ct.output_indices_['trans2']]) - assert_array_equal(X_trans[:, []], - X_trans[:, ct.output_indices_['remainder']]) + assert_array_equal(X_trans.iloc[:, [0]], + X_trans.iloc[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans.iloc[:, [1]], + X_trans.iloc[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans.iloc[:, []], + X_trans.iloc[:, ct.output_indices_['remainder']]) def test_column_transformer_sparse_array(): @@ -1521,3 +1522,68 @@ def test_get_feature_names_empty_selection(selector): ct = ColumnTransformer([('ohe', OneHotEncoder(drop='first'), selector)]) ct.fit([[1, 2], [3, 4]]) assert ct.get_feature_names() == [] + + +class DataFrameTransformer(TransformerMixin, BaseEstimator): + def __init__(self, prefix="X", index_start=0, reverse_index=False): + self.prefix = prefix + self.index_start = index_start + self.reverse_index = reverse_index + + def fit(self, X, y=None): + return self + + def transform(self, X): + import pandas as pd + n_samples, n_features = X.shape + columns = [f"{self.prefix}{i}" for i in range(n_features)] + index = [i + self.index_start for i in range(n_samples)] + if self.reverse_index: + index = index[::-1] + + return pd.DataFrame(X.to_numpy(), columns=columns, index=index) + + +def test_pandas_index_aligned_hstack(): + """When transformers all output dataframes, ColumnTransformer will + output dataframes.""" + pd = pytest.importorskip("pandas") + X = pd.DataFrame({"X": [1, 0, 3], "Y": [4, 5, 0]}, columns=["X", "Y"]) + + ct = ColumnTransformer([ + ("first", + DataFrameTransformer(prefix="first", index_start=1), ["X"]), + ("second", + DataFrameTransformer(prefix="second", index_start=1), ["X", "Y"]) + ]).fit(X) + + X_expected = pd.DataFrame({ + "first0": [1, 0, 3], + "second0": [1, 0, 3], + "second1": [4, 5, 0], + }, columns=["first0", "second0", "second1"], index=[1, 2, 3]) + + X_trans = ct.transform(X) + pd.testing.assert_frame_equal(X_trans, X_expected) + + +@pytest.mark.parametrize("first_kwargs", [ + {"index_start": 2}, {"reverse_index": True}]) +def test_pandas_index_not_aligned_warns(first_kwargs): + """When transformers all output dataframes, ColumnTransformer will + warn and output numpy arrays.""" + pd = pytest.importorskip("pandas") + X = pd.DataFrame({"X": [1, 0, 3], "Y": [4, 5, 0]}, columns=["X", "Y"]) + + ct = ColumnTransformer([ + ("first", DataFrameTransformer(prefix="first", **first_kwargs), ["X"]), + ("second", DataFrameTransformer(prefix="second"), ["X", "Y"]) + ]) + + msg = "The DataFrame's indexes for each transformer do not match" + with pytest.warns(UserWarning, match=msg): + X_trans = ct.fit_transform(X) + + assert isinstance(X_trans, np.ndarray) + X_expected = np.array([[1, 1, 4], [0, 0, 5], [3, 3, 0]]) + assert_array_equal(X_trans, X_expected) From 0c69145b437dd857f4e867066fd5893170bc234b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 May 2021 22:51:57 -0400 Subject: [PATCH 2/7] DOC Adds whats new with PR number --- doc/whats_new/v1.0.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index c5d5e26aefdbf..cd17a9e292d49 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -174,8 +174,9 @@ Changelog :mod:`sklearn.compose` ...................... -- |Enhancement| :class:`compose.ColumnTransformer` returns a pandas DataFrame - when every transformer outputs a DataFrame in :meth:`transform`. +- |Enhancement| :meth:`compose.ColumnTransformer.transform` returns a pandas + DataFrame when every transformer outputs a DataFrame from :meth:`transform`. + :pr:`20110` by `Thomas Fan`_. - |Enhancement| :class:`compose.ColumnTransformer` now records the output of each transformer in `output_indices_`. :pr:`18393` by From 1334bcb2ee0d749e8ca9839ef9fa9722c4c61f4c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 May 2021 22:54:59 -0400 Subject: [PATCH 3/7] CLN Move pandas till later --- sklearn/compose/_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index d4ffe48412681..115bfec50a830 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -639,14 +639,14 @@ def _hstack(self, Xs): """ if all(hasattr(X, "iloc") for X in Xs): # All output from transformers are pandas DataFrames - import pandas as pd - if any(not Xs[0].index.equals(X.index) for X in Xs[1:]): warnings.warn( "The DataFrame's indexes for each transformer do not " "match. A numpy array or a sparse matrix will be " "returned.") return self._hstack_np(Xs) + + import pandas as pd return pd.concat(Xs, axis=1) return self._hstack_np(Xs) From c37ddb7a6ef4cdd6c163314168f841ff3c1544e1 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 May 2021 22:58:46 -0400 Subject: [PATCH 4/7] DOC Adds docstring --- sklearn/compose/_column_transformer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 115bfec50a830..a69fe392e2572 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -633,12 +633,15 @@ def _hstack(self, Xs): This allows subclasses to control the stacking behavior, while reusing everything else from ColumnTransformer. + If Xs are all DataFrames and their indexes are the same then a + DataFrame is returned. + Parameters ---------- Xs : list of {array-like, sparse matrix, dataframe} """ + # Check that all output from transformers are pandas DataFrames if all(hasattr(X, "iloc") for X in Xs): - # All output from transformers are pandas DataFrames if any(not Xs[0].index.equals(X.index) for X in Xs[1:]): warnings.warn( "The DataFrame's indexes for each transformer do not " From 59ad23243e4442254591750538548a946efdda1b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 May 2021 22:59:26 -0400 Subject: [PATCH 5/7] DOC Adds docstring to _hstack_np --- sklearn/compose/_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index a69fe392e2572..9a02fe5c1bc87 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -655,7 +655,7 @@ def _hstack(self, Xs): return self._hstack_np(Xs) def _hstack_np(self, Xs): - """Stack Xs horizontally.""" + """Stack Xs horizontally into a numpy array or sparse matrix.""" if self.sparse_output_: try: # since all columns should be numeric before stacking them From d6f7a1e4a3d707755402e8f9c1b157091c765542 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 11 Jun 2021 09:18:31 -0400 Subject: [PATCH 6/7] TST Fixes test --- sklearn/compose/tests/test_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f4f40222811e9..4e70ad16724a6 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1548,7 +1548,7 @@ def transform(self, X): if self.reverse_index: index = index[::-1] - return pd.DataFrame(X.to_numpy(), columns=columns, index=index) + return pd.DataFrame(X.values, columns=columns, index=index) def test_pandas_index_aligned_hstack(): From 65137553d922c6b74f8fd2e2517e5534f7466d39 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 25 Jul 2021 08:02:04 -0400 Subject: [PATCH 7/7] TST Adds test for sparse output --- sklearn/compose/_column_transformer.py | 5 ++- .../compose/tests/test_column_transformer.py | 35 ++++++++++++++++--- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 7d79932a7c5f3..7248fa10fd1e0 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -667,10 +667,13 @@ def _hstack(self, Xs): warnings.warn( "The DataFrame's indexes for each transformer do not " "match. A numpy array or a sparse matrix will be " - "returned.") + "returned." + ) return self._hstack_np(Xs) import pandas as pd + + print([X.dtypes for X in Xs]) return pd.concat(Xs, axis=1) return self._hstack_np(Xs) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 85786651e1406..da3131f07a676 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1618,10 +1618,13 @@ def test_get_feature_names_empty_selection(selector): class DataFrameTransformer(TransformerMixin, BaseEstimator): - def __init__(self, prefix="X", index_start=0, reverse_index=False): + def __init__( + self, prefix="X", index_start=0, reverse_index=False, sparse_output=False + ): self.prefix = prefix self.index_start = index_start self.reverse_index = reverse_index + self.sparse_output = sparse_output def fit(self, X, y=None): return self @@ -1635,10 +1638,15 @@ def transform(self, X): if self.reverse_index: index = index[::-1] + if self.sparse_output: + return pd.DataFrame.sparse.from_spmatrix( + sparse.csr_matrix(X.values), columns=columns, index=index + ) return pd.DataFrame(X.values, columns=columns, index=index) -def test_pandas_index_aligned_hstack(): +@pytest.mark.parametrize("first_sparse_output", [True, False]) +def test_pandas_index_aligned_hstack(first_sparse_output): """When transformers all output dataframes, ColumnTransformer will output dataframes.""" pd = pytest.importorskip("pandas") @@ -1646,7 +1654,15 @@ def test_pandas_index_aligned_hstack(): ct = ColumnTransformer( [ - ("first", DataFrameTransformer(prefix="first", index_start=1), ["X"]), + ( + "first", + DataFrameTransformer( + prefix="first", + index_start=1, + sparse_output=first_sparse_output, + ), + ["X"], + ), ( "second", DataFrameTransformer(prefix="second", index_start=1), @@ -1664,13 +1680,16 @@ def test_pandas_index_aligned_hstack(): columns=["first0", "second0", "second1"], index=[1, 2, 3], ) + if first_sparse_output: + X_expected["first0"] = X_expected["first0"].astype(pd.SparseDtype("int", 0)) X_trans = ct.transform(X) pd.testing.assert_frame_equal(X_trans, X_expected) @pytest.mark.parametrize("first_kwargs", [{"index_start": 2}, {"reverse_index": True}]) -def test_pandas_index_not_aligned_warns(first_kwargs): +@pytest.mark.parametrize("first_sparse_output", [True, False]) +def test_pandas_index_not_aligned_warns(first_kwargs, first_sparse_output): """When transformers all output dataframes, ColumnTransformer will warn and output numpy arrays.""" pd = pytest.importorskip("pandas") @@ -1678,7 +1697,13 @@ def test_pandas_index_not_aligned_warns(first_kwargs): ct = ColumnTransformer( [ - ("first", DataFrameTransformer(prefix="first", **first_kwargs), ["X"]), + ( + "first", + DataFrameTransformer( + prefix="first", sparse_output=first_sparse_output, **first_kwargs + ), + ["X"], + ), ("second", DataFrameTransformer(prefix="second"), ["X", "Y"]), ] )