From ddacaed396f91c1e1812f9e91c9eee53be27cf5a Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Fri, 14 Sep 2018 15:21:43 -0400 Subject: [PATCH 1/8] added solution by Joris and testcases --- sklearn/compose/_column_transformer.py | 4 ++++ .../compose/tests/test_column_transformer.py | 21 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index e09d2d09d7e43..376c288b1f8d0 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -234,6 +234,8 @@ def _iter(self, X=None, fitted=False, replace_strings=False): check_inverse=False) elif trans == 'drop': continue + elif hasattr(column, '__len__') and len(column) == 0: + continue yield (name, trans, sub, get_weight(name)) @@ -335,6 +337,8 @@ def _update_fitted_transformers(self, transformers): # so get next transformer, but save original string next(transformers) trans = 'passthrough' + elif hasattr(column, '__len__') and len(column) == 0: + trans = old else: trans = next(transformers) transformers_.append((name, trans, column)) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f67806a52c543..c0f40003f1070 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -111,6 +111,20 @@ def test_column_transformer(): assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 + # test case that ensures that the column transformer does also work when + # a given transformer doesn't have any columns to work on + ct = ColumnTransformer([('trans1', Trans(), [0, 1]), + ('trans2', Trans(), [])]) + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + + ct = ColumnTransformer([('trans1', Trans(), []), + ('trans2', Trans(), [0, 1])]) + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + # test with transformer_weights transformer_weights = {'trans1': .1, 'trans2': 10} both = ColumnTransformer([('trans1', Trans(), [0]), @@ -185,6 +199,13 @@ def test_column_transformer_dataframe(): assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] != 'remainder' + ct = ColumnTransformer([('trans1', Trans(), ['first', 'second']), + ('trans2', Trans(), [])]) + assert_array_equal(ct.fit_transform(X_df), X_res_both) + assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != 'remainder' + ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) assert_array_equal(ct.fit_transform(X_df), X_res_both) From c98a6c10f8a8d5a7d58df14d3af850c4eb0e7798 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Mon, 17 Sep 2018 06:01:47 -0400 Subject: [PATCH 2/8] incorporated comments by Roman and Joel --- sklearn/compose/_column_transformer.py | 8 ++-- .../compose/tests/test_column_transformer.py | 46 +++++++++++++------ 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 376c288b1f8d0..61f6a427a9920 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -109,8 +109,10 @@ class ColumnTransformer(_BaseComposition, TransformerMixin): transformers_ : list The collection of fitted transformers as tuples of (name, fitted_transformer, column). `fitted_transformer` can be an - estimator, 'drop', or 'passthrough'. If there are remaining columns, - the final element is a tuple of the form: + estimator, 'drop', or 'passthrough'. Note that the column list is + allowed to be empty. In that case the transformers will not be fitted. + If there are remaining columns, the final element is a tuple of the + form: ('remainder', transformer, remaining_columns) corresponding to the ``remainder`` parameter. If there are remaining columns, then ``len(transformers_)==len(transformers)+1``, otherwise @@ -338,7 +340,7 @@ def _update_fitted_transformers(self, transformers): next(transformers) trans = 'passthrough' elif hasattr(column, '__len__') and len(column) == 0: - trans = old + trans = 'empty' else: trans = next(transformers) transformers_.append((name, trans, column)) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index c0f40003f1070..476c4e23e7ad4 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -111,20 +111,6 @@ def test_column_transformer(): assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 - # test case that ensures that the column transformer does also work when - # a given transformer doesn't have any columns to work on - ct = ColumnTransformer([('trans1', Trans(), [0, 1]), - ('trans2', Trans(), [])]) - assert_array_equal(ct.fit_transform(X_array), X_res_both) - assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) - assert len(ct.transformers_) == 2 - - ct = ColumnTransformer([('trans1', Trans(), []), - ('trans2', Trans(), [0, 1])]) - assert_array_equal(ct.fit_transform(X_array), X_res_both) - assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) - assert len(ct.transformers_) == 2 - # test with transformer_weights transformer_weights = {'trans1': .1, 'trans2': 10} both = ColumnTransformer([('trans1', Trans(), [0]), @@ -272,6 +258,38 @@ def transform(self, X, y=None): assert_array_equal(ct.transformers_[-1][2], [1]) +def test_column_transformer_empty_columns(): + # test case that ensures that the column transformer does also work when + # a given transformer doesn't have any columns to work on + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_both = X_array + + ct = ColumnTransformer([('trans1', Trans(), [0, 1]), + ('trans2', Trans(), [])]) + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + + ct = ColumnTransformer([('trans1', Trans(), []), + ('trans2', Trans(), [0, 1])]) + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + + ct = ColumnTransformer([('trans', Trans(), [])], + remainder='passthrough') + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 # including remainder + + fixture = np.array([[],[],[]]) + ct = ColumnTransformer([('trans', Trans(), [])], + remainder='drop') + assert_array_equal(ct.fit_transform(X_array), fixture) + assert_array_equal(ct.fit(X_array).transform(X_array), fixture) + assert len(ct.transformers_) == 2 # including remainder + + def test_column_transformer_sparse_array(): X_sparse = sparse.eye(3, 2).tocsr() From afab865e4bd728c41e6a511867a44880dab2ef60 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Wed, 19 Sep 2018 07:05:59 -0400 Subject: [PATCH 3/8] fix pep8 problems --- sklearn/compose/tests/test_column_transformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 476c4e23e7ad4..421fb248be2a5 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -280,14 +280,14 @@ def test_column_transformer_empty_columns(): remainder='passthrough') assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) - assert len(ct.transformers_) == 2 # including remainder + assert len(ct.transformers_) == 2 # including remainder - fixture = np.array([[],[],[]]) + fixture = np.array([[], [], []]) ct = ColumnTransformer([('trans', Trans(), [])], remainder='drop') assert_array_equal(ct.fit_transform(X_array), fixture) assert_array_equal(ct.fit(X_array).transform(X_array), fixture) - assert len(ct.transformers_) == 2 # including remainder + assert len(ct.transformers_) == 2 # including remainder def test_column_transformer_sparse_array(): From faaa9a38528dc2b4bca1a9b5da6f0b77f0586a47 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Fri, 21 Sep 2018 09:22:56 -0400 Subject: [PATCH 4/8] improved doc string --- sklearn/compose/_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 6fc7fbd5e2d61..c6a9705e67d6b 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -109,8 +109,8 @@ class ColumnTransformer(_BaseComposition, TransformerMixin): transformers_ : list The collection of fitted transformers as tuples of (name, fitted_transformer, column). `fitted_transformer` can be an - estimator, 'drop', or 'passthrough'. Note that the column list is - allowed to be empty. In that case the transformers will not be fitted. + estimator, 'drop', or 'passthrough'. In case there were no columns + selected, this string 'empty' will stand in place of the transformer. If there are remaining columns, the final element is a tuple of the form: ('remainder', transformer, remaining_columns) corresponding to the From 3b7f01a32a8a8ca2e7e96314140c58f14338e662 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 24 Sep 2018 13:43:35 +0200 Subject: [PATCH 5/8] change 'empty' to unfitted transformer --- sklearn/compose/_column_transformer.py | 4 ++-- sklearn/compose/tests/test_column_transformer.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c6a9705e67d6b..9b0a5d5fb62a2 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -110,7 +110,7 @@ class ColumnTransformer(_BaseComposition, TransformerMixin): The collection of fitted transformers as tuples of (name, fitted_transformer, column). `fitted_transformer` can be an estimator, 'drop', or 'passthrough'. In case there were no columns - selected, this string 'empty' will stand in place of the transformer. + selected, this will be the unfitted transformer. If there are remaining columns, the final element is a tuple of the form: ('remainder', transformer, remaining_columns) corresponding to the @@ -356,7 +356,7 @@ def _update_fitted_transformers(self, transformers): next(fitted_transformers) trans = 'passthrough' elif hasattr(column, '__len__') and len(column) == 0: - trans = 'empty' + trans = old else: trans = next(fitted_transformers) transformers_.append((name, trans, column)) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 9a5932a7e7502..a7592e9e3f49c 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -269,18 +269,21 @@ def test_column_transformer_empty_columns(): assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 + assert isinstance(ct.transformers_[1][1], Trans) ct = ColumnTransformer([('trans1', Trans(), []), ('trans2', Trans(), [0, 1])]) assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 + assert isinstance(ct.transformers_[0][1], Trans) ct = ColumnTransformer([('trans', Trans(), [])], remainder='passthrough') assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 # including remainder + assert isinstance(ct.transformers_[0][1], Trans) fixture = np.array([[], [], []]) ct = ColumnTransformer([('trans', Trans(), [])], @@ -288,6 +291,7 @@ def test_column_transformer_empty_columns(): assert_array_equal(ct.fit_transform(X_array), fixture) assert_array_equal(ct.fit(X_array).transform(X_array), fixture) assert len(ct.transformers_) == 2 # including remainder + assert isinstance(ct.transformers_[0][1], Trans) def test_column_transformer_sparse_array(): From 7e70d1671094e6814b8c8b147689a375a0aaec07 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 24 Sep 2018 14:00:34 +0200 Subject: [PATCH 6/8] also support all-False boolean arrays --- sklearn/compose/_column_transformer.py | 18 ++++++++++++++++-- .../compose/tests/test_column_transformer.py | 12 +++++++----- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 9b0a5d5fb62a2..b7be3c8ef1455 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -245,7 +245,7 @@ def _iter(self, fitted=False, replace_strings=False): check_inverse=False) elif trans == 'drop': continue - elif hasattr(column, '__len__') and len(column) == 0: + elif _is_empty_column_selection(column): continue yield (name, trans, column, get_weight(name)) @@ -355,7 +355,7 @@ def _update_fitted_transformers(self, transformers): # so get next transformer, but save original string next(fitted_transformers) trans = 'passthrough' - elif hasattr(column, '__len__') and len(column) == 0: + elif _is_empty_column_selection(column): trans = old else: trans = next(fitted_transformers) @@ -650,6 +650,20 @@ def _get_column_indices(X, key): "strings, or boolean mask is allowed") +def _is_empty_column_selection(column): + """ + Return True if the column selection is empty (empty list or all-False + boolean array). + + """ + if hasattr(column, 'dtype') and np.issubdtype(column.dtype, np.bool_): + return not column.any() + elif hasattr(column, '__len__'): + return len(column) == 0 + else: + return False + + def _get_transformer_list(estimators): """ Construct (name, trans, column) tuples from list diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index a7592e9e3f49c..2aaaa40c82126 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -258,27 +258,29 @@ def transform(self, X, y=None): assert_array_equal(ct.transformers_[-1][2], [1]) -def test_column_transformer_empty_columns(): +@pytest.mark.parametrize("column", [[], np.array([False, False])], + ids=['list', 'bool']) +def test_column_transformer_empty_columns(column): # test case that ensures that the column transformer does also work when # a given transformer doesn't have any columns to work on X_array = np.array([[0, 1, 2], [2, 4, 6]]).T X_res_both = X_array ct = ColumnTransformer([('trans1', Trans(), [0, 1]), - ('trans2', Trans(), [])]) + ('trans2', Trans(), column)]) assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 assert isinstance(ct.transformers_[1][1], Trans) - ct = ColumnTransformer([('trans1', Trans(), []), + ct = ColumnTransformer([('trans1', Trans(), column), ('trans2', Trans(), [0, 1])]) assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 assert isinstance(ct.transformers_[0][1], Trans) - ct = ColumnTransformer([('trans', Trans(), [])], + ct = ColumnTransformer([('trans', Trans(), column)], remainder='passthrough') assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) @@ -286,7 +288,7 @@ def test_column_transformer_empty_columns(): assert isinstance(ct.transformers_[0][1], Trans) fixture = np.array([[], [], []]) - ct = ColumnTransformer([('trans', Trans(), [])], + ct = ColumnTransformer([('trans', Trans(), column)], remainder='drop') assert_array_equal(ct.fit_transform(X_array), fixture) assert_array_equal(ct.fit(X_array).transform(X_array), fixture) From 3c281ea30ebe1f66ec08b17abc586cfda18a0fda Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 24 Sep 2018 14:29:23 +0200 Subject: [PATCH 7/8] combine basic and pandas test with parametrization --- .../compose/tests/test_column_transformer.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 2aaaa40c82126..7c12bc0f2a7ed 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -185,13 +185,6 @@ def test_column_transformer_dataframe(): assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] != 'remainder' - ct = ColumnTransformer([('trans1', Trans(), ['first', 'second']), - ('trans2', Trans(), [])]) - assert_array_equal(ct.fit_transform(X_df), X_res_both) - assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) - assert len(ct.transformers_) == 2 - assert ct.transformers_[-1][0] != 'remainder' - ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) assert_array_equal(ct.fit_transform(X_df), X_res_both) @@ -258,40 +251,47 @@ def transform(self, X, y=None): assert_array_equal(ct.transformers_[-1][2], [1]) +@pytest.mark.parametrize("pandas", [True, False], ids=['pandas', 'numpy']) @pytest.mark.parametrize("column", [[], np.array([False, False])], ids=['list', 'bool']) -def test_column_transformer_empty_columns(column): +def test_column_transformer_empty_columns(pandas, column): # test case that ensures that the column transformer does also work when # a given transformer doesn't have any columns to work on X_array = np.array([[0, 1, 2], [2, 4, 6]]).T X_res_both = X_array + if pandas: + pd = pytest.importorskip('pandas') + X = pd.DataFrame(X_array, columns=['first', 'second']) + else: + X = X_array + ct = ColumnTransformer([('trans1', Trans(), [0, 1]), ('trans2', Trans(), column)]) - assert_array_equal(ct.fit_transform(X_array), X_res_both) - assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert_array_equal(ct.fit_transform(X), X_res_both) + assert_array_equal(ct.fit(X).transform(X), X_res_both) assert len(ct.transformers_) == 2 assert isinstance(ct.transformers_[1][1], Trans) ct = ColumnTransformer([('trans1', Trans(), column), ('trans2', Trans(), [0, 1])]) - assert_array_equal(ct.fit_transform(X_array), X_res_both) - assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert_array_equal(ct.fit_transform(X), X_res_both) + assert_array_equal(ct.fit(X).transform(X), X_res_both) assert len(ct.transformers_) == 2 assert isinstance(ct.transformers_[0][1], Trans) ct = ColumnTransformer([('trans', Trans(), column)], remainder='passthrough') - assert_array_equal(ct.fit_transform(X_array), X_res_both) - assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert_array_equal(ct.fit_transform(X), X_res_both) + assert_array_equal(ct.fit(X).transform(X), X_res_both) assert len(ct.transformers_) == 2 # including remainder assert isinstance(ct.transformers_[0][1], Trans) fixture = np.array([[], [], []]) ct = ColumnTransformer([('trans', Trans(), column)], remainder='drop') - assert_array_equal(ct.fit_transform(X_array), fixture) - assert_array_equal(ct.fit(X_array).transform(X_array), fixture) + assert_array_equal(ct.fit_transform(X), fixture) + assert_array_equal(ct.fit(X).transform(X), fixture) assert len(ct.transformers_) == 2 # including remainder assert isinstance(ct.transformers_[0][1], Trans) From 7a5ccafecfec07d57f2faa1b8a4ec66014ec9806 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 25 Sep 2018 00:01:43 +0200 Subject: [PATCH 8/8] fix pep8 --- sklearn/compose/tests/test_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 7c12bc0f2a7ed..e7ab656abb026 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -262,7 +262,7 @@ def test_column_transformer_empty_columns(pandas, column): if pandas: pd = pytest.importorskip('pandas') - X = pd.DataFrame(X_array, columns=['first', 'second']) + X = pd.DataFrame(X_array, columns=['first', 'second']) else: X = X_array