From b70639c9483113c5de0139a1c157ebb30502795b Mon Sep 17 00:00:00 2001 From: lbittarello Date: Sun, 13 Sep 2020 16:54:13 +0100 Subject: [PATCH 01/26] Record output of transformers in ColumnTransformer --- sklearn/compose/_column_transformer.py | 19 +++++++++++++++++ .../compose/tests/test_column_transformer.py | 21 +++++++++++++++---- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 780ce46433b90..fcb61be5af792 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -136,6 +136,10 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): sparse matrix or a dense numpy array, which depends on the output of the individual transformers and the `sparse_threshold` keyword. + transformers_output_ : dict + A dictionary from transformer names to column indices, recording the + columns produced by each transformer (if any). + Notes ----- The order of the columns in the transformed feature matrix follows the @@ -417,6 +421,20 @@ def _validate_output(self, result): "The output of the '{0}' transformer should be 2D (scipy " "matrix, array, or pandas DataFrame).".format(name)) + def _index_output(self, Xs): + """ + Record which transformer produced which column. + """ + idx_ = 0 + self.transformers_output_ = {} + for idx, (name, trans, _, _) in enumerate( + self._iter(fitted=True, replace_strings=True) + ): + n_columns = Xs[idx].shape[1] + self.transformers_output_[name] = list(range(idx_, idx_ + n_columns)) + idx_ += n_columns + return + def _validate_features(self, n_features, feature_names): """Ensures feature counts and names are the same during fit and transform. 
@@ -555,6 +573,7 @@ def fit_transform(self, X, y=None): self._update_fitted_transformers(transformers) self._validate_output(Xs) + self._index_output(Xs) return self._hstack(list(Xs)) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 4e58769e244c7..b4ae02123913f 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -115,6 +115,7 @@ def test_column_transformer(): assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 + assert ct.transformers_output_ == {'trans1': [0], 'trans2': [1]} # test with transformer_weights transformer_weights = {'trans1': .1, 'trans2': 10} @@ -126,12 +127,14 @@ def test_column_transformer(): assert_array_equal(both.fit_transform(X_array), res) assert_array_equal(both.fit(X_array).transform(X_array), res) assert len(both.transformers_) == 2 + assert ct.transformers_output_ == {'trans1': [0], 'trans2': [1]} both = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both) assert_array_equal(both.fit(X_array).transform(X_array), 0.1 * X_res_both) assert len(both.transformers_) == 1 + assert both.transformers_output_ == {'trans': [0, 1]} def test_column_transformer_dataframe(): @@ -190,6 +193,7 @@ def test_column_transformer_dataframe(): assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] != 'remainder' + assert ct.transformers_output_ == {'trans1': [0], 'trans2': [1]} ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) @@ -197,6 +201,7 @@ def test_column_transformer_dataframe(): assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] != 'remainder' + 
assert ct.transformers_output_ == {'trans1': [0], 'trans2': [1]} # test with transformer_weights transformer_weights = {'trans1': .1, 'trans2': 10} @@ -208,7 +213,8 @@ def test_column_transformer_dataframe(): assert_array_equal(both.fit_transform(X_df), res) assert_array_equal(both.fit(X_df).transform(X_df), res) assert len(both.transformers_) == 2 - assert ct.transformers_[-1][0] != 'remainder' + assert both.transformers_[-1][0] != 'remainder' + assert both.transformers_output_ == {'trans1': [0], 'trans2': [1]} # test multiple columns both = ColumnTransformer([('trans', Trans(), ['first', 'second'])], @@ -216,16 +222,18 @@ def test_column_transformer_dataframe(): assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both) assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) assert len(both.transformers_) == 1 - assert ct.transformers_[-1][0] != 'remainder' + assert both.transformers_[-1][0] != 'remainder' + assert both.transformers_output_ == {'trans': [0, 1]} both = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both) assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) assert len(both.transformers_) == 1 - assert ct.transformers_[-1][0] != 'remainder' + assert both.transformers_[-1][0] != 'remainder' + assert both.transformers_output_ == {'trans': [0, 1]} - # ensure pandas object is passes through + # ensure pandas object is passed through class TransAssert(BaseEstimator): @@ -286,6 +294,7 @@ def test_column_transformer_empty_columns(pandas, column_selection, assert_array_equal(ct.fit(X).transform(X), X_res_both) assert len(ct.transformers_) == 2 assert isinstance(ct.transformers_[1][1], TransRaise) + assert ct.transformers_output_ == {'trans1': [0, 1]} ct = ColumnTransformer([('trans1', TransRaise(), column), ('trans2', Trans(), [0, 1])]) @@ -293,6 +302,7 @@ def test_column_transformer_empty_columns(pandas, column_selection, 
assert_array_equal(ct.fit(X).transform(X), X_res_both) assert len(ct.transformers_) == 2 assert isinstance(ct.transformers_[0][1], TransRaise) + assert ct.transformers_output_ == {'trans2': [0, 1]} ct = ColumnTransformer([('trans', TransRaise(), column)], remainder='passthrough') @@ -300,6 +310,7 @@ def test_column_transformer_empty_columns(pandas, column_selection, assert_array_equal(ct.fit(X).transform(X), X_res_both) assert len(ct.transformers_) == 2 # including remainder assert isinstance(ct.transformers_[0][1], TransRaise) + assert ct.transformers_output_ == {'remainder': [0, 1]} fixture = np.array([[], [], []]) ct = ColumnTransformer([('trans', TransRaise(), column)], @@ -355,6 +366,8 @@ def test_column_transformer_list(): assert_array_equal(ct.fit_transform(X_list), expected_result) assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) + assert ct.transformers_output_ == {'numerical': [0, 1], 'categorical': [2, 3]} + def test_column_transformer_sparse_stacking(): X_array = np.array([[0, 1, 2], [2, 4, 6]]).T From 80a9b14f5036efc61932c46ad370bd1aa7d3b3fc Mon Sep 17 00:00:00 2001 From: lbittarello Date: Sun, 13 Sep 2020 17:06:34 +0100 Subject: [PATCH 02/26] Shorten lines --- sklearn/compose/_column_transformer.py | 4 +++- sklearn/compose/tests/test_column_transformer.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index fcb61be5af792..1417341083e4d 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -431,7 +431,9 @@ def _index_output(self, Xs): self._iter(fitted=True, replace_strings=True) ): n_columns = Xs[idx].shape[1] - self.transformers_output_[name] = list(range(idx_, idx_ + n_columns)) + self.transformers_output_[name] = list( + range(idx_, idx_ + n_columns) + ) idx_ += n_columns return diff --git a/sklearn/compose/tests/test_column_transformer.py 
b/sklearn/compose/tests/test_column_transformer.py index b4ae02123913f..e3b76ba7fdb8d 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -366,7 +366,9 @@ def test_column_transformer_list(): assert_array_equal(ct.fit_transform(X_list), expected_result) assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) - assert ct.transformers_output_ == {'numerical': [0, 1], 'categorical': [2, 3]} + assert ct.transformers_output_ == { + 'numerical': [0, 1], 'categorical': [2, 3] + } def test_column_transformer_sparse_stacking(): From 5ce664694cfcb5fdf6920b2ea4017009aef13416 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Sun, 13 Sep 2020 18:50:50 +0200 Subject: [PATCH 03/26] Control variable 'trans' not used within the loop --- sklearn/compose/_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 1417341083e4d..d35240d004224 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -427,7 +427,7 @@ def _index_output(self, Xs): """ idx_ = 0 self.transformers_output_ = {} - for idx, (name, trans, _, _) in enumerate( + for idx, (name, _, _, _) in enumerate( self._iter(fitted=True, replace_strings=True) ): n_columns = Xs[idx].shape[1] From fbc374f21f43ef54fc5af18c88268df741ef5d22 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Fri, 25 Sep 2020 08:38:52 +0100 Subject: [PATCH 04/26] Use slices as values instead of lists --- sklearn/compose/_column_transformer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index d35240d004224..f88cd59da6bd9 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -431,9 +431,7 @@ def _index_output(self, Xs): 
self._iter(fitted=True, replace_strings=True) ): n_columns = Xs[idx].shape[1] - self.transformers_output_[name] = list( - range(idx_, idx_ + n_columns) - ) + self.transformers_output_[name] = slice(idx_, idx_ + n_columns) idx_ += n_columns return From 6e86e807d1fc808f52997fc22caed8c8d8207bdd Mon Sep 17 00:00:00 2001 From: lbittarello Date: Fri, 25 Sep 2020 08:39:41 +0100 Subject: [PATCH 05/26] Clarify docstring (hopefully) --- sklearn/compose/_column_transformer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index f88cd59da6bd9..1a1342acf2ec6 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -137,8 +137,9 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): of the individual transformers and the `sparse_threshold` keyword. transformers_output_ : dict - A dictionary from transformer names to column indices, recording the - columns produced by each transformer (if any). + A dictionary from transformer names to slices to track the results + of each transformer. The slices are the positions in the stacked + results of the output of each particular step in `transformers`. 
Notes ----- From 9c789de248f27d5cbdefdeaac85aef376ef82708 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Fri, 25 Sep 2020 08:41:09 +0100 Subject: [PATCH 06/26] Update and spin off tests --- .../compose/tests/test_column_transformer.py | 59 ++++++++++++++----- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index e3b76ba7fdb8d..135abcdbceeec 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -115,7 +115,6 @@ def test_column_transformer(): assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 - assert ct.transformers_output_ == {'trans1': [0], 'trans2': [1]} # test with transformer_weights transformer_weights = {'trans1': .1, 'trans2': 10} @@ -127,14 +126,12 @@ def test_column_transformer(): assert_array_equal(both.fit_transform(X_array), res) assert_array_equal(both.fit(X_array).transform(X_array), res) assert len(both.transformers_) == 2 - assert ct.transformers_output_ == {'trans1': [0], 'trans2': [1]} both = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both) assert_array_equal(both.fit(X_array).transform(X_array), 0.1 * X_res_both) assert len(both.transformers_) == 1 - assert both.transformers_output_ == {'trans': [0, 1]} def test_column_transformer_dataframe(): @@ -193,7 +190,6 @@ def test_column_transformer_dataframe(): assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] != 'remainder' - assert ct.transformers_output_ == {'trans1': [0], 'trans2': [1]} ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) @@ -201,7 +197,6 @@ def test_column_transformer_dataframe(): 
assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] != 'remainder' - assert ct.transformers_output_ == {'trans1': [0], 'trans2': [1]} # test with transformer_weights transformer_weights = {'trans1': .1, 'trans2': 10} @@ -214,7 +209,6 @@ def test_column_transformer_dataframe(): assert_array_equal(both.fit(X_df).transform(X_df), res) assert len(both.transformers_) == 2 assert both.transformers_[-1][0] != 'remainder' - assert both.transformers_output_ == {'trans1': [0], 'trans2': [1]} # test multiple columns both = ColumnTransformer([('trans', Trans(), ['first', 'second'])], @@ -223,7 +217,6 @@ def test_column_transformer_dataframe(): assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) assert len(both.transformers_) == 1 assert both.transformers_[-1][0] != 'remainder' - assert both.transformers_output_ == {'trans': [0, 1]} both = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) @@ -231,7 +224,6 @@ def test_column_transformer_dataframe(): assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) assert len(both.transformers_) == 1 assert both.transformers_[-1][0] != 'remainder' - assert both.transformers_output_ == {'trans': [0, 1]} # ensure pandas object is passed through @@ -294,7 +286,6 @@ def test_column_transformer_empty_columns(pandas, column_selection, assert_array_equal(ct.fit(X).transform(X), X_res_both) assert len(ct.transformers_) == 2 assert isinstance(ct.transformers_[1][1], TransRaise) - assert ct.transformers_output_ == {'trans1': [0, 1]} ct = ColumnTransformer([('trans1', TransRaise(), column), ('trans2', Trans(), [0, 1])]) @@ -302,7 +293,6 @@ def test_column_transformer_empty_columns(pandas, column_selection, assert_array_equal(ct.fit(X).transform(X), X_res_both) assert len(ct.transformers_) == 2 assert isinstance(ct.transformers_[0][1], TransRaise) - assert ct.transformers_output_ == {'trans2': [0, 1]} ct = 
ColumnTransformer([('trans', TransRaise(), column)], remainder='passthrough') @@ -310,7 +300,6 @@ def test_column_transformer_empty_columns(pandas, column_selection, assert_array_equal(ct.fit(X).transform(X), X_res_both) assert len(ct.transformers_) == 2 # including remainder assert isinstance(ct.transformers_[0][1], TransRaise) - assert ct.transformers_output_ == {'remainder': [0, 1]} fixture = np.array([[], [], []]) ct = ColumnTransformer([('trans', TransRaise(), column)], @@ -321,6 +310,50 @@ def test_column_transformer_empty_columns(pandas, column_selection, assert isinstance(ct.transformers_[0][1], TransRaise) +def test_column_transformer_transformers_output(): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + ct = ColumnTransformer([('trans1', Trans(), [0]), + ('trans2', Trans(), [1])]) + ct.fit(X_array) + assert ct.transformers_output_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2)} + + # test with transformer_weights and multiple columns + ct = ColumnTransformer([('trans', Trans(), [0, 1])], + transformer_weights={'trans': .1}) + ct.fit(X_array) + assert ct.transformers_output_ == {'trans': slice(0, 2)} + + # test case that ensures that the attribute does also work when + # a given transformer doesn't have any columns to work on + ct = ColumnTransformer([('trans1', Trans(), [0, 1]), + ('trans2', TransRaise(), [])]) + ct.fit(X_array) + assert ct.transformers_output_ == {'trans1': slice(0, 2)} + + ct = ColumnTransformer([('trans', TransRaise(), [])], + remainder='passthrough') + ct.fit(X_array) + assert ct.transformers_output_ == {'remainder': slice(0, 2)} + + # test with data frame + pd = pytest.importorskip('pandas') + X_df = pd.DataFrame(X_array, columns=['first', 'second']) + + ct = ColumnTransformer([('trans1', Trans(), ['first']), + ('trans2', Trans(), ['second'])]) + ct.fit(X_df) + assert ct.transformers_output_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2)} + + ct = ColumnTransformer([('trans1', Trans(), [0]), + ('trans2', Trans(), [1])]) + 
ct.fit(X_df) + assert ct.transformers_output_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2)} + + def test_column_transformer_sparse_array(): X_sparse = sparse.eye(3, 2).tocsr() @@ -366,10 +399,6 @@ def test_column_transformer_list(): assert_array_equal(ct.fit_transform(X_list), expected_result) assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) - assert ct.transformers_output_ == { - 'numerical': [0, 1], 'categorical': [2, 3] - } - def test_column_transformer_sparse_stacking(): X_array = np.array([[0, 1, 2], [2, 4, 6]]).T From bcc35eeacbf5ea1f96b4745e26db86b5d759d98b Mon Sep 17 00:00:00 2001 From: lbittarello Date: Fri, 25 Sep 2020 08:45:28 +0100 Subject: [PATCH 07/26] Fix white space --- sklearn/compose/tests/test_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 135abcdbceeec..1deda43a0a784 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -321,7 +321,7 @@ def test_column_transformer_transformers_output(): # test with transformer_weights and multiple columns ct = ColumnTransformer([('trans', Trans(), [0, 1])], - transformer_weights={'trans': .1}) + transformer_weights={'trans': .1}) ct.fit(X_array) assert ct.transformers_output_ == {'trans': slice(0, 2)} From fc3948f3d8ed0761be227ab7ed0c934244ec2e07 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 22:15:28 +0100 Subject: [PATCH 08/26] Update documentation Co-Authored-By: Nicolas Hug Co-Authored-By: Joel Nothman <78827+jnothman@users.noreply.github.com> --- sklearn/compose/_column_transformer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 1a1342acf2ec6..9d163cc3ed8d7 100644 --- 
a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -137,9 +137,10 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): of the individual transformers and the `sparse_threshold` keyword. transformers_output_ : dict - A dictionary from transformer names to slices to track the results - of each transformer. The slices are the positions in the stacked - results of the output of each particular step in `transformers`. + A dictionary from transformer names to :py:class:`slice`s, where + each slice corresponds to indices in the transformed output. + This is useful to inspect which transformer is responsible for + which transformed feature(s). Notes ----- From 4a4f81d18b7d8572fa16c14934bf44a9262fc03d Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 22:25:03 +0100 Subject: [PATCH 09/26] Rename idx_ Co-Authored-By: Nicolas Hug --- sklearn/compose/_column_transformer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 9d163cc3ed8d7..34dff9d3c54b8 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -427,14 +427,14 @@ def _index_output(self, Xs): """ Record which transformer produced which column. 
""" - idx_ = 0 + idx = 0 self.transformers_output_ = {} - for idx, (name, _, _, _) in enumerate( + for transformer_idx, (name, _, _, _) in enumerate( self._iter(fitted=True, replace_strings=True) ): - n_columns = Xs[idx].shape[1] - self.transformers_output_[name] = slice(idx_, idx_ + n_columns) - idx_ += n_columns + n_columns = Xs[transformer_idx].shape[1] + self.transformers_output_[name] = slice(idx, idx + n_columns) + idx += n_columns return def _validate_features(self, n_features, feature_names): From e844aac31d5f959dba7d4f7a1b098b0f1aaa2618 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 22:26:07 +0100 Subject: [PATCH 10/26] Do not return Co-Authored-By: Nicolas Hug --- sklearn/compose/_column_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 34dff9d3c54b8..a0a9384bbbeea 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -435,7 +435,6 @@ def _index_output(self, Xs): n_columns = Xs[transformer_idx].shape[1] self.transformers_output_[name] = slice(idx, idx + n_columns) idx += n_columns - return def _validate_features(self, n_features, feature_names): """Ensures feature counts and names are the same during fit and From b8191ec7d409ed5129eba8f6fc6cf288fd1102ac Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 22:27:29 +0100 Subject: [PATCH 11/26] Rename _index_output Co-Authored-By: Nicolas Hug Co-Authored-By: Joel Nothman <78827+jnothman@users.noreply.github.com> --- sklearn/compose/_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index a0a9384bbbeea..ae14ee812865d 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py 
@@ -423,7 +423,7 @@ def _validate_output(self, result): "The output of the '{0}' transformer should be 2D (scipy " "matrix, array, or pandas DataFrame).".format(name)) - def _index_output(self, Xs): + def _record_transformer_slices(self, Xs): """ Record which transformer produced which column. """ @@ -574,7 +574,7 @@ def fit_transform(self, X, y=None): self._update_fitted_transformers(transformers) self._validate_output(Xs) - self._index_output(Xs) + self._record_transformer_slices(Xs) return self._hstack(list(Xs)) From 6665c0341ed53242bbc10e55b89329f0f07b4ff1 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 22:31:35 +0100 Subject: [PATCH 12/26] Rename attribute Co-Authored-By: Joel Nothman <78827+jnothman@users.noreply.github.com> --- sklearn/compose/_column_transformer.py | 6 +++--- .../compose/tests/test_column_transformer.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index ae14ee812865d..31766735c80a6 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -136,7 +136,7 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): sparse matrix or a dense numpy array, which depends on the output of the individual transformers and the `sparse_threshold` keyword. - transformers_output_ : dict + transformer_slices_ : dict A dictionary from transformer names to :py:class:`slice`s, where each slice corresponds to indices in the transformed output. This is useful to inspect which transformer is responsible for @@ -428,12 +428,12 @@ def _record_transformer_slices(self, Xs): Record which transformer produced which column. 
""" idx = 0 - self.transformers_output_ = {} + self.transformer_slices_ = {} for transformer_idx, (name, _, _, _) in enumerate( self._iter(fitted=True, replace_strings=True) ): n_columns = Xs[transformer_idx].shape[1] - self.transformers_output_[name] = slice(idx, idx + n_columns) + self.transformer_slices_[name] = slice(idx, idx + n_columns) idx += n_columns def _validate_features(self, n_features, feature_names): diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 1deda43a0a784..1919eb33d356c 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -316,26 +316,26 @@ def test_column_transformer_transformers_output(): ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) ct.fit(X_array) - assert ct.transformers_output_ == {'trans1': slice(0, 1), - 'trans2': slice(1, 2)} + assert ct.transformer_slices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2)} # test with transformer_weights and multiple columns ct = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) ct.fit(X_array) - assert ct.transformers_output_ == {'trans': slice(0, 2)} + assert ct.transformer_slices_ == {'trans': slice(0, 2)} # test case that ensures that the attribute does also work when # a given transformer doesn't have any columns to work on ct = ColumnTransformer([('trans1', Trans(), [0, 1]), ('trans2', TransRaise(), [])]) ct.fit(X_array) - assert ct.transformers_output_ == {'trans1': slice(0, 2)} + assert ct.transformer_slices_ == {'trans1': slice(0, 2)} ct = ColumnTransformer([('trans', TransRaise(), [])], remainder='passthrough') ct.fit(X_array) - assert ct.transformers_output_ == {'remainder': slice(0, 2)} + assert ct.transformer_slices_ == {'remainder': slice(0, 2)} # test with data frame pd = pytest.importorskip('pandas') @@ -344,14 +344,14 @@ def test_column_transformer_transformers_output(): ct = 
ColumnTransformer([('trans1', Trans(), ['first']), ('trans2', Trans(), ['second'])]) ct.fit(X_df) - assert ct.transformers_output_ == {'trans1': slice(0, 1), - 'trans2': slice(1, 2)} + assert ct.transformer_slices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2)} ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) ct.fit(X_df) - assert ct.transformers_output_ == {'trans1': slice(0, 1), - 'trans2': slice(1, 2)} + assert ct.transformer_slices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2)} def test_column_transformer_sparse_array(): From 127760bb167507ab504c7435c9bf700958a44c19 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 22:34:18 +0100 Subject: [PATCH 13/26] Add comment to tests Co-Authored-By: Nicolas Hug --- sklearn/compose/tests/test_column_transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 1919eb33d356c..43ed6a2ba329c 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -310,7 +310,8 @@ def test_column_transformer_empty_columns(pandas, column_selection, assert isinstance(ct.transformers_[0][1], TransRaise) -def test_column_transformer_transformers_output(): +def test_column_transformer_transformer_slices(): + # Checks for the transformer_slices_ attribute X_array = np.array([[0, 1, 2], [2, 4, 6]]).T ct = ColumnTransformer([('trans1', Trans(), [0]), From a0dd2857cab0f678bb2c59233617bd818415629d Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 23:08:14 +0100 Subject: [PATCH 14/26] Add entries for transformers without output Co-Authored-By: Nicolas Hug --- sklearn/compose/_column_transformer.py | 8 ++++++++ .../compose/tests/test_column_transformer.py | 19 +++++++++++++------ 2 files changed, 
21 insertions(+), 6 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 31766735c80a6..6ecf432ac5d4e 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -429,6 +429,7 @@ def _record_transformer_slices(self, Xs): """ idx = 0 self.transformer_slices_ = {} + for transformer_idx, (name, _, _, _) in enumerate( self._iter(fitted=True, replace_strings=True) ): @@ -436,6 +437,13 @@ def _record_transformer_slices(self, Xs): self.transformer_slices_[name] = slice(idx, idx + n_columns) idx += n_columns + # empty slices for transformers that generate no output (except + # for 'remainder' when 'drop'); these are safe for indexing + all_names = [t[0] for t in self.transformers] + ['remainder'] + for name in all_names: + if name not in self.transformer_slices_: + self.transformer_slices_[name] = slice(-1, 0) + def _validate_features(self, n_features, feature_names): """Ensures feature counts and names are the same during fit and transform. 
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 43ed6a2ba329c..c9e7a4ad3a799 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -318,25 +318,30 @@ def test_column_transformer_transformer_slices(): ('trans2', Trans(), [1])]) ct.fit(X_array) assert ct.transformer_slices_ == {'trans1': slice(0, 1), - 'trans2': slice(1, 2)} + 'trans2': slice(1, 2), + 'remainder': slice(-1, 0)} # test with transformer_weights and multiple columns ct = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) ct.fit(X_array) - assert ct.transformer_slices_ == {'trans': slice(0, 2)} + assert ct.transformer_slices_ == {'trans': slice(0, 2), + 'remainder': slice(-1, 0)} # test case that ensures that the attribute does also work when # a given transformer doesn't have any columns to work on ct = ColumnTransformer([('trans1', Trans(), [0, 1]), ('trans2', TransRaise(), [])]) ct.fit(X_array) - assert ct.transformer_slices_ == {'trans1': slice(0, 2)} + assert ct.transformer_slices_ == {'trans1': slice(0, 2), + 'trans2': slice(-1, 0), + 'remainder': slice(-1, 0)} ct = ColumnTransformer([('trans', TransRaise(), [])], remainder='passthrough') ct.fit(X_array) - assert ct.transformer_slices_ == {'remainder': slice(0, 2)} + assert ct.transformer_slices_ == {'trans': slice(-1, 0), + 'remainder': slice(0, 2)} # test with data frame pd = pytest.importorskip('pandas') @@ -346,13 +351,15 @@ def test_column_transformer_transformer_slices(): ('trans2', Trans(), ['second'])]) ct.fit(X_df) assert ct.transformer_slices_ == {'trans1': slice(0, 1), - 'trans2': slice(1, 2)} + 'trans2': slice(1, 2), + 'remainder': slice(-1, 0)} ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) ct.fit(X_df) assert ct.transformer_slices_ == {'trans1': slice(0, 1), - 'trans2': slice(1, 2)} + 'trans2': slice(1, 2), + 'remainder': slice(-1, 0)} def 
test_column_transformer_sparse_array(): From 29e12f3fa1dc7e72e038367df7c03b053bc255b9 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 23:37:53 +0100 Subject: [PATCH 15/26] Docstring --- sklearn/compose/_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 6ecf432ac5d4e..9eadd1276b06e 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -137,8 +137,8 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): of the individual transformers and the `sparse_threshold` keyword. transformer_slices_ : dict - A dictionary from transformer names to :py:class:`slice`s, where - each slice corresponds to indices in the transformed output. + A dictionary from each transformer name to a :py:object:`slice`, + where the slice corresponds to indices in the transformed output. This is useful to inspect which transformer is responsible for which transformed feature(s). From ee44d509818bf8bc8b7e656b1d9ff9d9f3c51c1e Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 23:41:26 +0100 Subject: [PATCH 16/26] Rename attribute --- sklearn/compose/_column_transformer.py | 14 ++++---- .../compose/tests/test_column_transformer.py | 36 +++++++++---------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 9eadd1276b06e..51e13c4d35b01 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -136,7 +136,7 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): sparse matrix or a dense numpy array, which depends on the output of the individual transformers and the `sparse_threshold` keyword. 
- transformer_slices_ : dict + output_indices_ : dict A dictionary from each transformer name to a :py:object:`slice`, where the slice corresponds to indices in the transformed output. This is useful to inspect which transformer is responsible for @@ -423,26 +423,26 @@ def _validate_output(self, result): "The output of the '{0}' transformer should be 2D (scipy " "matrix, array, or pandas DataFrame).".format(name)) - def _record_transformer_slices(self, Xs): + def _record_output_indices(self, Xs): """ Record which transformer produced which column. """ idx = 0 - self.transformer_slices_ = {} + self.output_indices_ = {} for transformer_idx, (name, _, _, _) in enumerate( self._iter(fitted=True, replace_strings=True) ): n_columns = Xs[transformer_idx].shape[1] - self.transformer_slices_[name] = slice(idx, idx + n_columns) + self.output_indices_[name] = slice(idx, idx + n_columns) idx += n_columns # empty slices for transformers that generate no output (except # for 'remainder' when 'drop'); these are safe for indexing all_names = [t[0] for t in self.transformers] + ['remainder'] for name in all_names: - if name not in self.transformer_slices_: - self.transformer_slices_[name] = slice(-1, 0) + if name not in self.output_indices_: + self.output_indices_[name] = slice(-1, 0) def _validate_features(self, n_features, feature_names): """Ensures feature counts and names are the same during fit and @@ -582,7 +582,7 @@ def fit_transform(self, X, y=None): self._update_fitted_transformers(transformers) self._validate_output(Xs) - self._record_transformer_slices(Xs) + self._record_output_indices(Xs) return self._hstack(list(Xs)) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index c9e7a4ad3a799..63ba954935c4e 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -310,38 +310,38 @@ def test_column_transformer_empty_columns(pandas, column_selection, 
assert isinstance(ct.transformers_[0][1], TransRaise) -def test_column_transformer_transformer_slices(): - # Checks for the transformer_slices_ attribute +def test_column_transformer_output_indices(): + # Checks for the output_indices_ attribute X_array = np.array([[0, 1, 2], [2, 4, 6]]).T ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) ct.fit(X_array) - assert ct.transformer_slices_ == {'trans1': slice(0, 1), - 'trans2': slice(1, 2), - 'remainder': slice(-1, 0)} + assert ct.output_indices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2), + 'remainder': slice(-1, 0)} # test with transformer_weights and multiple columns ct = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) ct.fit(X_array) - assert ct.transformer_slices_ == {'trans': slice(0, 2), - 'remainder': slice(-1, 0)} + assert ct.output_indices_ == {'trans': slice(0, 2), + 'remainder': slice(-1, 0)} # test case that ensures that the attribute does also work when # a given transformer doesn't have any columns to work on ct = ColumnTransformer([('trans1', Trans(), [0, 1]), ('trans2', TransRaise(), [])]) ct.fit(X_array) - assert ct.transformer_slices_ == {'trans1': slice(0, 2), - 'trans2': slice(-1, 0), - 'remainder': slice(-1, 0)} + assert ct.output_indices_ == {'trans1': slice(0, 2), + 'trans2': slice(-1, 0), + 'remainder': slice(-1, 0)} ct = ColumnTransformer([('trans', TransRaise(), [])], remainder='passthrough') ct.fit(X_array) - assert ct.transformer_slices_ == {'trans': slice(-1, 0), - 'remainder': slice(0, 2)} + assert ct.output_indices_ == {'trans': slice(-1, 0), + 'remainder': slice(0, 2)} # test with data frame pd = pytest.importorskip('pandas') @@ -350,16 +350,16 @@ def test_column_transformer_transformer_slices(): ct = ColumnTransformer([('trans1', Trans(), ['first']), ('trans2', Trans(), ['second'])]) ct.fit(X_df) - assert ct.transformer_slices_ == {'trans1': slice(0, 1), - 'trans2': slice(1, 2), - 'remainder': slice(-1, 0)} + assert 
ct.output_indices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2), + 'remainder': slice(-1, 0)} ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) ct.fit(X_df) - assert ct.transformer_slices_ == {'trans1': slice(0, 1), - 'trans2': slice(1, 2), - 'remainder': slice(-1, 0)} + assert ct.output_indices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2), + 'remainder': slice(-1, 0)} def test_column_transformer_sparse_array(): From f8013d81ce59e86be2e4c1035945bfb377f3d340 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Thu, 19 Nov 2020 23:42:44 +0100 Subject: [PATCH 17/26] Inconsistent tests --- sklearn/compose/tests/test_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 63ba954935c4e..53bd6ff1ab668 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -312,7 +312,7 @@ def test_column_transformer_empty_columns(pandas, column_selection, def test_column_transformer_output_indices(): # Checks for the output_indices_ attribute - X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_array = np.arange(6).reshape(3, 2) ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) From 94cf92471dfb7d766aad4d529501434077139df6 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Fri, 20 Nov 2020 07:57:54 +0100 Subject: [PATCH 18/26] Docstrings --- sklearn/compose/_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 51e13c4d35b01..5b3081c336598 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -137,7 +137,7 @@ class ColumnTransformer(TransformerMixin, 
_BaseComposition): of the individual transformers and the `sparse_threshold` keyword. output_indices_ : dict - A dictionary from each transformer name to a :py:object:`slice`, + A dictionary from each transformer name to a :object:`slice`, where the slice corresponds to indices in the transformed output. This is useful to inspect which transformer is responsible for which transformed feature(s). From 14899f7bf5338b221dc066441a8d27cc3124d7a0 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Fri, 20 Nov 2020 08:43:08 +0100 Subject: [PATCH 19/26] Docstrings without reference --- sklearn/compose/_column_transformer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 5b3081c336598..e786070258df3 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -137,10 +137,10 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): of the individual transformers and the `sparse_threshold` keyword. output_indices_ : dict - A dictionary from each transformer name to a :object:`slice`, - where the slice corresponds to indices in the transformed output. - This is useful to inspect which transformer is responsible for - which transformed feature(s). + A dictionary from each transformer name to a slice, where the slice + corresponds to indices in the transformed output. This is useful to + inspect which transformer is responsible for which transformed + feature(s). 
Notes ----- From 7d5ed362e5de786042eb95ab9e5707ffe5a170a2 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Sat, 21 Nov 2020 22:18:55 +0100 Subject: [PATCH 20/26] slice(0, 0) Co-Authored-By: Nicolas Hug --- sklearn/compose/_column_transformer.py | 2 +- sklearn/compose/tests/test_column_transformer.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index e786070258df3..07fd66e9068b0 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -442,7 +442,7 @@ def _record_output_indices(self, Xs): all_names = [t[0] for t in self.transformers] + ['remainder'] for name in all_names: if name not in self.output_indices_: - self.output_indices_[name] = slice(-1, 0) + self.output_indices_[name] = slice(0, 0) def _validate_features(self, n_features, feature_names): """Ensures feature counts and names are the same during fit and diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 53bd6ff1ab668..458048c35c5a2 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -319,14 +319,14 @@ def test_column_transformer_output_indices(): ct.fit(X_array) assert ct.output_indices_ == {'trans1': slice(0, 1), 'trans2': slice(1, 2), - 'remainder': slice(-1, 0)} + 'remainder': slice(0, 0)} # test with transformer_weights and multiple columns ct = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) ct.fit(X_array) assert ct.output_indices_ == {'trans': slice(0, 2), - 'remainder': slice(-1, 0)} + 'remainder': slice(0, 0)} # test case that ensures that the attribute does also work when # a given transformer doesn't have any columns to work on @@ -334,13 +334,13 @@ def test_column_transformer_output_indices(): ('trans2', TransRaise(), 
[])]) ct.fit(X_array) assert ct.output_indices_ == {'trans1': slice(0, 2), - 'trans2': slice(-1, 0), - 'remainder': slice(-1, 0)} + 'trans2': slice(0, 0), + 'remainder': slice(0, 0)} ct = ColumnTransformer([('trans', TransRaise(), [])], remainder='passthrough') ct.fit(X_array) - assert ct.output_indices_ == {'trans': slice(-1, 0), + assert ct.output_indices_ == {'trans': slice(0, 0), 'remainder': slice(0, 2)} # test with data frame @@ -352,14 +352,14 @@ def test_column_transformer_output_indices(): ct.fit(X_df) assert ct.output_indices_ == {'trans1': slice(0, 1), 'trans2': slice(1, 2), - 'remainder': slice(-1, 0)} + 'remainder': slice(0, 0)} ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) ct.fit(X_df) assert ct.output_indices_ == {'trans1': slice(0, 1), 'trans2': slice(1, 2), - 'remainder': slice(-1, 0)} + 'remainder': slice(0, 0)} def test_column_transformer_sparse_array(): From 50ed71f644ed71c7c7e326e14c9dc7839ec34714 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Sat, 21 Nov 2020 22:19:26 +0100 Subject: [PATCH 21/26] Update outdated comment Co-Authored-By: Nicolas Hug --- sklearn/compose/_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 07fd66e9068b0..8223898f38244 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -437,8 +437,8 @@ def _record_output_indices(self, Xs): self.output_indices_[name] = slice(idx, idx + n_columns) idx += n_columns - # empty slices for transformers that generate no output (except - # for 'remainder' when 'drop'); these are safe for indexing + # empty slices for transformers that generate no output; these + # are safe for indexing all_names = [t[0] for t in self.transformers] + ['remainder'] for name in all_names: if name not in self.output_indices_: From 
06ba19a2b99365daef88823edbe924a41326d6c6 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Sat, 21 Nov 2020 22:19:51 +0100 Subject: [PATCH 22/26] More tests Co-Authored-By: Nicolas Hug --- .../compose/tests/test_column_transformer.py | 42 ++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 458048c35c5a2..a9beb4503243c 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -316,32 +316,50 @@ def test_column_transformer_output_indices(): ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) - ct.fit(X_array) + X_trans = ct.fit_transform(X_array) assert ct.output_indices_ == {'trans1': slice(0, 1), 'trans2': slice(1, 2), 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, [1]], + X_trans[:, ct.output_indices_['trans2']]) # test with transformer_weights and multiple columns ct = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) - ct.fit(X_array) + X_trans = ct.fit_transform(X_array) assert ct.output_indices_ == {'trans': slice(0, 2), 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0, 1]], + X_trans[:, ct.output_indices_['trans']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) # test case that ensures that the attribute does also work when # a given transformer doesn't have any columns to work on ct = ColumnTransformer([('trans1', Trans(), [0, 1]), ('trans2', TransRaise(), [])]) - ct.fit(X_array) + X_trans = ct.fit_transform(X_array) assert ct.output_indices_ == {'trans1': slice(0, 2), 'trans2': slice(0, 0), 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0, 1]], + X_trans[:, ct.output_indices_['trans1']]) + 
assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) ct = ColumnTransformer([('trans', TransRaise(), [])], remainder='passthrough') - ct.fit(X_array) + X_trans = ct.fit_transform(X_array) assert ct.output_indices_ == {'trans': slice(0, 0), 'remainder': slice(0, 2)} + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['trans']]) + assert_array_equal(X_trans[:, [0, 1]], + X_trans[:, ct.output_indices_['remainder']]) # test with data frame pd = pytest.importorskip('pandas') @@ -349,17 +367,29 @@ def test_column_transformer_output_indices(): ct = ColumnTransformer([('trans1', Trans(), ['first']), ('trans2', Trans(), ['second'])]) - ct.fit(X_df) + X_trans = ct.fit_transform(X_df) assert ct.output_indices_ == {'trans1': slice(0, 1), 'trans2': slice(1, 2), 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, [1]], + X_trans[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) - ct.fit(X_df) + X_trans = ct.fit_transform(X_df) assert ct.output_indices_ == {'trans1': slice(0, 1), 'trans2': slice(1, 2), 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, [1]], + X_trans[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) def test_column_transformer_sparse_array(): From 36b20e8be70d994bef999cd7dfd11a2a8af07a67 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Sat, 21 Nov 2020 22:23:17 +0100 Subject: [PATCH 23/26] Log changes Co-Authored-By: Nicolas Hug --- doc/whats_new/v0.24.rst | 3 +++ 1 file changed, 3 insertions(+) diff 
--git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index e8225811194cc..ae5e974ee9a0e 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -89,6 +89,9 @@ Changelog - |FIX| :class:`compose.ColumnTransformer` now displays the remainder in the diagram display. :pr:`18167` by `Thomas Fan`_. +- |Enhancement| :class:`compose.ColumnTransformer` now records the output + of each transformer. :pr:`18393` by :user:`Luca Bittarello <lbittarello>`. + :mod:`sklearn.covariance` ......................... From 27cfee5615bb304778bc8a171e332e480e86529e Mon Sep 17 00:00:00 2001 From: lbittarello Date: Sat, 27 Mar 2021 12:49:15 +0100 Subject: [PATCH 24/26] Update change logs --- doc/whats_new/v0.24.rst | 3 --- doc/whats_new/v1.0.rst | 7 +++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index e34ad84a98ab3..68ea8ba0f7a72 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -19,9 +19,6 @@ Changelog :term:`get_feature_names` on transformers with an empty column selection. :pr:`19579` by `Thomas Fan`_. -- |Enhancement| :class:`compose.ColumnTransformer` now records the output - of each transformer. :pr:`18393` by :user:`Luca Bittarello <lbittarello>`. - :mod:`sklearn.ensemble` ....................... diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index be894774f5a27..a54abb78730a4 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -102,6 +102,13 @@ Changelog - |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message when the cached file is invalid. :pr:`19669` `Thomas Fan`_. +:mod:`sklearn.compose` +...................... + +- |Enhancement| :class:`compose.ColumnTransformer` now records the output + of each transformer in `output_indices_`. :pr:`18393` by + :user:`Luca Bittarello <lbittarello>`. + :mod:`sklearn.decomposition` ............................ 
From 2b96828eb971e1375cdfe235eda5b2f456307845 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Sat, 27 Mar 2021 12:53:26 +0100 Subject: [PATCH 25/26] Update comment --- sklearn/compose/_column_transformer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index da4eab1bf5e14..da4a2dd93507c 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -428,8 +428,9 @@ def _record_output_indices(self, Xs): self.output_indices_[name] = slice(idx, idx + n_columns) idx += n_columns - # empty slices for transformers that generate no output; these - # are safe for indexing + # `_iter` only generates transformers that have a non empty + # selection. Here we set empty slices for transformers that + # generate no output, which are safe for indexing all_names = [t[0] for t in self.transformers] + ['remainder'] for name in all_names: if name not in self.output_indices_: From 5ee27f9a815a2b0ff8fba3709115ad69828ac6c0 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Sat, 27 Mar 2021 12:57:14 +0100 Subject: [PATCH 26/26] Split tests --- sklearn/compose/tests/test_column_transformer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index a73f6f82d3623..f7c1874d4a1b7 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -361,9 +361,13 @@ def test_column_transformer_output_indices(): assert_array_equal(X_trans[:, [0, 1]], X_trans[:, ct.output_indices_['remainder']]) - # test with data frame + +def test_column_transformer_output_indices_df(): + # Checks for the output_indices_ attribute with data frames pd = pytest.importorskip('pandas') - X_df = pd.DataFrame(X_array, columns=['first', 'second']) + + X_df = pd.DataFrame(np.arange(6).reshape(3, 2), + 
columns=['first', 'second']) ct = ColumnTransformer([('trans1', Trans(), ['first']), ('trans2', Trans(), ['second'])])