Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@ Changelog
- |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message
when the cached file is invalid. :pr:`19669` `Thomas Fan`_.

:mod:`sklearn.compose`
......................

- |Enhancement| :class:`compose.ColumnTransformer` now records the output
column indices of each transformer, as a slice, in the `output_indices_`
attribute. :pr:`18393` by :user:`Luca Bittarello <lbittarello>`.

:mod:`sklearn.decomposition`
............................

Expand Down
29 changes: 29 additions & 0 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,12 @@ class ColumnTransformer(TransformerMixin, _BaseComposition):
sparse matrix or a dense numpy array, which depends on the output
of the individual transformers and the `sparse_threshold` keyword.

output_indices_ : dict
A dictionary from each transformer name to a slice, where the slice
corresponds to indices in the transformed output. This is useful to
inspect which transformer is responsible for which transformed
feature(s).

Notes
-----
The order of the columns in the transformed feature matrix follows the
Expand Down Expand Up @@ -408,6 +414,28 @@ def _validate_output(self, result):
"The output of the '{0}' transformer should be 2D (scipy "
"matrix, array, or pandas DataFrame).".format(name))

def _record_output_indices(self, Xs):
"""
Record which transformer produced which column.
"""
idx = 0
self.output_indices_ = {}

for transformer_idx, (name, _, _, _) in enumerate(
self._iter(fitted=True, replace_strings=True)
):
n_columns = Xs[transformer_idx].shape[1]
self.output_indices_[name] = slice(idx, idx + n_columns)
idx += n_columns

# `_iter` only generates transformers that have a non empty
# selection. Here we set empty slices for transformers that
# generate no output, which are safe for indexing
all_names = [t[0] for t in self.transformers] + ['remainder']
for name in all_names:
if name not in self.output_indices_:
self.output_indices_[name] = slice(0, 0)

def _log_message(self, name, idx, total):
if not self.verbose:
return None
Expand Down Expand Up @@ -518,6 +546,7 @@ def fit_transform(self, X, y=None):

self._update_fitted_transformers(transformers)
self._validate_output(Xs)
self._record_output_indices(Xs)

return self._hstack(list(Xs))

Expand Down
88 changes: 87 additions & 1 deletion sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def test_column_transformer_dataframe():
assert len(both.transformers_) == 1
assert both.transformers_[-1][0] != 'remainder'

# ensure pandas object is passes through
# ensure pandas object is passed through

class TransAssert(BaseEstimator):

Expand Down Expand Up @@ -310,6 +310,92 @@ def test_column_transformer_empty_columns(pandas, column_selection,
assert isinstance(ct.transformers_[0][1], TransRaise)


def test_column_transformer_output_indices():
    """Check the ``output_indices_`` attribute on a plain NumPy array."""
    data = np.arange(6).reshape(3, 2)

    # two transformers, one input column each
    transformer = ColumnTransformer([('trans1', Trans(), [0]),
                                     ('trans2', Trans(), [1])])
    transformed = transformer.fit_transform(data)
    expected = {'trans1': slice(0, 1),
                'trans2': slice(1, 2),
                'remainder': slice(0, 0)}
    assert transformer.output_indices_ == expected
    for columns, name in [([0], 'trans1'), ([1], 'trans2')]:
        assert_array_equal(
            transformed[:, columns],
            transformed[:, transformer.output_indices_[name]])

    # a single transformer over several columns, with transformer_weights
    transformer = ColumnTransformer([('trans', Trans(), [0, 1])],
                                    transformer_weights={'trans': .1})
    transformed = transformer.fit_transform(data)
    expected = {'trans': slice(0, 2),
                'remainder': slice(0, 0)}
    assert transformer.output_indices_ == expected
    for columns, name in [([0, 1], 'trans'), ([], 'remainder')]:
        assert_array_equal(
            transformed[:, columns],
            transformed[:, transformer.output_indices_[name]])

    # the attribute must also be populated when one of the transformers
    # has no columns to work on
    transformer = ColumnTransformer([('trans1', Trans(), [0, 1]),
                                     ('trans2', TransRaise(), [])])
    transformed = transformer.fit_transform(data)
    expected = {'trans1': slice(0, 2),
                'trans2': slice(0, 0),
                'remainder': slice(0, 0)}
    assert transformer.output_indices_ == expected
    for columns, name in [([0, 1], 'trans1'),
                          ([], 'trans2'),
                          ([], 'remainder')]:
        assert_array_equal(
            transformed[:, columns],
            transformed[:, transformer.output_indices_[name]])

    # empty transformer combined with a passthrough remainder
    transformer = ColumnTransformer([('trans', TransRaise(), [])],
                                    remainder='passthrough')
    transformed = transformer.fit_transform(data)
    expected = {'trans': slice(0, 0),
                'remainder': slice(0, 2)}
    assert transformer.output_indices_ == expected
    for columns, name in [([], 'trans'), ([0, 1], 'remainder')]:
        assert_array_equal(
            transformed[:, columns],
            transformed[:, transformer.output_indices_[name]])


def test_column_transformer_output_indices_df():
    """Check the ``output_indices_`` attribute when fitting a DataFrame."""
    pd = pytest.importorskip('pandas')

    frame = pd.DataFrame(np.arange(6).reshape(3, 2),
                         columns=['first', 'second'])

    expected = {'trans1': slice(0, 1),
                'trans2': slice(1, 2),
                'remainder': slice(0, 0)}
    checks = [([0], 'trans1'), ([1], 'trans2'), ([], 'remainder')]

    # the same expectations hold whether the columns are selected by
    # label or by integer position
    for first_cols, second_cols in [(['first'], ['second']), ([0], [1])]:
        transformer = ColumnTransformer([('trans1', Trans(), first_cols),
                                         ('trans2', Trans(), second_cols)])
        transformed = transformer.fit_transform(frame)
        assert transformer.output_indices_ == expected
        for columns, name in checks:
            assert_array_equal(
                transformed[:, columns],
                transformed[:, transformer.output_indices_[name]])


def test_column_transformer_sparse_array():
X_sparse = sparse.eye(3, 2).tocsr()

Expand Down