Skip to content

ColumnTransformer generalization to work on empty lists #12084

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 25, 2018
24 changes: 22 additions & 2 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,10 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
transformers_ : list
The collection of fitted transformers as tuples of
(name, fitted_transformer, column). `fitted_transformer` can be an
estimator, 'drop', or 'passthrough'. If there are remaining columns,
the final element is a tuple of the form:
estimator, 'drop', or 'passthrough'. In case there were no columns
selected, this will be the unfitted transformer.
If there are remaining columns, the final element is a tuple of the
form:
('remainder', transformer, remaining_columns) corresponding to the
``remainder`` parameter. If there are remaining columns, then
``len(transformers_)==len(transformers)+1``, otherwise
Expand Down Expand Up @@ -243,6 +245,8 @@ def _iter(self, fitted=False, replace_strings=False):
check_inverse=False)
elif trans == 'drop':
continue
elif _is_empty_column_selection(column):
continue

yield (name, trans, column, get_weight(name))

Expand Down Expand Up @@ -351,6 +355,8 @@ def _update_fitted_transformers(self, transformers):
# so get next transformer, but save original string
next(fitted_transformers)
trans = 'passthrough'
elif _is_empty_column_selection(column):
trans = old
else:
trans = next(fitted_transformers)
transformers_.append((name, trans, column))
Expand Down Expand Up @@ -644,6 +650,20 @@ def _get_column_indices(X, key):
"strings, or boolean mask is allowed")


def _is_empty_column_selection(column):
"""
Return True if the column selection is empty (empty list or all-False
boolean array).

"""
if hasattr(column, 'dtype') and np.issubdtype(column.dtype, np.bool_):
return not column.any()
elif hasattr(column, '__len__'):
return len(column) == 0
else:
return False


def _get_transformer_list(estimators):
"""
Construct (name, trans, column) tuples from list
Expand Down
45 changes: 45 additions & 0 deletions sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,51 @@ def transform(self, X, y=None):
assert_array_equal(ct.transformers_[-1][2], [1])


@pytest.mark.parametrize("pandas", [True, False], ids=['pandas', 'numpy'])
@pytest.mark.parametrize("column", [[], np.array([False, False])],
                         ids=['list', 'bool'])
def test_column_transformer_empty_columns(pandas, column):
    """Check ColumnTransformer when one transformer gets no columns.

    ``column`` is an empty selection (an empty list or an all-False
    boolean mask). For each scenario the output of ``fit_transform`` and
    of ``fit`` followed by ``transform`` is compared with the expected
    array, and the entry of ``transformers_`` belonging to the empty
    selection is checked to be a ``Trans`` instance.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    if pandas:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X_array, columns=['first', 'second'])
    else:
        X = X_array

    X_res_both = X_array
    no_columns = np.array([[], [], []])

    # Each scenario: (name/columns pairs, extra ColumnTransformer keyword
    # arguments, expected output, index in ``transformers_`` of the
    # transformer that received the empty selection).
    scenarios = [
        ([('trans1', [0, 1]), ('trans2', column)], {}, X_res_both, 1),
        ([('trans1', column), ('trans2', [0, 1])], {}, X_res_both, 0),
        # in the next two scenarios the count of 2 includes the remainder
        ([('trans', column)], {'remainder': 'passthrough'}, X_res_both, 0),
        ([('trans', column)], {'remainder': 'drop'}, no_columns, 0),
    ]

    for specs, extra_kwargs, expected, empty_idx in scenarios:
        transformers = [(name, Trans(), cols) for name, cols in specs]
        ct = ColumnTransformer(transformers, **extra_kwargs)
        assert_array_equal(ct.fit_transform(X), expected)
        assert_array_equal(ct.fit(X).transform(X), expected)
        assert len(ct.transformers_) == 2
        assert isinstance(ct.transformers_[empty_idx][1], Trans)


def test_column_transformer_sparse_array():
X_sparse = sparse.eye(3, 2).tocsr()

Expand Down