Skip to content

[MRG] Convert ColumnTransformer input list to numpy array #12104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@
from ..base import clone, TransformerMixin
from ..utils import Parallel, delayed
from ..externals import six
from ..pipeline import (
_fit_one_transformer, _fit_transform_one, _transform_one, _name_estimators)
from ..pipeline import _fit_transform_one, _transform_one, _name_estimators
from ..preprocessing import FunctionTransformer
from ..utils import Bunch
from ..utils.metaestimators import _BaseComposition
from ..utils.validation import check_is_fitted
from ..utils.validation import check_array, check_is_fitted


__all__ = ['ColumnTransformer', 'make_column_transformer']
Expand Down Expand Up @@ -436,6 +435,7 @@ def fit_transform(self, X, y=None):
sparse matrices.

"""
X = _check_X(X)
self._validate_transformers()
self._validate_column_callables(X)
self._validate_remainder(X)
Expand Down Expand Up @@ -485,6 +485,7 @@ def transform(self, X):
"""
check_is_fitted(self, 'transformers_')

X = _check_X(X)
Xs = self._fit_transform(X, None, _transform_one, fitted=True)
self._validate_output(Xs)

Expand All @@ -511,6 +512,13 @@ def _hstack(self, Xs):
return np.hstack(Xs)


def _check_X(X):
"""Use check_array only on lists and other non-array-likes / sparse"""
if hasattr(X, '__array__') or sparse.issparse(X):
return X
return check_array(X, force_all_finite='allow-nan', dtype=np.object)


def _check_key_type(key, superclass):
"""
Check that scalar, list or slice is of a certain type.
Expand Down
24 changes: 23 additions & 1 deletion sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from sklearn.base import BaseEstimator
from sklearn.externals import six
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.exceptions import NotFittedError
from sklearn.exceptions import NotFittedError, DataConversionWarning
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

Expand Down Expand Up @@ -278,6 +278,28 @@ def test_column_transformer_sparse_array():
X_res_both)


def test_column_transformer_list():
    """ColumnTransformer accepts a plain nested list as input.

    The first two columns are scaled and the third one is one-hot encoded;
    the NaN in the input is expected to show up unchanged in the output.
    """
    nan = float('nan')
    X_list = [[1, nan, 'a'], [0, 0, 'b']]
    expected_result = np.array([[1, nan, 1, 0], [-1, 0, 0, 1]])

    numerical_step = ('numerical', StandardScaler(), [0, 1])
    categorical_step = ('categorical', OneHotEncoder(), [2])
    ct = ColumnTransformer([numerical_step, categorical_step])

    # TODO: the DataConversionWarning is not very useful in this case; it
    # would be good to get rid of it.
    with pytest.warns(DataConversionWarning):
        one_shot = ct.fit_transform(X_list)
        assert_array_equal(one_shot, expected_result)
        two_step = ct.fit(X_list).transform(X_list)
        assert_array_equal(two_step, expected_result)


def test_column_transformer_sparse_stacking():
X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
col_trans = ColumnTransformer([('trans1', Trans(), [0]),
Expand Down