From a7e43ab408d193539401e0f27a59a443f4c10939 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 18 Sep 2018 16:51:06 +0200 Subject: [PATCH 1/3] ColumnTransformer: store evaluated function column specifier during fit --- sklearn/compose/_column_transformer.py | 46 ++++++++++++------- .../compose/tests/test_column_transformer.py | 4 ++ 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index e09d2d09d7e43..472a0f16e1f50 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -217,7 +217,12 @@ def _iter(self, X=None, fitted=False, replace_strings=False): if fitted: transformers = self.transformers_ else: - transformers = self.transformers + # interleave the validated column specifiers + transformers = [ + (name, trans, column) for (name, trans, _), column + in zip(self.transformers, self._columns) + ] + # add transformer tuple for remainder if self._remainder[2] is not None: transformers = chain(transformers, [self._remainder]) get_weight = (self.transformer_weights or {}).get @@ -257,6 +262,17 @@ def _validate_transformers(self): "specifiers. '%s' (type %s) doesn't." % (t, type(t))) + def _validate_column_callables(self, X): + """ + Converts callable column specifications. + """ + columns = [] + for _, _, column in self.transformers: + if callable(column): + column = column(X) + columns.append(column) + self._columns = columns + def _validate_remainder(self, X): """ Validates ``remainder`` and defines ``_remainder`` targeting @@ -274,7 +290,7 @@ def _validate_remainder(self, X): n_columns = X.shape[1] cols = [] - for _, _, columns in self.transformers: + for columns in self._columns: cols.extend(_get_column_indices(X, columns)) remaining_idx = sorted(list(set(range(n_columns)) - set(cols))) or None @@ -320,27 +336,30 @@ def get_feature_names(self): def _update_fitted_transformers(self, transformers): # transformers are fitted; excludes 'drop' cases - transformers = iter(transformers) + fitted_transformers = iter(transformers) transformers_ = [] - transformer_iter = self.transformers + transformers = [ + (name, trans, column) for (name, trans, _), column + in zip(self.transformers, self._columns) + ] if self._remainder[2] is not None: - transformer_iter = chain(transformer_iter, [self._remainder]) + transformers = chain(transformers, [self._remainder]) - for name, old, column in transformer_iter: + for name, old, column in transformers: if old == 'drop': trans = 'drop' elif old == 'passthrough': # FunctionTransformer is present in list of transformers, # so get next transformer, but save original string - next(transformers) + next(fitted_transformers) trans = 'passthrough' else: - trans = next(transformers) + trans = next(fitted_transformers) transformers_.append((name, trans, column)) # sanity check that transformers is exhausted - assert not list(transformers) + assert not list(fitted_transformers) self.transformers_ = transformers_ def _validate_output(self, result): @@ -419,8 +438,9 @@ def fit_transform(self, X, y=None): sparse matrices. """ - self._validate_remainder(X) self._validate_transformers() + self._validate_column_callables(X) + self._validate_remainder(X) result = self._fit_transform(X, y, _fit_transform_one) @@ -545,9 +565,6 @@ def _get_column(X, key): can use any hashable object as key). """ - if callable(key): - key = key(X) - # check whether we have string column names or integers if _check_key_type(key, int): column_names = False @@ -589,9 +606,6 @@ def _get_column_indices(X, key): """ n_columns = X.shape[1] - if callable(key): - key = key(X) - if _check_key_type(key, int): if isinstance(key, int): return [key] diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f67806a52c543..7e5e5029fa71a 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -873,6 +873,8 @@ def func(X): remainder='drop') assert_array_equal(ct.fit_transform(X_array), X_res_first) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first) + assert callable(ct.transformers[0][2]) + assert ct.transformers_[0][2] == [0] pd = pytest.importorskip('pandas') X_df = pd.DataFrame(X_array, columns=['first', 'second']) @@ -886,3 +888,5 @@ def func(X): remainder='drop') assert_array_equal(ct.fit_transform(X_df), X_res_first) assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first) + assert callable(ct.transformers[0][2]) + assert ct.transformers_[0][2] == ['first'] From faa040f66d6c99be4a4e0dd32c47daac114f32ee Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 18 Sep 2018 17:18:00 +0200 Subject: [PATCH 2/3] reuse _iter for _update_fitted_transformers to avoid repetition --- sklearn/compose/_column_transformer.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 472a0f16e1f50..334869a77a89b 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -212,7 +212,13 @@ def set_params(self, **kwargs): return self def _iter(self, X=None, fitted=False, replace_strings=False): - """Generate (name, trans, column, weight) tuples + """ + Generate (name, trans, X_subset, weight, column) tuples. + + If fitted=True, use the fitted transformers, else use the + user specified transformers updated with converted column names + and potentially appended with transformer for remainder. + """ if fitted: transformers = self.transformers_ @@ -240,7 +246,7 @@ def _iter(self, X=None, fitted=False, replace_strings=False): elif trans == 'drop': continue - yield (name, trans, sub, get_weight(name)) + yield (name, trans, sub, get_weight(name), column) def _validate_transformers(self): if not self.transformers: @@ -319,7 +325,7 @@ def get_feature_names(self): """ check_is_fitted(self, 'transformers_') feature_names = [] - for name, trans, _, _ in self._iter(fitted=True): + for name, trans, _, _, _ in self._iter(fitted=True): if trans == 'drop': continue elif trans == 'passthrough': @@ -339,14 +345,7 @@ def _update_fitted_transformers(self, transformers): fitted_transformers = iter(transformers) transformers_ = [] - transformers = [ - (name, trans, column) for (name, trans, _), column - in zip(self.transformers, self._columns) - ] - if self._remainder[2] is not None: - transformers = chain(transformers, [self._remainder]) - - for name, old, column in transformers: + for name, old, _, _, column in self._iter(): if old == 'drop': trans = 'drop' elif old == 'passthrough': @@ -367,7 +366,8 @@ def _validate_output(self, result): Ensure that the output of each transformer is 2D. Otherwise hstack can raise an error or produce incorrect results. """ - names = [name for name, _, _, _ in self._iter(replace_strings=True)] + names = [name for name, _, _, _, _ in self._iter(fitted=True, + replace_strings=True)] for Xs, name in zip(result, names): if not getattr(Xs, 'ndim', 0) == 2: raise ValueError( @@ -386,7 +386,7 @@ def _fit_transform(self, X, y, func, fitted=False): return Parallel(n_jobs=self.n_jobs)( delayed(func)(clone(trans) if not fitted else trans, X_sel, y, weight) - for _, trans, X_sel, weight in self._iter( + for _, trans, X_sel, weight, _ in self._iter( X=X, fitted=fitted, replace_strings=True)) except ValueError as e: if "Expected 2D array, got 1D array instead" in str(e): From adad0d5b6200ea96e06b283b3c57baf03a5281fe Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 20 Sep 2018 11:56:39 +0200 Subject: [PATCH 3/3] remove X from _iter --- sklearn/compose/_column_transformer.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 334869a77a89b..9014623280d2e 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -211,7 +211,7 @@ def set_params(self, **kwargs): self._set_params('_transformers', **kwargs) return self - def _iter(self, X=None, fitted=False, replace_strings=False): + def _iter(self, fitted=False, replace_strings=False): """ Generate (name, trans, X_subset, weight, column) tuples. @@ -234,8 +234,6 @@ def _iter(self, X=None, fitted=False, replace_strings=False): get_weight = (self.transformer_weights or {}).get for name, trans, column in transformers: - sub = None if X is None else _get_column(X, column) - if replace_strings: # replace 'passthrough' with identity transformer and # skip in case of 'drop' @@ -246,7 +244,7 @@ def _iter(self, X=None, fitted=False, replace_strings=False): elif trans == 'drop': continue - yield (name, trans, sub, get_weight(name), column) + yield (name, trans, column, get_weight(name)) def _validate_transformers(self): if not self.transformers: @@ -325,7 +323,7 @@ def get_feature_names(self): """ check_is_fitted(self, 'transformers_') feature_names = [] - for name, trans, _, _, _ in self._iter(fitted=True): + for name, trans, _, _ in self._iter(fitted=True): if trans == 'drop': continue elif trans == 'passthrough': @@ -345,7 +343,7 @@ def _update_fitted_transformers(self, transformers): fitted_transformers = iter(transformers) transformers_ = [] - for name, old, _, _, column in self._iter(): + for name, old, column, _ in self._iter(): if old == 'drop': trans = 'drop' elif old == 'passthrough': @@ -366,8 +364,8 @@ def _validate_output(self, result): Ensure that the output of each transformer is 2D. Otherwise hstack can raise an error or produce incorrect results. """ - names = [name for name, _, _, _, _ in self._iter(fitted=True, - replace_strings=True)] + names = [name for name, _, _, _ in self._iter(fitted=True, + replace_strings=True)] for Xs, name in zip(result, names): if not getattr(Xs, 'ndim', 0) == 2: raise ValueError( @@ -385,9 +383,9 @@ def _fit_transform(self, X, y, func, fitted=False): try: return Parallel(n_jobs=self.n_jobs)( delayed(func)(clone(trans) if not fitted else trans, - X_sel, y, weight) - for _, trans, X_sel, weight, _ in self._iter( - X=X, fitted=fitted, replace_strings=True)) + _get_column(X, column), y, weight) + for _, trans, column, weight in self._iter( + fitted=fitted, replace_strings=True)) except ValueError as e: if "Expected 2D array, got 1D array instead" in str(e): raise ValueError(_ERR_MSG_1DCOLUMN)