Change default of ColumnTransformer remainder from passthrough to drop (#11603)

jorisvandenbossche · GaelVaroquaux · commit 277e1250c293 · 2018-07-18T09:57:15.000+02:00
diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst
@@ -413,7 +413,7 @@ variable, but apply a :class:`feature_extraction.text.CountVectorizer
 <sklearn.feature_extraction.text.CountVectorizer>` to the ``'title'`` column.
 As we might use multiple feature extraction methods on the same column, we give
 each transformer a unique name, say ``'city_category'`` and ``'title_bow'``.
-We can ignore the remaining rating columns by setting ``remainder='drop'``::
+By default, the remaining rating columns are ignored (``remainder='drop'``)::
 
   >>> from sklearn.compose import ColumnTransformer
   >>> from sklearn.feature_extraction.text import CountVectorizer
@@ -495,7 +495,7 @@ above example would be::
   ...     ('city', CountVectorizer(analyzer=lambda x: [x])),
   ...     ('title', CountVectorizer()))
   >>> column_trans # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
-  ColumnTransformer(n_jobs=1, remainder='passthrough', transformer_weights=None,
+  ColumnTransformer(n_jobs=1, remainder='drop', transformer_weights=None,
            transformers=[('countvectorizer-1', ...)
 
 .. topic:: Examples:
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -71,14 +71,14 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
             A callable is passed the input data `X` and can return any of the
             above.
 
-    remainder : {'passthrough', 'drop'} or estimator, default 'passthrough'
-        By default, all remaining columns that were not specified in
-        `transformers` will be automatically passed through (default of
-        ``'passthrough'``). This subset of columns is concatenated with the
-        output of the transformers.
-        By using ``remainder='drop'``, only the specified columns in
-        `transformers` are transformed and combined in the output, and the
-        non-specified columns are dropped.
+    remainder : {'passthrough', 'drop'} or estimator, default 'drop'
+        By default, only the specified columns in `transformers` are
+        transformed and combined in the output, and the non-specified
+        columns are dropped. (default of ``'drop'``).
+        By specifying ``remainder='passthrough'``, all remaining columns that
+        were not specified in `transformers` will be automatically passed
+        through. This subset of columns is concatenated with the output of
+        the transformers.
         By setting ``remainder`` to be an estimator, the remaining
         non-specified columns will use the ``remainder`` estimator. The
         estimator must support `fit` and `transform`.
@@ -141,7 +141,7 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
 
     """
 
-    def __init__(self, transformers, remainder='passthrough', n_jobs=1,
+    def __init__(self, transformers, remainder='drop', n_jobs=1,
                  transformer_weights=None):
         self.transformers = transformers
         self.remainder = remainder
@@ -658,8 +658,7 @@ def make_column_transformer(*transformers, **kwargs):
     ...     (['numerical_column'], StandardScaler()),
     ...     (['categorical_column'], OneHotEncoder()))
     ...     # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
-    ColumnTransformer(n_jobs=1, remainder='passthrough',
-             transformer_weights=None,
+    ColumnTransformer(n_jobs=1, remainder='drop', transformer_weights=None,
              transformers=[('standardscaler',
                             StandardScaler(...),
                             ['numerical_column']),
@@ -669,7 +668,7 @@ def make_column_transformer(*transformers, **kwargs):
 
     """
     n_jobs = kwargs.pop('n_jobs', 1)
-    remainder = kwargs.pop('remainder', 'passthrough')
+    remainder = kwargs.pop('remainder', 'drop')
     if kwargs:
         raise TypeError('Unknown keyword arguments: "{}"'
                         .format(list(kwargs.keys())[0]))
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -405,7 +405,7 @@ def test_column_transformer_get_set_params():
                             ('trans2', StandardScaler(), [1])])
 
     exp = {'n_jobs': 1,
-           'remainder': 'passthrough',
+           'remainder': 'drop',
            'trans1': ct.transformers[0][1],
            'trans1__copy': True,
            'trans1__with_mean': True,
@@ -424,7 +424,7 @@ def test_column_transformer_get_set_params():
 
     ct.set_params(trans1='passthrough')
     exp = {'n_jobs': 1,
-           'remainder': 'passthrough',
+           'remainder': 'drop',
            'trans1': 'passthrough',
            'trans2': ct.transformers[1][1],
            'trans2__copy': True,
@@ -492,7 +492,8 @@ def test_column_transformer_get_feature_names():
         NotImplementedError, 'get_feature_names is not yet supported',
         ct.get_feature_names)
 
-    ct = ColumnTransformer([('trans', DictVectorizer(), 0)])
+    ct = ColumnTransformer([('trans', DictVectorizer(), 0)],
+                           remainder='passthrough')
     ct.fit(X)
     assert_raise_message(
         NotImplementedError, 'get_feature_names is not yet supported',
@@ -552,23 +553,22 @@ def test_column_transformer_remainder():
     X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
     X_res_both = X_array
 
-    # default passthrough
-    ct = ColumnTransformer([('trans', Trans(), [0])])
-    assert_array_equal(ct.fit_transform(X_array), X_res_both)
-    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
+    # default drop
+    ct = ColumnTransformer([('trans1', Trans(), [0])])
+    assert_array_equal(ct.fit_transform(X_array), X_res_first)
+    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
     assert len(ct.transformers_) == 2
     assert ct.transformers_[-1][0] == 'remainder'
-    assert ct.transformers_[-1][1] == 'passthrough'
+    assert ct.transformers_[-1][1] == 'drop'
     assert_array_equal(ct.transformers_[-1][2], [1])
 
-    # specify to drop remaining columns
-    ct = ColumnTransformer([('trans1', Trans(), [0])],
-                           remainder='drop')
-    assert_array_equal(ct.fit_transform(X_array), X_res_first)
-    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
+    # specify passthrough
+    ct = ColumnTransformer([('trans', Trans(), [0])], remainder='passthrough')
+    assert_array_equal(ct.fit_transform(X_array), X_res_both)
+    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
     assert len(ct.transformers_) == 2
     assert ct.transformers_[-1][0] == 'remainder'
-    assert ct.transformers_[-1][1] == 'drop'
+    assert ct.transformers_[-1][1] == 'passthrough'
     assert_array_equal(ct.transformers_[-1][2], [1])
 
     # column order is not preserved (passed through added to end)
@@ -602,6 +602,9 @@ def test_column_transformer_remainder():
         "remainder keyword needs to be one of \'drop\', \'passthrough\', "
         "or estimator.", ct.fit_transform, X_array)
 
+    # check default for make_column_transformer
+    ct = make_column_transformer(([0], Trans()))
+    assert ct.remainder == 'drop'
 
 @pytest.mark.parametrize("key", [[0], np.array([0]), slice(0, 1),
                                  np.array([True, False])])