
[MRG+1] SimpleImputer(strategy="constant") #11211

Merged: 33 commits, Jun 20, 2018

Commits (33)
74384c6
added tests for constant impute strategy in simpleImputer
jeremiedbb Jun 6, 2018
6dd6a5e
typos
jeremiedbb Jun 6, 2018
2b101fb
typos
jeremiedbb Jun 6, 2018
6e13e68
typos
jeremiedbb Jun 6, 2018
f300fe2
added constant strategy to the SimpleImputer.
jeremiedbb Jun 6, 2018
35e30ac
bug fixes on the SimpleImputer and change for default value to np.nan…
jeremiedbb Jun 7, 2018
ea4a929
object dtypes support for "most_frequent" strategy in SimpleImputer
jeremiedbb Jun 7, 2018
10f165b
minor fixes regarding the change of default missing_values="NaN" to n…
jeremiedbb Jun 11, 2018
a6c33b1
Changed the test in estimator_check to allow np.nan as default value …
jeremiedbb Jun 11, 2018
9c2a407
fix for older versions of numpy
jeremiedbb Jun 11, 2018
df8608b
.
jeremiedbb Jun 11, 2018
4517687
fix for old versions of numpy v2
jeremiedbb Jun 12, 2018
1f1c6a0
minor fixes and added doc example for categorical inputs
jeremiedbb Jun 13, 2018
2f6d0b1
DOCTEST fix printing estimator
glemaitre Jun 13, 2018
64fa1bc
Merge remote-tracking branch 'origin/master' into jeremiedbb-constant…
glemaitre Jun 13, 2018
3884d4e
EXA fix example using constant strategy
glemaitre Jun 13, 2018
72eb6b5
COSMIT
glemaitre Jun 13, 2018
cc9aa6f
COSMIT
glemaitre Jun 13, 2018
d4e5226
adressed @glemaitre remarks
jeremiedbb Jun 14, 2018
6efd122
small corrections
jeremiedbb Jun 14, 2018
fbaaa38
small corrections
jeremiedbb Jun 14, 2018
724a4a1
fixed np.nan is not np.float('nan') issue
jeremiedbb Jun 15, 2018
e5f4a1b
add tests for is_scalar_nan
jeremiedbb Jun 15, 2018
d69f855
fixed
jeremiedbb Jun 15, 2018
c3a730d
fixed v2
jeremiedbb Jun 15, 2018
94d7964
adressed @jnothman remark
jeremiedbb Jun 15, 2018
20456f4
add tests for warnings and errors catch
jeremiedbb Jun 17, 2018
e2ae626
Merge branch 'master' into constant-imputer
jeremiedbb Jun 18, 2018
7d3d1b5
dtype checks modifications + more tests
jeremiedbb Jun 18, 2018
972668b
fixed exception catching + go back to not allow any but object dtype
jeremiedbb Jun 18, 2018
f1da7b8
error message update
jeremiedbb Jun 20, 2018
fb1a4e9
with tests update is better
jeremiedbb Jun 20, 2018
c8246f2
TypeError -> ValueError
jeremiedbb Jun 20, 2018
9 changes: 9 additions & 0 deletions doc/conftest.py
@@ -62,6 +62,13 @@ def setup_compose():
raise SkipTest("Skipping compose.rst, pandas not installed")


def setup_impute():
try:
import pandas # noqa
except ImportError:
raise SkipTest("Skipping impute.rst, pandas not installed")


def pytest_runtest_setup(item):
fname = item.fspath.strpath
if fname.endswith('datasets/labeled_faces.rst'):
@@ -76,6 +83,8 @@ def pytest_runtest_setup(item):
setup_working_with_text_data()
elif fname.endswith('modules/compose.rst'):
setup_compose()
elif fname.endswith('modules/impute.rst'):
setup_impute()


def pytest_runtest_teardown(item):
32 changes: 25 additions & 7 deletions doc/modules/impute.rst
@@ -20,19 +20,20 @@ Univariate feature imputation
=============================

The :class:`SimpleImputer` class provides basic strategies for imputing missing
-values, either using the mean, the median or the most frequent value of
-the row or column in which the missing values are located. This class
-also allows for different missing values encodings.
+values. Missing values can be imputed with a provided constant value, or using
+the statistics (mean, median or most frequent) of each column in which the
+missing values are located. This class also allows for different missing values
+encodings.

The following snippet demonstrates how to replace missing values,
encoded as ``np.nan``, using the mean value of the columns (axis 0)
that contain the missing values::

>>> import numpy as np
>>> from sklearn.impute import SimpleImputer
->>> imp = SimpleImputer(missing_values='NaN', strategy='mean')
+>>> imp = SimpleImputer(missing_values=np.nan, strategy='mean')
>>> imp.fit([[1, 2], [np.nan, 3], [7, 6]]) # doctest: +NORMALIZE_WHITESPACE
-SimpleImputer(copy=True, missing_values='NaN', strategy='mean', verbose=0)
+SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean', verbose=0)
>>> X = [[np.nan, 2], [6, np.nan], [7, 6]]
>>> print(imp.transform(X)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
[[4. 2. ]
@@ -45,7 +46,7 @@ The :class:`SimpleImputer` class also supports sparse matrices::
>>> X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
>>> imp = SimpleImputer(missing_values=0, strategy='mean')
>>> imp.fit(X) # doctest: +NORMALIZE_WHITESPACE
-SimpleImputer(copy=True, missing_values=0, strategy='mean', verbose=0)
+SimpleImputer(copy=True, fill_value=None, missing_values=0, strategy='mean', verbose=0)
>>> X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
>>> print(imp.transform(X_test)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
[[4. 2. ]
@@ -56,6 +57,23 @@ Note that, here, missing values are encoded by 0 and are thus implicitly stored
in the matrix. This format is thus suitable when there are many more missing
values than observed values.

The :class:`SimpleImputer` class also supports categorical data represented as
string values or pandas categoricals when using the ``'most_frequent'`` or
``'constant'`` strategy::

>>> import pandas as pd
>>> df = pd.DataFrame([["a", "x"],
... [np.nan, "y"],
... ["a", np.nan],
... ["b", "y"]], dtype="category")
...
>>> imp = SimpleImputer(strategy="most_frequent")
>>> print(imp.fit_transform(df)) # doctest: +NORMALIZE_WHITESPACE
[['a' 'x']
['a' 'y']
['a' 'y']
['b' 'y']]
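The new ``'constant'`` strategy accepts string data as well. A minimal sketch, assuming scikit-learn with this PR merged (0.20 or later); the array and the ``fill_value`` token are illustrative:

```python
import numpy as np
from sklearn.impute import SimpleImputer

# Object-dtype array so that strings and np.nan can coexist;
# np.nan marks the missing entries.
X = np.array([["a", "x"],
              [np.nan, "y"],
              ["b", np.nan]], dtype=object)

# Every missing cell is replaced by the same fixed token.
imp = SimpleImputer(strategy="constant", fill_value="missing")
print(imp.fit_transform(X))
```

Unlike ``'most_frequent'``, the result does not depend on the distribution of the observed values, which makes the imputed token easy to spot downstream (for instance as its own one-hot category).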

.. _mice:

Multivariate feature imputation
@@ -76,7 +94,7 @@ Here is an example snippet::
>>> imp = MICEImputer(n_imputations=10, random_state=0)
>>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]])
MICEImputer(imputation_order='ascending', initial_strategy='mean',
-max_value=None, min_value=None, missing_values='NaN', n_burn_in=10,
+max_value=None, min_value=None, missing_values=nan, n_burn_in=10,
n_imputations=10, n_nearest_features=None, predictor=None,
random_state=0, verbose=False)
>>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
11 changes: 11 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -613,6 +613,17 @@ Imputer
SimpleImputer().fit_transform(X.T).T)``). :issue:`10829` by :user:`Guillaume
Lemaitre <glemaitre>` and :user:`Gilberto Olimpio <gilbertoolimpio>`.

- The :class:`impute.SimpleImputer` has a new strategy, ``'constant'``, to
complete missing values with a fixed one, given by the ``fill_value``
parameter. This strategy supports numeric and non-numeric data, and so does
the ``'most_frequent'`` strategy now. :issue:`11211` by :user:`Jeremie du
Boisberranger <jeremiedbb>`.

- The NaN marker for the missing values has been changed between the
:class:`preprocessing.Imputer` and the :class:`impute.SimpleImputer`.
``missing_values='NaN'`` should now be ``missing_values=np.nan``.
:issue:`11211` by :user:`Jeremie du Boisberranger <jeremiedbb>`.
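Combining the two entries above, a minimal sketch of the new usage on numeric data (``fill_value=0`` is spelled out here, although 0 is also the documented numeric default):

```python
import numpy as np
from sklearn.impute import SimpleImputer

X = [[np.nan, 2], [6, np.nan], [7, 6]]

# missing_values is now the np.nan scalar, not the old 'NaN' string marker.
imp = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
print(imp.fit_transform(X))
```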

Outlier Detection models

- More consistent outlier detection API:
@@ -27,14 +27,16 @@
from __future__ import print_function

import pandas as pd
import numpy as np

-from sklearn.compose import make_column_transformer
-from sklearn.pipeline import make_pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, CategoricalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Read data from Titanic dataset.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
@@ -49,36 +51,37 @@
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.
-numeric_features = ['age', 'fare']
-categorical_features = ['embarked', 'sex', 'pclass']

-# Provisionally, use pd.fillna() to impute missing values for categorical
-# features; SimpleImputer will eventually support strategy="constant".
-data[categorical_features] = data[categorical_features].fillna(value='missing')

# We create the preprocessing pipelines for both numeric and categorical data.
-numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
-categorical_transformer = CategoricalEncoder('onehot-dense',
-                                             handle_unknown='ignore')
+numeric_features = ['age', 'fare']
+numeric_transformer = Pipeline(steps=[
+    ('imputer', SimpleImputer(strategy='median')),
+    ('scaler', StandardScaler())])
+
+categorical_features = ['embarked', 'sex', 'pclass']
+categorical_transformer = Pipeline(steps=[
+    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
+    ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])

-preprocessing_pl = make_column_transformer(
-    (numeric_features, numeric_transformer),
-    (categorical_features, categorical_transformer),
-    remainder='drop'
-)
+preprocessor = ColumnTransformer(
+    transformers=[
+        ('num', numeric_transformer, numeric_features),
+        ('cat', categorical_transformer, categorical_features)],
+    remainder='drop')

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
-clf = make_pipeline(preprocessing_pl, LogisticRegression())
+clf = Pipeline(steps=[('preprocessor', preprocessor),
+                      ('classifier', LogisticRegression())])

X = data.drop('survived', axis=1)
-y = data.survived.values
+y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
shuffle=True)

clf.fit(X_train, y_train)
-print("model score: %f" % clf.score(X_test, y_test))
+print("model score: %.3f" % clf.score(X_test, y_test))


###############################################################################
@@ -93,12 +96,12 @@


param_grid = {
-    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
-    'logisticregression__C': [0.1, 1.0, 1.0],
+    'preprocessor__num__imputer__strategy': ['mean', 'median'],
+    'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
grid_search.fit(X_train, y_train)

-print(("best logistic regression from grid search: %f"
+print(("best logistic regression from grid search: %.3f"
% grid_search.score(X_test, y_test)))