diff --git a/doc/conf.py b/doc/conf.py
index 27a6bf2ee30c2..c736adc8e267e 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -263,9 +263,9 @@
     'sphx_glr_plot_compare_methods_001.png': 349}


-# enable experimental module so that the new GBDTs estimators can be
+# enable experimental module so that experimental estimators can be
 # discovered properly by sphinx
-from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.experimental import *  # noqa


 def make_carousel_thumbs(app, exception):
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index d61b519a9171e..5f7bb2026021c 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -471,6 +471,7 @@ Samples generator
    :toctree: generated/

    experimental.enable_hist_gradient_boosting
+   experimental.enable_iterative_imputer

 .. _feature_extraction_ref:
diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 777a2bd157b29..4cd0ea6e85d60 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -105,7 +105,16 @@ of ``y``. This is done for each feature in an iterative fashion, and then is
 repeated for ``max_iter`` imputation rounds. The results of the final
 imputation round are returned.

+.. note::
+
+   This estimator is still **experimental** for now: the predictions
+   and the API might change without any deprecation cycle. To use it,
+   you need to explicitly import ``enable_iterative_imputer``.
+
+::
+
     >>> import numpy as np
+    >>> from sklearn.experimental import enable_iterative_imputer
     >>> from sklearn.impute import IterativeImputer
     >>> imp = IterativeImputer(max_iter=10, random_state=0)
     >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])  # doctest: +NORMALIZE_WHITESPACE
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 299ed158797af..73ebaacfd44a0 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -216,7 +216,7 @@ Support for Python 3.4 and below has been officially dropped.
     >>> # explicitly require this experimental feature
     >>> from sklearn.experimental import enable_hist_gradient_boosting  # noqa
-    >>> # now you can import normally from ensemble
+    >>> # now you can import normally from sklearn.ensemble
     >>> from sklearn.ensemble import HistGradientBoostingClassifier

   :pr:`12807` by :user:`Nicolas Hug`.
@@ -358,6 +358,15 @@ Support for Python 3.4 and below has been officially dropped.
   :pr:`12177` by :user:`Sergey Feldman ` and
   :user:`Ben Lawson `.

+  The API of IterativeImputer is experimental and subject to change without any
+  deprecation cycle. To use it, you need to explicitly import
+  ``enable_iterative_imputer``::
+
+    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
+    >>> # now you can import normally from sklearn.impute
+    >>> from sklearn.impute import IterativeImputer
+
 - |Feature| The :class:`impute.SimpleImputer` and
   :class:`impute.IterativeImputer` have a new parameter ``'add_indicator'``,
   which simply stacks a :class:`impute.MissingIndicator` transform into the
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index 382e8de304be8..06fab08c381f2 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -42,6 +42,8 @@
 import matplotlib.pyplot as plt
 import pandas as pd

+# To use this experimental feature, we need to explicitly ask for it:
+from sklearn.experimental import enable_iterative_imputer  # noqa
 from sklearn.datasets import fetch_california_housing
 from sklearn.impute import SimpleImputer
 from sklearn.impute import IterativeImputer
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 897b66aad246c..2d2d37745abf3 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -23,6 +23,8 @@
 import numpy as np
 import matplotlib.pyplot as plt

+# To use the experimental IterativeImputer, we need to explicitly ask for it:
+from sklearn.experimental import enable_iterative_imputer  # noqa
 from sklearn.datasets import load_diabetes
 from sklearn.datasets import load_boston
 from sklearn.ensemble import RandomForestRegressor
diff --git a/sklearn/experimental/enable_iterative_imputer.py b/sklearn/experimental/enable_iterative_imputer.py
new file mode 100644
index 0000000000000..2f262141cc069
--- /dev/null
+++ b/sklearn/experimental/enable_iterative_imputer.py
@@ -0,0 +1,19 @@
+"""Enables IterativeImputer
+
+The API and results of this estimator might change without any deprecation
+cycle.
+
+Importing this file dynamically sets :class:`sklearn.impute.IterativeImputer`
+as an attribute of the impute module::
+
+    >>> # explicitly require this experimental feature
+    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
+    >>> # now you can import normally from impute
+    >>> from sklearn.impute import IterativeImputer
+"""
+
+from ..impute._iterative import IterativeImputer
+from .. import impute
+
+impute.IterativeImputer = IterativeImputer
+impute.__all__ += ['IterativeImputer']
diff --git a/sklearn/experimental/tests/test_enable_iterative_imputer.py b/sklearn/experimental/tests/test_enable_iterative_imputer.py
new file mode 100644
index 0000000000000..17579e0c43612
--- /dev/null
+++ b/sklearn/experimental/tests/test_enable_iterative_imputer.py
@@ -0,0 +1,39 @@
+"""Tests for making sure experimental imports work as expected."""
+
+import textwrap
+
+from sklearn.utils.testing import assert_run_python_script
+
+
+def test_imports_strategies():
+    # Make sure different import strategies work or fail as expected.
+
+    # Since Python caches the imported modules, we need to run a child process
+    # for every test case. Else, the tests would not be independent
+    # (manually removing the imports from the cache (sys.modules) is not
+    # recommended and can lead to many complications).
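+    # assert_run_python_script runs each snippet below in its own fresh
+    # Python process, so import state cannot leak between the cases.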
+ + good_import = """ + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(good_import)) + + good_import_with_ensemble_first = """ + import sklearn.ensemble + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(good_import_with_ensemble_first)) + + bad_imports = """ + import pytest + + with pytest.raises(ImportError): + from sklearn.impute import IterativeImputer + + import sklearn.experimental + with pytest.raises(ImportError): + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(bad_imports)) diff --git a/sklearn/impute.py b/sklearn/impute.py deleted file mode 100644 index 8bbf1bb94e242..0000000000000 --- a/sklearn/impute.py +++ /dev/null @@ -1,1339 +0,0 @@ -"""Transformers for missing value imputation""" -# Authors: Nicolas Tresegnie -# Sergey Feldman -# License: BSD 3 clause - -from __future__ import division - -import warnings -import numbers -from time import time -from distutils.version import LooseVersion - -import numpy as np -import numpy.ma as ma -import scipy -from scipy import sparse -from scipy import stats -from collections import namedtuple - -from .base import BaseEstimator, TransformerMixin -from .base import clone -from .exceptions import ConvergenceWarning -from .preprocessing import normalize -from .utils import check_array, check_random_state, safe_indexing -from .utils.sparsefuncs import _get_median -from .utils.validation import check_is_fitted -from .utils.validation import FLOAT_DTYPES -from .utils.fixes import _object_dtype_isnan -from .utils import is_scalar_nan - - -ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', - 'neighbor_feat_idx', - 'estimator']) - -__all__ = [ - 'MissingIndicator', - 'SimpleImputer', - 'IterativeImputer', -] - - -def _check_inputs_dtype(X, missing_values): - if (X.dtype.kind in ("f", "i", "u") and - not isinstance(missing_values, numbers.Real)): - raise ValueError("'X' and 'missing_values' types are expected to be" - " both numerical. Got X.dtype={} and " - " type(missing_values)={}." - .format(X.dtype, type(missing_values))) - - -def _get_mask(X, value_to_mask): - """Compute the boolean mask X == missing_values.""" - if is_scalar_nan(value_to_mask): - if X.dtype.kind == "f": - return np.isnan(X) - elif X.dtype.kind in ("i", "u"): - # can't have NaNs in integer array. - return np.zeros(X.shape, dtype=bool) - else: - # np.isnan does not work on object dtypes. - return _object_dtype_isnan(X) - else: - # X == value_to_mask with object dytpes does not always perform - # element-wise for old versions of numpy - return np.equal(X, value_to_mask) - - -def _most_frequent(array, extra_value, n_repeat): - """Compute the most frequent value in a 1d array extended with - [extra_value] * n_repeat, where extra_value is assumed to be not part - of the array.""" - # Compute the most frequent value in array only - if array.size > 0: - with warnings.catch_warnings(): - # stats.mode raises a warning when input array contains objects due - # to incapacity to detect NaNs. Irrelevant here since input array - # has already been NaN-masked. 
- warnings.simplefilter("ignore", RuntimeWarning) - mode = stats.mode(array) - - most_frequent_value = mode[0][0] - most_frequent_count = mode[1][0] - else: - most_frequent_value = 0 - most_frequent_count = 0 - - # Compare to array + [extra_value] * n_repeat - if most_frequent_count == 0 and n_repeat == 0: - return np.nan - elif most_frequent_count < n_repeat: - return extra_value - elif most_frequent_count > n_repeat: - return most_frequent_value - elif most_frequent_count == n_repeat: - # Ties the breaks. Copy the behaviour of scipy.stats.mode - if most_frequent_value < extra_value: - return most_frequent_value - else: - return extra_value - - -class SimpleImputer(BaseEstimator, TransformerMixin): - """Imputation transformer for completing missing values. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - missing_values : number, string, np.nan (default) or None - The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. - - strategy : string, optional (default="mean") - The imputation strategy. - - - If "mean", then replace missing values using the mean along - each column. Can only be used with numeric data. - - If "median", then replace missing values using the median along - each column. Can only be used with numeric data. - - If "most_frequent", then replace missing using the most frequent - value along each column. Can be used with strings or numeric data. - - If "constant", then replace missing values with fill_value. Can be - used with strings or numeric data. - - .. versionadded:: 0.20 - strategy="constant" for fixed value imputation. - - fill_value : string or numerical value, optional (default=None) - When strategy == "constant", fill_value is used to replace all - occurrences of missing_values. - If left to the default, fill_value will be 0 when imputing numerical - data and "missing_value" for strings or object data types. - - verbose : integer, optional (default=0) - Controls the verbosity of the imputer. - - copy : boolean, optional (default=True) - If True, a copy of X will be created. If False, imputation will - be done in-place whenever possible. Note that, in the following cases, - a new copy will always be made, even if `copy=False`: - - - If X is not an array of floating values; - - If X is encoded as a CSR matrix; - - If add_indicator=True. - - add_indicator : boolean, optional (default=False) - If True, a `MissingIndicator` transform will stack onto output - of the imputer's transform. This allows a predictive estimator - to account for missingness despite imputation. If a feature has no - missing values at fit/train time, the feature won't appear on - the missing indicator even if there are missing values at - transform/test time. - - Attributes - ---------- - statistics_ : array of shape (n_features,) - The imputation fill value for each feature. - - indicator_ : :class:`sklearn.impute.MissingIndicator` - Indicator used to add binary indicators for missing values. - ``None`` if add_indicator is False. - - See also - -------- - IterativeImputer : Multivariate imputation of missing values. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import SimpleImputer - >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') - >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) - ... 
# doctest: +NORMALIZE_WHITESPACE - SimpleImputer(add_indicator=False, copy=True, fill_value=None, - missing_values=nan, strategy='mean', verbose=0) - >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] - >>> print(imp_mean.transform(X)) - ... # doctest: +NORMALIZE_WHITESPACE - [[ 7. 2. 3. ] - [ 4. 3.5 6. ] - [10. 3.5 9. ]] - - Notes - ----- - Columns which only contained missing values at `fit` are discarded upon - `transform` if strategy is not "constant". - - """ - def __init__(self, missing_values=np.nan, strategy="mean", - fill_value=None, verbose=0, copy=True, add_indicator=False): - self.missing_values = missing_values - self.strategy = strategy - self.fill_value = fill_value - self.verbose = verbose - self.copy = copy - self.add_indicator = add_indicator - - def _validate_input(self, X): - allowed_strategies = ["mean", "median", "most_frequent", "constant"] - if self.strategy not in allowed_strategies: - raise ValueError("Can only use these strategies: {0} " - " got strategy={1}".format(allowed_strategies, - self.strategy)) - - if self.strategy in ("most_frequent", "constant"): - dtype = None - else: - dtype = FLOAT_DTYPES - - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - - try: - X = check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) - except ValueError as ve: - if "could not convert" in str(ve): - raise ValueError("Cannot use {0} strategy with non-numeric " - "data. Received datatype :{1}." - "".format(self.strategy, X.dtype.kind)) - else: - raise ve - - _check_inputs_dtype(X, self.missing_values) - if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("SimpleImputer does not support data with dtype " - "{0}. Please provide either a numeric array (with" - " a floating point or integer dtype) or " - "categorical data represented either as an array " - "with integer dtype or an array of string values " - "with an object dtype.".format(X.dtype)) - - return X - - def fit(self, X, y=None): - """Fit the imputer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : SimpleImputer - """ - X = self._validate_input(X) - - # default fill_value is 0 for numerical input and "missing_value" - # otherwise - if self.fill_value is None: - if X.dtype.kind in ("i", "u", "f"): - fill_value = 0 - else: - fill_value = "missing_value" - else: - fill_value = self.fill_value - - # fill_value should be numerical in case of numerical input - if (self.strategy == "constant" and - X.dtype.kind in ("i", "u", "f") and - not isinstance(fill_value, numbers.Real)): - raise ValueError("'fill_value'={0} is invalid. Expected a " - "numerical value when imputing numerical " - "data".format(fill_value)) - - if sparse.issparse(X): - # missing_values = 0 not allowed with sparse data as it would - # force densification - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. 
Provide a dense " - "array instead.") - else: - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - fill_value) - else: - self.statistics_ = self._dense_fit(X, - self.strategy, - self.missing_values, - fill_value) - - if self.add_indicator: - self.indicator_ = MissingIndicator( - missing_values=self.missing_values) - self.indicator_.fit(X) - else: - self.indicator_ = None - - return self - - def _sparse_fit(self, X, strategy, missing_values, fill_value): - """Fit the transformer on sparse data.""" - mask_data = _get_mask(X.data, missing_values) - n_implicit_zeros = X.shape[0] - np.diff(X.indptr) - - statistics = np.empty(X.shape[1]) - - if strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - statistics.fill(fill_value) - else: - for i in range(X.shape[1]): - column = X.data[X.indptr[i]:X.indptr[i + 1]] - mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] - column = column[~mask_column] - - # combine explicit and implicit zeros - mask_zeros = _get_mask(column, 0) - column = column[~mask_zeros] - n_explicit_zeros = mask_zeros.sum() - n_zeros = n_implicit_zeros[i] + n_explicit_zeros - - if strategy == "mean": - s = column.size + n_zeros - statistics[i] = np.nan if s == 0 else column.sum() / s - - elif strategy == "median": - statistics[i] = _get_median(column, - n_zeros) - - elif strategy == "most_frequent": - statistics[i] = _most_frequent(column, - 0, - n_zeros) - return statistics - - def _dense_fit(self, X, strategy, missing_values, fill_value): - """Fit the transformer on dense data.""" - mask = _get_mask(X, missing_values) - masked_X = ma.masked_array(X, mask=mask) - - # Mean - if strategy == "mean": - mean_masked = np.ma.mean(masked_X, axis=0) - # Avoid the warning "Warning: converting a masked element to nan." - mean = np.ma.getdata(mean_masked) - mean[np.ma.getmask(mean_masked)] = np.nan - - return mean - - # Median - elif strategy == "median": - median_masked = np.ma.median(masked_X, axis=0) - # Avoid the warning "Warning: converting a masked element to nan." - median = np.ma.getdata(median_masked) - median[np.ma.getmaskarray(median_masked)] = np.nan - - return median - - # Most frequent - elif strategy == "most_frequent": - # scipy.stats.mstats.mode cannot be used because it will no work - # properly if the first element is masked and if its frequency - # is equal to the frequency of the most frequent valid element - # See https://github.com/scipy/scipy/issues/2636 - - # To be able access the elements by columns - X = X.transpose() - mask = mask.transpose() - - if X.dtype.kind == "O": - most_frequent = np.empty(X.shape[0], dtype=object) - else: - most_frequent = np.empty(X.shape[0]) - - for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): - row_mask = np.logical_not(row_mask).astype(np.bool) - row = row[row_mask] - most_frequent[i] = _most_frequent(row, np.nan, 0) - - return most_frequent - - # Constant - elif strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - return np.full(X.shape[1], fill_value, dtype=X.dtype) - - def transform(self, X): - """Impute all missing values in X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. 
- """ - check_is_fitted(self, 'statistics_') - - X = self._validate_input(X) - - statistics = self.statistics_ - - if X.shape[1] != statistics.shape[0]: - raise ValueError("X has %d features per sample, expected %d" - % (X.shape[1], self.statistics_.shape[0])) - - if self.add_indicator: - X_trans_indicator = self.indicator_.transform(X) - - # Delete the invalid columns if strategy is not constant - if self.strategy == "constant": - valid_statistics = statistics - else: - # same as np.isnan but also works for object dtypes - invalid_mask = _get_mask(statistics, np.nan) - valid_mask = np.logical_not(invalid_mask) - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.flatnonzero(valid_mask) - - if invalid_mask.any(): - missing = np.arange(X.shape[1])[invalid_mask] - if self.verbose: - warnings.warn("Deleting features without " - "observed values: %s" % missing) - X = X[:, valid_statistics_indexes] - - # Do actual imputation - if sparse.issparse(X): - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - mask = _get_mask(X.data, self.missing_values) - indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), - np.diff(X.indptr))[mask] - - X.data[mask] = valid_statistics[indexes].astype(X.dtype, - copy=False) - else: - mask = _get_mask(X, self.missing_values) - n_missing = np.sum(mask, axis=0) - values = np.repeat(valid_statistics, n_missing) - coordinates = np.where(mask.transpose())[::-1] - - X[coordinates] = values - - if self.add_indicator: - hstack = sparse.hstack if sparse.issparse(X) else np.hstack - X = hstack((X, X_trans_indicator)) - - return X - - def _more_tags(self): - return {'allow_nan': True} - - -class IterativeImputer(BaseEstimator, TransformerMixin): - """Multivariate imputer that estimates each feature from all the others. - - A strategy for imputing missing values by modeling each feature with - missing values as a function of other features in a round-robin fashion. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object, default=BayesianRidge() - The estimator to use at each step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. - - missing_values : int, np.nan, optional (default=np.nan) - The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. - - sample_posterior : boolean, default=False - Whether to sample from the (Gaussian) predictive posterior of the - fitted estimator for each imputation. Estimator must support - ``return_std`` in its ``predict`` method if set to ``True``. Set to - ``True`` if using ``IterativeImputer`` for multiple imputations. - - max_iter : int, optional (default=10) - Maximum number of imputation rounds to perform before returning the - imputations computed during the final round. A round is a single - imputation of each feature with missing values. The stopping criterion - is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, - where `X_t` is `X` at iteration `t. Note that early stopping is only - applied if ``sample_posterior=False``. - - tol : float, optional (default=1e-3) - Tolerance of the stopping condition. - - n_nearest_features : int, optional (default=None) - Number of other features to use to estimate the missing values of - each feature column. 
Nearness between features is measured using - the absolute correlation coefficient between each feature pair (after - initial imputation). To ensure coverage of features throughout the - imputation process, the neighbor features are not necessarily nearest, - but are drawn with probability proportional to correlation for each - imputed target feature. Can provide significant speed-up when the - number of features is huge. If ``None``, all features will be used. - - initial_strategy : str, optional (default="mean") - Which strategy to use to initialize the missing values. Same as the - ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` - Valid values: {"mean", "median", "most_frequent", or "constant"}. - - imputation_order : str, optional (default="ascending") - The order in which the features will be imputed. Possible values: - - "ascending" - From features with fewest missing values to most. - "descending" - From features with most missing values to fewest. - "roman" - Left to right. - "arabic" - Right to left. - "random" - A random order for each round. - - min_value : float, optional (default=None) - Minimum possible imputed value. Default of ``None`` will set minimum - to negative infinity. - - max_value : float, optional (default=None) - Maximum possible imputed value. Default of ``None`` will set maximum - to positive infinity. - - verbose : int, optional (default=0) - Verbosity flag, controls the debug messages that are issued - as functions are evaluated. The higher, the more verbose. Can be 0, 1, - or 2. - - random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use. Randomizes - selection of estimator features if n_nearest_features is not None, the - ``imputation_order`` if ``random``, and the sampling from posterior if - ``sample_posterior`` is True. Use an integer for determinism. - See :term:`the Glossary `. - - add_indicator : boolean, optional (default=False) - If True, a `MissingIndicator` transform will stack onto output - of the imputer's transform. This allows a predictive estimator - to account for missingness despite imputation. If a feature has no - missing values at fit/train time, the feature won't appear on - the missing indicator even if there are missing values at - transform/test time. - - Attributes - ---------- - initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` - Imputer used to initialize the missing values. - - imputation_sequence_ : list of tuples - Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where - ``feat_idx`` is the current feature to be imputed, - ``neighbor_feat_idx`` is the array of other features used to impute the - current feature, and ``estimator`` is the trained estimator used for - the imputation. Length is ``self.n_features_with_missing_ * - self.n_iter_``. - - n_iter_ : int - Number of iteration rounds that occurred. Will be less than - ``self.max_iter`` if early stopping criterion was reached. - - n_features_with_missing_ : int - Number of features with missing values. - - indicator_ : :class:`sklearn.impute.MissingIndicator` - Indicator used to add binary indicators for missing values. - ``None`` if add_indicator is False. - - See also - -------- - SimpleImputer : Univariate imputation of missing values. - - Notes - ----- - To support imputation in inductive mode we store each feature's estimator - during the ``fit`` phase, and predict without refitting (in order) during - the ``transform`` phase. 
- - Features which contain all missing values at ``fit`` are discarded upon - ``transform``. - - Features with missing values during ``transform`` which did not have any - missing values during ``fit`` will be imputed with the initial imputation - method only. - - References - ---------- - .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: - Multivariate Imputation by Chained Equations in R". Journal of - Statistical Software 45: 1-67. - `_ - - .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in - Multivariate Data Suitable for use with an Electronic Computer". - Journal of the Royal Statistical Society 22(2): 302-306. - `_ - """ - - def __init__(self, - estimator=None, - missing_values=np.nan, - sample_posterior=False, - max_iter=10, - tol=1e-3, - n_nearest_features=None, - initial_strategy="mean", - imputation_order='ascending', - min_value=None, - max_value=None, - verbose=0, - random_state=None, - add_indicator=False): - - self.estimator = estimator - self.missing_values = missing_values - self.sample_posterior = sample_posterior - self.max_iter = max_iter - self.tol = tol - self.n_nearest_features = n_nearest_features - self.initial_strategy = initial_strategy - self.imputation_order = imputation_order - self.min_value = min_value - self.max_value = max_value - self.verbose = verbose - self.random_state = random_state - self.add_indicator = add_indicator - - def _impute_one_feature(self, - X_filled, - mask_missing_values, - feat_idx, - neighbor_feat_idx, - estimator=None, - fit_mode=True): - """Impute a single feature from the others provided. - - This function predicts the missing values of one of the features using - the current estimates of all the other features. The ``estimator`` must - support ``return_std=True`` in its ``predict`` method for this function - to work. - - Parameters - ---------- - X_filled : ndarray - Input data with the most recent imputations. - - mask_missing_values : ndarray - Input data's missing indicator matrix. - - feat_idx : int - Index of the feature currently being imputed. - - neighbor_feat_idx : ndarray - Indices of the features to be used in imputing ``feat_idx``. - - estimator : object - The estimator to use at this step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. - If None, it will be cloned from self._estimator. - - fit_mode : boolean, default=True - Whether to fit and predict with the estimator or just predict. - - Returns - ------- - X_filled : ndarray - Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. - - estimator : estimator with sklearn API - The fitted estimator used to impute - ``X_filled[missing_row_mask, feat_idx]``. 
- """ - - # if nothing is missing, just return the default - # (should not happen at fit time because feat_ids would be excluded) - missing_row_mask = mask_missing_values[:, feat_idx] - if not np.any(missing_row_mask): - return X_filled, estimator - - if estimator is None and fit_mode is False: - raise ValueError("If fit_mode is False, then an already-fitted " - "estimator should be passed in.") - - if estimator is None: - estimator = clone(self._estimator) - - if fit_mode: - X_train = safe_indexing(X_filled[:, neighbor_feat_idx], - ~missing_row_mask) - y_train = safe_indexing(X_filled[:, feat_idx], - ~missing_row_mask) - estimator.fit(X_train, y_train) - - # get posterior samples - X_test = safe_indexing(X_filled[:, neighbor_feat_idx], - missing_row_mask) - if self.sample_posterior: - mus, sigmas = estimator.predict(X_test, return_std=True) - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - # two types of problems: (1) non-positive sigmas, (2) mus outside - # legal range of min_value and max_value (results in inf sample) - positive_sigmas = sigmas > 0 - imputed_values[~positive_sigmas] = mus[~positive_sigmas] - mus_too_low = mus < self._min_value - imputed_values[mus_too_low] = self._min_value - mus_too_high = mus > self._max_value - imputed_values[mus_too_high] = self._max_value - # the rest can be sampled without statistical issues - inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high - mus = mus[inrange_mask] - sigmas = sigmas[inrange_mask] - a = (self._min_value - mus) / sigmas - b = (self._max_value - mus) / sigmas - - if scipy.__version__ < LooseVersion('0.18'): - # bug with vector-valued `a` in old scipy - imputed_values[inrange_mask] = [ - stats.truncnorm(a=a_, b=b_, - loc=loc_, scale=scale_).rvs( - random_state=self.random_state_) - for a_, b_, loc_, scale_ - in zip(a, b, mus, sigmas)] - else: - truncated_normal = stats.truncnorm(a=a, b=b, - loc=mus, scale=sigmas) - imputed_values[inrange_mask] = truncated_normal.rvs( - random_state=self.random_state_) - else: - imputed_values = estimator.predict(X_test) - imputed_values = np.clip(imputed_values, - self._min_value, - self._max_value) - - # update the feature - X_filled[missing_row_mask, feat_idx] = imputed_values - return X_filled, estimator - - def _get_neighbor_feat_idx(self, - n_features, - feat_idx, - abs_corr_mat): - """Get a list of other features to predict ``feat_idx``. - - If self.n_nearest_features is less than or equal to the total - number of features, then use a probability proportional to the absolute - correlation between ``feat_idx`` and each other feature to randomly - choose a subsample of the other features (without replacement). - - Parameters - ---------- - n_features : int - Number of features in ``X``. - - feat_idx : int - Index of the feature currently being imputed. - - abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X``. The diagonal has been zeroed - out and each feature has been normalized to sum to 1. Can be None. - - Returns - ------- - neighbor_feat_idx : array-like - The features to use to impute ``feat_idx``. 
- """ - if (self.n_nearest_features is not None and - self.n_nearest_features < n_features): - p = abs_corr_mat[:, feat_idx] - neighbor_feat_idx = self.random_state_.choice( - np.arange(n_features), self.n_nearest_features, replace=False, - p=p) - else: - inds_left = np.arange(feat_idx) - inds_right = np.arange(feat_idx + 1, n_features) - neighbor_feat_idx = np.concatenate((inds_left, inds_right)) - return neighbor_feat_idx - - def _get_ordered_idx(self, mask_missing_values): - """Decide in what order we will update the features. - - As a homage to the MICE R package, we will have 4 main options of - how to order the updates, and use a random order if anything else - is specified. - - Also, this function skips features which have no missing values. - - Parameters - ---------- - mask_missing_values : array-like, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. - - Returns - ------- - ordered_idx : ndarray, shape (n_features,) - The order in which to impute the features. - """ - frac_of_missing_values = mask_missing_values.mean(axis=0) - missing_values_idx = np.nonzero(frac_of_missing_values)[0] - if self.imputation_order == 'roman': - ordered_idx = missing_values_idx - elif self.imputation_order == 'arabic': - ordered_idx = missing_values_idx[::-1] - elif self.imputation_order == 'ascending': - n = len(frac_of_missing_values) - len(missing_values_idx) - ordered_idx = np.argsort(frac_of_missing_values, - kind='mergesort')[n:][::-1] - elif self.imputation_order == 'descending': - n = len(frac_of_missing_values) - len(missing_values_idx) - ordered_idx = np.argsort(frac_of_missing_values, - kind='mergesort')[n:] - elif self.imputation_order == 'random': - ordered_idx = missing_values_idx - self.random_state_.shuffle(ordered_idx) - else: - raise ValueError("Got an invalid imputation order: '{0}'. It must " - "be one of the following: 'roman', 'arabic', " - "'ascending', 'descending', or " - "'random'.".format(self.imputation_order)) - return ordered_idx - - def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): - """Get absolute correlation matrix between features. - - Parameters - ---------- - X_filled : ndarray, shape (n_samples, n_features) - Input data with the most recent imputations. - - tolerance : float, optional (default=1e-6) - ``abs_corr_mat`` can have nans, which will be replaced - with ``tolerance``. - - Returns - ------- - abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X`` at the beginning of the - current round. The diagonal has been zeroed out and each feature's - absolute correlations with all others have been normalized to sum - to 1. - """ - n_features = X_filled.shape[1] - if (self.n_nearest_features is None or - self.n_nearest_features >= n_features): - return None - abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) - # np.corrcoef is not defined for features with zero std - abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance - # ensures exploration, i.e. at least some probability of sampling - np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) - # features are not their own neighbors - np.fill_diagonal(abs_corr_mat, 0) - # needs to sum to 1 for np.random.choice sampling - abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) - return abs_corr_mat - - def _initial_imputation(self, X): - """Perform initial imputation for input X. 
- - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - Returns - ------- - Xt : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - X_filled : ndarray, shape (n_samples, n_features) - Input data with the most recent imputations. - - mask_missing_values : ndarray, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. - """ - if is_scalar_nan(self.missing_values): - force_all_finite = "allow-nan" - else: - force_all_finite = True - - X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) - - mask_missing_values = _get_mask(X, self.missing_values) - if self.initial_imputer_ is None: - self.initial_imputer_ = SimpleImputer( - missing_values=self.missing_values, - strategy=self.initial_strategy) - X_filled = self.initial_imputer_.fit_transform(X) - else: - X_filled = self.initial_imputer_.transform(X) - - valid_mask = np.flatnonzero(np.logical_not( - np.isnan(self.initial_imputer_.statistics_))) - Xt = X[:, valid_mask] - mask_missing_values = mask_missing_values[:, valid_mask] - - return Xt, X_filled, mask_missing_values - - def fit_transform(self, X, y=None): - """Fits the imputer on X and return the transformed X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - y : ignored. - - Returns - ------- - Xt : array-like, shape (n_samples, n_features) - The imputed input data. - """ - self.random_state_ = getattr(self, "random_state_", - check_random_state(self.random_state)) - - if self.max_iter < 0: - raise ValueError( - "'max_iter' should be a positive integer. Got {} instead." - .format(self.max_iter)) - - if self.tol < 0: - raise ValueError( - "'tol' should be a non-negative float. Got {} instead." - .format(self.tol) - ) - - if self.add_indicator: - self.indicator_ = MissingIndicator( - missing_values=self.missing_values) - X_trans_indicator = self.indicator_.fit_transform(X) - else: - self.indicator_ = None - - if self.estimator is None: - from .linear_model import BayesianRidge - self._estimator = BayesianRidge() - else: - self._estimator = clone(self.estimator) - - self.imputation_sequence_ = [] - - if hasattr(self._estimator, 'random_state'): - self._estimator.random_state = self.random_state_ - - self._min_value = -np.inf if self.min_value is None else self.min_value - self._max_value = np.inf if self.max_value is None else self.max_value - - self.initial_imputer_ = None - X, Xt, mask_missing_values = self._initial_imputation(X) - - if self.max_iter == 0 or np.all(mask_missing_values): - self.n_iter_ = 0 - return Xt - - # order in which to impute - # note this is probably too slow for large feature data (d > 100000) - # and a better way would be good. 
- # see: https://goo.gl/KyCNwj and subsequent comments - ordered_idx = self._get_ordered_idx(mask_missing_values) - self.n_features_with_missing_ = len(ordered_idx) - - abs_corr_mat = self._get_abs_corr_mat(Xt) - - n_samples, n_features = Xt.shape - if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" - % (X.shape,)) - start_t = time() - if not self.sample_posterior: - Xt_previous = Xt.copy() - normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) - for self.n_iter_ in range(1, self.max_iter + 1): - if self.imputation_order == 'random': - ordered_idx = self._get_ordered_idx(mask_missing_values) - - for feat_idx in ordered_idx: - neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, - feat_idx, - abs_corr_mat) - Xt, estimator = self._impute_one_feature( - Xt, mask_missing_values, feat_idx, neighbor_feat_idx, - estimator=None, fit_mode=True) - estimator_triplet = ImputerTriplet(feat_idx, - neighbor_feat_idx, - estimator) - self.imputation_sequence_.append(estimator_triplet) - - if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' - '%d/%d, elapsed time %0.2f' - % (self.n_iter_, self.max_iter, time() - start_t)) - - if not self.sample_posterior: - inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, - axis=None) - if inf_norm < normalized_tol: - if self.verbose > 0: - print('[IterativeImputer] Early stopping criterion ' - 'reached.') - break - Xt_previous = Xt.copy() - else: - if not self.sample_posterior: - warnings.warn("[IterativeImputer] Early stopping criterion not" - " reached.", ConvergenceWarning) - Xt[~mask_missing_values] = X[~mask_missing_values] - - if self.add_indicator: - Xt = np.hstack((Xt, X_trans_indicator)) - return Xt - - def transform(self, X): - """Imputes all missing values in X. - - Note that this is stochastic, and that if random_state is not fixed, - repeated calls, or permuted input, will yield different results. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - The input data to complete. - - Returns - ------- - Xt : array-like, shape (n_samples, n_features) - The imputed input data. - """ - check_is_fitted(self, 'initial_imputer_') - - if self.add_indicator: - X_trans_indicator = self.indicator_.transform(X) - - X, Xt, mask_missing_values = self._initial_imputation(X) - - if self.n_iter_ == 0 or np.all(mask_missing_values): - return Xt - - imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ - i_rnd = 0 - if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" - % (X.shape,)) - start_t = time() - for it, estimator_triplet in enumerate(self.imputation_sequence_): - Xt, _ = self._impute_one_feature( - Xt, - mask_missing_values, - estimator_triplet.feat_idx, - estimator_triplet.neighbor_feat_idx, - estimator=estimator_triplet.estimator, - fit_mode=False - ) - if not (it + 1) % imputations_per_round: - if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' - '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter_, time() - start_t)) - i_rnd += 1 - - Xt[~mask_missing_values] = X[~mask_missing_values] - - if self.add_indicator: - Xt = np.hstack((Xt, X_trans_indicator)) - return Xt - - def fit(self, X, y=None): - """Fits the imputer on X and return self. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - y : ignored - - Returns - ------- - self : object - Returns self. 
- """ - self.fit_transform(X) - return self - - def _more_tags(self): - return {'allow_nan': True} - - -class MissingIndicator(BaseEstimator, TransformerMixin): - """Binary indicators for missing values. - - Note that this component typically should not be used in a vanilla - :class:`Pipeline` consisting of transformers and a classifier, but rather - could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - missing_values : number, string, np.nan (default) or None - The placeholder for the missing values. All occurrences of - `missing_values` will be indicated (True in the output array), the - other values will be marked as False. - - features : str, optional - Whether the imputer mask should represent all or a subset of - features. - - - If "missing-only" (default), the imputer mask will only represent - features containing missing values during fit time. - - If "all", the imputer mask will represent all features. - - sparse : boolean or "auto", optional - Whether the imputer mask format should be sparse or dense. - - - If "auto" (default), the imputer mask will be of same type as - input. - - If True, the imputer mask will be a sparse matrix. - - If False, the imputer mask will be a numpy array. - - error_on_new : boolean, optional - If True (default), transform will raise an error when there are - features with missing values in transform that have no missing values - in fit. This is applicable only when ``features="missing-only"``. - - Attributes - ---------- - features_ : ndarray, shape (n_missing_features,) or (n_features,) - The features indices which will be returned when calling ``transform``. - They are computed during ``fit``. For ``features='all'``, it is - to ``range(n_features)``. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import MissingIndicator - >>> X1 = np.array([[np.nan, 1, 3], - ... [4, 0, np.nan], - ... [8, 1, 0]]) - >>> X2 = np.array([[5, 1, np.nan], - ... [np.nan, 2, 3], - ... [2, 4, 0]]) - >>> indicator = MissingIndicator() - >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE - MissingIndicator(error_on_new=True, features='missing-only', - missing_values=nan, sparse='auto') - >>> X2_tr = indicator.transform(X2) - >>> X2_tr - array([[False, True], - [ True, False], - [False, False]]) - - """ - - def __init__(self, missing_values=np.nan, features="missing-only", - sparse="auto", error_on_new=True): - self.missing_values = missing_values - self.features = features - self.sparse = sparse - self.error_on_new = error_on_new - - def _get_missing_features_info(self, X): - """Compute the imputer mask and the indices of the features - containing missing values. - - Parameters - ---------- - X : {ndarray or sparse matrix}, shape (n_samples, n_features) - The input data with missing values. Note that ``X`` has been - checked in ``fit`` and ``transform`` before to call this function. - - Returns - ------- - imputer_mask : {ndarray or sparse matrix}, shape \ -(n_samples, n_features) or (n_samples, n_features_with_missing) - The imputer mask of the original data. - - features_with_missing : ndarray, shape (n_features_with_missing) - The features containing missing values. - - """ - if sparse.issparse(X): - mask = _get_mask(X.data, self.missing_values) - - # The imputer mask will be constructed with the same sparse format - # as X. 
- sparse_constructor = (sparse.csr_matrix if X.format == 'csr' - else sparse.csc_matrix) - imputer_mask = sparse_constructor( - (mask, X.indices.copy(), X.indptr.copy()), - shape=X.shape, dtype=bool) - imputer_mask.eliminate_zeros() - - if self.features == 'missing-only': - n_missing = imputer_mask.getnnz(axis=0) - - if self.sparse is False: - imputer_mask = imputer_mask.toarray() - elif imputer_mask.format == 'csr': - imputer_mask = imputer_mask.tocsc() - else: - imputer_mask = _get_mask(X, self.missing_values) - - if self.features == 'missing-only': - n_missing = imputer_mask.sum(axis=0) - - if self.sparse is True: - imputer_mask = sparse.csc_matrix(imputer_mask) - - if self.features == 'all': - features_indices = np.arange(X.shape[1]) - else: - features_indices = np.flatnonzero(n_missing) - - return imputer_mask, features_indices - - def _validate_input(self, X): - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) - if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("MissingIndicator does not support data with " - "dtype {0}. Please provide either a numeric array" - " (with a floating point or integer dtype) or " - "categorical data represented either as an array " - "with integer dtype or an array of string values " - "with an object dtype.".format(X.dtype)) - - if sparse.issparse(X) and self.missing_values == 0: - # missing_values = 0 not allowed with sparse data as it would - # force densification - raise ValueError("Sparse input with missing_values=0 is " - "not supported. Provide a dense " - "array instead.") - - return X - - def fit(self, X, y=None): - """Fit the transformer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : object - Returns self. - """ - X = self._validate_input(X) - self._n_features = X.shape[1] - - if self.features not in ('missing-only', 'all'): - raise ValueError("'features' has to be either 'missing-only' or " - "'all'. Got {} instead.".format(self.features)) - - if not ((isinstance(self.sparse, str) and - self.sparse == "auto") or isinstance(self.sparse, bool)): - raise ValueError("'sparse' has to be a boolean or 'auto'. " - "Got {!r} instead.".format(self.sparse)) - - self.features_ = self._get_missing_features_info(X)[1] - - return self - - def transform(self, X): - """Generate missing values indicator for X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - - Returns - ------- - Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. The data type of ``Xt`` - will be boolean. 
- - """ - check_is_fitted(self, "features_") - X = self._validate_input(X) - - if X.shape[1] != self._n_features: - raise ValueError("X has a different number of features " - "than during fitting.") - - imputer_mask, features = self._get_missing_features_info(X) - - if self.features == "missing-only": - features_diff_fit_trans = np.setdiff1d(features, self.features_) - if (self.error_on_new and features_diff_fit_trans.size > 0): - raise ValueError("The features {} have missing values " - "in transform but have no missing values " - "in fit.".format(features_diff_fit_trans)) - - if self.features_.size < self._n_features: - imputer_mask = imputer_mask[:, self.features_] - - return imputer_mask - - def fit_transform(self, X, y=None): - """Generate missing values indicator for X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - - Returns - ------- - Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. The data type of ``Xt`` - will be boolean. - - """ - return self.fit(X, y).transform(X) - - def _more_tags(self): - return {'allow_nan': True, - 'X_types': ['2darray', 'str']} diff --git a/sklearn/impute/__init__.py b/sklearn/impute/__init__.py new file mode 100644 index 0000000000000..abeb4d471f5f3 --- /dev/null +++ b/sklearn/impute/__init__.py @@ -0,0 +1,8 @@ +"""Transformers for missing value imputation""" + +from ._base import MissingIndicator, SimpleImputer + +__all__ = [ + 'MissingIndicator', + 'SimpleImputer', +] diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py new file mode 100644 index 0000000000000..7be9da691ce11 --- /dev/null +++ b/sklearn/impute/_base.py @@ -0,0 +1,675 @@ +# Authors: Nicolas Tresegnie +# Sergey Feldman +# License: BSD 3 clause + +from __future__ import division + +import warnings +import numbers + +import numpy as np +import numpy.ma as ma +from scipy import sparse +from scipy import stats + +from ..base import BaseEstimator, TransformerMixin +from ..utils.sparsefuncs import _get_median +from ..utils.validation import check_is_fitted +from ..utils.validation import FLOAT_DTYPES +from ..utils.fixes import _object_dtype_isnan +from ..utils import is_scalar_nan +from ..utils import check_array + + +def _check_inputs_dtype(X, missing_values): + if (X.dtype.kind in ("f", "i", "u") and + not isinstance(missing_values, numbers.Real)): + raise ValueError("'X' and 'missing_values' types are expected to be" + " both numerical. Got X.dtype={} and " + " type(missing_values)={}." + .format(X.dtype, type(missing_values))) + + +def _get_mask(X, value_to_mask): + """Compute the boolean mask X == missing_values.""" + if is_scalar_nan(value_to_mask): + if X.dtype.kind == "f": + return np.isnan(X) + elif X.dtype.kind in ("i", "u"): + # can't have NaNs in integer array. + return np.zeros(X.shape, dtype=bool) + else: + # np.isnan does not work on object dtypes. 
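+            # _object_dtype_isnan falls back to an elementwise comparison,
+            # exploiting the fact that NaN is the only value where x != x.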
+ return _object_dtype_isnan(X) + else: + # X == value_to_mask with object dytpes does not always perform + # element-wise for old versions of numpy + return np.equal(X, value_to_mask) + + +def _most_frequent(array, extra_value, n_repeat): + """Compute the most frequent value in a 1d array extended with + [extra_value] * n_repeat, where extra_value is assumed to be not part + of the array.""" + # Compute the most frequent value in array only + if array.size > 0: + with warnings.catch_warnings(): + # stats.mode raises a warning when input array contains objects due + # to incapacity to detect NaNs. Irrelevant here since input array + # has already been NaN-masked. + warnings.simplefilter("ignore", RuntimeWarning) + mode = stats.mode(array) + + most_frequent_value = mode[0][0] + most_frequent_count = mode[1][0] + else: + most_frequent_value = 0 + most_frequent_count = 0 + + # Compare to array + [extra_value] * n_repeat + if most_frequent_count == 0 and n_repeat == 0: + return np.nan + elif most_frequent_count < n_repeat: + return extra_value + elif most_frequent_count > n_repeat: + return most_frequent_value + elif most_frequent_count == n_repeat: + # Ties the breaks. Copy the behaviour of scipy.stats.mode + if most_frequent_value < extra_value: + return most_frequent_value + else: + return extra_value + + +class SimpleImputer(BaseEstimator, TransformerMixin): + """Imputation transformer for completing missing values. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + missing_values : number, string, np.nan (default) or None + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. + + strategy : string, optional (default="mean") + The imputation strategy. + + - If "mean", then replace missing values using the mean along + each column. Can only be used with numeric data. + - If "median", then replace missing values using the median along + each column. Can only be used with numeric data. + - If "most_frequent", then replace missing using the most frequent + value along each column. Can be used with strings or numeric data. + - If "constant", then replace missing values with fill_value. Can be + used with strings or numeric data. + + .. versionadded:: 0.20 + strategy="constant" for fixed value imputation. + + fill_value : string or numerical value, optional (default=None) + When strategy == "constant", fill_value is used to replace all + occurrences of missing_values. + If left to the default, fill_value will be 0 when imputing numerical + data and "missing_value" for strings or object data types. + + verbose : integer, optional (default=0) + Controls the verbosity of the imputer. + + copy : boolean, optional (default=True) + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. Note that, in the following cases, + a new copy will always be made, even if `copy=False`: + + - If X is not an array of floating values; + - If X is encoded as a CSR matrix; + - If add_indicator=True. + + add_indicator : boolean, optional (default=False) + If True, a `MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + Attributes + ---------- + statistics_ : array of shape (n_features,) + The imputation fill value for each feature. 
+ + indicator_ : :class:`sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + ``None`` if add_indicator is False. + + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import SimpleImputer + >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + ... # doctest: +NORMALIZE_WHITESPACE + SimpleImputer(add_indicator=False, copy=True, fill_value=None, + missing_values=nan, strategy='mean', verbose=0) + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> print(imp_mean.transform(X)) + ... # doctest: +NORMALIZE_WHITESPACE + [[ 7. 2. 3. ] + [ 4. 3.5 6. ] + [10. 3.5 9. ]] + + Notes + ----- + Columns which only contained missing values at `fit` are discarded upon + `transform` if strategy is not "constant". + + """ + def __init__(self, missing_values=np.nan, strategy="mean", + fill_value=None, verbose=0, copy=True, add_indicator=False): + self.missing_values = missing_values + self.strategy = strategy + self.fill_value = fill_value + self.verbose = verbose + self.copy = copy + self.add_indicator = add_indicator + + def _validate_input(self, X): + allowed_strategies = ["mean", "median", "most_frequent", "constant"] + if self.strategy not in allowed_strategies: + raise ValueError("Can only use these strategies: {0} " + " got strategy={1}".format(allowed_strategies, + self.strategy)) + + if self.strategy in ("most_frequent", "constant"): + dtype = None + else: + dtype = FLOAT_DTYPES + + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" + + try: + X = check_array(X, accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, copy=self.copy) + except ValueError as ve: + if "could not convert" in str(ve): + raise ValueError("Cannot use {0} strategy with non-numeric " + "data. Received datatype :{1}." + "".format(self.strategy, X.dtype.kind)) + else: + raise ve + + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError("SimpleImputer does not support data with dtype " + "{0}. Please provide either a numeric array (with" + " a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype)) + + return X + + def fit(self, X, y=None): + """Fit the imputer on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : SimpleImputer + """ + X = self._validate_input(X) + + # default fill_value is 0 for numerical input and "missing_value" + # otherwise + if self.fill_value is None: + if X.dtype.kind in ("i", "u", "f"): + fill_value = 0 + else: + fill_value = "missing_value" + else: + fill_value = self.fill_value + + # fill_value should be numerical in case of numerical input + if (self.strategy == "constant" and + X.dtype.kind in ("i", "u", "f") and + not isinstance(fill_value, numbers.Real)): + raise ValueError("'fill_value'={0} is invalid. 
Expected a " + "numerical value when imputing numerical " + "data".format(fill_value)) + + if sparse.issparse(X): + # missing_values = 0 not allowed with sparse data as it would + # force densification + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead.") + else: + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + fill_value) + else: + self.statistics_ = self._dense_fit(X, + self.strategy, + self.missing_values, + fill_value) + + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values) + self.indicator_.fit(X) + else: + self.indicator_ = None + + return self + + def _sparse_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on sparse data.""" + mask_data = _get_mask(X.data, missing_values) + n_implicit_zeros = X.shape[0] - np.diff(X.indptr) + + statistics = np.empty(X.shape[1]) + + if strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + statistics.fill(fill_value) + else: + for i in range(X.shape[1]): + column = X.data[X.indptr[i]:X.indptr[i + 1]] + mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] + column = column[~mask_column] + + # combine explicit and implicit zeros + mask_zeros = _get_mask(column, 0) + column = column[~mask_zeros] + n_explicit_zeros = mask_zeros.sum() + n_zeros = n_implicit_zeros[i] + n_explicit_zeros + + if strategy == "mean": + s = column.size + n_zeros + statistics[i] = np.nan if s == 0 else column.sum() / s + + elif strategy == "median": + statistics[i] = _get_median(column, + n_zeros) + + elif strategy == "most_frequent": + statistics[i] = _most_frequent(column, + 0, + n_zeros) + return statistics + + def _dense_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on dense data.""" + mask = _get_mask(X, missing_values) + masked_X = ma.masked_array(X, mask=mask) + + # Mean + if strategy == "mean": + mean_masked = np.ma.mean(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + mean = np.ma.getdata(mean_masked) + mean[np.ma.getmask(mean_masked)] = np.nan + + return mean + + # Median + elif strategy == "median": + median_masked = np.ma.median(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + median = np.ma.getdata(median_masked) + median[np.ma.getmaskarray(median_masked)] = np.nan + + return median + + # Most frequent + elif strategy == "most_frequent": + # scipy.stats.mstats.mode cannot be used because it will no work + # properly if the first element is masked and if its frequency + # is equal to the frequency of the most frequent valid element + # See https://github.com/scipy/scipy/issues/2636 + + # To be able access the elements by columns + X = X.transpose() + mask = mask.transpose() + + if X.dtype.kind == "O": + most_frequent = np.empty(X.shape[0], dtype=object) + else: + most_frequent = np.empty(X.shape[0]) + + for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): + row_mask = np.logical_not(row_mask).astype(np.bool) + row = row[row_mask] + most_frequent[i] = _most_frequent(row, np.nan, 0) + + return most_frequent + + # Constant + elif strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + return np.full(X.shape[1], fill_value, dtype=X.dtype) + + def transform(self, X): + """Impute all missing values in X. 
+
+    def transform(self, X):
+        """Impute all missing values in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        X : {ndarray, sparse matrix}
+            The input data with missing values imputed.
+        """
+        check_is_fitted(self, 'statistics_')
+
+        X = self._validate_input(X)
+
+        statistics = self.statistics_
+
+        if X.shape[1] != statistics.shape[0]:
+            raise ValueError("X has %d features per sample, expected %d"
+                             % (X.shape[1], self.statistics_.shape[0]))
+
+        if self.add_indicator:
+            X_trans_indicator = self.indicator_.transform(X)
+
+        # Delete the invalid columns if strategy is not constant
+        if self.strategy == "constant":
+            valid_statistics = statistics
+        else:
+            # same as np.isnan but also works for object dtypes
+            invalid_mask = _get_mask(statistics, np.nan)
+            valid_mask = np.logical_not(invalid_mask)
+            valid_statistics = statistics[valid_mask]
+            valid_statistics_indexes = np.flatnonzero(valid_mask)
+
+            if invalid_mask.any():
+                missing = np.arange(X.shape[1])[invalid_mask]
+                if self.verbose:
+                    warnings.warn("Deleting features without "
+                                  "observed values: %s" % missing)
+                X = X[:, valid_statistics_indexes]
+
+        # Do actual imputation
+        if sparse.issparse(X):
+            if self.missing_values == 0:
+                raise ValueError("Imputation not possible when missing_values "
+                                 "== 0 and input is sparse. Provide a dense "
+                                 "array instead.")
+            else:
+                mask = _get_mask(X.data, self.missing_values)
+                indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),
+                                    np.diff(X.indptr))[mask]
+
+                X.data[mask] = valid_statistics[indexes].astype(X.dtype,
+                                                                copy=False)
+        else:
+            mask = _get_mask(X, self.missing_values)
+            n_missing = np.sum(mask, axis=0)
+            values = np.repeat(valid_statistics, n_missing)
+            coordinates = np.where(mask.transpose())[::-1]
+
+            X[coordinates] = values
+
+        if self.add_indicator:
+            hstack = sparse.hstack if sparse.issparse(X) else np.hstack
+            X = hstack((X, X_trans_indicator))
+
+        return X
+
+    def _more_tags(self):
+        return {'allow_nan': True}
+
+
+class MissingIndicator(BaseEstimator, TransformerMixin):
+    """Binary indicators for missing values.
+
+    Note that this component typically should not be used in a vanilla
+    :class:`Pipeline` consisting of transformers and a classifier, but rather
+    could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.
+
+    Read more in the :ref:`User Guide <impute>`.
+
+    Parameters
+    ----------
+    missing_values : number, string, np.nan (default) or None
+        The placeholder for the missing values. All occurrences of
+        `missing_values` will be indicated (True in the output array), the
+        other values will be marked as False.
+
+    features : str, optional
+        Whether the imputer mask should represent all or a subset of
+        features.
+
+        - If "missing-only" (default), the imputer mask will only represent
+          features containing missing values during fit time.
+        - If "all", the imputer mask will represent all features.
+
+    sparse : boolean or "auto", optional
+        Whether the imputer mask format should be sparse or dense.
+
+        - If "auto" (default), the imputer mask will be of same type as
+          input.
+        - If True, the imputer mask will be a sparse matrix.
+        - If False, the imputer mask will be a numpy array.
+
+    error_on_new : boolean, optional
+        If True (default), transform will raise an error when there are
+        features with missing values in transform that have no missing values
+        in fit. This is applicable only when ``features="missing-only"``.
+
+    Attributes
+    ----------
+    features_ : ndarray, shape (n_missing_features,) or (n_features,)
+        The indices of the features which will be returned when calling
+        ``transform``. They are computed during ``fit``. For
+        ``features='all'``, it is equal to ``range(n_features)``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.impute import MissingIndicator
+    >>> X1 = np.array([[np.nan, 1, 3],
+    ...                [4, 0, np.nan],
+    ...                [8, 1, 0]])
+    >>> X2 = np.array([[5, 1, np.nan],
+    ...                [np.nan, 2, 3],
+    ...                [2, 4, 0]])
+    >>> indicator = MissingIndicator()
+    >>> indicator.fit(X1)  # doctest: +NORMALIZE_WHITESPACE
+    MissingIndicator(error_on_new=True, features='missing-only',
+                     missing_values=nan, sparse='auto')
+    >>> X2_tr = indicator.transform(X2)
+    >>> X2_tr
+    array([[False,  True],
+           [ True, False],
+           [False, False]])
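+
+    The mask can also cover every feature (a minimal illustrative example):
+
+    >>> indicator_all = MissingIndicator(features="all")
+    >>> indicator_all.fit_transform(X2)
+    array([[False, False,  True],
+           [ True, False, False],
+           [False, False, False]])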
+
+    """
+
+    def __init__(self, missing_values=np.nan, features="missing-only",
+                 sparse="auto", error_on_new=True):
+        self.missing_values = missing_values
+        self.features = features
+        self.sparse = sparse
+        self.error_on_new = error_on_new
+
+    def _get_missing_features_info(self, X):
+        """Compute the imputer mask and the indices of the features
+        containing missing values.
+
+        Parameters
+        ----------
+        X : {ndarray or sparse matrix}, shape (n_samples, n_features)
+            The input data with missing values. Note that ``X`` has been
+            checked in ``fit`` and ``transform`` before calling this function.
+
+        Returns
+        -------
+        imputer_mask : {ndarray or sparse matrix}, shape \
+(n_samples, n_features) or (n_samples, n_features_with_missing)
+            The imputer mask of the original data.
+
+        features_with_missing : ndarray, shape (n_features_with_missing,)
+            The features containing missing values.
+
+        """
+        if sparse.issparse(X):
+            mask = _get_mask(X.data, self.missing_values)
+
+            # The imputer mask will be constructed with the same sparse format
+            # as X.
+            sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
+                                  else sparse.csc_matrix)
+            imputer_mask = sparse_constructor(
+                (mask, X.indices.copy(), X.indptr.copy()),
+                shape=X.shape, dtype=bool)
+            imputer_mask.eliminate_zeros()
+
+            if self.features == 'missing-only':
+                n_missing = imputer_mask.getnnz(axis=0)
+
+            if self.sparse is False:
+                imputer_mask = imputer_mask.toarray()
+            elif imputer_mask.format == 'csr':
+                imputer_mask = imputer_mask.tocsc()
+        else:
+            imputer_mask = _get_mask(X, self.missing_values)
+
+            if self.features == 'missing-only':
+                n_missing = imputer_mask.sum(axis=0)
+
+            if self.sparse is True:
+                imputer_mask = sparse.csc_matrix(imputer_mask)
+
+        if self.features == 'all':
+            features_indices = np.arange(X.shape[1])
+        else:
+            features_indices = np.flatnonzero(n_missing)
+
+        return imputer_mask, features_indices
+
+    def _validate_input(self, X):
+        if not is_scalar_nan(self.missing_values):
+            force_all_finite = True
+        else:
+            force_all_finite = "allow-nan"
+        X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,
+                        force_all_finite=force_all_finite)
+        _check_inputs_dtype(X, self.missing_values)
+        if X.dtype.kind not in ("i", "u", "f", "O"):
+            raise ValueError("MissingIndicator does not support data with "
+                             "dtype {0}. Please provide either a numeric array"
+                             " (with a floating point or integer dtype) or "
+                             "categorical data represented either as an array "
+                             "with integer dtype or an array of string values "
+                             "with an object dtype.".format(X.dtype))
+
+        if sparse.issparse(X) and self.missing_values == 0:
+            # missing_values = 0 not allowed with sparse data as it would
+            # force densification
+            raise ValueError("Sparse input with missing_values=0 is "
+                             "not supported. Provide a dense "
+                             "array instead.")
+
+        return X
+
+    def fit(self, X, y=None):
+        """Fit the transformer on X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Input data, where ``n_samples`` is the number of samples and
+            ``n_features`` is the number of features.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        X = self._validate_input(X)
+        self._n_features = X.shape[1]
+
+        if self.features not in ('missing-only', 'all'):
+            raise ValueError("'features' has to be either 'missing-only' or "
+                             "'all'. Got {} instead.".format(self.features))
+
+        if not ((isinstance(self.sparse, str) and
+                self.sparse == "auto") or isinstance(self.sparse, bool)):
+            raise ValueError("'sparse' has to be a boolean or 'auto'. "
+                             "Got {!r} instead.".format(self.sparse))
+
+        self.features_ = self._get_missing_features_info(X)[1]
+
+        return self
+
+    def transform(self, X):
+        """Generate missing values indicator for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
+or (n_samples, n_features_with_missing)
+            The missing indicator for input data. The data type of ``Xt``
+            will be boolean.
+
+        """
+        check_is_fitted(self, "features_")
+        X = self._validate_input(X)
+
+        if X.shape[1] != self._n_features:
+            raise ValueError("X has a different number of features "
+                             "than during fitting.")
+
+        imputer_mask, features = self._get_missing_features_info(X)
+
+        if self.features == "missing-only":
+            features_diff_fit_trans = np.setdiff1d(features, self.features_)
+            if (self.error_on_new and features_diff_fit_trans.size > 0):
+                raise ValueError("The features {} have missing values "
+                                 "in transform but have no missing values "
+                                 "in fit.".format(features_diff_fit_trans))
+
+            if self.features_.size < self._n_features:
+                imputer_mask = imputer_mask[:, self.features_]
+
+        return imputer_mask
+
+    def fit_transform(self, X, y=None):
+        """Generate missing values indicator for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
+or (n_samples, n_features_with_missing)
+            The missing indicator for input data. The data type of ``Xt``
+            will be boolean.
+
+        """
+        return self.fit(X, y).transform(X)
+
+    def _more_tags(self):
+        return {'allow_nan': True,
+                'X_types': ['2darray', 'str']}
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py
new file mode 100644
index 0000000000000..40df3f4059c04
--- /dev/null
+++ b/sklearn/impute/_iterative.py
@@ -0,0 +1,680 @@
+
+from time import time
+from distutils.version import LooseVersion
+from collections import namedtuple
+import warnings
+
+import scipy
+from scipy import stats
+import numpy as np
+
+from ..base import clone, BaseEstimator, TransformerMixin
+from ..exceptions import ConvergenceWarning
+from ..preprocessing import normalize
+from ..utils import check_array, check_random_state, safe_indexing
+from ..utils.validation import FLOAT_DTYPES, check_is_fitted
+from ..utils import is_scalar_nan
+
+from ._base import (_get_mask, MissingIndicator, SimpleImputer,
+                    _check_inputs_dtype)
+
+
+_ImputerTriplet = namedtuple('_ImputerTriplet', ['feat_idx',
+                                                 'neighbor_feat_idx',
+                                                 'estimator'])
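+
+
+# Note (illustrative): each fitted round-robin step is recorded as e.g.
+# _ImputerTriplet(feat_idx=3, neighbor_feat_idx=array([0, 1, 2, 4]),
+#                 estimator=BayesianRidge()), so that transform() can replay
+# the fitted estimators in order without refitting.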
+
+
+class IterativeImputer(BaseEstimator, TransformerMixin):
+    """Multivariate imputer that estimates each feature from all the others.
+
+    A strategy for imputing missing values by modeling each feature with
+    missing values as a function of other features in a round-robin fashion.
+
+    Read more in the :ref:`User Guide <iterative_imputer>`.
+
+    .. note::
+
+       This estimator is still **experimental** for now: the predictions
+       and the API might change without any deprecation cycle. To use it,
+       you need to explicitly import ``enable_iterative_imputer``::
+
+           >>> # explicitly require this experimental feature
+           >>> from sklearn.experimental import enable_iterative_imputer  # noqa
+           >>> # now you can import normally from sklearn.impute
+           >>> from sklearn.impute import IterativeImputer
+
+    Parameters
+    ----------
+    estimator : estimator object, default=BayesianRidge()
+        The estimator to use at each step of the round-robin imputation.
+        If ``sample_posterior`` is True, the estimator must support
+        ``return_std`` in its ``predict`` method.
+
+    missing_values : int, np.nan, optional (default=np.nan)
+        The placeholder for the missing values. All occurrences of
+        ``missing_values`` will be imputed.
+
+    sample_posterior : boolean, default=False
+        Whether to sample from the (Gaussian) predictive posterior of the
+        fitted estimator for each imputation. Estimator must support
+        ``return_std`` in its ``predict`` method if set to ``True``. Set to
+        ``True`` if using ``IterativeImputer`` for multiple imputations.
+
+    max_iter : int, optional (default=10)
+        Maximum number of imputation rounds to perform before returning the
+        imputations computed during the final round. A round is a single
+        imputation of each feature with missing values. The stopping criterion
+        is met once ``abs(max(X_t - X_{t-1}))/abs(max(X[known_vals])) < tol``,
+        where ``X_t`` is ``X`` at iteration ``t``. Note that early stopping is
+        only applied if ``sample_posterior=False``.
+
+    tol : float, optional (default=1e-3)
+        Tolerance of the stopping condition.
+
+    n_nearest_features : int, optional (default=None)
+        Number of other features to use to estimate the missing values of
+        each feature column. Nearness between features is measured using
+        the absolute correlation coefficient between each feature pair (after
+        initial imputation). To ensure coverage of features throughout the
+        imputation process, the neighbor features are not necessarily nearest,
+        but are drawn with probability proportional to correlation for each
+        imputed target feature. Can provide significant speed-up when the
+        number of features is huge. If ``None``, all features will be used.
+
+    initial_strategy : str, optional (default="mean")
+        Which strategy to use to initialize the missing values. Same as the
+        ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer`.
+        Valid values: {"mean", "median", "most_frequent", or "constant"}.
+
+    imputation_order : str, optional (default="ascending")
+        The order in which the features will be imputed. Possible values:
+
+        "ascending"
+            From features with fewest missing values to most.
+        "descending"
+            From features with most missing values to fewest.
+        "roman"
+            Left to right.
+        "arabic"
+            Right to left.
+        "random"
+            A random order for each round.
+
+    min_value : float, optional (default=None)
+        Minimum possible imputed value. Default of ``None`` will set minimum
+        to negative infinity.
+
+    max_value : float, optional (default=None)
+        Maximum possible imputed value. Default of ``None`` will set maximum
+        to positive infinity.
+
+    verbose : int, optional (default=0)
+        Verbosity flag, controls the debug messages that are issued
+        as functions are evaluated. The higher, the more verbose. Can be 0, 1,
+        or 2.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        The seed of the pseudo random number generator to use. Randomizes
+        selection of estimator features if n_nearest_features is not None, the
+        ``imputation_order`` if ``random``, and the sampling from posterior if
+        ``sample_posterior`` is True. Use an integer for determinism.
+        See :term:`the Glossary <random_state>`.
+
+    add_indicator : boolean, optional (default=False)
+        If True, a `MissingIndicator` transform will stack onto output
+        of the imputer's transform. This allows a predictive estimator
+        to account for missingness despite imputation. If a feature has no
+        missing values at fit/train time, the feature won't appear on
+        the missing indicator even if there are missing values at
+        transform/test time.
+
+    Attributes
+    ----------
+    initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer`
+        Imputer used to initialize the missing values.
+
+    imputation_sequence_ : list of tuples
+        Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where
+        ``feat_idx`` is the current feature to be imputed,
+        ``neighbor_feat_idx`` is the array of other features used to impute the
+        current feature, and ``estimator`` is the trained estimator used for
+        the imputation. Length is ``self.n_features_with_missing_ *
+        self.n_iter_``.
+
+    n_iter_ : int
+        Number of iteration rounds that occurred. Will be less than
+        ``self.max_iter`` if early stopping criterion was reached.
+
+    n_features_with_missing_ : int
+        Number of features with missing values.
+
+    indicator_ : :class:`sklearn.impute.MissingIndicator`
+        Indicator used to add binary indicators for missing values.
+        ``None`` if add_indicator is False.
+
+    See also
+    --------
+    SimpleImputer : Univariate imputation of missing values.
+
+    Notes
+    -----
+    To support imputation in inductive mode we store each feature's estimator
+    during the ``fit`` phase, and predict without refitting (in order) during
+    the ``transform`` phase.
+
+    Features which contain all missing values at ``fit`` are discarded upon
+    ``transform``.
+
+    Features with missing values during ``transform`` which did not have any
+    missing values during ``fit`` will be imputed with the initial imputation
+    method only.
+
+    References
+    ----------
+    .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice:
+        Multivariate Imputation by Chained Equations in R". Journal of
+        Statistical Software 45: 1-67.
+        `_
+
+    .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in
+        Multivariate Data Suitable for use with an Electronic Computer".
+        Journal of the Royal Statistical Society 22(2): 302-306.
+        `_
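+
+    Examples
+    --------
+    A minimal usage sketch; the imputed values depend on the estimator and
+    the data, so only the (deterministic) output shape is shown:
+
+    >>> import numpy as np
+    >>> # explicitly require this experimental feature
+    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
+    >>> from sklearn.impute import IterativeImputer
+    >>> X = [[1, 2], [3, np.nan], [np.nan, 6], [8, 8]]
+    >>> imp = IterativeImputer(max_iter=10, random_state=0)
+    >>> imp.fit_transform(X).shape
+    (4, 2)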
+    """
+
+    def __init__(self,
+                 estimator=None,
+                 missing_values=np.nan,
+                 sample_posterior=False,
+                 max_iter=10,
+                 tol=1e-3,
+                 n_nearest_features=None,
+                 initial_strategy="mean",
+                 imputation_order='ascending',
+                 min_value=None,
+                 max_value=None,
+                 verbose=0,
+                 random_state=None,
+                 add_indicator=False):
+
+        self.estimator = estimator
+        self.missing_values = missing_values
+        self.sample_posterior = sample_posterior
+        self.max_iter = max_iter
+        self.tol = tol
+        self.n_nearest_features = n_nearest_features
+        self.initial_strategy = initial_strategy
+        self.imputation_order = imputation_order
+        self.min_value = min_value
+        self.max_value = max_value
+        self.verbose = verbose
+        self.random_state = random_state
+        self.add_indicator = add_indicator
+
+    def _impute_one_feature(self,
+                            X_filled,
+                            mask_missing_values,
+                            feat_idx,
+                            neighbor_feat_idx,
+                            estimator=None,
+                            fit_mode=True):
+        """Impute a single feature from the others provided.
+
+        This function predicts the missing values of one of the features using
+        the current estimates of all the other features. The ``estimator``
+        must support ``return_std=True`` in its ``predict`` method if
+        ``sample_posterior`` is True.
+
+        Parameters
+        ----------
+        X_filled : ndarray
+            Input data with the most recent imputations.
+
+        mask_missing_values : ndarray
+            Input data's missing indicator matrix.
+
+        feat_idx : int
+            Index of the feature currently being imputed.
+
+        neighbor_feat_idx : ndarray
+            Indices of the features to be used in imputing ``feat_idx``.
+
+        estimator : object
+            The estimator to use at this step of the round-robin imputation.
+            If ``sample_posterior`` is True, the estimator must support
+            ``return_std`` in its ``predict`` method.
+            If None, it will be cloned from self._estimator.
+
+        fit_mode : boolean, default=True
+            Whether to fit and predict with the estimator or just predict.
+
+        Returns
+        -------
+        X_filled : ndarray
+            Input data with ``X_filled[missing_row_mask, feat_idx]`` updated.
+
+        estimator : estimator with sklearn API
+            The fitted estimator used to impute
+            ``X_filled[missing_row_mask, feat_idx]``.
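+
+        Notes
+        -----
+        Illustrative example: with ``sample_posterior=True``, a missing entry
+        whose predictive mean and standard deviation are ``mu=2.0`` and
+        ``sigma=0.5``, under ``min_value=0`` and ``max_value=None``, is drawn
+        from ``stats.truncnorm(a=(0 - 2.0) / 0.5, b=np.inf, loc=2.0,
+        scale=0.5)``.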
+ """ + + # if nothing is missing, just return the default + # (should not happen at fit time because feat_ids would be excluded) + missing_row_mask = mask_missing_values[:, feat_idx] + if not np.any(missing_row_mask): + return X_filled, estimator + + if estimator is None and fit_mode is False: + raise ValueError("If fit_mode is False, then an already-fitted " + "estimator should be passed in.") + + if estimator is None: + estimator = clone(self._estimator) + + if fit_mode: + X_train = safe_indexing(X_filled[:, neighbor_feat_idx], + ~missing_row_mask) + y_train = safe_indexing(X_filled[:, feat_idx], + ~missing_row_mask) + estimator.fit(X_train, y_train) + + # get posterior samples + X_test = safe_indexing(X_filled[:, neighbor_feat_idx], + missing_row_mask) + if self.sample_posterior: + mus, sigmas = estimator.predict(X_test, return_std=True) + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + # two types of problems: (1) non-positive sigmas, (2) mus outside + # legal range of min_value and max_value (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value + imputed_values[mus_too_low] = self._min_value + mus_too_high = mus > self._max_value + imputed_values[mus_too_high] = self._max_value + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] + a = (self._min_value - mus) / sigmas + b = (self._max_value - mus) / sigmas + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[inrange_mask] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[inrange_mask] = truncated_normal.rvs( + random_state=self.random_state_) + else: + imputed_values = estimator.predict(X_test) + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) + + # update the feature + X_filled[missing_row_mask, feat_idx] = imputed_values + return X_filled, estimator + + def _get_neighbor_feat_idx(self, + n_features, + feat_idx, + abs_corr_mat): + """Get a list of other features to predict ``feat_idx``. + + If self.n_nearest_features is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between ``feat_idx`` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in ``X``. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X``. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute ``feat_idx``. 
+ """ + if (self.n_nearest_features is not None and + self.n_nearest_features < n_features): + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, + p=p) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. + """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + missing_values_idx = np.nonzero(frac_of_missing_values)[0] + if self.imputation_order == 'roman': + ordered_idx = missing_values_idx + elif self.imputation_order == 'arabic': + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == 'ascending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:][::-1] + elif self.imputation_order == 'descending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:] + elif self.imputation_order == 'random': + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + else: + raise ValueError("Got an invalid imputation order: '{0}'. It must " + "be one of the following: 'roman', 'arabic', " + "'ascending', 'descending', or " + "'random'.".format(self.imputation_order)) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, optional (default=1e-6) + ``abs_corr_mat`` can have nans, which will be replaced + with ``tolerance``. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X`` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. + """ + n_features = X_filled.shape[1] + if (self.n_nearest_features is None or + self.n_nearest_features >= n_features): + return None + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X): + """Perform initial imputation for input X. 
+
+    def _initial_imputation(self, X):
+        """Perform initial imputation for input X.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_samples, n_features)
+            Input data, where "n_samples" is the number of samples and
+            "n_features" is the number of features.
+
+        Returns
+        -------
+        Xt : ndarray, shape (n_samples, n_features)
+            Input data restricted to the features that have at least one
+            observed value (features that were entirely missing are dropped).
+
+        X_filled : ndarray, shape (n_samples, n_features)
+            Input data with the most recent imputations.
+
+        mask_missing_values : ndarray, shape (n_samples, n_features)
+            Input data's missing indicator matrix, where "n_samples" is the
+            number of samples and "n_features" is the number of features.
+        """
+        if is_scalar_nan(self.missing_values):
+            force_all_finite = "allow-nan"
+        else:
+            force_all_finite = True
+
+        X = check_array(X, dtype=FLOAT_DTYPES, order="F",
+                        force_all_finite=force_all_finite)
+        _check_inputs_dtype(X, self.missing_values)
+
+        mask_missing_values = _get_mask(X, self.missing_values)
+        if self.initial_imputer_ is None:
+            self.initial_imputer_ = SimpleImputer(
+                missing_values=self.missing_values,
+                strategy=self.initial_strategy)
+            X_filled = self.initial_imputer_.fit_transform(X)
+        else:
+            X_filled = self.initial_imputer_.transform(X)
+
+        valid_mask = np.flatnonzero(np.logical_not(
+            np.isnan(self.initial_imputer_.statistics_)))
+        Xt = X[:, valid_mask]
+        mask_missing_values = mask_missing_values[:, valid_mask]
+
+        return Xt, X_filled, mask_missing_values
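+
+    # Note (illustrative) on the stopping criterion used in fit_transform
+    # below: with tol=1e-3 and np.max(np.abs(X[~mask_missing_values])) == 50,
+    # the round-robin loop stops once the largest absolute change between
+    # two successive imputations falls below 1e-3 * 50 == 0.05.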
+
+    def fit_transform(self, X, y=None):
+        """Fits the imputer on X and returns the transformed X.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Input data, where "n_samples" is the number of samples and
+            "n_features" is the number of features.
+
+        y : ignored.
+
+        Returns
+        -------
+        Xt : array-like, shape (n_samples, n_features)
+            The imputed input data.
+        """
+        self.random_state_ = getattr(self, "random_state_",
+                                     check_random_state(self.random_state))
+
+        if self.max_iter < 0:
+            raise ValueError(
+                "'max_iter' should be a non-negative integer. Got {} instead."
+                .format(self.max_iter))
+
+        if self.tol < 0:
+            raise ValueError(
+                "'tol' should be a non-negative float. Got {} instead."
+                .format(self.tol)
+            )
+
+        if self.add_indicator:
+            self.indicator_ = MissingIndicator(
+                missing_values=self.missing_values)
+            X_trans_indicator = self.indicator_.fit_transform(X)
+        else:
+            self.indicator_ = None
+
+        if self.estimator is None:
+            from ..linear_model import BayesianRidge
+            self._estimator = BayesianRidge()
+        else:
+            self._estimator = clone(self.estimator)
+
+        self.imputation_sequence_ = []
+
+        if hasattr(self._estimator, 'random_state'):
+            self._estimator.random_state = self.random_state_
+
+        self._min_value = -np.inf if self.min_value is None else self.min_value
+        self._max_value = np.inf if self.max_value is None else self.max_value
+
+        self.initial_imputer_ = None
+        X, Xt, mask_missing_values = self._initial_imputation(X)
+
+        if self.max_iter == 0 or np.all(mask_missing_values):
+            self.n_iter_ = 0
+            return Xt
+
+        # order in which to impute
+        # note this is probably too slow for large feature data (d > 100000)
+        # and a better way would be good.
+        # see: https://goo.gl/KyCNwj and subsequent comments
+        ordered_idx = self._get_ordered_idx(mask_missing_values)
+        self.n_features_with_missing_ = len(ordered_idx)
+
+        abs_corr_mat = self._get_abs_corr_mat(Xt)
+
+        n_samples, n_features = Xt.shape
+        if self.verbose > 0:
+            print("[IterativeImputer] Completing matrix with shape %s"
+                  % (X.shape,))
+        start_t = time()
+        if not self.sample_posterior:
+            Xt_previous = Xt.copy()
+            normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values]))
+        for self.n_iter_ in range(1, self.max_iter + 1):
+            if self.imputation_order == 'random':
+                ordered_idx = self._get_ordered_idx(mask_missing_values)
+
+            for feat_idx in ordered_idx:
+                neighbor_feat_idx = self._get_neighbor_feat_idx(n_features,
+                                                                feat_idx,
+                                                                abs_corr_mat)
+                Xt, estimator = self._impute_one_feature(
+                    Xt, mask_missing_values, feat_idx, neighbor_feat_idx,
+                    estimator=None, fit_mode=True)
+                estimator_triplet = _ImputerTriplet(feat_idx,
+                                                    neighbor_feat_idx,
+                                                    estimator)
+                self.imputation_sequence_.append(estimator_triplet)
+
+            if self.verbose > 1:
+                print('[IterativeImputer] Ending imputation round '
+                      '%d/%d, elapsed time %0.2f'
+                      % (self.n_iter_, self.max_iter, time() - start_t))
+
+            if not self.sample_posterior:
+                inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf,
+                                          axis=None)
+                if inf_norm < normalized_tol:
+                    if self.verbose > 0:
+                        print('[IterativeImputer] Early stopping criterion '
+                              'reached.')
+                    break
+                Xt_previous = Xt.copy()
+        else:
+            if not self.sample_posterior:
+                warnings.warn("[IterativeImputer] Early stopping criterion not"
+                              " reached.", ConvergenceWarning)
+        Xt[~mask_missing_values] = X[~mask_missing_values]
+
+        if self.add_indicator:
+            Xt = np.hstack((Xt, X_trans_indicator))
+        return Xt
+
+    def transform(self, X):
+        """Imputes all missing values in X.
+
+        Note that this is stochastic: if random_state is not fixed,
+        repeated calls or permuted input will yield different results.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        Xt : array-like, shape (n_samples, n_features)
+            The imputed input data.
+        """
+        check_is_fitted(self, 'initial_imputer_')
+
+        if self.add_indicator:
+            X_trans_indicator = self.indicator_.transform(X)
+
+        X, Xt, mask_missing_values = self._initial_imputation(X)
+
+        if self.n_iter_ == 0 or np.all(mask_missing_values):
+            return Xt
+
+        imputations_per_round = len(self.imputation_sequence_) // self.n_iter_
+        i_rnd = 0
+        if self.verbose > 0:
+            print("[IterativeImputer] Completing matrix with shape %s"
+                  % (X.shape,))
+        start_t = time()
+        for it, estimator_triplet in enumerate(self.imputation_sequence_):
+            Xt, _ = self._impute_one_feature(
+                Xt,
+                mask_missing_values,
+                estimator_triplet.feat_idx,
+                estimator_triplet.neighbor_feat_idx,
+                estimator=estimator_triplet.estimator,
+                fit_mode=False
+            )
+            if not (it + 1) % imputations_per_round:
+                if self.verbose > 1:
+                    print('[IterativeImputer] Ending imputation round '
+                          '%d/%d, elapsed time %0.2f'
+                          % (i_rnd + 1, self.n_iter_, time() - start_t))
+                i_rnd += 1
+
+        Xt[~mask_missing_values] = X[~mask_missing_values]
+
+        if self.add_indicator:
+            Xt = np.hstack((Xt, X_trans_indicator))
+        return Xt
+
+    def fit(self, X, y=None):
+        """Fits the imputer on X and returns self.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Input data, where "n_samples" is the number of samples and
+            "n_features" is the number of features.
+
+        y : ignored
+
+        Returns
+        -------
+        self : object
+            Returns self.
+ """ + self.fit_transform(X) + return self + + def _more_tags(self): + return {'allow_nan': True} diff --git a/sklearn/impute/tests/__init__.py b/sklearn/impute/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/tests/test_impute.py b/sklearn/impute/tests/test_impute.py similarity index 99% rename from sklearn/tests/test_impute.py rename to sklearn/impute/tests/test_impute.py index 979140ba246cf..1552031ff2193 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -13,6 +13,9 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal +# make IterativeImputer available +from sklearn.experimental import enable_iterative_imputer # noqa + from sklearn.impute import MissingIndicator from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor diff --git a/sklearn/setup.py b/sklearn/setup.py index e6f10cad77d9f..5a377043e9e38 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -33,6 +33,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('feature_selection/tests') config.add_subpackage('gaussian_process') config.add_subpackage('gaussian_process/tests') + config.add_subpackage('impute') + config.add_subpackage('impute/tests') config.add_subpackage('inspection') config.add_subpackage('inspection/tests') config.add_subpackage('mixture')