diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ae27828dd22a3..c31385dd3e48d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,6 +22,7 @@ jobs: SCIPY_VERSION: '0.17.0' CYTHON_VERSION: '*' PILLOW_VERSION: '4.0.0' + MATPLOTLIB_VERSION: '1.5.1' # later version of joblib are not packaged in conda for Python 3.5 JOBLIB_VERSION: '0.12.3' COVERAGE: 'true' diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index 97f5cb4f7e465..a53cd61b34828 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -11,7 +11,7 @@ IF "%PYTHON_ARCH%"=="64" ( call deactivate @rem Clean up any left-over from a previous build conda remove --all -q -y -n %VIRTUALENV% - conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython pytest wheel pillow joblib + conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython matplotlib pytest wheel pillow joblib call activate %VIRTUALENV% ) else ( diff --git a/doc/conf.py b/doc/conf.py index 27a6bf2ee30c2..c736adc8e267e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -263,9 +263,9 @@ 'sphx_glr_plot_compare_methods_001.png': 349} -# enable experimental module so that the new GBDTs estimators can be +# enable experimental module so that experimental estimators can be # discovered properly by sphinx -from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.experimental import * # noqa def make_carousel_thumbs(app, exception): diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 34a5f63919c44..69e7f0b2b480d 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -195,67 +195,67 @@ The preferred way to contribute to scikit-learn is to fork the `main repository `__ on GitHub, then submit a "pull request" (PR): - 1. `Create an account `_ on - GitHub if you do not already have one. +1. `Create an account `_ on + GitHub if you do not already have one. - 2. Fork the `project repository - `__: click on the 'Fork' - button near the top of the page. This creates a copy of the code under your - account on the GitHub user account. For more details on how to fork a - repository see `this guide `_. +2. Fork the `project repository + `__: click on the 'Fork' + button near the top of the page. This creates a copy of the code under your + account on the GitHub user account. For more details on how to fork a + repository see `this guide `_. - 3. Clone your fork of the scikit-learn repo from your GitHub account to your - local disk:: +3. Clone your fork of the scikit-learn repo from your GitHub account to your + local disk:: - $ git clone git@github.com:YourLogin/scikit-learn.git - $ cd scikit-learn + $ git clone git@github.com:YourLogin/scikit-learn.git + $ cd scikit-learn - 4. Install library in editable mode:: +4. Install library in editable mode:: - $ pip install --editable . + $ pip install --editable . - for more details about advanced installation, see the - :ref:`install_bleeding_edge` section. + for more details about advanced installation, see the + :ref:`install_bleeding_edge` section. - 5. Create a branch to hold your development changes:: +5. Create a branch to hold your development changes:: - $ git checkout -b my-feature + $ git checkout -b my-feature - and start making changes. Always use a ``feature`` branch. It's good practice to - never work on the ``master`` branch! + and start making changes. Always use a ``feature`` branch. It's good practice to + never work on the ``master`` branch! 
-.. note:: + .. note:: - In the above setup, your ``origin`` remote repository points to - ``YourLogin/scikit-learn.git``. If you wish to fetch/merge from the main - repository instead of your forked one, you will need to add another remote - to use instead of ``origin``. If we choose the name ``upstream`` for it, the - command will be:: + In the above setup, your ``origin`` remote repository points to + ``YourLogin/scikit-learn.git``. If you wish to fetch/merge from the main + repository instead of your forked one, you will need to add another remote + to use instead of ``origin``. If we choose the name ``upstream`` for it, the + command will be:: - $ git remote add upstream https://github.com/scikit-learn/scikit-learn.git + $ git remote add upstream https://github.com/scikit-learn/scikit-learn.git - And in order to fetch the new remote and base your work on the latest changes - of it you can:: + And in order to fetch the new remote and base your work on the latest changes + of it you can:: - $ git fetch upstream - $ git checkout -b my-feature upstream/master + $ git fetch upstream + $ git checkout -b my-feature upstream/master - 6. Develop the feature on your feature branch on your computer, using Git to do the - version control. When you're done editing, add changed files using ``git add`` - and then ``git commit`` files:: +6. Develop the feature on your feature branch on your computer, using Git to do the + version control. When you're done editing, add changed files using ``git add`` + and then ``git commit`` files:: - $ git add modified_files - $ git commit + $ git add modified_files + $ git commit - to record your changes in Git, then push the changes to your GitHub account with:: + to record your changes in Git, then push the changes to your GitHub account with:: - $ git push -u origin my-feature + $ git push -u origin my-feature - 7. Follow `these - `_ - instructions to create a pull request from your fork. This will send an - email to the committers. You may want to consider sending an email to the - mailing list for more visibility. +7. Follow `these + `_ + instructions to create a pull request from your fork. This will send an + email to the committers. You may want to consider sending an email to the + mailing list for more visibility. .. note:: @@ -626,7 +626,7 @@ reviewing pull requests, you may find :ref:`this tip .. _testing_coverage: Testing and improving test coverage ------------------------------------- +----------------------------------- High-quality `unit testing `_ is a corner-stone of the scikit-learn development process. For this @@ -641,22 +641,42 @@ the corresponding subpackages. We expect code coverage of new features to be at least around 90%. -.. note:: **Workflow to improve test coverage** +For guidelines on how to use ``pytest`` efficiently, see the +:ref:`pytest_tips`. - To test code coverage, you need to install the `coverage - `_ package in addition to pytest. +Writing matplotlib related tests +................................ - 1. Run 'make test-coverage'. The output lists for each file the line - numbers that are not tested. +Test fixtures ensure that a set of tests will be executing with the appropriate +initialization and cleanup. The scikit-learn test suite implements a fixture +which can be used with ``matplotlib``. - 2. Find a low hanging fruit, looking at which lines are not tested, - write or adapt a test specifically for these lines. +``pyplot`` + The ``pyplot`` fixture should be used when a test function is dealing with + ``matplotlib``. 
``matplotlib`` is a soft dependency and is not required. + This fixture is in charge of skipping the tests if ``matplotlib`` is not + installed. In addition, figures created during the tests will be + automatically closed once the test function has been executed. - 3. Loop. +To use this fixture in a test function, one needs to pass it as an +argument:: -For guidelines on how to use ``pytest`` efficiently, see the -:ref:`pytest_tips`. + def test_requiring_mpl_fixture(pyplot): + # you can now safely use matplotlib + +Workflow to improve test coverage +................................. + +To test code coverage, you need to install the `coverage +`_ package in addition to pytest. + +1. Run 'make test-coverage'. The output lists for each file the line + numbers that are not tested. + +2. Find a low hanging fruit, looking at which lines are not tested, + write or adapt a test specifically for these lines. +3. Loop. Developers web site ------------------- diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index c523236a11348..56de69db9519c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -471,6 +471,7 @@ Samples generator :toctree: generated/ experimental.enable_hist_gradient_boosting + experimental.enable_iterative_imputer .. _feature_extraction_ref: diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 777a2bd157b29..4cd0ea6e85d60 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -105,7 +105,16 @@ of ``y``. This is done for each feature in an iterative fashion, and then is repeated for ``max_iter`` imputation rounds. The results of the final imputation round are returned. +.. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_iterative_imputer``. + +:: + >>> import numpy as np + >>> from sklearn.experimental import enable_iterative_imputer >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(max_iter=10, random_state=0) >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a370791d248e2..c01b74775684f 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -136,17 +136,24 @@ Setting the regularization parameter: generalized Cross-Validation ------------------------------------------------------------------ :class:`RidgeCV` implements ridge regression with built-in -cross-validation of the alpha parameter. The object works in the same way +cross-validation of the alpha parameter. 
The object works in the same way as GridSearchCV except that it defaults to Generalized Cross-Validation (GCV), an efficient form of leave-one-out cross-validation:: + >>> import numpy as np >>> from sklearn import linear_model - >>> reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3) - >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) # doctest: +SKIP - RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3, fit_intercept=True, scoring=None, - normalize=False) - >>> reg.alpha_ # doctest: +SKIP - 0.1 + >>> reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13)) + >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) # doctest: +NORMALIZE_WHITESPACE + RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, + 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]), + cv=None, fit_intercept=True, gcv_mode=None, normalize=False, + scoring=None, store_cv_values=False) + >>> reg.alpha_ + 0.01 + +Specifying the value of the `cv` attribute will trigger the use of +cross-validation with `GridSearchCV`, for example `cv=10` for 10-fold +cross-validation, rather than Generalized Cross-Validation. .. topic:: References diff --git a/doc/roadmap.rst b/doc/roadmap.rst index a8334604395a2..2252b62d273e6 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -128,7 +128,6 @@ bottom. #. Improved tools for model diagnostics and basic inference - * partial dependence plots :issue:`5653` * alternative feature importances implementations (e.g. methods or wrappers) * better ways to handle validation sets when fitting * better ways to find thresholds / create decision rules :issue:`8614` @@ -144,19 +143,6 @@ bottom. :issue:`6929` * Callbacks or a similar system would facilitate logging and early stopping -#. Use scipy BLAS Cython bindings - - * This will make it possible to get rid of our partial copy of suboptimal - Atlas C-routines. :issue:`11638` - * This should speed up the Windows and Linux wheels - -#. Allow fine-grained parallelism in cython - - * Now that we do not use fork-based multiprocessing in joblib anymore it's - possible to use the prange / openmp thread management which makes it - possible to have very efficient thread-based parallelism at the Cython - level. Example with K-Means: :issue:`11950` - #. Distributed parallelism * Joblib can now plug onto several backends, some of them can distribute the @@ -240,9 +226,6 @@ Subpackage-specific goals :mod:`sklearn.ensemble` * a stacking implementation -* a binned feature histogram based and thread parallel implementation of - decision trees to compete with the performance of state of the art gradient - boosting like LightGBM. :mod:`sklearn.model_selection` @@ -269,5 +252,3 @@ Subpackage-specific goals * Performance issues with `Pipeline.memory` * see "Everything in Scikit-learn should conform to our API contract" above -* Add a verbose option :issue:`10435` - diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index bf18d8350646e..91c8e4506ec2b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -7,7 +7,7 @@ Version 0.21.0 ============== -**May 2019** +**10 May 2019** Changed models -------------- @@ -38,6 +38,8 @@ random sampling procedures. seed, including :class:`linear_model.LogisticRegression`, :class:`linear_model.LogisticRegressionCV`, :class:`linear_model.Ridge`, and :class:`linear_model.RidgeCV` with 'sag' solver. |Fix| +- :class:`linear_model.ridge.RidgeCV` when using generalized cross-validation + with sparse inputs. |Fix| Details are listed in the changelog below. 
@@ -119,6 +121,12 @@ Support for Python 3.4 and below has been officially dropped. parameter which can be used to find the clusters instead of ``n_clusters``. :issue:`9069` by :user:`Vathsala Achar ` and `Adrin Jalali`_. +:mod:`sklearn.compose` +...................... + +- |API| :class:`compose.ColumnTransformer` is no longer an experimental + feature. :pr:`13835` by :user:`Hanmin Qin `. + :mod:`sklearn.datasets` ....................... @@ -214,7 +222,7 @@ Support for Python 3.4 and below has been officially dropped. >>> # explicitly require this experimental feature >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble + >>> # now you can import normally from sklearn.ensemble >>> from sklearn.ensemble import HistGradientBoostingClassifier :pr:`12807` by :user:`Nicolas Hug`. @@ -319,6 +327,17 @@ Support for Python 3.4 and below has been officially dropped. :pr:`12599` by :user:`Trevor Stephens` and :user:`Nicolas Hug`. +- |Fix| :class:`ensemble.VotingClassifier` and + :class:`ensemble.VotingRegressor` were failing during ``fit`` in one + of the estimators was set to ``None`` and ``sample_weight`` was not ``None``. + :pr:`13779` by :user:`Guillaume Lemaitre `. + +- |API| :class:`ensemble.VotingClassifier` and + :class:`ensemble.VotingRegressor` accept ``'drop'`` to disable an estimator + in addition to ``None`` to be consistent with other estimators (i.e., + :class:`pipeline.FeatureUnion` and :class:`compose.ColumnTransformer`). + :pr:`13780` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.externals` ........................ @@ -345,6 +364,15 @@ Support for Python 3.4 and below has been officially dropped. :pr:`12177` by :user:`Sergey Feldman ` and :user:`Ben Lawson `. + The API of IterativeImputer is experimental and subject to change without any + deprecation cycle. To use them, you need to explicitly import + ``enable_iterative_imputer``:: + + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from sklearn.impute + >>> from sklearn.impute import IterativeImputer + + - |Feature| The :class:`impute.SimpleImputer` and :class:`impute.IterativeImputer` have a new parameter ``'add_indicator'``, which simply stacks a :class:`impute.MissingIndicator` transform into the @@ -384,6 +412,10 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.linear_model` ........................... +- |Enhancement| :class:`linear_model.Ridge` now preserves ``float32`` and + ``float64`` dtypes. :issues:`8769` and :issues:`11000` by + :user:`Guillaume Lemaitre `, and :user:`Joan Massich ` + - |Feature| :class:`linear_model.LogisticRegression` and :class:`linear_model.LogisticRegressionCV` now support Elastic-Net penalty, with the 'saga' solver. :pr:`11646` by :user:`Nicolas Hug `. @@ -478,6 +510,10 @@ Support for Python 3.4 and below has been officially dropped. in version 0.21 and will be removed in version 0.23. :pr:`12821` by :user:`Nicolas Hug `. +- |Fix| :class:`linear_model.ridge.RidgeCV` with generalized cross-validation + now correctly fits an intercept when ``fit_intercept=True`` and the design + matrix is sparse. :issue:`13350` by :user:`Jérôme Dockès ` + :mod:`sklearn.manifold` ....................... @@ -577,7 +613,7 @@ Support for Python 3.4 and below has been officially dropped. 
- |Feature| Classes :class:`~model_selection.GridSearchCV` and :class:`~model_selection.RandomizedSearchCV` now allow for refit=callable to add flexibility in identifying the best estimator. - See :doc:`/auto_examples/model_selection/plot_grid_search_refit_callable.py`. + See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_refit_callable.py`. :pr:`11354` by :user:`Wenhao Zhang `, `Joel Nothman`_ and :user:`Adrin Jalali `. diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index c6a8cb65d2c6b..06fab08c381f2 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -8,13 +8,13 @@ variable as an output in turn. In this example we compare some estimators for the purpose of missing feature -imputation with :class:`sklearn.imputeIterativeImputer`:: +imputation with :class:`sklearn.impute.IterativeImputer`: - :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression - :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression - :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R - :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN - imputation approaches +* :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression +* :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression +* :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R +* :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN + imputation approaches Of particular interest is the ability of :class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a @@ -42,6 +42,8 @@ import matplotlib.pyplot as plt import pandas as pd +# To use this experimental feature, we need to explicitly ask for it: +from sklearn.experimental import enable_iterative_imputer # noqa from sklearn.datasets import fetch_california_housing from sklearn.impute import SimpleImputer from sklearn.impute import IterativeImputer diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 897b66aad246c..2d2d37745abf3 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -23,6 +23,8 @@ import numpy as np import matplotlib.pyplot as plt +# To use the experimental IterativeImputer, we need to explicitly ask for it: +from sklearn.experimental import enable_iterative_imputer # noqa from sklearn.datasets import load_diabetes from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor diff --git a/sklearn/__init__.py b/sklearn/__init__.py index bd5e052a50577..1271b7e9fd4a9 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -45,7 +45,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
# 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.21rc2' +__version__ = '0.21.0' # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index c99ef8f618b23..9f9245aa32f21 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -148,7 +148,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): Parameters ---------- X : array, shape (n_samples, n_features) - feature matrix representing n_samples samples to be clustered + feature matrix representing n_samples samples to be clustered connectivity : sparse matrix (optional). connectivity matrix. Defines for each sample the neighboring samples @@ -219,7 +219,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): n_samples, n_features = X.shape if connectivity is None: - from scipy.cluster import hierarchy # imports PIL + from scipy.cluster import hierarchy # imports PIL if n_clusters is not None: warnings.warn('Partial build of the tree is implemented ' @@ -433,7 +433,7 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', 'of %s, but %s was given' % (linkage_choices.keys(), linkage)) if connectivity is None: - from scipy.cluster import hierarchy # imports PIL + from scipy.cluster import hierarchy # imports PIL if n_clusters is not None: warnings.warn('Partial build of the tree is implemented ' @@ -597,7 +597,7 @@ def _single_linkage(*args, **kwargs): ############################################################################### -# Functions for cutting hierarchical clustering tree +# Functions for cutting hierarchical clustering tree def _hc_cut(n_clusters, children, n_leaves): """Function cutting the ward tree for a given number of clusters. diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 042e6990b5df1..37dc6a3abda61 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -44,7 +44,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): """Init n_clusters seeds according to k-means++ Parameters - ----------- + ---------- X : array or sparse matrix, shape (n_samples, n_features) The data to pick seeds for. To avoid memory copy, the input data should be double precision (dtype=np.float64). @@ -706,7 +706,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, an int to make the randomness deterministic. See :term:`Glossary `. - x_squared_norms : array, shape (n_samples,), optional + x_squared_norms : array, shape (n_samples,), optional Squared euclidean norm of each data point. Pass it if you have it at hands already to avoid it being recomputed here. Default: None @@ -887,7 +887,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): probably much faster than the default batch implementation. Notes - ------ + ----- The k-means problem is solved using either Lloyd's or Elkan's algorithm. The average complexity is given by O(k n T), were n is the number of @@ -1419,8 +1419,8 @@ class MiniBatchKMeans(KMeans): ... [3, 2], [5, 5], [1, -1]]) >>> # manually fit on batches >>> kmeans = MiniBatchKMeans(n_clusters=2, - ... random_state=0, - ... batch_size=6) + ... random_state=0, + ... 
batch_size=6) >>> kmeans = kmeans.partial_fit(X[0:6,:]) >>> kmeans = kmeans.partial_fit(X[6:12,:]) >>> kmeans.cluster_centers_ @@ -1430,9 +1430,9 @@ class MiniBatchKMeans(KMeans): array([0, 1], dtype=int32) >>> # fit on the whole data >>> kmeans = MiniBatchKMeans(n_clusters=2, - ... random_state=0, - ... batch_size=6, - ... max_iter=10).fit(X) + ... random_state=0, + ... batch_size=6, + ... max_iter=10).fit(X) >>> kmeans.cluster_centers_ array([[3.95918367, 2.40816327], [1.12195122, 1.3902439 ]]) diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py index 7e93e715b7585..68b92139537d3 100644 --- a/sklearn/cluster/mean_shift_.py +++ b/sklearn/cluster/mean_shift_.py @@ -409,7 +409,7 @@ def fit(self, X, y=None): """Perform clustering. Parameters - ----------- + ---------- X : array-like, shape=[n_samples, n_features] Samples to cluster. diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 82a771756d09c..fdaf423a11db4 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -173,7 +173,7 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- affinity : array-like or sparse matrix, shape: (n_samples, n_samples) The affinity matrix describing the relationship of the samples to embed. **Must be symmetric**. @@ -240,7 +240,7 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf Notes - ------ + ----- The graph should contain only one connect component, elsewhere the results make little sense. @@ -298,7 +298,7 @@ class SpectralClustering(BaseEstimator, ClusterMixin): Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- n_clusters : integer, optional The dimension of the projection subspace. diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index a59e7962bbbb4..11dad7338b94a 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -33,9 +33,6 @@ class ColumnTransformer(_BaseComposition, TransformerMixin): """Applies transformers to columns of an array or pandas DataFrame. - EXPERIMENTAL: some behaviors may change between releases without - deprecation. - This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space. diff --git a/sklearn/conftest.py b/sklearn/conftest.py new file mode 100644 index 0000000000000..d38e45f57b4f8 --- /dev/null +++ b/sklearn/conftest.py @@ -0,0 +1,21 @@ +import pytest + + +@pytest.fixture(scope='function') +def pyplot(): + """Setup and teardown fixture for matplotlib. + + This fixture checks if we can import matplotlib. If not, the tests will be + skipped. Otherwise, we setup matplotlib backend and close the figures + after running the functions. + + Returns + ------- + pyplot : module + The ``matplotlib.pyplot`` module. 
+ """ + matplotlib = pytest.importorskip('matplotlib') + matplotlib.use('agg', warn=False, force=True) + pyplot = pytest.importorskip('matplotlib.pyplot') + yield pyplot + pyplot.close('all') diff --git a/sklearn/covariance/empirical_covariance_.py b/sklearn/covariance/empirical_covariance_.py index 21d389846f198..a962c7ead8615 100644 --- a/sklearn/covariance/empirical_covariance_.py +++ b/sklearn/covariance/empirical_covariance_.py @@ -122,8 +122,8 @@ class EmpiricalCovariance(BaseEstimator): ... [.3, .4]]) >>> rng = np.random.RandomState(0) >>> X = rng.multivariate_normal(mean=[0, 0], - ... cov=real_cov, - ... size=500) + ... cov=real_cov, + ... size=500) >>> cov = EmpiricalCovariance().fit(X) >>> cov.covariance_ # doctest: +ELLIPSIS array([[0.7569..., 0.2818...], diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py index 35ead3fcd8210..2e355f5cf3f1b 100644 --- a/sklearn/covariance/graph_lasso_.py +++ b/sklearn/covariance/graph_lasso_.py @@ -337,10 +337,10 @@ class GraphicalLasso(EmpiricalCovariance): -------- >>> import numpy as np >>> from sklearn.covariance import GraphicalLasso - >>> true_cov = np.array([[.8, 0., .2, 0.], - ... [0., .4, 0., 0.], - ... [.2, 0., .3, .1], - ... [0., 0., .1, .7]]) + >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.2, 0.0, 0.3, 0.1], + ... [0.0, 0.0, 0.1, 0.7]]) >>> np.random.seed(0) >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0], ... cov=true_cov, @@ -592,10 +592,10 @@ class GraphicalLassoCV(GraphicalLasso): -------- >>> import numpy as np >>> from sklearn.covariance import GraphicalLassoCV - >>> true_cov = np.array([[.8, 0., .2, 0.], - ... [0., .4, 0., 0.], - ... [.2, 0., .3, .1], - ... [0., 0., .1, .7]]) + >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.2, 0.0, 0.3, 0.1], + ... [0.0, 0.0, 0.1, 0.7]]) >>> np.random.seed(0) >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0], ... cov=true_cov, diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 3fdffc5851d01..0b8f73c86117b 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -568,12 +568,12 @@ def load_digits(n_class=10, return_X_y=False): def load_diabetes(return_X_y=False): """Load and return the diabetes dataset (regression). - ============== ================== - Samples total 442 - Dimensionality 10 - Features real, -.2 < x < .2 - Targets integer 25 - 346 - ============== ================== + ============== ================== + Samples total 442 + Dimensionality 10 + Features real, -.2 < x < .2 + Targets integer 25 - 346 + ============== ================== Read more in the :ref:`User Guide `. @@ -621,12 +621,12 @@ def load_diabetes(return_X_y=False): def load_linnerud(return_X_y=False): """Load and return the linnerud dataset (multivariate regression). - ============== ============================ - Samples total 20 - Dimensionality 3 (for both data and target) - Features integer - Targets integer - ============== ============================ + ============== ============================ + Samples total 20 + Dimensionality 3 (for both data and target) + Features integer + Targets integer + ============== ============================ Read more in the :ref:`User Guide `. @@ -685,12 +685,12 @@ def load_linnerud(return_X_y=False): def load_boston(return_X_y=False): """Load and return the boston house-prices dataset (regression). - ============== ============== - Samples total 506 - Dimensionality 13 - Features real, positive - Targets real 5. - 50. 
- ============== ============== + ============== ============== + Samples total 506 + Dimensionality 13 + Features real, positive + Targets real 5. - 50. + ============== ============== Read more in the :ref:`User Guide `. @@ -810,7 +810,7 @@ def load_sample_image(image_name): Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- image_name : {`china.jpg`, `flower.jpg`} The name of the sample image loaded @@ -820,7 +820,7 @@ def load_sample_image(image_name): The image as a numpy array: height x width x color Examples - --------- + -------- >>> from sklearn.datasets import load_sample_image >>> china = load_sample_image('china.jpg') # doctest: +SKIP @@ -895,7 +895,7 @@ def _fetch_remote(remote, dirname=None): downloaded file. Parameters - ----------- + ---------- remote : RemoteFileMetadata Named tuple containing remote dataset meta information: url, filename and checksum diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py index 372d6e44f1b92..26550270c3aab 100644 --- a/sklearn/datasets/california_housing.py +++ b/sklearn/datasets/california_housing.py @@ -50,12 +50,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True, return_X_y=False): """Load the California housing dataset (regression). - ============== ============== - Samples total 20640 - Dimensionality 8 - Features real - Target real 0.15 - 5. - ============== ============== + ============== ============== + Samples total 20640 + Dimensionality 8 + Features real + Target real 0.15 - 5. + ============== ============== Read more in the :ref:`User Guide `. @@ -97,7 +97,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True, .. versionadded:: 0.20 Notes - ------ + ----- This dataset consists of 20,640 samples and 9 features. """ diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 2363a9a4689ca..6f76ee15e2e40 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -449,9 +449,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, .. note:: EXPERIMENTAL - The API is experimental in version 0.20 (particularly the return value - structure), and might have small backward-incompatible changes in - future releases. + The API is experimental (particularly the return value structure), + and might have small backward-incompatible changes in future releases. Parameters ---------- @@ -515,10 +514,9 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, .. note:: EXPERIMENTAL - This interface is **experimental** as at version 0.20 and - subsequent releases may change attributes without notice - (although there should only be minor changes to ``data`` - and ``target``). + This interface is **experimental** and subsequent releases may + change attributes without notice (although there should only be + minor changes to ``data`` and ``target``). Missing values in the 'data' are represented as NaN's. Missing values in 'target' are represented as NaN's (numerical target) or None diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py index 34e8251f9551f..83cb5b132ccd5 100644 --- a/sklearn/datasets/species_distributions.py +++ b/sklearn/datasets/species_distributions.py @@ -154,7 +154,7 @@ def fetch_species_distributions(data_home=None, instead of trying to download the data from the source site. 
Returns - -------- + ------- The data is returned as a Bunch object with the following attributes: coverages : array, shape = [14, 1592, 1212] diff --git a/sklearn/datasets/svmlight_format.py b/sklearn/datasets/svmlight_format.py index fbb38ceffa298..c85d4e91749b6 100644 --- a/sklearn/datasets/svmlight_format.py +++ b/sklearn/datasets/svmlight_format.py @@ -435,7 +435,7 @@ def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None, # if a user wants to get fancy, they'll have to decode themselves. # Avoid mention of str and unicode types for Python 3.x compat. if isinstance(comment, bytes): - comment.decode("ascii") # just for the exception + comment.decode("ascii") # just for the exception else: comment = comment.encode("utf-8") if b"\0" in comment: diff --git a/sklearn/decomposition/base.py b/sklearn/decomposition/base.py index b318de0cd0daf..e0b9b33de0bda 100644 --- a/sklearn/decomposition/base.py +++ b/sklearn/decomposition/base.py @@ -27,7 +27,7 @@ def get_covariance(self): """Compute data covariance with the generative model. ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)`` - where S**2 contains the explained variances, and sigma2 contains the + where S**2 contains the explained variances, and sigma2 contains the noise variances. Returns diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index ef823272e0e8f..8075b706a5f9c 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -171,7 +171,7 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', copy_Xy=copy_cov).T else: raise ValueError('Sparse coding method must be "lasso_lars" ' - '"lasso_cd", "lasso", "threshold" or "omp", got %s.' + '"lasso_cd", "lasso", "threshold" or "omp", got %s.' % algorithm) if new_code.ndim != 2: return new_code.reshape(n_samples, n_components) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index c1c695c96d82b..555bd619c5a62 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -230,9 +230,9 @@ def _fit_transform(self, K): # there is a link between # the eigenvectors of K=Phi(X)'Phi(X) and the ones of Phi(X)Phi(X)' # if v is an eigenvector of K - # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' # if u is an eigenvector of Phi(X)Phi(X)' - # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)' # # At this stage our self.alphas_ (the v) have norm 1, we need to scale # them so that eigenvectors in kernel feature space (the u) have norm=1 diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index d1cee0345d5e6..5c8893d141724 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -223,6 +223,8 @@ class PCA(_BasePCA): The singular values are equal to the 2-norms of the ``n_components`` variables in the lower-dimensional space. + .. versionadded:: 0.19 + mean_ : array, shape (n_features,) Per-feature empirical mean, estimated from the training set. 
diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index e710bc5045b30..9d64292b702e0 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -716,7 +716,7 @@ def _decision_function(self, X): Xm = X - self.means_[i] X2 = np.dot(Xm, R * (S ** (-0.5))) norm2.append(np.sum(X2 ** 2, 1)) - norm2 = np.array(norm2).T # shape = [len(X), n_classes] + norm2 = np.array(norm2).T # shape = [len(X), n_classes] u = np.asarray([np.sum(np.log(s)) for s in self.scalings_]) return (-0.5 * (norm2 + u) + np.log(self.priors_)) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 6e1fe461fabe7..98ecef6f6c459 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -492,10 +492,10 @@ def predict(self, X, return_std=False): Returns ------- - y : array, shape = [n_samples] or [n_samples, n_outputs] + y : array, shape = [n_samples] or [n_samples, n_outputs] Predicted target values for X. - y_std : array, shape = [n_samples] or [n_samples, n_outputs] + y_std : array, shape = [n_samples] or [n_samples, n_outputs] Standard deviation of predictive distribution of query points. """ check_is_fitted(self, "constant_") diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py index ca92589075b0c..19c66710bf0ad 100644 --- a/sklearn/ensemble/_gb_losses.py +++ b/sklearn/ensemble/_gb_losses.py @@ -879,6 +879,6 @@ def get_init_raw_predictions(self, X, estimator): 'lad': LeastAbsoluteError, 'huber': HuberLossFunction, 'quantile': QuantileLossFunction, - 'deviance': None, # for both, multinomial and binomial + 'deviance': None, # for both, multinomial and binomial 'exponential': ExponentialLoss, } diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index c46ed25a4c4dc..64225db2348dc 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -239,131 +239,6 @@ def predict_stage(np.ndarray[object, ndim=2] estimators, return predict_stages(estimators[stage:stage + 1], X, scale, out) -cdef inline int array_index(int32 val, int32[::1] arr): - """Find index of ``val`` in array ``arr``. """ - cdef int32 res = -1 - cdef int32 i = 0 - cdef int32 n = arr.shape[0] - for i in range(n): - if arr[i] == val: - res = i - break - return res - - -cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, - int32[::1] target_feature, - double learn_rate, - double[::1] out): - """Partial dependence of the response on the ``target_feature`` set. - - For each row in ``X`` a tree traversal is performed. - Each traversal starts from the root with weight 1.0. - - At each non-terminal node that splits on a target variable either - the left child or the right child is visited based on the feature - value of the current sample and the weight is not modified. - At each non-terminal node that splits on a complementary feature - both children are visited and the weight is multiplied by the fraction - of training samples which went to each child. - - At each terminal node the value of the node is multiplied by the - current weight (weights sum to 1 for all visited terminal nodes). - - Parameters - ---------- - tree : sklearn.tree.Tree - A regression tree; tree.values.shape[1] == 1 - X : memory view on 2d ndarray - The grid points on which the partial dependence - should be evaluated. X.shape[1] == target_feature.shape[0]. - target_feature : memory view on 1d ndarray - The set of target features for which the partial dependence - should be evaluated. X.shape[1] == target_feature.shape[0]. 
- learn_rate : double - Constant scaling factor for the leaf predictions. - out : memory view on 1d ndarray - The value of the partial dependence function on each grid - point. - """ - cdef Py_ssize_t i = 0 - cdef Py_ssize_t n_features = X.shape[1] - cdef Node* root_node = tree.nodes - cdef double *value = tree.value - cdef SIZE_t node_count = tree.node_count - - cdef SIZE_t stack_capacity = node_count * 2 - cdef Node **node_stack - cdef double[::1] weight_stack = np_ones((stack_capacity,), dtype=np_float64) - cdef SIZE_t stack_size = 1 - cdef double left_sample_frac - cdef double current_weight - cdef double total_weight = 0.0 - cdef Node *current_node - underlying_stack = np_zeros((stack_capacity,), dtype=np.intp) - node_stack = ( underlying_stack).data - - for i in range(X.shape[0]): - # init stacks for new example - stack_size = 1 - node_stack[0] = root_node - weight_stack[0] = 1.0 - total_weight = 0.0 - - while stack_size > 0: - # get top node on stack - stack_size -= 1 - current_node = node_stack[stack_size] - - if current_node.left_child == TREE_LEAF: - out[i] += weight_stack[stack_size] * value[current_node - root_node] * \ - learn_rate - total_weight += weight_stack[stack_size] - else: - # non-terminal node - feature_index = array_index(current_node.feature, target_feature) - if feature_index != -1: - # split feature in target set - # push left or right child on stack - if X[i, feature_index] <= current_node.threshold: - # left - node_stack[stack_size] = (root_node + - current_node.left_child) - else: - # right - node_stack[stack_size] = (root_node + - current_node.right_child) - stack_size += 1 - else: - # split feature in complement set - # push both children onto stack - - # push left child - node_stack[stack_size] = root_node + current_node.left_child - current_weight = weight_stack[stack_size] - left_sample_frac = root_node[current_node.left_child].weighted_n_node_samples / \ - current_node.weighted_n_node_samples - if left_sample_frac <= 0.0 or left_sample_frac >= 1.0: - raise ValueError("left_sample_frac:%d, " - "weighted_n_node_samples current: %d, " - "weighted_n_node_samples left: %d" - % (left_sample_frac, - current_node.weighted_n_node_samples, - root_node[current_node.left_child].weighted_n_node_samples)) - weight_stack[stack_size] = current_weight * left_sample_frac - stack_size +=1 - - # push right child - node_stack[stack_size] = root_node + current_node.right_child - weight_stack[stack_size] = current_weight * \ - (1.0 - left_sample_frac) - stack_size +=1 - - if not (0.999 < total_weight < 1.001): - raise ValueError("Total weight should be 1.0 but was %.9f" % - total_weight) - - def _random_sample_mask(np.npy_intp n_total_samples, np.npy_intp n_total_in_bag, random_state): """Create a random sample mask where ``n_total_in_bag`` elements are set. diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index ce7ac7116030a..064c7ce8b6411 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -101,7 +101,7 @@ def __lt__(self, other_node): priority). Parameters - ----------- + ---------- other_node : TreeNode The node to compare with. 
""" diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 3ce0eb7f456da..468de934f3666 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1694,7 +1694,7 @@ def _staged_raw_predict(self, X): Regression and binary classification are special cases with ``k == 1``, otherwise ``k==n_classes``. """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') raw_predictions = self._raw_predict_init(X) for i in range(self.estimators_.shape[0]): predict_stage(self.estimators_, i, X, self.learning_rate, @@ -2085,7 +2085,7 @@ def decision_function(self, X): `classes_`. Regression and binary classification produce an array of shape [n_samples]. """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') raw_predictions = self._raw_predict(X) if raw_predictions.shape[1] == 1: return raw_predictions.ravel() @@ -2527,7 +2527,7 @@ def predict(self, X): y : array, shape (n_samples,) The predicted values. """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') # In regression we can directly return the raw value from the trees. return self._raw_predict(X).ravel() diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 11d5208d2d179..b1a40ffd09d1e 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -20,7 +20,6 @@ from ..tree._tree import DTYPE from ..utils import deprecated -from ._gradient_boosting import _partial_dependence_tree from .gradient_boosting import BaseGradientBoosting @@ -174,8 +173,8 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, for stage in range(n_estimators): for k in range(n_trees_per_stage): tree = gbrt.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, target_variables, - gbrt.learning_rate, pdp[k]) + tree.compute_partial_dependence(grid, target_variables, pdp[k]) + pdp *= gbrt.learning_rate return pdp, axes diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index a40fea2ff0099..dc0e0419e812e 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -7,14 +7,12 @@ from numpy.testing import assert_array_equal, assert_allclose from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import if_matplotlib from sklearn.ensemble.partial_dependence import partial_dependence from sklearn.ensemble.partial_dependence import plot_partial_dependence from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn import datasets from sklearn.utils.testing import ignore_warnings -from sklearn.utils.testing import assert_warns_message # toy sample @@ -156,8 +154,7 @@ def test_partial_dependecy_input(): @ignore_warnings(category=DeprecationWarning) @pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') # matplotlib Python3.7 warning -@if_matplotlib -def test_plot_partial_dependence(): +def test_plot_partial_dependence(pyplot): # Test partial dependence plot function. 
clf = GradientBoostingRegressor(n_estimators=10, random_state=1) clf.fit(boston.data, boston.target) @@ -190,9 +187,8 @@ def test_plot_partial_dependence(): @pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') # matplotlib Python3.7 warning -@if_matplotlib @ignore_warnings(category=DeprecationWarning) -def test_plot_partial_dependence_input(): +def test_plot_partial_dependence_input(pyplot): # Test partial dependence plot function input checks. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -228,9 +224,8 @@ def test_plot_partial_dependence_input(): @pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') # matplotlib Python3.7 warning -@if_matplotlib @ignore_warnings(category=DeprecationWarning) -def test_plot_partial_dependence_multiclass(): +def test_plot_partial_dependence_multiclass(pyplot): # Test partial dependence plot function on multi-class input. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, iris.target) @@ -265,30 +260,18 @@ def test_plot_partial_dependence_multiclass(): grid_resolution=grid_resolution) -def test_warning_raised_partial_dependence(): - # Test that deprecation warning is raised - - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(boston.data, boston.target) - grid_resolution = 25 - - assert_warns_message(DeprecationWarning, "The function " - "ensemble.partial_dependence has been deprecated ", - partial_dependence, clf, [0], X=boston.data, - grid_resolution=grid_resolution) - - -@if_matplotlib -def test_warning_raised_partial_dependence_plot(): - # Test that deprecation warning is raised - +@pytest.mark.parametrize( + "func, params", + [(partial_dependence, {'target_variables': [0], 'X': boston.data}), + (plot_partial_dependence, {'X': boston.data, 'features': [0, 1, (0, 1)]})] +) +def test_raise_deprecation_warning(pyplot, func, params): clf = GradientBoostingRegressor(n_estimators=10, random_state=1) clf.fit(boston.data, boston.target) grid_resolution = 25 - assert_warns_message(DeprecationWarning, "The function " - "ensemble.plot_partial_dependence has been " - "deprecated", - plot_partial_dependence, clf, boston.data, - [0, 1, (0, 1)], grid_resolution=grid_resolution, - feature_names=boston.feature_names) + warn_msg = "The function ensemble.{} has been deprecated".format( + func.__name__ + ) + with pytest.warns(DeprecationWarning, match=warn_msg): + func(clf, **params, grid_resolution=grid_resolution) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 2a19bc9a64dc0..b2b16cf8eeec3 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -8,9 +8,11 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raise_message from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import VotingClassifier, VotingRegressor from sklearn.model_selection import GridSearchCV from sklearn import datasets @@ -340,12 +342,25 @@ def test_sample_weight(): assert_array_equal(eclf3.predict(X), clf1.predict(X)) assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X)) + # check that an error is raised and indicative if sample_weight is not + # supported. 
clf4 = KNeighborsClassifier() eclf3 = VotingClassifier(estimators=[ ('lr', clf1), ('svc', clf3), ('knn', clf4)], voting='soft') - msg = ('Underlying estimator \'knn\' does not support sample weights.') - assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight) + msg = ('Underlying estimator KNeighborsClassifier does not support ' + 'sample weights.') + with pytest.raises(ValueError, match=msg): + eclf3.fit(X, y, sample_weight) + + # check that _parallel_fit_estimator will raise the right error + # it should raise the original error if this is not linked to sample_weight + class ClassifierErrorFit(BaseEstimator, ClassifierMixin): + def fit(self, X, y, sample_weight): + raise TypeError('Error unrelated to sample_weight.') + clf = ClassifierErrorFit() + with pytest.raises(TypeError, match='Error unrelated to sample_weight'): + clf.fit(X, y, sample_weight=sample_weight) def test_sample_weight_kwargs(): @@ -402,8 +417,10 @@ def test_set_params(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 @pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 @pytest.mark.filterwarnings('ignore:The default value of n_estimators') -def test_set_estimator_none(): - """VotingClassifier set_params should be able to set estimators as None""" +@pytest.mark.parametrize("drop", [None, 'drop']) +def test_set_estimator_none(drop): + """VotingClassifier set_params should be able to set estimators as None or + drop""" # Test predict clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) @@ -415,22 +432,22 @@ def test_set_estimator_none(): eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 1, 0.5]) - eclf2.set_params(rf=None).fit(X, y) + eclf2.set_params(rf=drop).fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) - assert dict(eclf2.estimators)["rf"] is None + assert dict(eclf2.estimators)["rf"] is drop assert len(eclf2.estimators_) == 2 assert all(isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_) - assert eclf2.get_params()["rf"] is None + assert eclf2.get_params()["rf"] is drop eclf1.set_params(voting='soft').fit(X, y) eclf2.set_params(voting='soft').fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) - msg = 'All estimators are None. At least one is required!' + msg = 'All estimators are None or "drop". At least one is required!' 
assert_raise_message( - ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y) + ValueError, msg, eclf2.set_params(lr=drop, rf=drop, nb=drop).fit, X, y) # Test soft voting transform X1 = np.array([[1], [2]]) @@ -442,7 +459,7 @@ def test_set_estimator_none(): eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[1, 0.5], flatten_transform=False) - eclf2.set_params(rf=None).fit(X1, y1) + eclf2.set_params(rf=drop).fit(X1, y1) assert_array_almost_equal(eclf1.transform(X1), np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]])) @@ -507,3 +524,26 @@ def test_transform(): eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X) ) + + +@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 +@pytest.mark.parametrize( + "X, y, voter", + [(X, y, VotingClassifier( + [('lr', LogisticRegression()), + ('rf', RandomForestClassifier(n_estimators=5))])), + (X_r, y_r, VotingRegressor( + [('lr', LinearRegression()), + ('rf', RandomForestRegressor(n_estimators=5))]))] +) +@pytest.mark.parametrize("drop", [None, 'drop']) +def test_none_estimator_with_weights(X, y, voter, drop): + # check that an estimator can be set to None and passing some weight + # regression test for + # https://github.com/scikit-learn/scikit-learn/issues/13777 + voter.fit(X, y, sample_weight=np.ones(y.shape)) + voter.set_params(lr=drop) + voter.fit(X, y, sample_weight=np.ones(y.shape)) + y_pred = voter.predict(X) + assert y_pred.shape == y.shape diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py index 35821201b6617..f60bb8f49b81d 100644 --- a/sklearn/ensemble/voting.py +++ b/sklearn/ensemble/voting.py @@ -30,7 +30,15 @@ def _parallel_fit_estimator(estimator, X, y, sample_weight=None): """Private function used to fit an estimator within a job.""" if sample_weight is not None: - estimator.fit(X, y, sample_weight=sample_weight) + try: + estimator.fit(X, y, sample_weight=sample_weight) + except TypeError as exc: + if "unexpected keyword argument 'sample_weight'" in str(exc): + raise ValueError( + "Underlying estimator {} does not support sample weights." + .format(estimator.__class__.__name__) + ) from exc + raise else: estimator.fit(X, y) return estimator @@ -53,8 +61,8 @@ def _weights_not_none(self): """Get the weights of not `None` estimators""" if self.weights is None: return None - return [w for est, w in zip(self.estimators, - self.weights) if est[1] is not None] + return [w for est, w in zip(self.estimators, self.weights) + if est[1] not in (None, 'drop')] def _predict(self, X): """Collect results from clf.predict calls. """ @@ -76,24 +84,22 @@ def fit(self, X, y, sample_weight=None): '; got %d weights, %d estimators' % (len(self.weights), len(self.estimators))) - if sample_weight is not None: - for name, step in self.estimators: - if not has_fit_parameter(step, 'sample_weight'): - raise ValueError('Underlying estimator \'%s\' does not' - ' support sample weights.' % name) - names, clfs = zip(*self.estimators) self._validate_names(names) - n_isnone = np.sum([clf is None for _, clf in self.estimators]) + n_isnone = np.sum( + [clf in (None, 'drop') for _, clf in self.estimators] + ) if n_isnone == len(self.estimators): - raise ValueError('All estimators are None. At least one is ' - 'required!') + raise ValueError( + 'All estimators are None or "drop". At least one is required!' 
+ ) self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_parallel_fit_estimator)(clone(clf), X, y, sample_weight=sample_weight) - for clf in clfs if clf is not None) + for clf in clfs if clf not in (None, 'drop') + ) self.named_estimators_ = Bunch() for k, e in zip(self.estimators, self.estimators_): @@ -147,8 +153,8 @@ class VotingClassifier(_BaseVoting, ClassifierMixin): estimators : list of (string, estimator) tuples Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones of those original estimators that will be stored in the class attribute - ``self.estimators_``. An estimator can be set to `None` using - ``set_params``. + ``self.estimators_``. An estimator can be set to ``None`` or ``'drop'`` + using ``set_params``. voting : str, {'hard', 'soft'} (default='hard') If 'hard', uses predicted class labels for majority rule voting. @@ -281,7 +287,7 @@ def predict(self, X): The input samples. Returns - ---------- + ------- maj : array-like, shape (n_samples,) Predicted class labels. """ @@ -325,7 +331,7 @@ def predict_proba(self): The input samples. Returns - ---------- + ------- avg : array-like, shape (n_samples, n_classes) Weighted average probability for each class per sample. """ @@ -379,9 +385,9 @@ class VotingRegressor(_BaseVoting, RegressorMixin): Parameters ---------- estimators : list of (string, estimator) tuples - Invoking the ``fit`` method on the ``VotingRegressor`` will fit - clones of those original estimators that will be stored in the class - attribute ``self.estimators_``. An estimator can be set to `None` + Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones + of those original estimators that will be stored in the class attribute + ``self.estimators_``. An estimator can be set to ``None`` or ``'drop'`` using ``set_params``. weights : array-like, shape (n_regressors,), optional (default=`None`) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index 6e13b7bd80ae2..724aa07d2d76c 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -685,7 +685,7 @@ class in ``classes_``, respectively. # The weights are all 1. for SAMME.R pred = sum(_samme_proba(estimator, n_classes, X) for estimator in self.estimators_) - else: # self.algorithm == "SAMME" + else: # self.algorithm == "SAMME" pred = sum((estimator.predict(X) == classes).T * w for estimator, w in zip(self.estimators_, self.estimator_weights_)) @@ -780,7 +780,7 @@ def predict_proba(self, X): # The weights are all 1. for SAMME.R proba = sum(_samme_proba(estimator, n_classes, X) for estimator in self.estimators_) - else: # self.algorithm == "SAMME" + else: # self.algorithm == "SAMME" proba = sum(estimator.predict_proba(X) * w for estimator, w in zip(self.estimators_, self.estimator_weights_)) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 9cf207e40fdd6..22289db5c45e2 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -29,7 +29,7 @@ class NotFittedError(ValueError, AttributeError): ... LinearSVC().predict([[1, 2], [2, 3], [3, 4]]) ... except NotFittedError as e: ... print(repr(e)) - ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS NotFittedError('This LinearSVC instance is not fitted yet'...) .. versionchanged:: 0.18 @@ -114,7 +114,7 @@ class FitFailedWarning(RuntimeWarning): >>> X, y = [[1, 2], [3, 4], [5, 6], [7, 8]], [0, 0, 1, 1] >>> with warnings.catch_warnings(record=True) as w: ... try: - ... 
gs.fit(X, y) # This will raise a ValueError since C is < 0 + ... gs.fit(X, y) # This will raise a ValueError since C is < 0 ... except ValueError: ... pass ... print(repr(w[-1].message)) diff --git a/sklearn/experimental/enable_iterative_imputer.py b/sklearn/experimental/enable_iterative_imputer.py new file mode 100644 index 0000000000000..2f262141cc069 --- /dev/null +++ b/sklearn/experimental/enable_iterative_imputer.py @@ -0,0 +1,19 @@ +"""Enables IterativeImputer + +The API and results of this estimators might change without any deprecation +cycle. + +Importing this file dynamically sets :class:`sklearn.impute.IterativeImputer` +as an attribute of the impute module:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from impute + >>> from sklearn.impute import IterativeImputer +""" + +from ..impute._iterative import IterativeImputer +from .. import impute + +impute.IterativeImputer = IterativeImputer +impute.__all__ += ['IterativeImputer'] diff --git a/sklearn/experimental/tests/test_enable_iterative_imputer.py b/sklearn/experimental/tests/test_enable_iterative_imputer.py new file mode 100644 index 0000000000000..17579e0c43612 --- /dev/null +++ b/sklearn/experimental/tests/test_enable_iterative_imputer.py @@ -0,0 +1,39 @@ +"""Tests for making sure experimental imports work as expected.""" + +import textwrap + +from sklearn.utils.testing import assert_run_python_script + + +def test_imports_strategies(): + # Make sure different import strategies work or fail as expected. + + # Since Python caches the imported modules, we need to run a child process + # for every test case. Else, the tests would not be independent + # (manually removing the imports from the cache (sys.modules) is not + # recommended and can lead to many complications). + + good_import = """ + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(good_import)) + + good_import_with_ensemble_first = """ + import sklearn.ensemble + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(good_import_with_ensemble_first)) + + bad_imports = """ + import pytest + + with pytest.raises(ImportError): + from sklearn.impute import IterativeImputer + + import sklearn.experimental + with pytest.raises(ImportError): + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(bad_imports)) diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 68a43ff0be313..e6f8ff4e555fa 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -32,7 +32,7 @@ def _make_edges_3d(n_x, n_y, n_z=1): """Returns a list of edges for a 3D image. Parameters - =========== + ---------- n_x : integer The size of the grid in the x direction. 
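# Hedged usage sketch (illustrative data, not part of the patch) of the voting
# changes earlier in this diff: an estimator can be disabled with 'drop' (in
# addition to None) via set_params, and sample_weight keeps being forwarded to
# the estimators that remain.
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

X = np.array([[-1.0], [-0.5], [0.5], [1.0]])
y = np.array([0, 0, 1, 1])
eclf = VotingClassifier([('lr', LogisticRegression(solver='lbfgs')),
                         ('rf', RandomForestClassifier(n_estimators=5))])
eclf.set_params(rf='drop')  # only 'lr' is fitted below
eclf.fit(X, y, sample_weight=np.ones(y.shape))
print(eclf.predict(X))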
n_y : integer diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 007e158f3a449..7891e332c8214 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -110,7 +110,7 @@ def _check_stop_list(stop): raise ValueError("not a built-in stop list: %s" % stop) elif stop is None: return None - else: # assume it's a collection + else: # assume it's a collection return frozenset(stop) @@ -1496,7 +1496,7 @@ class TfidfVectorizer(CountVectorizer): idf_ : array, shape (n_features) The inverse document frequency (IDF) vector; only defined - if ``use_idf`` is True. + if ``use_idf`` is True. stop_words_ : set Terms that were ignored because they either: diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py index e7d5e97037427..88e97deaecf54 100644 --- a/sklearn/gaussian_process/kernels.py +++ b/sklearn/gaussian_process/kernels.py @@ -1156,7 +1156,7 @@ class RBF(StationaryKernelMixin, NormalizedKernelMixin, Kernel): .. versionadded:: 0.18 Parameters - ----------- + ---------- length_scale : float or array with shape (n_features,), default: 1.0 The length scale of the kernel. If a float, an isotropic kernel is used. If an array, an anisotropic kernel is used where each dimension @@ -1269,7 +1269,7 @@ class Matern(RBF): .. versionadded:: 0.18 Parameters - ----------- + ---------- length_scale : float or array with shape (n_features,), default: 1.0 The length scale of the kernel. If a float, an isotropic kernel is used. If an array, an anisotropic kernel is used where each dimension diff --git a/sklearn/impute.py b/sklearn/impute.py deleted file mode 100644 index 8bbf1bb94e242..0000000000000 --- a/sklearn/impute.py +++ /dev/null @@ -1,1339 +0,0 @@ -"""Transformers for missing value imputation""" -# Authors: Nicolas Tresegnie -# Sergey Feldman -# License: BSD 3 clause - -from __future__ import division - -import warnings -import numbers -from time import time -from distutils.version import LooseVersion - -import numpy as np -import numpy.ma as ma -import scipy -from scipy import sparse -from scipy import stats -from collections import namedtuple - -from .base import BaseEstimator, TransformerMixin -from .base import clone -from .exceptions import ConvergenceWarning -from .preprocessing import normalize -from .utils import check_array, check_random_state, safe_indexing -from .utils.sparsefuncs import _get_median -from .utils.validation import check_is_fitted -from .utils.validation import FLOAT_DTYPES -from .utils.fixes import _object_dtype_isnan -from .utils import is_scalar_nan - - -ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', - 'neighbor_feat_idx', - 'estimator']) - -__all__ = [ - 'MissingIndicator', - 'SimpleImputer', - 'IterativeImputer', -] - - -def _check_inputs_dtype(X, missing_values): - if (X.dtype.kind in ("f", "i", "u") and - not isinstance(missing_values, numbers.Real)): - raise ValueError("'X' and 'missing_values' types are expected to be" - " both numerical. Got X.dtype={} and " - " type(missing_values)={}." - .format(X.dtype, type(missing_values))) - - -def _get_mask(X, value_to_mask): - """Compute the boolean mask X == missing_values.""" - if is_scalar_nan(value_to_mask): - if X.dtype.kind == "f": - return np.isnan(X) - elif X.dtype.kind in ("i", "u"): - # can't have NaNs in integer array. - return np.zeros(X.shape, dtype=bool) - else: - # np.isnan does not work on object dtypes. 
- return _object_dtype_isnan(X) - else: - # X == value_to_mask with object dytpes does not always perform - # element-wise for old versions of numpy - return np.equal(X, value_to_mask) - - -def _most_frequent(array, extra_value, n_repeat): - """Compute the most frequent value in a 1d array extended with - [extra_value] * n_repeat, where extra_value is assumed to be not part - of the array.""" - # Compute the most frequent value in array only - if array.size > 0: - with warnings.catch_warnings(): - # stats.mode raises a warning when input array contains objects due - # to incapacity to detect NaNs. Irrelevant here since input array - # has already been NaN-masked. - warnings.simplefilter("ignore", RuntimeWarning) - mode = stats.mode(array) - - most_frequent_value = mode[0][0] - most_frequent_count = mode[1][0] - else: - most_frequent_value = 0 - most_frequent_count = 0 - - # Compare to array + [extra_value] * n_repeat - if most_frequent_count == 0 and n_repeat == 0: - return np.nan - elif most_frequent_count < n_repeat: - return extra_value - elif most_frequent_count > n_repeat: - return most_frequent_value - elif most_frequent_count == n_repeat: - # Ties the breaks. Copy the behaviour of scipy.stats.mode - if most_frequent_value < extra_value: - return most_frequent_value - else: - return extra_value - - -class SimpleImputer(BaseEstimator, TransformerMixin): - """Imputation transformer for completing missing values. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - missing_values : number, string, np.nan (default) or None - The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. - - strategy : string, optional (default="mean") - The imputation strategy. - - - If "mean", then replace missing values using the mean along - each column. Can only be used with numeric data. - - If "median", then replace missing values using the median along - each column. Can only be used with numeric data. - - If "most_frequent", then replace missing using the most frequent - value along each column. Can be used with strings or numeric data. - - If "constant", then replace missing values with fill_value. Can be - used with strings or numeric data. - - .. versionadded:: 0.20 - strategy="constant" for fixed value imputation. - - fill_value : string or numerical value, optional (default=None) - When strategy == "constant", fill_value is used to replace all - occurrences of missing_values. - If left to the default, fill_value will be 0 when imputing numerical - data and "missing_value" for strings or object data types. - - verbose : integer, optional (default=0) - Controls the verbosity of the imputer. - - copy : boolean, optional (default=True) - If True, a copy of X will be created. If False, imputation will - be done in-place whenever possible. Note that, in the following cases, - a new copy will always be made, even if `copy=False`: - - - If X is not an array of floating values; - - If X is encoded as a CSR matrix; - - If add_indicator=True. - - add_indicator : boolean, optional (default=False) - If True, a `MissingIndicator` transform will stack onto output - of the imputer's transform. This allows a predictive estimator - to account for missingness despite imputation. If a feature has no - missing values at fit/train time, the feature won't appear on - the missing indicator even if there are missing values at - transform/test time. - - Attributes - ---------- - statistics_ : array of shape (n_features,) - The imputation fill value for each feature. 
- - indicator_ : :class:`sklearn.impute.MissingIndicator` - Indicator used to add binary indicators for missing values. - ``None`` if add_indicator is False. - - See also - -------- - IterativeImputer : Multivariate imputation of missing values. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import SimpleImputer - >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') - >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) - ... # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(add_indicator=False, copy=True, fill_value=None, - missing_values=nan, strategy='mean', verbose=0) - >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] - >>> print(imp_mean.transform(X)) - ... # doctest: +NORMALIZE_WHITESPACE - [[ 7. 2. 3. ] - [ 4. 3.5 6. ] - [10. 3.5 9. ]] - - Notes - ----- - Columns which only contained missing values at `fit` are discarded upon - `transform` if strategy is not "constant". - - """ - def __init__(self, missing_values=np.nan, strategy="mean", - fill_value=None, verbose=0, copy=True, add_indicator=False): - self.missing_values = missing_values - self.strategy = strategy - self.fill_value = fill_value - self.verbose = verbose - self.copy = copy - self.add_indicator = add_indicator - - def _validate_input(self, X): - allowed_strategies = ["mean", "median", "most_frequent", "constant"] - if self.strategy not in allowed_strategies: - raise ValueError("Can only use these strategies: {0} " - " got strategy={1}".format(allowed_strategies, - self.strategy)) - - if self.strategy in ("most_frequent", "constant"): - dtype = None - else: - dtype = FLOAT_DTYPES - - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - - try: - X = check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) - except ValueError as ve: - if "could not convert" in str(ve): - raise ValueError("Cannot use {0} strategy with non-numeric " - "data. Received datatype :{1}." - "".format(self.strategy, X.dtype.kind)) - else: - raise ve - - _check_inputs_dtype(X, self.missing_values) - if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("SimpleImputer does not support data with dtype " - "{0}. Please provide either a numeric array (with" - " a floating point or integer dtype) or " - "categorical data represented either as an array " - "with integer dtype or an array of string values " - "with an object dtype.".format(X.dtype)) - - return X - - def fit(self, X, y=None): - """Fit the imputer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : SimpleImputer - """ - X = self._validate_input(X) - - # default fill_value is 0 for numerical input and "missing_value" - # otherwise - if self.fill_value is None: - if X.dtype.kind in ("i", "u", "f"): - fill_value = 0 - else: - fill_value = "missing_value" - else: - fill_value = self.fill_value - - # fill_value should be numerical in case of numerical input - if (self.strategy == "constant" and - X.dtype.kind in ("i", "u", "f") and - not isinstance(fill_value, numbers.Real)): - raise ValueError("'fill_value'={0} is invalid. 
Expected a " - "numerical value when imputing numerical " - "data".format(fill_value)) - - if sparse.issparse(X): - # missing_values = 0 not allowed with sparse data as it would - # force densification - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - fill_value) - else: - self.statistics_ = self._dense_fit(X, - self.strategy, - self.missing_values, - fill_value) - - if self.add_indicator: - self.indicator_ = MissingIndicator( - missing_values=self.missing_values) - self.indicator_.fit(X) - else: - self.indicator_ = None - - return self - - def _sparse_fit(self, X, strategy, missing_values, fill_value): - """Fit the transformer on sparse data.""" - mask_data = _get_mask(X.data, missing_values) - n_implicit_zeros = X.shape[0] - np.diff(X.indptr) - - statistics = np.empty(X.shape[1]) - - if strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - statistics.fill(fill_value) - else: - for i in range(X.shape[1]): - column = X.data[X.indptr[i]:X.indptr[i + 1]] - mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] - column = column[~mask_column] - - # combine explicit and implicit zeros - mask_zeros = _get_mask(column, 0) - column = column[~mask_zeros] - n_explicit_zeros = mask_zeros.sum() - n_zeros = n_implicit_zeros[i] + n_explicit_zeros - - if strategy == "mean": - s = column.size + n_zeros - statistics[i] = np.nan if s == 0 else column.sum() / s - - elif strategy == "median": - statistics[i] = _get_median(column, - n_zeros) - - elif strategy == "most_frequent": - statistics[i] = _most_frequent(column, - 0, - n_zeros) - return statistics - - def _dense_fit(self, X, strategy, missing_values, fill_value): - """Fit the transformer on dense data.""" - mask = _get_mask(X, missing_values) - masked_X = ma.masked_array(X, mask=mask) - - # Mean - if strategy == "mean": - mean_masked = np.ma.mean(masked_X, axis=0) - # Avoid the warning "Warning: converting a masked element to nan." - mean = np.ma.getdata(mean_masked) - mean[np.ma.getmask(mean_masked)] = np.nan - - return mean - - # Median - elif strategy == "median": - median_masked = np.ma.median(masked_X, axis=0) - # Avoid the warning "Warning: converting a masked element to nan." - median = np.ma.getdata(median_masked) - median[np.ma.getmaskarray(median_masked)] = np.nan - - return median - - # Most frequent - elif strategy == "most_frequent": - # scipy.stats.mstats.mode cannot be used because it will no work - # properly if the first element is masked and if its frequency - # is equal to the frequency of the most frequent valid element - # See https://github.com/scipy/scipy/issues/2636 - - # To be able access the elements by columns - X = X.transpose() - mask = mask.transpose() - - if X.dtype.kind == "O": - most_frequent = np.empty(X.shape[0], dtype=object) - else: - most_frequent = np.empty(X.shape[0]) - - for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): - row_mask = np.logical_not(row_mask).astype(np.bool) - row = row[row_mask] - most_frequent[i] = _most_frequent(row, np.nan, 0) - - return most_frequent - - # Constant - elif strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - return np.full(X.shape[1], fill_value, dtype=X.dtype) - - def transform(self, X): - """Impute all missing values in X. 
- - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - """ - check_is_fitted(self, 'statistics_') - - X = self._validate_input(X) - - statistics = self.statistics_ - - if X.shape[1] != statistics.shape[0]: - raise ValueError("X has %d features per sample, expected %d" - % (X.shape[1], self.statistics_.shape[0])) - - if self.add_indicator: - X_trans_indicator = self.indicator_.transform(X) - - # Delete the invalid columns if strategy is not constant - if self.strategy == "constant": - valid_statistics = statistics - else: - # same as np.isnan but also works for object dtypes - invalid_mask = _get_mask(statistics, np.nan) - valid_mask = np.logical_not(invalid_mask) - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.flatnonzero(valid_mask) - - if invalid_mask.any(): - missing = np.arange(X.shape[1])[invalid_mask] - if self.verbose: - warnings.warn("Deleting features without " - "observed values: %s" % missing) - X = X[:, valid_statistics_indexes] - - # Do actual imputation - if sparse.issparse(X): - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - mask = _get_mask(X.data, self.missing_values) - indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), - np.diff(X.indptr))[mask] - - X.data[mask] = valid_statistics[indexes].astype(X.dtype, - copy=False) - else: - mask = _get_mask(X, self.missing_values) - n_missing = np.sum(mask, axis=0) - values = np.repeat(valid_statistics, n_missing) - coordinates = np.where(mask.transpose())[::-1] - - X[coordinates] = values - - if self.add_indicator: - hstack = sparse.hstack if sparse.issparse(X) else np.hstack - X = hstack((X, X_trans_indicator)) - - return X - - def _more_tags(self): - return {'allow_nan': True} - - -class IterativeImputer(BaseEstimator, TransformerMixin): - """Multivariate imputer that estimates each feature from all the others. - - A strategy for imputing missing values by modeling each feature with - missing values as a function of other features in a round-robin fashion. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object, default=BayesianRidge() - The estimator to use at each step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. - - missing_values : int, np.nan, optional (default=np.nan) - The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. - - sample_posterior : boolean, default=False - Whether to sample from the (Gaussian) predictive posterior of the - fitted estimator for each imputation. Estimator must support - ``return_std`` in its ``predict`` method if set to ``True``. Set to - ``True`` if using ``IterativeImputer`` for multiple imputations. - - max_iter : int, optional (default=10) - Maximum number of imputation rounds to perform before returning the - imputations computed during the final round. A round is a single - imputation of each feature with missing values. The stopping criterion - is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, - where `X_t` is `X` at iteration `t. Note that early stopping is only - applied if ``sample_posterior=False``. - - tol : float, optional (default=1e-3) - Tolerance of the stopping condition. 
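# Standalone sketch of the stopping rule documented above (the helper name
# has_converged is illustrative, not part of the patch): stop once the largest
# change between successive imputations falls below tol scaled by the largest
# observed, non-missing value.
import numpy as np

def has_converged(Xt, Xt_previous, X_observed_values, tol=1e-3):
    inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None)
    return inf_norm < tol * np.max(np.abs(X_observed_values))

# e.g. has_converged(np.array([1.0, 2.0]), np.array([1.0, 2.001]),
#                    np.array([5.0]))  -> True (0.001 < 1e-3 * 5)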
- - n_nearest_features : int, optional (default=None) - Number of other features to use to estimate the missing values of - each feature column. Nearness between features is measured using - the absolute correlation coefficient between each feature pair (after - initial imputation). To ensure coverage of features throughout the - imputation process, the neighbor features are not necessarily nearest, - but are drawn with probability proportional to correlation for each - imputed target feature. Can provide significant speed-up when the - number of features is huge. If ``None``, all features will be used. - - initial_strategy : str, optional (default="mean") - Which strategy to use to initialize the missing values. Same as the - ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` - Valid values: {"mean", "median", "most_frequent", or "constant"}. - - imputation_order : str, optional (default="ascending") - The order in which the features will be imputed. Possible values: - - "ascending" - From features with fewest missing values to most. - "descending" - From features with most missing values to fewest. - "roman" - Left to right. - "arabic" - Right to left. - "random" - A random order for each round. - - min_value : float, optional (default=None) - Minimum possible imputed value. Default of ``None`` will set minimum - to negative infinity. - - max_value : float, optional (default=None) - Maximum possible imputed value. Default of ``None`` will set maximum - to positive infinity. - - verbose : int, optional (default=0) - Verbosity flag, controls the debug messages that are issued - as functions are evaluated. The higher, the more verbose. Can be 0, 1, - or 2. - - random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use. Randomizes - selection of estimator features if n_nearest_features is not None, the - ``imputation_order`` if ``random``, and the sampling from posterior if - ``sample_posterior`` is True. Use an integer for determinism. - See :term:`the Glossary `. - - add_indicator : boolean, optional (default=False) - If True, a `MissingIndicator` transform will stack onto output - of the imputer's transform. This allows a predictive estimator - to account for missingness despite imputation. If a feature has no - missing values at fit/train time, the feature won't appear on - the missing indicator even if there are missing values at - transform/test time. - - Attributes - ---------- - initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` - Imputer used to initialize the missing values. - - imputation_sequence_ : list of tuples - Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where - ``feat_idx`` is the current feature to be imputed, - ``neighbor_feat_idx`` is the array of other features used to impute the - current feature, and ``estimator`` is the trained estimator used for - the imputation. Length is ``self.n_features_with_missing_ * - self.n_iter_``. - - n_iter_ : int - Number of iteration rounds that occurred. Will be less than - ``self.max_iter`` if early stopping criterion was reached. - - n_features_with_missing_ : int - Number of features with missing values. - - indicator_ : :class:`sklearn.impute.MissingIndicator` - Indicator used to add binary indicators for missing values. - ``None`` if add_indicator is False. - - See also - -------- - SimpleImputer : Univariate imputation of missing values. 
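# Hedged usage sketch of the options and fitted attributes documented above
# (data and parameter values are illustrative, not from the patch): draw two
# neighbour features per step with probability proportional to correlation and
# randomize the update order each round.
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

rng = np.random.RandomState(0)
X = rng.rand(20, 4)
X[rng.rand(20, 4) < 0.2] = np.nan
imp = IterativeImputer(n_nearest_features=2, imputation_order='random',
                       random_state=0).fit(X)
print(imp.n_iter_, imp.n_features_with_missing_, len(imp.imputation_sequence_))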
- - Notes - ----- - To support imputation in inductive mode we store each feature's estimator - during the ``fit`` phase, and predict without refitting (in order) during - the ``transform`` phase. - - Features which contain all missing values at ``fit`` are discarded upon - ``transform``. - - Features with missing values during ``transform`` which did not have any - missing values during ``fit`` will be imputed with the initial imputation - method only. - - References - ---------- - .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: - Multivariate Imputation by Chained Equations in R". Journal of - Statistical Software 45: 1-67. - `_ - - .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in - Multivariate Data Suitable for use with an Electronic Computer". - Journal of the Royal Statistical Society 22(2): 302-306. - `_ - """ - - def __init__(self, - estimator=None, - missing_values=np.nan, - sample_posterior=False, - max_iter=10, - tol=1e-3, - n_nearest_features=None, - initial_strategy="mean", - imputation_order='ascending', - min_value=None, - max_value=None, - verbose=0, - random_state=None, - add_indicator=False): - - self.estimator = estimator - self.missing_values = missing_values - self.sample_posterior = sample_posterior - self.max_iter = max_iter - self.tol = tol - self.n_nearest_features = n_nearest_features - self.initial_strategy = initial_strategy - self.imputation_order = imputation_order - self.min_value = min_value - self.max_value = max_value - self.verbose = verbose - self.random_state = random_state - self.add_indicator = add_indicator - - def _impute_one_feature(self, - X_filled, - mask_missing_values, - feat_idx, - neighbor_feat_idx, - estimator=None, - fit_mode=True): - """Impute a single feature from the others provided. - - This function predicts the missing values of one of the features using - the current estimates of all the other features. The ``estimator`` must - support ``return_std=True`` in its ``predict`` method for this function - to work. - - Parameters - ---------- - X_filled : ndarray - Input data with the most recent imputations. - - mask_missing_values : ndarray - Input data's missing indicator matrix. - - feat_idx : int - Index of the feature currently being imputed. - - neighbor_feat_idx : ndarray - Indices of the features to be used in imputing ``feat_idx``. - - estimator : object - The estimator to use at this step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. - If None, it will be cloned from self._estimator. - - fit_mode : boolean, default=True - Whether to fit and predict with the estimator or just predict. - - Returns - ------- - X_filled : ndarray - Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. - - estimator : estimator with sklearn API - The fitted estimator used to impute - ``X_filled[missing_row_mask, feat_idx]``. 
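# Usage sketch for the requirement stated above (not part of the patch): with
# sample_posterior=True the per-feature estimator must accept
# predict(..., return_std=True); BayesianRidge, which is also the default
# estimator, does, so it is a safe choice for multiple imputation.
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

imp = IterativeImputer(estimator=BayesianRidge(), sample_posterior=True,
                       random_state=0)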
- """ - - # if nothing is missing, just return the default - # (should not happen at fit time because feat_ids would be excluded) - missing_row_mask = mask_missing_values[:, feat_idx] - if not np.any(missing_row_mask): - return X_filled, estimator - - if estimator is None and fit_mode is False: - raise ValueError("If fit_mode is False, then an already-fitted " - "estimator should be passed in.") - - if estimator is None: - estimator = clone(self._estimator) - - if fit_mode: - X_train = safe_indexing(X_filled[:, neighbor_feat_idx], - ~missing_row_mask) - y_train = safe_indexing(X_filled[:, feat_idx], - ~missing_row_mask) - estimator.fit(X_train, y_train) - - # get posterior samples - X_test = safe_indexing(X_filled[:, neighbor_feat_idx], - missing_row_mask) - if self.sample_posterior: - mus, sigmas = estimator.predict(X_test, return_std=True) - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - # two types of problems: (1) non-positive sigmas, (2) mus outside - # legal range of min_value and max_value (results in inf sample) - positive_sigmas = sigmas > 0 - imputed_values[~positive_sigmas] = mus[~positive_sigmas] - mus_too_low = mus < self._min_value - imputed_values[mus_too_low] = self._min_value - mus_too_high = mus > self._max_value - imputed_values[mus_too_high] = self._max_value - # the rest can be sampled without statistical issues - inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high - mus = mus[inrange_mask] - sigmas = sigmas[inrange_mask] - a = (self._min_value - mus) / sigmas - b = (self._max_value - mus) / sigmas - - if scipy.__version__ < LooseVersion('0.18'): - # bug with vector-valued `a` in old scipy - imputed_values[inrange_mask] = [ - stats.truncnorm(a=a_, b=b_, - loc=loc_, scale=scale_).rvs( - random_state=self.random_state_) - for a_, b_, loc_, scale_ - in zip(a, b, mus, sigmas)] - else: - truncated_normal = stats.truncnorm(a=a, b=b, - loc=mus, scale=sigmas) - imputed_values[inrange_mask] = truncated_normal.rvs( - random_state=self.random_state_) - else: - imputed_values = estimator.predict(X_test) - imputed_values = np.clip(imputed_values, - self._min_value, - self._max_value) - - # update the feature - X_filled[missing_row_mask, feat_idx] = imputed_values - return X_filled, estimator - - def _get_neighbor_feat_idx(self, - n_features, - feat_idx, - abs_corr_mat): - """Get a list of other features to predict ``feat_idx``. - - If self.n_nearest_features is less than or equal to the total - number of features, then use a probability proportional to the absolute - correlation between ``feat_idx`` and each other feature to randomly - choose a subsample of the other features (without replacement). - - Parameters - ---------- - n_features : int - Number of features in ``X``. - - feat_idx : int - Index of the feature currently being imputed. - - abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X``. The diagonal has been zeroed - out and each feature has been normalized to sum to 1. Can be None. - - Returns - ------- - neighbor_feat_idx : array-like - The features to use to impute ``feat_idx``. 
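# Minimal sketch of the neighbour selection described above (standalone helper;
# it assumes, like the real code, that column feat_idx of abs_corr_mat has a
# zeroed diagonal entry and sums to 1, so it is a valid probability vector).
import numpy as np

def pick_neighbors(abs_corr_mat, feat_idx, n_nearest_features, rng):
    p = abs_corr_mat[:, feat_idx]
    return rng.choice(np.arange(abs_corr_mat.shape[0]), n_nearest_features,
                      replace=False, p=p)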
- """ - if (self.n_nearest_features is not None and - self.n_nearest_features < n_features): - p = abs_corr_mat[:, feat_idx] - neighbor_feat_idx = self.random_state_.choice( - np.arange(n_features), self.n_nearest_features, replace=False, - p=p) - else: - inds_left = np.arange(feat_idx) - inds_right = np.arange(feat_idx + 1, n_features) - neighbor_feat_idx = np.concatenate((inds_left, inds_right)) - return neighbor_feat_idx - - def _get_ordered_idx(self, mask_missing_values): - """Decide in what order we will update the features. - - As a homage to the MICE R package, we will have 4 main options of - how to order the updates, and use a random order if anything else - is specified. - - Also, this function skips features which have no missing values. - - Parameters - ---------- - mask_missing_values : array-like, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. - - Returns - ------- - ordered_idx : ndarray, shape (n_features,) - The order in which to impute the features. - """ - frac_of_missing_values = mask_missing_values.mean(axis=0) - missing_values_idx = np.nonzero(frac_of_missing_values)[0] - if self.imputation_order == 'roman': - ordered_idx = missing_values_idx - elif self.imputation_order == 'arabic': - ordered_idx = missing_values_idx[::-1] - elif self.imputation_order == 'ascending': - n = len(frac_of_missing_values) - len(missing_values_idx) - ordered_idx = np.argsort(frac_of_missing_values, - kind='mergesort')[n:][::-1] - elif self.imputation_order == 'descending': - n = len(frac_of_missing_values) - len(missing_values_idx) - ordered_idx = np.argsort(frac_of_missing_values, - kind='mergesort')[n:] - elif self.imputation_order == 'random': - ordered_idx = missing_values_idx - self.random_state_.shuffle(ordered_idx) - else: - raise ValueError("Got an invalid imputation order: '{0}'. It must " - "be one of the following: 'roman', 'arabic', " - "'ascending', 'descending', or " - "'random'.".format(self.imputation_order)) - return ordered_idx - - def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): - """Get absolute correlation matrix between features. - - Parameters - ---------- - X_filled : ndarray, shape (n_samples, n_features) - Input data with the most recent imputations. - - tolerance : float, optional (default=1e-6) - ``abs_corr_mat`` can have nans, which will be replaced - with ``tolerance``. - - Returns - ------- - abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X`` at the beginning of the - current round. The diagonal has been zeroed out and each feature's - absolute correlations with all others have been normalized to sum - to 1. - """ - n_features = X_filled.shape[1] - if (self.n_nearest_features is None or - self.n_nearest_features >= n_features): - return None - abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) - # np.corrcoef is not defined for features with zero std - abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance - # ensures exploration, i.e. at least some probability of sampling - np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) - # features are not their own neighbors - np.fill_diagonal(abs_corr_mat, 0) - # needs to sum to 1 for np.random.choice sampling - abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) - return abs_corr_mat - - def _initial_imputation(self, X): - """Perform initial imputation for input X. 
- - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - Returns - ------- - Xt : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - X_filled : ndarray, shape (n_samples, n_features) - Input data with the most recent imputations. - - mask_missing_values : ndarray, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. - """ - if is_scalar_nan(self.missing_values): - force_all_finite = "allow-nan" - else: - force_all_finite = True - - X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) - - mask_missing_values = _get_mask(X, self.missing_values) - if self.initial_imputer_ is None: - self.initial_imputer_ = SimpleImputer( - missing_values=self.missing_values, - strategy=self.initial_strategy) - X_filled = self.initial_imputer_.fit_transform(X) - else: - X_filled = self.initial_imputer_.transform(X) - - valid_mask = np.flatnonzero(np.logical_not( - np.isnan(self.initial_imputer_.statistics_))) - Xt = X[:, valid_mask] - mask_missing_values = mask_missing_values[:, valid_mask] - - return Xt, X_filled, mask_missing_values - - def fit_transform(self, X, y=None): - """Fits the imputer on X and return the transformed X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - y : ignored. - - Returns - ------- - Xt : array-like, shape (n_samples, n_features) - The imputed input data. - """ - self.random_state_ = getattr(self, "random_state_", - check_random_state(self.random_state)) - - if self.max_iter < 0: - raise ValueError( - "'max_iter' should be a positive integer. Got {} instead." - .format(self.max_iter)) - - if self.tol < 0: - raise ValueError( - "'tol' should be a non-negative float. Got {} instead." - .format(self.tol) - ) - - if self.add_indicator: - self.indicator_ = MissingIndicator( - missing_values=self.missing_values) - X_trans_indicator = self.indicator_.fit_transform(X) - else: - self.indicator_ = None - - if self.estimator is None: - from .linear_model import BayesianRidge - self._estimator = BayesianRidge() - else: - self._estimator = clone(self.estimator) - - self.imputation_sequence_ = [] - - if hasattr(self._estimator, 'random_state'): - self._estimator.random_state = self.random_state_ - - self._min_value = -np.inf if self.min_value is None else self.min_value - self._max_value = np.inf if self.max_value is None else self.max_value - - self.initial_imputer_ = None - X, Xt, mask_missing_values = self._initial_imputation(X) - - if self.max_iter == 0 or np.all(mask_missing_values): - self.n_iter_ = 0 - return Xt - - # order in which to impute - # note this is probably too slow for large feature data (d > 100000) - # and a better way would be good. 
- # see: https://goo.gl/KyCNwj and subsequent comments - ordered_idx = self._get_ordered_idx(mask_missing_values) - self.n_features_with_missing_ = len(ordered_idx) - - abs_corr_mat = self._get_abs_corr_mat(Xt) - - n_samples, n_features = Xt.shape - if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" - % (X.shape,)) - start_t = time() - if not self.sample_posterior: - Xt_previous = Xt.copy() - normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) - for self.n_iter_ in range(1, self.max_iter + 1): - if self.imputation_order == 'random': - ordered_idx = self._get_ordered_idx(mask_missing_values) - - for feat_idx in ordered_idx: - neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, - feat_idx, - abs_corr_mat) - Xt, estimator = self._impute_one_feature( - Xt, mask_missing_values, feat_idx, neighbor_feat_idx, - estimator=None, fit_mode=True) - estimator_triplet = ImputerTriplet(feat_idx, - neighbor_feat_idx, - estimator) - self.imputation_sequence_.append(estimator_triplet) - - if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' - '%d/%d, elapsed time %0.2f' - % (self.n_iter_, self.max_iter, time() - start_t)) - - if not self.sample_posterior: - inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, - axis=None) - if inf_norm < normalized_tol: - if self.verbose > 0: - print('[IterativeImputer] Early stopping criterion ' - 'reached.') - break - Xt_previous = Xt.copy() - else: - if not self.sample_posterior: - warnings.warn("[IterativeImputer] Early stopping criterion not" - " reached.", ConvergenceWarning) - Xt[~mask_missing_values] = X[~mask_missing_values] - - if self.add_indicator: - Xt = np.hstack((Xt, X_trans_indicator)) - return Xt - - def transform(self, X): - """Imputes all missing values in X. - - Note that this is stochastic, and that if random_state is not fixed, - repeated calls, or permuted input, will yield different results. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - The input data to complete. - - Returns - ------- - Xt : array-like, shape (n_samples, n_features) - The imputed input data. - """ - check_is_fitted(self, 'initial_imputer_') - - if self.add_indicator: - X_trans_indicator = self.indicator_.transform(X) - - X, Xt, mask_missing_values = self._initial_imputation(X) - - if self.n_iter_ == 0 or np.all(mask_missing_values): - return Xt - - imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ - i_rnd = 0 - if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" - % (X.shape,)) - start_t = time() - for it, estimator_triplet in enumerate(self.imputation_sequence_): - Xt, _ = self._impute_one_feature( - Xt, - mask_missing_values, - estimator_triplet.feat_idx, - estimator_triplet.neighbor_feat_idx, - estimator=estimator_triplet.estimator, - fit_mode=False - ) - if not (it + 1) % imputations_per_round: - if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' - '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter_, time() - start_t)) - i_rnd += 1 - - Xt[~mask_missing_values] = X[~mask_missing_values] - - if self.add_indicator: - Xt = np.hstack((Xt, X_trans_indicator)) - return Xt - - def fit(self, X, y=None): - """Fits the imputer on X and return self. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - y : ignored - - Returns - ------- - self : object - Returns self. 
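# Sketch of the inductive usage implied above (illustrative data, not from the
# patch): the estimators stored in imputation_sequence_ during fit are replayed
# in order, without refitting, when transform is called on new data.
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

rng = np.random.RandomState(0)
X_train = rng.rand(30, 3)
X_train[rng.rand(30, 3) < 0.2] = np.nan
X_test = rng.rand(5, 3)
X_test[0, 1] = np.nan
imp = IterativeImputer(random_state=0).fit(X_train)
print(imp.transform(X_test))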
- """ - self.fit_transform(X) - return self - - def _more_tags(self): - return {'allow_nan': True} - - -class MissingIndicator(BaseEstimator, TransformerMixin): - """Binary indicators for missing values. - - Note that this component typically should not be used in a vanilla - :class:`Pipeline` consisting of transformers and a classifier, but rather - could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - missing_values : number, string, np.nan (default) or None - The placeholder for the missing values. All occurrences of - `missing_values` will be indicated (True in the output array), the - other values will be marked as False. - - features : str, optional - Whether the imputer mask should represent all or a subset of - features. - - - If "missing-only" (default), the imputer mask will only represent - features containing missing values during fit time. - - If "all", the imputer mask will represent all features. - - sparse : boolean or "auto", optional - Whether the imputer mask format should be sparse or dense. - - - If "auto" (default), the imputer mask will be of same type as - input. - - If True, the imputer mask will be a sparse matrix. - - If False, the imputer mask will be a numpy array. - - error_on_new : boolean, optional - If True (default), transform will raise an error when there are - features with missing values in transform that have no missing values - in fit. This is applicable only when ``features="missing-only"``. - - Attributes - ---------- - features_ : ndarray, shape (n_missing_features,) or (n_features,) - The features indices which will be returned when calling ``transform``. - They are computed during ``fit``. For ``features='all'``, it is - to ``range(n_features)``. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import MissingIndicator - >>> X1 = np.array([[np.nan, 1, 3], - ... [4, 0, np.nan], - ... [8, 1, 0]]) - >>> X2 = np.array([[5, 1, np.nan], - ... [np.nan, 2, 3], - ... [2, 4, 0]]) - >>> indicator = MissingIndicator() - >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE - MissingIndicator(error_on_new=True, features='missing-only', - missing_values=nan, sparse='auto') - >>> X2_tr = indicator.transform(X2) - >>> X2_tr - array([[False, True], - [ True, False], - [False, False]]) - - """ - - def __init__(self, missing_values=np.nan, features="missing-only", - sparse="auto", error_on_new=True): - self.missing_values = missing_values - self.features = features - self.sparse = sparse - self.error_on_new = error_on_new - - def _get_missing_features_info(self, X): - """Compute the imputer mask and the indices of the features - containing missing values. - - Parameters - ---------- - X : {ndarray or sparse matrix}, shape (n_samples, n_features) - The input data with missing values. Note that ``X`` has been - checked in ``fit`` and ``transform`` before to call this function. - - Returns - ------- - imputer_mask : {ndarray or sparse matrix}, shape \ -(n_samples, n_features) or (n_samples, n_features_with_missing) - The imputer mask of the original data. - - features_with_missing : ndarray, shape (n_features_with_missing) - The features containing missing values. - - """ - if sparse.issparse(X): - mask = _get_mask(X.data, self.missing_values) - - # The imputer mask will be constructed with the same sparse format - # as X. 
- sparse_constructor = (sparse.csr_matrix if X.format == 'csr' - else sparse.csc_matrix) - imputer_mask = sparse_constructor( - (mask, X.indices.copy(), X.indptr.copy()), - shape=X.shape, dtype=bool) - imputer_mask.eliminate_zeros() - - if self.features == 'missing-only': - n_missing = imputer_mask.getnnz(axis=0) - - if self.sparse is False: - imputer_mask = imputer_mask.toarray() - elif imputer_mask.format == 'csr': - imputer_mask = imputer_mask.tocsc() - else: - imputer_mask = _get_mask(X, self.missing_values) - - if self.features == 'missing-only': - n_missing = imputer_mask.sum(axis=0) - - if self.sparse is True: - imputer_mask = sparse.csc_matrix(imputer_mask) - - if self.features == 'all': - features_indices = np.arange(X.shape[1]) - else: - features_indices = np.flatnonzero(n_missing) - - return imputer_mask, features_indices - - def _validate_input(self, X): - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) - if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("MissingIndicator does not support data with " - "dtype {0}. Please provide either a numeric array" - " (with a floating point or integer dtype) or " - "categorical data represented either as an array " - "with integer dtype or an array of string values " - "with an object dtype.".format(X.dtype)) - - if sparse.issparse(X) and self.missing_values == 0: - # missing_values = 0 not allowed with sparse data as it would - # force densification - raise ValueError("Sparse input with missing_values=0 is " - "not supported. Provide a dense " - "array instead.") - - return X - - def fit(self, X, y=None): - """Fit the transformer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : object - Returns self. - """ - X = self._validate_input(X) - self._n_features = X.shape[1] - - if self.features not in ('missing-only', 'all'): - raise ValueError("'features' has to be either 'missing-only' or " - "'all'. Got {} instead.".format(self.features)) - - if not ((isinstance(self.sparse, str) and - self.sparse == "auto") or isinstance(self.sparse, bool)): - raise ValueError("'sparse' has to be a boolean or 'auto'. " - "Got {!r} instead.".format(self.sparse)) - - self.features_ = self._get_missing_features_info(X)[1] - - return self - - def transform(self, X): - """Generate missing values indicator for X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - - Returns - ------- - Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. The data type of ``Xt`` - will be boolean. 
- - """ - check_is_fitted(self, "features_") - X = self._validate_input(X) - - if X.shape[1] != self._n_features: - raise ValueError("X has a different number of features " - "than during fitting.") - - imputer_mask, features = self._get_missing_features_info(X) - - if self.features == "missing-only": - features_diff_fit_trans = np.setdiff1d(features, self.features_) - if (self.error_on_new and features_diff_fit_trans.size > 0): - raise ValueError("The features {} have missing values " - "in transform but have no missing values " - "in fit.".format(features_diff_fit_trans)) - - if self.features_.size < self._n_features: - imputer_mask = imputer_mask[:, self.features_] - - return imputer_mask - - def fit_transform(self, X, y=None): - """Generate missing values indicator for X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - - Returns - ------- - Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. The data type of ``Xt`` - will be boolean. - - """ - return self.fit(X, y).transform(X) - - def _more_tags(self): - return {'allow_nan': True, - 'X_types': ['2darray', 'str']} diff --git a/sklearn/impute/__init__.py b/sklearn/impute/__init__.py new file mode 100644 index 0000000000000..abeb4d471f5f3 --- /dev/null +++ b/sklearn/impute/__init__.py @@ -0,0 +1,8 @@ +"""Transformers for missing value imputation""" + +from ._base import MissingIndicator, SimpleImputer + +__all__ = [ + 'MissingIndicator', + 'SimpleImputer', +] diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py new file mode 100644 index 0000000000000..7be9da691ce11 --- /dev/null +++ b/sklearn/impute/_base.py @@ -0,0 +1,675 @@ +# Authors: Nicolas Tresegnie +# Sergey Feldman +# License: BSD 3 clause + +from __future__ import division + +import warnings +import numbers + +import numpy as np +import numpy.ma as ma +from scipy import sparse +from scipy import stats + +from ..base import BaseEstimator, TransformerMixin +from ..utils.sparsefuncs import _get_median +from ..utils.validation import check_is_fitted +from ..utils.validation import FLOAT_DTYPES +from ..utils.fixes import _object_dtype_isnan +from ..utils import is_scalar_nan +from ..utils import check_array + + +def _check_inputs_dtype(X, missing_values): + if (X.dtype.kind in ("f", "i", "u") and + not isinstance(missing_values, numbers.Real)): + raise ValueError("'X' and 'missing_values' types are expected to be" + " both numerical. Got X.dtype={} and " + " type(missing_values)={}." + .format(X.dtype, type(missing_values))) + + +def _get_mask(X, value_to_mask): + """Compute the boolean mask X == missing_values.""" + if is_scalar_nan(value_to_mask): + if X.dtype.kind == "f": + return np.isnan(X) + elif X.dtype.kind in ("i", "u"): + # can't have NaNs in integer array. + return np.zeros(X.shape, dtype=bool) + else: + # np.isnan does not work on object dtypes. 
+ return _object_dtype_isnan(X) + else: + # X == value_to_mask with object dytpes does not always perform + # element-wise for old versions of numpy + return np.equal(X, value_to_mask) + + +def _most_frequent(array, extra_value, n_repeat): + """Compute the most frequent value in a 1d array extended with + [extra_value] * n_repeat, where extra_value is assumed to be not part + of the array.""" + # Compute the most frequent value in array only + if array.size > 0: + with warnings.catch_warnings(): + # stats.mode raises a warning when input array contains objects due + # to incapacity to detect NaNs. Irrelevant here since input array + # has already been NaN-masked. + warnings.simplefilter("ignore", RuntimeWarning) + mode = stats.mode(array) + + most_frequent_value = mode[0][0] + most_frequent_count = mode[1][0] + else: + most_frequent_value = 0 + most_frequent_count = 0 + + # Compare to array + [extra_value] * n_repeat + if most_frequent_count == 0 and n_repeat == 0: + return np.nan + elif most_frequent_count < n_repeat: + return extra_value + elif most_frequent_count > n_repeat: + return most_frequent_value + elif most_frequent_count == n_repeat: + # Ties the breaks. Copy the behaviour of scipy.stats.mode + if most_frequent_value < extra_value: + return most_frequent_value + else: + return extra_value + + +class SimpleImputer(BaseEstimator, TransformerMixin): + """Imputation transformer for completing missing values. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + missing_values : number, string, np.nan (default) or None + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. + + strategy : string, optional (default="mean") + The imputation strategy. + + - If "mean", then replace missing values using the mean along + each column. Can only be used with numeric data. + - If "median", then replace missing values using the median along + each column. Can only be used with numeric data. + - If "most_frequent", then replace missing using the most frequent + value along each column. Can be used with strings or numeric data. + - If "constant", then replace missing values with fill_value. Can be + used with strings or numeric data. + + .. versionadded:: 0.20 + strategy="constant" for fixed value imputation. + + fill_value : string or numerical value, optional (default=None) + When strategy == "constant", fill_value is used to replace all + occurrences of missing_values. + If left to the default, fill_value will be 0 when imputing numerical + data and "missing_value" for strings or object data types. + + verbose : integer, optional (default=0) + Controls the verbosity of the imputer. + + copy : boolean, optional (default=True) + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. Note that, in the following cases, + a new copy will always be made, even if `copy=False`: + + - If X is not an array of floating values; + - If X is encoded as a CSR matrix; + - If add_indicator=True. + + add_indicator : boolean, optional (default=False) + If True, a `MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + Attributes + ---------- + statistics_ : array of shape (n_features,) + The imputation fill value for each feature. 
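# Hedged example of the "constant" strategy and the fill_value default spelled
# out above (illustrative data, not part of the patch): for object/string data
# the default fill_value falls back to the string "missing_value".
import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([['a', np.nan], [np.nan, 'b']], dtype=object)
print(SimpleImputer(strategy='constant').fit_transform(X))
# [['a' 'missing_value']
#  ['missing_value' 'b']]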
+ + indicator_ : :class:`sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + ``None`` if add_indicator is False. + + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import SimpleImputer + >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + ... # doctest: +NORMALIZE_WHITESPACE + SimpleImputer(add_indicator=False, copy=True, fill_value=None, + missing_values=nan, strategy='mean', verbose=0) + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> print(imp_mean.transform(X)) + ... # doctest: +NORMALIZE_WHITESPACE + [[ 7. 2. 3. ] + [ 4. 3.5 6. ] + [10. 3.5 9. ]] + + Notes + ----- + Columns which only contained missing values at `fit` are discarded upon + `transform` if strategy is not "constant". + + """ + def __init__(self, missing_values=np.nan, strategy="mean", + fill_value=None, verbose=0, copy=True, add_indicator=False): + self.missing_values = missing_values + self.strategy = strategy + self.fill_value = fill_value + self.verbose = verbose + self.copy = copy + self.add_indicator = add_indicator + + def _validate_input(self, X): + allowed_strategies = ["mean", "median", "most_frequent", "constant"] + if self.strategy not in allowed_strategies: + raise ValueError("Can only use these strategies: {0} " + " got strategy={1}".format(allowed_strategies, + self.strategy)) + + if self.strategy in ("most_frequent", "constant"): + dtype = None + else: + dtype = FLOAT_DTYPES + + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" + + try: + X = check_array(X, accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, copy=self.copy) + except ValueError as ve: + if "could not convert" in str(ve): + raise ValueError("Cannot use {0} strategy with non-numeric " + "data. Received datatype :{1}." + "".format(self.strategy, X.dtype.kind)) + else: + raise ve + + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError("SimpleImputer does not support data with dtype " + "{0}. Please provide either a numeric array (with" + " a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype)) + + return X + + def fit(self, X, y=None): + """Fit the imputer on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : SimpleImputer + """ + X = self._validate_input(X) + + # default fill_value is 0 for numerical input and "missing_value" + # otherwise + if self.fill_value is None: + if X.dtype.kind in ("i", "u", "f"): + fill_value = 0 + else: + fill_value = "missing_value" + else: + fill_value = self.fill_value + + # fill_value should be numerical in case of numerical input + if (self.strategy == "constant" and + X.dtype.kind in ("i", "u", "f") and + not isinstance(fill_value, numbers.Real)): + raise ValueError("'fill_value'={0} is invalid. 
Expected a " + "numerical value when imputing numerical " + "data".format(fill_value)) + + if sparse.issparse(X): + # missing_values = 0 not allowed with sparse data as it would + # force densification + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead.") + else: + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + fill_value) + else: + self.statistics_ = self._dense_fit(X, + self.strategy, + self.missing_values, + fill_value) + + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values) + self.indicator_.fit(X) + else: + self.indicator_ = None + + return self + + def _sparse_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on sparse data.""" + mask_data = _get_mask(X.data, missing_values) + n_implicit_zeros = X.shape[0] - np.diff(X.indptr) + + statistics = np.empty(X.shape[1]) + + if strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + statistics.fill(fill_value) + else: + for i in range(X.shape[1]): + column = X.data[X.indptr[i]:X.indptr[i + 1]] + mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] + column = column[~mask_column] + + # combine explicit and implicit zeros + mask_zeros = _get_mask(column, 0) + column = column[~mask_zeros] + n_explicit_zeros = mask_zeros.sum() + n_zeros = n_implicit_zeros[i] + n_explicit_zeros + + if strategy == "mean": + s = column.size + n_zeros + statistics[i] = np.nan if s == 0 else column.sum() / s + + elif strategy == "median": + statistics[i] = _get_median(column, + n_zeros) + + elif strategy == "most_frequent": + statistics[i] = _most_frequent(column, + 0, + n_zeros) + return statistics + + def _dense_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on dense data.""" + mask = _get_mask(X, missing_values) + masked_X = ma.masked_array(X, mask=mask) + + # Mean + if strategy == "mean": + mean_masked = np.ma.mean(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + mean = np.ma.getdata(mean_masked) + mean[np.ma.getmask(mean_masked)] = np.nan + + return mean + + # Median + elif strategy == "median": + median_masked = np.ma.median(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + median = np.ma.getdata(median_masked) + median[np.ma.getmaskarray(median_masked)] = np.nan + + return median + + # Most frequent + elif strategy == "most_frequent": + # scipy.stats.mstats.mode cannot be used because it will no work + # properly if the first element is masked and if its frequency + # is equal to the frequency of the most frequent valid element + # See https://github.com/scipy/scipy/issues/2636 + + # To be able access the elements by columns + X = X.transpose() + mask = mask.transpose() + + if X.dtype.kind == "O": + most_frequent = np.empty(X.shape[0], dtype=object) + else: + most_frequent = np.empty(X.shape[0]) + + for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): + row_mask = np.logical_not(row_mask).astype(np.bool) + row = row[row_mask] + most_frequent[i] = _most_frequent(row, np.nan, 0) + + return most_frequent + + # Constant + elif strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + return np.full(X.shape[1], fill_value, dtype=X.dtype) + + def transform(self, X): + """Impute all missing values in X. 
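# Illustration of why _sparse_fit above adds implicit zeros back in
# (illustrative data, not part of the patch): the "mean" statistic must count
# the zeros that the CSC matrix does not store explicitly.
import numpy as np
from scipy import sparse
from sklearn.impute import SimpleImputer

X = sparse.csc_matrix(np.array([[0.0, np.nan], [4.0, 2.0], [0.0, 4.0]]))
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X)
print(imp.statistics_)  # approx [1.33, 3.0]; column 0 averages over its two implicit zeros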
+ + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data to complete. + """ + check_is_fitted(self, 'statistics_') + + X = self._validate_input(X) + + statistics = self.statistics_ + + if X.shape[1] != statistics.shape[0]: + raise ValueError("X has %d features per sample, expected %d" + % (X.shape[1], self.statistics_.shape[0])) + + if self.add_indicator: + X_trans_indicator = self.indicator_.transform(X) + + # Delete the invalid columns if strategy is not constant + if self.strategy == "constant": + valid_statistics = statistics + else: + # same as np.isnan but also works for object dtypes + invalid_mask = _get_mask(statistics, np.nan) + valid_mask = np.logical_not(invalid_mask) + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.flatnonzero(valid_mask) + + if invalid_mask.any(): + missing = np.arange(X.shape[1])[invalid_mask] + if self.verbose: + warnings.warn("Deleting features without " + "observed values: %s" % missing) + X = X[:, valid_statistics_indexes] + + # Do actual imputation + if sparse.issparse(X): + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead.") + else: + mask = _get_mask(X.data, self.missing_values) + indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), + np.diff(X.indptr))[mask] + + X.data[mask] = valid_statistics[indexes].astype(X.dtype, + copy=False) + else: + mask = _get_mask(X, self.missing_values) + n_missing = np.sum(mask, axis=0) + values = np.repeat(valid_statistics, n_missing) + coordinates = np.where(mask.transpose())[::-1] + + X[coordinates] = values + + if self.add_indicator: + hstack = sparse.hstack if sparse.issparse(X) else np.hstack + X = hstack((X, X_trans_indicator)) + + return X + + def _more_tags(self): + return {'allow_nan': True} + + +class MissingIndicator(BaseEstimator, TransformerMixin): + """Binary indicators for missing values. + + Note that this component typically should not be used in a vanilla + :class:`Pipeline` consisting of transformers and a classifier, but rather + could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + missing_values : number, string, np.nan (default) or None + The placeholder for the missing values. All occurrences of + `missing_values` will be indicated (True in the output array), the + other values will be marked as False. + + features : str, optional + Whether the imputer mask should represent all or a subset of + features. + + - If "missing-only" (default), the imputer mask will only represent + features containing missing values during fit time. + - If "all", the imputer mask will represent all features. + + sparse : boolean or "auto", optional + Whether the imputer mask format should be sparse or dense. + + - If "auto" (default), the imputer mask will be of same type as + input. + - If True, the imputer mask will be a sparse matrix. + - If False, the imputer mask will be a numpy array. + + error_on_new : boolean, optional + If True (default), transform will raise an error when there are + features with missing values in transform that have no missing values + in fit. This is applicable only when ``features="missing-only"``. + + Attributes + ---------- + features_ : ndarray, shape (n_missing_features,) or (n_features,) + The features indices which will be returned when calling ``transform``. 
+ They are computed during ``fit``. For ``features='all'``, it is + to ``range(n_features)``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import MissingIndicator + >>> X1 = np.array([[np.nan, 1, 3], + ... [4, 0, np.nan], + ... [8, 1, 0]]) + >>> X2 = np.array([[5, 1, np.nan], + ... [np.nan, 2, 3], + ... [2, 4, 0]]) + >>> indicator = MissingIndicator() + >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE + MissingIndicator(error_on_new=True, features='missing-only', + missing_values=nan, sparse='auto') + >>> X2_tr = indicator.transform(X2) + >>> X2_tr + array([[False, True], + [ True, False], + [False, False]]) + + """ + + def __init__(self, missing_values=np.nan, features="missing-only", + sparse="auto", error_on_new=True): + self.missing_values = missing_values + self.features = features + self.sparse = sparse + self.error_on_new = error_on_new + + def _get_missing_features_info(self, X): + """Compute the imputer mask and the indices of the features + containing missing values. + + Parameters + ---------- + X : {ndarray or sparse matrix}, shape (n_samples, n_features) + The input data with missing values. Note that ``X`` has been + checked in ``fit`` and ``transform`` before to call this function. + + Returns + ------- + imputer_mask : {ndarray or sparse matrix}, shape \ +(n_samples, n_features) or (n_samples, n_features_with_missing) + The imputer mask of the original data. + + features_with_missing : ndarray, shape (n_features_with_missing) + The features containing missing values. + + """ + if sparse.issparse(X): + mask = _get_mask(X.data, self.missing_values) + + # The imputer mask will be constructed with the same sparse format + # as X. + sparse_constructor = (sparse.csr_matrix if X.format == 'csr' + else sparse.csc_matrix) + imputer_mask = sparse_constructor( + (mask, X.indices.copy(), X.indptr.copy()), + shape=X.shape, dtype=bool) + imputer_mask.eliminate_zeros() + + if self.features == 'missing-only': + n_missing = imputer_mask.getnnz(axis=0) + + if self.sparse is False: + imputer_mask = imputer_mask.toarray() + elif imputer_mask.format == 'csr': + imputer_mask = imputer_mask.tocsc() + else: + imputer_mask = _get_mask(X, self.missing_values) + + if self.features == 'missing-only': + n_missing = imputer_mask.sum(axis=0) + + if self.sparse is True: + imputer_mask = sparse.csc_matrix(imputer_mask) + + if self.features == 'all': + features_indices = np.arange(X.shape[1]) + else: + features_indices = np.flatnonzero(n_missing) + + return imputer_mask, features_indices + + def _validate_input(self, X): + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError("MissingIndicator does not support data with " + "dtype {0}. Please provide either a numeric array" + " (with a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype)) + + if sparse.issparse(X) and self.missing_values == 0: + # missing_values = 0 not allowed with sparse data as it would + # force densification + raise ValueError("Sparse input with missing_values=0 is " + "not supported. Provide a dense " + "array instead.") + + return X + + def fit(self, X, y=None): + """Fit the transformer on X. 
+ + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : object + Returns self. + """ + X = self._validate_input(X) + self._n_features = X.shape[1] + + if self.features not in ('missing-only', 'all'): + raise ValueError("'features' has to be either 'missing-only' or " + "'all'. Got {} instead.".format(self.features)) + + if not ((isinstance(self.sparse, str) and + self.sparse == "auto") or isinstance(self.sparse, bool)): + raise ValueError("'sparse' has to be a boolean or 'auto'. " + "Got {!r} instead.".format(self.sparse)) + + self.features_ = self._get_missing_features_info(X)[1] + + return self + + def transform(self, X): + """Generate missing values indicator for X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) + The missing indicator for input data. The data type of ``Xt`` + will be boolean. + + """ + check_is_fitted(self, "features_") + X = self._validate_input(X) + + if X.shape[1] != self._n_features: + raise ValueError("X has a different number of features " + "than during fitting.") + + imputer_mask, features = self._get_missing_features_info(X) + + if self.features == "missing-only": + features_diff_fit_trans = np.setdiff1d(features, self.features_) + if (self.error_on_new and features_diff_fit_trans.size > 0): + raise ValueError("The features {} have missing values " + "in transform but have no missing values " + "in fit.".format(features_diff_fit_trans)) + + if self.features_.size < self._n_features: + imputer_mask = imputer_mask[:, self.features_] + + return imputer_mask + + def fit_transform(self, X, y=None): + """Generate missing values indicator for X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) + The missing indicator for input data. The data type of ``Xt`` + will be boolean. + + """ + return self.fit(X, y).transform(X) + + def _more_tags(self): + return {'allow_nan': True, + 'X_types': ['2darray', 'str']} diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py new file mode 100644 index 0000000000000..40df3f4059c04 --- /dev/null +++ b/sklearn/impute/_iterative.py @@ -0,0 +1,680 @@ + +from time import time +from distutils.version import LooseVersion +from collections import namedtuple +import warnings + +import scipy +from scipy import stats +import numpy as np + +from ..base import clone, BaseEstimator, TransformerMixin +from ..exceptions import ConvergenceWarning +from ..preprocessing import normalize +from ..utils import check_array, check_random_state, safe_indexing +from ..utils.validation import FLOAT_DTYPES, check_is_fitted +from ..utils import is_scalar_nan + +from ._base import (_get_mask, MissingIndicator, SimpleImputer, + _check_inputs_dtype) + + +_ImputerTriplet = namedtuple('_ImputerTriplet', ['feat_idx', + 'neighbor_feat_idx', + 'estimator']) + + +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. + + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. 
+ + Read more in the :ref:`User Guide `. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_iterative_imputer``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from sklearn.impute + >>> from sklearn.impute import IterativeImputer + + Parameters + ---------- + estimator : estimator object, default=BayesianRidge() + The estimator to use at each step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support + ``return_std`` in its ``predict`` method. + + missing_values : int, np.nan, optional (default=np.nan) + The placeholder for the missing values. All occurrences of + ``missing_values`` will be imputed. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted estimator for each imputation. Estimator must support + ``return_std`` in its ``predict`` method if set to ``True``. Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. + + max_iter : int, optional (default=10) + Maximum number of imputation rounds to perform before returning the + imputations computed during the final round. A round is a single + imputation of each feature with missing values. The stopping criterion + is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, + where `X_t` is `X` at iteration `t. Note that early stopping is only + applied if ``sample_posterior=False``. + + tol : float, optional (default=1e-3) + Tolerance of the stopping condition. + + n_nearest_features : int, optional (default=None) + Number of other features to use to estimate the missing values of + each feature column. Nearness between features is measured using + the absolute correlation coefficient between each feature pair (after + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. + + initial_strategy : str, optional (default="mean") + Which strategy to use to initialize the missing values. Same as the + ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` + Valid values: {"mean", "median", "most_frequent", or "constant"}. + + imputation_order : str, optional (default="ascending") + The order in which the features will be imputed. Possible values: + + "ascending" + From features with fewest missing values to most. + "descending" + From features with most missing values to fewest. + "roman" + Left to right. + "arabic" + Right to left. + "random" + A random order for each round. + + min_value : float, optional (default=None) + Minimum possible imputed value. Default of ``None`` will set minimum + to negative infinity. + + max_value : float, optional (default=None) + Maximum possible imputed value. Default of ``None`` will set maximum + to positive infinity. + + verbose : int, optional (default=0) + Verbosity flag, controls the debug messages that are issued + as functions are evaluated. The higher, the more verbose. Can be 0, 1, + or 2. 
+ + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use. Randomizes + selection of estimator features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary `. + + add_indicator : boolean, optional (default=False) + If True, a `MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + Attributes + ---------- + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. + + imputation_sequence_ : list of tuples + Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where + ``feat_idx`` is the current feature to be imputed, + ``neighbor_feat_idx`` is the array of other features used to impute the + current feature, and ``estimator`` is the trained estimator used for + the imputation. Length is ``self.n_features_with_missing_ * + self.n_iter_``. + + n_iter_ : int + Number of iteration rounds that occurred. Will be less than + ``self.max_iter`` if early stopping criterion was reached. + + n_features_with_missing_ : int + Number of features with missing values. + + indicator_ : :class:`sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + ``None`` if add_indicator is False. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. + + Notes + ----- + To support imputation in inductive mode we store each feature's estimator + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. + + Features which contain all missing values at ``fit`` are discarded upon + ``transform``. + + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. + + References + ---------- + .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: + Multivariate Imputation by Chained Equations in R". Journal of + Statistical Software 45: 1-67. + `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. 
+ `_ + """ + + def __init__(self, + estimator=None, + missing_values=np.nan, + sample_posterior=False, + max_iter=10, + tol=1e-3, + n_nearest_features=None, + initial_strategy="mean", + imputation_order='ascending', + min_value=None, + max_value=None, + verbose=0, + random_state=None, + add_indicator=False): + + self.estimator = estimator + self.missing_values = missing_values + self.sample_posterior = sample_posterior + self.max_iter = max_iter + self.tol = tol + self.n_nearest_features = n_nearest_features + self.initial_strategy = initial_strategy + self.imputation_order = imputation_order + self.min_value = min_value + self.max_value = max_value + self.verbose = verbose + self.random_state = random_state + self.add_indicator = add_indicator + + def _impute_one_feature(self, + X_filled, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + estimator=None, + fit_mode=True): + """Impute a single feature from the others provided. + + This function predicts the missing values of one of the features using + the current estimates of all the other features. The ``estimator`` must + support ``return_std=True`` in its ``predict`` method for this function + to work. + + Parameters + ---------- + X_filled : ndarray + Input data with the most recent imputations. + + mask_missing_values : ndarray + Input data's missing indicator matrix. + + feat_idx : int + Index of the feature currently being imputed. + + neighbor_feat_idx : ndarray + Indices of the features to be used in imputing ``feat_idx``. + + estimator : object + The estimator to use at this step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support + ``return_std`` in its ``predict`` method. + If None, it will be cloned from self._estimator. + + fit_mode : boolean, default=True + Whether to fit and predict with the estimator or just predict. + + Returns + ------- + X_filled : ndarray + Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. + + estimator : estimator with sklearn API + The fitted estimator used to impute + ``X_filled[missing_row_mask, feat_idx]``. 
+ """ + + # if nothing is missing, just return the default + # (should not happen at fit time because feat_ids would be excluded) + missing_row_mask = mask_missing_values[:, feat_idx] + if not np.any(missing_row_mask): + return X_filled, estimator + + if estimator is None and fit_mode is False: + raise ValueError("If fit_mode is False, then an already-fitted " + "estimator should be passed in.") + + if estimator is None: + estimator = clone(self._estimator) + + if fit_mode: + X_train = safe_indexing(X_filled[:, neighbor_feat_idx], + ~missing_row_mask) + y_train = safe_indexing(X_filled[:, feat_idx], + ~missing_row_mask) + estimator.fit(X_train, y_train) + + # get posterior samples + X_test = safe_indexing(X_filled[:, neighbor_feat_idx], + missing_row_mask) + if self.sample_posterior: + mus, sigmas = estimator.predict(X_test, return_std=True) + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + # two types of problems: (1) non-positive sigmas, (2) mus outside + # legal range of min_value and max_value (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value + imputed_values[mus_too_low] = self._min_value + mus_too_high = mus > self._max_value + imputed_values[mus_too_high] = self._max_value + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] + a = (self._min_value - mus) / sigmas + b = (self._max_value - mus) / sigmas + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[inrange_mask] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[inrange_mask] = truncated_normal.rvs( + random_state=self.random_state_) + else: + imputed_values = estimator.predict(X_test) + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) + + # update the feature + X_filled[missing_row_mask, feat_idx] = imputed_values + return X_filled, estimator + + def _get_neighbor_feat_idx(self, + n_features, + feat_idx, + abs_corr_mat): + """Get a list of other features to predict ``feat_idx``. + + If self.n_nearest_features is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between ``feat_idx`` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in ``X``. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X``. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute ``feat_idx``. 
+ """ + if (self.n_nearest_features is not None and + self.n_nearest_features < n_features): + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, + p=p) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. + """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + missing_values_idx = np.nonzero(frac_of_missing_values)[0] + if self.imputation_order == 'roman': + ordered_idx = missing_values_idx + elif self.imputation_order == 'arabic': + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == 'ascending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:][::-1] + elif self.imputation_order == 'descending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:] + elif self.imputation_order == 'random': + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + else: + raise ValueError("Got an invalid imputation order: '{0}'. It must " + "be one of the following: 'roman', 'arabic', " + "'ascending', 'descending', or " + "'random'.".format(self.imputation_order)) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, optional (default=1e-6) + ``abs_corr_mat`` can have nans, which will be replaced + with ``tolerance``. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X`` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. + """ + n_features = X_filled.shape[1] + if (self.n_nearest_features is None or + self.n_nearest_features >= n_features): + return None + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X): + """Perform initial imputation for input X. 
+ + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + Returns + ------- + Xt : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + mask_missing_values : ndarray, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + """ + if is_scalar_nan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + X = check_array(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + + mask_missing_values = _get_mask(X, self.missing_values) + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.transform(X) + + valid_mask = np.flatnonzero(np.logical_not( + np.isnan(self.initial_imputer_.statistics_))) + Xt = X[:, valid_mask] + mask_missing_values = mask_missing_values[:, valid_mask] + + return Xt, X_filled, mask_missing_values + + def fit_transform(self, X, y=None): + """Fits the imputer on X and return the transformed X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + self.random_state_ = getattr(self, "random_state_", + check_random_state(self.random_state)) + + if self.max_iter < 0: + raise ValueError( + "'max_iter' should be a positive integer. Got {} instead." + .format(self.max_iter)) + + if self.tol < 0: + raise ValueError( + "'tol' should be a non-negative float. Got {} instead." + .format(self.tol) + ) + + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values) + X_trans_indicator = self.indicator_.fit_transform(X) + else: + self.indicator_ = None + + if self.estimator is None: + from ..linear_model import BayesianRidge + self._estimator = BayesianRidge() + else: + self._estimator = clone(self.estimator) + + self.imputation_sequence_ = [] + + if hasattr(self._estimator, 'random_state'): + self._estimator.random_state = self.random_state_ + + self._min_value = -np.inf if self.min_value is None else self.min_value + self._max_value = np.inf if self.max_value is None else self.max_value + + self.initial_imputer_ = None + X, Xt, mask_missing_values = self._initial_imputation(X) + + if self.max_iter == 0 or np.all(mask_missing_values): + self.n_iter_ = 0 + return Xt + + # order in which to impute + # note this is probably too slow for large feature data (d > 100000) + # and a better way would be good. 
+ # see: https://goo.gl/KyCNwj and subsequent comments + ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) + + abs_corr_mat = self._get_abs_corr_mat(Xt) + + n_samples, n_features = Xt.shape + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + if not self.sample_posterior: + Xt_previous = Xt.copy() + normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) + for self.n_iter_ in range(1, self.max_iter + 1): + if self.imputation_order == 'random': + ordered_idx = self._get_ordered_idx(mask_missing_values) + + for feat_idx in ordered_idx: + neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, + feat_idx, + abs_corr_mat) + Xt, estimator = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, + estimator=None, fit_mode=True) + estimator_triplet = _ImputerTriplet(feat_idx, + neighbor_feat_idx, + estimator) + self.imputation_sequence_.append(estimator_triplet) + + if self.verbose > 1: + print('[IterativeImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (self.n_iter_, self.max_iter, time() - start_t)) + + if not self.sample_posterior: + inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, + axis=None) + if inf_norm < normalized_tol: + if self.verbose > 0: + print('[IterativeImputer] Early stopping criterion ' + 'reached.') + break + Xt_previous = Xt.copy() + else: + if not self.sample_posterior: + warnings.warn("[IterativeImputer] Early stopping criterion not" + " reached.", ConvergenceWarning) + Xt[~mask_missing_values] = X[~mask_missing_values] + + if self.add_indicator: + Xt = np.hstack((Xt, X_trans_indicator)) + return Xt + + def transform(self, X): + """Imputes all missing values in X. + + Note that this is stochastic, and that if random_state is not fixed, + repeated calls, or permuted input, will yield different results. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The input data to complete. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + check_is_fitted(self, 'initial_imputer_') + + if self.add_indicator: + X_trans_indicator = self.indicator_.transform(X) + + X, Xt, mask_missing_values = self._initial_imputation(X) + + if self.n_iter_ == 0 or np.all(mask_missing_values): + return Xt + + imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ + i_rnd = 0 + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + for it, estimator_triplet in enumerate(self.imputation_sequence_): + Xt, _ = self._impute_one_feature( + Xt, + mask_missing_values, + estimator_triplet.feat_idx, + estimator_triplet.neighbor_feat_idx, + estimator=estimator_triplet.estimator, + fit_mode=False + ) + if not (it + 1) % imputations_per_round: + if self.verbose > 1: + print('[IterativeImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (i_rnd + 1, self.n_iter_, time() - start_t)) + i_rnd += 1 + + Xt[~mask_missing_values] = X[~mask_missing_values] + + if self.add_indicator: + Xt = np.hstack((Xt, X_trans_indicator)) + return Xt + + def fit(self, X, y=None): + """Fits the imputer on X and return self. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored + + Returns + ------- + self : object + Returns self. 
+ """ + self.fit_transform(X) + return self + + def _more_tags(self): + return {'allow_nan': True} diff --git a/sklearn/impute/tests/__init__.py b/sklearn/impute/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/tests/test_impute.py b/sklearn/impute/tests/test_impute.py similarity index 99% rename from sklearn/tests/test_impute.py rename to sklearn/impute/tests/test_impute.py index 979140ba246cf..1552031ff2193 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -13,6 +13,9 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal +# make IterativeImputer available +from sklearn.experimental import enable_iterative_imputer # noqa + from sklearn.impute import MissingIndicator from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 8798fb459ec74..3191dcd7a1352 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -22,7 +22,6 @@ from ..tree._tree import DTYPE from ..exceptions import NotFittedError from ..ensemble.gradient_boosting import BaseGradientBoosting -from ..ensemble._gradient_boosting import _partial_dependence_tree __all__ = ['partial_dependence', 'plot_partial_dependence'] @@ -105,14 +104,14 @@ def _partial_dependence_recursion(est, grid, features): grid = np.asarray(grid, dtype=DTYPE, order='C') n_estimators, n_trees_per_stage = est.estimators_.shape - learning_rate = est.learning_rate averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, order='C') for stage in range(n_estimators): for k in range(n_trees_per_stage): tree = est.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, features, - learning_rate, averaged_predictions[k]) + tree.compute_partial_dependence(grid, features, + averaged_predictions[k]) + averaged_predictions *= est.learning_rate return averaged_predictions @@ -356,7 +355,7 @@ def partial_dependence(estimator, X, features, response_method='auto', features) # reshape averaged_predictions to - # (n_outputs, n_values_feature_0, # n_values_feature_1, ...) + # (n_outputs, n_values_feature_0, n_values_feature_1, ...) averaged_predictions = averaged_predictions.reshape( -1, *[val.shape[0] for val in values]) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index d2d3c7818e448..b90b76c4220f3 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -27,7 +27,6 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import if_matplotlib # toy sample @@ -396,11 +395,8 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 -@if_matplotlib -def test_plot_partial_dependence(): +def test_plot_partial_dependence(pyplot): # Test partial dependence plot function. 
- import matplotlib.pyplot as plt # noqa - boston = load_boston() clf = GradientBoostingRegressor(n_estimators=10, random_state=1) clf.fit(boston.data, boston.target) @@ -409,7 +405,7 @@ def test_plot_partial_dependence(): plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], grid_resolution=grid_resolution, feature_names=boston.feature_names) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 3 assert all(ax.has_data for ax in axs) @@ -420,7 +416,7 @@ def test_plot_partial_dependence(): grid_resolution=grid_resolution, feature_names=boston.feature_names) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 3 assert all(ax.has_data for ax in axs) @@ -431,18 +427,14 @@ def test_plot_partial_dependence(): ('CRIM', 'ZN')], grid_resolution=grid_resolution, feature_names=feature_names) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 3 assert all(ax.has_data for ax in axs) - plt.close('all') - -@if_matplotlib -def test_plot_partial_dependence_multiclass(): +def test_plot_partial_dependence_multiclass(pyplot): # Test partial dependence plot function on multi-class input. - import matplotlib.pyplot as plt # noqa iris = load_iris() clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, iris.target) @@ -451,7 +443,7 @@ def test_plot_partial_dependence_multiclass(): plot_partial_dependence(clf, iris.data, [0, 1], target=0, grid_resolution=grid_resolution) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) @@ -465,18 +457,14 @@ def test_plot_partial_dependence_multiclass(): plot_partial_dependence(clf, iris.data, [0, 1], target='setosa', grid_resolution=grid_resolution) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) - plt.close('all') - -@if_matplotlib -def test_plot_partial_dependence_multioutput(): +def test_plot_partial_dependence_multioutput(pyplot): # Test partial dependence plot function on multi-output input. 
- import matplotlib.pyplot as plt # noqa (X, y), _ = multioutput_regression_data clf = LinearRegression() clf.fit(X, y) @@ -485,7 +473,7 @@ def test_plot_partial_dependence_multioutput(): plot_partial_dependence(clf, X, [0, 1], target=0, grid_resolution=grid_resolution) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) @@ -493,15 +481,12 @@ def test_plot_partial_dependence_multioutput(): plot_partial_dependence(clf, X, [0, 1], target=1, grid_resolution=grid_resolution) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) - plt.close('all') - -@if_matplotlib @pytest.mark.parametrize( "data, params, err_msg", [(multioutput_regression_data[0], {"target": None, 'features': [0]}, @@ -531,32 +516,23 @@ def test_plot_partial_dependence_multioutput(): ) @pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 @pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22 -def test_plot_partial_dependence_error(data, params, err_msg): - import matplotlib.pyplot as plt # noqa +def test_plot_partial_dependence_error(pyplot, data, params, err_msg): X, y = data estimator = LinearRegression().fit(X, y) with pytest.raises(ValueError, match=err_msg): plot_partial_dependence(estimator, X, **params) - plt.close() - -@if_matplotlib -def test_plot_partial_dependence_fig(): +def test_plot_partial_dependence_fig(pyplot): # Make sure fig object is correctly used if not None - - import matplotlib.pyplot as plt - (X, y), _ = regression_data clf = LinearRegression() clf.fit(X, y) - fig = plt.figure() + fig = pyplot.figure() grid_resolution = 25 plot_partial_dependence( clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) - assert plt.gcf() is fig - - plt.close() + assert pyplot.gcf() is fig diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 3b8f74a946699..7cff336715322 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -209,6 +209,15 @@ class IsotonicRegression(BaseEstimator, TransformerMixin, RegressorMixin): Correctness of Kruskal's algorithms for monotone regression with ties Leeuw, Psychometrica, 1977 + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.isotonic import IsotonicRegression + >>> X, y = make_regression(n_samples=10, n_features=1, random_state=41) + >>> iso_reg = IsotonicRegression().fit(X.flatten(), y) + >>> iso_reg.predict([.1, .2]) # doctest: +ELLIPSIS + array([1.8628..., 3.7256...]) """ def __init__(self, y_min=None, y_max=None, increasing=True, out_of_bounds='nan'): diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index aeb5fd45f413f..17a5247d5ab20 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -27,7 +27,7 @@ class KernelRidge(BaseEstimator, RegressorMixin, MultiOutputMixin): squared error loss while support vector regression uses epsilon-insensitive loss, both combined with l2 regularization. In contrast to SVR, fitting a KRR model can be done in closed-form and is typically faster for - medium-sized datasets. On the other hand, the learned model is non-sparse + medium-sized datasets. On the other hand, the learned model is non-sparse and thus slower than SVR, which learns a sparse model for epsilon > 0, at prediction-time. 
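The partial dependence tests above now receive a ``pyplot`` pytest fixture instead of using the ``@if_matplotlib`` decorator, importing ``matplotlib.pyplot`` by hand and calling ``plt.close('all')`` at the end of each test. The fixture itself is not part of this diff; a minimal sketch of what such a fixture could look like, assuming it lives in a ``conftest.py`` (the project's actual fixture may differ in its details), is:

    import pytest

    @pytest.fixture
    def pyplot():
        # Sketch only: skip the test when matplotlib is unavailable and make
        # sure all figures are closed afterwards, replacing the explicit
        # plt.close('all') calls removed in the test changes above.
        pyplot = pytest.importorskip("matplotlib.pyplot")
        yield pyplot
        pyplot.close("all")

Any test that takes ``pyplot`` as an argument is then skipped automatically when matplotlib is missing, and figure cleanup happens once in the fixture rather than in every test body.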
diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx index ad0fa4277f3be..fcbe46ce77711 100644 --- a/sklearn/linear_model/cd_fast.pyx +++ b/sklearn/linear_model/cd_fast.pyx @@ -24,7 +24,7 @@ from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2, from ..utils._cython_blas cimport RowMajor, ColMajor, Trans, NoTrans -from ..utils cimport _random +from ..utils._random cimport our_rand_r ctypedef np.float64_t DOUBLE ctypedef np.uint32_t UINT32_t @@ -42,7 +42,7 @@ cdef enum: cdef inline UINT32_t rand_int(UINT32_t end, UINT32_t* random_state) nogil: """Generate a random integer in [0; end).""" - return _random.our_rand_r(random_state) % end + return our_rand_r(random_state) % end cdef inline floating fmax(floating x, floating y) nogil: diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index ceccafd706101..b14188bff50c1 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -224,7 +224,7 @@ def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None, values output by lars_path Examples - --------- + -------- Comparing lasso_path and lars_path with interpolation: @@ -661,7 +661,7 @@ def fit(self, X, y, check_input=True): """Fit model with coordinate descent. Parameters - ----------- + ---------- X : ndarray or scipy.sparse matrix, (n_samples, n_features) Data @@ -1747,7 +1747,7 @@ def fit(self, X, y): """Fit MultiTaskElasticNet model with coordinate descent Parameters - ----------- + ---------- X : ndarray, shape (n_samples, n_features) Data y : ndarray, shape (n_samples, n_tasks) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 5df45535db462..594fdb3676adb 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -42,7 +42,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- X : None or array, shape (n_samples, n_features) Input data. Note that if X is None then the Gram matrix must be specified, i.e., cannot be None or False. @@ -112,7 +112,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, solution of the coordinate descent lasso_path function. Returns - -------- + ------- alphas : array, shape (n_alphas + 1,) Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter``, ``n_features`` or the @@ -179,7 +179,7 @@ def lars_path_gram(Xy, Gram, n_samples, max_iter=500, alpha_min=0, Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- Xy : array-like, shape (n_samples,) or (n_samples, n_targets) Xy = np.dot(X.T, y). @@ -231,7 +231,7 @@ def lars_path_gram(Xy, Gram, n_samples, max_iter=500, alpha_min=0, solution of the coordinate descent lasso_path function. Returns - -------- + ------- alphas : array, shape (n_alphas + 1,) Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter``, ``n_features`` or the @@ -295,7 +295,7 @@ def _lars_path_solver(X, y, Xy=None, Gram=None, n_samples=None, max_iter=500, Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- X : None or ndarray, shape (n_samples, n_features) Input data. Note that if X is None then Gram must be specified, i.e., cannot be None or False. @@ -358,7 +358,7 @@ def _lars_path_solver(X, y, Xy=None, Gram=None, n_samples=None, max_iter=500, solution of the coordinate descent lasso_path function. 
Returns - -------- + ------- alphas : array, shape (n_alphas + 1,) Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter``, ``n_features`` or the @@ -1128,7 +1128,7 @@ def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None, """Compute the residues on left-out data for a full LARS path Parameters - ----------- + ---------- X_train : array, shape (n_samples, n_features) The data to fit the LARS on @@ -1189,7 +1189,7 @@ def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None, Returns - -------- + ------- alphas : array, shape (n_alphas,) Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter`` or ``n_features``, whichever diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index d9ee49cd37698..38be6ddd37540 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -681,7 +681,7 @@ def _omp_path_residues(X_train, y_train, X_test, y_test, copy=True, """Compute the residues on left-out data for a full LARS path Parameters - ----------- + ---------- X_train : array, shape (n_samples, n_features) The data to fit the LARS on diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 2a491bd3ef515..0e54126e52c33 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -31,6 +31,7 @@ from ..model_selection import GridSearchCV from ..metrics.scorer import check_scoring from ..exceptions import ConvergenceWarning +from ..utils.sparsefuncs import mean_variance_axis def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0, @@ -226,9 +227,17 @@ def _solve_svd(X, y, alpha): return np.dot(Vt.T, d_UT_y).T +def _get_valid_accept_sparse(is_X_sparse, solver): + if is_X_sparse and solver in ['auto', 'sag', 'saga']: + return 'csr' + else: + return ['csr', 'csc', 'coo'] + + def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, - return_n_iter=False, return_intercept=False): + return_n_iter=False, return_intercept=False, + check_input=True): """Solve the ridge equation by the method of normal equations. Read more in the :ref:`User Guide `. @@ -332,6 +341,11 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', .. versionadded:: 0.17 + check_input : boolean, default True + If False, the input arrays X and y will not be checked. + + .. versionadded:: 0.21 + Returns ------- coef : array, shape = [n_features] or [n_targets, n_features] @@ -360,13 +374,14 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', return_n_iter=return_n_iter, return_intercept=return_intercept, X_scale=None, - X_offset=None) + X_offset=None, + check_input=check_input) def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False, - X_scale=None, X_offset=None): + X_scale=None, X_offset=None, check_input=True): has_sw = sample_weight is not None @@ -388,17 +403,12 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', "intercept. 
Please change solver to 'sag' or set " "return_intercept=False.") - _dtype = [np.float64, np.float32] - - # SAG needs X and y columns to be C-contiguous and np.float64 - if solver in ['sag', 'saga']: - X = check_array(X, accept_sparse=['csr'], - dtype=np.float64, order='C') - y = check_array(y, dtype=np.float64, ensure_2d=False, order='F') - else: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype) - y = check_array(y, dtype=X.dtype, ensure_2d=False) + if check_input: + _dtype = [np.float64, np.float32] + _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver) + X = check_array(X, accept_sparse=_accept_sparse, dtype=_dtype, + order="C") + y = check_array(y, dtype=X.dtype, ensure_2d=False, order="C") check_consistent_length(X, y) n_samples, n_features = X.shape @@ -417,8 +427,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', raise ValueError("Number of samples in X and y does not correspond:" " %d != %d" % (n_samples, n_samples_)) - - if has_sw: if np.atleast_1d(sample_weight).ndim > 1: raise ValueError("Sample weights must be 1D array or scalar") @@ -438,7 +446,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', if alpha.size == 1 and n_targets > 1: alpha = np.repeat(alpha, n_targets) - n_iter = None if solver == 'sparse_cg': coef = _solve_sparse_cg(X, y, alpha, @@ -461,7 +468,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', except linalg.LinAlgError: # use SVD solver if matrix is singular solver = 'svd' - else: try: coef = _solve_cholesky(X, y, alpha) @@ -473,11 +479,12 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', # precompute max_squared_sum for all targets max_squared_sum = row_norms(X, squared=True).max() - coef = np.empty((y.shape[1], n_features)) + coef = np.empty((y.shape[1], n_features), dtype=X.dtype) n_iter = np.empty(y.shape[1], dtype=np.int32) - intercept = np.zeros((y.shape[1], )) + intercept = np.zeros((y.shape[1], ), dtype=X.dtype) for i, (alpha_i, target) in enumerate(zip(alpha, y.T)): - init = {'coef': np.zeros((n_features + int(return_intercept), 1))} + init = {'coef': np.zeros((n_features + int(return_intercept), 1), + dtype=X.dtype)} coef_, n_iter_, _ = sag_solver( X, target.ravel(), sample_weight, 'squared', alpha_i, 0, max_iter, tol, verbose, random_state, False, max_squared_sum, @@ -530,13 +537,13 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, def fit(self, X, y, sample_weight=None): - if self.solver in ('sag', 'saga'): - _dtype = np.float64 - else: - # all other solvers work at both float precision levels - _dtype = [np.float64, np.float32] - - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=_dtype, + # all other solvers work at both float precision levels + _dtype = [np.float64, np.float32] + _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), + self.solver) + X, y = check_X_y(X, y, + accept_sparse=_accept_sparse, + dtype=_dtype, multi_output=True, y_numeric=True) if ((sample_weight is not None) and @@ -555,7 +562,7 @@ def fit(self, X, y, sample_weight=None): X, y, alpha=self.alpha, sample_weight=sample_weight, max_iter=self.max_iter, tol=self.tol, solver=self.solver, random_state=self.random_state, return_n_iter=True, - return_intercept=True) + return_intercept=True, check_input=False) # add the offset which was subtracted by _preprocess_data self.intercept_ += y_offset else: @@ -570,8 +577,7 @@ def fit(self, X, y, sample_weight=None): X, y, alpha=self.alpha, sample_weight=sample_weight, 
max_iter=self.max_iter, tol=self.tol, solver=self.solver, random_state=self.random_state, return_n_iter=True, - return_intercept=False, **params) - + return_intercept=False, check_input=False, **params) self._set_intercept(X_offset, y_offset, X_scale) return self @@ -893,8 +899,9 @@ def fit(self, X, y, sample_weight=None): ------- self : returns an instance of self. """ - check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True) + _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), + self.solver) + check_X_y(X, y, accept_sparse=_accept_sparse, multi_output=True) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) @@ -921,6 +928,106 @@ def classes_(self): return self._label_binarizer.classes_ +def _check_gcv_mode(X, gcv_mode): + possible_gcv_modes = [None, 'auto', 'svd', 'eigen'] + if gcv_mode not in possible_gcv_modes: + raise ValueError( + "Unknown value for 'gcv_mode'. " + "Got {} instead of one of {}" .format( + gcv_mode, possible_gcv_modes)) + if gcv_mode in ['eigen', 'svd']: + return gcv_mode + # if X has more rows than columns, use decomposition of X^T.X, + # otherwise X.X^T + if X.shape[0] > X.shape[1]: + return 'svd' + return 'eigen' + + +def _find_smallest_angle(query, vectors): + """Find the column of vectors that is most aligned with the query. + + Both query and the columns of vectors must have their l2 norm equal to 1. + + Parameters + ---------- + query : ndarray, shape (n_samples,) + Normalized query vector. + + vectors : ndarray, shape (n_samples, n_features) + Vectors to which we compare query, as columns. Must be normalized. + """ + abs_cosine = np.abs(query.dot(vectors)) + index = np.argmax(abs_cosine) + return index + + +class _X_operator(sparse.linalg.LinearOperator): + """Behaves as centered and scaled X with an added intercept column. + + This operator behaves as + np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]) + """ + + def __init__(self, X, X_mean, sqrt_sw): + n_samples, n_features = X.shape + super().__init__(X.dtype, (n_samples, n_features + 1)) + self.X = X + self.X_mean = X_mean + self.sqrt_sw = sqrt_sw + + def _matvec(self, v): + v = v.ravel() + return safe_sparse_dot( + self.X, v[:-1], dense_output=True + ) - self.sqrt_sw * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw + + def _matmat(self, v): + return ( + safe_sparse_dot(self.X, v[:-1], dense_output=True) - + self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + v[-1] * + self.sqrt_sw[:, None]) + + def _transpose(self): + return _Xt_operator(self.X, self.X_mean, self.sqrt_sw) + + +class _Xt_operator(sparse.linalg.LinearOperator): + """Behaves as transposed centered and scaled X with an intercept column. 
+ + This operator behaves as + np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T + """ + + def __init__(self, X, X_mean, sqrt_sw): + n_samples, n_features = X.shape + super().__init__(X.dtype, (n_features + 1, n_samples)) + self.X = X + self.X_mean = X_mean + self.sqrt_sw = sqrt_sw + + def _matvec(self, v): + v = v.ravel() + n_features = self.shape[0] + res = np.empty(n_features, dtype=self.X.dtype) + res[:-1] = ( + safe_sparse_dot(self.X.T, v, dense_output=True) - + (self.X_mean * self.sqrt_sw.dot(v)) + ) + res[-1] = np.dot(v, self.sqrt_sw) + return res + + def _matmat(self, v): + n_features = self.shape[0] + res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype) + res[:-1] = ( + safe_sparse_dot(self.X.T, v, dense_output=True) - + self.X_mean[:, None] * self.sqrt_sw.dot(v) + ) + res[-1] = np.dot(self.sqrt_sw, v) + return res + + class _RidgeGCV(LinearModel): """Ridge regression with built-in Generalized Cross-Validation @@ -972,18 +1079,6 @@ def __init__(self, alphas=(0.1, 1.0, 10.0), self.gcv_mode = gcv_mode self.store_cv_values = store_cv_values - def _pre_compute(self, X, y, centered_kernel=True): - # even if X is very sparse, K is usually very dense - K = safe_sparse_dot(X, X.T, dense_output=True) - # the following emulates an additional constant regressor - # corresponding to fit_intercept=True - # but this is done only when the features have been centered - if centered_kernel: - K += np.ones_like(K) - v, Q = linalg.eigh(K) - QT_y = np.dot(Q.T, y) - return v, Q, QT_y - def _decomp_diag(self, v_prime, Q): # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T)) return (v_prime * Q ** 2).sum(axis=-1) @@ -995,18 +1090,161 @@ def _diag_dot(self, D, B): D = D[(slice(None), ) + (np.newaxis, ) * (len(B.shape) - 1)] return D * B - def _errors_and_values_helper(self, alpha, y, v, Q, QT_y): - """Helper function to avoid code duplication between self._errors and - self._values. + def _compute_gram(self, X, sqrt_sw): + """Computes the Gram matrix with possible centering. - Notes - ----- - We don't construct matrix G, instead compute action on y & diagonal. + If ``center`` is ``True``, compute + (X - X.mean(axis=0)).dot((X - X.mean(axis=0)).T) + else X.dot(X.T) + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input uncentered data. + + sqrt_sw : ndarray, shape (n_samples,) + square roots of sample weights + + center : bool, default is True + Whether or not to remove the mean from ``X``. + + Returns + ------- + gram : ndarray, shape (n_samples, n_samples) + The Gram matrix. + X_mean : ndarray, shape (n_feature,) + The mean of ``X`` for each feature. + """ + center = self.fit_intercept and sparse.issparse(X) + if not center: + # in this case centering has been done in preprocessing + # or we are not fitting an intercept. 
+ X_mean = np.zeros(X.shape[1], dtype=X.dtype) + return safe_sparse_dot(X, X.T, dense_output=True), X_mean + # otherwise X is always sparse + n_samples = X.shape[0] + sample_weight_matrix = sparse.dia_matrix( + (sqrt_sw, 0), shape=(n_samples, n_samples)) + X_weighted = sample_weight_matrix.dot(X) + X_mean, _ = mean_variance_axis(X_weighted, axis=0) + X_mean *= n_samples / sqrt_sw.dot(sqrt_sw) + X_mX = sqrt_sw[:, None] * safe_sparse_dot( + X_mean, X.T, dense_output=True) + X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean) + return (safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m + - X_mX - X_mX.T, X_mean) + + def _compute_covariance(self, X, sqrt_sw): + """Computes centered covariance matrix. + + If ``center`` is ``True``, compute + (X - X.mean(axis=0)).T.dot(X - X.mean(axis=0)) + else + X.T.dot(X) + + Parameters + ---------- + X : sparse matrix, shape (n_samples, n_features) + The input uncentered data. + + sqrt_sw : ndarray, shape (n_samples,) + square roots of sample weights + + center : bool, default is True + Whether or not to remove the mean from ``X``. + + Returns + ------- + covariance : ndarray, shape (n_features, n_features) + The covariance matrix. + X_mean : ndarray, shape (n_feature,) + The mean of ``X`` for each feature. + """ + if not self.fit_intercept: + # in this case centering has been done in preprocessing + # or we are not fitting an intercept. + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + return safe_sparse_dot(X.T, X, dense_output=True), X_mean + # this function only gets called for sparse X + n_samples = X.shape[0] + sample_weight_matrix = sparse.dia_matrix( + (sqrt_sw, 0), shape=(n_samples, n_samples)) + X_weighted = sample_weight_matrix.dot(X) + X_mean, _ = mean_variance_axis(X_weighted, axis=0) + X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw) + weight_sum = sqrt_sw.dot(sqrt_sw) + return (safe_sparse_dot(X.T, X, dense_output=True) - + weight_sum * np.outer(X_mean, X_mean), + X_mean) + + def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw): + """Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T) + without explicitely centering X nor computing X.dot(A) + when X is sparse. + + Parameters + ---------- + X : sparse matrix, shape = (n_samples, n_features) + + A : np.ndarray, shape = (n_features, n_features) + + X_mean : np.ndarray, shape = (n_features,) + + sqrt_sw : np.ndarray, shape = (n_features,) + square roots of sample weights + + Returns + ------- + diag : np.ndarray, shape = (n_samples,) + The computed diagonal. + """ + intercept_col = sqrt_sw + scale = sqrt_sw + batch_size = X.shape[1] + diag = np.empty(X.shape[0], dtype=X.dtype) + for start in range(0, X.shape[0], batch_size): + batch = slice(start, min(X.shape[0], start + batch_size), 1) + X_batch = np.empty( + (X[batch].shape[0], X.shape[1] + self.fit_intercept), + dtype=X.dtype + ) + if self.fit_intercept: + X_batch[:, :-1] = X[batch].A - X_mean * scale[batch][:, None] + X_batch[:, -1] = intercept_col[batch] + else: + X_batch = X[batch].A + diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1) + return diag + + def _eigen_decompose_gram(self, X, y, sqrt_sw): + """Eigendecomposition of X.X^T, used when n_samples <= n_features""" + # if X is dense it has already been centered in preprocessing + K, X_mean = self._compute_gram(X, sqrt_sw) + if self.fit_intercept: + # to emulate centering X with sample weights, + # ie removing the weighted average, we add a column + # containing the square roots of the sample weights. 
+ # by centering, it is orthogonal to the other columns + K += np.outer(sqrt_sw, sqrt_sw) + v, Q = linalg.eigh(K) + QT_y = np.dot(Q.T, y) + return X_mean, v, Q, QT_y + + def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, v, Q, QT_y): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have a decomposition of X.X^T (n_features >= n_samples). """ w = 1. / (v + alpha) - constant_column = np.var(Q, 0) < 1.e-12 - # detect constant columns - w[constant_column] = 0 # cancel the regularization for the intercept + if self.fit_intercept: + # the vector containing the square roots of the sample weights (1 + # when no sample weights) is the eigenvector of XX^T which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight). + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, Q) + w[intercept_dim] = 0 # cancel regularization for the intercept c = np.dot(Q, self._diag_dot(w, QT_y)) G_diag = self._decomp_diag(w, Q) @@ -1015,35 +1253,117 @@ def _errors_and_values_helper(self, alpha, y, v, Q, QT_y): G_diag = G_diag[:, np.newaxis] return G_diag, c - def _errors(self, alpha, y, v, Q, QT_y): - G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y) - return (c / G_diag) ** 2, c + def _eigen_decompose_covariance(self, X, y, sqrt_sw): + """Eigendecomposition of X^T.X, used when n_samples > n_features.""" + n_samples, n_features = X.shape + cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype) + cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw) + if not self.fit_intercept: + cov = cov[:-1, :-1] + # to emulate centering X with sample weights, + # ie removing the weighted average, we add a column + # containing the square roots of the sample weights. + # by centering, it is orthogonal to the other columns + # when all samples have the same weight we add a column of 1 + else: + cov[-1] = 0 + cov[:, -1] = 0 + cov[-1, -1] = sqrt_sw.dot(sqrt_sw) + nullspace_dim = max(0, X.shape[1] - X.shape[0]) + s, V = linalg.eigh(cov) + # remove eigenvalues and vectors in the null space of X^T.X + s = s[nullspace_dim:] + V = V[:, nullspace_dim:] + return X_mean, s, V, X + + def _solve_eigen_covariance_no_intercept( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse), and not fitting an intercept. 
+ """ + w = 1 / (s + alpha) + A = (V * w).dot(V.T) + AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True)) + y_hat = safe_sparse_dot(X, AXy, dense_output=True) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha - def _values(self, alpha, y, v, Q, QT_y): - G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y) - return y - (c / G_diag), c + def _solve_eigen_covariance_intercept( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) - def _pre_compute_svd(self, X, y, centered_kernel=True): - if sparse.issparse(X): - raise TypeError("SVD not supported for sparse matrices") - if centered_kernel: - X = np.hstack((X, np.ones((X.shape[0], 1)))) - # to emulate fit_intercept=True situation, add a column on ones - # Note that by centering, the other columns are orthogonal to that one + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse), + and we are fitting an intercept. + """ + # the vector [0, 0, ..., 0, 1] + # is the eigenvector of X^TX which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight), e.g. n when uniform sample weights. + intercept_sv = np.zeros(V.shape[0]) + intercept_sv[-1] = 1 + intercept_dim = _find_smallest_angle(intercept_sv, V) + w = 1 / (s + alpha) + w[intercept_dim] = 1 / s[intercept_dim] + A = (V * w).dot(V.T) + # add a column to X containing the square roots of sample weights + X_op = _X_operator(X, X_mean, sqrt_sw) + AXy = A.dot(X_op.T.dot(y)) + y_hat = X_op.dot(AXy) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + # return (1 - hat_diag), (y - y_hat) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha + + def _solve_eigen_covariance( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse). + """ + if self.fit_intercept: + return self._solve_eigen_covariance_intercept( + alpha, y, sqrt_sw, X_mean, s, V, X) + return self._solve_eigen_covariance_no_intercept( + alpha, y, sqrt_sw, X_mean, s, V, X) + + def _svd_decompose_design_matrix(self, X, y, sqrt_sw): + # X already centered + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + if self.fit_intercept: + # to emulate fit_intercept=True situation, add a column + # containing the square roots of the sample weights + # by centering, the other columns are orthogonal to that one + intercept_column = sqrt_sw[:, None] + X = np.hstack((X, intercept_column)) U, s, _ = linalg.svd(X, full_matrices=0) v = s ** 2 UT_y = np.dot(U.T, y) - return v, U, UT_y + return X_mean, v, U, UT_y - def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y): - """Helper function to avoid code duplication between self._errors_svd - and self._values_svd. + def _solve_svd_design_matrix( + self, alpha, y, sqrt_sw, X_mean, v, U, UT_y): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have an SVD decomposition of X + (n_features >= n_samples and X is dense). 
""" - constant_column = np.var(U, 0) < 1.e-12 - # detect columns colinear to ones w = ((v + alpha) ** -1) - (alpha ** -1) - w[constant_column] = - (alpha ** -1) - # cancel the regularization for the intercept + if self.fit_intercept: + # detect intercept column + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, U) + # cancel the regularization for the intercept + w[intercept_dim] = - (alpha ** -1) c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha ** -1) * y G_diag = self._decomp_diag(w, U) + (alpha ** -1) if len(y.shape) != 1: @@ -1051,24 +1371,16 @@ def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y): G_diag = G_diag[:, np.newaxis] return G_diag, c - def _errors_svd(self, alpha, y, v, U, UT_y): - G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - return (c / G_diag) ** 2, c - - def _values_svd(self, alpha, y, v, U, UT_y): - G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - return y - (c / G_diag), c - def fit(self, X, y, sample_weight=None): """Fit Ridge regression model Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] - Training data + Training data. Will be cast to float64 if necessary y : array-like, shape = [n_samples] or [n_samples, n_targets] - Target values. Will be cast to X's dtype if necessary + Target values. Will be cast to float64 if necessary sample_weight : float or array-like of shape [n_samples] Sample weight @@ -1077,66 +1389,60 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64, + X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], + dtype=[np.float64], multi_output=True, y_numeric=True) + + if np.any(self.alphas <= 0): + raise ValueError( + "alphas must be positive. 
Got {} containing some " + "negative or null value instead.".format(self.alphas)) + if sample_weight is not None and not isinstance(sample_weight, float): - sample_weight = check_array(sample_weight, ensure_2d=False) + sample_weight = check_array(sample_weight, ensure_2d=False, + dtype=X.dtype) n_samples, n_features = X.shape X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) - gcv_mode = self.gcv_mode - with_sw = len(np.shape(sample_weight)) - - if gcv_mode is None or gcv_mode == 'auto': - if sparse.issparse(X) or n_features > n_samples or with_sw: - gcv_mode = 'eigen' - else: - gcv_mode = 'svd' - elif gcv_mode == "svd" and with_sw: - # FIXME non-uniform sample weights not yet supported - warnings.warn("non-uniform sample weights unsupported for svd, " - "forcing usage of eigen") - gcv_mode = 'eigen' + gcv_mode = _check_gcv_mode(X, self.gcv_mode) if gcv_mode == 'eigen': - _pre_compute = self._pre_compute - _errors = self._errors - _values = self._values + decompose = self._eigen_decompose_gram + solve = self._solve_eigen_gram elif gcv_mode == 'svd': - # assert n_samples >= n_features - _pre_compute = self._pre_compute_svd - _errors = self._errors_svd - _values = self._values_svd - else: - raise ValueError('bad gcv_mode "%s"' % gcv_mode) + if sparse.issparse(X): + decompose = self._eigen_decompose_covariance + solve = self._solve_eigen_covariance + else: + decompose = self._svd_decompose_design_matrix + solve = self._solve_svd_design_matrix if sample_weight is not None: X, y = _rescale_data(X, y, sample_weight) - - centered_kernel = not sparse.issparse(X) and self.fit_intercept - - v, Q, QT_y = _pre_compute(X, y, centered_kernel) - n_y = 1 if len(y.shape) == 1 else y.shape[1] - cv_values = np.zeros((n_samples * n_y, len(self.alphas))) - C = [] + sqrt_sw = np.sqrt(sample_weight) + else: + sqrt_sw = np.ones(X.shape[0], dtype=X.dtype) scorer = check_scoring(self, scoring=self.scoring, allow_none=True) error = scorer is None - if np.any(self.alphas < 0): - raise ValueError("alphas cannot be negative. " - "Got {} containing some " - "negative value instead.".format(self.alphas)) - + n_y = 1 if len(y.shape) == 1 else y.shape[1] + cv_values = np.zeros((n_samples * n_y, len(self.alphas)), + dtype=X.dtype) + C = [] + X_mean, *decomposition = decompose(X, y, sqrt_sw) for i, alpha in enumerate(self.alphas): + G_diag, c = solve( + float(alpha), y, sqrt_sw, X_mean, *decomposition) if error: - out, c = _errors(float(alpha), y, v, Q, QT_y) + squared_errors = (c / G_diag) ** 2 + cv_values[:, i] = squared_errors.ravel() else: - out, c = _values(float(alpha), y, v, Q, QT_y) - cv_values[:, i] = out.ravel() + predictions = y - (c / G_diag) + cv_values[:, i] = predictions.ravel() C.append(c) if error: @@ -1158,6 +1464,7 @@ def identity_estimator(): self.dual_coef_ = C[best] self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) + X_offset += X_mean * X_scale self._set_intercept(X_offset, y_offset, X_scale) if self.store_cv_values: @@ -1189,7 +1496,8 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- X : array-like, shape = [n_samples, n_features] - Training data + Training data. If using GCV, will be cast to float64 + if necessary. y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values. 
Will be cast to X's dtype if necessary @@ -1200,8 +1508,17 @@ def fit(self, X, y, sample_weight=None): Returns ------- self : object + + Notes + ----- + When sample_weight is provided, the selected hyperparameter may depend + on whether we use generalized cross-validation (cv=None or cv='auto') + or another form of cross-validation, because only generalized + cross-validation takes the sample weights into account when computing + the validation score. """ - if self.cv is None: + cv = self.cv + if cv is None: estimator = _RidgeGCV(self.alphas, fit_intercept=self.fit_intercept, normalize=self.normalize, @@ -1217,9 +1534,11 @@ def fit(self, X, y, sample_weight=None): raise ValueError("cv!=None and store_cv_values=True " " are incompatible") parameters = {'alpha': self.alphas} + solver = 'sparse_cg' if sparse.issparse(X) else 'auto' gs = GridSearchCV(Ridge(fit_intercept=self.fit_intercept, - normalize=self.normalize), - parameters, cv=self.cv, scoring=self.scoring) + normalize=self.normalize, + solver=solver), + parameters, cv=cv, scoring=self.scoring) gs.fit(X, y, sample_weight=sample_weight) estimator = gs.best_estimator_ self.alpha_ = gs.best_estimator_.alpha @@ -1249,6 +1568,7 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin): the estimates. Larger values specify stronger regularization. Alpha corresponds to ``C^-1`` in other linear models such as LogisticRegression or LinearSVC. + If using generalized cross-validation, alphas must be positive. fit_intercept : boolean Whether to calculate the intercept for this model. If set @@ -1267,12 +1587,15 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin): A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. + If None, the negative mean squared error if cv is 'auto' or None + (i.e. when using generalized cross-validation), and r2 score otherwise. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the efficient Leave-One-Out cross-validation + (also known as Generalized Cross-Validation). - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. @@ -1288,15 +1611,13 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin): Flag indicating which strategy to use when performing Generalized Cross-Validation. Options are:: - 'auto' : use svd if n_samples > n_features or when X is a sparse - matrix, otherwise use eigen - 'svd' : force computation via singular value decomposition of X - (does not work for sparse matrices) - 'eigen' : force computation via eigendecomposition of X^T X + 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen' + 'svd' : force use of singular value decomposition of X when X is + dense, eigenvalue decomposition of X^T.X when X is sparse. + 'eigen' : force computation via eigendecomposition of X.X^T The 'auto' mode is the default and is intended to pick the cheaper - option of the two depending upon the shape and format of the training - data. + option of the two depending on the shape of the training data. store_cv_values : boolean, default=False Flag indicating if the cross-validation values corresponding to @@ -1463,7 +1784,8 @@ def fit(self, X, y, sample_weight=None): ---------- X : array-like, shape (n_samples, n_features) Training vectors, where n_samples is the number of samples - and n_features is the number of features. + and n_features is the number of features. 
When using GCV, + will be cast to float64 if necessary. y : array-like, shape (n_samples,) Target values. Will be cast to X's dtype if necessary diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 7bfb617d4beff..fa7f0606b1010 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -6,8 +6,8 @@ import pytest from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_greater @@ -33,10 +33,12 @@ from sklearn.linear_model.ridge import RidgeClassifierCV from sklearn.linear_model.ridge import _solve_cholesky from sklearn.linear_model.ridge import _solve_cholesky_kernel +from sklearn.linear_model.ridge import _check_gcv_mode +from sklearn.linear_model.ridge import _X_operator from sklearn.datasets import make_regression from sklearn.model_selection import GridSearchCV -from sklearn.model_selection import KFold +from sklearn.model_selection import KFold, GroupKFold, cross_val_predict from sklearn.utils import check_random_state from sklearn.datasets import make_multilabel_classification @@ -311,6 +313,213 @@ def test_ridge_individual_penalties(): assert_raises(ValueError, ridge.fit, X, y) +@pytest.mark.parametrize('n_col', [(), (1,), (3,)]) +def test_x_operator(n_col): + rng = np.random.RandomState(0) + X = rng.randn(11, 8) + X_m = rng.randn(8) + sqrt_sw = rng.randn(len(X)) + Y = rng.randn(11, *n_col) + A = rng.randn(9, *n_col) + operator = _X_operator(sp.csr_matrix(X), X_m, sqrt_sw) + reference_operator = np.hstack( + [X - sqrt_sw[:, None] * X_m, sqrt_sw[:, None]]) + assert_allclose(reference_operator.dot(A), operator.dot(A)) + assert_allclose(reference_operator.T.dot(Y), operator.T.dot(Y)) + + +@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)]) +@pytest.mark.parametrize('uniform_weights', [True, False]) +def test_compute_gram(shape, uniform_weights): + rng = np.random.RandomState(0) + X = rng.randn(*shape) + if uniform_weights: + sw = np.ones(X.shape[0]) + else: + sw = rng.chisquare(1, shape[0]) + sqrt_sw = np.sqrt(sw) + X_mean = np.average(X, axis=0, weights=sw) + X_centered = (X - X_mean) * sqrt_sw[:, None] + true_gram = X_centered.dot(X_centered.T) + X_sparse = sp.csr_matrix(X * sqrt_sw[:, None]) + gcv = _RidgeGCV(fit_intercept=True) + computed_gram, computed_mean = gcv._compute_gram(X_sparse, sqrt_sw) + assert_allclose(X_mean, computed_mean) + assert_allclose(true_gram, computed_gram) + + +@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)]) +@pytest.mark.parametrize('uniform_weights', [True, False]) +def test_compute_covariance(shape, uniform_weights): + rng = np.random.RandomState(0) + X = rng.randn(*shape) + if uniform_weights: + sw = np.ones(X.shape[0]) + else: + sw = rng.chisquare(1, shape[0]) + sqrt_sw = np.sqrt(sw) + X_mean = np.average(X, axis=0, weights=sw) + X_centered = (X - X_mean) * sqrt_sw[:, None] + true_covariance = X_centered.T.dot(X_centered) + X_sparse = sp.csr_matrix(X * sqrt_sw[:, None]) + gcv = _RidgeGCV(fit_intercept=True) + computed_cov, computed_mean = gcv._compute_covariance(X_sparse, sqrt_sw) + assert_allclose(X_mean, computed_mean) + assert_allclose(true_covariance, computed_cov) + + +def 
_make_sparse_offset_regression( + n_samples=100, n_features=100, proportion_nonzero=.5, + n_informative=10, n_targets=1, bias=13., X_offset=30., + noise=30., shuffle=True, coef=False, random_state=None): + X, y, c = make_regression( + n_samples=n_samples, n_features=n_features, + n_informative=n_informative, n_targets=n_targets, bias=bias, + noise=noise, shuffle=shuffle, + coef=True, random_state=random_state) + if n_features == 1: + c = np.asarray([c]) + X += X_offset + mask = np.random.RandomState(random_state).binomial( + 1, proportion_nonzero, X.shape) > 0 + removed_X = X.copy() + X[~mask] = 0. + removed_X[mask] = 0. + y -= removed_X.dot(c) + if n_features == 1: + c = c[0] + if coef: + return X, y, c + return X, y + + +@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen']) +@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix]) +@pytest.mark.parametrize('X_shape', [(11, 8), (11, 20)]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize( + 'y_shape, normalize, noise', + [ + ((11,), True, 1.), + ((11, 1), False, 30.), + ((11, 3), False, 150.), + ] +) +def test_ridge_gcv_vs_ridge_loo_cv( + gcv_mode, X_constructor, X_shape, y_shape, + fit_intercept, normalize, noise): + n_samples, n_features = X_shape + n_targets = y_shape[-1] if len(y_shape) == 2 else 1 + X, y = _make_sparse_offset_regression( + n_samples=n_samples, n_features=n_features, n_targets=n_targets, + random_state=0, shuffle=False, noise=noise, n_informative=5 + ) + y = y.reshape(y_shape) + + alphas = [1e-3, .1, 1., 10., 1e3] + loo_ridge = RidgeCV(cv=n_samples, fit_intercept=fit_intercept, + alphas=alphas, scoring='neg_mean_squared_error', + normalize=normalize) + gcv_ridge = RidgeCV(gcv_mode=gcv_mode, fit_intercept=fit_intercept, + alphas=alphas, normalize=normalize) + + loo_ridge.fit(X, y) + + X_gcv = X_constructor(X) + gcv_ridge.fit(X_gcv, y) + + assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_) + assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3) + assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3) + + +@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen']) +@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix]) +@pytest.mark.parametrize('n_features', [8, 20]) +@pytest.mark.parametrize('y_shape, fit_intercept, noise', + [((11,), True, 1.), + ((11, 1), True, 20.), + ((11, 3), True, 150.), + ((11, 3), False, 30.)]) +def test_ridge_gcv_sample_weights( + gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise): + alphas = [1e-3, .1, 1., 10., 1e3] + rng = np.random.RandomState(0) + n_targets = y_shape[-1] if len(y_shape) == 2 else 1 + X, y = _make_sparse_offset_regression( + n_samples=11, n_features=n_features, n_targets=n_targets, + random_state=0, shuffle=False, noise=noise) + y = y.reshape(y_shape) + + sample_weight = 3 * rng.randn(len(X)) + sample_weight = (sample_weight - sample_weight.min() + 1).astype(int) + indices = np.repeat(np.arange(X.shape[0]), sample_weight) + sample_weight = sample_weight.astype(float) + X_tiled, y_tiled = X[indices], y[indices] + + cv = GroupKFold(n_splits=X.shape[0]) + splits = cv.split(X_tiled, y_tiled, groups=indices) + kfold = RidgeCV( + alphas=alphas, cv=splits, scoring='neg_mean_squared_error', + fit_intercept=fit_intercept) + # ignore warning from GridSearchCV: DeprecationWarning: The default of the + # `iid` parameter will change from True to False in version 0.22 and will + # be removed in 0.24 + with ignore_warnings(category=DeprecationWarning): + kfold.fit(X_tiled, y_tiled) 
+ + ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept) + splits = cv.split(X_tiled, y_tiled, groups=indices) + predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits) + kfold_errors = (y_tiled - predictions)**2 + kfold_errors = [ + np.sum(kfold_errors[indices == i], axis=0) for + i in np.arange(X.shape[0])] + kfold_errors = np.asarray(kfold_errors) + + X_gcv = X_constructor(X) + gcv_ridge = RidgeCV( + alphas=alphas, store_cv_values=True, + gcv_mode=gcv_mode, fit_intercept=fit_intercept) + gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight) + if len(y_shape) == 2: + gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)] + else: + gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)] + + assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_) + assert_allclose(gcv_errors, kfold_errors, rtol=1e-3) + assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3) + assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3) + + +@pytest.mark.parametrize('mode', [True, 1, 5, 'bad', 'gcv']) +def test_check_gcv_mode_error(mode): + X, y = make_regression(n_samples=5, n_features=2) + gcv = RidgeCV(gcv_mode=mode) + with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"): + gcv.fit(X, y) + with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"): + _check_gcv_mode(X, mode) + + +@pytest.mark.parametrize("sparse", [True, False]) +@pytest.mark.parametrize( + 'mode, mode_n_greater_than_p, mode_p_greater_than_n', + [(None, 'svd', 'eigen'), + ('auto', 'svd', 'eigen'), + ('eigen', 'eigen', 'eigen'), + ('svd', 'svd', 'svd')] +) +def test_check_gcv_mode_choice(sparse, mode, mode_n_greater_than_p, + mode_p_greater_than_n): + X, _ = make_regression(n_samples=5, n_features=2) + if sparse: + X = sp.csr_matrix(X) + assert _check_gcv_mode(X, mode) == mode_n_greater_than_p + assert _check_gcv_mode(X.T, mode) == mode_p_greater_than_n + + def _test_ridge_loo(filter_): # test that can work with both dense or sparse matrices n_samples = X_diabetes.shape[0] @@ -318,46 +527,7 @@ def _test_ridge_loo(filter_): ret = [] fit_intercept = filter_ == DENSE_FILTER - if fit_intercept: - X_diabetes_ = X_diabetes - X_diabetes.mean(0) - else: - X_diabetes_ = X_diabetes ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept) - ridge = Ridge(alpha=1.0, fit_intercept=fit_intercept) - - # because fit_intercept is applied - - # generalized cross-validation (efficient leave-one-out) - decomp = ridge_gcv._pre_compute(X_diabetes_, y_diabetes, fit_intercept) - errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp) - values, c = ridge_gcv._values(1.0, y_diabetes, *decomp) - - # brute-force leave-one-out: remove one example at a time - errors2 = [] - values2 = [] - for i in range(n_samples): - sel = np.arange(n_samples) != i - X_new = X_diabetes_[sel] - y_new = y_diabetes[sel] - ridge.fit(X_new, y_new) - value = ridge.predict([X_diabetes_[i]])[0] - error = (y_diabetes[i] - value) ** 2 - errors2.append(error) - values2.append(value) - - # check that efficient and brute-force LOO give same results - assert_almost_equal(errors, errors2) - assert_almost_equal(values, values2) - - # generalized cross-validation (efficient leave-one-out, - # SVD variation) - decomp = ridge_gcv._pre_compute_svd(X_diabetes_, y_diabetes, fit_intercept) - errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp) - values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp) - - # check that efficient and SVD efficient LOO give same results - assert_almost_equal(errors, 
errors3) - assert_almost_equal(values, values3) # check best alpha ridge_gcv.fit(filter_(X_diabetes), y_diabetes) @@ -369,25 +539,26 @@ def _test_ridge_loo(filter_): scoring = make_scorer(mean_squared_error, greater_is_better=False) ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring) f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes) - assert_equal(ridge_gcv2.alpha_, alpha_) + assert ridge_gcv2.alpha_ == pytest.approx(alpha_) # check that we get same best alpha with custom score_func func = lambda x, y: -mean_squared_error(x, y) scoring = make_scorer(func) ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring) f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes) - assert_equal(ridge_gcv3.alpha_, alpha_) + assert ridge_gcv3.alpha_ == pytest.approx(alpha_) # check that we get same best alpha with a scorer scorer = get_scorer('neg_mean_squared_error') ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer) ridge_gcv4.fit(filter_(X_diabetes), y_diabetes) - assert_equal(ridge_gcv4.alpha_, alpha_) + assert ridge_gcv4.alpha_ == pytest.approx(alpha_) # check that we get same best alpha with sample weights - ridge_gcv.fit(filter_(X_diabetes), y_diabetes, - sample_weight=np.ones(n_samples)) - assert_equal(ridge_gcv.alpha_, alpha_) + if filter_ == DENSE_FILTER: + ridge_gcv.fit(filter_(X_diabetes), y_diabetes, + sample_weight=np.ones(n_samples)) + assert ridge_gcv.alpha_ == pytest.approx(alpha_) # simulate several responses Y = np.vstack((y_diabetes, y_diabetes)).T @@ -397,8 +568,8 @@ def _test_ridge_loo(filter_): ridge_gcv.fit(filter_(X_diabetes), y_diabetes) y_pred = ridge_gcv.predict(filter_(X_diabetes)) - assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, - Y_pred, decimal=5) + assert_allclose(np.vstack((y_pred, y_pred)).T, + Y_pred, rtol=1e-5) return ret @@ -407,7 +578,7 @@ def _test_ridge_cv_normalize(filter_): ridge_cv = RidgeCV(normalize=True, cv=3) ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes) - gs = GridSearchCV(Ridge(normalize=True), cv=3, + gs = GridSearchCV(Ridge(normalize=True, solver='sparse_cg'), cv=3, param_grid={'alpha': ridge_cv.alphas}) gs.fit(filter_(10. * X_diabetes), y_diabetes) assert_equal(gs.best_estimator_.alpha, ridge_cv.alpha_) @@ -501,12 +672,6 @@ def test_dense_sparse(test_func): check_dense_sparse(test_func) -def test_ridge_cv_sparse_svd(): - X = sp.csr_matrix(X_diabetes) - ridge = RidgeCV(gcv_mode="svd") - assert_raises(TypeError, ridge.fit, X) - - def test_ridge_sparse_svd(): X = sp.csc_matrix(rng.rand(100, 10)) y = rng.rand(100) @@ -620,6 +785,10 @@ def test_ridgecv_store_cv_values(): r.fit(x, y) assert r.cv_values_.shape == (n_samples, n_targets, n_alphas) + r = RidgeCV(cv=3, store_cv_values=True) + assert_raises_regex(ValueError, 'cv!=None and store_cv_values', + r.fit, x, y) + @pytest.mark.filterwarnings('ignore: The default value of cv') # 0.22 def test_ridge_classifier_cv_store_cv_values(): @@ -762,13 +931,13 @@ def test_ridgecv_negative_alphas(): # Negative integers ridge = RidgeCV(alphas=(-1, -10, -100)) assert_raises_regex(ValueError, - "alphas cannot be negative.", + "alphas must be positive", ridge.fit, X, y) # Negative floats ridge = RidgeCV(alphas=(-0.1, -1.0, -10.0)) assert_raises_regex(ValueError, - "alphas cannot be negative.", + "alphas must be positive", ridge.fit, X, y) @@ -887,54 +1056,14 @@ def test_ridge_regression_check_arguments_validity(return_intercept, assert_allclose(out, true_coefs, rtol=0, atol=atol) -def test_errors_and_values_helper(): - ridgecv = _RidgeGCV() - rng = check_random_state(42) - alpha = 1. 
- n = 5 - y = rng.randn(n) - v = rng.randn(n) - Q = rng.randn(len(v), len(v)) - QT_y = Q.T.dot(y) - G_diag, c = ridgecv._errors_and_values_helper(alpha, y, v, Q, QT_y) - - # test that helper function behaves as expected - out, c_ = ridgecv._errors(alpha, y, v, Q, QT_y) - np.testing.assert_array_equal(out, (c / G_diag) ** 2) - np.testing.assert_array_equal(c, c) - - out, c_ = ridgecv._values(alpha, y, v, Q, QT_y) - np.testing.assert_array_equal(out, y - (c / G_diag)) - np.testing.assert_array_equal(c_, c) - - -def test_errors_and_values_svd_helper(): - ridgecv = _RidgeGCV() - rng = check_random_state(42) - alpha = 1. - for n, p in zip((5, 10), (12, 6)): - y = rng.randn(n) - v = rng.randn(p) - U = rng.randn(n, p) - UT_y = U.T.dot(y) - G_diag, c = ridgecv._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - - # test that helper function behaves as expected - out, c_ = ridgecv._errors_svd(alpha, y, v, U, UT_y) - np.testing.assert_array_equal(out, (c / G_diag) ** 2) - np.testing.assert_array_equal(c, c) - - out, c_ = ridgecv._values_svd(alpha, y, v, U, UT_y) - np.testing.assert_array_equal(out, y - (c / G_diag)) - np.testing.assert_array_equal(c_, c) - - def test_ridge_classifier_no_support_multilabel(): X, y = make_multilabel_classification(n_samples=10, random_state=0) assert_raises(ValueError, RidgeClassifier().fit, X, y) -def test_dtype_match(): +@pytest.mark.parametrize( + "solver", ["svd", "sparse_cg", "cholesky", "lsqr", "sag", "saga"]) +def test_dtype_match(solver): rng = np.random.RandomState(0) alpha = 1.0 @@ -944,25 +1073,22 @@ def test_dtype_match(): X_32 = X_64.astype(np.float32) y_32 = y_64.astype(np.float32) - solvers = ["svd", "sparse_cg", "cholesky", "lsqr"] - for solver in solvers: - - # Check type consistency 32bits - ridge_32 = Ridge(alpha=alpha, solver=solver) - ridge_32.fit(X_32, y_32) - coef_32 = ridge_32.coef_ + # Check type consistency 32bits + ridge_32 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10,) + ridge_32.fit(X_32, y_32) + coef_32 = ridge_32.coef_ - # Check type consistency 64 bits - ridge_64 = Ridge(alpha=alpha, solver=solver) - ridge_64.fit(X_64, y_64) - coef_64 = ridge_64.coef_ + # Check type consistency 64 bits + ridge_64 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10,) + ridge_64.fit(X_64, y_64) + coef_64 = ridge_64.coef_ - # Do the actual checks at once for easier debug - assert coef_32.dtype == X_32.dtype - assert coef_64.dtype == X_64.dtype - assert ridge_32.predict(X_32).dtype == X_32.dtype - assert ridge_64.predict(X_64).dtype == X_64.dtype - assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5) + # Do the actual checks at once for easier debug + assert coef_32.dtype == X_32.dtype + assert coef_64.dtype == X_64.dtype + assert ridge_32.predict(X_32).dtype == X_32.dtype + assert ridge_64.predict(X_64).dtype == X_64.dtype + assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4) def test_dtype_match_cholesky(): @@ -993,3 +1119,34 @@ def test_dtype_match_cholesky(): assert ridge_32.predict(X_32).dtype == X_32.dtype assert ridge_64.predict(X_64).dtype == X_64.dtype assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5) + + +@pytest.mark.parametrize( + 'solver', ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']) +@pytest.mark.parametrize('seed', range(1)) +def test_ridge_regression_dtype_stability(solver, seed): + random_state = np.random.RandomState(seed) + n_samples, n_features = 6, 5 + X = random_state.randn(n_samples, n_features) + coef = random_state.randn(n_features) + y = np.dot(X, coef) + 0.01 * 
random_state.randn(n_samples) + alpha = 1.0 + results = dict() + # XXX: Sparse CG seems to be far less numerically stable than the + # others, maybe we should not enable float32 for this one. + atol = 1e-3 if solver == "sparse_cg" else 1e-5 + for current_dtype in (np.float32, np.float64): + results[current_dtype] = ridge_regression(X.astype(current_dtype), + y.astype(current_dtype), + alpha=alpha, + solver=solver, + random_state=random_state, + sample_weight=None, + max_iter=500, + tol=1e-10, + return_n_iter=False, + return_intercept=False) + + assert results[np.float32].dtype == np.float32 + assert results[np.float64].dtype == np.float64 + assert_allclose(results[np.float32], results[np.float64], atol=atol) diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index bbb83a5ed81f8..88c979c0e1fdb 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -145,7 +145,7 @@ def reconstruction_error(self): reconstruction_error : float Notes - ------- + ----- The cost function of an isomap embedding is ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples`` diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index e387ecec0f4d5..a6d5af54f9bc4 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -348,7 +348,7 @@ class SpectralEmbedding(BaseEstimator): Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- n_components : integer, default: 2 The dimension of the projected subspace. diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 9eae62a28045e..d1337bdc61aed 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -1989,16 +1989,18 @@ def hamming_loss(y_true, y_pred, labels=None, sample_weight=None): ----- In multiclass classification, the Hamming loss corresponds to the Hamming distance between ``y_true`` and ``y_pred`` which is equivalent to the - subset ``zero_one_loss`` function. + subset ``zero_one_loss`` function, when `normalize` parameter is set to + True. In multilabel classification, the Hamming loss is different from the subset zero-one loss. The zero-one loss considers the entire set of labels for a given sample incorrect if it does not entirely match the true set of - labels. Hamming loss is more forgiving in that it penalizes the individual - labels. + labels. Hamming loss is more forgiving in that it penalizes only the + individual labels. - The Hamming loss is upperbounded by the subset zero-one loss. When - normalized over samples, the Hamming loss is always between 0 and 1. + The Hamming loss is upperbounded by the subset zero-one loss, when + `normalize` parameter is set to True. It is always between 0 and 1, + lower being better. 
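A minimal sketch of the bound described in the Notes above, using
``hamming_loss`` and ``zero_one_loss`` on a small multilabel example::

    import numpy as np
    from sklearn.metrics import hamming_loss, zero_one_loss

    y_true = np.array([[1, 1], [1, 1]])
    y_pred = np.array([[1, 0], [1, 1]])

    # one of the four individual labels is wrong
    hamming_loss(y_true, y_pred)   # 0.25
    # one of the two samples is not an exact match, so the subset
    # zero-one loss is larger and bounds the Hamming loss from above
    zero_one_loss(y_true, y_pred)  # 0.5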
References ---------- diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a32d6aa6efbcc..9e377f3d4c07e 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -749,7 +749,7 @@ def paired_cosine_distances(X, Y): distances : ndarray, shape (n_samples, ) Notes - ------ + ----- The cosine distance is equivalent to the half the squared euclidean distance if each sample is normalized to unit norm """ @@ -1169,17 +1169,17 @@ def distance_metrics(): The valid distance metrics, and the function they map to, are: - ============ ==================================== - metric Function - ============ ==================================== - 'cityblock' metrics.pairwise.manhattan_distances - 'cosine' metrics.pairwise.cosine_distances - 'euclidean' metrics.pairwise.euclidean_distances - 'haversine' metrics.pairwise.haversine_distances - 'l1' metrics.pairwise.manhattan_distances - 'l2' metrics.pairwise.euclidean_distances - 'manhattan' metrics.pairwise.manhattan_distances - ============ ==================================== + ============ ==================================== + metric Function + ============ ==================================== + 'cityblock' metrics.pairwise.manhattan_distances + 'cosine' metrics.pairwise.cosine_distances + 'euclidean' metrics.pairwise.euclidean_distances + 'haversine' metrics.pairwise.haversine_distances + 'l1' metrics.pairwise.manhattan_distances + 'l2' metrics.pairwise.euclidean_distances + 'manhattan' metrics.pairwise.manhattan_distances + ============ ==================================== Read more in the :ref:`User Guide `. diff --git a/sklearn/mixture/bayesian_mixture.py b/sklearn/mixture/bayesian_mixture.py index 6f13f63e3fcd9..88c0ab66ae20a 100644 --- a/sklearn/mixture/bayesian_mixture.py +++ b/sklearn/mixture/bayesian_mixture.py @@ -140,7 +140,7 @@ class BayesianGaussianMixture(BaseMixture): mean_precision_prior : float | None, optional. The precision prior on the mean distribution (Gaussian). - Controls the extend to where means can be placed. Smaller + Controls the extend to where means can be placed. Larger values concentrate the means of each clusters around `mean_prior`. The value of the parameter must be greater than 0. If it is None, it's set to 1. @@ -260,7 +260,7 @@ class BayesianGaussianMixture(BaseMixture): mean_precision_prior : float The precision prior on the mean distribution (Gaussian). Controls the extend to where means can be placed. - Smaller values concentrate the means of each clusters around + Larger values concentrate the means of each clusters around `mean_prior`. mean_precision_ : array-like, shape (n_components,) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 68f0e296b077c..6fe2a8edfa12a 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -447,7 +447,7 @@ def predict(self, X): ``predict``. Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -464,7 +464,7 @@ def predict_proba(self, X): ``predict_proba``. Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -481,7 +481,7 @@ def predict_log_proba(self, X): ``predict_log_proba``. Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -498,7 +498,7 @@ def decision_function(self, X): ``decision_function``. 
Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -515,7 +515,7 @@ def transform(self, X): ``refit=True``. Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -532,7 +532,7 @@ def inverse_transform(self, Xt): ``inverse_transform`` and ``refit=True``. Parameters - ----------- + ---------- Xt : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -1103,7 +1103,7 @@ class GridSearchCV(BaseSearchCV): This is present only if ``refit`` is not False. Notes - ------ + ----- The parameters selected are those that maximize the score of the left out data, unless an explicit score is passed in which case it is used instead. diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 17fb16ae8340e..24fefef5216fe 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -853,7 +853,7 @@ class LeaveOneGroupOut(BaseCrossValidator): >>> logo = LeaveOneGroupOut() >>> logo.get_n_splits(X, y, groups) 2 - >>> logo.get_n_splits(groups=groups) # 'groups' is always required + >>> logo.get_n_splits(groups=groups) # 'groups' is always required 2 >>> print(logo) LeaveOneGroupOut() diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 2f5505fff01c6..3dc8b0441a64a 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1118,7 +1118,7 @@ def learning_curve(estimator, X, y, groups=None, train_sizes=np.linspace(0.1, 1.0, 5), cv='warn', scoring=None, exploit_incremental_learning=False, n_jobs=None, pre_dispatch="all", verbose=0, shuffle=False, - random_state=None, error_score='raise-deprecating'): + random_state=None, error_score='raise-deprecating'): """Learning curve. Determines cross-validated training and test scores for different training diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 21f272c518f51..00c4b8636a17c 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -62,14 +62,14 @@ def _get_weights(dist, weights): """Get the weights from an array of distances and a parameter ``weights`` Parameters - =========== + ---------- dist : ndarray The input distances weights : {'uniform', 'distance' or a callable} The kind of weighting used Returns - ======== + ------- weights_arr : array of the same shape as ``dist`` if ``weights == 'uniform'``, then returns None """ diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index 5ad2f7e9b7b1d..472710ea51bb2 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -401,7 +401,7 @@ def _decision_function(self, X): def score_samples(self): """Opposite of the Local Outlier Factor of X. - It is the opposite as as bigger is better, i.e. large values correspond + It is the opposite as bigger is better, i.e. large values correspond to inliers. Only available for novelty detection (when novelty is set to True). @@ -437,7 +437,7 @@ def score_samples(self): def _score_samples(self, X): """Opposite of the Local Outlier Factor of X. - It is the opposite as as bigger is better, i.e. large values correspond + It is the opposite as bigger is better, i.e. large values correspond to inliers. Only available for novelty detection (when novelty is set to True). 
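A minimal usage sketch of ``score_samples`` in the novelty setting described
above (training data and query points chosen arbitrarily for illustration)::

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    rng = np.random.RandomState(0)
    X_train = rng.randn(100, 2)
    lof = LocalOutlierFactor(novelty=True).fit(X_train)
    # larger (less negative) scores correspond to inliers, so the point
    # far from the training cloud gets a much lower score
    lof.score_samples(np.array([[0., 0.], [10., 10.]]))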
@@ -500,5 +500,5 @@ def _local_reachability_density(self, distances_X, neighbors_indices): self.n_neighbors_ - 1] reach_dist_array = np.maximum(distances_X, dist_k) - # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: + # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: return 1. / (np.mean(reach_dist_array, axis=1) + 1e-10) diff --git a/sklearn/neural_network/_stochastic_optimizers.py b/sklearn/neural_network/_stochastic_optimizers.py index 8f19c7b488acc..3e49e94de8bd1 100644 --- a/sklearn/neural_network/_stochastic_optimizers.py +++ b/sklearn/neural_network/_stochastic_optimizers.py @@ -1,7 +1,7 @@ """Stochastic optimization methods for MLP """ -# Authors: Jiyuan Qian +# Authors: Jiyuan Qian # License: BSD 3 clause import numpy as np diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 1fcdadaabb6c0..9a51fefd144ac 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -95,12 +95,12 @@ class Pipeline(_BaseComposition): >>> # For instance, fit using a k of 10 in the SelectKBest >>> # and a parameter 'C' of the svm >>> anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y) - ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE + ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE Pipeline(memory=None, steps=[('anova', SelectKBest(...)), ('svc', SVC(...))], verbose=False) >>> prediction = anova_svm.predict(X) - >>> anova_svm.score(X, y) # doctest: +ELLIPSIS + >>> anova_svm.score(X, y) # doctest: +ELLIPSIS 0.83 >>> # getting the selected features chosen by anova_filter >>> anova_svm['anova'].get_support() @@ -671,7 +671,7 @@ def make_pipeline(*steps, **kwargs): >>> from sklearn.naive_bayes import GaussianNB >>> from sklearn.preprocessing import StandardScaler >>> make_pipeline(StandardScaler(), GaussianNB(priors=None)) - ... # doctest: +NORMALIZE_WHITESPACE + ... # doctest: +NORMALIZE_WHITESPACE Pipeline(memory=None, steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), @@ -782,7 +782,7 @@ class FeatureUnion(_BaseComposition, TransformerMixin): >>> union = FeatureUnion([("pca", PCA(n_components=1)), ... ("svd", TruncatedSVD(n_components=2))]) >>> X = [[0., 1., 3], [2., 2., 5]] - >>> union.fit_transform(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> union.fit_transform(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS array([[ 1.5 , 3.0..., 0.8...], [-1.5 , 5.7..., -0.4...]]) """ @@ -1008,7 +1008,7 @@ def make_union(*transformers, **kwargs): -------- >>> from sklearn.decomposition import PCA, TruncatedSVD >>> from sklearn.pipeline import make_union - >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE + >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE FeatureUnion(n_jobs=None, transformer_list=[('pca', PCA(copy=True, iterated_power='auto', diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 8c8524ef6505c..2f020a0a4780e 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1475,17 +1475,21 @@ def transform(self, X): Parameters ---------- - X : array-like or sparse matrix, shape [n_samples, n_features] + X : array-like or CSR/CSC sparse matrix, shape [n_samples, n_features] The data to transform, row by row. - Sparse input should preferably be in CSR format (for speed), - but must be in CSC format if the degree is 4 or higher. 
- If the input matrix is in CSR format and the expansion is of - degree 2 or 3, the method described in the work "Leveraging - Sparsity to Speed Up Polynomial Feature Expansions of CSR - Matrices Using K-Simplex Numbers" by Andrew Nystrom and - John Hughes is used, which is much faster than the method - used on CSC input. + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. + + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. Returns ------- @@ -1679,7 +1683,7 @@ class Normalizer(BaseEstimator, TransformerMixin): >>> X = [[4, 1, 2, 2], ... [1, 3, 9, 3], ... [5, 7, 5, 1]] - >>> transformer = Normalizer().fit(X) # fit does nothing. + >>> transformer = Normalizer().fit(X) # fit does nothing. >>> transformer Normalizer(copy=True, norm='l2') >>> transformer.transform(X) @@ -1815,7 +1819,7 @@ class Binarizer(BaseEstimator, TransformerMixin): >>> X = [[ 1., -1., 2.], ... [ 2., 0., 0.], ... [ 0., 1., -1.]] - >>> transformer = Binarizer().fit(X) # fit does nothing. + >>> transformer = Binarizer().fit(X) # fit does nothing. >>> transformer Binarizer(copy=True, threshold=0.0) >>> transformer.transform(X) @@ -2262,7 +2266,7 @@ def _transform_col(self, X_col, quantiles, inverse): upper_bound_x = 1 lower_bound_y = quantiles[0] upper_bound_y = quantiles[-1] - # for inverse transform, match a uniform distribution + # for inverse transform, match a uniform distribution with np.errstate(invalid='ignore'): # hide NaN comparison warnings if output_distribution == 'normal': X_col = stats.norm.cdf(X_col) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f7cffa1e663b5..4a1c700717555 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -420,7 +420,7 @@ def fit_transform(self, y): """Fit label binarizer and transform multi-class labels to binary labels. - The output of transform is sometimes referred to as + The output of transform is sometimes referred to as the 1-of-K coding scheme. Parameters diff --git a/sklearn/setup.py b/sklearn/setup.py index e6f10cad77d9f..5a377043e9e38 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -33,6 +33,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('feature_selection/tests') config.add_subpackage('gaussian_process') config.add_subpackage('gaussian_process/tests') + config.add_subpackage('impute') + config.add_subpackage('impute/tests') config.add_subpackage('inspection') config.add_subpackage('inspection/tests') config.add_subpackage('mixture') diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index effb0dcd12504..fe2f943cbdb7c 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -126,7 +126,7 @@ def fit(self, X, y, sample_weight=None): self : object Notes - ------ + ----- If X and y are not C-ordered and contiguous arrays of np.float64 and X is not a scipy.sparse.csr_matrix, X and/or y may be copied. 
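A small sketch of the sparse behaviour described in the
``PolynomialFeatures.transform`` docstring above (degree 2, so the fast
expansion for CSR input applies; the expected format and shape are noted as
comments)::

    import numpy as np
    import scipy.sparse as sp
    from sklearn.preprocessing import PolynomialFeatures

    X = sp.csr_matrix(np.arange(6, dtype=np.float64).reshape(3, 2))
    XP = PolynomialFeatures(degree=2).fit_transform(X)
    # output stays sparse; 6 columns: bias, x1, x2, x1**2, x1*x2, x2**2
    XP.format, XP.shape  # ('csr', (3, 6))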
@@ -293,7 +293,7 @@ def _sparse_fit(self, X, y, sample_weight, solver_type, kernel, if hasattr(self, "classes_"): n_class = len(self.classes_) - 1 - else: # regression + else: # regression n_class = 1 n_SV = self.support_vectors_.shape[0] @@ -540,7 +540,7 @@ def decision_function(self, X): n_classes). Notes - ------ + ----- If decision_function_shape='ovo', the function values are proportional to the distance of the samples X to the separating hyperplane. If the exact distances are required, divide the function values by the norm of diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 8f45a501ddf35..a236ba716bc0d 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -429,10 +429,10 @@ def fit(self, X, y, sample_weight=None): class SVC(BaseSVC): """C-Support Vector Classification. - The implementation is based on libsvm. The fit time complexity - is more than quadratic with the number of samples which makes it hard - to scale to datasets with more than a couple of 10000 samples. For large - datasets consider using :class:`sklearn.linear_model.LinearSVC` or + The implementation is based on libsvm. The fit time scales at least + quadratically with the number of samples and may be impractical + beyond tens of thousands of samples. For large datasets + consider using :class:`sklearn.linear_model.LinearSVC` or :class:`sklearn.linear_model.SGDClassifier` instead, possibly after a :class:`sklearn.kernel_approximation.Nystroem` transformer. diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index fc3c7f3985e28..660b38c1ae4c2 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -215,7 +215,7 @@ def test_import_all_consistency(): def test_root_import_all_completeness(): - EXCEPTIONS = ('utils', 'tests', 'base', 'setup') + EXCEPTIONS = ('utils', 'tests', 'base', 'setup', 'conftest') for _, modname, _ in pkgutil.walk_packages(path=sklearn.__path__, onerror=lambda _: None): if '.' in modname or modname.startswith('_') or modname in EXCEPTIONS: diff --git a/sklearn/tree/__init__.py b/sklearn/tree/__init__.py index e91540bed8c5f..a5ffc7585d4e4 100644 --- a/sklearn/tree/__init__.py +++ b/sklearn/tree/__init__.py @@ -11,4 +11,4 @@ __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor", "ExtraTreeClassifier", "ExtraTreeRegressor", "export_graphviz", - "plot_tree", "export_text"] + "plot_tree", "export_text"] diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index edd47845ad197..f27b42ae9c956 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1123,3 +1123,110 @@ cdef class Tree: Py_INCREF(self) arr.base = self return arr + + + def compute_partial_dependence(self, DTYPE_t[:, ::1] X, + int[::1] target_feature, + double[::1] out): + """Partial dependence of the response on the ``target_feature`` set. + + For each sample in ``X`` a tree traversal is performed. + Each traversal starts from the root with weight 1.0. + + At each non-leaf node that splits on a target feature, either + the left child or the right child is visited based on the feature + value of the current sample, and the weight is not modified. + At each non-leaf node that splits on a complementary feature, + both children are visited and the weight is multiplied by the fraction + of training samples which went to each child. + + At each leaf, the value of the node is multiplied by the current + weight (weights sum to 1 for all visited terminal nodes). 
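The traversal described above can be summarised by a short pure-Python
sketch over a hypothetical dict-based tree (the actual implementation below
works on the flat ``Node`` array with an explicit stack rather than
recursion)::

    # toy stand-in for the node array: a leaf holds a 'value'; an internal
    # node holds 'feature', 'threshold', children and the fraction of
    # training samples that went to its left child
    def partial_dependence_one_sample(node, x, target_features, weight=1.0):
        if 'value' in node:
            # leaf: accumulate value * weight
            return weight * node['value']
        if node['feature'] in target_features:
            # split on a target feature: follow one child, keep the weight
            child = ('left' if x[node['feature']] <= node['threshold']
                     else 'right')
            return partial_dependence_one_sample(node[child], x,
                                                 target_features, weight)
        # split on a complementary feature: visit both children, weighting
        # each by its fraction of the training samples
        left_w = weight * node['left_frac']
        right_w = weight * (1 - node['left_frac'])
        return (partial_dependence_one_sample(node['left'], x,
                                              target_features, left_w) +
                partial_dependence_one_sample(node['right'], x,
                                              target_features, right_w))

    tree = {'feature': 0, 'threshold': 0.5, 'left_frac': 0.25,
            'left': {'value': -1.0}, 'right': {'value': 2.0}}
    partial_dependence_one_sample(tree, [0.2], {0})     # -1.0
    partial_dependence_one_sample(tree, [0.2], set())   # 0.25*-1.0 + 0.75*2.0 = 1.25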
+ + Parameters + ---------- + X : view on 2d ndarray, shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_feature : view on 1d ndarray, shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + out : view on 1d ndarray, shape (n_samples) + The value of the partial dependence function on each grid + point. + """ + cdef: + double[::1] weight_stack = np.zeros(self.node_count, + dtype=np.float64) + SIZE_t[::1] node_idx_stack = np.zeros(self.node_count, + dtype=np.intp) + SIZE_t sample_idx + SIZE_t feature_idx + int stack_size + double left_sample_frac + double current_weight + double total_weight # used for sanity check only + Node *current_node # use a pointer to avoid copying attributes + SIZE_t current_node_idx + bint is_target_feature + SIZE_t _TREE_LEAF = TREE_LEAF # to avoid python interactions + + for sample_idx in range(X.shape[0]): + # init stacks for current sample + stack_size = 1 + node_idx_stack[0] = 0 # root node + weight_stack[0] = 1 # all the samples are in the root node + total_weight = 0 + + while stack_size > 0: + # pop the stack + stack_size -= 1 + current_node_idx = node_idx_stack[stack_size] + current_node = &self.nodes[current_node_idx] + + if current_node.left_child == _TREE_LEAF: + # leaf node + out[sample_idx] += (weight_stack[stack_size] * + self.value[current_node_idx]) + total_weight += weight_stack[stack_size] + else: + # non-leaf node + + # determine if the split feature is a target feature + is_target_feature = False + for feature_idx in range(target_feature.shape[0]): + if target_feature[feature_idx] == current_node.feature: + is_target_feature = True + break + + if is_target_feature: + # In this case, we push left or right child on stack + if X[sample_idx, feature_idx] <= current_node.threshold: + node_idx_stack[stack_size] = current_node.left_child + else: + node_idx_stack[stack_size] = current_node.right_child + stack_size += 1 + else: + # In this case, we push both children onto the stack, + # and give a weight proportional to the number of + # samples going through each branch. + + # push left child + node_idx_stack[stack_size] = current_node.left_child + left_sample_frac = ( + self.nodes[current_node.left_child].weighted_n_node_samples / + current_node.weighted_n_node_samples) + current_weight = weight_stack[stack_size] + weight_stack[stack_size] = current_weight * left_sample_frac + stack_size += 1 + + # push right child + node_idx_stack[stack_size] = current_node.right_child + weight_stack[stack_size] = ( + current_weight * (1 - left_sample_frac)) + stack_size += 1 + + # Sanity check. Should never happen. 
+ if not (0.999 < total_weight < 1.001): + raise ValueError("Total weight should be 1.0 but was %.9f" % + total_weight) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index fdbd48e75f3a9..634eb3ef84cdd 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -20,7 +20,7 @@ import numpy as np cimport numpy as np np.import_array() -from ..utils cimport _random +from ..utils._random cimport our_rand_r # ============================================================================= # Helper functions @@ -64,13 +64,13 @@ cdef inline np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size): cdef inline SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil: """Generate a random integer in [low; end).""" - return low + _random.our_rand_r(random_state) % (high - low) + return low + our_rand_r(random_state) % (high - low) cdef inline double rand_uniform(double low, double high, UINT32_t* random_state) nogil: """Generate a random double in [low; high).""" - return ((high - low) * _random.our_rand_r(random_state) / + return ((high - low) * our_rand_r(random_state) / RAND_R_MAX) + low diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py index 02aa68b8af2dc..636ef03689a79 100644 --- a/sklearn/tree/export.py +++ b/sklearn/tree/export.py @@ -839,7 +839,7 @@ def export_text(decision_tree, feature_names=None, max_depth=10, Text summary of all the rules in the decision tree. Examples - ------- + -------- >>> from sklearn.datasets import load_iris >>> from sklearn.tree import DecisionTreeClassifier diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index 65b0a201be369..eed9be7bcb5d9 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -399,9 +399,8 @@ def test_export_text(): assert export_text(reg, decimals=1, show_weights=True) == expected_report -def test_plot_tree(): +def test_plot_tree(pyplot): # mostly smoke tests - pytest.importorskip("matplotlib.pyplot") # Check correctness of export_graphviz clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2, diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index f04e43106e415..fcb03b0cecddd 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -392,7 +392,7 @@ def weighted_mode(a, w, axis=0): The value 4 appears three times: with uniform weights, the result is simply the mode of the distribution. - >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's + >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's >>> weighted_mode(x, weights) (array([2.]), array([3.5])) diff --git a/sklearn/utils/mocking.py b/sklearn/utils/mocking.py index 9c059f2ed2ed9..76ad144ccb171 100644 --- a/sklearn/utils/mocking.py +++ b/sklearn/utils/mocking.py @@ -108,7 +108,7 @@ def fit(self, X, y, **fit_params): def predict(self, T): """ Parameters - ----------- + ---------- T : indexable, length n_samples """ if self.check_X is not None: diff --git a/sklearn/utils/seq_dataset.pyx.tp b/sklearn/utils/seq_dataset.pyx.tp index f1b34c4c86bce..14f80804554db 100644 --- a/sklearn/utils/seq_dataset.pyx.tp +++ b/sklearn/utils/seq_dataset.pyx.tp @@ -45,7 +45,7 @@ import numpy as np np.import_array() -from . cimport _random +from ._random cimport our_rand_r cdef class SequentialDataset{{name}}: """Base class for datasets with sequential data access. 
@@ -155,7 +155,7 @@ cdef class SequentialDataset{{name}}: cdef int n = self.n_samples cdef unsigned i, j for i in range(n - 1): - j = i + _random.our_rand_r(&seed) % (n - i) + j = i + our_rand_r(&seed) % (n - i) ind[i], ind[j] = ind[j], ind[i] cdef int _get_next_index(self) nogil: @@ -169,7 +169,7 @@ cdef class SequentialDataset{{name}}: cdef int _get_random_index(self) nogil: cdef int n = self.n_samples - cdef int current_index = _random.our_rand_r(&self.seed) % n + cdef int current_index = our_rand_r(&self.seed) % n self.current_index = current_index return current_index diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 65bed4c7ecef8..babf0b8658b5c 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -714,28 +714,6 @@ def set_random_state(estimator, random_state=0): estimator.set_params(random_state=random_state) -def if_matplotlib(func): - """Test decorator that skips test if matplotlib not installed. - - Parameters - ---------- - func - """ - @wraps(func) - def run_test(*args, **kwargs): - try: - import matplotlib - matplotlib.use('Agg', warn=False) - # this fails if no $DISPLAY specified - import matplotlib.pyplot as plt - plt.figure() - except ImportError: - raise SkipTest('Matplotlib not available.') - else: - return func(*args, **kwargs) - return run_test - - try: import pytest @@ -1024,21 +1002,3 @@ def assert_run_python_script(source_code, timeout=60): % e.output.decode('utf-8')) finally: os.unlink(source_file) - - -def close_figure(fig=None): - """Close a matplotlibt figure. - - Parameters - ---------- - fig : int or str or Figure, optional (default=None) - The figure, figure number or figure name to close. If ``None``, all - current figures are closed. - """ - from matplotlib.pyplot import get_fignums, close as _close # noqa - - if fig is None: - for fig in get_fignums(): - _close(fig) - else: - _close(fig)
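With ``if_matplotlib`` and ``close_figure`` removed, matplotlib-dependent
tests follow the fixture pattern used by ``test_plot_tree`` above. A minimal
sketch, assuming the ``pyplot`` fixture is provided by the project's pytest
conftest and skips the test when matplotlib is unavailable::

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier, plot_tree

    def test_small_tree_plot(pyplot):  # hypothetical test using the fixture
        # the fixture is assumed to handle figure cleanup after the test
        X, y = load_iris(return_X_y=True)
        clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
        plot_tree(clf)  # smoke test: should not raise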