diff --git a/setup.cfg b/setup.cfg index c854deb9d821b..09c5c9829ae21 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,6 +8,9 @@ addopts = --doctest-modules --disable-pytest-warnings -rs +filterwarnings = + error::DeprecationWarning + error::FutureWarning [wheelhouse_uploader] artifact_indexes= diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index b3056b95d225c..9f79e2e759d34 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -7,6 +7,7 @@ # License: BSD 3 clause from tempfile import mkdtemp import shutil +import pytest from functools import partial import numpy as np @@ -142,6 +143,8 @@ def test_agglomerative_clustering_wrong_arg_memory(): assert_raises(ValueError, clustering.fit, X) +@pytest.mark.filterwarnings("ignore:the behavior of nmi will " + "change in version 0.22") def test_agglomerative_clustering(): # Check that we obtain the correct number of clusters with # agglomerative clustering. @@ -250,6 +253,8 @@ def test_ward_agglomeration(): assert_raises(ValueError, agglo.fit, X[:0]) +@pytest.mark.filterwarnings("ignore:the behavior of nmi will " + "change in version 0.22") def test_single_linkage_clustering(): # Check that we get the correct result in two emblematic cases moons, moon_labels = make_moons(noise=0.05, random_state=42) @@ -311,6 +316,8 @@ def test_scikit_vs_scipy(): assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves) +@pytest.mark.filterwarnings("ignore:the behavior of nmi will " + "change in version 0.22") def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix diff --git a/sklearn/covariance/tests/test_graphical_lasso.py b/sklearn/covariance/tests/test_graphical_lasso.py index d476cc52373d2..76bcd302a7627 100644 --- a/sklearn/covariance/tests/test_graphical_lasso.py +++ b/sklearn/covariance/tests/test_graphical_lasso.py @@ -4,6 +4,7 @@ import numpy as np from scipy import linalg +import pytest from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_less @@ -151,6 +152,5 @@ def test_deprecated_grid_scores(random_state=1): "0.19 and will be removed in 0.21. 
Use " "``grid_scores_`` instead") - assert_warns_message(DeprecationWarning, depr_message, - lambda: graphical_lasso.grid_scores) - assert_equal(graphical_lasso.grid_scores, graphical_lasso.grid_scores_) + with pytest.warns(DeprecationWarning, match=depr_message): + assert_equal(graphical_lasso.grid_scores, graphical_lasso.grid_scores_) diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 831af46e46130..734e3913cdb3f 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -58,6 +58,7 @@ def test_dict_learning_overcomplete(): assert_true(dico.components_.shape == (n_components, n_features)) +@ignore_warnings(category=DeprecationWarning) # positive lars deprecated 0.22 @pytest.mark.parametrize("transform_algorithm", [ "lasso_lars", "lasso_cd", @@ -170,6 +171,7 @@ def test_dict_learning_online_shapes(): assert_equal(np.dot(code, dictionary).shape, X.shape) +@ignore_warnings(category=DeprecationWarning) # positive lars deprecated 0.22 @pytest.mark.parametrize("transform_algorithm", [ "lasso_lars", "lasso_cd", @@ -306,6 +308,7 @@ def test_sparse_encode_shapes(): assert_equal(code.shape, (n_samples, n_components)) +@ignore_warnings(category=DeprecationWarning) # positive lars deprecated 0.22 @pytest.mark.parametrize("positive", [ False, True, diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 63281ce33dd1e..b0f2c5aeae52a 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -1,9 +1,10 @@ import numpy as np import scipy.sparse as sp +import pytest from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises) + assert_raises, ignore_warnings) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles @@ -172,6 +173,7 @@ def test_kernel_pca_invalid_kernel(): assert_raises(ValueError, kpca.fit, X_fit) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_gridsearch_pipeline(): # Test if we can do a grid-search to find parameters to separate # circles with a perceptron model. @@ -186,6 +188,7 @@ def test_gridsearch_pipeline(): assert_equal(grid_search.best_score_, 1) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_gridsearch_pipeline_precomputed(): # Test if we can do a grid-search to find parameters to separate # circles with a perceptron model using a precomputed kernel. diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 5bf6ac337d25a..5a9c3cac00edc 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -944,16 +944,17 @@ class labels (multi-output problem). >>> X, y = make_classification(n_samples=1000, n_features=4, ... n_informative=2, n_redundant=0, ... random_state=0, shuffle=False) - >>> clf = RandomForestClassifier(max_depth=2, random_state=0) + >>> clf = RandomForestClassifier(n_estimators=100, max_depth=2, + ... 
random_state=0) >>> clf.fit(X, y) RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, - min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, + min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, oob_score=False, random_state=0, verbose=0, warm_start=False) >>> print(clf.feature_importances_) - [0.17287856 0.80608704 0.01884792 0.00218648] + [0.14205973 0.76664038 0.0282433 0.06305659] >>> print(clf.predict([[0, 0, 0, 0]])) [1] @@ -1188,18 +1189,19 @@ class RandomForestRegressor(ForestRegressor): >>> >>> X, y = make_regression(n_features=4, n_informative=2, ... random_state=0, shuffle=False) - >>> regr = RandomForestRegressor(max_depth=2, random_state=0) + >>> regr = RandomForestRegressor(max_depth=2, random_state=0, + ... n_estimators=100) >>> regr.fit(X, y) RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, - min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, + min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, oob_score=False, random_state=0, verbose=0, warm_start=False) >>> print(regr.feature_importances_) - [0.17339552 0.81594114 0. 0.01066333] + [0.18146984 0.81473937 0.00145312 0.00233767] >>> print(regr.predict([[0, 0, 0, 0]])) - [-2.50699856] + [-8.32987858] Notes ----- diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 626b34f58e5a6..396bda20159f2 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -5,6 +5,7 @@ # Author: Gilles Louppe # License: BSD 3 clause +import pytest import numpy as np from sklearn.base import BaseEstimator @@ -33,7 +34,7 @@ from sklearn.model_selection import train_test_split from sklearn.datasets import load_boston, load_iris, make_hastie_10_2 from sklearn.utils import check_random_state -from sklearn.preprocessing import Imputer +from sklearn.preprocessing import FunctionTransformer from scipy.sparse import csc_matrix, csr_matrix @@ -496,6 +497,7 @@ def test_parallel_regression(): assert_array_almost_equal(y1, y3) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_gridsearch(): # Check that bagging ensembles can be grid-searched. 
# Transform iris into a binary classification task @@ -755,6 +757,12 @@ def test_set_oob_score_label_encoding(): assert_equal([x1, x2], [x3, x3]) +def replace(X): + X = X.copy().astype('float') + X[~np.isfinite(X)] = 0 + return X + + def test_bagging_regressor_with_missing_inputs(): # Check that BaggingRegressor can accept X with missing/infinite data X = np.array([ @@ -777,9 +785,7 @@ def test_bagging_regressor_with_missing_inputs(): for y in y_values: regressor = DecisionTreeRegressor() pipeline = make_pipeline( - Imputer(), - Imputer(missing_values=np.inf), - Imputer(missing_values=np.NINF), + FunctionTransformer(replace, validate=False), regressor ) pipeline.fit(X, y).predict(X) @@ -807,9 +813,7 @@ def test_bagging_classifier_with_missing_inputs(): y = np.array([3, 6, 6, 6, 6]) classifier = DecisionTreeClassifier() pipeline = make_pipeline( - Imputer(), - Imputer(missing_values=np.inf), - Imputer(missing_values=np.NINF), + FunctionTransformer(replace, validate=False), classifier ) pipeline.fit(X, y).predict(X) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 6ed0769d005a7..cd7626b747599 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -439,13 +439,13 @@ def check_oob_score_raise_error(name): def test_oob_score_raise_error(name): check_oob_score_raise_error(name) - def check_gridsearch(name): forest = FOREST_CLASSIFIERS[name]() clf = GridSearchCV(forest, {'n_estimators': (1, 2), 'max_depth': (1, 2)}) clf.fit(iris.data, iris.target) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 @pytest.mark.parametrize('name', FOREST_CLASSIFIERS) def test_gridsearch(name): # Check that base trees can be grid-searched. diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index cec7efc46f03b..5bdb563199ebf 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -1,6 +1,7 @@ """ Testing for the partial dependence module. """ +import pytest import numpy as np from numpy.testing import assert_array_equal @@ -103,6 +104,8 @@ def test_partial_dependecy_input(): assert_raises(ValueError, partial_dependence, clf, [0], grid=grid) +@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') +# matplotlib Python3.7 warning @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. @@ -135,6 +138,8 @@ def test_plot_partial_dependence(): assert all(ax.has_data for ax in axs) +@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') +# matplotlib Python3.7 warning @if_matplotlib def test_plot_partial_dependence_input(): # Test partial dependence plot function input checks. @@ -170,6 +175,8 @@ def test_plot_partial_dependence_input(): clf, X, [{'foo': 'bar'}]) +@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') +# matplotlib Python3.7 warning @if_matplotlib def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. 
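Note on the mechanism used throughout this patch: the filterwarnings = error::DeprecationWarning / error::FutureWarning entries added to setup.cfg make pytest escalate those warnings into test failures, while the per-test pytest.mark.filterwarnings marks downgrade specific messages back to ignored. A minimal sketch of the interplay (the test name and warning text here are hypothetical; the string after "ignore:" is a regex matched against the start of the message):

    import warnings
    import pytest

    @pytest.mark.filterwarnings('ignore: The default of the `iid`')
    def test_iid_default_unchanged():
        # Without the mark above, the session-wide error::DeprecationWarning
        # filter from setup.cfg would turn this warning into a failure.
        warnings.warn("The default of the `iid` parameter will change from "
                      "True to False in version 0.22", DeprecationWarning)
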
diff --git a/sklearn/ensemble/tests/test_voting_classifier.py b/sklearn/ensemble/tests/test_voting_classifier.py index 5172c4391c633..f5bfdbd101beb 100644 --- a/sklearn/ensemble/tests/test_voting_classifier.py +++ b/sklearn/ensemble/tests/test_voting_classifier.py @@ -371,10 +371,12 @@ def test_set_estimator_none(): X1 = np.array([[1], [2]]) y1 = np.array([1, 2]) eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], - voting='soft', weights=[0, 0.5]).fit(X1, y1) + voting='soft', weights=[0, 0.5], + flatten_transform=False).fit(X1, y1) eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], - voting='soft', weights=[1, 0.5]) + voting='soft', weights=[1, 0.5], + flatten_transform=False) eclf2.set_params(rf=None).fit(X1, y1) assert_array_almost_equal(eclf1.transform(X1), np.array([[[0.7, 0.3], [0.3, 0.7]], diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 6b62d87d94d18..87972c67bfb14 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -4,7 +4,7 @@ import numpy as np from sklearn.utils.testing import assert_array_equal, assert_array_less -from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_array_almost_equal, ignore_warnings from sklearn.utils.testing import assert_equal, assert_true, assert_greater from sklearn.utils.testing import assert_raises, assert_raises_regexp @@ -196,6 +196,7 @@ def test_staged_predict(): assert_array_almost_equal(score, staged_scores[-1]) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_gridsearch(): # Check that base trees can be grid-searched. # AdaBoost classification diff --git a/sklearn/ensemble/voting_classifier.py b/sklearn/ensemble/voting_classifier.py index caa821efc6deb..7ce8bcd80aa46 100644 --- a/sklearn/ensemble/voting_classifier.py +++ b/sklearn/ensemble/voting_classifier.py @@ -91,7 +91,7 @@ class VotingClassifier(_BaseComposition, ClassifierMixin, TransformerMixin): >>> from sklearn.naive_bayes import GaussianNB >>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier >>> clf1 = LogisticRegression(random_state=1) - >>> clf2 = RandomForestClassifier(random_state=1) + >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1) >>> clf3 = GaussianNB() >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) >>> y = np.array([1, 1, 1, 2, 2, 2]) diff --git a/sklearn/feature_extraction/tests/test_image.py b/sklearn/feature_extraction/tests/test_image.py index dc93679806008..bf59f7802d424 100644 --- a/sklearn/feature_extraction/tests/test_image.py +++ b/sklearn/feature_extraction/tests/test_image.py @@ -11,7 +11,8 @@ from sklearn.feature_extraction.image import ( img_to_graph, grid_to_graph, extract_patches_2d, reconstruct_from_patches_2d, PatchExtractor, extract_patches) -from sklearn.utils.testing import assert_equal, assert_true, assert_raises +from sklearn.utils.testing import (assert_equal, assert_true, assert_raises, + ignore_warnings) def test_img_to_graph(): @@ -55,6 +56,7 @@ def test_grid_to_graph(): assert_true(A.dtype == np.float64) +@ignore_warnings(category=DeprecationWarning) # scipy deprecation inside face def test_connect_regions(): try: face = sp.face(gray=True) @@ -67,7 +69,7 @@ def test_connect_regions(): graph = img_to_graph(face, mask) assert_equal(ndimage.label(mask)[1], connected_components(graph)[0]) - +@ignore_warnings(category=DeprecationWarning) # scipy deprecation 
inside face def test_connect_regions_with_grid(): try: face = sp.face(gray=True) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 9a9284d30f796..8bfb6537962af 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -730,6 +730,7 @@ def test_vectorizer_inverse_transform(Vectorizer): assert_array_equal(np.sort(terms), np.sort(terms2)) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_count_vectorizer_pipeline_grid_selection(): # raw documents data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS @@ -766,6 +767,7 @@ def test_count_vectorizer_pipeline_grid_selection(): assert_equal(best_vectorizer.ngram_range, (1, 1)) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_vectorizer_pipeline_grid_selection(): # raw documents data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 24cf355804b29..9df27af2666cd 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -14,7 +14,7 @@ from __future__ import unicode_literals, division import array -from collections import Mapping, defaultdict +from collections import defaultdict import numbers from operator import itemgetter import re @@ -32,6 +32,8 @@ from .stop_words import ENGLISH_STOP_WORDS from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES from ..utils.fixes import sp_version +from ..utils.fixes import _Mapping as Mapping # noqa + __all__ = ['CountVectorizer', 'ENGLISH_STOP_WORDS', diff --git a/sklearn/feature_selection/base.py b/sklearn/feature_selection/base.py index 3067d6ef31bc1..5bb0b3ea890c3 100644 --- a/sklearn/feature_selection/base.py +++ b/sklearn/feature_selection/base.py @@ -72,7 +72,7 @@ def transform(self, X): X_r : array of shape [n_samples, n_selected_features] The input samples with only the selected features. 
""" - X = check_array(X, accept_sparse='csr') + X = check_array(X, dtype=None, accept_sparse='csr') mask = self.get_support() if not mask.any(): warn("No features were selected: either the data is" @@ -111,7 +111,7 @@ def inverse_transform(self, X): return Xt support = self.get_support() - X = check_array(X) + X = check_array(X, dtype=None) if support.sum() != X.shape[1]: raise ValueError("X has a different shape than during fitting.") diff --git a/sklearn/linear_model/passive_aggressive.py b/sklearn/linear_model/passive_aggressive.py index 6ad331772d96c..7fd15d171024d 100644 --- a/sklearn/linear_model/passive_aggressive.py +++ b/sklearn/linear_model/passive_aggressive.py @@ -137,17 +137,17 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): >>> from sklearn.datasets import make_classification >>> >>> X, y = make_classification(n_features=4, random_state=0) - >>> clf = PassiveAggressiveClassifier(random_state=0) + >>> clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0) >>> clf.fit(X, y) PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None, early_stopping=False, fit_intercept=True, loss='hinge', - max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=1, + max_iter=1000, n_iter=None, n_iter_no_change=5, n_jobs=1, random_state=0, shuffle=True, tol=None, validation_fraction=0.1, verbose=0, warm_start=False) >>> print(clf.coef_) - [[0.49324685 1.0552176 1.49519589 1.33798314]] + [[0.29509834 0.33711843 0.56127352 0.60105546]] >>> print(clf.intercept_) - [2.18438388] + [2.54153383] >>> print(clf.predict([[0, 0, 0, 0]])) [1] @@ -375,11 +375,11 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): >>> from sklearn.datasets import make_regression >>> >>> X, y = make_regression(n_features=4, random_state=0) - >>> regr = PassiveAggressiveRegressor(random_state=0) + >>> regr = PassiveAggressiveRegressor(max_iter=100, random_state=0) >>> regr.fit(X, y) PassiveAggressiveRegressor(C=1.0, average=False, early_stopping=False, epsilon=0.1, fit_intercept=True, loss='epsilon_insensitive', - max_iter=None, n_iter=None, n_iter_no_change=5, + max_iter=100, n_iter=None, n_iter_no_change=5, random_state=0, shuffle=True, tol=None, validation_fraction=0.1, verbose=0, warm_start=False) >>> print(regr.coef_) diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py index eec1c2551784e..f75a59db5e760 100644 --- a/sklearn/linear_model/randomized_l1.py +++ b/sklearn/linear_model/randomized_l1.py @@ -296,7 +296,7 @@ class RandomizedLasso(BaseRandomizedLinearModel): Examples -------- >>> from sklearn.linear_model import RandomizedLasso - >>> randomized_lasso = RandomizedLasso() + >>> randomized_lasso = RandomizedLasso() # doctest: +SKIP References ---------- @@ -490,7 +490,7 @@ class RandomizedLogisticRegression(BaseRandomizedLinearModel): Examples -------- >>> from sklearn.linear_model import RandomizedLogisticRegression - >>> randomized_logistic = RandomizedLogisticRegression() + >>> randomized_logistic = RandomizedLogisticRegression() # doctest: +SKIP References ---------- diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 80778132bb242..d3f845b4442b6 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -59,7 +59,8 @@ def _mv(x): # w = X.T * inv(X X^t + alpha*Id) y C = sp_linalg.LinearOperator( (n_samples, n_samples), matvec=mv, dtype=X.dtype) - coef, info = sp_linalg.cg(C, y_column, tol=tol) + # FIXME atol + coef, info = sp_linalg.cg(C, y_column, tol=tol, atol='legacy') coefs[i] = 
X1.rmatvec(coef) else: # linear ridge @@ -67,8 +68,9 @@ def _mv(x): y_column = X1.rmatvec(y_column) C = sp_linalg.LinearOperator( (n_features, n_features), matvec=mv, dtype=X.dtype) + # FIXME atol coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter, - tol=tol) + tol=tol, atol='legacy') if info < 0: raise ValueError("Failed with error code %d" % info) diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index 8f5e1761468fa..20107c233d670 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -921,12 +921,12 @@ class SGDClassifier(BaseSGDClassifier): >>> from sklearn import linear_model >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) >>> Y = np.array([1, 1, 2, 2]) - >>> clf = linear_model.SGDClassifier() + >>> clf = linear_model.SGDClassifier(max_iter=1000) >>> clf.fit(X, Y) ... #doctest: +NORMALIZE_WHITESPACE SGDClassifier(alpha=0.0001, average=False, class_weight=None, early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True, - l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None, + l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None, n_iter_no_change=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, tol=None, validation_fraction=0.1, verbose=0, warm_start=False) @@ -1524,12 +1524,12 @@ class SGDRegressor(BaseSGDRegressor): >>> np.random.seed(0) >>> y = np.random.randn(n_samples) >>> X = np.random.randn(n_samples, n_features) - >>> clf = linear_model.SGDRegressor() + >>> clf = linear_model.SGDRegressor(max_iter=1000) >>> clf.fit(X, y) ... #doctest: +NORMALIZE_WHITESPACE SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15, - learning_rate='invscaling', loss='squared_loss', max_iter=None, + learning_rate='invscaling', loss='squared_loss', max_iter=1000, n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None, shuffle=True, tol=None, validation_fraction=0.1, verbose=0, warm_start=False) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 630559fe4fef2..f5e5b2222d244 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -88,16 +88,18 @@ def test_all_precomputed(): assert_array_almost_equal(expected, got) +@pytest.mark.filterwarnings('ignore: `rcond` parameter will change') # numpy deprecation def test_lars_lstsq(): # Test that Lars gives least square solution at the end # of the path X1 = 3 * diabetes.data # use un-normalized dataset clf = linear_model.LassoLars(alpha=0.) 
clf.fit(X1, y) - coef_lstsq = np.linalg.lstsq(X1, y)[0] + coef_lstsq = np.linalg.lstsq(X1, y, rcond=None)[0] assert_array_almost_equal(clf.coef_, coef_lstsq) +@pytest.mark.filterwarnings('ignore:`rcond` parameter will change') # numpy deprecation def test_lasso_gives_lstsq_solution(): # Test that Lars Lasso gives least square solution at the end # of the path @@ -473,6 +475,7 @@ def test_lars_path_readonly_data(): _lars_path_residues(X_train, y_train, X_test, y_test, copy=False) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_lars_path_positive_constraint(): # this is the main test for the positive parameter on the lars_path method # the estimator classes just make use of this function @@ -487,12 +490,10 @@ def test_lars_path_positive_constraint(): # assert_raises(ValueError, linear_model.lars_path, diabetes['data'], # diabetes['target'], method='lar', positive=True) - with warnings.catch_warnings(record=True) as w: + with pytest.warns(DeprecationWarning, match="broken"): linear_model.lars_path(diabetes['data'], diabetes['target'], return_path=True, method='lar', positive=True) - assert_true(len(w) == 1) - assert "broken" in str(w[0].message) method = 'lasso' alpha, active, coefs = \ diff --git a/sklearn/linear_model/tests/test_randomized_l1.py b/sklearn/linear_model/tests/test_randomized_l1.py index c920ee2e6619a..077b440792d7e 100644 --- a/sklearn/linear_model/tests/test_randomized_l1.py +++ b/sklearn/linear_model/tests/test_randomized_l1.py @@ -1,5 +1,7 @@ # Authors: Alexandre Gramfort # License: BSD 3 clause + +import pytest from tempfile import mkdtemp import shutil @@ -111,6 +113,7 @@ def test_randomized_lasso(): assert_raises(ValueError, clf.fit, X, y) +@ignore_warnings(category=DeprecationWarning) def test_randomized_lasso_precompute(): # Check randomized lasso for different values of precompute n_resampling = 20 diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 2ea65f8498f8d..fc0c0a943b703 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -489,6 +489,7 @@ def check_dense_sparse(test_func): assert_array_almost_equal(ret_dense, ret_sparse, decimal=3) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 @pytest.mark.parametrize( 'test_func', (_test_ridge_loo, _test_ridge_cv, _test_ridge_cv_normalize, @@ -640,6 +641,7 @@ def test_ridge_classifier_cv_store_cv_values(): assert r.cv_values_.shape == (n_samples, n_targets, n_alphas) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_ridgecv_sample_weight(): rng = np.random.RandomState(0) alphas = (0.1, 1.0, 10.0) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index bc32b58c6e7ff..d236c17e5dbb5 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -86,6 +86,8 @@ def test_sparse_graph_connected_component(): assert_array_equal(component_1, component_2) +@pytest.mark.filterwarnings("ignore:the behavior of nmi will " + "change in version 0.22") def test_spectral_embedding_two_components(seed=36): # Test spectral embedding with two components random_state = np.random.RandomState(seed) @@ -180,6 +182,8 @@ def test_spectral_embedding_amg_solver(seed=36): assert_true(_check_with_col_sign_flipping(embed_amg, embed_arpack, 0.05)) +@pytest.mark.filterwarnings("ignore:the behavior of nmi will " + "change in version 0.22") def 
test_pipeline_spectral_clustering(seed=36): # Test using pipeline to do spectral clustering random_state = np.random.RandomState(seed) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 13addf29fdc00..2f084c4fced90 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -701,14 +701,17 @@ def adjusted_mutual_info_score(labels_true, labels_pred, >>> from sklearn.metrics.cluster import adjusted_mutual_info_score >>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1]) + ... # doctest: +SKIP 1.0 >>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]) + ... # doctest: +SKIP 1.0 If classes members are completely split across different clusters, the assignment is totally in-complete, hence the AMI is null:: >>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3]) + ... # doctest: +SKIP 0.0 References @@ -820,14 +823,17 @@ def normalized_mutual_info_score(labels_true, labels_pred, >>> from sklearn.metrics.cluster import normalized_mutual_info_score >>> normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1]) + ... # doctest: +SKIP 1.0 >>> normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]) + ... # doctest: +SKIP 1.0 If classes members are completely split across different clusters, the assignment is totally in-complete, hence the NMI is null:: >>> normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3]) + ... # doctest: +SKIP 0.0 """ diff --git a/sklearn/metrics/cluster/tests/test_common.py b/sklearn/metrics/cluster/tests/test_common.py index a7e54d22cc7c8..a83987703b2e7 100644 --- a/sklearn/metrics/cluster/tests/test_common.py +++ b/sklearn/metrics/cluster/tests/test_common.py @@ -15,7 +15,7 @@ from sklearn.metrics.cluster import calinski_harabaz_score from sklearn.metrics.cluster import davies_bouldin_score -from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_allclose, ignore_warnings # Dictionaries of metrics @@ -83,6 +83,8 @@ def test_symmetric_non_symmetric_union(): sorted(SUPERVISED_METRICS)) +@ignore_warnings(category=FutureWarning) +# 0.22 AMI and NMI changes @pytest.mark.parametrize( 'metric_name, y1, y2', [(name, y1, y2) for name in SYMMETRIC_METRICS] @@ -101,6 +103,8 @@ def test_non_symmetry(metric_name, y1, y2): assert metric(y1, y2) != pytest.approx(metric(y2, y1)) +@ignore_warnings(category=FutureWarning) +# 0.22 AMI and NMI changes @pytest.mark.parametrize("metric_name", NORMALIZED_METRICS) def test_normalized_output(metric_name): upper_bound_1 = [0, 0, 0, 1, 1, 1] @@ -121,6 +125,8 @@ def test_normalized_output(metric_name): # All clustering metrics do not change score due to permutations of labels # that is when 0 and 1 exchanged.
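Note: the warning-assertion rewrites in this patch (the ``grid_scores_`` hunk in test_graphical_lasso.py and the lars_path hunk in test_least_angle.py) converge on the same idiom, sketched here with a hypothetical deprecated function; `match` is a regular expression searched in the warning message:

    import warnings
    import pytest

    def legacy_lars_call():
        # Stand-in for an API that warns on a deprecated code path.
        warnings.warn("positive lars is broken and deprecated",
                      DeprecationWarning)

    with pytest.warns(DeprecationWarning, match="broken"):
        legacy_lars_call()
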
+@ignore_warnings(category=FutureWarning) +# 0.22 AMI and NMI changes @pytest.mark.parametrize( "metric_name", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) @@ -142,6 +148,8 @@ def test_permute_labels(metric_name): # For all clustering metrics Input parameters can be both +@ignore_warnings(category=FutureWarning) +# 0.22 AMI and NMI changes @pytest.mark.parametrize( "metric_name", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index d7915eab60973..33406e35ae893 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -430,6 +430,7 @@ def test_auc(): assert_array_almost_equal(auc(x, y), 0.5) +@pytest.mark.filterwarnings("ignore: The 'reorder' parameter") # 0.22 def test_auc_duplicate_values(): # Test Area Under Curve (AUC) computation with duplicate values @@ -437,6 +438,8 @@ # from numpy.argsort(x), which was reordering the tied 0's in this example # and resulting in an incorrect area computation. This test detects the # error. + + # Note: auc will no longer reorder once the deprecated 'reorder' + # parameter is removed, so this effectively becomes a regression test. x = [-2.0, 0.0, 0.0, 0.0, 1.0] y1 = [2.0, 0.0, 0.5, 1.0, 1.0] y2 = [2.0, 1.0, 0.0, 0.5, 1.0] diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 6ce2955a127de..33a0fc110939b 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -413,6 +413,8 @@ def test_thresholded_scorers_multilabel_indicator_data(): assert_almost_equal(score1, score2) +@pytest.mark.filterwarnings("ignore:the behavior of ") +# AMI and NMI changes for 0.22 def test_supervised_cluster_scorers(): # Test clustering scorers against gold standard labeling. X, y = make_blobs(random_state=0, centers=2) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index dac44360ace37..effee28cb20ca 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -938,8 +938,8 @@ class GridSearchCV(BaseSearchCV): >>> iris = datasets.load_iris() >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} >>> svc = svm.SVC(gamma="scale") - >>> clf = GridSearchCV(svc, parameters) - >>> clf.fit(iris.data, iris.target) + >>> clf = GridSearchCV(svc, parameters) # doctest: +SKIP + >>> clf.fit(iris.data, iris.target) # doctest: +SKIP ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS GridSearchCV(cv=None, error_score=..., estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., @@ -950,7 +950,7 @@ fit_params=None, iid=..., n_jobs=1, param_grid=..., pre_dispatch=..., refit=..., return_train_score=..., scoring=..., verbose=...) - >>> sorted(clf.cv_results_.keys()) + >>> sorted(clf.cv_results_.keys()) # doctest: +SKIP ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS ['mean_fit_time', 'mean_score_time', 'mean_test_score',... 'mean_train_score', 'param_C', 'param_kernel', 'params',... diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index fd1e83d55dcb7..5a46ca0bf7929 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -185,7 +185,8 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, (please refer the ``scoring`` parameter doc for more information) >>> scores = cross_validate(lasso, X, y, - ... scoring=('r2', 'neg_mean_squared_error')) + ...
scoring=('r2', 'neg_mean_squared_error'), + ... return_train_score=True) >>> print(scores['test_neg_mean_squared_error']) # doctest: +ELLIPSIS [-3635.5... -3573.3... -6114.7...] >>> print(scores['train_r2']) # doctest: +ELLIPSIS diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 91d5a9fd841f0..9bd4c475d1908 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -7,6 +7,7 @@ import sys from types import GeneratorType import re +import warnings import numpy as np import scipy.sparse as sp @@ -177,6 +178,7 @@ def test_parameter_grid(): assert_grid_iter_equals_getitem(has_empty) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search(): # Test that the best estimator contains the right value for foo_param clf = MockClassifier() @@ -220,14 +222,19 @@ def check_hyperparameter_searcher_with_fit_params(klass, **klass_kwargs): searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_with_fit_params(): - check_hyperparameter_searcher_with_fit_params(GridSearchCV) + check_hyperparameter_searcher_with_fit_params(GridSearchCV, + error_score='raise') +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_random_search_with_fit_params(): - check_hyperparameter_searcher_with_fit_params(RandomizedSearchCV, n_iter=1) + check_hyperparameter_searcher_with_fit_params(RandomizedSearchCV, n_iter=1, + error_score='raise') +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_fit_params_deprecation(): # NOTE: Remove this test in v0.21 @@ -241,6 +248,7 @@ def test_grid_search_fit_params_deprecation(): assert_warns(DeprecationWarning, grid_search.fit, X, y) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_fit_params_two_places(): # NOTE: Remove this test in v0.21 @@ -264,10 +272,16 @@ def test_grid_search_fit_params_two_places(): # Verify that `fit` prefers its own kwargs by giving valid # kwargs in the constructor and invalid in the method call - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, - fit_params={'spam': np.ones(10)}) - assert_raise_message(AssertionError, "Fit parameter spam has length 1", - grid_search.fit, X, y, spam=np.ones(1)) + with warnings.catch_warnings(): + # JvR: As passing fit params to the constructor is deprecated, this + # unit test raises a warning (unit test can be removed after version + # 0.22) + warnings.filterwarnings("ignore", category=DeprecationWarning) + grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, + fit_params={'spam': np.ones(10)}, + error_score='raise') + assert_raise_message(AssertionError, "Fit parameter spam has length 1", + grid_search.fit, X, y, spam=np.ones(1)) @ignore_warnings @@ -296,6 +310,7 @@ def test_grid_search_no_score(): [[1]]) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_score_method(): X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, random_state=0) @@ -305,7 +320,8 @@ def test_grid_search_score_method(): search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y) search_accuracy = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y) search_no_score_method_auc = GridSearchCV(LinearSVCNoScore(), grid, - scoring='roc_auc').fit(X, y) + scoring='roc_auc' + ).fit(X, y) search_auc = GridSearchCV(clf, grid, 
scoring='roc_auc').fit(X, y) # Check warning only occurs in situation where behavior changed: @@ -324,6 +340,7 @@ def test_grid_search_score_method(): assert_almost_equal(score_auc, score_no_score_auc) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_groups(): # Check if ValueError (when groups is None) propagates to GridSearchCV # And also check if groups is correctly passed to the cv object @@ -388,6 +405,7 @@ def test_return_train_score_warn(): assert_no_warnings(result['warn'].get, key) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_classes__property(): # Test that classes_ property matches best_estimator_.classes_ X = np.arange(100).reshape(10, 10) @@ -415,6 +433,7 @@ def test_classes__property(): assert_false(hasattr(grid_search, 'classes_')) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_trivial_cv_results_attr(): # Test search over a "grid" with only one point. clf = MockClassifier() @@ -427,6 +446,7 @@ def test_trivial_cv_results_attr(): assert_true(hasattr(grid_search, "cv_results_")) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_no_refit(): # Test that GSCV can be used for model selection alone without refitting clf = MockClassifier() @@ -452,10 +472,12 @@ def test_no_refit(): "parameter refit must be set to a scorer key", GridSearchCV(clf, {}, refit=refit, scoring={'acc': 'accuracy', - 'prec': 'precision'}).fit, + 'prec': 'precision'} + ).fit, X, y) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_error(): # Test that grid search will capture errors on data with different length X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) @@ -465,11 +487,12 @@ def test_grid_search_error(): assert_raises(ValueError, cv.fit, X_[:180], y_) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_one_grid_point(): X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} - clf = SVC() + clf = SVC(gamma='auto') cv = GridSearchCV(clf, param_dict) cv.fit(X_, y_) @@ -479,6 +502,7 @@ def test_grid_search_one_grid_point(): assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_when_param_grid_includes_range(): # Test that the best estimator contains the right value for foo_param clf = MockClassifier() @@ -491,9 +515,10 @@ def test_grid_search_when_param_grid_includes_range(): assert_equal(grid_search.best_estimator_.foo_param, 2) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_bad_param_grid(): param_dict = {"C": 1.0} - clf = SVC() + clf = SVC(gamma='auto') assert_raise_message( ValueError, "Parameter values for parameter (C) need to be a sequence" @@ -508,7 +533,7 @@ def test_grid_search_bad_param_grid(): GridSearchCV, clf, param_dict) param_dict = {"C": "1,2,3"} - clf = SVC() + clf = SVC(gamma='auto') assert_raise_message( ValueError, "Parameter values for parameter (C) need to be a sequence" @@ -520,6 +545,7 @@ def test_grid_search_bad_param_grid(): assert_raises(ValueError, GridSearchCV, clf, param_dict) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_sparse(): # Test that grid search works with both dense and sparse matrices X_, y_ = make_classification(n_samples=200, 
n_features=100, random_state=0) @@ -541,6 +567,7 @@ def test_grid_search_sparse(): assert_equal(C, C2) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_sparse_scoring(): X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) @@ -576,6 +603,7 @@ def f1_loss(y_true_, y_pred_): assert_array_equal(y_pred, y_pred3) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_precomputed_kernel(): # Test that grid search works when the input features are given in the # form of a precomputed kernel matrix @@ -604,6 +632,7 @@ def test_grid_search_precomputed_kernel(): assert_raises(ValueError, cv.fit, K_train.tolist(), y_train) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_precomputed_kernel_error_nonsquare(): # Test that grid search returns an error with a non-square precomputed # training kernel matrix @@ -641,6 +670,7 @@ def test_refit(): clf.fit(X, y) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_gridsearch_nd(): # Pass X as list in GridSearchCV X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) @@ -653,6 +683,7 @@ def test_gridsearch_nd(): assert_true(hasattr(grid_search, "cv_results_")) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_X_as_list(): # Pass X as list in GridSearchCV X = np.arange(100).reshape(10, 10) @@ -665,6 +696,7 @@ def test_X_as_list(): assert_true(hasattr(grid_search, "cv_results_")) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_y_as_list(): # Pass y as list in GridSearchCV X = np.arange(100).reshape(10, 10) @@ -708,6 +740,7 @@ def check_series(x): assert_true(hasattr(grid_search, "cv_results_")) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_unsupervised_grid_search(): # test grid-search with unsupervised estimator X, y = make_blobs(random_state=0) @@ -734,6 +767,7 @@ def test_unsupervised_grid_search(): assert_equal(grid_search.best_params_["n_clusters"], 4) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_gridsearch_no_predict(): # test grid-search with an estimator without predict. 
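Note: besides filtering, the other recurring fix in this file is to spell out defaults that are about to change. An illustrative combination of the parameters pinned in these hunks, written against the 0.20-era API (`iid` and a string-valued `gamma` are the names in use at this point):

    from sklearn.datasets import load_iris
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    # Explicit gamma, iid and return_train_score avoid the deprecation
    # warnings that setup.cfg would otherwise escalate to errors.
    search = GridSearchCV(SVC(gamma='auto'), param_grid={'C': [1, 10]},
                          iid=False, return_train_score=True, cv=3)
    search.fit(X, y)
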
# slight duplication of a test from KDE @@ -822,7 +856,7 @@ def test_grid_search_cv_results(): for iid in (False, True): search = GridSearchCV(SVC(gamma='scale'), cv=n_splits, iid=iid, - param_grid=params) + param_grid=params, return_train_score=True) search.fit(X, y) assert_equal(iid, search.iid) cv_results = search.cv_results_ @@ -873,7 +907,8 @@ def test_random_search_cv_results(): for iid in (False, True): search = RandomizedSearchCV(SVC(gamma='scale'), n_iter=n_search_iter, cv=n_splits, iid=iid, - param_distributions=params) + param_distributions=params, + return_train_score=True) search.fit(X, y) assert_equal(iid, search.iid) cv_results = search.cv_results_ @@ -901,11 +936,12 @@ def test_search_iid_param(): # create "cv" for splits cv = [[mask, ~mask], [~mask, mask]] # once with iid=True (default) - grid_search = GridSearchCV(SVC(), param_grid={'C': [1, 10]}, - cv=cv) - random_search = RandomizedSearchCV(SVC(), n_iter=2, + grid_search = GridSearchCV(SVC(gamma='auto'), param_grid={'C': [1, 10]}, + cv=cv, return_train_score=True) + random_search = RandomizedSearchCV(SVC(gamma='auto'), n_iter=2, param_distributions={'C': [1, 10]}, - cv=cv) + cv=cv, iid=True, + return_train_score=True) for search in (grid_search, random_search): search.fit(X, y) assert_true(search.iid or search.iid is None) @@ -936,7 +972,7 @@ def test_search_iid_param(): assert_almost_equal(test_mean, expected_test_mean) assert_almost_equal(test_std, expected_test_std) assert_array_almost_equal(test_cv_scores, - cross_val_score(SVC(C=1), X, + cross_val_score(SVC(C=1, gamma='auto'), X, y, cv=cv)) # For the train scores, we do not take a weighted mean irrespective of @@ -945,12 +981,13 @@ def test_search_iid_param(): assert_almost_equal(train_std, 0) # once with iid=False - grid_search = GridSearchCV(SVC(), + grid_search = GridSearchCV(SVC(gamma='auto'), param_grid={'C': [1, 10]}, - cv=cv, iid=False) - random_search = RandomizedSearchCV(SVC(), n_iter=2, + cv=cv, iid=False, return_train_score=True) + random_search = RandomizedSearchCV(SVC(gamma='auto'), n_iter=2, param_distributions={'C': [1, 10]}, - cv=cv, iid=False) + cv=cv, iid=False, + return_train_score=True) for search in (grid_search, random_search): search.fit(X, y) @@ -1083,6 +1120,7 @@ def compare_refit_methods_when_refit_with_acc(search_multi, search_acc, refit): assert_equal(getattr(search_multi, key), getattr(search_acc, key)) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_search_cv_results_rank_tie_breaking(): X, y = make_blobs(n_samples=50, random_state=42) @@ -1090,9 +1128,11 @@ def test_search_cv_results_rank_tie_breaking(): # which would result in a tie of their mean cv-scores param_grid = {'C': [1, 1.001, 0.001]} - grid_search = GridSearchCV(SVC(gamma="scale"), param_grid=param_grid) + grid_search = GridSearchCV(SVC(gamma="scale"), param_grid=param_grid, + return_train_score=True) random_search = RandomizedSearchCV(SVC(gamma="scale"), n_iter=3, - param_distributions=param_grid) + param_distributions=param_grid, + return_train_score=True) for search in (grid_search, random_search): search.fit(X, y) @@ -1112,6 +1152,7 @@ def test_search_cv_results_rank_tie_breaking(): assert_almost_equal(search.cv_results_['rank_test_score'], [1, 1, 3]) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_search_cv_results_none_param(): X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1] estimators = (DecisionTreeRegressor(), DecisionTreeClassifier()) @@ -1119,7 +1160,8 @@ def test_search_cv_results_none_param(): 
cv = KFold(random_state=0) for est in estimators: - grid_search = GridSearchCV(est, est_parameters, cv=cv).fit(X, y) + grid_search = GridSearchCV(est, est_parameters, cv=cv, + ).fit(X, y) assert_array_equal(grid_search.cv_results_['param_random_state'], [0, None]) @@ -1152,6 +1194,7 @@ def test_search_cv_timing(): assert_greater_equal(search.refit_time_, 0) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_correct_score_results(): # test that correct scores are used n_splits = 3 @@ -1215,6 +1258,7 @@ def test_fit_grid_point(): {'score': scorer}, verbose=True) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_pickle(): # Test that a fit search can be pickled clf = MockClassifier() @@ -1232,6 +1276,7 @@ def test_pickle(): random_search_pickled.predict(X)) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_with_multioutput_data(): # Test search with multi-output estimator @@ -1277,6 +1322,7 @@ def test_grid_search_with_multioutput_data(): % i][cand_i]) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_predict_proba_disabled(): # Test predict_proba when disabled on estimator. X = np.arange(20).reshape(5, -1) @@ -1286,6 +1332,7 @@ def test_predict_proba_disabled(): assert_false(hasattr(gs, "predict_proba")) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_allows_nans(): # Test GridSearchCV with SimpleImputer X = np.arange(20, dtype=np.float64).reshape(5, -1) @@ -1314,6 +1361,7 @@ def predict(self, X): return np.zeros(X.shape[0]) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_failing_classifier(): # GridSearchCV with on_error != 'raise' # Ensures that a warning is raised and score reset where appropriate. @@ -1361,6 +1409,7 @@ def get_cand_scores(i): assert gs.best_index_ != clf.FAILING_PARAMETER +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_failing_classifier_raise(): # GridSearchCV with on_error == 'raise' raises the error @@ -1412,6 +1461,7 @@ def test_parameters_sampler_replacement(): assert_equal(len(samples), 7) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_stochastic_gradient_loss_param(): # Make sure the predict_proba works when loss is specified # as one of the parameters in the param_grid. @@ -1442,6 +1492,7 @@ def test_stochastic_gradient_loss_param(): assert_false(hasattr(clf, "predict_proba")) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_search_train_scores_set_to_false(): X = np.arange(6).reshape(6, -1) y = [0, 0, 0, 1, 1, 1] @@ -1452,6 +1503,7 @@ def test_search_train_scores_set_to_false(): gs.fit(X, y) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_grid_search_cv_splits_consistency(): # Check if a one time iterable is accepted as a cv parameter. 
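Note: where the deprecated behaviour is itself the thing under test (the two "JvR" blocks in this patch), suppression is scoped with warnings.catch_warnings instead of a module-wide mark, so the session-wide error filters stay in force everywhere else. The idiom as a self-contained sketch (legacy_api is hypothetical):

    import warnings

    def legacy_api():
        # Stand-in for a call whose deprecation is the point of the test.
        warnings.warn("old behaviour", DeprecationWarning)

    with warnings.catch_warnings():
        # Only this block ignores the warning; outside it, the
        # error::DeprecationWarning filter still applies.
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        legacy_api()
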
n_samples = 100 @@ -1461,12 +1513,13 @@ gs = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [0.1, 0.2, 0.3]}, cv=OneTimeSplitter(n_splits=n_splits, - n_samples=n_samples)) + n_samples=n_samples), + return_train_score=True) gs.fit(X, y) gs2 = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [0.1, 0.2, 0.3]}, - cv=KFold(n_splits=n_splits)) + cv=KFold(n_splits=n_splits), return_train_score=True) gs2.fit(X, y) # Give generator as a cv parameter @@ -1476,13 +1529,14 @@ gs3 = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [0.1, 0.2, 0.3]}, cv=KFold(n_splits=n_splits, shuffle=True, - random_state=0).split(X, y)) + random_state=0).split(X, y), + return_train_score=True) gs3.fit(X, y) gs4 = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [0.1, 0.2, 0.3]}, cv=KFold(n_splits=n_splits, shuffle=True, - random_state=0)) + random_state=0), return_train_score=True) gs4.fit(X, y) def _pop_time_keys(cv_results): @@ -1510,7 +1564,8 @@ def _pop_time_keys(cv_results): # Check consistency of folds across the parameters gs = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [0.1, 0.1, 0.2, 0.2]}, - cv=KFold(n_splits=n_splits, shuffle=True)) + cv=KFold(n_splits=n_splits, shuffle=True), + return_train_score=True) gs.fit(X, y) # As the first two param settings (C=0.1) and the next two param @@ -1530,6 +1585,7 @@ def _pop_time_keys(cv_results): per_param_scores[3]) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_transform_inverse_transform_round_trip(): clf = MockClassifier() grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 0071129d8ce73..98a1f808b4d74 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -473,16 +473,18 @@ def test_shuffle_kfold_stratifiedkfold_reproducibility(): for cv in (kf, skf): for data in zip((X, X2), (y, y2)): - # Test if the two splits are different - # numpy's assert_equal properly compares nested lists - try: - np.testing.assert_array_equal(list(cv.split(*data)), - list(cv.split(*data))) - except AssertionError: - pass - else: - raise AssertionError("The splits for data, %s, are same even " - "when random state is not set" % data) + # Test if the two splits are different + for (_, test_a), (_, test_b) in zip(cv.split(*data), + cv.split(*data)): + # cv.split(...) yields (train, test) tuples, each consisting + # of an array of train indices and an array of test indices + try: + np.testing.assert_array_equal(test_a, test_b) + except AssertionError: + pass + else: + raise AssertionError("The splits for the data are the same " + "even when random state is not set") def test_shuffle_stratifiedkfold(): @@ -1004,7 +1006,12 @@ def test_repeated_stratified_kfold_determinstic_split(): def test_train_test_split_errors(): assert_raises(ValueError, train_test_split) - assert_raises(ValueError, train_test_split, range(3), train_size=1.1) + with warnings.catch_warnings(): + # JvR: Currently, a future warning is raised if test_size is not + # given. As that is the point of this test, ignore the future warning + warnings.filterwarnings("ignore", category=FutureWarning) + assert_raises(ValueError, train_test_split, range(3), train_size=1.1) + assert_raises(ValueError, train_test_split, range(3), test_size=0.6, train_size=0.6) assert_raises(ValueError, train_test_split, range(3), @@ -1403,7 +1410,7 @@ def test_nested_cv(): for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]}, - cv=inner_cv) + cv=inner_cv, error_score='raise', iid=False) cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv, fit_params={'groups': groups}) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index f2aff6b82beaa..f0a6564270b7b 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -239,6 +239,8 @@ def get_params(self, deep=False): P_sparse = coo_matrix(np.eye(5)) +@pytest.mark.filterwarnings('ignore: From version 0.22, errors during fit') +# FIXME issue in error_score parameter def test_cross_val_score(): clf = MockClassifier() @@ -420,9 +422,10 @@ def check_cross_validate_single_metric(clf, X, y, scores): for (return_train_score, dict_len) in ((True, 4), (False, 3)): # Single metric passed as a string if return_train_score: - # It must be True by default + # It must be True by default - deprecated mse_scores_dict = cross_validate(clf, X, y, cv=5, - scoring='neg_mean_squared_error') + scoring='neg_mean_squared_error', + return_train_score=True) assert_array_almost_equal(mse_scores_dict['train_score'], train_mse_scores) else: @@ -436,10 +439,11 @@ # Single metric passed as a list if return_train_score: - # It must be True by default - r2_scores_dict = cross_validate(clf, X, y, cv=5, scoring=['r2']) + # It must be True by default - deprecated + r2_scores_dict = cross_validate(clf, X, y, cv=5, scoring=['r2'], + return_train_score=True) assert_array_almost_equal(r2_scores_dict['train_r2'], train_r2_scores) else: r2_scores_dict = cross_validate(clf, X, y, cv=5, scoring=['r2'], return_train_score=False) @@ -472,8 +476,9 @@ def check_cross_validate_multi_metric(clf, X, y, scores): for return_train_score in (True, False): for scoring in all_scoring: if return_train_score: - # return_train_score must be True by default - cv_results = cross_validate(clf, X, y, cv=5, scoring=scoring) + # return_train_score must be True by default - deprecated + cv_results = cross_validate(clf, X, y, cv=5, scoring=scoring, + return_train_score=True) assert_array_almost_equal(cv_results['train_r2'], train_r2_scores) assert_array_almost_equal( @@ -523,6 +528,7 @@ def test_cross_val_score_predict_groups(): cross_val_predict, estimator=clf, X=X, y=y, cv=cv) +@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') def test_cross_val_score_pandas(): # check cross_val_score doesn't destroy pandas dataframe types = [(MockDataFrame, MockDataFrame)] @@ -947,6 +953,8 @@ def test_cross_val_predict_input_types(): assert_array_equal(predictions.shape, (150,)) +@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') +# python3.7 deprecation warnings in pandas via matplotlib :-/ def test_cross_val_predict_pandas(): # check cross_val_score doesn't destroy pandas dataframe types = [(MockDataFrame, MockDataFrame)] @@ -1150,6 +1158,8 @@ def test_learning_curve_with_boolean_indices():
np.linspace(0.1, 1.0, 10)) +@pytest.mark.filterwarnings('ignore: From version 0.22, errors during fit') +# FIXME this is an error in the error_score change! def test_learning_curve_with_shuffle(): # Following test case was designed this way to verify the code # changes made in pull request: #7506. @@ -1319,6 +1329,7 @@ def test_cross_val_predict_with_method(): check_cross_val_predict_with_method(LogisticRegression()) +@pytest.mark.filterwarnings('ignore: max_iter and tol parameters') def test_cross_val_predict_method_checking(): # Regression test for issue #9639. Tests that cross_val_predict does not # check estimator methods (e.g. predict_proba) before fitting @@ -1326,6 +1337,7 @@ def test_cross_val_predict_method_checking(): check_cross_val_predict_with_method(est) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') def test_gridsearchcv_cross_val_predict_with_method(): est = GridSearchCV(LogisticRegression(random_state=42), {'C': [0.1, 1]}, @@ -1421,6 +1433,7 @@ def test_score_memmap(): sleep(1.) +@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') def test_permutation_test_score_pandas(): # check permutation_test_score doesn't destroy pandas dataframe types = [(MockDataFrame, MockDataFrame)] @@ -1463,10 +1476,6 @@ def test_fit_and_score(): assert_warns_message(FitFailedWarning, warning_message, _fit_and_score, *fit_and_score_args, **fit_and_score_kwargs) - # check if exception is raised, with default error_score argument - assert_raise_message(ValueError, "Failing classifier failed as required", - _fit_and_score, *fit_and_score_args) - # check if warning was raised, with default error_score argument warning_message = ("From version 0.22, errors during fit will result " "in a cross validation score of NaN by default. Use " diff --git a/sklearn/neighbors/approximate.py b/sklearn/neighbors/approximate.py index 6a3fd571b3217..4a30bd8159988 100644 --- a/sklearn/neighbors/approximate.py +++ b/sklearn/neighbors/approximate.py @@ -187,17 +187,18 @@ class LSHForest(BaseEstimator, KNeighborsMixin, RadiusNeighborsMixin): >>> X_train = [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]] >>> X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3]] - >>> lshf = LSHForest(random_state=42) - >>> lshf.fit(X_train) # doctest: +NORMALIZE_WHITESPACE + >>> lshf = LSHForest(random_state=42) # doctest: +SKIP + >>> lshf.fit(X_train) # doctest: +SKIP LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, n_neighbors=5, radius=1.0, radius_cutoff_ratio=0.9, random_state=42) >>> distances, indices = lshf.kneighbors(X_test, n_neighbors=2) - >>> distances # doctest: +ELLIPSIS + ... 
diff --git a/sklearn/neighbors/tests/test_lof.py b/sklearn/neighbors/tests/test_lof.py
index b472ab3d833bb..8c54743ced518 100644
--- a/sklearn/neighbors/tests/test_lof.py
+++ b/sklearn/neighbors/tests/test_lof.py
@@ -12,7 +12,7 @@
 from sklearn.metrics import roc_auc_score
 from sklearn.utils import check_random_state
 
-from sklearn.utils.testing import assert_greater
+from sklearn.utils.testing import assert_greater, ignore_warnings
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_warns_message, assert_raises
@@ -29,6 +29,7 @@
 iris.target = iris.target[perm]
 
 
+@ignore_warnings(category=DeprecationWarning)  # contamination changed to 'auto' 0.22
 def test_lof():
     # Toy sample (the last two samples are outliers):
     X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]]
@@ -47,6 +48,7 @@ def test_lof():
     assert_array_equal(clf._predict(), 6 * [1] + 2 * [-1])
 
 
+@ignore_warnings(category=DeprecationWarning)  # contamination changed to 'auto' 0.22
 def test_lof_performance():
     # Generate train/test data
     rng = check_random_state(2)
@@ -68,6 +70,7 @@ def test_lof_performance():
     assert_greater(roc_auc_score(y_test, y_pred), .99)
 
 
+@ignore_warnings(category=DeprecationWarning)  # contamination changed to 'auto' 0.22
 def test_lof_values():
     # toy samples:
     X_train = [[1, 1], [1, 2], [2, 1]]
@@ -87,6 +90,7 @@ def test_lof_values():
     assert_array_almost_equal(-clf2._score_samples([[1., 1.]]), [s_1])
 
 
+@ignore_warnings(category=DeprecationWarning)  # contamination changed to 'auto' 0.22
 def test_lof_precomputed(random_state=42):
     """Tests LOF with a distance matrix."""
     # Note: smaller samples may result in spurious test success
@@ -112,6 +116,7 @@ def test_lof_precomputed(random_state=42):
     assert_array_almost_equal(pred_X_Y, pred_D_Y)
 
 
+@ignore_warnings(category=DeprecationWarning)  # contamination changed to 'auto' 0.22
 def test_n_neighbors_attribute():
     X = iris.data
     clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X)
@@ -124,6 +129,7 @@ def test_n_neighbors_attribute():
     assert_equal(clf.n_neighbors_, X.shape[0] - 1)
 
 
+@ignore_warnings(category=DeprecationWarning)  # contamination changed to 'auto' 0.22
 def test_score_samples():
     X_train = [[1, 1], [1, 2], [2, 1]]
     clf1 = neighbors.LocalOutlierFactor(n_neighbors=2,
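`ignore_warnings` from `sklearn.utils.testing` works both as a decorator, as above, and as a context manager, as in the `test_estimator_checks.py` hunk further down; in either form the `category` argument narrows the filter so that unrelated warnings still surface. A minimal sketch (`fit_deprecated` is invented for illustration):

    import warnings

    from sklearn.utils.testing import ignore_warnings


    @ignore_warnings(category=DeprecationWarning)
    def fit_deprecated():
        # Only DeprecationWarning is filtered; a FutureWarning raised
        # here would still propagate to the caller.
        warnings.warn("contamination will default to 'auto'",
                      DeprecationWarning)


    # The context-manager form applies the same filter around a block:
    with ignore_warnings(category=DeprecationWarning):
        fit_deprecated()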
"pass_y is deprecated", FunctionTransformer( _make_func(args_store, kwargs_store), - pass_y=True).transform, X, y) + pass_y=True, validate=False).transform, X, y) assert_array_equal(transformed, X, err_msg='transform should have returned X unchanged') @@ -75,6 +76,8 @@ def test_delegate_to_func(): ) +@ignore_warnings(category=FutureWarning) +# ignore warning for validate=False 0.22 def test_np_log(): X = np.arange(10).reshape((5, 2)) @@ -85,6 +88,8 @@ def test_np_log(): ) +@ignore_warnings(category=FutureWarning) +# ignore warning for validate=False 0.22 def test_kw_arg(): X = np.linspace(0, 1, num=10).reshape((5, 2)) @@ -95,6 +100,8 @@ def test_kw_arg(): np.around(X, decimals=3)) +@ignore_warnings(category=FutureWarning) +# ignore warning for validate=False 0.22 def test_kw_arg_update(): X = np.linspace(0, 1, num=10).reshape((5, 2)) @@ -106,6 +113,8 @@ def test_kw_arg_update(): assert_array_equal(F.transform(X), np.around(X, decimals=1)) +@ignore_warnings(category=FutureWarning) +# ignore warning for validate=False 0.22 def test_kw_arg_reset(): X = np.linspace(0, 1, num=10).reshape((5, 2)) @@ -117,6 +126,8 @@ def test_kw_arg_reset(): assert_array_equal(F.transform(X), np.around(X, decimals=1)) +@ignore_warnings(category=FutureWarning) +# ignore warning for validate=False 0.22 def test_inverse_transform(): X = np.array([1, 4, 9, 16]).reshape((2, 2)) @@ -131,6 +142,8 @@ def test_inverse_transform(): ) +@ignore_warnings(category=FutureWarning) +# ignore warning for validate=False 0.22 def test_check_inverse(): X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2)) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index ead2d1cd27fd6..eeb92e4824b94 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -22,7 +22,7 @@ from sklearn.utils.testing import ignore_warnings, assert_raises from sklearn.utils.testing import assert_no_warnings from sklearn.exceptions import ConvergenceWarning -from sklearn.exceptions import NotFittedError +from sklearn.exceptions import NotFittedError, UndefinedMetricWarning from sklearn.multiclass import OneVsRestClassifier from sklearn.externals import six @@ -283,7 +283,7 @@ def test_oneclass_decision_function(): def test_oneclass_score_samples(): X_train = [[1, 1], [1, 2], [2, 1]] - clf = svm.OneClassSVM().fit(X_train) + clf = svm.OneClassSVM(gamma=1).fit(X_train) assert_array_equal(clf.score_samples([[2., 2.]]), clf.decision_function([[2., 2.]]) + clf.offset_) @@ -441,7 +441,7 @@ def test_sample_weights(): clf.fit(X, Y, sample_weight=np.repeat(0.01, len(X))) assert_array_almost_equal(dual_coef_no_weight, clf.dual_coef_) - +@ignore_warnings(UndefinedMetricWarning) def test_auto_weight(): # Test class weights for imbalanced data from sklearn.linear_model import LogisticRegression diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 427c5b39eb472..978a948c824bb 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -433,6 +433,7 @@ def test_imputation_constant_pandas(dtype): assert_array_equal(X_trans, X_true) +@pytest.mark.filterwarnings('ignore: The default of the `iid`') # 0.22 def test_imputation_pipeline_grid_search(): # Test imputation within a pipeline + gridsearch. 
diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index 427c5b39eb472..978a948c824bb 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -433,6 +433,7 @@ def test_imputation_constant_pandas(dtype):
     assert_array_equal(X_trans, X_true)
 
 
+@pytest.mark.filterwarnings('ignore: The default of the `iid`')  # 0.22
 def test_imputation_pipeline_grid_search():
     # Test imputation within a pipeline + gridsearch.
     X = sparse_random_matrix(100, 100, density=0.10)
diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py
index 78a1fd617ccff..010d1fcd92c81 100644
--- a/sklearn/tests/test_multiclass.py
+++ b/sklearn/tests/test_multiclass.py
@@ -1,3 +1,5 @@
+import pytest
+
 import numpy as np
 import scipy.sparse as sp
 
@@ -329,6 +331,7 @@ def test_ovr_multilabel_dataset():
                               decimal=2)
 
 
+@pytest.mark.filterwarnings('ignore: The default of the `iid`')  # 0.22
 def test_ovr_multilabel_predict_proba():
     base_clf = MultinomialNB(alpha=1)
     for au in (False, True):
@@ -421,6 +424,7 @@ def test_ovr_single_label_decision_function():
                        clf.predict(X_test))
 
 
+@pytest.mark.filterwarnings('ignore: The default of the `iid`')  # 0.22
 def test_ovr_gridsearch():
     ovr = OneVsRestClassifier(LinearSVC(random_state=0))
     Cs = [0.1, 0.5, 0.8]
@@ -597,6 +601,7 @@ def test_ovo_decision_function():
         assert_greater(len(np.unique(decisions[:, class_idx])), 146)
 
 
+@pytest.mark.filterwarnings('ignore: The default of the `iid`')  # 0.22
 def test_ovo_gridsearch():
     ovo = OneVsOneClassifier(LinearSVC(random_state=0))
     Cs = [0.1, 0.5, 0.8]
@@ -691,6 +696,7 @@ def test_ecoc_fit_predict():
     assert_equal(len(ecoc.estimators_), n_classes * 2)
 
 
+@pytest.mark.filterwarnings('ignore: The default of the `iid`')  # 0.22
 def test_ecoc_gridsearch():
     ecoc = OutputCodeClassifier(LinearSVC(random_state=0),
                                 random_state=0)
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index 218733145a0de..68a93a2a9120a 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -367,7 +367,7 @@ def logsumexp(arr, axis=0):
     >>> a = np.arange(10)
     >>> np.log(np.sum(np.exp(a)))
     9.458...
-    >>> logsumexp(a)
+    >>> logsumexp(a)  # doctest: +SKIP
    9.458...
     """
     return scipy_logsumexp(arr, axis)
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index 1751fc8284c4b..bf8412b3e527d 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -380,25 +380,25 @@ def test_check_estimator_clones():
 
     for Estimator in [GaussianMixture, LinearRegression,
                       RandomForestClassifier, NMF, SGDClassifier,
                       MiniBatchKMeans]:
-        with ignore_warnings(category=FutureWarning):
+        with ignore_warnings(category=(FutureWarning, DeprecationWarning)):
             # when 'est = SGDClassifier()'
             est = Estimator()
-        set_checking_parameters(est)
-        set_random_state(est)
-        # without fitting
-        old_hash = joblib.hash(est)
-        check_estimator(est)
+            set_checking_parameters(est)
+            set_random_state(est)
+            # without fitting
+            old_hash = joblib.hash(est)
+            check_estimator(est)
         assert_equal(old_hash, joblib.hash(est))
 
-        with ignore_warnings(category=FutureWarning):
+        with ignore_warnings(category=(FutureWarning, DeprecationWarning)):
             # when 'est = SGDClassifier()'
             est = Estimator()
-        set_checking_parameters(est)
-        set_random_state(est)
-        # with fitting
-        est.fit(iris.data + 10, iris.target)
-        old_hash = joblib.hash(est)
-        check_estimator(est)
+            set_checking_parameters(est)
+            set_random_state(est)
+            # with fitting
+            est.fit(iris.data + 10, iris.target)
+            old_hash = joblib.hash(est)
+            check_estimator(est)
         assert_equal(old_hash, joblib.hash(est))
 
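The final hunk below replaces `assert_raise_message` around the deprecated call with `pytest.warns`, which both asserts that the enclosed block emits a matching warning and swallows it, so the assertion keeps working once deprecation warnings are turned into errors; the optional `match` argument is a regular expression searched for in the warning message. A standalone sketch (`legacy_accept_sparse` is a hypothetical helper):

    import warnings

    import pytest


    def legacy_accept_sparse(accept_sparse=None):
        if accept_sparse is None:
            warnings.warn("Passing 'None' to parameter 'accept_sparse' is "
                          "deprecated", DeprecationWarning)


    def test_legacy_accept_sparse():
        with pytest.warns(DeprecationWarning, match="accept_sparse"):
            legacy_accept_sparse()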
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 4aab971fe2dc3..b0401142642aa 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -413,8 +413,9 @@ def test_check_array_accept_sparse_type_exception():
            "Use X.toarray() to convert to a dense numpy array.")
     assert_raise_message(TypeError, msg,
                          check_array, X_csr, accept_sparse=False)
-    assert_raise_message(TypeError, msg,
-                         check_array, X_csr, accept_sparse=None)
+    with pytest.warns(DeprecationWarning):
+        assert_raise_message(TypeError, msg,
+                             check_array, X_csr, accept_sparse=None)
 
     msg = ("Parameter 'accept_sparse' should be a string, "
            "boolean or list of strings. You provided 'accept_sparse={}'.")