diff --git a/.travis.yml b/.travis.yml
index 1f3ca7371ec6c..d2612e8d67ad4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,7 +41,7 @@ matrix:
     # This environment tests the newest supported Anaconda release (5.0.0)
     # It also runs tests requiring Pandas and PyAMG
     - env: DISTRIB="conda" PYTHON_VERSION="3.6.2" INSTALL_MKL="true"
-           NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" PANDAS_VERSION="0.20.3"
+           NUMPY_VERSION="1.14.2" SCIPY_VERSION="1.0.0" PANDAS_VERSION="0.20.3"
            CYTHON_VERSION="0.26.1" PYAMG_VERSION="3.3.2" PILLOW_VERSION="4.3.0"
            COVERAGE=true
            CHECK_PYTEST_SOFT_DEPENDENCY="true" TEST_DOCSTRINGS="true"
diff --git a/conftest.py b/conftest.py
index 110fdd479483c..c2b9ae2038875 100644
--- a/conftest.py
+++ b/conftest.py
@@ -1,14 +1,31 @@
-# This file is here so that when running from the root folder
-# ./sklearn is added to sys.path by pytest.
-# See https://docs.pytest.org/en/latest/pythonpath.html for more details.
-# For example, this allows to build extensions in place and run pytest
-# doc/modules/clustering.rst and use sklearn from the local folder
-# rather than the one from site-packages.
-
-# Set numpy array str/repr to legacy behaviour on numpy > 1.13 to make
-# the doctests pass
-import numpy as np
-try:
-    np.set_printoptions(legacy='1.13')
-except TypeError:
-    pass
+# Even if empty, this file is useful so that when running from the root folder
+# ./sklearn is added to sys.path by pytest. See
+# https://docs.pytest.org/en/latest/pythonpath.html for more details. For
+# example, this makes it possible to build extensions in place, run `pytest
+# doc/modules/clustering.rst`, and use sklearn from the local folder rather
+# than the one from site-packages.
+
+from distutils.version import LooseVersion
+
+import pytest
+from _pytest.doctest import DoctestItem
+
+
+def pytest_collection_modifyitems(config, items):  # pytest collection hook
+    # numpy 1.14 changed the str/repr of arrays and the doctests use the new
+    # format, so doctests are skipped for older numpy (or if numpy is missing).
+    skip_doctests = True
+    try:
+        import numpy as np
+        if LooseVersion(np.__version__) >= LooseVersion('1.14'):
+            skip_doctests = False
+    except ImportError:
+        pass  # numpy not installed: keep the default and skip the doctests
+
+    if skip_doctests:
+        skip_marker = pytest.mark.skip(
+            reason='doctests are only run for numpy >= 1.14')
+
+        for item in items:
+            if isinstance(item, DoctestItem):  # leave regular tests untouched
+                item.add_marker(skip_marker)
diff --git a/doc/datasets/mldata.rst b/doc/datasets/mldata.rst
index b94dfd7620a24..b098abbdcef92 100644
--- a/doc/datasets/mldata.rst
+++ b/doc/datasets/mldata.rst
@@ -34,7 +34,7 @@ of size 28x28 pixels, labeled from 0 to 9::
   >>> mnist.target.shape
   (70000,)
   >>> np.unique(mnist.target)
-  array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.])
+  array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
 
 After the first download, the dataset is cached locally in the path
 specified by the ``data_home`` keyword argument, which defaults to
diff --git a/doc/developers/utilities.rst b/doc/developers/utilities.rst
index 6f0d02f2aed42..b72b7c8e5c5d0 100644
--- a/doc/developers/utilities.rst
+++ b/doc/developers/utilities.rst
@@ -71,7 +71,7 @@ For example::
     >>> random_state = 0
     >>> random_state = check_random_state(random_state)
     >>> random_state.rand(4)
-    array([ 0.5488135 ,  0.71518937,  0.60276338,  0.54488318])
+    array([0.5488135 , 0.71518937, 0.60276338, 0.54488318])
 
 When developing your own scikit-learn compatible estimator, the following
 helpers are available.
diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst
index ec7dfc4784ac8..e51fc074345c0 100644
--- a/doc/modules/compose.rst
+++ b/doc/modules/compose.rst
@@ -190,7 +190,7 @@ object::
               steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))])
      >>> # The pca instance can be inspected directly
      >>> print(pca1.components_) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
-         [[ -1.77484909e-19  ... 4.07058917e-18]]
+         [[-1.77484909e-19  ... 4.07058917e-18]]
 
    Enabling caching triggers a clone of the transformers before fitting.
    Therefore, the transformer instance given to the pipeline cannot be
@@ -212,7 +212,7 @@ object::
                steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))])
      >>> print(cached_pipe.named_steps['reduce_dim'].components_)
      ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
-         [[ -1.77484909e-19  ... 4.07058917e-18]]
+         [[-1.77484909e-19  ... 4.07058917e-18]]
      >>> # Remove the cache directory
      >>> rmtree(cachedir)
 
diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index a39c49faf63b1..e889515da0923 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -106,7 +106,7 @@ time)::
   >>> clf = svm.SVC(kernel='linear', C=1)
   >>> scores = cross_val_score(clf, iris.data, iris.target, cv=5)
   >>> scores                                              # doctest: +ELLIPSIS
-  array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])
+  array([0.96..., 1.  ..., 0.96..., 0.96..., 1.        ])
 
 The mean score and the 95\% confidence interval of the score estimate are hence
 given by::
@@ -122,7 +122,7 @@ scoring parameter::
   >>> scores = cross_val_score(
   ...     clf, iris.data, iris.target, cv=5, scoring='f1_macro')
   >>> scores                                              # doctest: +ELLIPSIS
-  array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])
+  array([0.96..., 1.  ..., 0.96..., 0.96..., 1.        ])
 
 See :ref:`scoring_parameter` for details.
 In the case of the Iris dataset, the samples are balanced across target
@@ -141,7 +141,7 @@ validation iterator instead, for instance::
   >>> cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
   >>> cross_val_score(clf, iris.data, iris.target, cv=cv)
   ...                                                     # doctest: +ELLIPSIS
-  array([ 0.97...,  0.97...,  1.        ])
+  array([0.97..., 0.97..., 1.        ])
 
 
 .. topic:: Data transformation with held out data
@@ -168,7 +168,7 @@ validation iterator instead, for instance::
       >>> clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
       >>> cross_val_score(clf, iris.data, iris.target, cv=cv)
       ...                                                 # doctest: +ELLIPSIS
-      array([ 0.97...,  0.93...,  0.95...])
+      array([0.97..., 0.93..., 0.95...])
 
     See :ref:`combining_estimators`.
 
@@ -212,7 +212,7 @@ predefined scorer names::
     >>> sorted(scores.keys())
     ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']
     >>> scores['test_recall_macro']                       # doctest: +ELLIPSIS
-    array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])
+    array([0.96..., 1.  ..., 0.96..., 0.96..., 1.        ])
 
 Or as a dict mapping scorer name to a predefined or custom scoring function::
 
@@ -225,7 +225,7 @@ Or as a dict mapping scorer name to a predefined or custom scoring function::
     ['fit_time', 'score_time', 'test_prec_macro', 'test_rec_micro',
      'train_prec_macro', 'train_rec_micro']
     >>> scores['train_rec_micro']                         # doctest: +ELLIPSIS
-    array([ 0.97...,  0.97...,  0.99...,  0.98...,  0.98...])
+    array([0.97..., 0.97..., 0.99..., 0.98..., 0.98...])
 
 Here is an example of ``cross_validate`` using a single metric::
 
diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 32f49dd77d3bf..655d5638472aa 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -782,7 +782,7 @@ accessed via the ``feature_importances_`` property::
     >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
     ...     max_depth=1, random_state=0).fit(X, y)
     >>> clf.feature_importances_  # doctest: +ELLIPSIS
-    array([ 0.11,  0.1 ,  0.11,  ...
+    array([0.11, 0.1 , 0.11, ...
 
 .. topic:: Examples:
 
diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst
index eb8b65fbaa844..3718819f852c0 100644
--- a/doc/modules/feature_extraction.rst
+++ b/doc/modules/feature_extraction.rst
@@ -49,9 +49,9 @@ is a traditional numerical feature::
   >>> vec = DictVectorizer()
 
   >>> vec.fit_transform(measurements).toarray()
-  array([[  1.,   0.,   0.,  33.],
-         [  0.,   1.,   0.,  12.],
-         [  0.,   0.,   1.,  18.]])
+  array([[ 1.,  0.,  0., 33.],
+         [ 0.,  1.,  0., 12.],
+         [ 0.,  0.,  1., 18.]])
 
   >>> vec.get_feature_names()
   ['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
@@ -89,7 +89,7 @@ suitable for feeding into a classifier (maybe after being piped into a
   <1x6 sparse matrix of type '<... 'numpy.float64'>'
       with 6 stored elements in Compressed Sparse ... format>
   >>> pos_vectorized.toarray()
-  array([[ 1.,  1.,  1.,  1.,  1.,  1.]])
+  array([[1., 1., 1., 1., 1., 1.]])
   >>> vec.get_feature_names()
   ['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the']
 
@@ -463,12 +463,12 @@ content of the documents::
       with 9 stored elements in Compressed Sparse ... format>
 
   >>> tfidf.toarray()                        # doctest: +ELLIPSIS
-  array([[ 0.81940995,  0.        ,  0.57320793],
-         [ 1.        ,  0.        ,  0.        ],
-         [ 1.        ,  0.        ,  0.        ],
-         [ 1.        ,  0.        ,  0.        ],
-         [ 0.47330339,  0.88089948,  0.        ],
-         [ 0.58149261,  0.        ,  0.81355169]])
+  array([[0.81940995, 0.        , 0.57320793],
+         [1.        , 0.        , 0.        ],
+         [1.        , 0.        , 0.        ],
+         [1.        , 0.        , 0.        ],
+         [0.47330339, 0.88089948, 0.        ],
+         [0.58149261, 0.        , 0.81355169]])
 
 Each row is normalized to have unit Euclidean norm:
 
@@ -523,19 +523,19 @@ And the L2-normalized tf-idf changes to
 
   >>> transformer = TfidfTransformer()
   >>> transformer.fit_transform(counts).toarray()
-  array([[ 0.85151335,  0.        ,  0.52433293],
-         [ 1.        ,  0.        ,  0.        ],
-         [ 1.        ,  0.        ,  0.        ],
-         [ 1.        ,  0.        ,  0.        ],
-         [ 0.55422893,  0.83236428,  0.        ],
-         [ 0.63035731,  0.        ,  0.77630514]])
+  array([[0.85151335, 0.        , 0.52433293],
+         [1.        , 0.        , 0.        ],
+         [1.        , 0.        , 0.        ],
+         [1.        , 0.        , 0.        ],
+         [0.55422893, 0.83236428, 0.        ],
+         [0.63035731, 0.        , 0.77630514]])
 
 The weights of each
 feature computed by the ``fit`` method call are stored in a model
 attribute::
 
   >>> transformer.idf_                       # doctest: +ELLIPSIS
-  array([ 1. ...,  2.25...,  1.84...])
+  array([1. ..., 2.25..., 1.84...])
 
 
 
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index aabd61ce3a9a2..7b1bc602fe7ce 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -413,9 +413,9 @@ kernel but with the hyperparameters set to ``theta``. An illustrative example:
     >>> from sklearn.gaussian_process.kernels import ConstantKernel, RBF
     >>> kernel = ConstantKernel(constant_value=1.0, constant_value_bounds=(0.0, 10.0)) * RBF(length_scale=0.5, length_scale_bounds=(0.0, 10.0)) + RBF(length_scale=2.0, length_scale_bounds=(0.0, 10.0))
     >>> for hyperparameter in kernel.hyperparameters: print(hyperparameter)
-    Hyperparameter(name='k1__k1__constant_value', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
-    Hyperparameter(name='k1__k2__length_scale', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
-    Hyperparameter(name='k2__length_scale', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k1__k1__constant_value', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k1__k2__length_scale', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k2__length_scale', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False)
     >>> params = kernel.get_params()
     >>> for key in sorted(params): print("%s : %s" % (key, params[key]))
     k1 : 1**2 * RBF(length_scale=0.5)
@@ -431,9 +431,9 @@ kernel but with the hyperparameters set to ``theta``. An illustrative example:
     >>> print(kernel.theta)  # Note: log-transformed
     [ 0.         -0.69314718  0.69314718]
     >>> print(kernel.bounds)  # Note: log-transformed
-    [[       -inf  2.30258509]
-     [       -inf  2.30258509]
-     [       -inf  2.30258509]]
+    [[      -inf 2.30258509]
+     [      -inf 2.30258509]
+     [      -inf 2.30258509]]
 
 
 All Gaussian process kernels are interoperable with :mod:`sklearn.metrics.pairwise`
diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 98ab4cf36b724..75aa7e27f48c4 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -31,9 +31,9 @@ that contain the missing values::
     SimpleImputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
     >>> X = [[np.nan, 2], [6, np.nan], [7, 6]]
     >>> print(imp.transform(X))           # doctest: +NORMALIZE_WHITESPACE  +ELLIPSIS
-    [[ 4.          2.        ]
-     [ 6.          3.666...]
-     [ 7.          6.        ]]
+    [[4.          2.        ]
+     [6.          3.666...]
+     [7.          6.        ]]
 
 The :class:`SimpleImputer` class also supports sparse matrices::
 
@@ -44,13 +44,13 @@ The :class:`SimpleImputer` class also supports sparse matrices::
     SimpleImputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)
     >>> X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
     >>> print(imp.transform(X_test))      # doctest: +NORMALIZE_WHITESPACE  +ELLIPSIS
-    [[ 4.          2.        ]
-     [ 6.          3.666...]
-     [ 7.          6.        ]]
+    [[4.          2.        ]
+     [6.          3.666...]
+     [7.          6.        ]]
 
 Note that, here, missing values are encoded by 0 and are thus implicitly stored
 in the matrix. This format is thus suitable when there are many more missing
 values than observed values.
 
 :class:`SimpleImputer` can be used in a Pipeline as a way to build a composite
-estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`.
\ No newline at end of file
+estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`.
diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst
index 71a8533583eb7..6ae5ac4a9b532 100644
--- a/doc/modules/learning_curve.rst
+++ b/doc/modules/learning_curve.rst
@@ -83,13 +83,13 @@ The function :func:`validation_curve` can help in this case::
   >>> train_scores, valid_scores = validation_curve(Ridge(), X, y, "alpha",
   ...                                               np.logspace(-7, 3, 3))
   >>> train_scores           # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
-  array([[ 0.94...,  0.92...,  0.92...],
-         [ 0.94...,  0.92...,  0.92...],
-         [ 0.47...,  0.45...,  0.42...]])
+  array([[0.94..., 0.92..., 0.92...],
+         [0.94..., 0.92..., 0.92...],
+         [0.47..., 0.45..., 0.42...]])
   >>> valid_scores           # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
-  array([[ 0.90...,  0.92...,  0.94...],
-         [ 0.90...,  0.92...,  0.94...],
-         [ 0.44...,  0.39...,  0.45...]])
+  array([[0.90..., 0.92..., 0.94...],
+         [0.90..., 0.92..., 0.94...],
+         [0.44..., 0.39..., 0.45...]])
 
 If the training score and the validation score are both low, the estimator will
 be underfitting. If the training score is high and the validation score is low,
@@ -148,11 +148,11 @@ average scores on the validation sets)::
   >>> train_sizes            # doctest: +NORMALIZE_WHITESPACE
   array([ 50, 80, 110])
   >>> train_scores           # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
-  array([[ 0.98...,  0.98 ,  0.98...,  0.98...,  0.98...],
-         [ 0.98...,  1.   ,  0.98...,  0.98...,  0.98...],
-         [ 0.98...,  1.   ,  0.98...,  0.98...,  0.99...]])
+  array([[0.98..., 0.98 , 0.98..., 0.98..., 0.98...],
+         [0.98..., 1.   , 0.98..., 0.98..., 0.98...],
+         [0.98..., 1.   , 0.98..., 0.98..., 0.99...]])
   >>> valid_scores           # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
-  array([[ 1. ,  0.93...,  1. ,  1. ,  0.96...],
-         [ 1. ,  0.96...,  1. ,  1. ,  0.96...],
-         [ 1. ,  0.96...,  1. ,  1. ,  0.96...]])
+  array([[1. , 0.93..., 1. , 1. , 0.96...],
+         [1. , 0.96..., 1. , 1. , 0.96...],
+         [1. , 0.96..., 1. , 1. , 0.96...]])
 
diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index d12c38204a184..4edb19a833727 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -47,7 +47,7 @@ and will store the coefficients :math:`w` of the linear model in its
     >>> reg.fit ([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
     LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
     >>> reg.coef_
-    array([ 0.5,  0.5])
+    array([0.5, 0.5])
 
 However, coefficient estimates for Ordinary Least Squares rely on the
 independence of the model terms. When terms are correlated and the
@@ -106,7 +106,7 @@ its ``coef_`` member::
     Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
           normalize=False, random_state=None, solver='auto', tol=0.001)
     >>> reg.coef_
-    array([ 0.34545455,  0.34545455])
+    array([0.34545455, 0.34545455])
     >>> reg.intercept_ #doctest: +ELLIPSIS
     0.13636...
 
@@ -188,7 +188,7 @@ for another implementation::
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='cyclic', tol=0.0001, warm_start=False)
     >>> reg.predict([[1, 1]])
-    array([ 0.8])
+    array([0.8])
 
 Also useful for lower-level tasks is the function :func:`lasso_path` that
 computes the coefficients along the full path of possible values.
@@ -453,7 +453,7 @@ function of the norm of its coefficients.
         fit_path=True, max_iter=500, normalize=True, positive=False,
         precompute='auto', verbose=False)
    >>> reg.coef_    # doctest: +ELLIPSIS
-   array([ 0.717157...,  0.        ])
+   array([0.717157..., 0.        ])
 
 .. topic:: Examples:
 
@@ -619,13 +619,13 @@ Bayesian Ridge Regression is used for regression::
 After being fitted, the model can then be used to predict new values::
 
     >>> reg.predict ([[1, 0.]])
-    array([ 0.50000013])
+    array([0.50000013])
 
 
 The weights :math:`w` of the model can be access::
 
     >>> reg.coef_
-    array([ 0.49999993,  0.49999993])
+    array([0.49999993, 0.49999993])
 
 Due to the Bayesian framework, the weights found are slightly different to the
 ones found by :ref:`ordinary_least_squares`. However, Bayesian Ridge Regression
@@ -1211,9 +1211,9 @@ of a given degree.  It can be used as follows::
            [4, 5]])
     >>> poly = PolynomialFeatures(degree=2)
     >>> poly.fit_transform(X)
-    array([[  1.,   0.,   1.,   0.,   0.,   1.],
-           [  1.,   2.,   3.,   4.,   6.,   9.],
-           [  1.,   4.,   5.,  16.,  20.,  25.]])
+    array([[ 1.,  0.,  1.,  0.,  0.,  1.],
+           [ 1.,  2.,  3.,  4.,  6.,  9.],
+           [ 1.,  4.,  5., 16., 20., 25.]])
 
 The features of ``X`` have been transformed from :math:`[x_1, x_2]` to
 :math:`[1, x_1, x_2, x_1^2, x_1 x_2, x_2^2]`, and can now be used within
diff --git a/doc/modules/metrics.rst b/doc/modules/metrics.rst
index 08779c750cf73..58cb636c6b9e2 100644
--- a/doc/modules/metrics.rst
+++ b/doc/modules/metrics.rst
@@ -166,10 +166,10 @@ It can be computed using :func:`chi2_kernel` and then passed to an
     >>> y = [0, 1, 0, 1]
     >>> K = chi2_kernel(X, gamma=.5)
     >>> K                        # doctest: +ELLIPSIS
-    array([[ 1.        ,  0.36...,  0.89...,  0.58...],
-           [ 0.36...,  1.        ,  0.51...,  0.83...],
-           [ 0.89...,  0.51...,  1.        ,  0.77... ],
-           [ 0.58...,  0.83...,  0.77... ,  1.        ]])
+    array([[1.        , 0.36787944, 0.89483932, 0.58364548],
+           [0.36787944, 1.        , 0.51341712, 0.83822343],
+           [0.89483932, 0.51341712, 1.        , 0.7768366 ],
+           [0.58364548, 0.83822343, 0.7768366 , 1.        ]])
 
     >>> svm = SVC(kernel='precomputed').fit(K, y)
     >>> svm.predict(K)
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 36045a7fa6199..e5024101fd1b0 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -830,7 +830,7 @@ Here are some small examples in binary classification::
   >>> metrics.fbeta_score(y_true, y_pred, beta=2) # doctest: +ELLIPSIS
   0.55...
   >>> metrics.precision_recall_fscore_support(y_true, y_pred, beta=0.5)  # doctest: +ELLIPSIS
-  (array([ 0.66...,  1.        ]), array([ 1. ,  0.5]), array([ 0.71...,  0.83...]), array([2, 2]...))
+  (array([0.66..., 1.        ]), array([1. , 0.5]), array([0.71..., 0.83...]), array([2, 2]))
 
 
   >>> import numpy as np
@@ -840,11 +840,11 @@ Here are some small examples in binary classification::
   >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
   >>> precision, recall, threshold = precision_recall_curve(y_true, y_scores)
   >>> precision  # doctest: +ELLIPSIS
-  array([ 0.66...,  0.5       ,  1.        ,  1.        ])
+  array([0.66..., 0.5       , 1.        , 1.        ])
   >>> recall
-  array([ 1. ,  0.5,  0.5,  0. ])
+  array([1. , 0.5, 0.5, 0. ])
   >>> threshold
-  array([ 0.35,  0.4 ,  0.8 ])
+  array([0.35, 0.4 , 0.8 ])
   >>> average_precision_score(y_true, y_scores)  # doctest: +ELLIPSIS
   0.83...
 
@@ -911,7 +911,7 @@ Then the metrics are defined as:
   0.23...
   >>> metrics.precision_recall_fscore_support(y_true, y_pred, beta=0.5, average=None)
   ... # doctest: +ELLIPSIS
-  (array([ 0.66...,  0.        ,  0.        ]), array([ 1.,  0.,  0.]), array([ 0.71...,  0.        ,  0.        ]), array([2, 2, 2]...))
+  (array([0.66..., 0.        , 0.        ]), array([1., 0., 0.]), array([0.71..., 0.        , 0.        ]), array([2, 2, 2]...))
 
 For multiclass classification with a "negative class", it is possible to exclude some labels:
 
@@ -1137,11 +1137,11 @@ Here is a small example of how to use the :func:`roc_curve` function::
     >>> scores = np.array([0.1, 0.4, 0.35, 0.8])
     >>> fpr, tpr, thresholds = roc_curve(y, scores, pos_label=2)
     >>> fpr
-    array([ 0. ,  0. ,  0.5,  0.5,  1. ])
+    array([0. , 0. , 0.5, 0.5, 1. ])
     >>> tpr
-    array([ 0. ,  0.5,  0.5,  1. ,  1. ])
+    array([0. , 0.5, 0.5, 1. , 1. ])
     >>> thresholds
-    array([ 1.8 ,  0.8 ,  0.4 ,  0.35,  0.1 ])
+    array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ])
 
 This figure shows an example of such an ROC curve:
 
@@ -1515,7 +1515,7 @@ function::
     >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
     >>> explained_variance_score(y_true, y_pred, multioutput='raw_values')
     ... # doctest: +ELLIPSIS
-    array([ 0.967...,  1.        ])
+    array([0.967..., 1.        ])
     >>> explained_variance_score(y_true, y_pred, multioutput=[0.3, 0.7])
     ... # doctest: +ELLIPSIS
     0.990...
@@ -1550,10 +1550,10 @@ Here is a small example of usage of the :func:`mean_absolute_error` function::
   >>> mean_absolute_error(y_true, y_pred)
   0.75
   >>> mean_absolute_error(y_true, y_pred, multioutput='raw_values')
-  array([ 0.5,  1. ])
+  array([0.5, 1. ])
   >>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])
   ... # doctest: +ELLIPSIS
-  0.849...
+  0.85...
 
 .. _mean_squared_error:
 
@@ -1699,7 +1699,7 @@ Here is a small example of usage of the :func:`r2_score` function::
   0.936...
   >>> r2_score(y_true, y_pred, multioutput='raw_values')
   ... # doctest: +ELLIPSIS
-  array([ 0.965...,  0.908...])
+  array([0.965..., 0.908...])
   >>> r2_score(y_true, y_pred, multioutput=[0.3, 0.7])
   ... # doctest: +ELLIPSIS
   0.925...
diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst
index 0b95cb168bf91..7090967aac221 100644
--- a/doc/modules/multiclass.rst
+++ b/doc/modules/multiclass.rst
@@ -182,7 +182,7 @@ Below is an example of multiclass learning using OvR::
   >>> from sklearn.svm import LinearSVC
   >>> iris = datasets.load_iris()
   >>> X, y = iris.data, iris.target
-  >>> OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, y).predict(X)
+  >>> OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, y).predict(X) # doctest: +NORMALIZE_WHITESPACE
   array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -239,7 +239,7 @@ Below is an example of multiclass learning using OvO::
   >>> from sklearn.svm import LinearSVC
   >>> iris = datasets.load_iris()
   >>> X, y = iris.data, iris.target
-  >>> OneVsOneClassifier(LinearSVC(random_state=0)).fit(X, y).predict(X)
+  >>> OneVsOneClassifier(LinearSVC(random_state=0)).fit(X, y).predict(X) # doctest: +NORMALIZE_WHITESPACE
   array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -304,7 +304,7 @@ Below is an example of multiclass learning using Output-Codes::
   >>> X, y = iris.data, iris.target
   >>> clf = OutputCodeClassifier(LinearSVC(random_state=0),
   ...                            code_size=2, random_state=0)
-  >>> clf.fit(X, y).predict(X)
+  >>> clf.fit(X, y).predict(X) # doctest: +NORMALIZE_WHITESPACE
   array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
@@ -436,4 +436,4 @@ Regressor Chain
 Regressor chains (see :class:`RegressorChain`) is analogous to 
 ClassifierChain as a way of combining a number of regressions 
 into a single multi-target model that is capable of exploiting 
-correlations among targets.
\ No newline at end of file
+correlations among targets.
diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst
index 2421e751ef1bd..afcc8d3331119 100644
--- a/doc/modules/neighbors.rst
+++ b/doc/modules/neighbors.rst
@@ -85,12 +85,12 @@ used:
            [4, 3],
            [5, 4]]...)
     >>> distances
-    array([[ 0.        ,  1.        ],
-           [ 0.        ,  1.        ],
-           [ 0.        ,  1.41421356],
-           [ 0.        ,  1.        ],
-           [ 0.        ,  1.        ],
-           [ 0.        ,  1.41421356]])
+    array([[0.        , 1.        ],
+           [0.        , 1.        ],
+           [0.        , 1.41421356],
+           [0.        , 1.        ],
+           [0.        , 1.        ],
+           [0.        , 1.41421356]])
 
 Because the query set matches the training set, the nearest neighbor of each
 point is the point itself, at a distance of zero.
@@ -99,12 +99,12 @@ It is also possible to efficiently produce a sparse graph showing the
 connections between neighboring points:
 
     >>> nbrs.kneighbors_graph(X).toarray()
-    array([[ 1.,  1.,  0.,  0.,  0.,  0.],
-           [ 1.,  1.,  0.,  0.,  0.,  0.],
-           [ 0.,  1.,  1.,  0.,  0.,  0.],
-           [ 0.,  0.,  0.,  1.,  1.,  0.],
-           [ 0.,  0.,  0.,  1.,  1.,  0.],
-           [ 0.,  0.,  0.,  0.,  1.,  1.]])
+    array([[1., 1., 0., 0., 0., 0.],
+           [1., 1., 0., 0., 0., 0.],
+           [0., 1., 1., 0., 0., 0.],
+           [0., 0., 0., 1., 1., 0.],
+           [0., 0., 0., 1., 1., 0.],
+           [0., 0., 0., 0., 1., 1.]])
 
 The dataset is structured such that points nearby in index order are nearby
 in parameter space, leading to an approximately block-diagonal matrix of
diff --git a/doc/modules/neural_networks_supervised.rst b/doc/modules/neural_networks_supervised.rst
index 177ef09c0dfad..582e4c83543d6 100644
--- a/doc/modules/neural_networks_supervised.rst
+++ b/doc/modules/neural_networks_supervised.rst
@@ -120,8 +120,8 @@ classification, it minimizes the Cross-Entropy loss function, giving a vector
 of probability estimates :math:`P(y|x)` per sample :math:`x`::
 
     >>> clf.predict_proba([[2., 2.], [1., 2.]])  # doctest: +ELLIPSIS
-    array([[  1.967...e-04,   9.998...-01],
-           [  1.967...e-04,   9.998...-01]])
+    array([[1.967...e-04, 9.998...-01],
+           [1.967...e-04, 9.998...-01]])
 
 :class:`MLPClassifier` supports multi-class classification by
 applying `Softmax <https://en.wikipedia.org/wiki/Softmax_activation_function>`_
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 7c779161c4b91..19bdfc0d432a0 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -64,10 +64,10 @@ operation on a single array-like dataset::
 Scaled data has zero mean and unit variance::
 
   >>> X_scaled.mean(axis=0)
-  array([ 0.,  0.,  0.])
+  array([0., 0., 0.])
 
   >>> X_scaled.std(axis=0)
-  array([ 1.,  1.,  1.])
+  array([1., 1., 1.])
 
 ..    >>> print_options = np.set_printoptions(print_options)
 
@@ -83,10 +83,10 @@ This class is hence suitable for use in the early steps of a
   StandardScaler(copy=True, with_mean=True, with_std=True)
 
   >>> scaler.mean_                                      # doctest: +ELLIPSIS
-  array([ 1. ...,  0. ...,  0.33...])
+  array([1. ..., 0. ..., 0.33...])
 
   >>> scaler.scale_                                       # doctest: +ELLIPSIS
-  array([ 0.81...,  0.81...,  1.24...])
+  array([0.81..., 0.81..., 1.24...])
 
   >>> scaler.transform(X_train)                           # doctest: +ELLIPSIS
   array([[ 0.  ..., -1.22...,  1.33...],
@@ -127,15 +127,15 @@ Here is an example to scale a toy data matrix to the ``[0, 1]`` range::
   >>> min_max_scaler = preprocessing.MinMaxScaler()
   >>> X_train_minmax = min_max_scaler.fit_transform(X_train)
   >>> X_train_minmax
-  array([[ 0.5       ,  0.        ,  1.        ],
-         [ 1.        ,  0.5       ,  0.33333333],
-         [ 0.        ,  1.        ,  0.        ]])
+  array([[0.5       , 0.        , 1.        ],
+         [1.        , 0.5       , 0.33333333],
+         [0.        , 1.        , 0.        ]])
 
 The same instance of the transformer can then be applied to some new test data
 unseen during the fit call: the same scaling and shifting operations will be
 applied to be consistent with the transformation performed on the train data::
 
-  >>> X_test = np.array([[ -3., -1.,  4.]])
+  >>> X_test = np.array([[-3., -1.,  4.]])
   >>> X_test_minmax = min_max_scaler.transform(X_test)
   >>> X_test_minmax
   array([[-1.5       ,  0.        ,  1.66666667]])
@@ -144,10 +144,10 @@ It is possible to introspect the scaler attributes to find about the exact
 nature of the transformation learned on the training data::
 
   >>> min_max_scaler.scale_                             # doctest: +ELLIPSIS
-  array([ 0.5       ,  0.5       ,  0.33...])
+  array([0.5       , 0.5       , 0.33...])
 
   >>> min_max_scaler.min_                               # doctest: +ELLIPSIS
-  array([ 0.        ,  0.5       ,  0.33...])
+  array([0.        , 0.5       , 0.33...])
 
 If :class:`MinMaxScaler` is given an explicit ``feature_range=(min, max)`` the
 full formula is::
@@ -178,7 +178,7 @@ Here is how to use the toy data from the previous example with this scaler::
   >>> X_test_maxabs                 # doctest: +NORMALIZE_WHITESPACE
   array([[-1.5, -1. ,  2. ]])
   >>> max_abs_scaler.scale_         # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-  array([ 2.,  1.,  2.])
+  array([2., 1., 2.])
 
 
 As with :func:`scale`, the module further provides convenience functions
@@ -328,9 +328,9 @@ lognormal distribution to a normal distribution::
   >>> pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
   >>> X_lognormal = np.random.RandomState(616).lognormal(size=(3, 3))
   >>> X_lognormal                                         # doctest: +ELLIPSIS
-  array([[ 1.28...,  1.18...,  0.84...],
-         [ 0.94...,  1.60...,  0.38...],
-         [ 1.35...,  0.21...,  1.09...]])
+  array([[1.28..., 1.18..., 0.84...],
+         [0.94..., 1.60..., 0.38...],
+         [1.35..., 0.21..., 1.09...]])
   >>> pt.fit_transform(X_lognormal)                   # doctest: +ELLIPSIS
   array([[ 0.49...,  0.17..., -0.15...],
          [-0.05...,  0.58..., -0.57...],
@@ -358,13 +358,13 @@ Using the earlier example with the iris dataset::
   ...     output_distribution='normal', random_state=0)
   >>> X_trans = quantile_transformer.fit_transform(X)
   >>> quantile_transformer.quantiles_ # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-  array([[ 4.3...,   2...,     1...,     0.1...],
-         [ 4.31...,  2.02...,  1.01...,  0.1...],
-         [ 4.32...,  2.05...,  1.02...,  0.1...],
+  array([[4.3...,   2...,     1...,     0.1...],
+         [4.31...,  2.02...,  1.01...,  0.1...],
+         [4.32...,  2.05...,  1.02...,  0.1...],
          ...,
-         [ 7.84...,  4.34...,  6.84...,  2.5...],
-         [ 7.87...,  4.37...,  6.87...,  2.5...],
-         [ 7.9...,   4.4...,   6.9...,   2.5...]])
+         [7.84...,  4.34...,  6.84...,  2.5...],
+         [7.87...,  4.37...,  6.87...,  2.5...],
+         [7.9...,   4.4...,   6.9...,   2.5...]])
 
 Thus the median of the input becomes the mean of the output, centered at 0. The
 normal output is clipped so that the input's minimum and maximum ---
@@ -467,17 +467,17 @@ as each sample is treated independently of others::
   Binarizer(copy=True, threshold=0.0)
 
   >>> binarizer.transform(X)
-  array([[ 1.,  0.,  1.],
-         [ 1.,  0.,  0.],
-         [ 0.,  1.,  0.]])
+  array([[1., 0., 1.],
+         [1., 0., 0.],
+         [0., 1., 0.]])
 
 It is possible to adjust the threshold of the binarizer::
 
   >>> binarizer = preprocessing.Binarizer(threshold=1.1)
   >>> binarizer.transform(X)
-  array([[ 0.,  0.,  1.],
-         [ 1.,  0.,  0.],
-         [ 0.,  0.,  0.]])
+  array([[0., 0., 1.],
+         [1., 0., 0.],
+         [0., 0., 0.]])
 
 As for the :class:`StandardScaler` and :class:`Normalizer` classes, the
 preprocessing module provides a companion function :func:`binarize`
@@ -518,7 +518,7 @@ new feature of integers (0 to n_categories - 1)::
     CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
               encoding='ordinal', handle_unknown='error')
     >>> enc.transform([['female', 'from US', 'uses Safari']])
-    array([[ 0.,  1.,  1.]])
+    array([[0., 1., 1.]])
 
 Such integer representation can, however, not be used directly with all
 scikit-learn estimators, as these expect continuous input, and would interpret
@@ -542,8 +542,8 @@ Continuing the example above::
             encoding='onehot', handle_unknown='error')
   >>> enc.transform([['female', 'from US', 'uses Safari'],
   ...                ['male', 'from Europe', 'uses Safari']]).toarray()
-  array([[ 1.,  0.,  0.,  1.,  0.,  1.],
-         [ 0.,  1.,  1.,  0.,  0.,  1.]])
+  array([[1., 0., 0., 1., 0., 1.],
+         [0., 1., 1., 0., 0., 1.]])
 
 By default, the values each feature can take is inferred automatically
 from the dataset and can be found in the ``categories_`` attribute::
@@ -567,7 +567,7 @@ dataset::
               dtype=<... 'numpy.float64'>, encoding='onehot',
               handle_unknown='error')
     >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
-    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.]])
+    array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])
 
 If there is a possibility that the training data might have missing categorical
 features, it can often be better to specify ``handle_unknown='ignore'`` instead
@@ -583,7 +583,7 @@ columns for this feature will be all zeros
     CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
               encoding='onehot', handle_unknown='ignore')
     >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
-    array([[ 1.,  0.,  0.,  0.,  0.,  0.]])
+    array([[1., 0., 0., 0., 0., 0.]])
 
 
 See :ref:`dict_feature_extraction` for categorical features that are represented
@@ -612,9 +612,9 @@ Often it's useful to add complexity to the model by considering nonlinear featur
            [4, 5]])
     >>> poly = PolynomialFeatures(2)
     >>> poly.fit_transform(X)                             # doctest: +ELLIPSIS
-    array([[  1.,   0.,   1.,   0.,   0.,   1.],
-           [  1.,   2.,   3.,   4.,   6.,   9.],
-           [  1.,   4.,   5.,  16.,  20.,  25.]])
+    array([[ 1.,  0.,  1.,  0.,  0.,  1.],
+           [ 1.,  2.,  3.,  4.,  6.,  9.],
+           [ 1.,  4.,  5., 16., 20., 25.]])
 
 The features of X have been transformed from :math:`(X_1, X_2)` to :math:`(1, X_1, X_2, X_1^2, X_1X_2, X_2^2)`.
 
@@ -627,9 +627,9 @@ In some cases, only interaction terms among features are required, and it can be
            [6, 7, 8]])
     >>> poly = PolynomialFeatures(degree=3, interaction_only=True)
     >>> poly.fit_transform(X)                             # doctest: +ELLIPSIS
-    array([[   1.,    0.,    1.,    2.,    0.,    0.,    2.,    0.],
-           [   1.,    3.,    4.,    5.,   12.,   15.,   20.,   60.],
-           [   1.,    6.,    7.,    8.,   42.,   48.,   56.,  336.]])
+    array([[  1.,   0.,   1.,   2.,   0.,   0.,   2.,   0.],
+           [  1.,   3.,   4.,   5.,  12.,  15.,  20.,  60.],
+           [  1.,   6.,   7.,   8.,  42.,  48.,  56., 336.]])
 
 The features of X have been transformed from :math:`(X_1, X_2, X_3)` to :math:`(1, X_1, X_2, X_3, X_1X_2, X_1X_3, X_2X_3, X_1X_2X_3)`.
 
@@ -652,8 +652,8 @@ a transformer that applies a log transformation in a pipeline, do::
     >>> transformer = FunctionTransformer(np.log1p)
     >>> X = np.array([[0, 1], [2, 3]])
     >>> transformer.transform(X)
-    array([[ 0.        ,  0.69314718],
-           [ 1.09861229,  1.38629436]])
+    array([[0.        , 0.69314718],
+           [1.09861229, 1.38629436]])
 
 You can ensure that ``func`` and ``inverse_func`` are the inverse of each other
 by setting ``check_inverse=True`` and calling ``fit`` before
diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst
index 75340a0eb0e83..6b384e12ce31d 100644
--- a/doc/modules/sgd.rst
+++ b/doc/modules/sgd.rst
@@ -77,7 +77,7 @@ SGD fits a linear model to the training data. The member ``coef_`` holds
 the model parameters::
 
     >>> clf.coef_                                         # doctest: +ELLIPSIS
-    array([[ 9.9...,  9.9...]])
+    array([[9.9..., 9.9...]])
 
 Member ``intercept_`` holds the intercept (aka offset or bias)::
 
@@ -90,7 +90,7 @@ hyperplane, is controlled by the parameter ``fit_intercept``.
 To get the signed distance to the hyperplane use :meth:`SGDClassifier.decision_function`::
 
     >>> clf.decision_function([[2., 2.]])                 # doctest: +ELLIPSIS
-    array([ 29.6...])
+    array([29.6...])
 
 The concrete loss function can be set via the ``loss``
 parameter. :class:`SGDClassifier` supports the following loss functions:
@@ -111,7 +111,7 @@ Using ``loss="log"`` or ``loss="modified_huber"`` enables the
 
     >>> clf = SGDClassifier(loss="log", max_iter=5).fit(X, y)
     >>> clf.predict_proba([[1., 1.]])                      # doctest: +ELLIPSIS
-    array([[ 0.00...,  0.99...]])
+    array([[0.00..., 0.99...]])
 
 The concrete penalty can be set via the ``penalty`` parameter.
 SGD supports the following penalties:
diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst
index f0c6a0e76a576..aac074cc2d995 100644
--- a/doc/modules/svm.rst
+++ b/doc/modules/svm.rst
@@ -94,8 +94,8 @@ can be found in members ``support_vectors_``, ``support_`` and
 
     >>> # get support vectors
     >>> clf.support_vectors_
-    array([[ 0.,  0.],
-           [ 1.,  1.]])
+    array([[0., 0.],
+           [1., 1.]])
     >>> # get indices of support vectors
     >>> clf.support_ # doctest: +ELLIPSIS
     array([0, 1]...)
@@ -322,7 +322,7 @@ floating point values instead of integer values::
         gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
         tol=0.001, verbose=False)
     >>> clf.predict([[1, 1]])
-    array([ 1.5])
+    array([1.5])
 
 
 .. topic:: Examples:
diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst
index c6a5a74eb17fc..91b58b031b418 100644
--- a/doc/modules/tree.rst
+++ b/doc/modules/tree.rst
@@ -110,7 +110,7 @@ Alternatively, the probability of each class can be predicted, which is the
 fraction of training samples of the same class in a leaf::
 
     >>> clf.predict_proba([[2., 2.]])
-    array([[ 0.,  1.]])
+    array([[0., 1.]])
 
 :class:`DecisionTreeClassifier` is capable of both binary (where the
 labels are [-1, 1]) classification and multiclass (where the labels are
@@ -175,7 +175,7 @@ Alternatively, the probability of each class can be predicted, which is the
 fraction of training samples of the same class in a leaf::
 
     >>> clf.predict_proba(iris.data[:1, :])
-    array([[ 1.,  0.,  0.]])
+    array([[1., 0., 0.]])
 
 .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_001.png
    :target: ../auto_examples/tree/plot_iris.html
@@ -210,7 +210,7 @@ instead of integer values::
     >>> clf = tree.DecisionTreeRegressor()
     >>> clf = clf.fit(X, y)
     >>> clf.predict([[1, 1]])
-    array([ 0.5])
+    array([0.5])
 
 .. topic:: Examples:
 
diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst
index edb2f046d7bee..ece691f7de973 100644
--- a/doc/tutorial/basic/tutorial.rst
+++ b/doc/tutorial/basic/tutorial.rst
@@ -101,13 +101,13 @@ For instance, in the case of the digits dataset, ``digits.data`` gives
 access to the features that can be used to classify the digits samples::
 
   >>> print(digits.data)  # doctest: +NORMALIZE_WHITESPACE
-  [[  0.   0.   5. ...,   0.   0.   0.]
-   [  0.   0.   0. ...,  10.   0.   0.]
-   [  0.   0.   0. ...,  16.   9.   0.]
-   ...,
-   [  0.   0.   1. ...,   6.   0.   0.]
-   [  0.   0.   2. ...,  12.   0.   0.]
-   [  0.   0.  10. ...,  12.   1.   0.]]
+  [[ 0.   0.   5. ...   0.   0.   0.]
+   [ 0.   0.   0. ...  10.   0.   0.]
+   [ 0.   0.   0. ...  16.   9.   0.]
+   ...
+   [ 0.   0.   1. ...   6.   0.   0.]
+   [ 0.   0.   2. ...  12.   0.   0.]
+   [ 0.   0.  10. ...  12.   1.   0.]]
 
 and ``digits.target`` gives the ground truth for the digit dataset, that
 is the number corresponding to each digit image that we are trying to
@@ -123,7 +123,7 @@ learn::
     digits, each original sample is an image of shape ``(8, 8)`` and can be
     accessed using::
 
-      >>> digits.images[0]
+      >>> digits.images[0]  # doctest: +NORMALIZE_WHITESPACE
       array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
              [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
              [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
index 315ca420e4d19..3feba26c6a77d 100644
--- a/doc/tutorial/statistical_inference/model_selection.rst
+++ b/doc/tutorial/statistical_inference/model_selection.rst
@@ -19,7 +19,7 @@ better**.
     >>> y_digits = digits.target
     >>> svc = svm.SVC(C=1, kernel='linear')
     >>> svc.fit(X_digits[:-100], y_digits[:-100]).score(X_digits[-100:], y_digits[-100:])
-    0.97999999999999998
+    0.98
 
 To get a better measure of prediction accuracy (which we can use as a
 proxy for goodness of fit of the model), we can successively split the
@@ -38,8 +38,8 @@ data in *folds* that we use for training and testing::
     ...     y_test  = y_train.pop(k)
     ...     y_train = np.concatenate(y_train)
     ...     scores.append(svc.fit(X_train, y_train).score(X_test, y_test))
-    >>> print(scores)
-    [0.93489148580968284, 0.95659432387312182, 0.93989983305509184]
+    >>> print(scores)  # doctest: +ELLIPSIS
+    [0.934..., 0.956..., 0.939...]
 
 .. currentmodule:: sklearn.model_selection
 
@@ -71,8 +71,8 @@ This example shows an example usage of the ``split`` method.
 The cross-validation can then be performed easily::
 
     >>> [svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test])
-    ...          for train, test in k_fold.split(X_digits)]
-    [0.93489148580968284, 0.95659432387312182, 0.93989983305509184]
+    ...          for train, test in k_fold.split(X_digits)]  # doctest: +ELLIPSIS
+    [0.934..., 0.956..., 0.939...]
 
 The cross-validation score can be directly calculated using the
 :func:`cross_val_score` helper. Given an estimator, the cross-validation object
@@ -86,7 +86,7 @@ Refer the :ref:`metrics module <metrics>` to learn more on the available scoring
 methods.
 
     >>> cross_val_score(svc, X_digits, y_digits, cv=k_fold, n_jobs=-1)
-    array([ 0.93489149,  0.95659432,  0.93989983])
+    array([0.93489149, 0.95659432, 0.93989983])
 
 `n_jobs=-1` means that the computation will be dispatched on all the CPUs
 of the computer.
@@ -96,7 +96,7 @@ scoring method.
 
     >>> cross_val_score(svc, X_digits, y_digits, cv=k_fold,
     ...                 scoring='precision_macro')
-    array([ 0.93969761,  0.95911415,  0.94041254])
+    array([0.93969761, 0.95911415, 0.94041254])
 
    **Cross-validation generators**
 
@@ -237,7 +237,7 @@ a stratified 3-fold.
 
         >>> cross_val_score(clf, X_digits, y_digits)
         ...                                               # doctest: +ELLIPSIS
-        array([ 0.938...,  0.963...,  0.944...])
+        array([0.938..., 0.963..., 0.944...])
 
     Two cross-validation loops are performed in parallel: one by the
     :class:`GridSearchCV` estimator to set ``gamma`` and the other one by
diff --git a/doc/tutorial/text_analytics/working_with_text_data.rst b/doc/tutorial/text_analytics/working_with_text_data.rst
index cd222386d79c0..9be0b0af3fc59 100644
--- a/doc/tutorial/text_analytics/working_with_text_data.rst
+++ b/doc/tutorial/text_analytics/working_with_text_data.rst
@@ -457,7 +457,7 @@ The object's ``best_score_`` and ``best_params_`` attributes store the best
 mean score and the parameters setting corresponding to that score::
 
   >>> gs_clf.best_score_                                  # doctest: +ELLIPSIS
-  0.900...
+  0.9...
   >>> for param_name in sorted(parameters.keys()):
   ...     print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
   ...
diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
index e0c58076e55fb..54a1864a67d5a 100644
--- a/sklearn/cluster/k_means_.py
+++ b/sklearn/cluster/k_means_.py
@@ -827,8 +827,8 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
     >>> kmeans.predict([[0, 0], [4, 4]])
     array([0, 1], dtype=int32)
     >>> kmeans.cluster_centers_
-    array([[ 1.,  2.],
-           [ 4.,  2.]])
+    array([[1., 2.],
+           [4., 2.]])
 
     See also
     --------
diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py
index 3793840cb136d..0537bf43a5260 100644
--- a/sklearn/compose/_target.py
+++ b/sklearn/compose/_target.py
@@ -86,7 +86,7 @@ class TransformedTargetRegressor(BaseEstimator, RegressorMixin):
     >>> tt.score(X, y)
     1.0
     >>> tt.regressor_.coef_
-    array([ 2.])
+    array([2.])
 
     Notes
     -----
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index e895ef5739859..e06466b2fcf27 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -1559,7 +1559,7 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
     >>> y = diabetes.target[:150]
     >>> lasso = linear_model.Lasso()
     >>> print(cross_val_score(lasso, X, y))  # doctest:  +ELLIPSIS
-    [ 0.33150734  0.08022311  0.03531764]
+    [0.33150734 0.08022311 0.03531764]
 
     See Also
     ---------
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
index 2b715b7e06824..c5ceb40672dbe 100644
--- a/sklearn/decomposition/pca.py
+++ b/sklearn/decomposition/pca.py
@@ -279,27 +279,27 @@ class PCA(_BasePCA):
     PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
       svd_solver='auto', tol=0.0, whiten=False)
     >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244...  0.00755...]
+    [0.9924... 0.0075...]
     >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061...  0.54980...]
+    [6.30061... 0.54980...]
 
     >>> pca = PCA(n_components=2, svd_solver='full')
     >>> pca.fit(X)                 # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
       svd_solver='full', tol=0.0, whiten=False)
     >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244...  0.00755...]
+    [0.9924... 0.00755...]
     >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061...  0.54980...]
+    [6.30061... 0.54980...]
 
     >>> pca = PCA(n_components=1, svd_solver='arpack')
     >>> pca.fit(X)
     PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
       svd_solver='arpack', tol=0.0, whiten=False)
     >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244...]
+    [0.99244...]
     >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061...]
+    [6.30061...]
 
     See also
     --------
@@ -673,9 +673,9 @@ class RandomizedPCA(BaseEstimator, TransformerMixin):
     RandomizedPCA(copy=True, iterated_power=2, n_components=2,
            random_state=None, whiten=False)
     >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244...  0.00755...]
+    [0.9924... 0.007557...]
     >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061...  0.54980...]
+    [6.30061... 0.54980...]
 
     See also
     --------
diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py
index 726f9162eb925..049c165baea20 100644
--- a/sklearn/decomposition/truncated_svd.py
+++ b/sklearn/decomposition/truncated_svd.py
@@ -91,11 +91,11 @@ class TruncatedSVD(BaseEstimator, TransformerMixin):
     TruncatedSVD(algorithm='randomized', n_components=5, n_iter=7,
             random_state=42, tol=0.0)
     >>> print(svd.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.0606... 0.0584... 0.0497... 0.0434... 0.0372...]
+    [0.0606... 0.0584... 0.0497... 0.0434... 0.0372...]
     >>> print(svd.explained_variance_ratio_.sum())  # doctest: +ELLIPSIS
     0.249...
     >>> print(svd.singular_values_)  # doctest: +ELLIPSIS
-    [ 2.5841... 2.5245... 2.3201... 2.1753... 2.0443...]
+    [2.5841... 2.5245... 2.3201... 2.1753... 2.0443...]
 
     See also
     --------
diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
index 6d354ce5b0e5b..8293141ef1ba8 100644
--- a/sklearn/ensemble/forest.py
+++ b/sklearn/ensemble/forest.py
@@ -942,7 +942,7 @@ class labels (multi-output problem).
                 min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
                 oob_score=False, random_state=0, verbose=0, warm_start=False)
     >>> print(clf.feature_importances_)
-    [ 0.17287856  0.80608704  0.01884792  0.00218648]
+    [0.17287856 0.80608704 0.01884792 0.00218648]
     >>> print(clf.predict([[0, 0, 0, 0]]))
     [1]
 
@@ -1182,7 +1182,7 @@ class RandomForestRegressor(ForestRegressor):
                min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
                oob_score=False, random_state=0, verbose=0, warm_start=False)
     >>> print(regr.feature_importances_)
-    [ 0.17339552  0.81594114  0.          0.01066333]
+    [0.17339552 0.81594114 0.         0.01066333]
     >>> print(regr.predict([[0, 0, 0, 0]]))
     [-2.50699856]
 
diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py
index 194e922596bf5..5b8f932e889c7 100644
--- a/sklearn/feature_extraction/dict_vectorizer.py
+++ b/sklearn/feature_extraction/dict_vectorizer.py
@@ -78,13 +78,13 @@ class DictVectorizer(BaseEstimator, TransformerMixin):
     >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
     >>> X = v.fit_transform(D)
     >>> X
-    array([[ 2.,  0.,  1.],
-           [ 0.,  1.,  3.]])
+    array([[2., 0., 1.],
+           [0., 1., 3.]])
     >>> v.inverse_transform(X) == \
         [{'bar': 2.0, 'foo': 1.0}, {'baz': 1.0, 'foo': 3.0}]
     True
     >>> v.transform({'foo': 4, 'unseen_feature': 3})
-    array([[ 0.,  0.,  4.]])
+    array([[0., 0., 4.]])
 
     See also
     --------
diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py
index 7134e33c1735a..b218436ca8546 100644
--- a/sklearn/feature_selection/rfe.py
+++ b/sklearn/feature_selection/rfe.py
@@ -96,8 +96,8 @@ class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin):
     >>> selector = RFE(estimator, 5, step=1)
     >>> selector = selector.fit(X, y)
     >>> selector.support_ # doctest: +NORMALIZE_WHITESPACE
-    array([ True,  True,  True,  True,  True,
-            False, False, False, False, False], dtype=bool)
+    array([ True,  True,  True,  True,  True, False, False, False, False,
+           False])
     >>> selector.ranking_
     array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])
 
@@ -365,8 +365,8 @@ class RFECV(RFE, MetaEstimatorMixin):
     >>> selector = RFECV(estimator, step=1, cv=5)
     >>> selector = selector.fit(X, y)
     >>> selector.support_ # doctest: +NORMALIZE_WHITESPACE
-    array([ True,  True,  True,  True,  True,
-            False, False, False, False, False], dtype=bool)
+    array([ True,  True,  True,  True,  True, False, False, False, False,
+           False])
     >>> selector.ranking_
     array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])
 
diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py
index 9fda4f88cfc27..7c220a67772c6 100644
--- a/sklearn/linear_model/bayes.py
+++ b/sklearn/linear_model/bayes.py
@@ -106,7 +106,7 @@ class BayesianRidge(LinearModel, RegressorMixin):
             copy_X=True, fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06,
             n_iter=300, normalize=False, tol=0.001, verbose=False)
     >>> clf.predict([[1, 1]])
-    array([ 1.])
+    array([1.])
 
     Notes
     -----
@@ -385,7 +385,7 @@ class ARDRegression(LinearModel, RegressorMixin):
             n_iter=300, normalize=False, threshold_lambda=10000.0, tol=0.001,
             verbose=False)
     >>> clf.predict([[1, 1]])
-    array([ 1.])
+    array([1.])
 
     Notes
     -----
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index c2c3afead51e6..0bbfb87ebe4c5 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -235,8 +235,8 @@ def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None,
     >>> # Use lasso_path to compute a coefficient path
     >>> _, coef_path, _ = lasso_path(X, y, alphas=[5., 1., .5])
     >>> print(coef_path)
-    [[ 0.          0.          0.46874778]
-     [ 0.2159048   0.4425765   0.23689075]]
+    [[0.         0.         0.46874778]
+     [0.2159048  0.4425765  0.23689075]]
 
     >>> # Now use lars_path and 1D linear interpolation to compute the
     >>> # same path
@@ -246,8 +246,8 @@ def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None,
     >>> coef_path_continuous = interpolate.interp1d(alphas[::-1],
     ...                                             coef_path_lars[:, ::-1])
     >>> print(coef_path_continuous([5., 1., .5]))
-    [[ 0.          0.          0.46915237]
-     [ 0.2159048   0.4425765   0.23668876]]
+    [[0.         0.         0.46915237]
+     [0.2159048  0.4425765  0.23668876]]
 
 
     See also
@@ -627,11 +627,11 @@ class ElasticNet(LinearModel, RegressorMixin):
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=0, selection='cyclic', tol=0.0001, warm_start=False)
     >>> print(regr.coef_) # doctest: +ELLIPSIS
-    [ 18.83816048  64.55968825]
+    [18.83816048 64.55968825]
     >>> print(regr.intercept_) # doctest: +ELLIPSIS
-    1.45126075617
+    1.451...
     >>> print(regr.predict([[0, 0]])) # doctest: +ELLIPSIS
-    [ 1.45126076]
+    [1.451...]
 
 
     Notes
@@ -904,9 +904,9 @@ class Lasso(ElasticNet):
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='cyclic', tol=0.0001, warm_start=False)
     >>> print(clf.coef_)
-    [ 0.85  0.  ]
-    >>> print(clf.intercept_)
-    0.15
+    [0.85 0.  ]
+    >>> print(clf.intercept_)  # doctest: +ELLIPSIS
+    0.15...
 
     See also
     --------
@@ -1530,11 +1530,11 @@ class ElasticNetCV(LinearModelCV, RegressorMixin):
            normalize=False, positive=False, precompute='auto', random_state=0,
            selection='cyclic', tol=0.0001, verbose=0)
     >>> print(regr.alpha_) # doctest: +ELLIPSIS
-    0.19947279427
+    0.199...
     >>> print(regr.intercept_) # doctest: +ELLIPSIS
-    0.398882965428
+    0.398...
     >>> print(regr.predict([[0, 0]])) # doctest: +ELLIPSIS
-    [ 0.39888297]
+    [0.398...]
 
 
     Notes
@@ -1694,10 +1694,10 @@ class MultiTaskElasticNet(Lasso):
             l1_ratio=0.5, max_iter=1000, normalize=False, random_state=None,
             selection='cyclic', tol=0.0001, warm_start=False)
     >>> print(clf.coef_)
-    [[ 0.45663524  0.45612256]
-     [ 0.45663524  0.45612256]]
+    [[0.45663524 0.45612256]
+     [0.45663524 0.45612256]]
     >>> print(clf.intercept_)
-    [ 0.0872422  0.0872422]
+    [0.0872422 0.0872422]
 
     See also
     --------
@@ -1883,10 +1883,10 @@ class MultiTaskLasso(MultiTaskElasticNet):
             normalize=False, random_state=None, selection='cyclic', tol=0.0001,
             warm_start=False)
     >>> print(clf.coef_)
-    [[ 0.89393398  0.        ]
-     [ 0.89393398  0.        ]]
+    [[0.89393398 0.        ]
+     [0.89393398 0.        ]]
     >>> print(clf.intercept_)
-    [ 0.10606602  0.10606602]
+    [0.10606602 0.10606602]
 
     See also
     --------
@@ -2057,10 +2057,10 @@ class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin):
            n_jobs=1, normalize=False, random_state=None, selection='cyclic',
            tol=0.0001, verbose=0)
     >>> print(clf.coef_)
-    [[ 0.52875032  0.46958558]
-     [ 0.52875032  0.46958558]]
+    [[0.52875032 0.46958558]
+     [0.52875032 0.46958558]]
     >>> print(clf.intercept_)
-    [ 0.00166409  0.00166409]
+    [0.00166409 0.00166409]
 
     See also
     --------
diff --git a/sklearn/linear_model/passive_aggressive.py b/sklearn/linear_model/passive_aggressive.py
index 91bb4d7b6ac06..e803840279adc 100644
--- a/sklearn/linear_model/passive_aggressive.py
+++ b/sklearn/linear_model/passive_aggressive.py
@@ -123,9 +123,9 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
                   n_jobs=1, random_state=0, shuffle=True, tol=None, verbose=0,
                   warm_start=False)
     >>> print(clf.coef_)
-    [[ 0.49324685  1.0552176   1.49519589  1.33798314]]
+    [[0.49324685 1.0552176  1.49519589 1.33798314]]
     >>> print(clf.intercept_)
-    [ 2.18438388]
+    [2.18438388]
     >>> print(clf.predict([[0, 0, 0, 0]]))
     [1]
 
@@ -333,7 +333,7 @@ class PassiveAggressiveRegressor(BaseSGDRegressor):
                   max_iter=None, n_iter=None, random_state=0, shuffle=True,
                   tol=None, verbose=0, warm_start=False)
     >>> print(regr.coef_)
-    [ 20.48736655  34.18818427  67.59122734  87.94731329]
+    [20.48736655 34.18818427 67.59122734 87.94731329]
     >>> print(regr.intercept_)
     [-0.02306214]
     >>> print(regr.predict([[0, 0, 0, 0]]))
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index a9e752c662883..02346da8a683e 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -708,7 +708,7 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
     >>> f1_score(y_true, y_pred, average='weighted')  # doctest: +ELLIPSIS
     0.26...
     >>> f1_score(y_true, y_pred, average=None)
-    array([ 0.8,  0. ,  0. ])
+    array([0.8, 0. , 0. ])
 
 
     """
@@ -819,7 +819,7 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
     0.23...
     >>> fbeta_score(y_true, y_pred, average=None, beta=0.5)
     ... # doctest: +ELLIPSIS
-    array([ 0.71...,  0.        ,  0.        ])
+    array([0.71..., 0.        , 0.        ])
 
     """
     _, _, f, _ = precision_recall_fscore_support(y_true, y_pred,
@@ -1012,9 +1012,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
     >>> precision_recall_fscore_support(y_true, y_pred, average=None,
     ... labels=['pig', 'dog', 'cat'])
     ... # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE
-    (array([ 0. ,  0. ,  0.66...]),
-     array([ 0.,  0.,  1.]),
-     array([ 0. ,  0. ,  0.8]),
+    (array([0.        , 0.        , 0.66...]),
+     array([0., 0., 1.]), array([0. , 0. , 0.8]),
      array([2, 2, 2]))
 
     """
@@ -1255,7 +1254,7 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
     ... # doctest: +ELLIPSIS
     0.22...
     >>> precision_score(y_true, y_pred, average=None)  # doctest: +ELLIPSIS
-    array([ 0.66...,  0.        ,  0.        ])
+    array([0.66..., 0.        , 0.        ])
 
     """
     p, _, _, _ = precision_recall_fscore_support(y_true, y_pred,
@@ -1352,7 +1351,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
     >>> recall_score(y_true, y_pred, average='weighted')  # doctest: +ELLIPSIS
     0.33...
     >>> recall_score(y_true, y_pred, average=None)
-    array([ 1.,  0.,  0.])
+    array([1., 0., 0.])
 
 
     """
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index a987778ae5d2e..b73303ad623fc 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -343,10 +343,10 @@ def homogeneity_score(labels_true, labels_pred):
 
       >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2]))
       ...                                                  # doctest: +ELLIPSIS
-      1.0...
+      1.000000
       >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3]))
       ...                                                  # doctest: +ELLIPSIS
-      1.0...
+      1.000000
 
     Clusters that include samples from different classes do not make for an
     homogeneous labeling::
@@ -418,7 +418,7 @@ def completeness_score(labels_true, labels_pred):
       >>> print(completeness_score([0, 0, 1, 1], [0, 0, 0, 0]))
       1.0
       >>> print(completeness_score([0, 1, 2, 3], [0, 0, 1, 1]))
-      1.0
+      0.999...
 
     If classes members are split across different clusters, the
     assignment cannot be complete::
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 179fc97aa87e2..e13bcbfeb8042 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -209,12 +209,12 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
     >>> X = [[0, 1], [1, 1]]
     >>> # distance between rows of X
     >>> euclidean_distances(X, X)
-    array([[ 0.,  1.],
-           [ 1.,  0.]])
+    array([[0., 1.],
+           [1., 0.]])
     >>> # get distance to origin
     >>> euclidean_distances(X, [[0, 0]])
-    array([[ 1.        ],
-           [ 1.41421356]])
+    array([[1.        ],
+           [1.41421356]])
 
     See also
     --------
@@ -505,21 +505,21 @@ def manhattan_distances(X, Y=None, sum_over_features=True,
     --------
     >>> from sklearn.metrics.pairwise import manhattan_distances
     >>> manhattan_distances([[3]], [[3]])#doctest:+ELLIPSIS
-    array([[ 0.]])
+    array([[0.]])
     >>> manhattan_distances([[3]], [[2]])#doctest:+ELLIPSIS
-    array([[ 1.]])
+    array([[1.]])
     >>> manhattan_distances([[2]], [[3]])#doctest:+ELLIPSIS
-    array([[ 1.]])
+    array([[1.]])
     >>> manhattan_distances([[1, 2], [3, 4]],\
          [[1, 2], [0, 3]])#doctest:+ELLIPSIS
-    array([[ 0.,  2.],
-           [ 4.,  4.]])
+    array([[0., 2.],
+           [4., 4.]])
     >>> import numpy as np
     >>> X = np.ones((1, 2))
     >>> y = 2 * np.ones((2, 2))
     >>> manhattan_distances(X, y, sum_over_features=False)#doctest:+ELLIPSIS
-    array([[ 1.,  1.],
-           [ 1.,  1.]]...)
+    array([[1., 1.],
+           [1., 1.]])
     """
     if size_threshold is not None:
         warnings.warn('Use of the "size_threshold" is deprecated '
@@ -700,7 +700,7 @@ def paired_distances(X, Y, metric="euclidean", **kwds):
     >>> X = [[0, 1], [1, 1]]
     >>> Y = [[0, 1], [2, 1]]
     >>> paired_distances(X, Y)
-    array([ 0.,  1.])
+    array([0., 1.])
 
     See also
     --------
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index d612e913d2a06..8800561f0cfa3 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -489,11 +489,11 @@ def precision_recall_curve(y_true, probas_pred, pos_label=None,
     >>> precision, recall, thresholds = precision_recall_curve(
     ...     y_true, y_scores)
     >>> precision  # doctest: +ELLIPSIS
-    array([ 0.66...,  0.5       ,  1.        ,  1.        ])
+    array([0.66666667, 0.5       , 1.        , 1.        ])
     >>> recall
-    array([ 1. ,  0.5,  0.5,  0. ])
+    array([1. , 0.5, 0.5, 0. ])
     >>> thresholds
-    array([ 0.35,  0.4 ,  0.8 ])
+    array([0.35, 0.4 , 0.8 ])
 
     """
     fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred,
@@ -585,11 +585,11 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
     >>> scores = np.array([0.1, 0.4, 0.35, 0.8])
     >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
     >>> fpr
-    array([ 0. ,  0. ,  0.5,  0.5,  1. ])
+    array([0. , 0. , 0.5, 0.5, 1. ])
     >>> tpr
-    array([ 0. ,  0.5,  0.5,  1. ,  1. ])
+    array([0. , 0.5, 0.5, 1. , 1. ])
     >>> thresholds
-    array([ 1.8 ,  0.8 ,  0.4 ,  0.35,  0.1 ])
+    array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ])
 
     """
     fps, tps, thresholds = _binary_clf_curve(
diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py
index 0a1ec94e1dec4..4bc88561a73fd 100644
--- a/sklearn/metrics/regression.py
+++ b/sklearn/metrics/regression.py
@@ -161,10 +161,10 @@ def mean_absolute_error(y_true, y_pred,
     >>> mean_absolute_error(y_true, y_pred)
     0.75
     >>> mean_absolute_error(y_true, y_pred, multioutput='raw_values')
-    array([ 0.5,  1. ])
+    array([0.5, 1. ])
     >>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])
     ... # doctest: +ELLIPSIS
-    0.849...
+    0.85...
     """
     y_type, y_true, y_pred, multioutput = _check_reg_targets(
         y_true, y_pred, multioutput)
@@ -229,10 +229,10 @@ def mean_squared_error(y_true, y_pred,
     0.708...
     >>> mean_squared_error(y_true, y_pred, multioutput='raw_values')
     ... # doctest: +ELLIPSIS
-    array([ 0.416...,  1.        ])
+    array([0.41666667, 1.        ])
     >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])
     ... # doctest: +ELLIPSIS
-    0.824...
+    0.825...
 
     """
     y_type, y_true, y_pred, multioutput = _check_reg_targets(
@@ -300,7 +300,7 @@ def mean_squared_log_error(y_true, y_pred,
     0.044...
     >>> mean_squared_log_error(y_true, y_pred, multioutput='raw_values')
     ... # doctest: +ELLIPSIS
-    array([ 0.004...,  0.083...])
+    array([0.00462428, 0.08377444])
     >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
     ... # doctest: +ELLIPSIS
     0.060...
diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py
index 207eff9f1502a..b3c231314cc21 100644
--- a/sklearn/mixture/gmm.py
+++ b/sklearn/mixture/gmm.py
@@ -234,10 +234,10 @@ class _GMMBase(BaseEstimator):
             n_components=2, n_init=1, n_iter=100, params='wmc',
             random_state=None, tol=0.001, verbose=0)
     >>> np.round(g.weights_, 2)
-    array([ 0.75,  0.25])
+    array([0.75, 0.25])
     >>> np.round(g.means_, 2)
-    array([[ 10.05],
-           [  0.06]])
+    array([[10.05],
+           [ 0.06]])
     >>> np.round(g.covars_, 2) # doctest: +SKIP
     array([[[ 1.02]],
            [[ 0.96]]])
@@ -252,7 +252,7 @@ class _GMMBase(BaseEstimator):
             n_components=2, n_init=1, n_iter=100, params='wmc',
             random_state=None, tol=0.001, verbose=0)
     >>> np.round(g.weights_, 2)
-    array([ 0.5,  0.5])
+    array([0.5, 0.5])
 
     """
 
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 03bf0c92c8b07..ceddce37781ad 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -177,7 +177,7 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None,
     >>> sorted(cv_results.keys())                         # doctest: +ELLIPSIS
     ['fit_time', 'score_time', 'test_score']
     >>> cv_results['test_score']    # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    array([ 0.33...,  0.08...,  0.03...])
+    array([0.33150734, 0.08022311, 0.03531764])
 
     Multiple metric evaluation using ``cross_validate``
     (please refer the ``scoring`` parameter doc for more information)
@@ -187,7 +187,7 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None,
     >>> print(scores['test_neg_mean_squared_error'])      # doctest: +ELLIPSIS
     [-3635.5... -3573.3... -6114.7...]
     >>> print(scores['train_r2'])                         # doctest: +ELLIPSIS
-    [ 0.28...  0.39...  0.22...]
+    [0.28010158 0.39088426 0.22784852]
 
     See Also
     ---------
@@ -333,7 +333,7 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
     >>> y = diabetes.target[:150]
     >>> lasso = linear_model.Lasso()
     >>> print(cross_val_score(lasso, X, y))  # doctest: +ELLIPSIS
-    [ 0.33150734  0.08022311  0.03531764]
+    [0.33150734 0.08022311 0.03531764]
 
     See Also
     ---------
diff --git a/sklearn/neighbors/approximate.py b/sklearn/neighbors/approximate.py
index cf1ce4b104bd9..6a3fd571b3217 100644
--- a/sklearn/neighbors/approximate.py
+++ b/sklearn/neighbors/approximate.py
@@ -194,9 +194,9 @@ class LSHForest(BaseEstimator, KNeighborsMixin, RadiusNeighborsMixin):
                 random_state=42)
       >>> distances, indices = lshf.kneighbors(X_test, n_neighbors=2)
       >>> distances                                        # doctest: +ELLIPSIS
-      array([[ 0.069...,  0.149...],
-             [ 0.229...,  0.481...],
-             [ 0.004...,  0.014...]])
+      array([[0.069..., 0.149...],
+             [0.229..., 0.481...],
+             [0.004..., 0.014...]])
       >>> indices
       array([[1, 2],
              [2, 0],
diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py
index e390860d13463..196092c6e6fab 100644
--- a/sklearn/neighbors/base.py
+++ b/sklearn/neighbors/base.py
@@ -311,7 +311,7 @@ class from an array representing our data set and ask who's
         >>> neigh.fit(samples) # doctest: +ELLIPSIS
         NearestNeighbors(algorithm='auto', leaf_size=30, ...)
         >>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS
-        (array([[ 0.5]]), array([[2]]...))
+        (array([[0.5]]), array([[2]]))
 
         As you can see, it returns [[0.5]], and [[2]], which means that the
         element is at distance 0.5 and is the third element of samples
@@ -456,9 +456,9 @@ def kneighbors_graph(self, X=None, n_neighbors=None,
         NearestNeighbors(algorithm='auto', leaf_size=30, ...)
         >>> A = neigh.kneighbors_graph(X)
         >>> A.toarray()
-        array([[ 1.,  0.,  1.],
-               [ 0.,  1.,  1.],
-               [ 1.,  0.,  1.]])
+        array([[1., 0., 1.],
+               [0., 1., 1.],
+               [1., 0., 1.]])
 
         See also
         --------
@@ -552,7 +552,7 @@ class from an array representing our data set and ask who's
         NearestNeighbors(algorithm='auto', leaf_size=30, ...)
         >>> rng = neigh.radius_neighbors([[1., 1., 1.]])
         >>> print(np.asarray(rng[0][0])) # doctest: +ELLIPSIS
-        [ 1.5  0.5]
+        [1.5 0.5]
         >>> print(np.asarray(rng[1][0])) # doctest: +ELLIPSIS
         [1 2]
 
@@ -684,9 +684,9 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'):
         NearestNeighbors(algorithm='auto', leaf_size=30, ...)
         >>> A = neigh.radius_neighbors_graph(X)
         >>> A.toarray()
-        array([[ 1.,  0.,  1.],
-               [ 0.,  1.,  0.],
-               [ 1.,  0.,  1.]])
+        array([[1., 0., 1.],
+               [0., 1., 0.],
+               [1., 0., 1.]])
 
         See also
         --------
diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py
index f85d751c7d33c..39735c0e1480d 100644
--- a/sklearn/neighbors/classification.py
+++ b/sklearn/neighbors/classification.py
@@ -91,7 +91,7 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin,
     >>> print(neigh.predict([[1.1]]))
     [0]
     >>> print(neigh.predict_proba([[0.9]]))
-    [[ 0.66666667  0.33333333]]
+    [[0.66666667 0.33333333]]
 
     See also
     --------
diff --git a/sklearn/neighbors/graph.py b/sklearn/neighbors/graph.py
index 61a4561430cad..add4f241e7b4b 100644
--- a/sklearn/neighbors/graph.py
+++ b/sklearn/neighbors/graph.py
@@ -85,9 +85,9 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski',
     >>> from sklearn.neighbors import kneighbors_graph
     >>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)
     >>> A.toarray()
-    array([[ 1.,  0.,  1.],
-           [ 0.,  1.,  1.],
-           [ 1.,  0.,  1.]])
+    array([[1., 0., 1.],
+           [0., 1., 1.],
+           [1., 0., 1.]])
 
     See also
     --------
@@ -160,9 +160,9 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski',
     >>> from sklearn.neighbors import radius_neighbors_graph
     >>> A = radius_neighbors_graph(X, 1.5, mode='connectivity', include_self=True)
     >>> A.toarray()
-    array([[ 1.,  0.,  1.],
-           [ 0.,  1.,  0.],
-           [ 1.,  0.,  1.]])
+    array([[1., 0., 1.],
+           [0., 1., 0.],
+           [1., 0., 1.]])
 
     See also
     --------
diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py
index 26c9f641c0c7a..7181c44efbfca 100644
--- a/sklearn/neighbors/regression.py
+++ b/sklearn/neighbors/regression.py
@@ -96,7 +96,7 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin,
     >>> neigh.fit(X, y) # doctest: +ELLIPSIS
     KNeighborsRegressor(...)
     >>> print(neigh.predict([[1.5]]))
-    [ 0.5]
+    [0.5]
 
     See also
     --------
@@ -247,7 +247,7 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin,
     >>> neigh.fit(X, y) # doctest: +ELLIPSIS
     RadiusNeighborsRegressor(...)
     >>> print(neigh.predict([[1.5]]))
-    [ 0.5]
+    [0.5]
 
     See also
     --------
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index a898e2572fe5c..c25af3bea90ee 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -92,19 +92,19 @@ class Pipeline(_BaseComposition):
                     ('svc', SVC(...))])
     >>> prediction = anova_svm.predict(X)
     >>> anova_svm.score(X, y)                        # doctest: +ELLIPSIS
-    0.829...
+    0.83
     >>> # getting the selected features chosen by anova_filter
     >>> anova_svm.named_steps['anova'].get_support()
     ... # doctest: +NORMALIZE_WHITESPACE
     array([False, False,  True,  True, False, False, True,  True, False,
            True,  False,  True,  True, False, True,  False, True, True,
-           False, False], dtype=bool)
+           False, False])
     >>> # Another way to get selected features chosen by anova_filter
     >>> anova_svm.named_steps.anova.get_support()
     ... # doctest: +NORMALIZE_WHITESPACE
     array([False, False,  True,  True, False, False, True,  True, False,
            True,  False,  True,  True, False, True,  False, True, True,
-           False, False], dtype=bool)
+           False, False])
     """
 
     # BaseEstimator interface
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index fb680116971d3..e8a2e92299c3d 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -261,14 +261,14 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
     >>> print(scaler.fit(data))
     MinMaxScaler(copy=True, feature_range=(0, 1))
     >>> print(scaler.data_max_)
-    [  1.  18.]
+    [ 1. 18.]
     >>> print(scaler.transform(data))
-    [[ 0.    0.  ]
-     [ 0.25  0.25]
-     [ 0.5   0.5 ]
-     [ 1.    1.  ]]
+    [[0.   0.  ]
+     [0.25 0.25]
+     [0.5  0.5 ]
+     [1.   1.  ]]
     >>> print(scaler.transform([[2, 2]]))
-    [[ 1.5  0. ]]
+    [[1.5 0. ]]
 
     See also
     --------
@@ -540,14 +540,14 @@ class StandardScaler(BaseEstimator, TransformerMixin):
     >>> print(scaler.fit(data))
     StandardScaler(copy=True, with_mean=True, with_std=True)
     >>> print(scaler.mean_)
-    [ 0.5  0.5]
+    [0.5 0.5]
     >>> print(scaler.transform(data))
     [[-1. -1.]
      [-1. -1.]
      [ 1.  1.]
      [ 1.  1.]]
     >>> print(scaler.transform([[2, 2]]))
-    [[ 3.  3.]]
+    [[3. 3.]]
 
     See also
     --------
@@ -1233,14 +1233,14 @@ class PolynomialFeatures(BaseEstimator, TransformerMixin):
            [4, 5]])
     >>> poly = PolynomialFeatures(2)
     >>> poly.fit_transform(X)
-    array([[  1.,   0.,   1.,   0.,   0.,   1.],
-           [  1.,   2.,   3.,   4.,   6.,   9.],
-           [  1.,   4.,   5.,  16.,  20.,  25.]])
+    array([[ 1.,  0.,  1.,  0.,  0.,  1.],
+           [ 1.,  2.,  3.,  4.,  6.,  9.],
+           [ 1.,  4.,  5., 16., 20., 25.]])
     >>> poly = PolynomialFeatures(interaction_only=True)
     >>> poly.fit_transform(X)
-    array([[  1.,   0.,   1.,   0.],
-           [  1.,   2.,   3.,   6.],
-           [  1.,   4.,   5.,  20.]])
+    array([[ 1.,  0.,  1.,  0.],
+           [ 1.,  2.,  3.,  6.],
+           [ 1.,  4.,  5., 20.]])
 
     Attributes
     ----------
@@ -1786,8 +1786,8 @@ def add_dummy_feature(X, value=1.0):
 
     >>> from sklearn.preprocessing import add_dummy_feature
     >>> add_dummy_feature([[0, 1], [1, 0]])
-    array([[ 1.,  0.,  1.],
-           [ 1.,  1.,  0.]])
+    array([[1., 0., 1.],
+           [1., 1., 0.]])
     """
     X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], dtype=FLOAT_DTYPES)
     n_samples, n_features = X.shape
@@ -1954,7 +1954,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     >>> enc.feature_indices_
     array([0, 2, 5, 9])
     >>> enc.transform([[0, 1, 1]]).toarray()
-    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])
+    array([[1., 0., 0., 1., 0., 0., 1., 0., 0.]])
 
     See also
     --------
@@ -2939,8 +2939,8 @@ class CategoricalEncoder(BaseEstimator, TransformerMixin):
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
     >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
-    array([[ 1.,  0.,  1.,  0.,  0.],
-           [ 0.,  1.,  0.,  0.,  0.]])
+    array([[1., 0., 1., 0., 0.],
+           [0., 1., 0., 0., 0.]])
     >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
     array([['Male', 1],
            [None, 2]], dtype=object)
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py
index 9ed8c4d6a2db1..f6f8f9ed4b80c 100644
--- a/sklearn/svm/classes.py
+++ b/sklearn/svm/classes.py
@@ -123,9 +123,9 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
     >>> print(clf.coef_)
-    [[ 0.08551385  0.39414796  0.49847831  0.37513797]]
+    [[0.08551385 0.39414796 0.49847831 0.37513797]]
     >>> print(clf.intercept_)
-    [ 0.28418066]
+    [0.28418066]
     >>> print(clf.predict([[0, 0, 0, 0]]))
     [1]
 
@@ -333,7 +333,7 @@ class LinearSVR(LinearModel, RegressorMixin):
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=0, tol=0.0001, verbose=0)
     >>> print(regr.coef_)
-    [ 16.35750999  26.91499923  42.30652207  60.47843124]
+    [16.35750999 26.91499923 42.30652207 60.47843124]
     >>> print(regr.intercept_)
     [-4.29756543]
     >>> print(regr.predict([[0, 0, 0, 0]]))
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index f0a00d8b757e8..21dcca4f0764e 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -212,18 +212,18 @@ def resample(*arrays, **options):
       >>> from sklearn.utils import resample
       >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
       >>> X
-      array([[ 1.,  0.],
-             [ 2.,  1.],
-             [ 1.,  0.]])
+      array([[1., 0.],
+             [2., 1.],
+             [1., 0.]])
 
       >>> X_sparse                   # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
       <3x2 sparse matrix of type '<... 'numpy.float64'>'
           with 4 stored elements in Compressed Sparse Row format>
 
       >>> X_sparse.toarray()
-      array([[ 1.,  0.],
-             [ 2.,  1.],
-             [ 1.,  0.]])
+      array([[1., 0.],
+             [2., 1.],
+             [1., 0.]])
 
       >>> y
       array([0, 1, 0])
@@ -316,18 +316,18 @@ def shuffle(*arrays, **options):
       >>> from sklearn.utils import shuffle
       >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
       >>> X
-      array([[ 0.,  0.],
-             [ 2.,  1.],
-             [ 1.,  0.]])
+      array([[0., 0.],
+             [2., 1.],
+             [1., 0.]])
 
       >>> X_sparse                   # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
       <3x2 sparse matrix of type '<... 'numpy.float64'>'
           with 3 stored elements in Compressed Sparse Row format>
 
       >>> X_sparse.toarray()
-      array([[ 0.,  0.],
-             [ 2.,  1.],
-             [ 1.,  0.]])
+      array([[0., 0.],
+             [2., 1.],
+             [1., 0.]])
 
       >>> y
       array([2, 1, 0])
@@ -505,7 +505,7 @@ def indices_to_mask(indices, mask_length):
     >>> from sklearn.utils import indices_to_mask
     >>> indices = [1, 2 , 3, 4]
     >>> indices_to_mask(indices, 5)
-    array([False,  True,  True,  True,  True], dtype=bool)
+    array([False,  True,  True,  True,  True])
     """
     if mask_length <= np.max(indices):
         raise ValueError("mask_length must be greater than max(indices)")
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index e95ceb57497ae..c6c13c4f933a5 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -361,9 +361,9 @@ def logsumexp(arr, axis=0):
     >>> from sklearn.utils.extmath import logsumexp
     >>> a = np.arange(10)
     >>> np.log(np.sum(np.exp(a)))
-    9.4586297444267107
+    9.458...
     >>> logsumexp(a)
-    9.4586297444267107
+    9.458...
     """
     return scipy_logsumexp(arr, axis)
 
@@ -398,14 +398,14 @@ def weighted_mode(a, w, axis=0):
     >>> x = [4, 1, 4, 2, 4, 2]
     >>> weights = [1, 1, 1, 1, 1, 1]
     >>> weighted_mode(x, weights)
-    (array([ 4.]), array([ 3.]))
+    (array([4.]), array([3.]))
 
     The value 4 appears three times: with uniform weights, the result is
     simply the mode of the distribution.
 
     >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's
     >>> weighted_mode(x, weights)
-    (array([ 2.]), array([ 3.5]))
+    (array([2.]), array([3.5]))
 
     The value 2 has the highest score: it appears twice with weights of
     1.5 and 2: the sum of these is 3.