diff --git a/examples/applications/plot_cyclical_feature_engineering.py b/examples/applications/plot_cyclical_feature_engineering.py
index 21349cd5336db..aa3ef71821f4e 100644
--- a/examples/applications/plot_cyclical_feature_engineering.py
+++ b/examples/applications/plot_cyclical_feature_engineering.py
@@ -12,6 +12,7 @@
 `extrapolation="periodic"` option.
 
 """
+
 # %%
 # Data exploration on the Bike Sharing Demand dataset
 # ---------------------------------------------------
diff --git a/examples/applications/plot_digits_denoising.py b/examples/applications/plot_digits_denoising.py
index 004292cdbb762..9c36dc032ab64 100644
--- a/examples/applications/plot_digits_denoising.py
+++ b/examples/applications/plot_digits_denoising.py
@@ -18,9 +18,8 @@
       "Learning to find pre-images."
       Advances in neural information processing systems 16 (2004): 449-456.
       <https://papers.nips.cc/paper/2003/file/ac1ad983e08ad3304a97e147f522747e-Paper.pdf>`_
-"""
 
-print(__doc__)
+"""
 
 # Authors: Guillaume Lemaitre <guillaume.lemaitre@inria.fr>
 # Licence: BSD 3 clause
diff --git a/examples/applications/plot_face_recognition.py b/examples/applications/plot_face_recognition.py
index 7e5d05102fa0c..8a1764c68ff09 100644
--- a/examples/applications/plot_face_recognition.py
+++ b/examples/applications/plot_face_recognition.py
@@ -27,6 +27,7 @@
 ================== ============ ======= ========== =======
 
 """
+
 from time import time
 import logging
 import matplotlib.pyplot as plt
@@ -40,8 +41,6 @@
 from sklearn.svm import SVC
 
 
-print(__doc__)
-
 # Display progress logs on stdout
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
 
diff --git a/examples/applications/plot_model_complexity_influence.py b/examples/applications/plot_model_complexity_influence.py
index 241d9d4e33cca..b440c653762d2 100644
--- a/examples/applications/plot_model_complexity_influence.py
+++ b/examples/applications/plot_model_complexity_influence.py
@@ -32,8 +32,6 @@
 
 """
 
-print(__doc__)
-
 # Authors: Eustache Diemert <eustache@diemert.fr>
 #          Maria Telenczuk <https://github.com/maikia>
 #          Guillaume Lemaitre <g.lemaitre58@gmail.com>
diff --git a/examples/applications/plot_out_of_core_classification.py b/examples/applications/plot_out_of_core_classification.py
index 287188c35b807..05232ac8948cb 100644
--- a/examples/applications/plot_out_of_core_classification.py
+++ b/examples/applications/plot_out_of_core_classification.py
@@ -11,6 +11,7 @@
 HashingVectorizer that will project each example into the same feature space.
 This is especially useful in the case of text classification where new
 features (words) may appear in each batch.
+
 """
 
 # Authors: Eustache Diemert <eustache@diemert.fr>
diff --git a/examples/applications/plot_outlier_detection_wine.py b/examples/applications/plot_outlier_detection_wine.py
index 182f613d11eaa..45e4c64d9fcc4 100644
--- a/examples/applications/plot_outlier_detection_wine.py
+++ b/examples/applications/plot_outlier_detection_wine.py
@@ -33,7 +33,6 @@
 example, none of these could be applied that easily.
 
 """
-print(__doc__)
 
 # Author: Virgile Fritsch <virgile.fritsch@inria.fr>
 # License: BSD 3 clause
diff --git a/examples/applications/plot_species_distribution_modeling.py b/examples/applications/plot_species_distribution_modeling.py
index 6dac08fe1942c..666173415e7cc 100644
--- a/examples/applications/plot_species_distribution_modeling.py
+++ b/examples/applications/plot_species_distribution_modeling.py
@@ -33,6 +33,7 @@
    <http://rob.schapire.net/papers/ecolmod.pdf>`_
    S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
    190:231-259, 2006.
+
 """
 
 # Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
@@ -58,8 +59,6 @@
 except ImportError:
     basemap = False
 
-print(__doc__)
-
 
 def construct_grids(batch):
     """Construct the map grid from the batch object
diff --git a/examples/applications/plot_stock_market.py b/examples/applications/plot_stock_market.py
index 5116d8939de5d..a1bdf38cfeab2 100644
--- a/examples/applications/plot_stock_market.py
+++ b/examples/applications/plot_stock_market.py
@@ -58,6 +58,7 @@
 is to position the labels minimizing overlap. For this we use an
 heuristic based on the direction of the nearest neighbor along each
 axis.
+
 """
 
 # Author: Gael Varoquaux gael.varoquaux@normalesup.org
@@ -73,8 +74,6 @@
 
 from sklearn import cluster, covariance, manifold
 
-print(__doc__)
-
 
 # #############################################################################
 # Retrieve the data from Internet
diff --git a/examples/applications/plot_tomography_l1_reconstruction.py b/examples/applications/plot_tomography_l1_reconstruction.py
index 8e96dbff3dafb..1263d344861ae 100644
--- a/examples/applications/plot_tomography_l1_reconstruction.py
+++ b/examples/applications/plot_tomography_l1_reconstruction.py
@@ -33,9 +33,8 @@ class :class:`~sklearn.linear_model.Lasso`, that uses the coordinate descent
 reconstructed image, contrary to the L1 penalization. Note in particular
 the circular artifact separating the pixels in the corners, that have
 contributed to fewer projections than the central disk.
-"""
 
-print(__doc__)
+"""
 
 # Author: Emmanuelle Gouillart <emmanuelle.gouillart@nsup.org>
 # License: BSD 3 clause
diff --git a/examples/applications/svm_gui.py b/examples/applications/svm_gui.py
index f480b36ad1d94..c8019fa72ae91 100644
--- a/examples/applications/svm_gui.py
+++ b/examples/applications/svm_gui.py
@@ -14,8 +14,6 @@
 
 """
 
-print(__doc__)
-
 # Author: Peter Prettenhoer <peter.prettenhofer@gmail.com>
 #
 # License: BSD 3 clause
diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py
index 1d86076431ed8..6afba838ebe7a 100644
--- a/examples/applications/wikipedia_principal_eigenvector.py
+++ b/examples/applications/wikipedia_principal_eigenvector.py
@@ -27,6 +27,7 @@
 
 The graph data is fetched from the DBpedia dumps. DBpedia is an extraction
 of the latent structured data of the Wikipedia content.
+
 """
 
 # Author: Olivier Grisel <olivier.grisel@ensta.org>
@@ -46,8 +47,6 @@
 from urllib.request import urlopen
 
 
-print(__doc__)
-
 # #############################################################################
 # Where to download the data, if not already on disk
 redirects_url = "http://downloads.dbpedia.org/3.5.1/en/redirects_en.nt.bz2"
diff --git a/examples/bicluster/plot_bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py
index eb2c0c8dafb50..615a3d1495eb8 100644
--- a/examples/bicluster/plot_bicluster_newsgroups.py
+++ b/examples/bicluster/plot_bicluster_newsgroups.py
@@ -22,6 +22,7 @@
 achieve a better V-measure than clusters found by MiniBatchKMeans.
 
 """
+
 from collections import defaultdict
 import operator
 from time import time
@@ -34,8 +35,6 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.cluster import v_measure_score
 
-print(__doc__)
-
 
 def number_normalizer(tokens):
     """Map all numeric tokens to a placeholder.
diff --git a/examples/bicluster/plot_spectral_biclustering.py b/examples/bicluster/plot_spectral_biclustering.py
index 754853749a784..3a0af07815c02 100644
--- a/examples/bicluster/plot_spectral_biclustering.py
+++ b/examples/bicluster/plot_spectral_biclustering.py
@@ -15,7 +15,6 @@
 representation of the checkerboard structure.
 
 """
-print(__doc__)
 
 # Author: Kemal Eren <kemal@kemaleren.com>
 # License: BSD 3 clause
diff --git a/examples/bicluster/plot_spectral_coclustering.py b/examples/bicluster/plot_spectral_coclustering.py
index 26494bf4f1c4f..0df275e83e3bd 100644
--- a/examples/bicluster/plot_spectral_coclustering.py
+++ b/examples/bicluster/plot_spectral_coclustering.py
@@ -14,7 +14,6 @@
 the biclusters.
 
 """
-print(__doc__)
 
 # Author: Kemal Eren <kemal@kemaleren.com>
 # License: BSD 3 clause
diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py
index c1e1acea0c7c9..ea1871a7104d8 100644
--- a/examples/calibration/plot_calibration.py
+++ b/examples/calibration/plot_calibration.py
@@ -20,8 +20,8 @@
 to the expected 0.5 for most of the samples belonging to the middle
 cluster with heterogeneous labels. This results in a significantly improved
 Brier score.
+
 """
-print(__doc__)
 
 # Author: Mathieu Blondel <mathieu@mblondel.org>
 #         Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
diff --git a/examples/calibration/plot_calibration_curve.py b/examples/calibration/plot_calibration_curve.py
index c52cff1a858b0..e8925a3aec146 100644
--- a/examples/calibration/plot_calibration_curve.py
+++ b/examples/calibration/plot_calibration_curve.py
@@ -9,10 +9,10 @@
 visualize how well calibrated the predicted probabilities are using calibration
 curves, also known as reliability diagrams. Calibration of an uncalibrated
 classifier will also be demonstrated.
+
 """
-print(__doc__)
-# %%
 
+# %%
 # Author: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
 #         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 # License: BSD 3 clause.
diff --git a/examples/calibration/plot_calibration_multiclass.py b/examples/calibration/plot_calibration_multiclass.py
index a8713dfc1f849..8cd78c1d75288 100644
--- a/examples/calibration/plot_calibration_multiclass.py
+++ b/examples/calibration/plot_calibration_multiclass.py
@@ -10,6 +10,7 @@
 classifier to the probability vectors predicted by the same classifier after
 sigmoid calibration on a hold-out validation set. Colors indicate the true
 class of an instance (red: class 1, green: class 2, blue: class 3).
+
 """
 
 # %%
diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py
index f866e45e0ba2b..7e78dcfd2e80c 100644
--- a/examples/calibration/plot_compare_calibration.py
+++ b/examples/calibration/plot_compare_calibration.py
@@ -13,6 +13,7 @@
 models: :ref:`Logistic_regression`, :ref:`gaussian_naive_bayes`,
 :ref:`Random Forest Classifier <forest>` and :ref:`Linear SVM
 <svm_classification>`.
+
 """
 
 # %%
diff --git a/examples/classification/plot_classification_probability.py b/examples/classification/plot_classification_probability.py
index f88c749cb2b40..87c3f51db5eb2 100644
--- a/examples/classification/plot_classification_probability.py
+++ b/examples/classification/plot_classification_probability.py
@@ -14,8 +14,8 @@
 The logistic regression with One-Vs-Rest is not a multiclass classifier out of
 the box. As a result it has more trouble in separating class 2 and 3 than the
 other estimators.
+
 """
-print(__doc__)
 
 # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 # License: BSD 3 clause
diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py
index bb8fe4ce743f2..1c7112e5fa3d5 100644
--- a/examples/classification/plot_classifier_comparison.py
+++ b/examples/classification/plot_classifier_comparison.py
@@ -17,9 +17,8 @@
 The plots show training points in solid colors and testing points
 semi-transparent. The lower right shows the classification accuracy on the test
 set.
-"""
-print(__doc__)
 
+"""
 
 # Code source: Gaël Varoquaux
 #              Andreas Müller
diff --git a/examples/classification/plot_digits_classification.py b/examples/classification/plot_digits_classification.py
index 1c4f150c37374..385bc865cd48b 100644
--- a/examples/classification/plot_digits_classification.py
+++ b/examples/classification/plot_digits_classification.py
@@ -5,9 +5,8 @@
 
 This example shows how scikit-learn can be used to recognize images of
 hand-written digits, from 0-9.
-"""
 
-print(__doc__)
+"""
 
 # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
 # License: BSD 3 clause
diff --git a/examples/classification/plot_lda.py b/examples/classification/plot_lda.py
index 856f2e206e9c9..47487fc1f2caf 100644
--- a/examples/classification/plot_lda.py
+++ b/examples/classification/plot_lda.py
@@ -5,7 +5,9 @@
 
 This example illustrates how the Ledoit-Wolf and Oracle Shrinkage
 Approximating (OAS) estimators of covariance can improve classification.
+
 """
+
 import numpy as np
 import matplotlib.pyplot as plt
 
diff --git a/examples/classification/plot_lda_qda.py b/examples/classification/plot_lda_qda.py
index 951f6b8c4cd29..d157f80fdac2e 100644
--- a/examples/classification/plot_lda_qda.py
+++ b/examples/classification/plot_lda_qda.py
@@ -8,8 +8,8 @@
 the double standard deviation for each class. With LDA, the
 standard deviation is the same for all the classes, while each
 class has its own standard deviation with QDA.
+
 """
-print(__doc__)
 
 from scipy import linalg
 import numpy as np
diff --git a/examples/cluster/plot_adjusted_for_chance_measures.py b/examples/cluster/plot_adjusted_for_chance_measures.py
index 0b77144ef5256..7985eb70e6a0b 100644
--- a/examples/cluster/plot_adjusted_for_chance_measures.py
+++ b/examples/cluster/plot_adjusted_for_chance_measures.py
@@ -20,7 +20,6 @@
 value of k on various overlapping sub-samples of the dataset.
 
 """
-print(__doc__)
 
 # Author: Olivier Grisel <olivier.grisel@ensta.org>
 # License: BSD 3 clause
diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py
index 799d8d2d949b7..7763fa41434ad 100644
--- a/examples/cluster/plot_affinity_propagation.py
+++ b/examples/cluster/plot_affinity_propagation.py
@@ -8,7 +8,6 @@
 Between Data Points", Science Feb. 2007
 
 """
-print(__doc__)
 
 from sklearn.cluster import AffinityPropagation
 from sklearn import metrics
diff --git a/examples/cluster/plot_agglomerative_clustering.py b/examples/cluster/plot_agglomerative_clustering.py
index bc3adbf376bb6..9d590f572f121 100644
--- a/examples/cluster/plot_agglomerative_clustering.py
+++ b/examples/cluster/plot_agglomerative_clustering.py
@@ -20,7 +20,10 @@
 (try decreasing the number of neighbors in kneighbors_graph) and with
 complete linkage. In particular, having a very small number of neighbors in
 the graph, imposes a geometry that is close to that of single linkage,
-which is well known to have this percolation instability. """
+which is well known to have this percolation instability.
+
+"""
+
 # Authors: Gael Varoquaux, Nelle Varoquaux
 # License: BSD 3 clause
 
diff --git a/examples/cluster/plot_agglomerative_clustering_metrics.py b/examples/cluster/plot_agglomerative_clustering_metrics.py
index 4901403a4d54b..e022035ca2207 100644
--- a/examples/cluster/plot_agglomerative_clustering_metrics.py
+++ b/examples/cluster/plot_agglomerative_clustering_metrics.py
@@ -31,7 +31,9 @@
 distance, the separation is good and the waveform classes are recovered.
 Finally, the cosine distance does not separate at all waveform 1 and 2,
 thus the clustering puts them in the same cluster.
+
 """
+
 # Author: Gael Varoquaux
 # License: BSD 3-Clause or CC-0
 
diff --git a/examples/cluster/plot_agglomerative_dendrogram.py b/examples/cluster/plot_agglomerative_dendrogram.py
index 94ede3c2451c0..2de5030d68f6d 100644
--- a/examples/cluster/plot_agglomerative_dendrogram.py
+++ b/examples/cluster/plot_agglomerative_dendrogram.py
@@ -6,6 +6,7 @@
 =========================================
 This example plots the corresponding dendrogram of a hierarchical clustering
 using AgglomerativeClustering and the dendrogram method available in scipy.
+
 """
 
 import numpy as np
diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py b/examples/cluster/plot_birch_vs_minibatchkmeans.py
index 67c554c4469f2..c84d06458ac3c 100644
--- a/examples/cluster/plot_birch_vs_minibatchkmeans.py
+++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py
@@ -11,14 +11,13 @@
 samples to a set of 158 clusters. This can be viewed as a preprocessing
 step before the final (global) clustering step that further reduces these
 158 clusters to 100 clusters.
+
 """
 
 # Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com
 #          Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
 # License: BSD 3 clause
 
-print(__doc__)
-
 from itertools import cycle
 from time import time
 import numpy as np
diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py
index 43b9a7e333e45..802f3996bbe7e 100644
--- a/examples/cluster/plot_cluster_comparison.py
+++ b/examples/cluster/plot_cluster_comparison.py
@@ -21,8 +21,8 @@
 While these examples give some intuition about the
 algorithms, this intuition might not apply to very high
 dimensional data.
+
 """
-print(__doc__)
 
 import time
 import warnings
diff --git a/examples/cluster/plot_cluster_iris.py b/examples/cluster/plot_cluster_iris.py
index 5ca08990d1b6e..531a2b4e7d396 100644
--- a/examples/cluster/plot_cluster_iris.py
+++ b/examples/cluster/plot_cluster_iris.py
@@ -14,8 +14,6 @@
 and finally the ground truth.
 
 """
-print(__doc__)
-
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py
index 09cd8974e3eab..4d83d2bccf639 100644
--- a/examples/cluster/plot_coin_segmentation.py
+++ b/examples/cluster/plot_coin_segmentation.py
@@ -16,8 +16,8 @@
   using a kmeans algorithm
 * whereas 'discrete' will iteratively search for the closest partition
   space to the embedding space.
+
 """
-print(__doc__)
 
 # Author: Gael Varoquaux <gael.varoquaux@normalesup.org>, Brian Cheung
 # License: BSD 3 clause
diff --git a/examples/cluster/plot_coin_ward_segmentation.py b/examples/cluster/plot_coin_ward_segmentation.py
index b674700fb3726..87673794d4965 100644
--- a/examples/cluster/plot_coin_ward_segmentation.py
+++ b/examples/cluster/plot_coin_ward_segmentation.py
@@ -6,14 +6,13 @@
 Compute the segmentation of a 2D image with Ward hierarchical
 clustering. The clustering is spatially constrained in order
 for each segmented region to be in one piece.
+
 """
 
 # Author : Vincent Michel, 2010
 #          Alexandre Gramfort, 2011
 # License: BSD 3 clause
 
-print(__doc__)
-
 import time as time
 
 import numpy as np
diff --git a/examples/cluster/plot_color_quantization.py b/examples/cluster/plot_color_quantization.py
index 90b75a8cd1352..6fc6cdd4a449f 100644
--- a/examples/cluster/plot_color_quantization.py
+++ b/examples/cluster/plot_color_quantization.py
@@ -17,14 +17,15 @@
 
 For comparison, a quantized image using a random codebook (colors picked up
 randomly) is also shown.
+
 """
+
 # Authors: Robert Layton <robertlayton@gmail.com>
 #          Olivier Grisel <olivier.grisel@ensta.org>
 #          Mathieu Blondel <mathieu@mblondel.org>
 #
 # License: BSD 3 clause
 
-print(__doc__)
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.cluster import KMeans
diff --git a/examples/cluster/plot_dbscan.py b/examples/cluster/plot_dbscan.py
index 29d5f1b768210..f4293941569ce 100644
--- a/examples/cluster/plot_dbscan.py
+++ b/examples/cluster/plot_dbscan.py
@@ -7,7 +7,6 @@
 Finds core samples of high density and expands clusters from them.
 
 """
-print(__doc__)
 
 import numpy as np
 
diff --git a/examples/cluster/plot_dict_face_patches.py b/examples/cluster/plot_dict_face_patches.py
index 242774550d063..c7e4627a6bdea 100644
--- a/examples/cluster/plot_dict_face_patches.py
+++ b/examples/cluster/plot_dict_face_patches.py
@@ -18,8 +18,8 @@
 partial-fit. This is because the number of patches that they represent
 has become too low, and it is better to choose a random new
 cluster.
+
 """
-print(__doc__)
 
 import time
 
diff --git a/examples/cluster/plot_digits_agglomeration.py b/examples/cluster/plot_digits_agglomeration.py
index a45190dbe6ac7..18288da252024 100644
--- a/examples/cluster/plot_digits_agglomeration.py
+++ b/examples/cluster/plot_digits_agglomeration.py
@@ -6,8 +6,8 @@
 
 These images how similar features are merged together using
 feature agglomeration.
+
 """
-print(__doc__)
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py
index c5d78e362fb38..925f5c122d73f 100644
--- a/examples/cluster/plot_digits_linkage.py
+++ b/examples/cluster/plot_digits_linkage.py
@@ -16,12 +16,12 @@
 that ends up with a couple of singleton clusters, while in the case
 of single linkage we get a single central cluster with all other clusters
 being drawn from noise points around the fringes.
+
 """
 
 # Authors: Gael Varoquaux
 # License: BSD 3 clause (C) INRIA 2014
 
-print(__doc__)
 from time import time
 
 import numpy as np
diff --git a/examples/cluster/plot_face_compress.py b/examples/cluster/plot_face_compress.py
index df88fd6ccdb07..700d862ec6001 100644
--- a/examples/cluster/plot_face_compress.py
+++ b/examples/cluster/plot_face_compress.py
@@ -9,8 +9,6 @@
 used for vector quantization.
 
 """
-print(__doc__)
-
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
diff --git a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
index afb31751ce7a5..1ce5dd5d4b2e6 100644
--- a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
+++ b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
@@ -11,13 +11,12 @@
 
 Both methods are compared in a regression problem using
 a BayesianRidge as supervised estimator.
+
 """
 
 # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 # License: BSD 3 clause
 
-print(__doc__)
-
 import shutil
 import tempfile
 
diff --git a/examples/cluster/plot_inductive_clustering.py b/examples/cluster/plot_inductive_clustering.py
index 68fdf020fe734..fd8adb8df189e 100644
--- a/examples/cluster/plot_inductive_clustering.py
+++ b/examples/cluster/plot_inductive_clustering.py
@@ -17,10 +17,11 @@
 
 This example illustrates a generic implementation of a meta-estimator which
 extends clustering by inducing a classifier from the cluster labels.
+
 """
+
 # Authors: Chirag Nagpal
 #          Christos Aridas
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py
index 4360f89faee3c..94f8ff6c58f52 100644
--- a/examples/cluster/plot_kmeans_assumptions.py
+++ b/examples/cluster/plot_kmeans_assumptions.py
@@ -8,8 +8,8 @@
 input data does not conform to some implicit assumption that k-means makes and
 undesirable clusters are produced as a result. In the last plot, k-means
 returns intuitive clusters despite unevenly sized blobs.
+
 """
-print(__doc__)
 
 # Author: Phil Roth <mr.phil.roth@gmail.com>
 # License: BSD 3 clause
diff --git a/examples/cluster/plot_kmeans_digits.py b/examples/cluster/plot_kmeans_digits.py
index 8190c6e2792e0..fc79c867a8589 100644
--- a/examples/cluster/plot_kmeans_digits.py
+++ b/examples/cluster/plot_kmeans_digits.py
@@ -22,8 +22,8 @@
 AMI          adjusted mutual information
 silhouette   silhouette coefficient
 =========== ========================================================
+
 """
-print(__doc__)
 
 # %%
 # Load the dataset
diff --git a/examples/cluster/plot_kmeans_plusplus.py b/examples/cluster/plot_kmeans_plusplus.py
index 8afbb62dfdda4..eea2c2ec85093 100644
--- a/examples/cluster/plot_kmeans_plusplus.py
+++ b/examples/cluster/plot_kmeans_plusplus.py
@@ -9,7 +9,6 @@
 K-Means++ is used as the default initialization for :ref:`k_means`.
 
 """
-print(__doc__)
 
 from sklearn.cluster import kmeans_plusplus
 from sklearn.datasets import make_blobs
diff --git a/examples/cluster/plot_kmeans_silhouette_analysis.py b/examples/cluster/plot_kmeans_silhouette_analysis.py
index 63efe0c406fd8..8f4e241100e24 100644
--- a/examples/cluster/plot_kmeans_silhouette_analysis.py
+++ b/examples/cluster/plot_kmeans_silhouette_analysis.py
@@ -28,7 +28,9 @@
 cluster. However when the ``n_clusters`` is equal to 4, all the plots are more
 or less of similar thickness and hence are of similar sizes as can be also
 verified from the labelled scatter plot on the right.
+
 """
+
 from sklearn.datasets import make_blobs
 from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_samples, silhouette_score
@@ -37,8 +39,6 @@
 import matplotlib.cm as cm
 import numpy as np
 
-print(__doc__)
-
 # Generating the sample data from make_blobs
 # This particular setting has one distinct cluster and 3 clusters placed close
 # together.
diff --git a/examples/cluster/plot_kmeans_stability_low_dim_dense.py b/examples/cluster/plot_kmeans_stability_low_dim_dense.py
index dd9f32d01485d..f9e9f40c6bdc3 100644
--- a/examples/cluster/plot_kmeans_stability_low_dim_dense.py
+++ b/examples/cluster/plot_kmeans_stability_low_dim_dense.py
@@ -20,8 +20,8 @@
 
 The dataset used for evaluation is a 2D grid of isotropic Gaussian
 clusters widely spaced.
+
 """
-print(__doc__)
 
 # Author: Olivier Grisel <olivier.grisel@ensta.org>
 # License: BSD 3 clause
diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py
index 0a4667855f2ec..af4c3cd2894af 100644
--- a/examples/cluster/plot_linkage_comparison.py
+++ b/examples/cluster/plot_linkage_comparison.py
@@ -20,8 +20,8 @@
 While these examples give some intuition about the
 algorithms, this intuition might not apply to very high
 dimensional data.
+
 """
-print(__doc__)
 
 import time
 import warnings
diff --git a/examples/cluster/plot_mean_shift.py b/examples/cluster/plot_mean_shift.py
index 53e052ed71d36..ae6d46a68dac1 100644
--- a/examples/cluster/plot_mean_shift.py
+++ b/examples/cluster/plot_mean_shift.py
@@ -10,7 +10,6 @@
 Machine Intelligence. 2002. pp. 603-619.
 
 """
-print(__doc__)
 
 import numpy as np
 from sklearn.cluster import MeanShift, estimate_bandwidth
diff --git a/examples/cluster/plot_mini_batch_kmeans.py b/examples/cluster/plot_mini_batch_kmeans.py
index 99b5311c897d0..e97cd7146ff27 100644
--- a/examples/cluster/plot_mini_batch_kmeans.py
+++ b/examples/cluster/plot_mini_batch_kmeans.py
@@ -11,8 +11,8 @@
 MiniBatchKMeans, and plot the results.
 We will also plot the points that are labelled differently between the two
 algorithms.
+
 """
-print(__doc__)
 
 import time
 
diff --git a/examples/cluster/plot_optics.py b/examples/cluster/plot_optics.py
index 4ca81be91de1b..5956a2d47afa5 100644
--- a/examples/cluster/plot_optics.py
+++ b/examples/cluster/plot_optics.py
@@ -13,13 +13,13 @@
 corresponds to :class:`~cluster.DBSCAN`. We can see that the different
 clusters of OPTICS's Xi method can be recovered with different choices of
 thresholds in DBSCAN.
+
 """
 
 # Authors: Shane Grigsby <refuge@rocktalus.com>
 #          Adrin Jalali <adrin.jalali@gmail.com>
 # License: BSD 3 clause
 
-
 from sklearn.cluster import OPTICS, cluster_optics_dbscan
 import matplotlib.gridspec as gridspec
 import matplotlib.pyplot as plt
diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py
index f4709358b63a4..9d153bdd030fa 100644
--- a/examples/cluster/plot_segmentation_toy.py
+++ b/examples/cluster/plot_segmentation_toy.py
@@ -23,8 +23,8 @@
 In addition, we use the mask of the objects to restrict the graph to the
 outline of the objects. In this example, we are interested in
 separating the objects one from the other, and not from the background.
+
 """
-print(__doc__)
 
 # Authors:  Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
 #           Gael Varoquaux <gael.varoquaux@normalesup.org>
diff --git a/examples/cluster/plot_ward_structured_vs_unstructured.py b/examples/cluster/plot_ward_structured_vs_unstructured.py
index 75a2aecb6fd3b..c0dd6f074bc4b 100644
--- a/examples/cluster/plot_ward_structured_vs_unstructured.py
+++ b/examples/cluster/plot_ward_structured_vs_unstructured.py
@@ -17,6 +17,7 @@
 respect the structure of the swiss roll and extend across different folds of
 the manifolds. On the opposite, when opposing connectivity constraints,
 the clusters form a nice parcellation of the swiss roll.
+
 """
 
 # Authors : Vincent Michel, 2010
@@ -24,8 +25,6 @@
 #           Gael Varoquaux, 2010
 # License: BSD 3 clause
 
-print(__doc__)
-
 import time as time
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/compose/plot_column_transformer.py b/examples/compose/plot_column_transformer.py
index 81fd448b56d18..d4798d828b321 100644
--- a/examples/compose/plot_column_transformer.py
+++ b/examples/compose/plot_column_transformer.py
@@ -15,6 +15,7 @@
 :class:`~sklearn.compose.ColumnTransformer` on a dataset containing
 different types of features. The choice of features is not particularly
 helpful, but serves to illustrate the technique.
+
 """
 
 # Author: Matt Terry <matt.terry@gmail.com>
diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py
index f2ee04e888773..307f0384ab1d2 100644
--- a/examples/compose/plot_column_transformer_mixed_types.py
+++ b/examples/compose/plot_column_transformer_mixed_types.py
@@ -21,6 +21,7 @@
 Finally, the preprocessing pipeline is integrated in a full prediction pipeline
 using :class:`~pipeline.Pipeline`, together with a simple classification
 model.
+
 """
 
 # Author: Pedro Morales <part.morales@gmail.com>
diff --git a/examples/compose/plot_compare_reduction.py b/examples/compose/plot_compare_reduction.py
index 036d29fbf62aa..cd2c65021c4d4 100644
--- a/examples/compose/plot_compare_reduction.py
+++ b/examples/compose/plot_compare_reduction.py
@@ -18,6 +18,7 @@
 
 Note that the use of ``memory`` to enable caching becomes interesting when the
 fitting of a transformer is costly.
+
 """
 
 # %%
@@ -35,8 +36,6 @@
 from sklearn.decomposition import PCA, NMF
 from sklearn.feature_selection import SelectKBest, chi2
 
-print(__doc__)
-
 pipe = Pipeline(
     [
         # the reduce_dim stage is populated by the param_grid
diff --git a/examples/compose/plot_digits_pipe.py b/examples/compose/plot_digits_pipe.py
index 48d580bb02a03..8c2c395de4250 100644
--- a/examples/compose/plot_digits_pipe.py
+++ b/examples/compose/plot_digits_pipe.py
@@ -10,14 +10,11 @@
 We use a GridSearchCV to set the dimensionality of the PCA
 
 """
-print(__doc__)
-
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
 # License: BSD 3 clause
 
-
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
diff --git a/examples/compose/plot_feature_union.py b/examples/compose/plot_feature_union.py
index 09e6e6919ab9a..e014b8b8808b9 100644
--- a/examples/compose/plot_feature_union.py
+++ b/examples/compose/plot_feature_union.py
@@ -13,6 +13,7 @@
 
 The combination used in this example is not particularly helpful on this
 dataset and is only used to illustrate the usage of FeatureUnion.
+
 """
 
 # Author: Andreas Mueller <amueller@ais.uni-bonn.de>
diff --git a/examples/compose/plot_transformed_target.py b/examples/compose/plot_transformed_target.py
index 03ae17da53dd3..263f8e41c5bbe 100644
--- a/examples/compose/plot_transformed_target.py
+++ b/examples/compose/plot_transformed_target.py
@@ -9,6 +9,7 @@
 to illustrate the benefit of transforming the targets before learning a linear
 regression model. The first example uses synthetic data while the second
 example is based on the Ames housing data set.
+
 """
 
 # Author: Guillaume Lemaitre <guillaume.lemaitre@inria.fr>
diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py
index eae43b3c7d4d3..5912f7527fa02 100644
--- a/examples/covariance/plot_covariance_estimation.py
+++ b/examples/covariance/plot_covariance_estimation.py
@@ -41,7 +41,6 @@
 computationally costly.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/covariance/plot_lw_vs_oas.py b/examples/covariance/plot_lw_vs_oas.py
index abd451da3875a..1fd84b180f50a 100644
--- a/examples/covariance/plot_lw_vs_oas.py
+++ b/examples/covariance/plot_lw_vs_oas.py
@@ -20,7 +20,6 @@
 Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/covariance/plot_robust_vs_empirical_covariance.py b/examples/covariance/plot_robust_vs_empirical_covariance.py
index a4fb6ef8941ed..054ff471fa7fb 100644
--- a/examples/covariance/plot_robust_vs_empirical_covariance.py
+++ b/examples/covariance/plot_robust_vs_empirical_covariance.py
@@ -52,7 +52,6 @@
     Statistical Ass., 79:871, 1984.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/covariance/plot_sparse_cov.py b/examples/covariance/plot_sparse_cov.py
index ddb8de244a3b9..8670ed4e06195 100644
--- a/examples/covariance/plot_sparse_cov.py
+++ b/examples/covariance/plot_sparse_cov.py
@@ -47,8 +47,9 @@
 set by internal cross-validation in the GraphicalLassoCV. As can be
 seen on figure 2, the grid to compute the cross-validation score is
 iteratively refined in the neighborhood of the maximum.
+
 """
-print(__doc__)
+
 # author: Gael Varoquaux <gael.varoquaux@inria.fr>
 # License: BSD 3 clause
 # Copyright: INRIA
diff --git a/examples/cross_decomposition/plot_compare_cross_decomposition.py b/examples/cross_decomposition/plot_compare_cross_decomposition.py
index 21b735e401711..deccd7aa1932c 100644
--- a/examples/cross_decomposition/plot_compare_cross_decomposition.py
+++ b/examples/cross_decomposition/plot_compare_cross_decomposition.py
@@ -17,8 +17,8 @@
 first diagonal). This is also true for components 2 in both dataset,
 however, the correlation across datasets for different components is
 weak: the point cloud is very spherical.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/cross_decomposition/plot_pcr_vs_pls.py b/examples/cross_decomposition/plot_pcr_vs_pls.py
index 09633e988c1f1..529225d11eead 100644
--- a/examples/cross_decomposition/plot_pcr_vs_pls.py
+++ b/examples/cross_decomposition/plot_pcr_vs_pls.py
@@ -30,9 +30,8 @@
 linear regressor to the transformed data. The main difference with PCR is
 that the PLS transformation is supervised. Therefore, as we will see in this
 example, it does not suffer from the issue we just mentioned.
-"""
 
-print(__doc__)
+"""
 
 # %%
 # The data
diff --git a/examples/datasets/plot_digits_last_image.py b/examples/datasets/plot_digits_last_image.py
index 619ea093178aa..a15e9d140ad45 100644
--- a/examples/datasets/plot_digits_last_image.py
+++ b/examples/datasets/plot_digits_last_image.py
@@ -12,9 +12,8 @@
 See `here
 <https://archive.ics.uci.edu/ml/datasets/Pen-Based+Recognition+of+Handwritten+Digits>`_
 for more information about this dataset.
-"""
-print(__doc__)
 
+"""
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
diff --git a/examples/datasets/plot_iris_dataset.py b/examples/datasets/plot_iris_dataset.py
index abea931849e9a..fa33e76e7ed7b 100644
--- a/examples/datasets/plot_iris_dataset.py
+++ b/examples/datasets/plot_iris_dataset.py
@@ -13,9 +13,8 @@
 The below plot uses the first two features.
 See `here <https://en.wikipedia.org/wiki/Iris_flower_data_set>`_ for more
 information on this dataset.
-"""
-print(__doc__)
 
+"""
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
diff --git a/examples/datasets/plot_random_dataset.py b/examples/datasets/plot_random_dataset.py
index 7a94eaa5550f7..4f3fdbbb11ef5 100644
--- a/examples/datasets/plot_random_dataset.py
+++ b/examples/datasets/plot_random_dataset.py
@@ -11,9 +11,8 @@
 different numbers of informative features, clusters per class and classes.
 The final 2 plots use :func:`~sklearn.datasets.make_blobs` and
 :func:`~sklearn.datasets.make_gaussian_quantiles`.
-"""
 
-print(__doc__)
+"""
 
 import matplotlib.pyplot as plt
 
diff --git a/examples/datasets/plot_random_multilabel_dataset.py b/examples/datasets/plot_random_multilabel_dataset.py
index a7ceba36e30db..368300b80166b 100644
--- a/examples/datasets/plot_random_multilabel_dataset.py
+++ b/examples/datasets/plot_random_multilabel_dataset.py
@@ -32,6 +32,7 @@
 "document length", while here we have much larger documents than vocabulary.
 Similarly, with ``n_classes > n_features``, it is much less likely that a
 feature distinguishes a particular class.
+
 """
 
 import numpy as np
@@ -39,8 +40,6 @@
 
 from sklearn.datasets import make_multilabel_classification as make_ml_clf
 
-print(__doc__)
-
 COLORS = np.array(
     [
         "!",
diff --git a/examples/decomposition/plot_beta_divergence.py b/examples/decomposition/plot_beta_divergence.py
index 41c908e273c72..2a69f9a22ffb4 100644
--- a/examples/decomposition/plot_beta_divergence.py
+++ b/examples/decomposition/plot_beta_divergence.py
@@ -5,13 +5,13 @@
 
 A plot that compares the various Beta-divergence loss functions supported by
 the Multiplicative-Update ('mu') solver in :class:`~sklearn.decomposition.NMF`.
+
 """
+
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.decomposition._nmf import _beta_divergence
 
-print(__doc__)
-
 x = np.linspace(0.001, 4, 1000)
 y = np.zeros(x.shape)
 
diff --git a/examples/decomposition/plot_faces_decomposition.py b/examples/decomposition/plot_faces_decomposition.py
index 7c873e867aa8b..ec6c4ebad226f 100644
--- a/examples/decomposition/plot_faces_decomposition.py
+++ b/examples/decomposition/plot_faces_decomposition.py
@@ -9,7 +9,6 @@
 :ref:`decompositions`) .
 
 """
-print(__doc__)
 
 # Authors: Vlad Niculae, Alexandre Gramfort
 # License: BSD 3 clause
diff --git a/examples/decomposition/plot_ica_blind_source_separation.py b/examples/decomposition/plot_ica_blind_source_separation.py
index bbb9ac1c897e6..15945e5075ce8 100644
--- a/examples/decomposition/plot_ica_blind_source_separation.py
+++ b/examples/decomposition/plot_ica_blind_source_separation.py
@@ -13,7 +13,6 @@
 non-Gaussian processes.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/decomposition/plot_ica_vs_pca.py b/examples/decomposition/plot_ica_vs_pca.py
index 769fe47a028f4..f02a857f19774 100644
--- a/examples/decomposition/plot_ica_vs_pca.py
+++ b/examples/decomposition/plot_ica_vs_pca.py
@@ -26,8 +26,8 @@
 after whitening by the variance corresponding to the PCA vectors (lower
 left). Running ICA corresponds to finding a rotation in this space to
 identify the directions of largest non-Gaussianity (lower right).
+
 """
-print(__doc__)
 
 # Authors: Alexandre Gramfort, Gael Varoquaux
 # License: BSD 3 clause
diff --git a/examples/decomposition/plot_image_denoising.py b/examples/decomposition/plot_image_denoising.py
index df08e052d3ec1..ab8039c0c4355 100644
--- a/examples/decomposition/plot_image_denoising.py
+++ b/examples/decomposition/plot_image_denoising.py
@@ -31,7 +31,6 @@
 necessarily related to visualisation.
 
 """
-print(__doc__)
 
 from time import time
 
diff --git a/examples/decomposition/plot_incremental_pca.py b/examples/decomposition/plot_incremental_pca.py
index 88cd6a679c479..adc7f83f3cda0 100644
--- a/examples/decomposition/plot_incremental_pca.py
+++ b/examples/decomposition/plot_incremental_pca.py
@@ -18,7 +18,6 @@
 incremental approaches.
 
 """
-print(__doc__)
 
 # Authors: Kyle Kastner
 # License: BSD 3 clause
diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py
index 8a9ad066cb181..3ec0958a9e602 100644
--- a/examples/decomposition/plot_kernel_pca.py
+++ b/examples/decomposition/plot_kernel_pca.py
@@ -5,8 +5,8 @@
 
 This example shows that Kernel PCA is able to find a projection of the data
 that makes data linearly separable.
+
 """
-print(__doc__)
 
 # Authors: Mathieu Blondel
 #          Andreas Mueller
diff --git a/examples/decomposition/plot_pca_3d.py b/examples/decomposition/plot_pca_3d.py
index 5a8094e624b86..27f06bca389a5 100644
--- a/examples/decomposition/plot_pca_3d.py
+++ b/examples/decomposition/plot_pca_3d.py
@@ -9,7 +9,6 @@
 comes in to choose a direction that is not flat.
 
 """
-print(__doc__)
 
 # Authors: Gael Varoquaux
 #          Jaques Grobler
diff --git a/examples/decomposition/plot_pca_iris.py b/examples/decomposition/plot_pca_iris.py
index c7ee5e321dfdc..7b7ce6e68c34a 100644
--- a/examples/decomposition/plot_pca_iris.py
+++ b/examples/decomposition/plot_pca_iris.py
@@ -10,8 +10,6 @@
 information on this dataset.
 
 """
-print(__doc__)
-
 
 # Code source: Gaël Varoquaux
 # License: BSD 3 clause
diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py
index 65c04838f8796..e7802476ef341 100644
--- a/examples/decomposition/plot_pca_vs_fa_model_selection.py
+++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py
@@ -37,8 +37,6 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.model_selection import GridSearchCV
 
-print(__doc__)
-
 # #############################################################################
 # Create the data
 
diff --git a/examples/decomposition/plot_pca_vs_lda.py b/examples/decomposition/plot_pca_vs_lda.py
index f9abf21b3ad0e..e88a0aff7882f 100644
--- a/examples/decomposition/plot_pca_vs_lda.py
+++ b/examples/decomposition/plot_pca_vs_lda.py
@@ -15,8 +15,8 @@
 Linear Discriminant Analysis (LDA) tries to identify attributes that
 account for the most variance *between classes*. In particular,
 LDA, in contrast to PCA, is a supervised method, using known class labels.
+
 """
-print(__doc__)
 
 import matplotlib.pyplot as plt
 
diff --git a/examples/decomposition/plot_sparse_coding.py b/examples/decomposition/plot_sparse_coding.py
index 681b1ca1942c0..f5a014096fb56 100644
--- a/examples/decomposition/plot_sparse_coding.py
+++ b/examples/decomposition/plot_sparse_coding.py
@@ -13,8 +13,8 @@
 
 The richer dictionary on the right is not larger in size, heavier subsampling
 is performed in order to stay on the same order of magnitude.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/decomposition/plot_varimax_fa.py b/examples/decomposition/plot_varimax_fa.py
index 82644595daf94..0103ccc65b5d6 100644
--- a/examples/decomposition/plot_varimax_fa.py
+++ b/examples/decomposition/plot_varimax_fa.py
@@ -13,6 +13,7 @@
 is found by maximizing the squared variances of the weights,
 finds a structure where the second component only loads
 positively on sepal width.
+
 """
 
 # Authors: Jona Sassenhagen
@@ -25,8 +26,6 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import load_iris
 
-print(__doc__)
-
 # %%
 # Load Iris data
 data = load_iris()
diff --git a/examples/ensemble/plot_adaboost_hastie_10_2.py b/examples/ensemble/plot_adaboost_hastie_10_2.py
index a4bf4d3875ed2..d844a80bd95d5 100644
--- a/examples/ensemble/plot_adaboost_hastie_10_2.py
+++ b/examples/ensemble/plot_adaboost_hastie_10_2.py
@@ -18,7 +18,6 @@
 .. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
 
 """
-print(__doc__)
 
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>,
 #         Noel Dawe <noel.dawe@gmail.com>
diff --git a/examples/ensemble/plot_adaboost_multiclass.py b/examples/ensemble/plot_adaboost_multiclass.py
index af28b3fe940bf..03b00fc010646 100644
--- a/examples/ensemble/plot_adaboost_multiclass.py
+++ b/examples/ensemble/plot_adaboost_multiclass.py
@@ -23,7 +23,6 @@
 .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
 
 """
-print(__doc__)
 
 # Author: Noel Dawe <noel.dawe@gmail.com>
 #
diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index 0c3f01299b06e..e60051d1c25a0 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -12,7 +12,6 @@
 .. [1] H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
 
 """
-print(__doc__)
 
 # Author: Noel Dawe <noel.dawe@gmail.com>
 #
diff --git a/examples/ensemble/plot_adaboost_twoclass.py b/examples/ensemble/plot_adaboost_twoclass.py
index d22f14cf7c8c9..38e3e95ae96ef 100644
--- a/examples/ensemble/plot_adaboost_twoclass.py
+++ b/examples/ensemble/plot_adaboost_twoclass.py
@@ -16,7 +16,6 @@
 with a decision score above some value.
 
 """
-print(__doc__)
 
 # Author: Noel Dawe <noel.dawe@gmail.com>
 #
diff --git a/examples/ensemble/plot_bias_variance.py b/examples/ensemble/plot_bias_variance.py
index f78a200a41c83..0d80bb5a558c9 100644
--- a/examples/ensemble/plot_bias_variance.py
+++ b/examples/ensemble/plot_bias_variance.py
@@ -61,7 +61,6 @@
        "Elements of Statistical Learning", Springer, 2009.
 
 """
-print(__doc__)
 
 # Author: Gilles Louppe <g.louppe@gmail.com>
 # License: BSD 3 clause
diff --git a/examples/ensemble/plot_ensemble_oob.py b/examples/ensemble/plot_ensemble_oob.py
index 5b798eece8667..8c3a0e2b81875 100644
--- a/examples/ensemble/plot_ensemble_oob.py
+++ b/examples/ensemble/plot_ensemble_oob.py
@@ -19,11 +19,6 @@
        Learning Ed. 2", p592-593, Springer, 2009.
 
 """
-import matplotlib.pyplot as plt
-
-from collections import OrderedDict
-from sklearn.datasets import make_classification
-from sklearn.ensemble import RandomForestClassifier
 
 # Author: Kian Ho <hui.kian.ho@gmail.com>
 #         Gilles Louppe <g.louppe@gmail.com>
@@ -31,7 +26,11 @@
 #
 # License: BSD 3 Clause
 
-print(__doc__)
+import matplotlib.pyplot as plt
+
+from collections import OrderedDict
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
 
 RANDOM_STATE = 123
 
diff --git a/examples/ensemble/plot_feature_transformation.py b/examples/ensemble/plot_feature_transformation.py
index 63bb53794b82e..53c5d9b1a233c 100644
--- a/examples/ensemble/plot_feature_transformation.py
+++ b/examples/ensemble/plot_feature_transformation.py
@@ -17,14 +17,14 @@
 
 The resulting transformer has then learned a supervised, sparse,
 high-dimensional categorical embedding of the data.
+
 """
 
+
 # Author: Tim Head <betatim@gmail.com>
 #
 # License: BSD 3 clause
 
-print(__doc__)
-
 from sklearn import set_config
 
 set_config(display="diagram")
diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py
index d1f67ec451a18..fbda63b26faee 100644
--- a/examples/ensemble/plot_forest_importances.py
+++ b/examples/ensemble/plot_forest_importances.py
@@ -10,8 +10,9 @@
 
 As expected, the plot suggests that 3 features are informative, while the
 remaining are not.
+
 """
-print(__doc__)
+
 import matplotlib.pyplot as plt
 
 # %%
diff --git a/examples/ensemble/plot_forest_importances_faces.py b/examples/ensemble/plot_forest_importances_faces.py
index bc23030bbdf99..3848873c297de 100644
--- a/examples/ensemble/plot_forest_importances_faces.py
+++ b/examples/ensemble/plot_forest_importances_faces.py
@@ -9,9 +9,8 @@
 
 The code below also illustrates how the construction and the computation
 of the predictions can be parallelized within multiple jobs.
+
 """
-# %%
-print(__doc__)
 
 # %%
 # Loading the data and model fitting
diff --git a/examples/ensemble/plot_forest_iris.py b/examples/ensemble/plot_forest_iris.py
index b2e95ef2ecc81..ee414db7125dc 100644
--- a/examples/ensemble/plot_forest_iris.py
+++ b/examples/ensemble/plot_forest_iris.py
@@ -39,8 +39,8 @@
 It is worth noting that RandomForests and ExtraTrees can be fitted in parallel
 on many cores as each tree is built independently of the others. AdaBoost's
 samples are built sequentially and so do not use multiple cores.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py
index 3bb406a0ffe86..32acbcc2b9ec8 100644
--- a/examples/ensemble/plot_gradient_boosting_categorical.py
+++ b/examples/ensemble/plot_gradient_boosting_categorical.py
@@ -20,8 +20,8 @@
 
 We will work with the Ames Lowa Housing dataset which consists of numerical
 and categorical features, where the houses' sales prices is the target.
+
 """
-print(__doc__)
 
 # %%
 # Load Ames Housing dataset
diff --git a/examples/ensemble/plot_gradient_boosting_early_stopping.py b/examples/ensemble/plot_gradient_boosting_early_stopping.py
index cc4408b6dc255..4489a5742a890 100644
--- a/examples/ensemble/plot_gradient_boosting_early_stopping.py
+++ b/examples/ensemble/plot_gradient_boosting_early_stopping.py
@@ -29,6 +29,7 @@
 almost the same accuracy as compared to a model built without early stopping
 using many fewer estimators. This can significantly reduce training time,
 memory usage and prediction latency.
+
 """
 
 # Authors: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>
@@ -44,8 +45,6 @@
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 
-print(__doc__)
-
 data_list = [datasets.load_iris(), datasets.load_digits()]
 data_list = [(d.data, d.target) for d in data_list]
 data_list += [datasets.make_hastie_10_2()]
diff --git a/examples/ensemble/plot_gradient_boosting_oob.py b/examples/ensemble/plot_gradient_boosting_oob.py
index ce13eb4398403..8182eafc2969a 100644
--- a/examples/ensemble/plot_gradient_boosting_oob.py
+++ b/examples/ensemble/plot_gradient_boosting_oob.py
@@ -22,8 +22,8 @@
 The figure also shows the performance of 3-fold cross validation which
 usually gives a better estimate of the test loss
 but is computationally more demanding.
+
 """
-print(__doc__)
 
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
 #
diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py
index 93bc70038d3f6..ebcf9c15f3103 100644
--- a/examples/ensemble/plot_gradient_boosting_quantile.py
+++ b/examples/ensemble/plot_gradient_boosting_quantile.py
@@ -5,7 +5,9 @@
 
 This example shows how quantile regression can be used to create prediction
 intervals.
+
 """
+
 # %%
 # Generate some data for a synthetic regression problem by applying the
 # function f to uniformly sampled random inputs.
diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py
index b322fe3fce0a8..dc29bfbda8f77 100644
--- a/examples/ensemble/plot_gradient_boosting_regression.py
+++ b/examples/ensemble/plot_gradient_boosting_regression.py
@@ -12,8 +12,8 @@
 
 Note: For larger datasets (n_samples >= 10000), please refer to
 :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.
+
 """
-print(__doc__)
 
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
 #         Maria Telenczuk <https://github.com/maikia>
diff --git a/examples/ensemble/plot_gradient_boosting_regularization.py b/examples/ensemble/plot_gradient_boosting_regularization.py
index 5947233d60860..2d4751e1c6c43 100644
--- a/examples/ensemble/plot_gradient_boosting_regularization.py
+++ b/examples/ensemble/plot_gradient_boosting_regularization.py
@@ -18,8 +18,8 @@
 
 .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
     Learning Ed. 2", Springer, 2009.
+
 """
-print(__doc__)
 
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
 #
diff --git a/examples/ensemble/plot_isolation_forest.py b/examples/ensemble/plot_isolation_forest.py
index 12a2bfd846279..5ffe9eb799ac9 100644
--- a/examples/ensemble/plot_isolation_forest.py
+++ b/examples/ensemble/plot_isolation_forest.py
@@ -22,7 +22,6 @@
 for particular samples, they are highly likely to be anomalies.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/ensemble/plot_monotonic_constraints.py b/examples/ensemble/plot_monotonic_constraints.py
index 1039f69b6fbe9..fc1ced458b09e 100644
--- a/examples/ensemble/plot_monotonic_constraints.py
+++ b/examples/ensemble/plot_monotonic_constraints.py
@@ -17,15 +17,15 @@
 
 This example was inspired by the `XGBoost documentation
 <https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html>`_.
+
 """
+
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.inspection import PartialDependenceDisplay
 import numpy as np
 import matplotlib.pyplot as plt
 
 
-print(__doc__)
-
 rng = np.random.RandomState(0)
 
 n_samples = 5000
diff --git a/examples/ensemble/plot_random_forest_embedding.py b/examples/ensemble/plot_random_forest_embedding.py
index 339dab440502d..000b83e67b92a 100644
--- a/examples/ensemble/plot_random_forest_embedding.py
+++ b/examples/ensemble/plot_random_forest_embedding.py
@@ -23,7 +23,9 @@
 decision boundary obtained by BernoulliNB in the transformed
 space with an ExtraTreesClassifier forests learned on the
 original data.
+
 """
+
 import numpy as np
 import matplotlib.pyplot as plt
 
diff --git a/examples/ensemble/plot_random_forest_regression_multioutput.py b/examples/ensemble/plot_random_forest_regression_multioutput.py
index 220f759ba40b1..4b3d4f4a9a728 100644
--- a/examples/ensemble/plot_random_forest_regression_multioutput.py
+++ b/examples/ensemble/plot_random_forest_regression_multioutput.py
@@ -20,7 +20,6 @@
 x and y coordinate as output.
 
 """
-print(__doc__)
 
 # Author: Tim Head <betatim@gmail.com>
 #
diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py
index 1b48d50f2c40f..b70c53c8775dd 100644
--- a/examples/ensemble/plot_stack_predictors.py
+++ b/examples/ensemble/plot_stack_predictors.py
@@ -20,8 +20,6 @@
 #          Maria Telenczuk    <https://github.com/maikia>
 # License: BSD 3 clause
 
-print(__doc__)
-
 from sklearn import set_config
 
 set_config(display="diagram")
diff --git a/examples/ensemble/plot_voting_decision_regions.py b/examples/ensemble/plot_voting_decision_regions.py
index 7b2cb278c9035..58bcd2dfc7404 100644
--- a/examples/ensemble/plot_voting_decision_regions.py
+++ b/examples/ensemble/plot_voting_decision_regions.py
@@ -22,7 +22,6 @@
 classifier when the averaged probability is calculated.
 
 """
-print(__doc__)
 
 from itertools import product
 
diff --git a/examples/ensemble/plot_voting_probas.py b/examples/ensemble/plot_voting_probas.py
index 311539f36fc75..54c290c3073e0 100644
--- a/examples/ensemble/plot_voting_probas.py
+++ b/examples/ensemble/plot_voting_probas.py
@@ -22,7 +22,6 @@
 example dataset.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/ensemble/plot_voting_regressor.py b/examples/ensemble/plot_voting_regressor.py
index bc0c0b059e9e6..23e709cc9e62a 100644
--- a/examples/ensemble/plot_voting_regressor.py
+++ b/examples/ensemble/plot_voting_regressor.py
@@ -22,7 +22,6 @@
 measure of disease progression one year after baseline.
 
 """
-print(__doc__)
 
 import matplotlib.pyplot as plt
 
diff --git a/examples/exercises/plot_cv_diabetes.py b/examples/exercises/plot_cv_diabetes.py
index aa836111f081d..e318a50fc8d1c 100644
--- a/examples/exercises/plot_cv_diabetes.py
+++ b/examples/exercises/plot_cv_diabetes.py
@@ -7,9 +7,8 @@
 
 This exercise is used in the :ref:`cv_estimators_tut` part of the
 :ref:`model_selection_tut` section of the :ref:`stat_learn_tut_index`.
-"""
 
-print(__doc__)
+"""
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/exercises/plot_cv_digits.py b/examples/exercises/plot_cv_digits.py
index f350444395be7..e43bbd86bb027 100644
--- a/examples/exercises/plot_cv_digits.py
+++ b/examples/exercises/plot_cv_digits.py
@@ -7,9 +7,8 @@
 
 This exercise is used in the :ref:`cv_generators_tut` part of the
 :ref:`model_selection_tut` section of the :ref:`stat_learn_tut_index`.
-"""
-print(__doc__)
 
+"""
 
 import numpy as np
 from sklearn.model_selection import cross_val_score
diff --git a/examples/exercises/plot_digits_classification_exercise.py b/examples/exercises/plot_digits_classification_exercise.py
index 638631f80d066..877e615659743 100644
--- a/examples/exercises/plot_digits_classification_exercise.py
+++ b/examples/exercises/plot_digits_classification_exercise.py
@@ -9,8 +9,8 @@
 This exercise is used in the :ref:`clf_tut` part of the
 :ref:`supervised_learning_tut` section of the
 :ref:`stat_learn_tut_index`.
+
 """
-print(__doc__)
 
 from sklearn import datasets, neighbors, linear_model
 
diff --git a/examples/exercises/plot_iris_exercise.py b/examples/exercises/plot_iris_exercise.py
index 98586c311697e..74da8c27889c9 100644
--- a/examples/exercises/plot_iris_exercise.py
+++ b/examples/exercises/plot_iris_exercise.py
@@ -7,9 +7,8 @@
 
 This exercise is used in the :ref:`using_kernels_tut` part of the
 :ref:`supervised_learning_tut` section of the :ref:`stat_learn_tut_index`.
-"""
-print(__doc__)
 
+"""
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/feature_selection/plot_f_test_vs_mi.py b/examples/feature_selection/plot_f_test_vs_mi.py
index d6fef3e62da0d..a8cfc5d426bbc 100644
--- a/examples/feature_selection/plot_f_test_vs_mi.py
+++ b/examples/feature_selection/plot_f_test_vs_mi.py
@@ -20,8 +20,8 @@
 kind of dependency between variables and it rates x_2 as the most
 discriminative feature, which probably agrees better with our intuitive
 perception for this example. Both methods correctly marks x_3 as irrelevant.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py
index a68019568e015..1b15e5209c619 100644
--- a/examples/feature_selection/plot_feature_selection.py
+++ b/examples/feature_selection/plot_feature_selection.py
@@ -18,8 +18,8 @@
 Applying univariate feature selection before the SVM
 increases the SVM weight attributed to the significant features, and will
 thus improve classification.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py
index a9a426a0e7b47..22a8bfc8dd5c3 100644
--- a/examples/feature_selection/plot_feature_selection_pipeline.py
+++ b/examples/feature_selection/plot_feature_selection_pipeline.py
@@ -7,9 +7,9 @@
 a machine learning pipeline.
 
 We also show that you can easily introspect part of the pipeline.
+
 """
 
-print(__doc__)
 from sklearn import set_config
 
 set_config(display="diagram")
diff --git a/examples/feature_selection/plot_rfe_digits.py b/examples/feature_selection/plot_rfe_digits.py
index 7c486c6237640..9684f5fabd383 100644
--- a/examples/feature_selection/plot_rfe_digits.py
+++ b/examples/feature_selection/plot_rfe_digits.py
@@ -11,7 +11,6 @@
     See also :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`
 
 """  # noqa: E501
-print(__doc__)
 
 from sklearn.svm import SVC
 from sklearn.datasets import load_digits
diff --git a/examples/feature_selection/plot_rfe_with_cross_validation.py b/examples/feature_selection/plot_rfe_with_cross_validation.py
index 160fd5a826376..4ebd4b55730c8 100644
--- a/examples/feature_selection/plot_rfe_with_cross_validation.py
+++ b/examples/feature_selection/plot_rfe_with_cross_validation.py
@@ -5,8 +5,8 @@
 
 A recursive feature elimination example with automatic tuning of the
 number of features selected with cross-validation.
+
 """
-print(__doc__)
 
 import matplotlib.pyplot as plt
 from sklearn.svm import SVC
diff --git a/examples/feature_selection/plot_select_from_model_diabetes.py b/examples/feature_selection/plot_select_from_model_diabetes.py
index 353774acd113e..9fa3acf78b4e7 100644
--- a/examples/feature_selection/plot_select_from_model_diabetes.py
+++ b/examples/feature_selection/plot_select_from_model_diabetes.py
@@ -16,10 +16,8 @@
 `Maria Telenczuk <https://github.com/maikia>`_, Nicolas Hug.
 
 License: BSD 3 clause
-"""
-
-print(__doc__)
 
+"""
 
 # %%
 # Loading the data
diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py
index dd60fa2d6dda8..d0aa462f5c3eb 100644
--- a/examples/gaussian_process/plot_compare_gpr_krr.py
+++ b/examples/gaussian_process/plot_compare_gpr_krr.py
@@ -43,13 +43,12 @@
 hyperparameter space. The time for predicting is similar; however, generating
 the variance of the predictive distribution of GPR takes considerable longer
 than just predicting the mean.
+
 """
-print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 # License: BSD 3 clause
 
-
 import time
 
 import numpy as np
diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index ab22134e2048c..e2d78fa23f09e 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -19,8 +19,8 @@
 The second figure shows the log-marginal-likelihood for different choices of
 the kernel's hyperparameters, highlighting the two choices of the
 hyperparameters used in the first figure by black dots.
+
 """
-print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
diff --git a/examples/gaussian_process/plot_gpc_iris.py b/examples/gaussian_process/plot_gpc_iris.py
index 4aa4121664ece..ce0ed066a1377 100644
--- a/examples/gaussian_process/plot_gpc_iris.py
+++ b/examples/gaussian_process/plot_gpc_iris.py
@@ -7,8 +7,8 @@
 and anisotropic RBF kernel on a two-dimensional version for the iris-dataset.
 The anisotropic RBF kernel obtains slightly higher log-marginal-likelihood by
 assigning different length-scales to the two feature dimensions.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py
index 97673d9badadb..84f1ecb98bd3c 100644
--- a/examples/gaussian_process/plot_gpc_isoprobability.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -6,8 +6,8 @@
 
 A two-dimensional classification example showing iso-probability lines for
 the predicted probabilities.
+
 """
-print(__doc__)
 
 # Author: Vincent Dubourg <vincent.dubourg@gmail.com>
 # Adapted to GaussianProcessClassifier:
diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
index 011f36ada1021..6eebbcf80098e 100644
--- a/examples/gaussian_process/plot_gpc_xor.py
+++ b/examples/gaussian_process/plot_gpc_xor.py
@@ -8,8 +8,8 @@
 dataset, the DotProduct kernel obtains considerably better results because the
 class-boundaries are linear and coincide with the coordinate axes. In general,
 stationary kernels often obtain better results.
+
 """
-print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index 7afe41d77cc63..988ec228817ff 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -56,12 +56,13 @@
 overall noise level is very small, indicating that the data can be very well
 explained by the model. The figure shows also that the model makes very
 confident predictions until around 2015.
+
 """
+
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
 # License: BSD 3 clause
 
-
 import numpy as np
 
 from matplotlib import pyplot as plt
@@ -74,8 +75,6 @@
     ExpSineSquared,
 )
 
-print(__doc__)
-
 
 def load_mauna_loa_atmospheric_co2():
     ml_data = fetch_openml(data_id=41187, as_frame=False)
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index 0bba4827cd685..c5d5a77780df3 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -14,8 +14,8 @@
 hyperparameters, the gradient-based optimization might also converge to the
 high-noise solution. It is thus important to repeat the optimization several
 times for different initializations.
+
 """
-print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py
index 0058589f7a786..34b404b48eb2c 100644
--- a/examples/gaussian_process/plot_gpr_noisy_targets.py
+++ b/examples/gaussian_process/plot_gpr_noisy_targets.py
@@ -17,8 +17,8 @@
 
 Note that the parameter ``alpha`` is applied as a Tikhonov
 regularization of the assumed covariance between the training points.
+
 """
-print(__doc__)
 
 # Author: Vincent Dubourg <vincent.dubourg@gmail.com>
 #         Jake Vanderplas <vanderplas@astro.washington.edu>
diff --git a/examples/gaussian_process/plot_gpr_on_structured_data.py b/examples/gaussian_process/plot_gpr_on_structured_data.py
index 59af31664cb74..f54ca8e2aeacc 100644
--- a/examples/gaussian_process/plot_gpr_on_structured_data.py
+++ b/examples/gaussian_process/plot_gpr_on_structured_data.py
@@ -35,8 +35,8 @@
 .. [1] Haussler, D. (1999). Convolution kernels on discrete structures
        (Vol. 646). Technical report, Department of Computer Science, University
        of California at Santa Cruz.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index bd82613a20a45..437d67f5b0ab9 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -10,8 +10,8 @@
 
 Here, we only give some illustration. To know more about kernels' formulation,
 refer to the :ref:`User Guide <gp_kernels>`.
+
 """
-print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #          Guillaume Lemaitre <g.lemaitre58@gmail.com>
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index 77f015fd9ae12..9aef74343871c 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -37,8 +37,8 @@
 For this particular pattern of missing values we see that
 :class:`~ensemble.ExtraTreesRegressor` and
 :class:`~linear_model.BayesianRidge` give the best results.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index ec9d881af85b1..f292fc27db798 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -27,7 +27,6 @@
 missing values imputed using different techniques.
 
 """
-print(__doc__)
 
 # Authors: Maria Telenczuk  <https://github.com/maikia>
 # License: BSD 3 clause
diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py
index b8fe44009e4f6..ddc03895f2215 100644
--- a/examples/inspection/plot_linear_model_coefficient_interpretation.py
+++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py
@@ -26,9 +26,8 @@
 .. contents::
    :local:
    :depth: 1
-"""
 
-print(__doc__)
+"""
 
 import numpy as np
 import scipy as sp
diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py
index 1a9b91c39b585..4d22077fdbc09 100644
--- a/examples/inspection/plot_partial_dependence.py
+++ b/examples/inspection/plot_partial_dependence.py
@@ -31,9 +31,8 @@
        the Black Box: Visualizing Statistical Learning With Plots of
        Individual Conditional Expectation. (2015) Journal of Computational and
        Graphical Statistics, 24(1): 44-65 (https://arxiv.org/abs/1309.6392)
-"""
 
-print(__doc__)
+"""
 
 # %%
 # California Housing data preprocessing
diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py
index c67b24326e128..a2ba6a8e7ee96 100644
--- a/examples/inspection/plot_permutation_importance.py
+++ b/examples/inspection/plot_permutation_importance.py
@@ -22,8 +22,9 @@
 
    [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32,
        2001. https://doi.org/10.1023/A:1010933404324
+
 """
-print(__doc__)
+
 import matplotlib.pyplot as plt
 import numpy as np
 
diff --git a/examples/inspection/plot_permutation_importance_multicollinear.py b/examples/inspection/plot_permutation_importance_multicollinear.py
index 23edc95583483..59871c00946a6 100644
--- a/examples/inspection/plot_permutation_importance_multicollinear.py
+++ b/examples/inspection/plot_permutation_importance_multicollinear.py
@@ -15,8 +15,9 @@
 .. note::
     See also
     :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py`
+
 """
-print(__doc__)
+
 from collections import defaultdict
 
 import matplotlib.pyplot as plt
diff --git a/examples/kernel_approximation/plot_scalable_poly_kernels.py b/examples/kernel_approximation/plot_scalable_poly_kernels.py
index c52b9fd59668d..21c55fe016889 100644
--- a/examples/kernel_approximation/plot_scalable_poly_kernels.py
+++ b/examples/kernel_approximation/plot_scalable_poly_kernels.py
@@ -18,11 +18,12 @@
 features. Then, we train linear classifiers on different numbers of
 features (`n_components`) generated by :class:`PolynomialCountSketch`,
 approximating the accuracy of a kernelized classifier in a scalable manner.
+
 """
-print(__doc__)
 
 # Author: Daniel Lopez-Sanchez <lope@usal.es>
 # License: BSD 3 clause
+
 import matplotlib.pyplot as plt
 from sklearn.datasets import fetch_covtype
 from sklearn.model_selection import train_test_split
diff --git a/examples/linear_model/plot_ard.py b/examples/linear_model/plot_ard.py
index 58529fe37a2cc..0691a76e8ee13 100644
--- a/examples/linear_model/plot_ard.py
+++ b/examples/linear_model/plot_ard.py
@@ -21,8 +21,8 @@
 Note the uncertainty starts going up on the right side of the plot.
 This is because these test samples are outside of the range of the training
 samples.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/linear_model/plot_bayesian_ridge.py b/examples/linear_model/plot_bayesian_ridge.py
index a9593a5b41061..7f823d99236ea 100644
--- a/examples/linear_model/plot_bayesian_ridge.py
+++ b/examples/linear_model/plot_bayesian_ridge.py
@@ -21,8 +21,8 @@
 Note the uncertainty starts going up on the right side of the plot.
 This is because these test samples are outside of the range of the training
 samples.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/linear_model/plot_bayesian_ridge_curvefit.py b/examples/linear_model/plot_bayesian_ridge_curvefit.py
index fc4f9e6cc384c..f174bc3059078 100644
--- a/examples/linear_model/plot_bayesian_ridge_curvefit.py
+++ b/examples/linear_model/plot_bayesian_ridge_curvefit.py
@@ -23,8 +23,8 @@
 Also, by evaluating log marginal likelihood (L) of
 these models, we can determine which one is better.
 It can be concluded that the model with larger L is more likely.
+
 """
-print(__doc__)
 
 # Author: Yoshihiro Uchida <nimbus1after2a1sun7shower@gmail.com>
 
diff --git a/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py b/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py
index 977759bc66b4c..3bca3101758ff 100644
--- a/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py
+++ b/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py
@@ -17,8 +17,6 @@
 
 """
 
-print(__doc__)
-
 # %%
 # Let's start by loading the dataset and creating some sample weights.
 import numpy as np
diff --git a/examples/linear_model/plot_huber_vs_ridge.py b/examples/linear_model/plot_huber_vs_ridge.py
index d3137f3ac26cb..2ea5a190e35d8 100644
--- a/examples/linear_model/plot_huber_vs_ridge.py
+++ b/examples/linear_model/plot_huber_vs_ridge.py
@@ -10,13 +10,12 @@
 influenced by the outliers since the model uses the linear loss for these.
 As the parameter epsilon is increased for the Huber regressor, the decision
 function approaches that of the ridge.
+
 """
 
 # Authors: Manoj Kumar mks542@nyu.edu
 # License: BSD 3 clause
 
-print(__doc__)
-
 import numpy as np
 import matplotlib.pyplot as plt
 
diff --git a/examples/linear_model/plot_iris_logistic.py b/examples/linear_model/plot_iris_logistic.py
index 594db07484fd9..88a1313662084 100644
--- a/examples/linear_model/plot_iris_logistic.py
+++ b/examples/linear_model/plot_iris_logistic.py
@@ -10,7 +10,6 @@
 are colored according to their labels.
 
 """
-print(__doc__)
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
diff --git a/examples/linear_model/plot_lasso_and_elasticnet.py b/examples/linear_model/plot_lasso_and_elasticnet.py
index 6a4b481048d8b..a4c0e9281fd76 100644
--- a/examples/linear_model/plot_lasso_and_elasticnet.py
+++ b/examples/linear_model/plot_lasso_and_elasticnet.py
@@ -8,7 +8,6 @@
 compared with the ground-truth.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/linear_model/plot_lasso_coordinate_descent_path.py b/examples/linear_model/plot_lasso_coordinate_descent_path.py
index 30addd843779c..7aef55ff4af88 100644
--- a/examples/linear_model/plot_lasso_coordinate_descent_path.py
+++ b/examples/linear_model/plot_lasso_coordinate_descent_path.py
@@ -7,8 +7,8 @@
 coordinate descent.
 
 The coefficients can be forced to be positive.
+
 """
-print(__doc__)
 
 # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 # License: BSD 3 clause
diff --git a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
index a7e8c22042459..46d5dfccfa33b 100644
--- a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
+++ b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
@@ -7,7 +7,6 @@
 data and that in the case of sparse data the speed is improved.
 
 """
-print(__doc__)
 
 from time import time
 from scipy import sparse
diff --git a/examples/linear_model/plot_lasso_lars.py b/examples/linear_model/plot_lasso_lars.py
index 08b14536e1c74..6788b8b1d1598 100644
--- a/examples/linear_model/plot_lasso_lars.py
+++ b/examples/linear_model/plot_lasso_lars.py
@@ -9,7 +9,6 @@
 of the regularization parameter.
 
 """
-print(__doc__)
 
 # Author: Fabian Pedregosa <fabian.pedregosa@inria.fr>
 #         Alexandre Gramfort <alexandre.gramfort@inria.fr>
diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py
index 2bbc4a9e8d921..b2792c92f15bd 100644
--- a/examples/linear_model/plot_lasso_model_selection.py
+++ b/examples/linear_model/plot_lasso_model_selection.py
@@ -40,8 +40,8 @@
 evaluate the performance of a method for which a parameter is chosen by
 cross-validation: this choice of parameter may not be optimal for unseen
 data.
+
 """
-print(__doc__)
 
 # Author: Olivier Grisel, Gael Varoquaux, Alexandre Gramfort
 # License: BSD 3 clause
diff --git a/examples/linear_model/plot_logistic.py b/examples/linear_model/plot_logistic.py
index f0d2b7cb25170..fac3fe1a4c039 100644
--- a/examples/linear_model/plot_logistic.py
+++ b/examples/linear_model/plot_logistic.py
@@ -9,8 +9,6 @@
 i.e. class one or two, using the logistic curve.
 
 """
-print(__doc__)
-
 
 # Code source: Gael Varoquaux
 # License: BSD 3 clause
diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py
index bd42966604075..ce0afef012a2b 100644
--- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py
+++ b/examples/linear_model/plot_logistic_l1_l2_sparsity.py
@@ -12,9 +12,8 @@
 
 We classify 8x8 images of digits into two classes: 0-4 against 5-9.
 The visualization shows coefficients of the models for varying C.
-"""
 
-print(__doc__)
+"""
 
 # Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 #          Mathieu Blondel <mathieu@mblondel.org>
diff --git a/examples/linear_model/plot_logistic_multinomial.py b/examples/linear_model/plot_logistic_multinomial.py
index ec22777f71e8a..143e946b76d58 100644
--- a/examples/linear_model/plot_logistic_multinomial.py
+++ b/examples/linear_model/plot_logistic_multinomial.py
@@ -6,8 +6,9 @@
 Plot decision surface of multinomial and One-vs-Rest Logistic Regression.
 The hyperplanes corresponding to the three One-vs-Rest (OVR) classifiers
 are represented by the dashed lines.
+
 """
-print(__doc__)
+
 # Authors: Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
 # License: BSD 3 clause
 
diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py
index a63a767e9f875..b74d6a67021d0 100644
--- a/examples/linear_model/plot_logistic_path.py
+++ b/examples/linear_model/plot_logistic_path.py
@@ -24,7 +24,6 @@
 full-path.
 
 """
-print(__doc__)
 
 # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 # License: BSD 3 clause
diff --git a/examples/linear_model/plot_multi_task_lasso_support.py b/examples/linear_model/plot_multi_task_lasso_support.py
index 16d6139a822b9..b53c78b986acd 100644
--- a/examples/linear_model/plot_multi_task_lasso_support.py
+++ b/examples/linear_model/plot_multi_task_lasso_support.py
@@ -12,7 +12,6 @@
 point. This makes feature selection by the Lasso more stable.
 
 """
-print(__doc__)
 
 # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 # License: BSD 3 clause
diff --git a/examples/linear_model/plot_nnls.py b/examples/linear_model/plot_nnls.py
index 02a6dade30cae..c8ba2914d783a 100644
--- a/examples/linear_model/plot_nnls.py
+++ b/examples/linear_model/plot_nnls.py
@@ -6,8 +6,9 @@
 In this example, we fit a linear model with positive constraints on the
 regression coefficients and compare the estimated coefficients to a classic
 linear regression.
+
 """
-print(__doc__)
+
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.metrics import r2_score
diff --git a/examples/linear_model/plot_ols.py b/examples/linear_model/plot_ols.py
index fa63ddb6880bc..2d1930eab1b2a 100644
--- a/examples/linear_model/plot_ols.py
+++ b/examples/linear_model/plot_ols.py
@@ -12,14 +12,12 @@
 
 The coefficients, residual sum of squares and the coefficient of
 determination are also calculated.
-"""
-print(__doc__)
 
+"""
 
 # Code source: Jaques Grobler
 # License: BSD 3 clause
 
-
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn import datasets, linear_model
diff --git a/examples/linear_model/plot_ols_3d.py b/examples/linear_model/plot_ols_3d.py
index fb1e2a2cc03f4..2c9e0c7a91bc0 100644
--- a/examples/linear_model/plot_ols_3d.py
+++ b/examples/linear_model/plot_ols_3d.py
@@ -10,8 +10,6 @@
 give us much regarding `y` when compared to just feature 1
 
 """
-print(__doc__)
-
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
diff --git a/examples/linear_model/plot_ols_ridge_variance.py b/examples/linear_model/plot_ols_ridge_variance.py
index a83978396ee3f..4c668f97786be 100644
--- a/examples/linear_model/plot_ols_ridge_variance.py
+++ b/examples/linear_model/plot_ols_ridge_variance.py
@@ -17,9 +17,8 @@
 of the prediction is much more stable and the variance
 in the line itself is greatly reduced, in comparison to that
 of the standard linear regression
-"""
-print(__doc__)
 
+"""
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
diff --git a/examples/linear_model/plot_omp.py b/examples/linear_model/plot_omp.py
index 2431ca10f5fb7..a4ab6515026ee 100644
--- a/examples/linear_model/plot_omp.py
+++ b/examples/linear_model/plot_omp.py
@@ -5,8 +5,8 @@
 
 Using orthogonal matching pursuit for recovering a sparse signal from a noisy
 measurement encoded with a dictionary
+
 """
-print(__doc__)
 
 import matplotlib.pyplot as plt
 import numpy as np
diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index 7c9abd261937d..cadc25412eb57 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -35,11 +35,12 @@
     <http://dx.doi.org/10.2139/ssrn.3164764>`_
 
 """
-print(__doc__)
+
 # Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
 #          Roman Yurchak <rth.yurchak@gmail.com>
 #          Olivier Grisel <olivier.grisel@ensta.org>
 # License: BSD 3 clause
+
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
diff --git a/examples/linear_model/plot_polynomial_interpolation.py b/examples/linear_model/plot_polynomial_interpolation.py
index db1666168c88e..49a69ce894a52 100644
--- a/examples/linear_model/plot_polynomial_interpolation.py
+++ b/examples/linear_model/plot_polynomial_interpolation.py
@@ -33,8 +33,8 @@
 non-linear effects with a linear model, using a pipeline to add non-linear
 features. Kernel methods extend this idea and can induce very high (even
 infinite) dimensional feature spaces.
+
 """
-print(__doc__)
 
 # Author: Mathieu Blondel
 #         Jake Vanderplas
diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py
index 9fae4720499af..b4e1534efc54f 100644
--- a/examples/linear_model/plot_quantile_regression.py
+++ b/examples/linear_model/plot_quantile_regression.py
@@ -11,9 +11,8 @@
 
 The right figure shows an example of an asymmetric error distribution,
 namely the Pareto distribution.
-"""
 
-print(__doc__)
+"""
 
 # Authors: David Dale <dale.david@mail.ru>
 #          Christian Lorentzen <lorentzen.ch@gmail.com>
diff --git a/examples/linear_model/plot_ransac.py b/examples/linear_model/plot_ransac.py
index 0c4070daf2fe9..81670061a6609 100644
--- a/examples/linear_model/plot_ransac.py
+++ b/examples/linear_model/plot_ransac.py
@@ -7,6 +7,7 @@
 the RANSAC algorithm.
 
 """
+
 import numpy as np
 from matplotlib import pyplot as plt
 
diff --git a/examples/linear_model/plot_ridge_coeffs.py b/examples/linear_model/plot_ridge_coeffs.py
index 0dd395baf5fcd..bfe6d818b2f37 100644
--- a/examples/linear_model/plot_ridge_coeffs.py
+++ b/examples/linear_model/plot_ridge_coeffs.py
@@ -35,12 +35,11 @@
 
 Please note that in this example the data is non-noisy, hence
 it is possible to extract the exact coefficients.
+
 """
 
 # Author: Kornel Kielczewski -- <kornel.k@plusnet.pl>
 
-print(__doc__)
-
 import matplotlib.pyplot as plt
 import numpy as np
 
diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py
index 14fac0b5bdc92..67a8753492734 100644
--- a/examples/linear_model/plot_ridge_path.py
+++ b/examples/linear_model/plot_ridge_path.py
@@ -24,13 +24,12 @@
 and the solution tends towards the ordinary least squares, coefficients
-exhibit big oscillations. In practise it is necessary to tune alpha
+exhibit big oscillations. In practice it is necessary to tune alpha
 in such a way that a balance is maintained between both.
+
 """
 
 # Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
 # License: BSD 3 clause
 
-print(__doc__)
-
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn import linear_model
diff --git a/examples/linear_model/plot_sgd_comparison.py b/examples/linear_model/plot_sgd_comparison.py
index c75775cbe84f0..4e2139b450fb0 100644
--- a/examples/linear_model/plot_sgd_comparison.py
+++ b/examples/linear_model/plot_sgd_comparison.py
@@ -7,6 +7,7 @@
 on the hand-written digits dataset.
 
 """
+
 # Author: Rob Zinkov <rob at zinkov dot com>
 # License: BSD 3 clause
 
diff --git a/examples/linear_model/plot_sgd_early_stopping.py b/examples/linear_model/plot_sgd_early_stopping.py
index baa63f770bb6c..6e41c961ebb93 100644
--- a/examples/linear_model/plot_sgd_early_stopping.py
+++ b/examples/linear_model/plot_sgd_early_stopping.py
@@ -34,10 +34,13 @@
 significantly reduce training time. Note that scores differ between the
 stopping criteria even from early iterations because some of the training data
 is held out with the validation stopping criterion.
+
 """
+
 # Authors: Tom Dupre la Tour
 #
 # License: BSD 3 clause
+
 import time
 import sys
 
@@ -52,8 +55,6 @@
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.utils import shuffle
 
-print(__doc__)
-
 
 def load_mnist(n_samples=None, class_0="0", class_1="8"):
     """Load MNIST, select two classes, shuffle and return only n_samples."""
diff --git a/examples/linear_model/plot_sgd_iris.py b/examples/linear_model/plot_sgd_iris.py
index 367ec6a1f69af..0113c259d7afa 100644
--- a/examples/linear_model/plot_sgd_iris.py
+++ b/examples/linear_model/plot_sgd_iris.py
@@ -8,7 +8,6 @@
 are represented by the dashed lines.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/linear_model/plot_sgd_loss_functions.py b/examples/linear_model/plot_sgd_loss_functions.py
index 0cc4378ac9286..a1f74dca4d6af 100644
--- a/examples/linear_model/plot_sgd_loss_functions.py
+++ b/examples/linear_model/plot_sgd_loss_functions.py
@@ -5,8 +5,8 @@
 
 A plot that compares the various convex loss functions supported by
 :class:`~sklearn.linear_model.SGDClassifier` .
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/linear_model/plot_sgd_penalties.py b/examples/linear_model/plot_sgd_penalties.py
index 795be3a15c4dc..68df56063e642 100644
--- a/examples/linear_model/plot_sgd_penalties.py
+++ b/examples/linear_model/plot_sgd_penalties.py
@@ -10,7 +10,6 @@
 and :class:`~sklearn.linear_model.SGDRegressor`.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/linear_model/plot_sgd_separating_hyperplane.py b/examples/linear_model/plot_sgd_separating_hyperplane.py
index b485c32fbd4d5..af288fcd3dde0 100644
--- a/examples/linear_model/plot_sgd_separating_hyperplane.py
+++ b/examples/linear_model/plot_sgd_separating_hyperplane.py
@@ -6,8 +6,8 @@
 Plot the maximum margin separating hyperplane within a two-class
 separable dataset using a linear Support Vector Machines classifier
 trained using SGD.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/linear_model/plot_sgd_weighted_samples.py b/examples/linear_model/plot_sgd_weighted_samples.py
index 64156fe7096c1..56cb47d0d4902 100644
--- a/examples/linear_model/plot_sgd_weighted_samples.py
+++ b/examples/linear_model/plot_sgd_weighted_samples.py
@@ -5,8 +5,8 @@
 
 Plot decision function of a weighted dataset, where the size of points
 is proportional to its weight.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py
index 2252ad1fc98e9..c25f4a84d91e0 100644
--- a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py
+++ b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py
@@ -16,8 +16,8 @@
 number of samples. It is not the purpose of this example to illustrate the
 benefits of such an approximation in terms of computation time but rather to
 show that we obtain similar results on a toy dataset.
-"""
-print(__doc__)  # noqa
+
+"""  # noqa: E501
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py
index 71de01bbf34a1..10c4adeeff200 100644
--- a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py
+++ b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py
@@ -17,7 +17,11 @@
 A more traditional (and possibly better) way to predict on a sparse subset of
 input features would be to use univariate feature selection followed by a
 traditional (l2-penalised) logistic regression model.
+
 """
+
+# Author: Arthur Mensch
+
 import timeit
 import warnings
 
@@ -29,9 +33,6 @@
 from sklearn.model_selection import train_test_split
 from sklearn.exceptions import ConvergenceWarning
 
-print(__doc__)
-# Author: Arthur Mensch
-
 warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
 t0 = timeit.default_timer()
 
diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py
index 27d428881216f..5862369bc7bff 100644
--- a/examples/linear_model/plot_sparse_logistic_regression_mnist.py
+++ b/examples/linear_model/plot_sparse_logistic_regression_mnist.py
@@ -16,6 +16,10 @@
 multi-layer perceptron model on this dataset.
 
 """
+
+# Author: Arthur Mensch <arthur.mensch@m4x.org>
+# License: BSD 3 clause
+
 import time
 import matplotlib.pyplot as plt
 import numpy as np
@@ -26,11 +30,6 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_random_state
 
-print(__doc__)
-
-# Author: Arthur Mensch <arthur.mensch@m4x.org>
-# License: BSD 3 clause
-
 # Turn down for faster convergence
 t0 = time.time()
 train_samples = 5000
diff --git a/examples/linear_model/plot_theilsen.py b/examples/linear_model/plot_theilsen.py
index 54196188d864a..5d5de89b8e5da 100644
--- a/examples/linear_model/plot_theilsen.py
+++ b/examples/linear_model/plot_theilsen.py
@@ -32,6 +32,7 @@
 therefore also limits the runtime. Therefore, Theil-Sen is applicable to larger
 problems with the drawback of losing some of its mathematical properties since
 it then works on a random subset.
+
 """
 
 # Author: Florian Wilhelm -- <florian.wilhelm@gmail.com>
@@ -43,8 +44,6 @@
 from sklearn.linear_model import LinearRegression, TheilSenRegressor
 from sklearn.linear_model import RANSACRegressor
 
-print(__doc__)
-
 estimators = [
     ("OLS", LinearRegression()),
     ("Theil-Sen", TheilSenRegressor(random_state=42)),
diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
index 1bcf26742d2f1..31c51750a3738 100644
--- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py
+++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
@@ -37,12 +37,12 @@
     <http://dx.doi.org/10.2139/ssrn.3164764>`_
 
 """
-print(__doc__)
 
 # Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
 #          Roman Yurchak <rth.yurchak@gmail.com>
 #          Olivier Grisel <olivier.grisel@ensta.org>
 # License: BSD 3 clause
+
 from functools import partial
 
 import numpy as np
diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py
index a8485f07bf150..1228705c242d8 100644
--- a/examples/manifold/plot_compare_methods.py
+++ b/examples/manifold/plot_compare_methods.py
@@ -17,12 +17,11 @@
 the distances in the original high-dimensional space, unlike other
-manifold-learning algorithms, it does not seeks an isotropic
+manifold-learning algorithms, it does not seek an isotropic
 representation of the data in the low-dimensional space.
+
 """
 
 # Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
 
-print(__doc__)
-
 from collections import OrderedDict
 from functools import partial
 from time import time
diff --git a/examples/manifold/plot_lle_digits.py b/examples/manifold/plot_lle_digits.py
index 647f9e8a5acdb..ba8d0f839272c 100644
--- a/examples/manifold/plot_lle_digits.py
+++ b/examples/manifold/plot_lle_digits.py
@@ -4,9 +4,8 @@
 =============================================================================
 
 We illustrate various embedding techniques on the digits dataset.
-"""
 
-print(__doc__)
+"""
 
 # Authors: Fabian Pedregosa <fabian.pedregosa@inria.fr>
 #          Olivier Grisel <olivier.grisel@ensta.org>
diff --git a/examples/manifold/plot_manifold_sphere.py b/examples/manifold/plot_manifold_sphere.py
index 94616fa67167e..67f1447809841 100644
--- a/examples/manifold/plot_manifold_sphere.py
+++ b/examples/manifold/plot_manifold_sphere.py
@@ -23,13 +23,12 @@
 the low-dimensional space. Here the manifold problem matches fairly
 that of representing a flat map of the Earth, as with
 `map projection <https://en.wikipedia.org/wiki/Map_projection>`_
+
 """
 
 # Author: Jaques Grobler <jaques.grobler@inria.fr>
 # License: BSD 3 clause
 
-print(__doc__)
-
 from time import time
 
 import numpy as np
diff --git a/examples/manifold/plot_mds.py b/examples/manifold/plot_mds.py
index 1605ae5b20164..1d74856da6dac 100644
--- a/examples/manifold/plot_mds.py
+++ b/examples/manifold/plot_mds.py
@@ -7,12 +7,12 @@
 
 The reconstructed points using the metric MDS and non metric MDS are slightly
 shifted to avoid overlapping.
+
 """
 
 # Author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
 # License: BSD
 
-print(__doc__)
 import numpy as np
 
 from matplotlib import pyplot as plt
diff --git a/examples/manifold/plot_swissroll.py b/examples/manifold/plot_swissroll.py
index 3aa2088c22687..ee5fe196fa6ba 100644
--- a/examples/manifold/plot_swissroll.py
+++ b/examples/manifold/plot_swissroll.py
@@ -5,13 +5,12 @@
 
 An illustration of Swiss Roll reduction
 with locally linear embedding
+
 """
 
 # Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
 # License: BSD 3 clause (C) INRIA 2011
 
-print(__doc__)
-
 import matplotlib.pyplot as plt
 
 # This import is needed to modify the way figure behaves
diff --git a/examples/manifold/plot_t_sne_perplexity.py b/examples/manifold/plot_t_sne_perplexity.py
index 04da2eb51acb1..6fd83f48bdb3f 100644
--- a/examples/manifold/plot_t_sne_perplexity.py
+++ b/examples/manifold/plot_t_sne_perplexity.py
@@ -21,13 +21,12 @@
 https://distill.pub/2016/misread-tsne/ provides a good discussion of the
 effects of various parameters, as well as interactive plots to explore
 those effects.
+
 """
 
 # Author: Narine Kokhlikyan <narine@slice.com>
 # License: BSD
 
-print(__doc__)
-
 import numpy as np
 import matplotlib.pyplot as plt
 
diff --git a/examples/miscellaneous/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py
index 924ab47fa81ca..efb4f6d86edfc 100644
--- a/examples/miscellaneous/plot_anomaly_comparison.py
+++ b/examples/miscellaneous/plot_anomaly_comparison.py
@@ -59,6 +59,7 @@
 Finally, note that parameters of the models have been here handpicked but
 that in practice they need to be adjusted. In the absence of labelled data,
 the problem is completely unsupervised so model selection can be a challenge.
+
 """
 
 # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
@@ -80,8 +81,6 @@
 from sklearn.kernel_approximation import Nystroem
 from sklearn.pipeline import make_pipeline
 
-print(__doc__)
-
 matplotlib.rcParams["contour.negative_linestyle"] = "solid"
 
 # Example settings
diff --git a/examples/miscellaneous/plot_changed_only_pprint_parameter.py b/examples/miscellaneous/plot_changed_only_pprint_parameter.py
index d27b17f3cc82d..c213c4b5d98af 100644
--- a/examples/miscellaneous/plot_changed_only_pprint_parameter.py
+++ b/examples/miscellaneous/plot_changed_only_pprint_parameter.py
@@ -8,8 +8,8 @@
 Setting print_changed_only to True will alternate the representation of
 estimators to only show the parameters that have been set to non-default
 values. This can be used to have more compact representations.
+
 """
-print(__doc__)
 
 from sklearn.linear_model import LogisticRegression
 from sklearn import set_config
diff --git a/examples/miscellaneous/plot_display_object_visualization.py b/examples/miscellaneous/plot_display_object_visualization.py
index bb9d252fe830c..e36b4de78d62a 100644
--- a/examples/miscellaneous/plot_display_object_visualization.py
+++ b/examples/miscellaneous/plot_display_object_visualization.py
@@ -12,8 +12,8 @@
 a model's predictions are already computed or expensive to compute. Note that
 this is advanced usage, and in general we recommend using their respective
 plot functions.
+
 """
-print(__doc__)
 
 # %%
 # Load Data and train model
diff --git a/examples/miscellaneous/plot_isotonic_regression.py b/examples/miscellaneous/plot_isotonic_regression.py
index f75bc1144e881..0240a8dec34b5 100644
--- a/examples/miscellaneous/plot_isotonic_regression.py
+++ b/examples/miscellaneous/plot_isotonic_regression.py
@@ -18,7 +18,6 @@
 target values are computed by the isotonic non-parametric fit.
 
 """
-print(__doc__)
 
 # Author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
 #         Alexandre Gramfort <alexandre.gramfort@inria.fr>
diff --git a/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py
index 64815751efa36..c92920ffb7280 100644
--- a/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py
+++ b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py
@@ -10,9 +10,8 @@
 
 .. _`Johnson-Lindenstrauss lemma`: https://en.wikipedia.org/wiki/\
     Johnson%E2%80%93Lindenstrauss_lemma
-"""
 
-print(__doc__)
+"""
 
 import sys
 from time import time
diff --git a/examples/miscellaneous/plot_kernel_approximation.py b/examples/miscellaneous/plot_kernel_approximation.py
index ffd50e9dca06e..7dfc1e31220e8 100644
--- a/examples/miscellaneous/plot_kernel_approximation.py
+++ b/examples/miscellaneous/plot_kernel_approximation.py
@@ -38,8 +38,6 @@
 #         Andreas Mueller <amueller@ais.uni-bonn.de>
 # License: BSD 3 clause
 
-print(__doc__)
-
 # Standard scientific Python imports
 import matplotlib.pyplot as plt
 import numpy as np
diff --git a/examples/miscellaneous/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py
index eaff3d91fd82d..29a6f1ca6a0d6 100644
--- a/examples/miscellaneous/plot_kernel_ridge_regression.py
+++ b/examples/miscellaneous/plot_kernel_ridge_regression.py
@@ -29,12 +29,12 @@
 KRR for all sizes of the training set because of the learned sparse
 solution. Note that the degree of sparsity and thus the prediction time depends
 on the parameters epsilon and C of the SVR.
+
 """
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 # License: BSD 3 clause
 
-
 import time
 
 import numpy as np
diff --git a/examples/miscellaneous/plot_multilabel.py b/examples/miscellaneous/plot_multilabel.py
index 4958084b9dbd7..aded595258fea 100644
--- a/examples/miscellaneous/plot_multilabel.py
+++ b/examples/miscellaneous/plot_multilabel.py
@@ -1,5 +1,3 @@
-# Authors: Vlad Niculae, Mathieu Blondel
-# License: BSD 3 clause
 """
 =========================
 Multilabel classification
@@ -28,8 +26,11 @@
 Note: in the plot, "unlabeled samples" does not mean that we don't know the
 labels (as in semi-supervised learning) but that the samples simply do *not*
 have a label.
+
 """
-print(__doc__)
+
+# Authors: Vlad Niculae, Mathieu Blondel
+# License: BSD 3 clause
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/miscellaneous/plot_multioutput_face_completion.py b/examples/miscellaneous/plot_multioutput_face_completion.py
index 63b3bea4175ba..31e73195747a5 100644
--- a/examples/miscellaneous/plot_multioutput_face_completion.py
+++ b/examples/miscellaneous/plot_multioutput_face_completion.py
@@ -11,7 +11,6 @@
 regression and ridge regression complete the lower half of those faces.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/miscellaneous/plot_partial_dependence_visualization_api.py b/examples/miscellaneous/plot_partial_dependence_visualization_api.py
index 5f6e61a89c4fe..604ead891877c 100644
--- a/examples/miscellaneous/plot_partial_dependence_visualization_api.py
+++ b/examples/miscellaneous/plot_partial_dependence_visualization_api.py
@@ -12,8 +12,7 @@
 
     See also :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py`
 
-"""  # noqa
-print(__doc__)
+"""  # noqa: E501
 
 import pandas as pd
 import matplotlib.pyplot as plt
diff --git a/examples/miscellaneous/plot_roc_curve_visualization_api.py b/examples/miscellaneous/plot_roc_curve_visualization_api.py
index e1ccd24341493..b4e08493c77d4 100644
--- a/examples/miscellaneous/plot_roc_curve_visualization_api.py
+++ b/examples/miscellaneous/plot_roc_curve_visualization_api.py
@@ -6,8 +6,8 @@
 learning. The key features of this API is to allow for quick plotting and
 visual adjustments without recalculation. In this example, we will demonstrate
 how to use the visualization API by comparing ROC curves.
+
 """
-print(__doc__)
 
 # %%
 # Load Data and Train a SVC
diff --git a/examples/mixture/plot_concentration_prior.py b/examples/mixture/plot_concentration_prior.py
index 65830b6dc1182..1d8642df64f48 100644
--- a/examples/mixture/plot_concentration_prior.py
+++ b/examples/mixture/plot_concentration_prior.py
@@ -26,7 +26,9 @@
 On the contrary the classical finite mixture model with a Dirichlet
 distribution prior will favor more uniformly weighted components and therefore
 tends to divide natural clusters into unnecessary sub-components.
+
 """
+
 # Author: Thierry Guillemot <thierry.guillemot.work@gmail.com>
 # License: BSD 3 clause
 
@@ -37,8 +39,6 @@
 
 from sklearn.mixture import BayesianGaussianMixture
 
-print(__doc__)
-
 
 def plot_ellipses(ax, weights, means, covars):
     for n in range(means.shape[0]):
diff --git a/examples/mixture/plot_gmm.py b/examples/mixture/plot_gmm.py
index d3548eb6bed83..675aa341696ac 100644
--- a/examples/mixture/plot_gmm.py
+++ b/examples/mixture/plot_gmm.py
@@ -21,6 +21,7 @@
 full covariance matrices effectively even when there are less examples
 per cluster than there are dimensions in the data, due to
 regularization properties of the inference algorithm.
+
 """
 
 import itertools
diff --git a/examples/mixture/plot_gmm_covariances.py b/examples/mixture/plot_gmm_covariances.py
index 123c9846156a1..95b5d2c1ba90f 100644
--- a/examples/mixture/plot_gmm_covariances.py
+++ b/examples/mixture/plot_gmm_covariances.py
@@ -24,6 +24,7 @@
 crosses. The iris dataset is four-dimensional. Only the first two
 dimensions are shown here, and thus some points are separated in other
 dimensions.
+
 """
 
 # Author: Ron Weiss <ronweiss@gmail.com>, Gael Varoquaux
@@ -39,8 +40,6 @@
 from sklearn.mixture import GaussianMixture
 from sklearn.model_selection import StratifiedKFold
 
-print(__doc__)
-
 colors = ["navy", "turquoise", "darkorange"]
 
 
diff --git a/examples/mixture/plot_gmm_pdf.py b/examples/mixture/plot_gmm_pdf.py
index 73c46cd5e7fb1..70d58f22f8f41 100644
--- a/examples/mixture/plot_gmm_pdf.py
+++ b/examples/mixture/plot_gmm_pdf.py
@@ -6,6 +6,7 @@
 Plot the density estimation of a mixture of two Gaussians. Data is
 generated from two Gaussians with different centers and covariance
 matrices.
+
 """
 
 import numpy as np
diff --git a/examples/mixture/plot_gmm_selection.py b/examples/mixture/plot_gmm_selection.py
index 1d2aebe7b077a..4f9d6d69b1825 100644
--- a/examples/mixture/plot_gmm_selection.py
+++ b/examples/mixture/plot_gmm_selection.py
@@ -13,6 +13,7 @@
 
 In that case, the model with 2 components and full covariance
 (which corresponds to the true generative model) is selected.
+
 """
 
 import numpy as np
@@ -24,8 +25,6 @@
 
 from sklearn import mixture
 
-print(__doc__)
-
 # Number of samples per component
 n_samples = 500
 
diff --git a/examples/mixture/plot_gmm_sin.py b/examples/mixture/plot_gmm_sin.py
index 1b6ac48d436a7..76f0d30e4e9d8 100644
--- a/examples/mixture/plot_gmm_sin.py
+++ b/examples/mixture/plot_gmm_sin.py
@@ -48,8 +48,6 @@
 
 from sklearn import mixture
 
-print(__doc__)
-
 color_iter = itertools.cycle(["navy", "c", "cornflowerblue", "gold", "darkorange"])
 
 
diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index a28593eb90866..dc2618473ecb1 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -56,8 +56,6 @@
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline
 
-print(__doc__)
-
 # Display progress logs on stdout
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 
diff --git a/examples/model_selection/plot_confusion_matrix.py b/examples/model_selection/plot_confusion_matrix.py
index 251a4e175eb90..b891564db4025 100644
--- a/examples/model_selection/plot_confusion_matrix.py
+++ b/examples/model_selection/plot_confusion_matrix.py
@@ -24,8 +24,6 @@
 
 """
 
-print(__doc__)
-
 import numpy as np
 import matplotlib.pyplot as plt
 
diff --git a/examples/model_selection/plot_cv_indices.py b/examples/model_selection/plot_cv_indices.py
index 24deba92f11e5..1dc056e2072bd 100644
--- a/examples/model_selection/plot_cv_indices.py
+++ b/examples/model_selection/plot_cv_indices.py
@@ -9,6 +9,7 @@
 
 This example visualizes the behavior of several common scikit-learn objects
 for comparison.
+
 """
 
 from sklearn.model_selection import (
diff --git a/examples/model_selection/plot_cv_predict.py b/examples/model_selection/plot_cv_predict.py
index 8d31da4ff2fec..82ef0b8b81ae6 100644
--- a/examples/model_selection/plot_cv_predict.py
+++ b/examples/model_selection/plot_cv_predict.py
@@ -8,6 +8,7 @@
 errors.
 
 """
+
 from sklearn import datasets
 from sklearn.model_selection import cross_val_predict
 from sklearn import linear_model
diff --git a/examples/model_selection/plot_det.py b/examples/model_selection/plot_det.py
index ea83984c4d712..1869c5a43c059 100644
--- a/examples/model_selection/plot_det.py
+++ b/examples/model_selection/plot_det.py
@@ -47,6 +47,7 @@
       example.
 
 """
+
 import matplotlib.pyplot as plt
 
 from sklearn.datasets import make_classification
diff --git a/examples/model_selection/plot_grid_search_digits.py b/examples/model_selection/plot_grid_search_digits.py
index b6100489d4a53..0553ed9749616 100644
--- a/examples/model_selection/plot_grid_search_digits.py
+++ b/examples/model_selection/plot_grid_search_digits.py
@@ -15,14 +15,13 @@
 sections on :ref:`cross_validation` and :ref:`grid_search`.
 
 """
+
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import GridSearchCV
 from sklearn.metrics import classification_report
 from sklearn.svm import SVC
 
-print(__doc__)
-
 # Loading the Digits dataset
 digits = datasets.load_digits()
 
diff --git a/examples/model_selection/plot_grid_search_refit_callable.py b/examples/model_selection/plot_grid_search_refit_callable.py
index 3d363d6bb7384..53513aa4ba1ec 100644
--- a/examples/model_selection/plot_grid_search_refit_callable.py
+++ b/examples/model_selection/plot_grid_search_refit_callable.py
@@ -15,10 +15,10 @@
 [1] Hastie, T., Tibshirani, R.,, Friedman, J. (2001). Model Assessment and
 Selection. The Elements of Statistical Learning (pp. 219-260). New York,
 NY, USA: Springer New York Inc..
+
 """
-# Author: Wenhao Zhang <wenhaoz@ucla.edu>
 
-print(__doc__)
+# Author: Wenhao Zhang <wenhaoz@ucla.edu>
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/model_selection/plot_grid_search_stats.py b/examples/model_selection/plot_grid_search_stats.py
index 70dd204390491..179d860b42128 100644
--- a/examples/model_selection/plot_grid_search_stats.py
+++ b/examples/model_selection/plot_grid_search_stats.py
@@ -5,6 +5,7 @@
 
 This example illustrates how to statistically compare the performance of models
 trained and evaluated using :class:`~sklearn.model_selection.GridSearchCV`.
+
 """
 
 # %%
@@ -13,7 +14,6 @@
 # Datapoints will belong to one of two possible classes to be predicted by two
 # features. We will simulate 50 samples for each class:
 
-print(__doc__)
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.datasets import make_moons
diff --git a/examples/model_selection/plot_learning_curve.py b/examples/model_selection/plot_learning_curve.py
index 46b11cad2e7e4..a7045a0bf6c88 100644
--- a/examples/model_selection/plot_learning_curve.py
+++ b/examples/model_selection/plot_learning_curve.py
@@ -14,8 +14,8 @@
 show the times required by the models to train with various sizes of training
 dataset. The plots in the third row show how much time was required to train
 the models for each training sizes.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/model_selection/plot_multi_metric_evaluation.py b/examples/model_selection/plot_multi_metric_evaluation.py
index 7d6ce84eb5f26..0649c707b29ee 100644
--- a/examples/model_selection/plot_multi_metric_evaluation.py
+++ b/examples/model_selection/plot_multi_metric_evaluation.py
@@ -13,6 +13,7 @@
 
 The ``best_estimator_``, ``best_index_``, ``best_score_`` and ``best_params_``
 correspond to the scorer (key) that is set to the ``refit`` attribute.
+
 """
 
 # Author: Raghav RV <rvraghav93@gmail.com>
@@ -27,8 +28,6 @@
 from sklearn.metrics import accuracy_score
 from sklearn.tree import DecisionTreeClassifier
 
-print(__doc__)
-
 # %%
 # Running ``GridSearchCV`` using multiple evaluation metrics
 # ----------------------------------------------------------
diff --git a/examples/model_selection/plot_nested_cross_validation_iris.py b/examples/model_selection/plot_nested_cross_validation_iris.py
index 030b46a0c748d..81d89d93afe91 100644
--- a/examples/model_selection/plot_nested_cross_validation_iris.py
+++ b/examples/model_selection/plot_nested_cross_validation_iris.py
@@ -43,14 +43,13 @@
      <http://jmlr.csail.mit.edu/papers/volume11/cawley10a/cawley10a.pdf>`_
 
 """
+
 from sklearn.datasets import load_iris
 from matplotlib import pyplot as plt
 from sklearn.svm import SVC
 from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
 import numpy as np
 
-print(__doc__)
-
 # Number of random trials
 NUM_TRIALS = 30
 
diff --git a/examples/model_selection/plot_permutation_tests_for_classification.py b/examples/model_selection/plot_permutation_tests_for_classification.py
index 20fcf11b1dd9e..204de0ce3b731 100644
--- a/examples/model_selection/plot_permutation_tests_for_classification.py
+++ b/examples/model_selection/plot_permutation_tests_for_classification.py
@@ -6,6 +6,7 @@
 This example demonstrates the use of
 :func:`~sklearn.model_selection.permutation_test_score` to evaluate the
 significance of a cross-validated score using permutations.
+
 """
 
 # Authors:  Alexandre Gramfort <alexandre.gramfort@inria.fr>
diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index c0f0a97dd44ce..4d9ebcdc4abe2 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -90,6 +90,7 @@
              :func:`sklearn.metrics.precision_score`,
              :func:`sklearn.metrics.f1_score`
 """
+
 # %%
 # In binary classification settings
 # ---------------------------------
diff --git a/examples/model_selection/plot_randomized_search.py b/examples/model_selection/plot_randomized_search.py
index 7893f704404ac..b138ee17d11cd 100644
--- a/examples/model_selection/plot_randomized_search.py
+++ b/examples/model_selection/plot_randomized_search.py
@@ -17,8 +17,8 @@
 
 Note that in practice, one would not search over this many different parameters
 simultaneously using grid search, but pick only the ones deemed most important.
+
 """
-print(__doc__)
 
 import numpy as np
 
diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py
index 27c32820d5d96..e4f8d71f72b15 100644
--- a/examples/model_selection/plot_roc.py
+++ b/examples/model_selection/plot_roc.py
@@ -32,7 +32,6 @@
              :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index a1fd8cf24432e..97e70613d56ec 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -29,7 +29,6 @@
              :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py`,
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py
index c78de42ab9359..c7104f6d7144b 100644
--- a/examples/model_selection/plot_successive_halving_heatmap.py
+++ b/examples/model_selection/plot_successive_halving_heatmap.py
@@ -7,6 +7,7 @@
 :class:`~sklearn.model_selection.GridSearchCV`.
 
 """
+
 from time import time
 
 import matplotlib.pyplot as plt
@@ -20,8 +21,6 @@
 from sklearn.model_selection import HalvingGridSearchCV
 
 
-print(__doc__)
-
 # %%
 # We first define the parameter space for an :class:`~sklearn.svm.SVC`
 # estimator, and compute the time required to train a
diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py
index 11c204ef939d5..f23d2023ec251 100644
--- a/examples/model_selection/plot_successive_halving_iterations.py
+++ b/examples/model_selection/plot_successive_halving_iterations.py
@@ -9,6 +9,7 @@
 multiple candidates.
 
 """
+
 import pandas as pd
 from sklearn import datasets
 import matplotlib.pyplot as plt
@@ -20,8 +21,6 @@
 from sklearn.ensemble import RandomForestClassifier
 
 
-print(__doc__)
-
 # %%
 # We first define the parameter space and train a
 # :class:`~sklearn.model_selection.HalvingRandomSearchCV` instance.
diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py
index d9c00805aa390..2bde2486b48d6 100644
--- a/examples/model_selection/plot_train_error_vs_test_error.py
+++ b/examples/model_selection/plot_train_error_vs_test_error.py
@@ -11,7 +11,6 @@
 measured using the explained variance a.k.a. R^2.
 
 """
-print(__doc__)
 
 # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 # License: BSD 3 clause
diff --git a/examples/model_selection/plot_underfitting_overfitting.py b/examples/model_selection/plot_underfitting_overfitting.py
index c93da3f2dc5b5..ae8450b50cea9 100644
--- a/examples/model_selection/plot_underfitting_overfitting.py
+++ b/examples/model_selection/plot_underfitting_overfitting.py
@@ -18,9 +18,8 @@
 cross-validation. We calculate the mean squared error (MSE) on the validation
 set, the higher, the less likely the model generalizes correctly from the
 training data.
-"""
 
-print(__doc__)
+"""
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py
index 524dd71eda40f..0ad87e25ae746 100644
--- a/examples/model_selection/plot_validation_curve.py
+++ b/examples/model_selection/plot_validation_curve.py
@@ -10,8 +10,8 @@
 values for both scores, i.e. the classifier is performing fairly well. If gamma
 is too high, the classifier will overfit, which means that the training score
 is good but the validation score is poor.
+
 """
-print(__doc__)
 
 import matplotlib.pyplot as plt
 import numpy as np
diff --git a/examples/multioutput/plot_classifier_chain_yeast.py b/examples/multioutput/plot_classifier_chain_yeast.py
index a92249b97e4ad..791a619d07f5f 100644
--- a/examples/multioutput/plot_classifier_chain_yeast.py
+++ b/examples/multioutput/plot_classifier_chain_yeast.py
@@ -30,6 +30,7 @@
 ensemble is greater than that of the independent models and tends to exceed
 the score of each chain in the ensemble (although this is not guaranteed
 with randomly ordered chains).
+
 """
 
 # Author: Adam Kleczewski
@@ -44,8 +45,6 @@
 from sklearn.metrics import jaccard_score
 from sklearn.linear_model import LogisticRegression
 
-print(__doc__)
-
 # Load a multi-label dataset from https://www.openml.org/d/40597
 X, Y = fetch_openml("yeast", version=4, return_X_y=True)
 Y = Y == "TRUE"
diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py
index 055796600fd4e..769e839fc9b21 100644
--- a/examples/neighbors/approximate_nearest_neighbors.py
+++ b/examples/neighbors/approximate_nearest_neighbors.py
@@ -36,6 +36,7 @@
     TSNE with internal NearestNeighbors: 64.984 sec
 
 """
+
 # Author: Tom Dupre la Tour
 #
 # License: BSD 3 clause
@@ -67,8 +68,6 @@
 from sklearn.manifold import TSNE
 from sklearn.utils import shuffle
 
-print(__doc__)
-
 
 class NMSlibTransformer(TransformerMixin, BaseEstimator):
     """Wrapper for using nmslib as sklearn's KNeighborsTransformer"""
diff --git a/examples/neighbors/plot_caching_nearest_neighbors.py b/examples/neighbors/plot_caching_nearest_neighbors.py
index 0ffe6b92d3053..00be6470c1591 100644
--- a/examples/neighbors/plot_caching_nearest_neighbors.py
+++ b/examples/neighbors/plot_caching_nearest_neighbors.py
@@ -15,7 +15,9 @@
 do not need to recompute the graph. Here the durations are small since the
 dataset is small, but the gain can be more substantial when the dataset grows
 larger, or when the grid of parameter to search is large.
+
 """
+
 # Author: Tom Dupre la Tour
 #
 # License: BSD 3 clause
@@ -27,8 +29,6 @@
 from sklearn.datasets import load_digits
 from sklearn.pipeline import Pipeline
 
-print(__doc__)
-
 X, y = load_digits(return_X_y=True)
 n_neighbors_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]
 
diff --git a/examples/neighbors/plot_classification.py b/examples/neighbors/plot_classification.py
index 08e22a2874a16..59d6285d9dcd0 100644
--- a/examples/neighbors/plot_classification.py
+++ b/examples/neighbors/plot_classification.py
@@ -5,8 +5,8 @@
 
 Sample usage of Nearest Neighbors classification.
 It will plot the decision boundaries for each class.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/neighbors/plot_digits_kde_sampling.py b/examples/neighbors/plot_digits_kde_sampling.py
index 44e4b1f65c38b..e580f9fa178bc 100644
--- a/examples/neighbors/plot_digits_kde_sampling.py
+++ b/examples/neighbors/plot_digits_kde_sampling.py
@@ -8,6 +8,7 @@
 a generative model for a dataset.  With this generative model in place,
 new samples can be drawn.  These new samples reflect the underlying model
 of the data.
+
 """
 
 import numpy as np
diff --git a/examples/neighbors/plot_kde_1d.py b/examples/neighbors/plot_kde_1d.py
index 857be4feaa367..6ff0491141d1a 100644
--- a/examples/neighbors/plot_kde_1d.py
+++ b/examples/neighbors/plot_kde_1d.py
@@ -25,7 +25,9 @@
 samples in 1 dimension.  Though this example uses 1D distributions, kernel
 density estimation is easily and efficiently extensible to higher dimensions
 as well.
+
 """
+
 # Author: Jake Vanderplas <jakevdp@cs.washington.edu>
 #
 import numpy as np
diff --git a/examples/neighbors/plot_lof_novelty_detection.py b/examples/neighbors/plot_lof_novelty_detection.py
index a37d28bb6d4bb..277134cc77673 100644
--- a/examples/neighbors/plot_lof_novelty_detection.py
+++ b/examples/neighbors/plot_lof_novelty_detection.py
@@ -22,6 +22,7 @@
 local outliers.
 In practice, such information is generally not available, and taking
 n_neighbors=20 appears to work well in general.
+
 """
 
 import numpy as np
@@ -29,8 +30,6 @@
 import matplotlib.pyplot as plt
 from sklearn.neighbors import LocalOutlierFactor
 
-print(__doc__)
-
 np.random.seed(42)
 
 xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
diff --git a/examples/neighbors/plot_lof_outlier_detection.py b/examples/neighbors/plot_lof_outlier_detection.py
index 0c7706acc8b93..1512173965889 100644
--- a/examples/neighbors/plot_lof_outlier_detection.py
+++ b/examples/neighbors/plot_lof_outlier_detection.py
@@ -21,14 +21,13 @@
 local outliers.
 In practice, such information is generally not available, and taking
 n_neighbors=20 appears to work well in general.
+
 """
 
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.neighbors import LocalOutlierFactor
 
-print(__doc__)
-
 np.random.seed(42)
 
 # Generate train data
diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py
index 79b4a7a370557..7fb1d8f31c88f 100644
--- a/examples/neighbors/plot_nca_classification.py
+++ b/examples/neighbors/plot_nca_classification.py
@@ -12,6 +12,7 @@
 Components Analysis. The latter aims to find a linear transformation that
 maximises the (stochastic) nearest neighbor classification accuracy on the
 training set.
+
 """
 
 # License: BSD 3 clause
@@ -26,8 +27,6 @@
 from sklearn.pipeline import Pipeline
 
 
-print(__doc__)
-
 n_neighbors = 1
 
 dataset = datasets.load_iris()
diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py
index 95be0e7ec327d..d245e0223ccfa 100644
--- a/examples/neighbors/plot_nca_dim_reduction.py
+++ b/examples/neighbors/plot_nca_dim_reduction.py
@@ -25,6 +25,7 @@
 
 One can see that NCA enforces a clustering of the data that is visually
 meaningful despite the large reduction in dimension.
+
 """
 
 # License: BSD 3 clause
@@ -39,8 +40,6 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 
-print(__doc__)
-
 n_neighbors = 3
 random_state = 0
 
diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py
index ec158e5ad9824..d722ffa5be033 100644
--- a/examples/neighbors/plot_nca_illustration.py
+++ b/examples/neighbors/plot_nca_illustration.py
@@ -7,6 +7,7 @@
 the nearest neighbors classification accuracy. It provides a visual
 representation of this metric compared to the original point
 space. Please refer to the :ref:`User Guide <nca>` for more information.
+
 """
 
 # License: BSD 3 clause
@@ -18,8 +19,6 @@
 from matplotlib import cm
 from scipy.special import logsumexp
 
-print(__doc__)
-
 # %%
 # Original points
 # ---------------
diff --git a/examples/neighbors/plot_nearest_centroid.py b/examples/neighbors/plot_nearest_centroid.py
index 9e5d21777e718..a2d0bea5623d7 100644
--- a/examples/neighbors/plot_nearest_centroid.py
+++ b/examples/neighbors/plot_nearest_centroid.py
@@ -5,8 +5,8 @@
 
 Sample usage of Nearest Centroid classification.
 It will plot the decision boundaries for each class.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/neighbors/plot_regression.py b/examples/neighbors/plot_regression.py
index ba91a1a65d2a8..76baf96b26063 100644
--- a/examples/neighbors/plot_regression.py
+++ b/examples/neighbors/plot_regression.py
@@ -8,7 +8,6 @@
 target using both barycenter and constant weights.
 
 """
-print(__doc__)
 
 # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 #         Fabian Pedregosa <fabian.pedregosa@inria.fr>
diff --git a/examples/neighbors/plot_species_kde.py b/examples/neighbors/plot_species_kde.py
index 9607d1a20aae4..c409d354ec986 100644
--- a/examples/neighbors/plot_species_kde.py
+++ b/examples/neighbors/plot_species_kde.py
@@ -35,6 +35,7 @@
    S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
    190:231-259, 2006.
 """  # noqa: E501
+
 # Author: Jake Vanderplas <jakevdp@cs.washington.edu>
 #
 # License: BSD 3 clause
diff --git a/examples/neural_networks/plot_mlp_alpha.py b/examples/neural_networks/plot_mlp_alpha.py
index 8e35f61fe2494..3d316e4e85db0 100644
--- a/examples/neural_networks/plot_mlp_alpha.py
+++ b/examples/neural_networks/plot_mlp_alpha.py
@@ -14,9 +14,8 @@
 Similarly, decreasing alpha may fix high bias (a sign of underfitting) by
 encouraging larger weights, potentially resulting in a more complicated
 decision boundary.
-"""
-print(__doc__)
 
+"""
 
 # Author: Issam H. Laradji
 # License: BSD 3 clause
diff --git a/examples/neural_networks/plot_mlp_training_curves.py b/examples/neural_networks/plot_mlp_training_curves.py
index 45c3d3529da80..3fbddda879162 100644
--- a/examples/neural_networks/plot_mlp_training_curves.py
+++ b/examples/neural_networks/plot_mlp_training_curves.py
@@ -11,9 +11,8 @@
 
 Note that those results can be highly dependent on the value of
 ``learning_rate_init``.
-"""
 
-print(__doc__)
+"""
 
 import warnings
 
diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py
index 0fba5412b96d0..686cf31a8ed79 100644
--- a/examples/neural_networks/plot_mnist_filters.py
+++ b/examples/neural_networks/plot_mnist_filters.py
@@ -21,6 +21,7 @@
 smoother spatial appearance. The example will throw a warning because it
 doesn't converge, in this case this is what we want because of CI's time
 constraints.
+
 """
 
 import warnings
@@ -30,8 +31,6 @@
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.neural_network import MLPClassifier
 
-print(__doc__)
-
 # Load data from https://www.openml.org/d/554
 X, y = fetch_openml("mnist_784", version=1, return_X_y=True)
 X = X / 255.0
diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py
index c17bebfc38cff..27691cc0b99ed 100644
--- a/examples/neural_networks/plot_rbm_logistic_classification.py
+++ b/examples/neural_networks/plot_rbm_logistic_classification.py
@@ -23,8 +23,8 @@
 Logistic regression on raw pixel values is presented for comparison. The
 example shows that the features extracted by the BernoulliRBM help improve the
 classification accuracy.
+
 """
-print(__doc__)
 
 # Authors: Yann N. Dauphin, Vlad Niculae, Gabriel Synnaeve
 # License: BSD
diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py
index 66e428dd1a1e2..bbb114349a5e9 100644
--- a/examples/preprocessing/plot_all_scaling.py
+++ b/examples/preprocessing/plot_all_scaling.py
@@ -63,8 +63,6 @@
 
 from sklearn.datasets import fetch_california_housing
 
-print(__doc__)
-
 dataset = fetch_california_housing()
 X_full, y_full = dataset.data, dataset.target
 feature_names = dataset.feature_names
diff --git a/examples/preprocessing/plot_discretization.py b/examples/preprocessing/plot_discretization.py
index 59018c93c737d..d064ea705903b 100644
--- a/examples/preprocessing/plot_discretization.py
+++ b/examples/preprocessing/plot_discretization.py
@@ -39,8 +39,6 @@
 from sklearn.preprocessing import KBinsDiscretizer
 from sklearn.tree import DecisionTreeRegressor
 
-print(__doc__)
-
 # construct the dataset
 rnd = np.random.RandomState(42)
 X = rnd.uniform(-3, 3, size=100)
diff --git a/examples/preprocessing/plot_discretization_classification.py b/examples/preprocessing/plot_discretization_classification.py
index ebbb9e1400fa4..59cdbdb766a12 100644
--- a/examples/preprocessing/plot_discretization_classification.py
+++ b/examples/preprocessing/plot_discretization_classification.py
@@ -26,7 +26,9 @@
 The plots show training points in solid colors and testing points
 semi-transparent. The lower right shows the classification accuracy on the test
 set.
+
 """
+
 # Code source: Tom Dupré la Tour
 # Adapted from plot_classifier_comparison by Gaël Varoquaux and Andreas Müller
 #
@@ -47,8 +49,6 @@
 from sklearn.utils._testing import ignore_warnings
 from sklearn.exceptions import ConvergenceWarning
 
-print(__doc__)
-
 h = 0.02  # step size in the mesh
 
 
diff --git a/examples/preprocessing/plot_discretization_strategies.py b/examples/preprocessing/plot_discretization_strategies.py
index bee3a6314cd52..d6b9cb16ee53c 100644
--- a/examples/preprocessing/plot_discretization_strategies.py
+++ b/examples/preprocessing/plot_discretization_strategies.py
@@ -14,6 +14,7 @@
   procedure.
 
 The plot shows the regions where the discretized encoding is constant.
+
 """
 
 # Author: Tom Dupré la Tour
@@ -25,8 +26,6 @@
 from sklearn.preprocessing import KBinsDiscretizer
 from sklearn.datasets import make_blobs
 
-print(__doc__)
-
 strategies = ["uniform", "quantile", "kmeans"]
 
 n_samples = 200
diff --git a/examples/preprocessing/plot_map_data_to_normal.py b/examples/preprocessing/plot_map_data_to_normal.py
index ef605cedbe5f5..42a61d84fa384 100644
--- a/examples/preprocessing/plot_map_data_to_normal.py
+++ b/examples/preprocessing/plot_map_data_to_normal.py
@@ -31,6 +31,7 @@
 
 On "small" datasets (less than a few hundred points), the quantile transformer
 is prone to overfitting. The use of the power transform is then recommended.
+
 """
 
 # Author: Eric Chang <ericchang2017@u.northwestern.edu>
@@ -44,8 +45,6 @@
 from sklearn.preprocessing import QuantileTransformer
 from sklearn.model_selection import train_test_split
 
-print(__doc__)
-
 
 N_SAMPLES = 1000
 FONT_SIZE = 6
diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py
index 44801dfc1c715..4c7981e4a3170 100644
--- a/examples/preprocessing/plot_scaling_importance.py
+++ b/examples/preprocessing/plot_scaling_importance.py
@@ -40,6 +40,7 @@
 which is scaled before PCA vastly outperforms the unscaled version.
 
 """
+
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
@@ -49,8 +50,6 @@
 from sklearn.datasets import load_wine
 from sklearn.pipeline import make_pipeline
 
-print(__doc__)
-
 # Code source: Tyler Lanigan <tylerlanigan@gmail.com>
 #              Sebastian Raschka <mail@sebastianraschka.com>
 
diff --git a/examples/release_highlights/plot_release_highlights_0_22_0.py b/examples/release_highlights/plot_release_highlights_0_22_0.py
index adac61ed9688f..520e2185ae2ce 100644
--- a/examples/release_highlights/plot_release_highlights_0_22_0.py
+++ b/examples/release_highlights/plot_release_highlights_0_22_0.py
@@ -17,6 +17,7 @@
 or with conda::
 
     conda install -c conda-forge scikit-learn
+
 """
 
 # %%
diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py
index d81f5886e6c63..4d6a914e01fb3 100644
--- a/examples/release_highlights/plot_release_highlights_0_23_0.py
+++ b/examples/release_highlights/plot_release_highlights_0_23_0.py
@@ -18,6 +18,7 @@
 or with conda::
 
     conda install -c conda-forge scikit-learn
+
 """
 
 ##############################################################################
diff --git a/examples/release_highlights/plot_release_highlights_0_24_0.py b/examples/release_highlights/plot_release_highlights_0_24_0.py
index e54e91fe5aafb..7e044db11ccd3 100644
--- a/examples/release_highlights/plot_release_highlights_0_24_0.py
+++ b/examples/release_highlights/plot_release_highlights_0_24_0.py
@@ -18,6 +18,7 @@
 or with conda::
 
     conda install -c conda-forge scikit-learn
+
 """
 
 ##############################################################################
diff --git a/examples/release_highlights/plot_release_highlights_1_0_0.py b/examples/release_highlights/plot_release_highlights_1_0_0.py
index d6bad517cd1e7..079d87a5d4f51 100644
--- a/examples/release_highlights/plot_release_highlights_1_0_0.py
+++ b/examples/release_highlights/plot_release_highlights_1_0_0.py
@@ -24,6 +24,7 @@
 or with conda::
 
     conda install -c conda-forge scikit-learn
+
 """
 
 ##############################################################################
diff --git a/examples/semi_supervised/plot_label_propagation_digits.py b/examples/semi_supervised/plot_label_propagation_digits.py
index d75edca605b99..b91864fd2a541 100644
--- a/examples/semi_supervised/plot_label_propagation_digits.py
+++ b/examples/semi_supervised/plot_label_propagation_digits.py
@@ -13,8 +13,8 @@
 class will be very good.
 
 At the end, the top 10 most uncertain predictions will be shown.
+
 """
-print(__doc__)
 
 # Authors: Clay Woolam <clay@woolam.org>
 # License: BSD
diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
index e6c19403aa728..7d4a348cad9b6 100644
--- a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
+++ b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
@@ -17,8 +17,8 @@
 A plot will appear showing the top 5 most uncertain digits for each iteration
 of training. These may or may not contain mistakes, but we will train the next
 model with their true labels.
+
 """
-print(__doc__)
 
 # Authors: Clay Woolam <clay@woolam.org>
 # License: BSD
diff --git a/examples/semi_supervised/plot_label_propagation_structure.py b/examples/semi_supervised/plot_label_propagation_structure.py
index e94eab6e16dc1..b6d8136eb1bd9 100644
--- a/examples/semi_supervised/plot_label_propagation_structure.py
+++ b/examples/semi_supervised/plot_label_propagation_structure.py
@@ -8,8 +8,8 @@
 labeled "red" and the inner circle "blue". Because both label groups
 lie inside their own distinct shape, we can see that the labels
 propagate correctly around the circle.
+
 """
-print(__doc__)
 
 # Authors: Clay Woolam <clay@woolam.org>
 #          Andreas Mueller <amueller@ais.uni-bonn.de>
diff --git a/examples/semi_supervised/plot_self_training_varying_threshold.py b/examples/semi_supervised/plot_self_training_varying_threshold.py
index fa2ac289086d3..9391e9d8b3c9b 100644
--- a/examples/semi_supervised/plot_self_training_varying_threshold.py
+++ b/examples/semi_supervised/plot_self_training_varying_threshold.py
@@ -26,8 +26,8 @@
 
 The optimal accuracy lies in between both of these extremes at a threshold of
 around 0.7.
+
 """
-print(__doc__)
 
 # Authors: Oliver Rausch <rauscho@ethz.ch>
 # License: BSD
diff --git a/examples/semi_supervised/plot_semi_supervised_newsgroups.py b/examples/semi_supervised/plot_semi_supervised_newsgroups.py
index 7316417e86120..6d02b5296868c 100644
--- a/examples/semi_supervised/plot_semi_supervised_newsgroups.py
+++ b/examples/semi_supervised/plot_semi_supervised_newsgroups.py
@@ -8,7 +8,9 @@
 
 You can adjust the number of categories by giving their names to the dataset
 loader or setting them to `None` to get all 20 of them.
+
 """
+
 import os
 
 import numpy as np
diff --git a/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py b/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py
index f9703fd44a902..402cd41d6a0f2 100644
--- a/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py
+++ b/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py
@@ -13,7 +13,6 @@
 identical to training the SVC on 100% of the data.
 
 """
-print(__doc__)
 
 # Authors: Clay Woolam   <clay@woolam.org>
 #          Oliver Rausch <rauscho@ethz.ch>
diff --git a/examples/svm/plot_custom_kernel.py b/examples/svm/plot_custom_kernel.py
index 86fe0aa8e585e..1dd0f2af6e145 100644
--- a/examples/svm/plot_custom_kernel.py
+++ b/examples/svm/plot_custom_kernel.py
@@ -7,7 +7,6 @@
 plot the decision surface and the support vectors.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/svm/plot_iris_svc.py b/examples/svm/plot_iris_svc.py
index 10b64f1c62e09..c4e5dc8314784 100644
--- a/examples/svm/plot_iris_svc.py
+++ b/examples/svm/plot_iris_svc.py
@@ -33,7 +33,6 @@
    more realistic high-dimensional problems.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/svm/plot_oneclass.py b/examples/svm/plot_oneclass.py
index a3b060a0e67c0..082cbcd6de2be 100644
--- a/examples/svm/plot_oneclass.py
+++ b/examples/svm/plot_oneclass.py
@@ -8,8 +8,8 @@
 :ref:`One-class SVM <svm_outlier_detection>` is an unsupervised
 algorithm that learns a decision function for novelty detection:
 classifying new data as similar or different to the training set.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index 19cae930d93a8..d0e921aa85cda 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -74,7 +74,6 @@
 map.
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/svm/plot_separating_hyperplane.py b/examples/svm/plot_separating_hyperplane.py
index cfb4a195e8a12..627aa3b8a2e72 100644
--- a/examples/svm/plot_separating_hyperplane.py
+++ b/examples/svm/plot_separating_hyperplane.py
@@ -6,8 +6,8 @@
 Plot the maximum margin separating hyperplane within a two-class
 separable dataset using a Support Vector Machine classifier with
 linear kernel.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/svm/plot_separating_hyperplane_unbalanced.py b/examples/svm/plot_separating_hyperplane_unbalanced.py
index a1db23f5f5ca8..b810016ffc74c 100644
--- a/examples/svm/plot_separating_hyperplane_unbalanced.py
+++ b/examples/svm/plot_separating_hyperplane_unbalanced.py
@@ -24,7 +24,6 @@
         clf = SGDClassifier(n_iter=100, alpha=0.01)
 
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py
index 3fa7d05240df0..ce34e5b7ab3e2 100644
--- a/examples/svm/plot_svm_anova.py
+++ b/examples/svm/plot_svm_anova.py
@@ -7,8 +7,8 @@
 SVC (support vector classifier) to improve the classification scores. We use
 the iris dataset (4 features) and add 36 non-informative features. We can find
 that our model achieves best performance when we select around 10% of features.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/svm/plot_svm_kernels.py b/examples/svm/plot_svm_kernels.py
index 6435f6ac03433..dd7eb43e15231 100644
--- a/examples/svm/plot_svm_kernels.py
+++ b/examples/svm/plot_svm_kernels.py
@@ -10,8 +10,6 @@
 
 
 """
-print(__doc__)
-
 
 # Code source: Gaël Varoquaux
 # License: BSD 3 clause
diff --git a/examples/svm/plot_svm_margin.py b/examples/svm/plot_svm_margin.py
index 021122c0eb32e..dbf9a42bd6164 100644
--- a/examples/svm/plot_svm_margin.py
+++ b/examples/svm/plot_svm_margin.py
@@ -13,8 +13,6 @@
 the margins to be calculated using all the data in the area.
 
 """
-print(__doc__)
-
 
 # Code source: Gaël Varoquaux
 # Modified for documentation by Jaques Grobler
diff --git a/examples/svm/plot_svm_nonlinear.py b/examples/svm/plot_svm_nonlinear.py
index 09ae1febc8873..f88231b4b6af4 100644
--- a/examples/svm/plot_svm_nonlinear.py
+++ b/examples/svm/plot_svm_nonlinear.py
@@ -8,8 +8,8 @@
 inputs.
 
 The color map illustrates the decision function learned by the SVC.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/svm/plot_svm_regression.py b/examples/svm/plot_svm_regression.py
index ca45951a9f224..8f0f2df27c4ce 100644
--- a/examples/svm/plot_svm_regression.py
+++ b/examples/svm/plot_svm_regression.py
@@ -6,7 +6,6 @@
 Toy example of 1D regression using linear, polynomial and RBF kernels.
 
 """
-print(__doc__)
 
 import numpy as np
 from sklearn.svm import SVR
diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index cc3793fefc7d3..ee99fe4b50f20 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -75,15 +75,13 @@
     Two separate datasets are used for the two different plots. The reason
     behind this is the `l1` case works better on sparse data, while `l2`
     is better suited to the non-sparse case.
-"""
-print(__doc__)
 
+"""
 
 # Author: Andreas Mueller <amueller@ais.uni-bonn.de>
 #         Jaques Grobler <jaques.grobler@inria.fr>
 # License: BSD 3 clause
 
-
 import numpy as np
 import matplotlib.pyplot as plt
 
diff --git a/examples/svm/plot_svm_tie_breaking.py b/examples/svm/plot_svm_tie_breaking.py
index aea34ac97fd8a..e12460b494c02 100644
--- a/examples/svm/plot_svm_tie_breaking.py
+++ b/examples/svm/plot_svm_tie_breaking.py
@@ -11,14 +11,12 @@
 tied. If ``break_ties=False``, all input in that area would be classified as
 one class, whereas if ``break_ties=True``, the tie-breaking mechanism will
 create a non-convex decision boundary in that area.
-"""
-print(__doc__)
 
+"""
 
 # Code source: Andreas Mueller, Adrin Jalali
 # License: BSD 3 clause
 
-
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.svm import SVC
diff --git a/examples/svm/plot_weighted_samples.py b/examples/svm/plot_weighted_samples.py
index f25390446fc87..d90b1516996ab 100644
--- a/examples/svm/plot_weighted_samples.py
+++ b/examples/svm/plot_weighted_samples.py
@@ -11,8 +11,8 @@
 subtle.
 To emphasize the effect here, we particularly weight outliers, making the
 deformation of the decision boundary very visible.
+
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py
index 5351bb5bef3e3..7a1fd0d8d288a 100644
--- a/examples/text/plot_document_classification_20newsgroups.py
+++ b/examples/text/plot_document_classification_20newsgroups.py
@@ -18,6 +18,7 @@
 #         Mathieu Blondel <mathieu@mblondel.org>
 #         Lars Buitinck
 # License: BSD 3 clause
+
 import logging
 import numpy as np
 from optparse import OptionParser
diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py
index 832dabb6ba067..aaa9f95db723d 100644
--- a/examples/text/plot_document_clustering.py
+++ b/examples/text/plot_document_clustering.py
@@ -52,6 +52,7 @@
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
 #         Lars Buitinck
 # License: BSD 3 clause
+
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -118,6 +119,7 @@
 
 print(__doc__)
 op.print_help()
+print()
 
 
 def is_interactive():
diff --git a/examples/text/plot_hashing_vs_dict_vectorizer.py b/examples/text/plot_hashing_vs_dict_vectorizer.py
index 1284fb7f164cd..672820108aebd 100644
--- a/examples/text/plot_hashing_vs_dict_vectorizer.py
+++ b/examples/text/plot_hashing_vs_dict_vectorizer.py
@@ -13,10 +13,12 @@
 
 A discrepancy between the number of terms reported for DictVectorizer and
 for FeatureHasher is to be expected due to hash collisions.
+
 """
 
 # Author: Lars Buitinck
 # License: BSD 3 clause
+
 from collections import defaultdict
 import re
 import sys
diff --git a/examples/tree/plot_cost_complexity_pruning.py b/examples/tree/plot_cost_complexity_pruning.py
index 9f317cc96b4fa..d21d163c9a1e3 100644
--- a/examples/tree/plot_cost_complexity_pruning.py
+++ b/examples/tree/plot_cost_complexity_pruning.py
@@ -17,7 +17,6 @@
 See also :ref:`minimal_cost_complexity_pruning` for details on pruning.
 """
 
-print(__doc__)
 import matplotlib.pyplot as plt
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_breast_cancer
diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py
index 7dbe203163de2..ec39f5b2b97ca 100644
--- a/examples/tree/plot_iris_dtc.py
+++ b/examples/tree/plot_iris_dtc.py
@@ -14,7 +14,6 @@
 
 We also show the tree structure of a model built on all of the features.
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/tree/plot_tree_regression.py b/examples/tree/plot_tree_regression.py
index 7a71b0450f2b7..6ed28a5cbfa99 100644
--- a/examples/tree/plot_tree_regression.py
+++ b/examples/tree/plot_tree_regression.py
@@ -13,7 +13,6 @@
 `max_depth` parameter) is set too high, the decision trees learn too fine
 details of the training data and learn from the noise, i.e. they overfit.
 """
-print(__doc__)
 
 # Import the necessary modules and libraries
 import numpy as np
diff --git a/examples/tree/plot_tree_regression_multioutput.py b/examples/tree/plot_tree_regression_multioutput.py
index ab9a530b0faee..a75652a6ddd56 100644
--- a/examples/tree/plot_tree_regression_multioutput.py
+++ b/examples/tree/plot_tree_regression_multioutput.py
@@ -14,7 +14,6 @@
 `max_depth` parameter) is set too high, the decision trees learn too fine
 details of the training data and learn from the noise, i.e. they overfit.
 """
-print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/examples/tree/plot_unveil_tree_structure.py b/examples/tree/plot_unveil_tree_structure.py
index 65aa685463bac..447011ff831dd 100644
--- a/examples/tree/plot_unveil_tree_structure.py
+++ b/examples/tree/plot_unveil_tree_structure.py
@@ -15,6 +15,7 @@
 - the decision path shared by a group of samples.
 
 """
+
 import numpy as np
 from matplotlib import pyplot as plt