diff --git a/benchmarks/bench_glm.py b/benchmarks/bench_glm.py index 598ba5fe2a95d..660c208402b7d 100644 --- a/benchmarks/bench_glm.py +++ b/benchmarks/bench_glm.py @@ -12,7 +12,7 @@ if __name__ == '__main__': - import pylab as pl + import matplotlib.pyplot as plt n_iter = 40 @@ -46,13 +46,13 @@ lasso.fit(X, Y) time_lasso[i] = total_seconds(datetime.now() - start) - pl.figure('scikit-learn GLM benchmark results') - pl.xlabel('Dimensions') - pl.ylabel('Time (s)') - pl.plot(dimensions, time_ridge, color='r') - pl.plot(dimensions, time_ols, color='g') - pl.plot(dimensions, time_lasso, color='b') + plt.figure('scikit-learn GLM benchmark results') + plt.xlabel('Dimensions') + plt.ylabel('Time (s)') + plt.plot(dimensions, time_ridge, color='r') + plt.plot(dimensions, time_ols, color='g') + plt.plot(dimensions, time_lasso, color='b') - pl.legend(['Ridge', 'OLS', 'LassoLars'], loc='upper left') - pl.axis('tight') - pl.show() + plt.legend(['Ridge', 'OLS', 'LassoLars'], loc='upper left') + plt.axis('tight') + plt.show() diff --git a/benchmarks/bench_glmnet.py b/benchmarks/bench_glmnet.py index 4b9c35040c526..b05971ba1ff20 100644 --- a/benchmarks/bench_glmnet.py +++ b/benchmarks/bench_glmnet.py @@ -47,8 +47,8 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): if __name__ == '__main__': from glmnet.elastic_net import Lasso as GlmnetLasso from sklearn.linear_model import Lasso as ScikitLasso - # Delayed import of pylab - import pylab as pl + # Delayed import of matplotlib.pyplot + import matplotlib.pyplot as plt scikit_results = [] glmnet_results = [] @@ -76,15 +76,15 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): print("benchmarking glmnet: ") glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) - pl.clf() + plt.clf() xx = range(0, n * step, step) - pl.title('Lasso regression on sample dataset (%d features)' % n_features) - pl.plot(xx, scikit_results, 'b-', label='scikit-learn') - pl.plot(xx, glmnet_results, 'r-', label='glmnet') - pl.legend() - pl.xlabel('number of samples to classify') - pl.ylabel('Time (s)') - pl.show() + plt.title('Lasso regression on sample dataset (%d features)' % n_features) + plt.plot(xx, scikit_results, 'b-', label='scikit-learn') + plt.plot(xx, glmnet_results, 'r-', label='glmnet') + plt.legend() + plt.xlabel('number of samples to classify') + plt.ylabel('Time (s)') + plt.show() # now do a benchmark where the number of points is fixed # and the variable is the number of features @@ -117,12 +117,12 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) xx = np.arange(100, 100 + n * step, step) - pl.figure('scikit-learn vs. glmnet benchmark results') - pl.title('Regression in high dimensional spaces (%d samples)' % n_samples) - pl.plot(xx, scikit_results, 'b-', label='scikit-learn') - pl.plot(xx, glmnet_results, 'r-', label='glmnet') - pl.legend() - pl.xlabel('number of features') - pl.ylabel('Time (s)') - pl.axis('tight') - pl.show() + plt.figure('scikit-learn vs. 
glmnet benchmark results') + plt.title('Regression in high dimensional spaces (%d samples)' % n_samples) + plt.plot(xx, scikit_results, 'b-', label='scikit-learn') + plt.plot(xx, glmnet_results, 'r-', label='glmnet') + plt.legend() + plt.xlabel('number of features') + plt.ylabel('Time (s)') + plt.axis('tight') + plt.show() diff --git a/benchmarks/bench_lasso.py b/benchmarks/bench_lasso.py index 7934fe1fdb0ba..7ed774ad2e790 100644 --- a/benchmarks/bench_lasso.py +++ b/benchmarks/bench_lasso.py @@ -59,7 +59,7 @@ def compute_bench(alpha, n_samples, n_features, precompute): if __name__ == '__main__': from sklearn.linear_model import Lasso, LassoLars - import pylab as pl + import matplotlib.pyplot as plt alpha = 0.01 # regularization parameter @@ -68,28 +68,29 @@ def compute_bench(alpha, n_samples, n_features, precompute): lasso_results, lars_lasso_results = compute_bench(alpha, list_n_samples, [n_features], precompute=True) - pl.figure('scikit-learn LASSO benchmark results') - pl.subplot(211) - pl.plot(list_n_samples, lasso_results, 'b-', + plt.figure('scikit-learn LASSO benchmark results') + plt.subplot(211) + plt.plot(list_n_samples, lasso_results, 'b-', label='Lasso') - pl.plot(list_n_samples, lars_lasso_results, 'r-', + plt.plot(list_n_samples, lars_lasso_results, 'r-', label='LassoLars') - pl.title('precomputed Gram matrix, %d features, alpha=%s' % (n_features, alpha)) - pl.legend(loc='upper left') - pl.xlabel('number of samples') - pl.ylabel('Time (s)') - pl.axis('tight') + plt.title('precomputed Gram matrix, %d features, alpha=%s' % (n_features, + alpha)) + plt.legend(loc='upper left') + plt.xlabel('number of samples') + plt.ylabel('Time (s)') + plt.axis('tight') n_samples = 2000 list_n_features = np.linspace(500, 3000, 5).astype(np.int) lasso_results, lars_lasso_results = compute_bench(alpha, [n_samples], list_n_features, precompute=False) - pl.subplot(212) - pl.plot(list_n_features, lasso_results, 'b-', label='Lasso') - pl.plot(list_n_features, lars_lasso_results, 'r-', label='LassoLars') - pl.title('%d samples, alpha=%s' % (n_samples, alpha)) - pl.legend(loc='upper left') - pl.xlabel('number of features') - pl.ylabel('Time (s)') - pl.axis('tight') - pl.show() + plt.subplot(212) + plt.plot(list_n_features, lasso_results, 'b-', label='Lasso') + plt.plot(list_n_features, lars_lasso_results, 'r-', label='LassoLars') + plt.title('%d samples, alpha=%s' % (n_samples, alpha)) + plt.legend(loc='upper left') + plt.xlabel('number of features') + plt.ylabel('Time (s)') + plt.axis('tight') + plt.show() diff --git a/benchmarks/bench_plot_neighbors.py b/benchmarks/bench_plot_neighbors.py index d163fb46b7193..19bd22a7157a5 100644 --- a/benchmarks/bench_plot_neighbors.py +++ b/benchmarks/bench_plot_neighbors.py @@ -4,7 +4,7 @@ from time import time import numpy as np -import pylab as pl +import matplotlib.pyplot as plt from matplotlib import ticker from sklearn import neighbors, datasets @@ -106,7 +106,7 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11), k_results_build[algorithm][i] = (t1 - t0) k_results_query[algorithm][i] = (t2 - t1) - pl.figure(figsize=(8, 11)) + plt.figure(figsize=(8, 11)) for (sbplt, vals, quantity, build_time, query_time) in [(311, Nrange, 'N', @@ -118,8 +118,8 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11), (313, krange, 'k', k_results_build, k_results_query)]: - ax = pl.subplot(sbplt, yscale='log') - pl.grid(True) + ax = plt.subplot(sbplt, yscale='log') + plt.grid(True) tick_vals = [] tick_labels = [] @@ -131,21 +131,21 @@ def barplot_neighbors(Nrange=2 ** 
np.arange(1, 11), xvals = 0.1 + i * (1 + len(vals)) + np.arange(len(vals)) width = 0.8 - c_bar = pl.bar(xvals, build_time[alg] - bottom, - width, bottom, color='r') - q_bar = pl.bar(xvals, query_time[alg], - width, build_time[alg], color='b') + c_bar = plt.bar(xvals, build_time[alg] - bottom, + width, bottom, color='r') + q_bar = plt.bar(xvals, query_time[alg], + width, build_time[alg], color='b') tick_vals += list(xvals + 0.5 * width) tick_labels += ['%i' % val for val in vals] - pl.text((i + 0.02) / len(algorithms), 0.98, alg, - transform=ax.transAxes, - ha='left', - va='top', - bbox=dict(facecolor='w', edgecolor='w', alpha=0.5)) + plt.text((i + 0.02) / len(algorithms), 0.98, alg, + transform=ax.transAxes, + ha='left', + va='top', + bbox=dict(facecolor='w', edgecolor='w', alpha=0.5)) - pl.ylabel('Time (s)') + plt.ylabel('Time (s)') ax.xaxis.set_major_locator(ticker.FixedLocator(tick_vals)) ax.xaxis.set_major_formatter(ticker.FixedFormatter(tick_labels)) @@ -166,20 +166,20 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11), descr_string = descr_string[:-2] - pl.text(1.01, 0.5, title_string, - transform=ax.transAxes, rotation=-90, - ha='left', va='center', fontsize=20) + plt.text(1.01, 0.5, title_string, + transform=ax.transAxes, rotation=-90, + ha='left', va='center', fontsize=20) - pl.text(0.99, 0.5, descr_string, - transform=ax.transAxes, rotation=-90, - ha='right', va='center') + plt.text(0.99, 0.5, descr_string, + transform=ax.transAxes, rotation=-90, + ha='right', va='center') - pl.gcf().suptitle("%s data set" % dataset.capitalize(), fontsize=16) + plt.gcf().suptitle("%s data set" % dataset.capitalize(), fontsize=16) - pl.figlegend((c_bar, q_bar), ('construction', 'N-point query'), - 'upper right') + plt.figlegend((c_bar, q_bar), ('construction', 'N-point query'), + 'upper right') if __name__ == '__main__': barplot_neighbors(dataset='digits') barplot_neighbors(dataset='dense') - pl.show() + plt.show() diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py index debc3c3be4567..58d350af539a1 100644 --- a/benchmarks/bench_plot_omp_lars.py +++ b/benchmarks/bench_plot_omp_lars.py @@ -105,19 +105,19 @@ def compute_bench(samples_range, features_range): results = compute_bench(samples_range, features_range) max_time = max(np.max(t) for t in results.values()) - import pylab as pl - fig = pl.figure('scikit-learn OMP vs. LARS benchmark results') + import matplotlib.pyplot as plt + fig = plt.figure('scikit-learn OMP vs. 
LARS benchmark results') for i, (label, timings) in enumerate(sorted(results.iteritems())): - ax = fig.add_subplot(1, 2, i) + ax = fig.add_subplot(1, 2, i+1) vmax = max(1 - timings.min(), -1 + timings.max()) - pl.matshow(timings, fignum=False, vmin=1 - vmax, vmax=1 + vmax) + plt.matshow(timings, fignum=False, vmin=1 - vmax, vmax=1 + vmax) ax.set_xticklabels([''] + map(str, samples_range)) ax.set_yticklabels([''] + map(str, features_range)) - pl.xlabel('n_samples') - pl.ylabel('n_features') - pl.title(label) - - pl.subplots_adjust(0.1, 0.08, 0.96, 0.98, 0.4, 0.63) - ax = pl.axes([0.1, 0.08, 0.8, 0.06]) - pl.colorbar(cax=ax, orientation='horizontal') - pl.show() + plt.xlabel('n_samples') + plt.ylabel('n_features') + plt.title(label) + + plt.subplots_adjust(0.1, 0.08, 0.96, 0.98, 0.4, 0.63) + ax = plt.axes([0.1, 0.08, 0.8, 0.06]) + plt.colorbar(cax=ax, orientation='horizontal') + plt.show() diff --git a/benchmarks/bench_plot_parallel_pairwise.py b/benchmarks/bench_plot_parallel_pairwise.py index c17f42d7ca5eb..e844ea06e1f53 100644 --- a/benchmarks/bench_plot_parallel_pairwise.py +++ b/benchmarks/bench_plot_parallel_pairwise.py @@ -2,7 +2,7 @@ # License: BSD 3 clause import time -import pylab as pl +import matplotlib.pyplot as plt from sklearn.utils import check_random_state from sklearn.metrics.pairwise import pairwise_distances @@ -25,13 +25,13 @@ def plot(func): func(X, n_jobs=-1) multi_core.append(time.time() - start) - pl.figure('scikit-learn parallel %s benchmark results' % func.__name__) - pl.plot(sample_sizes, one_core, label="one core") - pl.plot(sample_sizes, multi_core, label="multi core") - pl.xlabel('n_samples') - pl.ylabel('Time (s)') - pl.title('Parallel %s' % func.__name__) - pl.legend() + plt.figure('scikit-learn parallel %s benchmark results' % func.__name__) + plt.plot(sample_sizes, one_core, label="one core") + plt.plot(sample_sizes, multi_core, label="multi core") + plt.xlabel('n_samples') + plt.ylabel('Time (s)') + plt.title('Parallel %s' % func.__name__) + plt.legend() def euclidean_distances(X, n_jobs): return pairwise_distances(X, metric="euclidean", n_jobs=n_jobs) @@ -41,4 +41,4 @@ def rbf_kernels(X, n_jobs): plot(euclidean_distances) plot(rbf_kernels) -pl.show() +plt.show() diff --git a/benchmarks/bench_plot_ward.py b/benchmarks/bench_plot_ward.py index 0159cc5232f08..be93d6d2508e9 100644 --- a/benchmarks/bench_plot_ward.py +++ b/benchmarks/bench_plot_ward.py @@ -6,7 +6,7 @@ import numpy as np from scipy.cluster import hierarchy -import pylab as pl +import matplotlib.pyplot as plt from sklearn.cluster import AgglomerativeClustering @@ -31,13 +31,13 @@ ratio = scikits_time / scipy_time -pl.figure("scikit-learn Ward's method benchmark results") -pl.imshow(np.log(ratio), aspect='auto', origin="lower") -pl.colorbar() -pl.contour(ratio, levels=[1, ], colors='k') -pl.yticks(range(len(n_features)), n_features.astype(np.int)) -pl.ylabel('N features') -pl.xticks(range(len(n_samples)), n_samples.astype(np.int)) -pl.xlabel('N samples') -pl.title("Scikit's time, in units of scipy time (log)") -pl.show() +plt.figure("scikit-learn Ward's method benchmark results") +plt.imshow(np.log(ratio), aspect='auto', origin="lower") +plt.colorbar() +plt.contour(ratio, levels=[1, ], colors='k') +plt.yticks(range(len(n_features)), n_features.astype(np.int)) +plt.ylabel('N features') +plt.xticks(range(len(n_samples)), n_samples.astype(np.int)) +plt.xlabel('N samples') +plt.title("Scikit's time, in units of scipy time (log)") +plt.show() diff --git a/benchmarks/bench_sgd_regression.py 
b/benchmarks/bench_sgd_regression.py index 63e3e4c51846b..e66f656114760 100644 --- a/benchmarks/bench_sgd_regression.py +++ b/benchmarks/bench_sgd_regression.py @@ -11,7 +11,7 @@ # License: BSD 3 clause import numpy as np -import pylab as pl +import matplotlib.pyplot as plt import gc @@ -113,39 +113,39 @@ # Plot results i = 0 m = len(list_n_features) - pl.figure('scikit-learn SGD regression benchmark results', - figsize=(5 * 2, 4 * m)) + plt.figure('scikit-learn SGD regression benchmark results', + figsize=(5 * 2, 4 * m)) for j in range(m): - pl.subplot(m, 2, i + 1) - pl.plot(list_n_samples, np.sqrt(elnet_results[:, j, 0]), - label="ElasticNet") - pl.plot(list_n_samples, np.sqrt(sgd_results[:, j, 0]), - label="SGDRegressor") - pl.plot(list_n_samples, np.sqrt(asgd_results[:, j, 0]), - label="A-SGDRegressor") - pl.plot(list_n_samples, np.sqrt(ridge_results[:, j, 0]), - label="Ridge") - pl.legend(prop={"size": 10}) - pl.xlabel("n_train") - pl.ylabel("RMSE") - pl.title("Test error - %d features" % list_n_features[j]) + plt.subplot(m, 2, i + 1) + plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 0]), + label="ElasticNet") + plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 0]), + label="SGDRegressor") + plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 0]), + label="A-SGDRegressor") + plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 0]), + label="Ridge") + plt.legend(prop={"size": 10}) + plt.xlabel("n_train") + plt.ylabel("RMSE") + plt.title("Test error - %d features" % list_n_features[j]) i += 1 - pl.subplot(m, 2, i + 1) - pl.plot(list_n_samples, np.sqrt(elnet_results[:, j, 1]), - label="ElasticNet") - pl.plot(list_n_samples, np.sqrt(sgd_results[:, j, 1]), - label="SGDRegressor") - pl.plot(list_n_samples, np.sqrt(asgd_results[:, j, 1]), - label="A-SGDRegressor") - pl.plot(list_n_samples, np.sqrt(ridge_results[:, j, 1]), - label="Ridge") - pl.legend(prop={"size": 10}) - pl.xlabel("n_train") - pl.ylabel("Time [sec]") - pl.title("Training time - %d features" % list_n_features[j]) + plt.subplot(m, 2, i + 1) + plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 1]), + label="ElasticNet") + plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 1]), + label="SGDRegressor") + plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 1]), + label="A-SGDRegressor") + plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 1]), + label="Ridge") + plt.legend(prop={"size": 10}) + plt.xlabel("n_train") + plt.ylabel("Time [sec]") + plt.title("Training time - %d features" % list_n_features[j]) i += 1 - pl.subplots_adjust(hspace=.30) + plt.subplots_adjust(hspace=.30) - pl.show() + plt.show() diff --git a/benchmarks/bench_tree.py b/benchmarks/bench_tree.py index ca97cf9be275b..8a0af26d4c221 100644 --- a/benchmarks/bench_tree.py +++ b/benchmarks/bench_tree.py @@ -14,7 +14,7 @@ of the number of dimensions. 
""" import numpy as np -import pylab as pl +import matplotlib.pyplot as plt import gc from datetime import datetime @@ -84,14 +84,14 @@ def bench_scikit_tree_regressor(X, Y): bench_scikit_tree_regressor(X, Y) xx = range(0, n * step, step) - pl.figure('scikit-learn tree benchmark results') - pl.subplot(211) - pl.title('Learning with varying number of samples') - pl.plot(xx, scikit_classifier_results, 'g-', label='classification') - pl.plot(xx, scikit_regressor_results, 'r-', label='regression') - pl.legend(loc='upper left') - pl.xlabel('number of samples') - pl.ylabel('Time (s)') + plt.figure('scikit-learn tree benchmark results') + plt.subplot(211) + plt.title('Learning with varying number of samples') + plt.plot(xx, scikit_classifier_results, 'g-', label='classification') + plt.plot(xx, scikit_regressor_results, 'r-', label='regression') + plt.legend(loc='upper left') + plt.xlabel('number of samples') + plt.ylabel('Time (s)') scikit_classifier_results = [] scikit_regressor_results = [] @@ -113,12 +113,12 @@ def bench_scikit_tree_regressor(X, Y): bench_scikit_tree_regressor(X, Y) xx = np.arange(start_dim, start_dim + n * step, step) - pl.subplot(212) - pl.title('Learning in high dimensional spaces') - pl.plot(xx, scikit_classifier_results, 'g-', label='classification') - pl.plot(xx, scikit_regressor_results, 'r-', label='regression') - pl.legend(loc='upper left') - pl.xlabel('number of dimensions') - pl.ylabel('Time (s)') - pl.axis('tight') - pl.show() + plt.subplot(212) + plt.title('Learning in high dimensional spaces') + plt.plot(xx, scikit_classifier_results, 'g-', label='classification') + plt.plot(xx, scikit_regressor_results, 'r-', label='regression') + plt.legend(loc='upper left') + plt.xlabel('number of dimensions') + plt.ylabel('Time (s)') + plt.axis('tight') + plt.show() diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index fb97a16fc9b8a..49d056902c392 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -93,7 +93,7 @@ and pipeline on 2D data. The default coding of images is based on the ``uint8`` dtype to spare memory. Often machine learning algorithms work best if the input is converted to a floating point representation first. Also, - if you plan to use ``pylab.imshow`` don't forget to scale to the range + if you plan to use ``matplotlib.pyplpt.imshow`` don't forget to scale to the range 0 - 1 as done in the following example. .. topic:: Examples: diff --git a/doc/tutorial/statistical_inference/settings.rst b/doc/tutorial/statistical_inference/settings.rst index fead00cf952fb..3537151d0144d 100644 --- a/doc/tutorial/statistical_inference/settings.rst +++ b/doc/tutorial/statistical_inference/settings.rst @@ -29,7 +29,7 @@ these arrays is the **samples** axis, while the second is the When the data is not initially in the ``(n_samples, n_features)`` shape, it needs to be preprocessed in order to be used by scikit-learn. -.. topic:: An example of reshaping data would be the digits dataset +.. topic:: An example of reshaping data would be the digits dataset .. image:: ../../auto_examples/datasets/images/plot_digits_last_image_001.png :target: ../../auto_examples/datasets/plot_digits_last_image.html @@ -42,8 +42,8 @@ needs to be preprocessed in order to be used by scikit-learn. 
>>> digits = datasets.load_digits() >>> digits.images.shape (1797, 8, 8) - >>> import pylab as pl #doctest: +SKIP - >>> pl.imshow(digits.images[-1], cmap=pl.cm.gray_r) #doctest: +SKIP + >>> import matplotlib.pyplot as plt #doctest: +SKIP + >>> plt.imshow(digits.images[-1], cmap=plt.cm.gray_r) #doctest: +SKIP To use this dataset with the scikit, we transform each 8x8 image into a @@ -89,4 +89,3 @@ parameters are attributes of the estimator object ending by an underscore:: >>> estimator.estimated_param_ #doctest: +SKIP - diff --git a/doc/tutorial/statistical_inference/supervised_learning.rst b/doc/tutorial/statistical_inference/supervised_learning.rst index d5e69e15dd0f4..5a0397b24acd4 100644 --- a/doc/tutorial/statistical_inference/supervised_learning.rst +++ b/doc/tutorial/statistical_inference/supervised_learning.rst @@ -212,15 +212,15 @@ induces high variance: >>> test = np.c_[ 0, 2].T >>> regr = linear_model.LinearRegression() - >>> import pylab as pl # doctest: +SKIP - >>> pl.figure() # doctest: +SKIP + >>> import matplotlib.pyplot as plt # doctest: +SKIP + >>> plt.figure() # doctest: +SKIP >>> np.random.seed(0) >>> for _ in range(6): # doctest: +SKIP ... this_X = .1*np.random.normal(size=(2, 1)) + X ... regr.fit(this_X, y) - ... pl.plot(test, regr.predict(test)) # doctest: +SKIP - ... pl.scatter(this_X, y, s=3) # doctest: +SKIP + ... plt.plot(test, regr.predict(test)) # doctest: +SKIP + ... plt.scatter(this_X, y, s=3) # doctest: +SKIP @@ -238,14 +238,14 @@ regression: >>> regr = linear_model.Ridge(alpha=.1) - >>> pl.figure() # doctest: +SKIP + >>> plt.figure() # doctest: +SKIP >>> np.random.seed(0) >>> for _ in range(6): # doctest: +SKIP ... this_X = .1*np.random.normal(size=(2, 1)) + X ... regr.fit(this_X, y) - ... pl.plot(test, regr.predict(test)) # doctest: +SKIP - ... pl.scatter(this_X, y, s=3) # doctest: +SKIP + ... plt.plot(test, regr.predict(test)) # doctest: +SKIP + ... plt.scatter(this_X, y, s=3) # doctest: +SKIP This is an example of **bias/variance tradeoff**: the larger the ridge ``alpha`` parameter, the higher the bias and the lower the variance. 
diff --git a/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py b/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py index 51649840d9049..0b56cfc10d837 100644 --- a/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py +++ b/doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py @@ -46,9 +46,9 @@ cm = metrics.confusion_matrix(y_test, y_predicted) print(cm) -#import pylab as pl -#pl.matshow(cm, cmap=pl.cm.jet) -#pl.show() +#import matplotlib.pyplot as plt +#plt.matshow(cm, cmap=plt.cm.jet) +#plt.show() # Predict the result on some short new sentences: sentences = [ diff --git a/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py b/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py index 869cfbee2565d..f4e15774711b9 100644 --- a/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py +++ b/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py @@ -54,9 +54,9 @@ cm = metrics.confusion_matrix(y_test, y_predicted) print(cm) -#import pylab as pl -#pl.matshow(cm, cmap=pl.cm.jet) -#pl.show() +#import matplotlib.pyplot as plt +#plt.matshow(cm, cmap=plt.cm.jet) +#plt.show() # Predict the result on some short new sentences: sentences = [ diff --git a/examples/classification/plot_digits_classification.py b/examples/classification/plot_digits_classification.py index d9c2b570336a7..3d21ee591c319 100644 --- a/examples/classification/plot_digits_classification.py +++ b/examples/classification/plot_digits_classification.py @@ -27,7 +27,7 @@ # The data that we are interested in is made of 8x8 images of digits, let's # have a look at the first 4 images, stored in the `images` attribute of the # dataset. If we were working from image files, we could load them using -# pylab.imread. Note that each image must have the same size. For these +# matplotlib.pyplot.imread. Note that each image must have the same size. For these # images, we know which digit they represent: it is given in the 'target' of # the dataset. 
images_and_labels = list(zip(digits.images, digits.target)) diff --git a/examples/decomposition/plot_sparse_coding.py b/examples/decomposition/plot_sparse_coding.py index 7a3704c9fefde..408eedbdbaa4b 100644 --- a/examples/decomposition/plot_sparse_coding.py +++ b/examples/decomposition/plot_sparse_coding.py @@ -17,7 +17,7 @@ print(__doc__) import numpy as np -import matplotlib.pylab as plt +import matplotlib.pyplot as plt from sklearn.decomposition import SparseCoder diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py index 21b3a010bf2f3..64eeba6d649fd 100644 --- a/examples/gaussian_process/plot_gpc_isoprobability.py +++ b/examples/gaussian_process/plot_gpc_isoprobability.py @@ -18,7 +18,7 @@ import numpy as np -from matplotlib import pyplot as pl +from matplotlib import pyplot as plt from matplotlib import cm from sklearn.gaussian_process import GaussianProcessClassifier @@ -64,39 +64,39 @@ def g(x): y_prob = y_prob.reshape((res, res)) # Plot the probabilistic classification iso-values -fig = pl.figure(1) +fig = plt.figure(1) ax = fig.gca() ax.axes.set_aspect('equal') -pl.xticks([]) -pl.yticks([]) +plt.xticks([]) +plt.yticks([]) ax.set_xticklabels([]) ax.set_yticklabels([]) -pl.xlabel('$x_1$') -pl.ylabel('$x_2$') +plt.xlabel('$x_1$') +plt.ylabel('$x_2$') -cax = pl.imshow(y_prob, cmap=cm.gray_r, alpha=0.8, - extent=(-lim, lim, -lim, lim)) -norm = pl.matplotlib.colors.Normalize(vmin=0., vmax=0.9) -cb = pl.colorbar(cax, ticks=[0., 0.2, 0.4, 0.6, 0.8, 1.], norm=norm) +cax = plt.imshow(y_prob, cmap=cm.gray_r, alpha=0.8, + extent=(-lim, lim, -lim, lim)) +norm = plt.matplotlib.colors.Normalize(vmin=0., vmax=0.9) +cb = plt.colorbar(cax, ticks=[0., 0.2, 0.4, 0.6, 0.8, 1.], norm=norm) cb.set_label('${\\rm \mathbb{P}}\left[\widehat{G}(\mathbf{x}) \leq 0\\right]$') -pl.clim(0, 1) +plt.clim(0, 1) -pl.plot(X[y <= 0, 0], X[y <= 0, 1], 'r.', markersize=12) +plt.plot(X[y <= 0, 0], X[y <= 0, 1], 'r.', markersize=12) -pl.plot(X[y > 0, 0], X[y > 0, 1], 'b.', markersize=12) +plt.plot(X[y > 0, 0], X[y > 0, 1], 'b.', markersize=12) -cs = pl.contour(x1, x2, y_true, [0.], colors='k', linestyles='dashdot') +cs = plt.contour(x1, x2, y_true, [0.], colors='k', linestyles='dashdot') -cs = pl.contour(x1, x2, y_prob, [0.666], colors='b', - linestyles='solid') -pl.clabel(cs, fontsize=11) +cs = plt.contour(x1, x2, y_prob, [0.666], colors='b', + linestyles='solid') +plt.clabel(cs, fontsize=11) -cs = pl.contour(x1, x2, y_prob, [0.5], colors='k', - linestyles='dashed') -pl.clabel(cs, fontsize=11) +cs = plt.contour(x1, x2, y_prob, [0.5], colors='k', + linestyles='dashed') +plt.clabel(cs, fontsize=11) -cs = pl.contour(x1, x2, y_prob, [0.334], colors='r', - linestyles='solid') -pl.clabel(cs, fontsize=11) +cs = plt.contour(x1, x2, y_prob, [0.334], colors='r', + linestyles='solid') +plt.clabel(cs, fontsize=11) -pl.show() +plt.show() diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index 329d0384b40ea..d1c313d714509 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -26,7 +26,7 @@ # Licence: BSD 3 clause import numpy as np -from matplotlib import pyplot as pl +from matplotlib import pyplot as plt from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C @@ -61,18 +61,18 @@ def f(x): # Plot the function, the prediction and the 95% confidence 
interval based on # the MSE -fig = pl.figure() -pl.plot(x, f(x), 'r:', label=u'$f(x) = x\,\sin(x)$') -pl.plot(X, y, 'r.', markersize=10, label=u'Observations') -pl.plot(x, y_pred, 'b-', label=u'Prediction') -pl.fill(np.concatenate([x, x[::-1]]), - np.concatenate([y_pred - 1.9600 * sigma, - (y_pred + 1.9600 * sigma)[::-1]]), - alpha=.5, fc='b', ec='None', label='95% confidence interval') -pl.xlabel('$x$') -pl.ylabel('$f(x)$') -pl.ylim(-10, 20) -pl.legend(loc='upper left') +fig = plt.figure() +plt.plot(x, f(x), 'r:', label=u'$f(x) = x\,\sin(x)$') +plt.plot(X, y, 'r.', markersize=10, label=u'Observations') +plt.plot(x, y_pred, 'b-', label=u'Prediction') +plt.fill(np.concatenate([x, x[::-1]]), + np.concatenate([y_pred - 1.9600 * sigma, + (y_pred + 1.9600 * sigma)[::-1]]), + alpha=.5, fc='b', ec='None', label='95% confidence interval') +plt.xlabel('$x$') +plt.ylabel('$f(x)$') +plt.ylim(-10, 20) +plt.legend(loc='upper left') # ---------------------------------------------------------------------- # now the noisy case @@ -97,17 +97,17 @@ def f(x): # Plot the function, the prediction and the 95% confidence interval based on # the MSE -fig = pl.figure() -pl.plot(x, f(x), 'r:', label=u'$f(x) = x\,\sin(x)$') -pl.errorbar(X.ravel(), y, dy, fmt='r.', markersize=10, label=u'Observations') -pl.plot(x, y_pred, 'b-', label=u'Prediction') -pl.fill(np.concatenate([x, x[::-1]]), - np.concatenate([y_pred - 1.9600 * sigma, - (y_pred + 1.9600 * sigma)[::-1]]), - alpha=.5, fc='b', ec='None', label='95% confidence interval') -pl.xlabel('$x$') -pl.ylabel('$f(x)$') -pl.ylim(-10, 20) -pl.legend(loc='upper left') - -pl.show() +fig = plt.figure() +plt.plot(x, f(x), 'r:', label=u'$f(x) = x\,\sin(x)$') +plt.errorbar(X.ravel(), y, dy, fmt='r.', markersize=10, label=u'Observations') +plt.plot(x, y_pred, 'b-', label=u'Prediction') +plt.fill(np.concatenate([x, x[::-1]]), + np.concatenate([y_pred - 1.9600 * sigma, + (y_pred + 1.9600 * sigma)[::-1]]), + alpha=.5, fc='b', ec='None', label='95% confidence interval') +plt.xlabel('$x$') +plt.ylabel('$f(x)$') +plt.ylim(-10, 20) +plt.legend(loc='upper left') + +plt.show() diff --git a/examples/text/mlcomp_sparse_document_classification.py b/examples/text/mlcomp_sparse_document_classification.py index c1d2631453d9c..52f9267f3f0eb 100644 --- a/examples/text/mlcomp_sparse_document_classification.py +++ b/examples/text/mlcomp_sparse_document_classification.py @@ -44,7 +44,7 @@ import os import numpy as np import scipy.sparse as sp -import pylab as pl +import matplotlib.pyplot as pl from sklearn.datasets import load_mlcomp from sklearn.feature_extraction.text import TfidfVectorizer diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 27e890a87bc45..4c1fe553f7115 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -415,10 +415,10 @@ def load_digits(n_class=10): >>> digits = load_digits() >>> print(digits.data.shape) (1797, 64) - >>> import pylab as pl #doctest: +SKIP - >>> pl.gray() #doctest: +SKIP - >>> pl.matshow(digits.images[0]) #doctest: +SKIP - >>> pl.show() #doctest: +SKIP + >>> import matplotlib.pyplot as plt #doctest: +SKIP + >>> plt.gray() #doctest: +SKIP + >>> plt.matshow(digits.images[0]) #doctest: +SKIP + >>> plt.show() #doctest: +SKIP """ module_path = dirname(__file__) data = np.loadtxt(join(module_path, 'data', 'digits.csv.gz'), diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 3b1756ba73d09..0f0fcd9cab552 100644 --- a/sklearn/ensemble/partial_dependence.py +++ 
b/sklearn/ensemble/partial_dependence.py @@ -208,10 +208,10 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, ax : Matplotlib axis object, default None An axis object onto which the plots will be drawn. line_kw : dict - Dict with keywords passed to the ``pylab.plot`` call. + Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. For one-way partial dependence plots. contour_kw : dict - Dict with keywords passed to the ``pylab.plot`` call. + Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call. For two-way partial dependence plots. fig_kw : dict Dict with keywords passed to the figure() call.