diff --git a/.circleci/config.yml b/.circleci/config.yml
index e1e410c440314..0e77f30d18ed7 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -41,13 +41,12 @@ jobs:
       # Test examples run with minimal dependencies
       - MINICONDA_PATH: ~/miniconda
       - CONDA_ENV_NAME: testenv
-      - PYTHON_VERSION: 2
-      - NUMPY_VERSION: 1.8.2
-      # XXX: plot_gpc_xor.py fails with scipy 0.13.3
-      - SCIPY_VERSION: 0.14
-      - MATPLOTLIB_VERSION: 1.3
-      - SCIKIT_IMAGE_VERSION: 0.9.3
-      - PANDAS_VERSION: 0.13.1
+      - PYTHON_VERSION: "2"
+      - NUMPY_VERSION: "1.10"
+      - SCIPY_VERSION: "0.16"
+      - MATPLOTLIB_VERSION: "1.4"
+      - SCIKIT_IMAGE_VERSION: "0.11"
+      - PANDAS_VERSION: "0.17.1"
       steps:
         - checkout
         - run: ./build_tools/circle/checkout_merge_commit.sh
diff --git a/README.rst b/README.rst
index fa2ef793b9e26..b4d67af56eec8 100644
--- a/README.rst
+++ b/README.rst
@@ -56,8 +56,8 @@ scikit-learn requires:
 **Scikit-learn 0.20 is the last version to support Python2.7.**
 Scikit-learn 0.21 and later will require Python 3.5 or newer.
 
-For running the examples Matplotlib >= 1.3.1 is required. A few examples
-require scikit-image >= 0.9.3 and a few examples require pandas >= 0.13.1.
+For running the examples Matplotlib >= 1.4 is required. A few examples
+require scikit-image >= 0.11.3 and a few examples require pandas >= 0.17.1.
 
 scikit-learn also uses CBLAS, the C interface to the Basic Linear Algebra
 Subprograms library. scikit-learn comes with a reference implementation, but
diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py
index 2d9d0af0dcbc5..15dd0e57a3021 100644
--- a/examples/calibration/plot_compare_calibration.py
+++ b/examples/calibration/plot_compare_calibration.py
@@ -75,7 +75,7 @@
 y_test = y[train_samples:]
 
 # Create classifiers
-lr = LogisticRegression()
+lr = LogisticRegression(solver='lbfgs')
 gnb = GaussianNB()
 svc = LinearSVC(C=1.0)
 rfc = RandomForestClassifier(n_estimators=100)
diff --git a/examples/classification/plot_classification_probability.py b/examples/classification/plot_classification_probability.py
index 4542362817d71..ea4df9e6fb583 100644
--- a/examples/classification/plot_classification_probability.py
+++ b/examples/classification/plot_classification_probability.py
@@ -3,13 +3,17 @@
 Plot classification probability
 ===============================
 
-Plot the classification probability for different classifiers. We use a 3
-class dataset, and we classify it with a Support Vector classifier, L1
-and L2 penalized logistic regression with either a One-Vs-Rest or multinomial
-setting, and Gaussian process classification.
+Plot the classification probability for different classifiers. We use a 3 class
+dataset, and we classify it with a Support Vector classifier, L1 and L2
+penalized logistic regression with either a One-Vs-Rest or multinomial setting,
+and Gaussian process classification.
 
-The logistic regression is not a multiclass classifier out of the box. As
-a result it can identify only the first class.
+Linear SVC is not a probabilistic classifier by default but it has a built-in
+calibration option enabled in this example (`probability=True`).
+
+The logistic regression with One-Vs-Rest is not a multiclass classifier out of
+the box. As a result it has more trouble in separating class 2 and 3 than the
+other estimators.
""" print(__doc__) @@ -19,6 +23,7 @@ class dataset, and we classify it with a Support Vector classifier, L1 import matplotlib.pyplot as plt import numpy as np +from sklearn.metrics import accuracy_score from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from sklearn.gaussian_process import GaussianProcessClassifier @@ -31,19 +36,27 @@ class dataset, and we classify it with a Support Vector classifier, L1 n_features = X.shape[1] -C = 1.0 +C = 10 kernel = 1.0 * RBF([1.0, 1.0]) # for GPC -# Create different classifiers. The logistic regression cannot do -# multiclass out of the box. -classifiers = {'L1 logistic': LogisticRegression(C=C, penalty='l1'), - 'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2'), - 'Linear SVC': SVC(kernel='linear', C=C, probability=True, - random_state=0), - 'L2 logistic (Multinomial)': LogisticRegression( - C=C, solver='lbfgs', multi_class='multinomial'), - 'GPC': GaussianProcessClassifier(kernel) - } +# Create different classifiers. +classifiers = { + 'L1 logistic': LogisticRegression(C=C, penalty='l1', + solver='saga', + multi_class='multinomial', + max_iter=10000), + 'L2 logistic (Multinomial)': LogisticRegression(C=C, penalty='l2', + solver='saga', + multi_class='multinomial', + max_iter=10000), + 'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2', + solver='saga', + multi_class='ovr', + max_iter=10000), + 'Linear SVC': SVC(kernel='linear', C=C, probability=True, + random_state=0), + 'GPC': GaussianProcessClassifier(kernel) +} n_classifiers = len(classifiers) @@ -59,10 +72,10 @@ class dataset, and we classify it with a Support Vector classifier, L1 classifier.fit(X, y) y_pred = classifier.predict(X) - classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100 - print("classif_rate for %s : %f " % (name, classif_rate)) + accuracy = accuracy_score(y, y_pred) + print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100)) - # View probabilities= + # View probabilities: probas = classifier.predict_proba(Xfull) n_classes = np.unique(y_pred).size for k in range(n_classes): diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 73ee27f83a907..1da0c7e0d60e8 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -71,7 +71,7 @@ # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. clf = Pipeline(steps=[('preprocessor', preprocessor), - ('classifier', LogisticRegression())]) + ('classifier', LogisticRegression(solver='lbfgs'))]) X = data.drop('survived', axis=1) y = data['survived'] diff --git a/examples/compose/plot_digits_pipe.py b/examples/compose/plot_digits_pipe.py index 2352abba4584e..6e722c9861529 100644 --- a/examples/compose/plot_digits_pipe.py +++ b/examples/compose/plot_digits_pipe.py @@ -22,42 +22,58 @@ import numpy as np import matplotlib.pyplot as plt +import pandas as pd -from sklearn import linear_model, decomposition, datasets +from sklearn import datasets +from sklearn.decomposition import PCA +from sklearn.linear_model import SGDClassifier from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV -logistic = linear_model.LogisticRegression() -pca = decomposition.PCA() +# Define a pipeline to search for the best combination of PCA truncation +# and classifier regularization. 
+logistic = SGDClassifier(loss='log', penalty='l2', early_stopping=True,
+                         max_iter=10000, tol=1e-5, random_state=0)
+pca = PCA()
 pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
 
 digits = datasets.load_digits()
 X_digits = digits.data
 y_digits = digits.target
 
+# Parameters of pipelines can be set using ‘__’ separated parameter names:
+param_grid = {
+    'pca__n_components': [5, 20, 30, 40, 50, 64],
+    'logistic__alpha': np.logspace(-4, 4, 5),
+}
+search = GridSearchCV(pipe, param_grid, iid=False, cv=5,
+                      return_train_score=False)
+search.fit(X_digits, y_digits)
+print("Best parameter (CV score=%0.3f):" % search.best_score_)
+print(search.best_params_)
+
 # Plot the PCA spectrum
 pca.fit(X_digits)
 
-plt.figure(1, figsize=(4, 3))
-plt.clf()
-plt.axes([.2, .2, .7, .7])
-plt.plot(pca.explained_variance_, linewidth=2)
-plt.axis('tight')
-plt.xlabel('n_components')
-plt.ylabel('explained_variance_')
+fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))
+ax0.plot(pca.explained_variance_ratio_, linewidth=2)
+ax0.set_ylabel('PCA explained variance')
+
+ax0.axvline(search.best_estimator_.named_steps['pca'].n_components,
+            linestyle=':', label='n_components chosen')
+ax0.legend(prop=dict(size=12))
 
-# Prediction
-n_components = [20, 40, 64]
-Cs = np.logspace(-4, 4, 3)
+# For each number of components, find the best classifier results
+results = pd.DataFrame(search.cv_results_)
+components_col = 'param_pca__n_components'
+best_clfs = results.groupby(components_col).apply(
+    lambda g: g.nlargest(1, 'mean_test_score'))
 
-# Parameters of pipelines can be set using ‘__’ separated parameter names:
-estimator = GridSearchCV(pipe,
-                         dict(pca__n_components=n_components,
-                              logistic__C=Cs), cv=5)
-estimator.fit(X_digits, y_digits)
+best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',
+               legend=False, ax=ax1)
+ax1.set_ylabel('Classification accuracy (val)')
+ax1.set_xlabel('n_components')
 
-plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
-            linestyle=':', label='n_components chosen')
-plt.legend(prop=dict(size=12))
+plt.tight_layout()
 plt.show()
diff --git a/examples/ensemble/plot_feature_transformation.py b/examples/ensemble/plot_feature_transformation.py
index 5dbc2754b3a35..085309ed2a942 100644
--- a/examples/ensemble/plot_feature_transformation.py
+++ b/examples/ensemble/plot_feature_transformation.py
@@ -42,19 +42,19 @@
 n_estimator = 10
 X, y = make_classification(n_samples=80000)
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
+
 # It is important to train the ensemble of trees on a different subset
 # of the training data than the linear regression model to avoid
 # overfitting, in particular if the total number of leaves is
 # similar to the number of training samples
-X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
-                                                            y_train,
-                                                            test_size=0.5)
+X_train, X_train_lr, y_train, y_train_lr = train_test_split(
+    X_train, y_train, test_size=0.5)
 
 # Unsupervised transformation based on totally random trees
 rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                           random_state=0)
-rt_lm = LogisticRegression()
+rt_lm = LogisticRegression(solver='lbfgs', max_iter=1000)
 pipeline = make_pipeline(rt, rt_lm)
 pipeline.fit(X_train, y_train)
 y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
@@ -63,7 +63,7 @@
 # Supervised transformation based on random forests
 rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
 rf_enc = OneHotEncoder(categories='auto')
-rf_lm = LogisticRegression()
+rf_lm = LogisticRegression(solver='lbfgs', max_iter=1000)
 rf.fit(X_train, y_train)
 rf_enc.fit(rf.apply(X_train))
 rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
@@ -71,9 +71,10 @@
 y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
 fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
 
+# Supervised transformation based on gradient boosted trees
 grd = GradientBoostingClassifier(n_estimators=n_estimator)
 grd_enc = OneHotEncoder(categories='auto')
-grd_lm = LogisticRegression()
+grd_lm = LogisticRegression(solver='lbfgs', max_iter=1000)
 grd.fit(X_train, y_train)
 grd_enc.fit(grd.apply(X_train)[:, :, 0])
 grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
@@ -82,12 +83,10 @@
     grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
 fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
 
-
 # The gradient boosted model by itself
 y_pred_grd = grd.predict_proba(X_test)[:, 1]
 fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)
 
-
 # The random forest model by itself
 y_pred_rf = rf.predict_proba(X_test)[:, 1]
 fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
diff --git a/examples/ensemble/plot_voting_probas.py b/examples/ensemble/plot_voting_probas.py
index c729818620a60..e38a618da3782 100644
--- a/examples/ensemble/plot_voting_probas.py
+++ b/examples/ensemble/plot_voting_probas.py
@@ -29,7 +29,7 @@
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.ensemble import VotingClassifier
 
-clf1 = LogisticRegression(random_state=123)
+clf1 = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=123)
 clf2 = RandomForestClassifier(n_estimators=100, random_state=123)
 clf3 = GaussianNB()
 X = np.array([[-1.0, -1.0], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
@@ -79,4 +79,5 @@
 plt.ylim([0, 1])
 plt.title('Class probabilities for sample 1 by different classifiers')
 plt.legend([p1[0], p2[0]], ['class 1', 'class 2'], loc='upper left')
+plt.tight_layout()
 plt.show()
diff --git a/examples/exercises/plot_digits_classification_exercise.py b/examples/exercises/plot_digits_classification_exercise.py
index 25ab7e71c5925..6651a1fa05783 100644
--- a/examples/exercises/plot_digits_classification_exercise.py
+++ b/examples/exercises/plot_digits_classification_exercise.py
@@ -15,7 +15,7 @@
 from sklearn import datasets, neighbors, linear_model
 
 digits = datasets.load_digits()
-X_digits = digits.data
+X_digits = digits.data / digits.data.max()
 y_digits = digits.target
 
 n_samples = len(X_digits)
@@ -26,7 +26,8 @@
 y_test = y_digits[int(.9 * n_samples):]
 
 knn = neighbors.KNeighborsClassifier()
-logistic = linear_model.LogisticRegression()
+logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000,
+                                           multi_class='multinomial')
 
 print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))
 print('LogisticRegression score: %f'
diff --git a/examples/linear_model/plot_iris_logistic.py b/examples/linear_model/plot_iris_logistic.py
index d2193e9907b56..968598392722d 100644
--- a/examples/linear_model/plot_iris_logistic.py
+++ b/examples/linear_model/plot_iris_logistic.py
@@ -7,29 +7,28 @@
 =========================================================
 
 Show below is a logistic-regression classifiers decision boundaries on the
-`iris `_ dataset. The
-datapoints are colored according to their labels.
+first two dimensions (sepal length and width) of the `iris
+`_ dataset. The datapoints
+are colored according to their labels.
""" print(__doc__) - # Code source: Gaël Varoquaux # Modified for documentation by Jaques Grobler # License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt -from sklearn import linear_model, datasets +from sklearn.linear_model import LogisticRegression +from sklearn import datasets # import some data to play with iris = datasets.load_iris() X = iris.data[:, :2] # we only take the first two features. Y = iris.target -h = .02 # step size in the mesh - -logreg = linear_model.LogisticRegression(C=1e5) +logreg = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial') # we create an instance of Neighbours Classifier and fit the data. logreg.fit(X, Y) @@ -38,6 +37,7 @@ # point in the mesh [x_min, x_max]x[y_min, y_max]. x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 +h = .02 # step size in the mesh xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()]) diff --git a/examples/linear_model/plot_logistic.py b/examples/linear_model/plot_logistic.py index 488f1c3543a6a..6d94cb0548601 100644 --- a/examples/linear_model/plot_logistic.py +++ b/examples/linear_model/plot_logistic.py @@ -23,8 +23,7 @@ from sklearn import linear_model -# this is our test set, it's just a straight line with some -# Gaussian noise +# General a toy dataset:s it's just a straight line with some Gaussian noise: xmin, xmax = -5, 5 n_samples = 100 np.random.seed(0) @@ -34,8 +33,9 @@ X += .3 * np.random.normal(size=n_samples) X = X[:, np.newaxis] -# run the classifier -clf = linear_model.LogisticRegression(C=1e5) + +# Fit the classifier +clf = linear_model.LogisticRegression(C=1e5, solver='lbfgs') clf.fit(X, y) # and plot the result @@ -47,6 +47,8 @@ def model(x): return 1 / (1 + np.exp(-x)) + + loss = model(X_test * clf.coef_ + clf.intercept_).ravel() plt.plot(X_test, loss, color='red', linewidth=3) @@ -63,4 +65,5 @@ def model(x): plt.xlim(-4, 10) plt.legend(('Logistic Regression Model', 'Linear Regression Model'), loc="lower right", fontsize='small') +plt.tight_layout() plt.show() diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py index be63b144c260a..bffc648965fca 100644 --- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py +++ b/examples/linear_model/plot_logistic_l1_l2_sparsity.py @@ -37,10 +37,10 @@ # Set regularization parameter -for i, C in enumerate((100, 1, 0.01)): +for i, C in enumerate((1, 0.1, 0.01)): # turn down tolerance for short training time - clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01) - clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01) + clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='saga') + clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='saga') clf_l1_LR.fit(X, y) clf_l2_LR.fit(X, y) diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py index 66a1ab9bd0254..79b5522575eb0 100644 --- a/examples/linear_model/plot_logistic_path.py +++ b/examples/linear_model/plot_logistic_path.py @@ -1,10 +1,28 @@ #!/usr/bin/env python """ -================================= -Path with L1- Logistic Regression -================================= +============================================== +Regularization path of L1- Logistic Regression +============================================== -Computes path on IRIS dataset. 
+
+Train l1-penalized logistic regression models on a binary classification
+problem derived from the Iris dataset.
+
+The models are ordered from strongest regularized to least regularized. The 4
+coefficients of the models are collected and plotted as a "regularization
+path": on the left-hand side of the figure (strong regularizers), all the
+coefficients are exactly 0. When regularization gets progressively looser,
+coefficients can get non-zero values one after the other.
+
+Here we choose the SAGA solver because it can efficiently optimize for the
+Logistic Regression loss with a non-smooth, sparsity inducing l1 penalty.
+
+Also note that we set a low value for the tolerance to make sure that the model
+has converged before collecting the coefficients.
+
+We also use warm_start=True which means that the coefficients of the models are
+reused to initialize the next model fit to speed up the computation of the
+full path.
 """
 print(__doc__)
 
@@ -12,7 +30,7 @@
 # Author: Alexandre Gramfort
 # License: BSD 3 clause
 
-from datetime import datetime
+from time import time
 
 import numpy as np
 import matplotlib.pyplot as plt
@@ -27,26 +45,28 @@
 X = X[y != 2]
 y = y[y != 2]
 
-X -= np.mean(X, 0)
+X /= X.max()  # Normalize X to speed up convergence
 
 # #############################################################################
 # Demo path functions
 
-cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)
+cs = l1_min_c(X, y, loss='log') * np.logspace(0, 7, 16)
 
 print("Computing regularization path ...")
-start = datetime.now()
-clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
+start = time()
+clf = linear_model.LogisticRegression(penalty='l1', solver='saga',
+                                      tol=1e-6, max_iter=int(1e6),
+                                      warm_start=True)
 coefs_ = []
 for c in cs:
     clf.set_params(C=c)
     clf.fit(X, y)
     coefs_.append(clf.coef_.ravel().copy())
-print("This took ", datetime.now() - start)
+print("This took %0.3fs" % (time() - start))
 
 coefs_ = np.array(coefs_)
-plt.plot(np.log10(cs), coefs_)
+plt.plot(np.log10(cs), coefs_, marker='o')
 ymin, ymax = plt.ylim()
 plt.xlabel('log(C)')
 plt.ylabel('Coefficients')
diff --git a/examples/multioutput/plot_classifier_chain_yeast.py b/examples/multioutput/plot_classifier_chain_yeast.py
index cb3a5085e316d..afe0131926dea 100644
--- a/examples/multioutput/plot_classifier_chain_yeast.py
+++ b/examples/multioutput/plot_classifier_chain_yeast.py
@@ -54,14 +54,15 @@
 
 # Fit an independent logistic regression model for each class using the
 # OneVsRestClassifier wrapper.
-ovr = OneVsRestClassifier(LogisticRegression())
+base_lr = LogisticRegression(solver='lbfgs')
+ovr = OneVsRestClassifier(base_lr)
 ovr.fit(X_train, Y_train)
 Y_pred_ovr = ovr.predict(X_test)
 ovr_jaccard_score = jaccard_similarity_score(Y_test, Y_pred_ovr)
 
 # Fit an ensemble of logistic regression classifier chains and take the
 # take the average prediction of all the chains.
-chains = [ClassifierChain(LogisticRegression(), order='random', random_state=i)
+chains = [ClassifierChain(base_lr, order='random', random_state=i)
           for i in range(10)]
 for chain in chains:
     chain.fit(X_train, Y_train)
diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py
index aa75ccc06d1f1..26223ad245214 100644
--- a/examples/neural_networks/plot_rbm_logistic_classification.py
+++ b/examples/neural_networks/plot_rbm_logistic_classification.py
@@ -40,6 +40,7 @@
 from sklearn.model_selection import train_test_split
 from sklearn.neural_network import BernoulliRBM
 from sklearn.pipeline import Pipeline
+from sklearn.base import clone
 
 
 # #############################################################################
@@ -67,29 +68,32 @@ def nudge_dataset(X, Y):
          [0, 0, 0],
          [0, 1, 0]]]
 
-    shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant',
-                                  weights=w).ravel()
+    def shift(x, w):
+        return convolve(x.reshape((8, 8)), mode='constant', weights=w).ravel()
+
     X = np.concatenate([X] +
                        [np.apply_along_axis(shift, 1, X, vector)
                         for vector in direction_vectors])
     Y = np.concatenate([Y for _ in range(5)], axis=0)
     return X, Y
 
+
 # Load Data
 digits = datasets.load_digits()
 X = np.asarray(digits.data, 'float32')
 X, Y = nudge_dataset(X, digits.target)
 X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)  # 0-1 scaling
 
-X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
-                                                     test_size=0.2,
-                                                     random_state=0)
+X_train, X_test, Y_train, Y_test = train_test_split(
+    X, Y, test_size=0.2, random_state=0)
 
 # Models we will use
-logistic = linear_model.LogisticRegression()
+logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=10000,
+                                           multi_class='multinomial')
 rbm = BernoulliRBM(random_state=0, verbose=True)
 
-classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
+rbm_features_classifier = Pipeline(
+    steps=[('rbm', rbm), ('logistic', logistic)])
 
 # #############################################################################
 # Training
@@ -102,28 +106,26 @@ def nudge_dataset(X, Y):
 # More components tend to give better prediction performance, but larger
 # fitting time
 rbm.n_components = 100
-logistic.C = 6000.0
+logistic.C = 6000
 
 # Training RBM-Logistic Pipeline
-classifier.fit(X_train, Y_train)
+rbm_features_classifier.fit(X_train, Y_train)
 
-# Training Logistic regression
-logistic_classifier = linear_model.LogisticRegression(C=100.0)
-logistic_classifier.fit(X_train, Y_train)
+# Training the Logistic regression classifier directly on the pixel values
+raw_pixel_classifier = clone(logistic)
+raw_pixel_classifier.C = 100.
+raw_pixel_classifier.fit(X_train, Y_train)
 
 # #############################################################################
 # Evaluation
 
-print()
+Y_pred = rbm_features_classifier.predict(X_test)
 print("Logistic regression using RBM features:\n%s\n" % (
-    metrics.classification_report(
-        Y_test,
-        classifier.predict(X_test))))
+    metrics.classification_report(Y_test, Y_pred)))
 
+Y_pred = raw_pixel_classifier.predict(X_test)
 print("Logistic regression using raw pixel features:\n%s\n" % (
-    metrics.classification_report(
-        Y_test,
-        logistic_classifier.predict(X_test))))
+    metrics.classification_report(Y_test, Y_pred)))
 
 # #############################################################################
 # Plotting
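
The recurring change in this patch is constructing LogisticRegression with an explicit solver (lbfgs for smooth l2 problems, saga where an l1 penalty is needed), together with multi_class/max_iter arguments and rescaled inputs for faster convergence. The following is a minimal standalone sketch of that pattern, not part of the patch itself; the dataset choice and variable names are illustrative only.

# Standalone sketch (not from the patch above): explicit solver arguments in
# the style the updated examples use.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
X = X / X.max()  # rescaling the features helps the solvers converge faster

# lbfgs handles the smooth l2-penalized (multinomial) problem, as in most
# of the updated examples.
clf_lbfgs = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                               max_iter=1000).fit(X, y)

# saga supports the non-smooth l1 penalty used in the sparsity and
# regularization-path examples.
clf_saga = LogisticRegression(solver='saga', penalty='l1', tol=1e-2,
                              max_iter=10000).fit(X, y)

print("lbfgs accuracy (train): %0.3f" % clf_lbfgs.score(X, y))
print("saga accuracy (train): %0.3f" % clf_saga.score(X, y))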