From d189026b02f6fdbb7a66c59df7832b9d1fd793d2 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 12:09:24 +0200 Subject: [PATCH 01/19] Be more specific about logistic regression solver in examples --- .../calibration/plot_compare_calibration.py | 2 +- .../plot_classification_probability.py | 53 ++++++++++++------- .../plot_column_transformer_mixed_types.py | 2 +- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py index 2d9d0af0dcbc5..15dd0e57a3021 100644 --- a/examples/calibration/plot_compare_calibration.py +++ b/examples/calibration/plot_compare_calibration.py @@ -75,7 +75,7 @@ y_test = y[train_samples:] # Create classifiers -lr = LogisticRegression() +lr = LogisticRegression(solver='lbfgs') gnb = GaussianNB() svc = LinearSVC(C=1.0) rfc = RandomForestClassifier(n_estimators=100) diff --git a/examples/classification/plot_classification_probability.py b/examples/classification/plot_classification_probability.py index 4542362817d71..6fb3785cb7ea5 100644 --- a/examples/classification/plot_classification_probability.py +++ b/examples/classification/plot_classification_probability.py @@ -3,13 +3,17 @@ Plot classification probability =============================== -Plot the classification probability for different classifiers. We use a 3 -class dataset, and we classify it with a Support Vector classifier, L1 -and L2 penalized logistic regression with either a One-Vs-Rest or multinomial -setting, and Gaussian process classification. +Plot the classification probability for different classifiers. We use a 3 class +dataset, and we classify it with a Support Vector classifier, L1 and L2 +penalized logistic regression with either a One-Vs-Rest or multinomial setting, +and Gaussian process classification. -The logistic regression is not a multiclass classifier out of the box. As -a result it can identify only the first class. +Linear SVC is not a probabilistic classifier by default but it has a built-in +calibration option enabled in this example (`probability=True`). + +The logistic regression with One-Vs-Rest is not a multiclass classifier out of +the box. As a result it has more trouvle in separating class 2 and 3 than the +other estimators. """ print(__doc__) @@ -19,6 +23,7 @@ class dataset, and we classify it with a Support Vector classifier, L1 import matplotlib.pyplot as plt import numpy as np +from sklearn.metrics import accuracy_score from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from sklearn.gaussian_process import GaussianProcessClassifier @@ -31,19 +36,27 @@ class dataset, and we classify it with a Support Vector classifier, L1 n_features = X.shape[1] -C = 1.0 +C = 10 kernel = 1.0 * RBF([1.0, 1.0]) # for GPC -# Create different classifiers. The logistic regression cannot do -# multiclass out of the box. -classifiers = {'L1 logistic': LogisticRegression(C=C, penalty='l1'), - 'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2'), - 'Linear SVC': SVC(kernel='linear', C=C, probability=True, - random_state=0), - 'L2 logistic (Multinomial)': LogisticRegression( - C=C, solver='lbfgs', multi_class='multinomial'), - 'GPC': GaussianProcessClassifier(kernel) - } +# Create different classifiers. 
+classifiers = { + 'L1 logistic': LogisticRegression(C=C, penalty='l1', + solver='saga', + multi_class='multinomial', + max_iter=10000), + 'L2 logistic (Multinomial)': LogisticRegression(C=C, penalty='l2', + solver='saga', + multi_class='multinomial', + max_iter=10000), + 'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2', + solver='saga', + multi_class='ovr', + max_iter=10000), + 'Linear SVC': SVC(kernel='linear', C=C, probability=True, + random_state=0), + 'GPC': GaussianProcessClassifier(kernel) +} n_classifiers = len(classifiers) @@ -59,10 +72,10 @@ class dataset, and we classify it with a Support Vector classifier, L1 classifier.fit(X, y) y_pred = classifier.predict(X) - classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100 - print("classif_rate for %s : %f " % (name, classif_rate)) + accuracy = accuracy_score(y, y_pred) + print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100)) - # View probabilities= + # View probabilities: probas = classifier.predict_proba(Xfull) n_classes = np.unique(y_pred).size for k in range(n_classes): diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 73ee27f83a907..1da0c7e0d60e8 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -71,7 +71,7 @@ # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. clf = Pipeline(steps=[('preprocessor', preprocessor), - ('classifier', LogisticRegression())]) + ('classifier', LogisticRegression(solver='lbfgs'))]) X = data.drop('survived', axis=1) y = data['survived'] From 4b7483fc15f813acd449ab5099082b1213d470d4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 15:35:50 +0200 Subject: [PATCH 02/19] Use early stopped SGD (faster) and plot cross-validated error for best models --- examples/compose/plot_digits_pipe.py | 57 ++++++++++++++++++---------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/examples/compose/plot_digits_pipe.py b/examples/compose/plot_digits_pipe.py index 2352abba4584e..d1758c168c511 100644 --- a/examples/compose/plot_digits_pipe.py +++ b/examples/compose/plot_digits_pipe.py @@ -22,42 +22,57 @@ import numpy as np import matplotlib.pyplot as plt +import pandas as pd -from sklearn import linear_model, decomposition, datasets +from sklearn import datasets +from sklearn.decomposition import PCA +from sklearn.linear_model import SGDClassifier from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV -logistic = linear_model.LogisticRegression() -pca = decomposition.PCA() +# Define a pipeline to search for the best combination of PCA truncation +# and classifier regularization. 
+logistic = SGDClassifier(loss='log', penalty='l2', early_stopping=True,
+                         max_iter=10000, tol=1e-5, random_state=0)
+pca = PCA()
 pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

 digits = datasets.load_digits()
 X_digits = digits.data
 y_digits = digits.target

+# Parameters of pipelines can be set using ‘__’ separated parameter names:
+param_grid = {
+    'pca__n_components': [5, 20, 30, 40, 50, 64],
+    'logistic__alpha': np.logspace(-4, 4, 5),
+}
+search = GridSearchCV(pipe, param_grid, iid=False, cv=5,
+                      return_train_score=False)
+search.fit(X_digits, y_digits)
+print("Best parameter (CV score=%0.3f):" % search.best_score_)
+print(search.best_params_)
+
 # Plot the PCA spectrum
 pca.fit(X_digits)

-plt.figure(1, figsize=(4, 3))
-plt.clf()
-plt.axes([.2, .2, .7, .7])
-plt.plot(pca.explained_variance_, linewidth=2)
-plt.axis('tight')
-plt.xlabel('n_components')
-plt.ylabel('explained_variance_')
+fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(4, 5))
+ax0.plot(pca.explained_variance_ratio_, linewidth=2)
+ax0.set_ylabel('PCA explained variance')
+
+ax0.axvline(search.best_estimator_.named_steps['pca'].n_components,
+            linestyle=':', label='n_components chosen')
+ax0.legend(prop=dict(size=12))

-# Prediction
-n_components = [20, 40, 64]
-Cs = np.logspace(-4, 4, 3)
+# For each number of components, find the best classifier results
+results = pd.DataFrame(search.cv_results_)
+components_col = 'param_pca__n_components'
+best_clfs = results.groupby(components_col).apply(
+    lambda g: g.nlargest(1, 'mean_test_score'))

-# Parameters of pipelines can be set using ‘__’ separated parameter names:
-estimator = GridSearchCV(pipe,
-                         dict(pca__n_components=n_components,
-                              logistic__C=Cs), cv=5)
-estimator.fit(X_digits, y_digits)
+best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',
+               legend=False, ax=ax1)
+ax1.set_ylabel('Classification accuracy (val)')
+ax1.set_xlabel('n_components')
-plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
-            linestyle=':', label='n_components chosen')
-plt.legend(prop=dict(size=12))

 plt.show()

From 447d6f2af50a2ceafd3618a4d9d0d77c943359ca Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Wed, 19 Sep 2018 16:28:38 +0200
Subject: [PATCH 03/19] Fix LR solver in examples/ensemble/plot_voting_probas.py

---
 examples/ensemble/plot_feature_transformation.py | 15 +++++++--------
 examples/ensemble/plot_voting_probas.py          |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/examples/ensemble/plot_feature_transformation.py b/examples/ensemble/plot_feature_transformation.py
index 5dbc2754b3a35..085309ed2a942 100644
--- a/examples/ensemble/plot_feature_transformation.py
+++ b/examples/ensemble/plot_feature_transformation.py
@@ -42,19 +42,19 @@
 n_estimator = 10
 X, y = make_classification(n_samples=80000)
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
+
 # It is important to train the ensemble of trees on a different subset
 # of the training data than the linear regression model to avoid
 # overfitting, in particular if the total number of leaves is
 # similar to the number of training samples
-X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
-                                                            y_train,
-                                                            test_size=0.5)
+X_train, X_train_lr, y_train, y_train_lr = train_test_split(
+    X_train, y_train, test_size=0.5)

 # Unsupervised transformation based on totally random trees
 rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                           random_state=0)
-rt_lm = LogisticRegression()
+rt_lm =
LogisticRegression(solver='lbfgs', max_iter=1000) pipeline = make_pipeline(rt, rt_lm) pipeline.fit(X_train, y_train) y_pred_rt = pipeline.predict_proba(X_test)[:, 1] @@ -63,7 +63,7 @@ # Supervised transformation based on random forests rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator) rf_enc = OneHotEncoder(categories='auto') -rf_lm = LogisticRegression() +rf_lm = LogisticRegression(solver='lbfgs', max_iter=1000) rf.fit(X_train, y_train) rf_enc.fit(rf.apply(X_train)) rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr) @@ -71,9 +71,10 @@ y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1] fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm) +# Supervised transformation based on gradient boosted trees grd = GradientBoostingClassifier(n_estimators=n_estimator) grd_enc = OneHotEncoder(categories='auto') -grd_lm = LogisticRegression() +grd_lm = LogisticRegression(solver='lbfgs', max_iter=1000) grd.fit(X_train, y_train) grd_enc.fit(grd.apply(X_train)[:, :, 0]) grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr) @@ -82,12 +83,10 @@ grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1] fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm) - # The gradient boosted model by itself y_pred_grd = grd.predict_proba(X_test)[:, 1] fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd) - # The random forest model by itself y_pred_rf = rf.predict_proba(X_test)[:, 1] fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf) diff --git a/examples/ensemble/plot_voting_probas.py b/examples/ensemble/plot_voting_probas.py index c729818620a60..4916a00511702 100644 --- a/examples/ensemble/plot_voting_probas.py +++ b/examples/ensemble/plot_voting_probas.py @@ -29,7 +29,7 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import VotingClassifier -clf1 = LogisticRegression(random_state=123) +clf1 = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=123) clf2 = RandomForestClassifier(n_estimators=100, random_state=123) clf3 = GaussianNB() X = np.array([[-1.0, -1.0], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) From c54c34263cb9511bbfae5b4b0ed6d6e281ae2f2b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 16:33:28 +0200 Subject: [PATCH 04/19] Fix LR solver & scale data in plot_digits_classification_exercise.py --- examples/exercises/plot_digits_classification_exercise.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/exercises/plot_digits_classification_exercise.py b/examples/exercises/plot_digits_classification_exercise.py index 25ab7e71c5925..6651a1fa05783 100644 --- a/examples/exercises/plot_digits_classification_exercise.py +++ b/examples/exercises/plot_digits_classification_exercise.py @@ -15,7 +15,7 @@ from sklearn import datasets, neighbors, linear_model digits = datasets.load_digits() -X_digits = digits.data +X_digits = digits.data / digits.data.max() y_digits = digits.target n_samples = len(X_digits) @@ -26,7 +26,8 @@ y_test = y_digits[int(.9 * n_samples):] knn = neighbors.KNeighborsClassifier() -logistic = linear_model.LogisticRegression() +logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000, + multi_class='multinomial') print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test)) print('LogisticRegression score: %f' From c4bba067f4fd982f7bbd144ce253435b5c9684ab Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 16:37:43 +0200 Subject: [PATCH 05/19] Use saga solver in plot_logistic_l1_l2_sparsity.py --- 
examples/linear_model/plot_logistic_l1_l2_sparsity.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py index be63b144c260a..bffc648965fca 100644 --- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py +++ b/examples/linear_model/plot_logistic_l1_l2_sparsity.py @@ -37,10 +37,10 @@ # Set regularization parameter -for i, C in enumerate((100, 1, 0.01)): +for i, C in enumerate((1, 0.1, 0.01)): # turn down tolerance for short training time - clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01) - clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01) + clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='saga') + clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='saga') clf_l1_LR.fit(X, y) clf_l2_LR.fit(X, y) From dfb94e2b932c93d531ca2215f9b1c3a66386294a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 17:16:04 +0200 Subject: [PATCH 06/19] Use LBFGS solver in plot_iris_logistic.py --- examples/linear_model/plot_iris_logistic.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_iris_logistic.py b/examples/linear_model/plot_iris_logistic.py index d2193e9907b56..d3790370bbde5 100644 --- a/examples/linear_model/plot_iris_logistic.py +++ b/examples/linear_model/plot_iris_logistic.py @@ -7,29 +7,29 @@ ========================================================= Show below is a logistic-regression classifiers decision boundaries on the +first two dimensions (sepal length and width) of the iris data: `iris `_ dataset. The datapoints are colored according to their labels. """ print(__doc__) - # Code source: Gaël Varoquaux # Modified for documentation by Jaques Grobler # License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt -from sklearn import linear_model, datasets +from sklearn.linear_model import LogisticRegression +from sklearn import datasets # import some data to play with iris = datasets.load_iris() X = iris.data[:, :2] # we only take the first two features. +X /= X.max() # rescale to [0-1] range to speed-up convergence Y = iris.target -h = .02 # step size in the mesh - -logreg = linear_model.LogisticRegression(C=1e5) +logreg = LogisticRegression(C=1e5, solver='lbfgs') # we create an instance of Neighbours Classifier and fit the data. logreg.fit(X, Y) @@ -38,6 +38,7 @@ # point in the mesh [x_min, x_max]x[y_min, y_max]. 
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 +h = .02 # step size in the mesh xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()]) From c856d9ed847921e54e6c4cbcd18a41a53c99bda1 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 17:19:17 +0200 Subject: [PATCH 07/19] Use LBFGS in plot_logistic.py --- examples/linear_model/plot_logistic.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_logistic.py b/examples/linear_model/plot_logistic.py index 488f1c3543a6a..171a7135844be 100644 --- a/examples/linear_model/plot_logistic.py +++ b/examples/linear_model/plot_logistic.py @@ -23,8 +23,7 @@ from sklearn import linear_model -# this is our test set, it's just a straight line with some -# Gaussian noise +# General a toy dataset:s it's just a straight line with some Gaussian noise: xmin, xmax = -5, 5 n_samples = 100 np.random.seed(0) @@ -34,8 +33,9 @@ X += .3 * np.random.normal(size=n_samples) X = X[:, np.newaxis] -# run the classifier -clf = linear_model.LogisticRegression(C=1e5) + +# Fit the classifier +clf = linear_model.LogisticRegression(C=1e5, solver='lbfgs') clf.fit(X, y) # and plot the result @@ -47,6 +47,8 @@ def model(x): return 1 / (1 + np.exp(-x)) + + loss = model(X_test * clf.coef_ + clf.intercept_).ravel() plt.plot(X_test, loss, color='red', linewidth=3) From 03102e0fe6fa26295cfd8064070dd25eff124528 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 17:52:53 +0200 Subject: [PATCH 08/19] Use SAGA solver for Logistic Regression Path example --- examples/linear_model/plot_logistic_path.py | 42 +++++++++++++++------ 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py index 66a1ab9bd0254..c9bbf8720545e 100644 --- a/examples/linear_model/plot_logistic_path.py +++ b/examples/linear_model/plot_logistic_path.py @@ -1,10 +1,28 @@ #!/usr/bin/env python """ -================================= -Path with L1- Logistic Regression -================================= +============================================== +Regularization path of L1- Logistic Regression +============================================== -Computes path on IRIS dataset. + +Train l1-penalized logistic regression models on binary classification problem +derived from the Iris dataset. + +The models are ordered from strongest regularized to least regularized. The 4 +coefficients of the models are collected and plotted as a "regularization +path": on the left-hand side of the figure (strong regularizers), all the +coefficients are exactly 0. When regularization gets progressively looser, +coefficients can get non-zero values one after the other. + +Here we choose the SAGA solver because it can efficiently optimize for the +Logistic Regression loss with a non-smooth, sparsity inducing l1 penalty. + +Also note that we set a low value for the tolerance to make sure that the +model has converged before collecting the coefficients. + +We also use warm_start=True which mean that the coefficients of the models +are reused to initialize the next model fit and therefore speed-up the +computation of the full-path. 
""" print(__doc__) @@ -12,7 +30,7 @@ # Author: Alexandre Gramfort # License: BSD 3 clause -from datetime import datetime +from time import time import numpy as np import matplotlib.pyplot as plt @@ -27,26 +45,28 @@ X = X[y != 2] y = y[y != 2] -X -= np.mean(X, 0) +X /= X.max() # Normalize X to speed-up convergence # ############################################################################# # Demo path functions -cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3) +cs = l1_min_c(X, y, loss='log') * np.logspace(0, 7, 16) print("Computing regularization path ...") -start = datetime.now() -clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6) +start = time() +clf = linear_model.LogisticRegression(penalty='l1', solver='saga', + tol=1e-6, max_iter=int(1e6), + warm_start=True) coefs_ = [] for c in cs: clf.set_params(C=c) clf.fit(X, y) coefs_.append(clf.coef_.ravel().copy()) -print("This took ", datetime.now() - start) +print("This took %0.3fs" % (time() - start)) coefs_ = np.array(coefs_) -plt.plot(np.log10(cs), coefs_) +plt.plot(np.log10(cs), coefs_, marker='o') ymin, ymax = plt.ylim() plt.xlabel('log(C)') plt.ylabel('Coefficients') From 3509ac1a99c5483d22e719ccf4ab0229c8d79e96 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 18:01:41 +0200 Subject: [PATCH 09/19] Use LBFGS solver in plot_classifier_chain_yeast.py --- examples/multioutput/plot_classifier_chain_yeast.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/multioutput/plot_classifier_chain_yeast.py b/examples/multioutput/plot_classifier_chain_yeast.py index cb3a5085e316d..afe0131926dea 100644 --- a/examples/multioutput/plot_classifier_chain_yeast.py +++ b/examples/multioutput/plot_classifier_chain_yeast.py @@ -54,14 +54,15 @@ # Fit an independent logistic regression model for each class using the # OneVsRestClassifier wrapper. -ovr = OneVsRestClassifier(LogisticRegression()) +base_lr = LogisticRegression(solver='lbfgs') +ovr = OneVsRestClassifier(base_lr) ovr.fit(X_train, Y_train) Y_pred_ovr = ovr.predict(X_test) ovr_jaccard_score = jaccard_similarity_score(Y_test, Y_pred_ovr) # Fit an ensemble of logistic regression classifier chains and take the # take the average prediction of all the chains. 
-chains = [ClassifierChain(LogisticRegression(), order='random', random_state=i) +chains = [ClassifierChain(base_lr, order='random', random_state=i) for i in range(10)] for chain in chains: chain.fit(X_train, Y_train) From ec2918e0def0fb0db73d8b8a7b73d8bc14d51782 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 18:36:06 +0200 Subject: [PATCH 10/19] Use LBFGS solver in plot_rbm_logistic_classification.py --- .../plot_rbm_logistic_classification.py | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py index aa75ccc06d1f1..26223ad245214 100644 --- a/examples/neural_networks/plot_rbm_logistic_classification.py +++ b/examples/neural_networks/plot_rbm_logistic_classification.py @@ -40,6 +40,7 @@ from sklearn.model_selection import train_test_split from sklearn.neural_network import BernoulliRBM from sklearn.pipeline import Pipeline +from sklearn.base import clone # ############################################################################# @@ -67,29 +68,32 @@ def nudge_dataset(X, Y): [0, 0, 0], [0, 1, 0]]] - shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant', - weights=w).ravel() + def shift(x, w): + return convolve(x.reshape((8, 8)), mode='constant', weights=w).ravel() + X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors]) Y = np.concatenate([Y for _ in range(5)], axis=0) return X, Y + # Load Data digits = datasets.load_digits() X = np.asarray(digits.data, 'float32') X, Y = nudge_dataset(X, digits.target) X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 0-1 scaling -X_train, X_test, Y_train, Y_test = train_test_split(X, Y, - test_size=0.2, - random_state=0) +X_train, X_test, Y_train, Y_test = train_test_split( + X, Y, test_size=0.2, random_state=0) # Models we will use -logistic = linear_model.LogisticRegression() +logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=10000, + multi_class='multinomial') rbm = BernoulliRBM(random_state=0, verbose=True) -classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)]) +rbm_features_classifier = Pipeline( + steps=[('rbm', rbm), ('logistic', logistic)]) # ############################################################################# # Training @@ -102,28 +106,26 @@ def nudge_dataset(X, Y): # More components tend to give better prediction performance, but larger # fitting time rbm.n_components = 100 -logistic.C = 6000.0 +logistic.C = 6000 # Training RBM-Logistic Pipeline -classifier.fit(X_train, Y_train) +rbm_features_classifier.fit(X_train, Y_train) -# Training Logistic regression -logistic_classifier = linear_model.LogisticRegression(C=100.0) -logistic_classifier.fit(X_train, Y_train) +# Training the Logistic regression classifier directly on the pixel +raw_pixel_classifier = clone(logistic) +raw_pixel_classifier.C = 100. 
+raw_pixel_classifier.fit(X_train, Y_train) # ############################################################################# # Evaluation -print() +Y_pred = rbm_features_classifier.predict(X_test) print("Logistic regression using RBM features:\n%s\n" % ( - metrics.classification_report( - Y_test, - classifier.predict(X_test)))) + metrics.classification_report(Y_test, Y_pred))) +Y_pred = raw_pixel_classifier.predict(X_test) print("Logistic regression using raw pixel features:\n%s\n" % ( - metrics.classification_report( - Y_test, - logistic_classifier.predict(X_test)))) + metrics.classification_report(Y_test, Y_pred))) # ############################################################################# # Plotting From 30b56e5988b73d8d8c3cc8bfa0261786c8b4d875 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 18:43:14 +0200 Subject: [PATCH 11/19] typo --- examples/classification/plot_classification_probability.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/classification/plot_classification_probability.py b/examples/classification/plot_classification_probability.py index 6fb3785cb7ea5..ea4df9e6fb583 100644 --- a/examples/classification/plot_classification_probability.py +++ b/examples/classification/plot_classification_probability.py @@ -12,7 +12,7 @@ calibration option enabled in this example (`probability=True`). The logistic regression with One-Vs-Rest is not a multiclass classifier out of -the box. As a result it has more trouvle in separating class 2 and 3 than the +the box. As a result it has more trouble in separating class 2 and 3 than the other estimators. """ print(__doc__) From e76d5fc16fa6fe528fadfc375414e2eaf63c23a9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 19 Sep 2018 18:47:29 +0200 Subject: [PATCH 12/19] typo --- examples/linear_model/plot_logistic_path.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py index c9bbf8720545e..79b5522575eb0 100644 --- a/examples/linear_model/plot_logistic_path.py +++ b/examples/linear_model/plot_logistic_path.py @@ -5,8 +5,8 @@ ============================================== -Train l1-penalized logistic regression models on binary classification problem -derived from the Iris dataset. +Train l1-penalized logistic regression models on a binary classification +problem derived from the Iris dataset. The models are ordered from strongest regularized to least regularized. The 4 coefficients of the models are collected and plotted as a "regularization @@ -17,12 +17,12 @@ Here we choose the SAGA solver because it can efficiently optimize for the Logistic Regression loss with a non-smooth, sparsity inducing l1 penalty. -Also note that we set a low value for the tolerance to make sure that the -model has converged before collecting the coefficients. +Also note that we set a low value for the tolerance to make sure that the model +has converged before collecting the coefficients. -We also use warm_start=True which mean that the coefficients of the models -are reused to initialize the next model fit and therefore speed-up the -computation of the full-path. +We also use warm_start=True which means that the coefficients of the models are +reused to initialize the next model fit to speed-up the computation of the +full-path. 
""" print(__doc__) From 765826809f57f33d23c10ca7830cc615953ec3aa Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 20 Sep 2018 10:27:25 +0200 Subject: [PATCH 13/19] Bump up pandas dependency to 0.17.1 --- .circleci/config.yml | 2 +- README.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e1e410c440314..d627636a35279 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -47,7 +47,7 @@ jobs: - SCIPY_VERSION: 0.14 - MATPLOTLIB_VERSION: 1.3 - SCIKIT_IMAGE_VERSION: 0.9.3 - - PANDAS_VERSION: 0.13.1 + - PANDAS_VERSION: 0.17.1 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh diff --git a/README.rst b/README.rst index fa2ef793b9e26..b967a07ccc0c9 100644 --- a/README.rst +++ b/README.rst @@ -57,7 +57,7 @@ scikit-learn requires: Scikit-learn 0.21 and later will require Python 3.5 or newer. For running the examples Matplotlib >= 1.3.1 is required. A few examples -require scikit-image >= 0.9.3 and a few examples require pandas >= 0.13.1. +require scikit-image >= 0.9.3 and a few examples require pandas >= 0.17.1. scikit-learn also uses CBLAS, the C interface to the Basic Linear Algebra Subprograms library. scikit-learn comes with a reference implementation, but From 080160620ef45d27d991e2f986a02d6c513a184a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 20 Sep 2018 11:31:53 +0200 Subject: [PATCH 14/19] Bump up examples minimal deps to match pandas 0.17.1 --- .circleci/config.yml | 9 ++++----- README.rst | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index d627636a35279..6e9e75298e172 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -42,11 +42,10 @@ jobs: - MINICONDA_PATH: ~/miniconda - CONDA_ENV_NAME: testenv - PYTHON_VERSION: 2 - - NUMPY_VERSION: 1.8.2 - # XXX: plot_gpc_xor.py fails with scipy 0.13.3 - - SCIPY_VERSION: 0.14 - - MATPLOTLIB_VERSION: 1.3 - - SCIKIT_IMAGE_VERSION: 0.9.3 + - NUMPY_VERSION: 1.10 + - SCIPY_VERSION: 0.16 + - MATPLOTLIB_VERSION: 1.4 + - SCIKIT_IMAGE_VERSION: 0.11 - PANDAS_VERSION: 0.17.1 steps: - checkout diff --git a/README.rst b/README.rst index b967a07ccc0c9..b4d67af56eec8 100644 --- a/README.rst +++ b/README.rst @@ -56,8 +56,8 @@ scikit-learn requires: **Scikit-learn 0.20 is the last version to support Python2.7.** Scikit-learn 0.21 and later will require Python 3.5 or newer. -For running the examples Matplotlib >= 1.3.1 is required. A few examples -require scikit-image >= 0.9.3 and a few examples require pandas >= 0.17.1. +For running the examples Matplotlib >= 1.4 is required. A few examples +require scikit-image >= 0.11.3 and a few examples require pandas >= 0.17.1. scikit-learn also uses CBLAS, the C interface to the Basic Linear Algebra Subprograms library. 
scikit-learn comes with a reference implementation, but From 8134fa8d12201b16e630bd9db46c313af84141bb Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 20 Sep 2018 11:48:20 +0200 Subject: [PATCH 15/19] Fix figure layout for plot_digits_pipe.py --- examples/compose/plot_digits_pipe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/compose/plot_digits_pipe.py b/examples/compose/plot_digits_pipe.py index d1758c168c511..6e722c9861529 100644 --- a/examples/compose/plot_digits_pipe.py +++ b/examples/compose/plot_digits_pipe.py @@ -56,7 +56,7 @@ # Plot the PCA spectrum pca.fit(X_digits) -fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(4, 5)) +fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6)) ax0.plot(pca.explained_variance_ratio_, linewidth=2) ax0.set_ylabel('PCA explained variance') @@ -75,4 +75,5 @@ ax1.set_ylabel('Classification accuracy (val)') ax1.set_xlabel('n_components') +plt.tight_layout() plt.show() From 2539fc5537f4724e3d962ff4a46c64fa422eacd0 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 20 Sep 2018 11:49:04 +0200 Subject: [PATCH 16/19] Version numbers are not decimal numbers --- .circleci/config.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6e9e75298e172..0e77f30d18ed7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -41,12 +41,12 @@ jobs: # Test examples run with minimal dependencies - MINICONDA_PATH: ~/miniconda - CONDA_ENV_NAME: testenv - - PYTHON_VERSION: 2 - - NUMPY_VERSION: 1.10 - - SCIPY_VERSION: 0.16 - - MATPLOTLIB_VERSION: 1.4 - - SCIKIT_IMAGE_VERSION: 0.11 - - PANDAS_VERSION: 0.17.1 + - PYTHON_VERSION: "2" + - NUMPY_VERSION: "1.10" + - SCIPY_VERSION: "0.16" + - MATPLOTLIB_VERSION: "1.4" + - SCIKIT_IMAGE_VERSION: "0.11" + - PANDAS_VERSION: "0.17.1" steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh From c7a5c05fc70e1dea1479efdc1cd0b323e10d4153 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 20 Sep 2018 11:55:12 +0200 Subject: [PATCH 17/19] Set multinomial, no scaling to keep example simple, fix formatting of example doc --- examples/linear_model/plot_iris_logistic.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_iris_logistic.py b/examples/linear_model/plot_iris_logistic.py index d3790370bbde5..968598392722d 100644 --- a/examples/linear_model/plot_iris_logistic.py +++ b/examples/linear_model/plot_iris_logistic.py @@ -7,9 +7,9 @@ ========================================================= Show below is a logistic-regression classifiers decision boundaries on the -first two dimensions (sepal length and width) of the iris data: -`iris `_ dataset. The -datapoints are colored according to their labels. +first two dimensions (sepal length and width) of the `iris +`_ dataset. The datapoints +are colored according to their labels. """ print(__doc__) @@ -26,10 +26,9 @@ # import some data to play with iris = datasets.load_iris() X = iris.data[:, :2] # we only take the first two features. -X /= X.max() # rescale to [0-1] range to speed-up convergence Y = iris.target -logreg = LogisticRegression(C=1e5, solver='lbfgs') +logreg = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial') # we create an instance of Neighbours Classifier and fit the data. 
 logreg.fit(X, Y)

From 2bbc6cd5a4bdd20483f52645f2f9c13177fa8061 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 20 Sep 2018 13:28:54 +0200
Subject: [PATCH 18/19] Missing plt.tight_layout() in plot_voting_probas.py

---
 examples/ensemble/plot_voting_probas.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/ensemble/plot_voting_probas.py b/examples/ensemble/plot_voting_probas.py
index 4916a00511702..e38a618da3782 100644
--- a/examples/ensemble/plot_voting_probas.py
+++ b/examples/ensemble/plot_voting_probas.py
@@ -79,4 +79,5 @@
 plt.ylim([0, 1])
 plt.title('Class probabilities for sample 1 by different classifiers')
 plt.legend([p1[0], p2[0]], ['class 1', 'class 2'], loc='upper left')
+plt.tight_layout()
 plt.show()

From 79a97f94f8f58eb3313b52d05bfc0b634c14aa95 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 20 Sep 2018 13:33:12 +0200
Subject: [PATCH 19/19] Missing plt.tight_layout() in plot_logistic.py

---
 examples/linear_model/plot_logistic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/linear_model/plot_logistic.py b/examples/linear_model/plot_logistic.py
index 171a7135844be..6d94cb0548601 100644
--- a/examples/linear_model/plot_logistic.py
+++ b/examples/linear_model/plot_logistic.py
@@ -65,4 +65,5 @@ def model(x):
 plt.xlim(-4, 10)
 plt.legend(('Logistic Regression Model', 'Linear Regression Model'),
            loc="lower right", fontsize='small')
+plt.tight_layout()
 plt.show()
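
Note (not part of the patch series): the snippet below is a minimal, standalone sketch of the pattern the patches above apply, namely passing an explicit solver (and, for multiclass problems, an explicit multi_class) to LogisticRegression rather than relying on the library default. In scikit-learn 0.20 the default solver is still 'liblinear' and omitting the argument triggers a FutureWarning, since the default changes to 'lbfgs' in 0.22; spelling it out keeps the examples quiet and stable across releases. The dataset and parameter values used here are illustrative assumptions, not taken from any of the patched examples.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load a small multiclass dataset (an illustrative choice, not one of the
# datasets used in the patches above).
X, y = load_iris(return_X_y=True)

# Spell out the solver instead of relying on the default: lbfgs handles the
# smooth l2 penalty and the multinomial loss, while 'saga' would be the choice
# for an l1 penalty as in plot_logistic_l1_l2_sparsity.py.
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                         max_iter=1000)
clf.fit(X, y)
print("Training accuracy: %0.3f" % clf.score(X, y))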