Skip to content

Commit eb70f8a

Browse files
committed
Change default solver in LogisticRegression
1 parent bc07078 commit eb70f8a

File tree

6 files changed

+465
-82
lines changed

6 files changed

+465
-82
lines changed

benchmarks/bench_logistic_solvers.py

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
"""
2+
Benchmarks of sklearn solver in LogisticRegression.
3+
"""
4+
5+
# Author: Tom Dupre la Tour
6+
import time
7+
from os.path import expanduser
8+
9+
import matplotlib.pyplot as plt
10+
import scipy.sparse as sp # noqa
11+
import numpy as np
12+
import pandas as pd
13+
14+
from sklearn.datasets import fetch_mldata
15+
from sklearn.datasets import fetch_rcv1, load_iris, load_digits
16+
from sklearn.datasets import fetch_20newsgroups_vectorized
17+
from sklearn.exceptions import ConvergenceWarning
18+
from sklearn.externals.joblib import delayed, Parallel, Memory
19+
from sklearn.linear_model import LogisticRegression
20+
from sklearn.linear_model.logistic import _multinomial_loss
21+
from sklearn.model_selection import train_test_split
22+
from sklearn.preprocessing import LabelBinarizer
23+
from sklearn.preprocessing import MinMaxScaler # noqa
24+
from sklearn.utils.testing import ignore_warnings
25+
from sklearn.utils import shuffle
26+
27+
28+
def get_loss(coefs, intercepts, X, y, C, multi_class, penalty):
29+
if multi_class == 'ovr':
30+
if np.array(intercepts).ndim == 0 and intercepts == 0:
31+
intercepts = np.zeros(coefs.shape[0])
32+
loss = 0
33+
for ii, (coef, intercept) in enumerate(zip(coefs, intercepts)):
34+
y_bin = y.copy()
35+
y_bin[y == ii] = 1
36+
y_bin[y != ii] = -1
37+
loss += np.sum(
38+
np.log(1. + np.exp(-y_bin * (X.dot(coef) + intercept))))
39+
40+
if penalty == 'l2':
41+
loss += 0.5 * coef.dot(coef) / C
42+
else:
43+
loss += np.sum(np.abs(coef)) / C
44+
else:
45+
coefs_and_intercept = np.vstack((coefs.T, intercepts.T)).T.ravel()
46+
lbin = LabelBinarizer()
47+
Y_multi = lbin.fit_transform(y)
48+
if Y_multi.shape[1] == 1:
49+
Y_multi = np.hstack([1 - Y_multi, Y_multi])
50+
loss, _, _ = _multinomial_loss(coefs_and_intercept, X, Y_multi, 0,
51+
np.ones(X.shape[0]))
52+
coefs = coefs.ravel()
53+
if penalty == 'l2':
54+
loss += 0.5 * coefs.dot(coefs) / C
55+
else:
56+
loss += np.sum(np.abs(coefs)) / C
57+
58+
loss /= X.shape[0]
59+
60+
return loss
61+
62+
63+
def fit_single(solver, X, y, X_shape, dataset, penalty='l2',
64+
multi_class='multinomial', C=1, max_iter=10):
65+
assert X.shape == X_shape
66+
67+
# if not sp.issparse(X):
68+
# X = MinMaxScaler().fit_transform(X)
69+
70+
X_train, X_test, y_train, y_test = train_test_split(
71+
X, y, random_state=42, stratify=y)
72+
train_scores, train_losses, test_scores, times = [], [], [], []
73+
74+
if solver == 'newton-cg':
75+
max_iter /= 2
76+
77+
n_repeats = None
78+
max_iter_range = np.unique(np.int_(np.logspace(0, np.log10(max_iter), 10)))
79+
for this_max_iter in max_iter_range:
80+
msg = ('[%s, %s, %s, %s] Max iter: %s' % (dataset, multi_class, solver,
81+
penalty, this_max_iter))
82+
lr = LogisticRegression(solver=solver, multi_class=multi_class, C=C,
83+
penalty=penalty, fit_intercept=False,
84+
tol=1e-24, max_iter=this_max_iter,
85+
random_state=42, intercept_scaling=10000)
86+
t0 = time.clock()
87+
try:
88+
if penalty == 'l1' and multi_class == 'multinomial':
89+
raise ValueError('skip as only saga is available.')
90+
91+
with ignore_warnings(category=ConvergenceWarning):
92+
# first time for timing
93+
if n_repeats is None:
94+
t0 = time.clock()
95+
lr.fit(X_train, y_train)
96+
max_iter_duration = max_iter * (time.clock() - t0)
97+
n_repeats = max(1, int(1. / max_iter_duration))
98+
99+
t0 = time.clock()
100+
for _ in range(n_repeats):
101+
lr.fit(X_train, y_train)
102+
train_time = (time.clock() - t0) / n_repeats
103+
print('%s (repeat=%d)' % (msg, n_repeats))
104+
105+
except ValueError:
106+
train_score = np.nan
107+
train_loss = np.nan
108+
test_score = np.nan
109+
train_time = np.nan
110+
print('%s (skipped)' % (msg, ))
111+
continue
112+
113+
train_loss = get_loss(lr.coef_, lr.intercept_, X_train, y_train, C,
114+
multi_class, penalty)
115+
train_score = lr.score(X_train, y_train)
116+
test_score = lr.score(X_test, y_test)
117+
118+
train_scores.append(train_score)
119+
train_losses.append(train_loss)
120+
test_scores.append(test_score)
121+
times.append(train_time)
122+
123+
return (solver, penalty, dataset, multi_class, times, train_losses,
124+
train_scores, test_scores)
125+
126+
127+
def load_dataset(dataset, n_samples_max):
128+
if dataset == 'rcv1':
129+
rcv1 = fetch_rcv1()
130+
X = rcv1.data
131+
y = rcv1.target
132+
133+
# take only 3 categories (CCAT, ECAT, MCAT)
134+
y = y[:, [1, 4, 10]].astype(np.float64)
135+
# remove samples that have more than one category
136+
mask = np.asarray(y.sum(axis=1) == 1).ravel()
137+
y = y[mask, :].indices
138+
X = X[mask, :]
139+
140+
elif dataset == 'mnist':
141+
mnist = fetch_mldata('MNIST original')
142+
X, y = shuffle(mnist.data, mnist.target, random_state=42)
143+
X = X.astype(np.float64)
144+
145+
elif dataset == 'digits':
146+
digits = load_digits()
147+
X, y = digits.data, digits.target
148+
149+
elif dataset == 'iris':
150+
iris = load_iris()
151+
X, y = iris.data, iris.target
152+
153+
elif dataset == '20news':
154+
ng = fetch_20newsgroups_vectorized()
155+
X = ng.data
156+
y = ng.target
157+
158+
X = X[:n_samples_max]
159+
y = y[:n_samples_max]
160+
161+
return X, y
162+
163+
164+
def run(solvers, penalties, multi_classes, n_samples_max, max_iter, datasets,
165+
n_jobs):
166+
mem = Memory(cachedir=expanduser('~/cache'), verbose=0)
167+
168+
results = []
169+
for dataset in datasets:
170+
for multi_class in multi_classes:
171+
X, y = load_dataset(dataset, n_samples_max)
172+
173+
cached_fit = mem.cache(fit_single, ignore=['X'])
174+
cached_fit = fit_single
175+
176+
out = Parallel(n_jobs=n_jobs, mmap_mode=None)(delayed(cached_fit)(
177+
solver, X, y, X.shape, dataset=dataset, penalty=penalty,
178+
multi_class=multi_class, C=1, max_iter=max_iter)
179+
for solver in solvers
180+
for penalty in penalties) # yapf: disable
181+
182+
results.extend(out)
183+
184+
columns = ("solver penalty dataset multi_class times "
185+
"train_losses train_scores test_scores").split()
186+
results_df = pd.DataFrame(out, columns=columns)
187+
plot(results_df)
188+
189+
190+
def plot(res):
191+
res.set_index(['dataset', 'multi_class', 'penalty'], inplace=True)
192+
193+
grouped = res.groupby(level=['dataset', 'multi_class', 'penalty'])
194+
195+
colors = {
196+
'sag': 'red',
197+
'saga': 'orange',
198+
'liblinear': 'blue',
199+
'lbfgs': 'green',
200+
'newton-cg': 'darkviolet',
201+
'auto': 'black',
202+
}
203+
204+
for idx, group in grouped:
205+
dataset, multi_class, penalty = idx
206+
fig = plt.figure(figsize=(12, 4))
207+
208+
# -----------------------
209+
ax = fig.add_subplot(131)
210+
train_losses = group['train_losses']
211+
tmp = np.sort(np.concatenate(train_losses.values))
212+
if tmp.size == 0:
213+
plt.close(fig)
214+
continue
215+
ref = 2 * tmp[0] - tmp[1]
216+
217+
for losses, times, solver in zip(group['train_losses'], group['times'],
218+
group['solver']):
219+
losses = losses - ref
220+
linestyle = ':' if solver == 'auto' else '-'
221+
ax.plot(times, losses, label=solver, color=colors[solver],
222+
linestyle=linestyle, marker='.')
223+
ax.set_xlabel('Time (s)')
224+
ax.set_ylabel('Training objective (relative to min)')
225+
ax.set_yscale('log')
226+
227+
# -----------------------
228+
ax = fig.add_subplot(132)
229+
230+
for train_score, times, solver in zip(group['train_scores'],
231+
group['times'], group['solver']):
232+
linestyle = ':' if solver == 'auto' else '-'
233+
ax.plot(times, train_score, label=solver, color=colors[solver],
234+
linestyle=linestyle, marker='.')
235+
ax.set_xlabel('Time (s)')
236+
ax.set_ylabel('Train score')
237+
238+
# -----------------------
239+
ax = fig.add_subplot(133)
240+
241+
for test_score, times, solver in zip(group['test_scores'],
242+
group['times'], group['solver']):
243+
linestyle = ':' if solver == 'auto' else '-'
244+
ax.plot(times, test_score, label=solver, color=colors[solver],
245+
linestyle=linestyle, marker='.')
246+
ax.set_xlabel('Time (s)')
247+
ax.set_ylabel('Test score')
248+
ax.legend()
249+
250+
# -----------------------
251+
name = '%s_%s_%s' % (multi_class, penalty, dataset)
252+
plt.suptitle(name)
253+
fig.tight_layout()
254+
fig.subplots_adjust(top=0.9)
255+
plt.savefig('figures/' + name + '.png')
256+
plt.close(fig)
257+
print('SAVED: ' + name)
258+
259+
260+
if __name__ == '__main__':
261+
n_jobs = 3
262+
max_iter = 50
263+
solvers = ['liblinear', 'saga', 'sag', 'lbfgs', 'newton-cg', 'auto']
264+
penalties = ['l2', 'l1']
265+
multi_classes = ['multinomial', 'ovr']
266+
datasets = ['iris', 'digits', 'mnist', '20news', 'rcv1']
267+
268+
run(solvers, penalties, multi_classes, n_samples_max=None, n_jobs=n_jobs,
269+
datasets=datasets, max_iter=max_iter)

doc/modules/linear_model.rst

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -773,19 +773,30 @@ The "saga" solver [7]_ is a variant of "sag" that also supports the
773773
non-smooth `penalty="l1"` option. This is therefore the solver of choice
774774
for sparse multinomial logistic regression.
775775

776-
In a nutshell, one may choose the solver with the following rules:
777-
778-
================================= =====================================
779-
Case Solver
780-
================================= =====================================
781-
L1 penalty "liblinear" or "saga"
782-
Multinomial loss "lbfgs", "sag", "saga" or "newton-cg"
783-
Very Large dataset (`n_samples`) "sag" or "saga"
784-
================================= =====================================
776+
In a nutshell, the following table summarizes the solvers characteristics:
777+
778+
============================ =========== ======= =========== ===== ======
779+
solver 'liblinear' 'lbfgs' 'newton-cg' 'sag' 'saga'
780+
============================ =========== ======= =========== ===== ======
781+
Multinomial + L2 penalty no yes yes yes yes
782+
OVR + L2 penalty yes yes yes yes yes
783+
Multinomial + L1 penalty no no no no yes
784+
OVR + L1 penalty yes no no no yes
785+
============================ =========== ======= =========== ===== ======
786+
Penalize the intercept (bad) yes no no no no
787+
Faster for large datasets no no no yes yes
788+
Robust to unscaled datasets yes yes yes no no
789+
============================ =========== ======= =========== ===== ======
785790

786791
The "saga" solver is often the best choice. The "liblinear" solver is
787792
used by default for historical reasons.
788793

794+
The default solver will change to "auto" in version 0.22. This option
795+
automatically selects a good solver based on both `penalty` and `multi_class`
796+
parameters, and on the size of the training set. Note that the "auto" behavior
797+
may change without notice in the future, leading to similar but not necessarily
798+
exact same solutions.
799+
789800
For large dataset, you may also consider using :class:`SGDClassifier`
790801
with 'log' loss.
791802

doc/tutorial/statistical_inference/supervised_learning.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -372,8 +372,8 @@ function or **logistic** function:
372372
>>> logistic.fit(iris_X_train, iris_y_train)
373373
LogisticRegression(C=100000.0, class_weight=None, dual=False,
374374
fit_intercept=True, intercept_scaling=1, max_iter=100,
375-
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
376-
solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
375+
multi_class='default', n_jobs=1, penalty='l2', random_state=None,
376+
solver='default', tol=0.0001, verbose=0, warm_start=False)
377377

378378
This is known as :class:`LogisticRegression`.
379379

0 commit comments

Comments
 (0)