Description
The objective function of `LogisticRegression` is `C * sum(log_loss) + penalty`. For LBFGS, the reformulation (having the same argmin) is much more favorable: `1/N * sum(log_loss) + 1/(N*C) * penalty`.
Note that the division by C is already done for all solvers but `"liblinear"`.
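As a quick sanity check (a minimal sketch on synthetic data, not part of the proposal; the toy problem and the direct use of `scipy.optimize.minimize` are my own), the two formulations share the same minimizer, only the scale of the objective and gradient changes:

```python
import numpy as np
from scipy.optimize import minimize
from scipy.special import expit

rng = np.random.RandomState(0)
N, C = 1_000, 0.5
X = rng.standard_normal((N, 3))
y = (rng.rand(N) < expit(X @ np.array([1.0, -2.0, 0.5]))).astype(float)

def log_loss_sum(w):
    # sum of per-sample binary log loss, numerically stable
    z = X @ w
    return np.sum(np.logaddexp(0.0, z) - y * z)

def log_loss_sum_grad(w):
    return X.T @ (expit(X @ w) - y)

# objective as documented: C * sum(log_loss) + 0.5 * ||w||^2
f1 = lambda w: C * log_loss_sum(w) + 0.5 * w @ w
g1 = lambda w: C * log_loss_sum_grad(w) + w
# rescaled objective with the same argmin: 1/N * sum(log_loss) + 1/(N*C) * 0.5 * ||w||^2
f2 = lambda w: log_loss_sum(w) / N + 0.5 * (w @ w) / (N * C)
g2 = lambda w: log_loss_sum_grad(w) / N + w / (N * C)

opts = {"gtol": 1e-10, "maxiter": 10_000}
w1 = minimize(f1, np.zeros(3), jac=g1, method="L-BFGS-B", options=opts).x
w2 = minimize(f2, np.zeros(3), jac=g2, method="L-BFGS-B", options=opts).x
print(np.allclose(w1, w2, atol=1e-6))  # same minimizer, different objective scale
```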
Proposed Action
Similar to `_GeneralizedLinearRegressor`, internally in `_logistic_regression_path`, use

```python
if solver == "lbfgs":
    if sample_weight is None:
        sample_weight = np.full_like(y, 1 / y.shape[0])
    else:
        sample_weight = sample_weight / sample_weight.sum()
```
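For intuition (my own illustration of the argument, not code from the issue): LBFGS stops when the gradient magnitude falls below its tolerance, so with `sum(log_loss)` the effective strictness of `tol` degrades as the dataset grows, while with the per-sample mean it stays comparable across dataset sizes. A small sketch of that scaling effect:

```python
import numpy as np
from scipy.special import expit

rng = np.random.RandomState(42)
w = np.array([0.5, -1.0, 0.25])  # some fixed (non-optimal) coefficient vector

for n in (1_000, 100_000):
    X = rng.standard_normal((n, 3))
    y = (rng.rand(n) < expit(X @ np.array([1.0, -2.0, 0.5]))).astype(float)
    grad_sum = X.T @ (expit(X @ w) - y)  # gradient of sum(log_loss): grows ~linearly with n
    grad_mean = grad_sum / n             # gradient of mean(log_loss): roughly constant in n
    print(f"{n=:>7}  max|grad_sum|={np.abs(grad_sum).max():.2e}  "
          f"max|grad_mean|={np.abs(grad_mean).max():.2e}")
```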
Detailed Background
This unfavorable behaviour was noted in #23314 (comment).
```python
import warnings
from pathlib import Path
import numpy as np
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn._loss import HalfBinomialLoss
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.linear_model._glm.glm import _GeneralizedLinearRegressor
from time import perf_counter
import pandas as pd
import joblib


class BinomialRegressor(_GeneralizedLinearRegressor):
    def _get_loss(self):
        return HalfBinomialLoss()


@joblib.Memory(location=".").cache
def prepare_data():
    df = fetch_openml(data_id=41214).frame
    df["Frequency"] = df["ClaimNb"] / df["Exposure"]
    log_scale_transformer = make_pipeline(
        FunctionTransformer(np.log, validate=False), StandardScaler()
    )
    linear_model_preprocessor = ColumnTransformer(
        [
            ("passthrough_numeric", "passthrough", ["BonusMalus"]),
            (
                "binned_numeric",
                KBinsDiscretizer(n_bins=10, subsample=None),
                ["VehAge", "DrivAge"],
            ),
            ("log_scaled_numeric", log_scale_transformer, ["Density"]),
            (
                "onehot_categorical",
                OneHotEncoder(),
                ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
            ),
        ],
        remainder="drop",
    )
    y = df["Frequency"]
    w = df["Exposure"]
    X = linear_model_preprocessor.fit_transform(df)
    return X, y, w


X, y, w = prepare_data()
y = y.values
y = (y > np.quantile(y, q=0.95)).astype(np.float64)
print(y.mean())

# Make sure, we use equivalent penalties
alpha = 1e2
C = 1 / alpha / X.shape[0]
lr1 = LogisticRegression(C=C, tol=1e-12, max_iter=10000).fit(X, y)
lr2 = BinomialRegressor(alpha=alpha, tol=1e-12, solver="newton-cholesky").fit(X, y)
assert np.allclose(lr1.intercept_, lr2.intercept_)

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, train_size=50_000, test_size=10_000, random_state=0
)
print(f"{X_train.shape = }")

results = []
loss_sw = np.full_like(y_train, fill_value=(1.0 / y_train.shape[0]))
# loss_sw = None
alpha = 1e-12
C = 1. / alpha / X.shape[0]
max_iter = 10_000

for tol in np.logspace(-1, -10, 10):
    for solver in ["lbfgs", "lbfgs2", "newton-cg", "newton-cholesky"]:
        tic = perf_counter()
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            if solver in ["lbfgs", "newton-cg"]:
                model = LogisticRegression(
                    C=C, solver=solver, tol=tol, max_iter=max_iter
                ).fit(X_train, y_train)
                toc = perf_counter()
            else:
                if solver == "lbfgs2":
                    binomial_solver = "lbfgs"
                else:
                    binomial_solver = solver
                model = BinomialRegressor(
                    solver=binomial_solver, alpha=alpha, tol=tol, max_iter=max_iter
                ).fit(X_train, y_train)
                toc = perf_counter()
        train_time = toc - tic
        lml = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=True)
        train_loss = lml.loss(
            coef=np.r_[model.coef_.ravel(), model.intercept_.ravel()],
            X=X_train,
            y=y_train.astype(np.float64),
            sample_weight=loss_sw,
            l2_reg_strength=alpha,
        )
        result = {
            "solver": solver,
            "tol": tol,
            "train_time": train_time,
            "train_loss": train_loss,
            "n_iter": int(np.squeeze(model.n_iter_)),
            "converged": np.squeeze(model.n_iter_ < model.max_iter).all(),
            "coef_sq_norm": (model.coef_**2).sum(),
        }
        print(result)
        results.append(result)

results = pd.DataFrame.from_records(results)

# %%
import pandas as pd
from pathlib import Path

#results = pd.read_csv(Path(__file__).parent / "bench_quantile_clf.csv")
results["suboptimality"] = results["train_loss"] - results["train_loss"].min() + 1e-12
results

# %%
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
for label, group in results.groupby("solver"):
    group.sort_values("tol").plot(x="n_iter", y="suboptimality", loglog=True, marker="o", label=label, ax=ax)
ax.set_ylabel("suboptimality")
ax.set_title(f"LogReg Solver Profile alpha={alpha}")

# %%
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
for label, group in results.groupby("solver"):
    group.sort_values("tol").plot(x="train_time", y="suboptimality", loglog=True, marker="o", label=label, ax=ax)
ax.set_ylabel("suboptimality")
ax.set_title(f"LogReg Solver Profile alpha={alpha}")
```

In the resulting plots, `"lbfgs"` is `LogisticRegression(solver="lbfgs")`, while `"lbfgs2"` is the `BinomialRegressor` defined above (a `_GeneralizedLinearRegressor` with `HalfBinomialLoss`) fitted with `solver="lbfgs"`.

For the same kind of solver performance profile on the kicks dataset, see https://github.com/lorentzenchr/notebooks/blob/master/bench_glm_newton_solvers_cholesky_lsmr.ipynb.
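For reference, the comparability of `"lbfgs"` and `"lbfgs2"` rests on the penalty correspondence used in the script, `C = 1 / (alpha * n_samples)`. A minimal, self-contained check of that mapping on synthetic data (my own toy example, mirroring the `assert np.allclose(...)` in the script above):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model._glm.glm import _GeneralizedLinearRegressor
from sklearn._loss import HalfBinomialLoss


class BinomialRegressor(_GeneralizedLinearRegressor):
    def _get_loss(self):
        return HalfBinomialLoss()


X, y = make_classification(n_samples=2_000, n_features=5, random_state=0)
y = y.astype(np.float64)

alpha = 1e1
C = 1 / (alpha * X.shape[0])  # penalty strength equivalent to the GLM's alpha

lr = LogisticRegression(C=C, tol=1e-10, max_iter=10_000).fit(X, y)
glm = BinomialRegressor(alpha=alpha, tol=1e-10, max_iter=10_000).fit(X, y)

# With equivalent penalties, both estimators should find (nearly) the same solution.
print(np.abs(lr.coef_.ravel() - glm.coef_).max())
print(abs(lr.intercept_[0] - glm.intercept_))
```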