Description
The objective function of `LogisticRegression` is `C * sum(log_loss) + penalty`. For LBFGS, the reformulation (having the same argmin) is much more favorable: `1/N * sum(log_loss) + 1/(N*C) * penalty`.
Note that the division by C is already done for all solvers but `"liblinear"`.
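As a quick sanity check (a minimal sketch on synthetic data, not part of the proposal; the toy problem and the direct use of `scipy.optimize.minimize` are my own), the two formulations share the same minimizer, only the scale of the objective and gradient changes:

```python
import numpy as np
from scipy.optimize import minimize
from scipy.special import expit

rng = np.random.RandomState(0)
N, C = 1_000, 0.5
X = rng.standard_normal((N, 3))
y = (rng.rand(N) < expit(X @ np.array([1.0, -2.0, 0.5]))).astype(float)

def log_loss_sum(w):
    # sum of per-sample binary log loss, numerically stable
    z = X @ w
    return np.sum(np.logaddexp(0.0, z) - y * z)

def log_loss_sum_grad(w):
    return X.T @ (expit(X @ w) - y)

# objective as documented: C * sum(log_loss) + 0.5 * ||w||^2
f1 = lambda w: C * log_loss_sum(w) + 0.5 * w @ w
g1 = lambda w: C * log_loss_sum_grad(w) + w
# rescaled objective with the same argmin: 1/N * sum(log_loss) + 1/(N*C) * 0.5 * ||w||^2
f2 = lambda w: log_loss_sum(w) / N + 0.5 * (w @ w) / (N * C)
g2 = lambda w: log_loss_sum_grad(w) / N + w / (N * C)

opts = {"gtol": 1e-10, "maxiter": 10_000}
w1 = minimize(f1, np.zeros(3), jac=g1, method="L-BFGS-B", options=opts).x
w2 = minimize(f2, np.zeros(3), jac=g2, method="L-BFGS-B", options=opts).x
print(np.allclose(w1, w2, atol=1e-6))  # same minimizer, different objective scale
```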
Proposed Action
Similar to `_GeneralizedLinearRegressor`, internally in `_logistic_regression_path`, use

```python
if solver == "lbfgs":
    if sample_weight is None:
        sample_weight = np.full_like(y, 1 / y.shape[0])
    else:
        sample_weight = sample_weight / sample_weight.sum()
```
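For intuition (my own illustration of the argument, not code from the issue): LBFGS stops when the gradient magnitude falls below its tolerance, so with `sum(log_loss)` the effective strictness of `tol` degrades as the dataset grows, while with the per-sample mean it stays comparable across dataset sizes. A small sketch of that scaling effect:

```python
import numpy as np
from scipy.special import expit

rng = np.random.RandomState(42)
w = np.array([0.5, -1.0, 0.25])  # some fixed (non-optimal) coefficient vector

for n in (1_000, 100_000):
    X = rng.standard_normal((n, 3))
    y = (rng.rand(n) < expit(X @ np.array([1.0, -2.0, 0.5]))).astype(float)
    grad_sum = X.T @ (expit(X @ w) - y)  # gradient of sum(log_loss): grows ~linearly with n
    grad_mean = grad_sum / n             # gradient of mean(log_loss): roughly constant in n
    print(f"{n=:>7}  max|grad_sum|={np.abs(grad_sum).max():.2e}  "
          f"max|grad_mean|={np.abs(grad_mean).max():.2e}")
```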
Detailed Background
This unfavorable behaviour was noted in #23314 (comment).
```python
import warnings
from pathlib import Path
import numpy as np
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn._loss import HalfBinomialLoss
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.linear_model._glm.glm import _GeneralizedLinearRegressor
from time import perf_counter
import pandas as pd
import joblib


class BinomialRegressor(_GeneralizedLinearRegressor):
    def _get_loss(self):
        return HalfBinomialLoss()


@joblib.Memory(location=".").cache
def prepare_data():
    df = fetch_openml(data_id=41214).frame
    df["Frequency"] = df["ClaimNb"] / df["Exposure"]
    log_scale_transformer = make_pipeline(
        FunctionTransformer(np.log, validate=False), StandardScaler()
    )
    linear_model_preprocessor = ColumnTransformer(
        [
            ("passthrough_numeric", "passthrough", ["BonusMalus"]),
            (
                "binned_numeric",
                KBinsDiscretizer(n_bins=10, subsample=None),
                ["VehAge", "DrivAge"],
            ),
            ("log_scaled_numeric", log_scale_transformer, ["Density"]),
            (
                "onehot_categorical",
                OneHotEncoder(),
                ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
            ),
        ],
        remainder="drop",
    )
    y = df["Frequency"]
    w = df["Exposure"]
    X = linear_model_preprocessor.fit_transform(df)
    return X, y, w


X, y, w = prepare_data()
y = y.values
y = (y > np.quantile(y, q=0.95)).astype(np.float64)
print(y.mean())

# Make sure, we use equivalent penalties
alpha = 1e2
C = 1 / alpha / X.shape[0]
lr1 = LogisticRegression(C=C, tol=1e-12, max_iter=10000).fit(X, y)
lr2 = BinomialRegressor(alpha=alpha, tol=1e-12, solver="newton-cholesky").fit(X, y)
assert np.allclose(lr1.intercept_, lr2.intercept_)

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, train_size=50_000, test_size=10_000, random_state=0
)
print(f"{X_train.shape = }")

results = []
loss_sw = np.full_like(y_train, fill_value=(1.0 / y_train.shape[0]))
# loss_sw = None
alpha = 1e-12
C = 1. / alpha / X.shape[0]
max_iter = 10_000

for tol in np.logspace(-1, -10, 10):
    for solver in ["lbfgs", "lbfgs2", "newton-cg", "newton-cholesky"]:
        tic = perf_counter()
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            if solver in ["lbfgs", "newton-cg"]:
                model = LogisticRegression(
                    C=C, solver=solver, tol=tol, max_iter=max_iter
                ).fit(X_train, y_train)
                toc = perf_counter()
            else:
                if solver == "lbfgs2":
                    binomial_solver = "lbfgs"
                else:
                    binomial_solver = solver
                model = BinomialRegressor(
                    solver=binomial_solver, alpha=alpha, tol=tol, max_iter=max_iter
                ).fit(X_train, y_train)
                toc = perf_counter()
        train_time = toc - tic
        lml = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=True)
        train_loss = lml.loss(
            coef=np.r_[model.coef_.ravel(), model.intercept_.ravel()],
            X=X_train,
            y=y_train.astype(np.float64),
            sample_weight=loss_sw,
            l2_reg_strength=alpha,
        )
        result = {
            "solver": solver,
            "tol": tol,
            "train_time": train_time,
            "train_loss": train_loss,
            "n_iter": int(np.squeeze(model.n_iter_)),
            "converged": np.squeeze(model.n_iter_ < model.max_iter).all(),
            "coef_sq_norm": (model.coef_**2).sum(),
        }
        print(result)
        results.append(result)

results = pd.DataFrame.from_records(results)

# %%
import pandas as pd
from pathlib import Path

#results = pd.read_csv(Path(__file__).parent / "bench_quantile_clf.csv")
results["suboptimality"] = results["train_loss"] - results["train_loss"].min() + 1e-12
results

# %%
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
for label, group in results.groupby("solver"):
    group.sort_values("tol").plot(x="n_iter", y="suboptimality", loglog=True, marker="o", label=label, ax=ax)
ax.set_ylabel("suboptimality")
ax.set_title(f"LogReg Solver Profile alpha={alpha}")

# %%
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
for label, group in results.groupby("solver"):
    group.sort_values("tol").plot(x="train_time", y="suboptimality", loglog=True, marker="o", label=label, ax=ax)
ax.set_ylabel("suboptimality")
ax.set_title(f"LogReg Solver Profile alpha={alpha}")
```

In the resulting plots, `"lbfgs"` is `LogisticRegression(solver="lbfgs")`, while `"lbfgs2"` is the `BinomialRegressor` defined above (a `_GeneralizedLinearRegressor` with `HalfBinomialLoss`) fitted with `solver="lbfgs"`.

For the same kind of solver performance profile on the kicks dataset, see https://github.com/lorentzenchr/notebooks/blob/master/bench_glm_newton_solvers_cholesky_lsmr.ipynb.
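For reference, the comparability of `"lbfgs"` and `"lbfgs2"` rests on the penalty correspondence used in the script, `C = 1 / (alpha * n_samples)`. A minimal, self-contained check of that mapping on synthetic data (my own toy example, mirroring the `assert np.allclose(...)` in the script above):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model._glm.glm import _GeneralizedLinearRegressor
from sklearn._loss import HalfBinomialLoss


class BinomialRegressor(_GeneralizedLinearRegressor):
    def _get_loss(self):
        return HalfBinomialLoss()


X, y = make_classification(n_samples=2_000, n_features=5, random_state=0)
y = y.astype(np.float64)

alpha = 1e1
C = 1 / (alpha * X.shape[0])  # penalty strength equivalent to the GLM's alpha

lr = LogisticRegression(C=C, tol=1e-10, max_iter=10_000).fit(X, y)
glm = BinomialRegressor(alpha=alpha, tol=1e-10, max_iter=10_000).fit(X, y)

# With equivalent penalties, both estimators should find (nearly) the same solution.
print(np.abs(lr.coef_.ravel() - glm.coef_).max())
print(abs(lr.intercept_[0] - glm.intercept_))
```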