
Commit 217fe94

FIX LogisticRegression warm start with newton-cholesky solver (#31866)

1 parent ba3e753 · commit 217fe94

3 files changed: +56 −5 lines
Changelog entry (new file)

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+- Fixed a bug in :class:`linear_model.LogisticRegression` when used with
+  `solver="newton-cholesky"` and `warm_start=True` on multi-class problems, either
+  with `fit_intercept=True` or with `penalty=None` (both resulting in unpenalized
+  parameters for the solver). The coefficients and intercepts of the last class, as
+  provided by the warm start, were partially and wrongly overwritten by zeros.
+  By :user:`Christian Lorentzen <lorentzenchr>`
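
For context, the affected configurations can be reproduced with a short sketch (illustrative only, not part of the commit; any multi-class dataset works):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    X, y = load_iris(return_X_y=True)  # 3 classes

    # Case 1: penalized coefficients but unpenalized intercepts.
    clf = LogisticRegression(solver="newton-cholesky", warm_start=True, fit_intercept=True)
    # Case 2: fully unpenalized parameters.
    # clf = LogisticRegression(solver="newton-cholesky", warm_start=True, penalty=None)

    clf.fit(X, y)
    clf.fit(X, y)  # before this fix, parts of the last class's warm-started
                   # coefficients and intercepts were overwritten by zeros here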

sklearn/linear_model/_glm/_newton_solver.py

Lines changed: 15 additions & 2 deletions

@@ -469,6 +469,19 @@ def setup(self, X, y, sample_weight):
         self.is_multinomial_no_penalty = (
             self.linear_loss.base_loss.is_multiclass and self.l2_reg_strength == 0
         )
+        if self.is_multinomial_no_penalty:
+            # See inner_solve. The provided coef might not adhere to the convention
+            # that the last class is set to zero.
+            # This relies on the usual freedom of an (overparametrized) multinomial
+            # model to add a constant to all classes, which doesn't change predictions.
+            n_classes = self.linear_loss.base_loss.n_classes
+            coef = self.coef.reshape(n_classes, -1, order="F")  # easier as 2d
+            coef -= coef[-1, :]  # coef -= coef of last class
+        elif self.is_multinomial_with_intercept:
+            # See inner_solve. Same as above, but only for the intercept.
+            n_classes = self.linear_loss.base_loss.n_classes
+            # intercept -= intercept of last class
+            self.coef[-n_classes:] -= self.coef[-1]

     def update_gradient_hessian(self, X, y, sample_weight):
         _, _, self.hessian_warning = self.linear_loss.gradient_hessian(
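
The comments in this hunk rely on softmax shift invariance: subtracting the last class's parameter vector from every class shifts all logits of a sample by the same constant, so predicted probabilities are unchanged while the last class's parameters become exactly zero. A minimal sketch of that fact (illustrative only; shapes and data are made up):

    import numpy as np
    from scipy.special import softmax

    rng = np.random.default_rng(0)
    n_classes, n_dof = 3, 4
    coef = rng.normal(size=(n_classes, n_dof))
    X = rng.normal(size=(10, n_dof))

    shifted = coef - coef[-1, :]  # convention: last class all zeros
    p_before = softmax(X @ coef.T, axis=1)
    p_after = softmax(X @ shifted.T, axis=1)
    np.testing.assert_allclose(p_before, p_after)  # identical predictions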
@@ -518,10 +531,10 @@ def inner_solve(self, X, y, sample_weight):
         #
         # We choose the standard approach and set all the coefficients of the last
         # class to zero, for all features including the intercept.
+        # Note that coef was already dealt with in setup.
         n_classes = self.linear_loss.base_loss.n_classes
         n_dof = self.coef.size // n_classes  # degrees of freedom per class
         n = self.coef.size - n_dof  # effective size
-        self.coef[n_classes - 1 :: n_classes] = 0
         self.gradient[n_classes - 1 :: n_classes] = 0
         self.hessian[n_classes - 1 :: n_classes, :] = 0
         self.hessian[:, n_classes - 1 :: n_classes] = 0

@@ -544,7 +557,7 @@
         elif self.is_multinomial_with_intercept:
             # Here, only intercepts are unpenalized. We again choose the last class and
             # set its intercept to zero.
-            self.coef[-1] = 0
+            # Note that coef was already dealt with in setup.
             self.gradient[-1] = 0
             self.hessian[-1, :] = 0
             self.hessian[:, -1] = 0
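
The strided indexing in these hunks follows from the flat, Fortran-ordered coef layout: entry (class k, dof j) sits at flat index k + j * n_classes, so a stride of n_classes walks one class across all degrees of freedom, and with an intercept the trailing n_classes entries are the per-class intercepts. A small sketch of that layout (shapes are assumptions):

    import numpy as np

    n_classes, n_dof = 3, 4  # e.g. 3 features plus an intercept
    coef2d = np.arange(n_classes * n_dof, dtype=float).reshape(n_classes, n_dof)
    flat = coef2d.ravel(order="F")  # same layout as reshape(..., order="F") in setup

    # The stride-n_classes slice selects exactly the last class's row ...
    np.testing.assert_array_equal(flat[n_classes - 1 :: n_classes], coef2d[-1, :])
    # ... and the trailing n_classes entries are the per-class intercepts.
    np.testing.assert_array_equal(flat[-n_classes:], coef2d[:, -1])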

sklearn/linear_model/tests/test_logistic.py

Lines changed: 35 additions & 3 deletions

@@ -1434,9 +1434,7 @@ def test_n_iter(solver):
     assert clf_cv.n_iter_.shape == (1, n_cv_fold, n_Cs)


-@pytest.mark.parametrize(
-    "solver", sorted(set(SOLVERS) - set(["liblinear", "newton-cholesky"]))
-)
+@pytest.mark.parametrize("solver", sorted(set(SOLVERS) - set(["liblinear"])))
 @pytest.mark.parametrize("warm_start", (True, False))
 @pytest.mark.parametrize("fit_intercept", (True, False))
 def test_warm_start(global_random_seed, solver, warm_start, fit_intercept):

@@ -1469,6 +1467,40 @@ def test_warm_start(global_random_seed, solver, warm_start, fit_intercept):
     assert cum_diff > 2.0, msg


+@pytest.mark.parametrize("solver", ["newton-cholesky", "newton-cg"])
+@pytest.mark.parametrize("fit_intercept", (True, False))
+@pytest.mark.parametrize("penalty", ("l2", None))
+def test_warm_start_newton_solver(global_random_seed, solver, fit_intercept, penalty):
+    """Test that 2 steps at once are the same as 2 single steps with warm start."""
+    X, y = iris.data, iris.target
+
+    clf1 = LogisticRegression(
+        solver=solver,
+        max_iter=2,
+        fit_intercept=fit_intercept,
+        penalty=penalty,
+        random_state=global_random_seed,
+    )
+    with ignore_warnings(category=ConvergenceWarning):
+        clf1.fit(X, y)
+
+    clf2 = LogisticRegression(
+        solver=solver,
+        max_iter=1,
+        warm_start=True,
+        fit_intercept=fit_intercept,
+        penalty=penalty,
+        random_state=global_random_seed,
+    )
+    with ignore_warnings(category=ConvergenceWarning):
+        clf2.fit(X, y)
+        clf2.fit(X, y)
+
+    assert_allclose(clf2.coef_, clf1.coef_)
+    if fit_intercept:
+        assert_allclose(clf2.intercept_, clf1.intercept_)
+
+
 @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
 def test_saga_vs_liblinear(global_random_seed, csr_container):
     iris = load_iris()
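
The invariant encoded by the new test can also be checked standalone: two Newton steps taken at once must match two warm-started single steps. A sketch mirroring the test (assumes a recent scikit-learn where newton-cholesky supports multinomial problems):

    import warnings

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.linear_model import LogisticRegression

    X, y = load_iris(return_X_y=True)

    with warnings.catch_warnings():
        # max_iter is deliberately tiny, so silence the convergence warnings
        warnings.simplefilter("ignore", ConvergenceWarning)
        two_at_once = LogisticRegression(solver="newton-cholesky", max_iter=2).fit(X, y)
        step_by_step = LogisticRegression(solver="newton-cholesky", max_iter=1, warm_start=True)
        step_by_step.fit(X, y)
        step_by_step.fit(X, y)

    np.testing.assert_allclose(step_by_step.coef_, two_at_once.coef_)
    np.testing.assert_allclose(step_by_step.intercept_, two_at_once.intercept_)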
