16 changes: 15 additions & 1 deletion sklearn/linear_model/_glm/glm.py
@@ -271,6 +271,12 @@ def fit(self, X, y, sample_weight=None):
else:
coef = np.zeros(n_features, dtype=loss_dtype)

# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
Comment on lines +274 to +276
Is it worth being a bit more explicit?

Suggested change
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
# To save some memory, we preallocate two ndarrays used respectively
# as per row loss, gradient inside of LinearLoss by several methods
# e.g. by LinearLoss.base_loss.{loss,gradient,gradient_hessian_product}.

per_sample_loss_out = np.empty_like(y)
per_sample_gradient_out = np.empty_like(y)

# Algorithms for optimization:
# Note again that our losses implement 1/2 * deviance.
if solver == "lbfgs":
@@ -289,7 +295,15 @@ def fit(self, X, y, sample_weight=None):
"gtol": self.tol,
"ftol": 1e3 * np.finfo(float).eps,
},
args=(X, y, sample_weight, l2_reg_strength, n_threads),
args=(
X,
y,
sample_weight,
l2_reg_strength,
n_threads,
per_sample_loss_out,
per_sample_gradient_out,
),
)
self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
coef = opt_res.x
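For context, the pattern above, where the per-sample buffers are allocated once and then forwarded to the objective through the args tuple of scipy.optimize.minimize so that every L-BFGS iteration writes into the same memory, can be sketched in isolation. The following is a minimal standalone example with a plain least-squares objective, not the scikit-learn code itself; residual_buf plays the role of the preallocated arrays:

import numpy as np
from scipy.optimize import minimize

def loss_and_grad(w, X, y, residual_buf):
    # residual_buf is allocated once by the caller and reused on every call,
    # so no new (n_samples,) array is created per iteration.
    np.subtract(X @ w, y, out=residual_buf)
    loss = 0.5 * residual_buf @ residual_buf
    grad = X.T @ residual_buf
    return loss, grad

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5])
residual_buf = np.empty_like(y)
opt_res = minimize(
    loss_and_grad,
    np.zeros(X.shape[1]),
    method="L-BFGS-B",
    jac=True,
    args=(X, y, residual_buf),
)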
72 changes: 68 additions & 4 deletions sklearn/linear_model/_linear_loss.py
@@ -112,7 +112,16 @@ def _w_intercept_raw(self, coef, X):

return weights, intercept, raw_prediction

def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1):
def loss(
self,
coef,
X,
y,
sample_weight=None,
l2_reg_strength=0.0,
n_threads=1,
per_sample_loss_out=None,
):
"""Compute the loss as sum over point-wise losses.

Parameters
@@ -132,6 +141,10 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1)
L2 regularization strength
n_threads : int, default=1
Number of OpenMP threads to use.
per_sample_loss_out : None or C-contiguous array of shape (n_samples,), \
default=None
A location into which the per sample loss is stored. If None, a new array
might be created. Providing such an array can save a little memory.

Returns
-------
@@ -144,6 +157,7 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1)
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
loss_out=per_sample_loss_out,
n_threads=n_threads,
)
loss = loss.sum()
@@ -152,7 +166,15 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1)
return loss + 0.5 * l2_reg_strength * norm2_w
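The per_sample_loss_out parameter documented here follows the usual NumPy out-argument convention: the caller may pass a preallocated array that receives the per-sample values, and a new array is only created when None is given. A hypothetical, self-contained illustration of that convention (a toy squared loss, not the actual base_loss implementation):

import numpy as np

def per_sample_squared_loss(y_true, y_pred, loss_out=None):
    # Write the per-sample losses into the supplied buffer if there is one,
    # otherwise allocate a fresh array of matching shape and dtype.
    if loss_out is None:
        loss_out = np.empty_like(y_true)
    np.subtract(y_pred, y_true, out=loss_out)
    np.square(loss_out, out=loss_out)
    return loss_out

y_true = np.array([0.0, 1.0, 2.0])
buf = np.empty_like(y_true)
per_sample_squared_loss(y_true, np.array([0.5, 1.0, 1.5]), loss_out=buf)
# buf now holds [0.25, 0.0, 0.25]; summing it gives the total loss.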

def loss_gradient(
self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
self,
coef,
X,
y,
sample_weight=None,
l2_reg_strength=0.0,
n_threads=1,
per_sample_loss_out=None,
per_sample_gradient_out=None,
):
"""Computes the sum of loss and gradient w.r.t. coef.

@@ -173,6 +195,14 @@ def loss_gradient(
L2 regularization strength
n_threads : int, default=1
Number of OpenMP threads to use.
per_sample_loss_out : None or C-contiguous array of shape (n_samples,), \
default=None
A location into which the per sample loss is stored. If None, a new array
might be created. Providing such an array can save a little memory.
per_sample_gradient_out : None or C-contiguous array of shape (n_samples,) or
array of shape (n_samples, n_classes), default=None
A location into which the per sample gradient is stored. If None, a new
array might be created. Providing such an array can save a little memory.

Returns
-------
@@ -190,6 +220,8 @@ def loss_gradient(
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
loss_out=per_sample_loss_out,
gradient_out=per_sample_gradient_out,
n_threads=n_threads,
)
loss = loss.sum()
@@ -213,7 +245,14 @@ def loss_gradient(
return loss, grad

def gradient(
self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
self,
coef,
X,
y,
sample_weight=None,
l2_reg_strength=0.0,
n_threads=1,
per_sample_gradient_out=None,
):
"""Computes the gradient w.r.t. coef.

@@ -234,6 +273,10 @@ def gradient(
L2 regularization strength
n_threads : int, default=1
Number of OpenMP threads to use.
per_sample_gradient_out : None or C-contiguous array of shape (n_samples,) or
array of shape (n_samples, n_classes), default=None
A location into which the per sample gradient is stored. If None, a new
array might be created. Providing such an array can save a little memory.

Returns
-------
@@ -248,6 +291,7 @@ def gradient(
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
gradient_out=per_sample_gradient_out,
n_threads=n_threads,
)

@@ -269,7 +313,15 @@ def gradient(
return grad

def gradient_hessian_product(
self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
self,
coef,
X,
y,
sample_weight=None,
l2_reg_strength=0.0,
n_threads=1,
per_sample_gradient_out=None,
per_sample_hessian_out=None,
):
"""Computes gradient and hessp (hessian product function) w.r.t. coef.

@@ -290,6 +342,14 @@ def gradient_hessian_product(
L2 regularization strength
n_threads : int, default=1
Number of OpenMP threads to use.
per_sample_gradient_out : None or C-contiguous array of shape (n_samples,) or
array of shape (n_samples, n_classes), default=None
A location into which the per sample gradient is stored. If None, a new
array might be created. Providing such an array can save a little memory.
per_sample_hessian_out : None or C-contiguous array of shape (n_samples,) or
array of shape (n_samples, n_classes), default=None
A location into which the per sample hessian is stored. If None, a new
array might be created. Providing such an array can save a little memory.

Returns
-------
@@ -309,6 +369,8 @@ def gradient_hessian_product(
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
gradient_out=per_sample_gradient_out,
hessian_out=per_sample_hessian_out,
n_threads=n_threads,
)
grad = np.empty_like(coef, dtype=weights.dtype)
@@ -356,6 +418,8 @@ def hessp(s):
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
gradient_out=per_sample_gradient_out,
proba_out=per_sample_hessian_out,
n_threads=n_threads,
)
grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
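The gradient_hessian_product method above returns the gradient together with a hessp closure (hess = [gradient, hessp]), which is the form Newton-CG style solvers consume. A bare-bones sketch of that return convention for a plain ridge objective, as an illustration only and not the LinearModelLoss code:

import numpy as np

def gradient_hessp(w, X, y, l2_reg_strength=0.0):
    residual = X @ w - y
    grad = X.T @ residual + l2_reg_strength * w

    def hessp(s):
        # The Hessian of 0.5 * ||X @ w - y||**2 + 0.5 * l2 * ||w||**2 is
        # X.T @ X + l2 * I, so its product with s can be computed without
        # ever materializing the Hessian matrix.
        return X.T @ (X @ s) + l2_reg_strength * s

    return grad, hessp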
60 changes: 47 additions & 13 deletions sklearn/linear_model/_logistic.py
@@ -10,6 +10,7 @@
# Simon Wu <s8wu@uwaterloo.ca>
# Arthur Mensch <arthur.mensch@m4x.org

import functools
import numbers
import warnings

@@ -397,34 +398,67 @@ def _logistic_regression_path(
# reconstructs the 2d-array via w0.reshape((n_classes, -1), order="F").
# As w0 is F-contiguous, ravel(order="F") also avoids a copy.
w0 = w0.ravel(order="F")
loss = LinearModelLoss(
linear_loss = LinearModelLoss(
base_loss=HalfMultinomialLoss(n_classes=classes.size),
fit_intercept=fit_intercept,
)
target = Y_multi
if solver in "lbfgs":
func = loss.loss_gradient
elif solver == "newton-cg":
func = loss.loss
grad = loss.gradient
hess = loss.gradient_hessian_product # hess = [gradient, hessp]
warm_start_sag = {"coef": w0.T}
else:
target = y_bin
if solver == "lbfgs":
loss = LinearModelLoss(
linear_loss = LinearModelLoss(
base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept
)
func = loss.loss_gradient
elif solver == "newton-cg":
loss = LinearModelLoss(
linear_loss = LinearModelLoss(
base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept
)
func = loss.loss
grad = loss.gradient
hess = loss.gradient_hessian_product # hess = [gradient, hessp]
warm_start_sag = {"coef": np.expand_dims(w0, axis=1)}

if solver == "lbfgs":
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, order="C")

func = functools.partial(
linear_loss.loss_gradient,
per_sample_loss_out=per_sample_loss_out,
per_sample_gradient_out=per_sample_gradient_out,
)
elif solver == "newton-cg":
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, order="C")
Comment on lines +419 to +446
Can this be boiled down to this? Note that I have also specified dtype=X.dtype when creating per_sample_gradient_out and changed the comment.

Suggested change
if solver == "lbfgs":
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, order="C")
func = functools.partial(
linear_loss.loss_gradient,
per_sample_loss_out=per_sample_loss_out,
per_sample_gradient_out=per_sample_gradient_out,
)
elif solver == "newton-cg":
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, order="C")
# To save some memory, we preallocate two ndarrays used respectively
# as per row loss, gradient inside of LinearLoss by several methods
# e.g. by LinearLoss.base_loss.{loss,gradient,gradient_hessian_product}.
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, dtype=X.dtype, order="C")
if solver == "lbfgs":
func = functools.partial(
linear_loss.loss_gradient,
per_sample_loss_out=per_sample_loss_out,
per_sample_gradient_out=per_sample_gradient_out,
)
elif solver == "newton-cg":
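On the dtype remark in the suggestion above: np.empty_like inherits the prototype's dtype unless an explicit dtype is passed, so a buffer created from the encoded target does not automatically match X.dtype. A standalone NumPy illustration of that behaviour (toy arrays, not code from this PR):

import numpy as np

target = np.array([0, 1, 1, 0], dtype=np.int64)     # stand-in for encoded labels
X = np.ones((4, 2), dtype=np.float32)

buf_default = np.empty_like(target)                  # inherits int64 from target
buf_matched = np.empty_like(target, dtype=X.dtype)   # float32, matching X
print(buf_default.dtype, buf_matched.dtype)          # int64 float32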

per_sample_hessian_out = np.empty_like(per_sample_gradient_out)

func = functools.partial(
linear_loss.loss,
per_sample_loss_out=per_sample_loss_out,
)
grad = functools.partial(
linear_loss.gradient, per_sample_gradient_out=per_sample_gradient_out
)
hess = functools.partial(
linear_loss.gradient_hessian_product, # hess = [gradient, hessp]
per_sample_gradient_out=per_sample_gradient_out,
per_sample_hessian_out=per_sample_hessian_out,
)
Comment on lines +456 to +460

Suggested change
hess = functools.partial(
linear_loss.gradient_hessian_product, # hess = [gradient, hessp]
per_sample_gradient_out=per_sample_gradient_out,
per_sample_hessian_out=per_sample_hessian_out,
)
# hess = [gradient, hessp]
hess = functools.partial(
linear_loss.gradient_hessian_product,
per_sample_gradient_out=per_sample_gradient_out,
per_sample_hessian_out=per_sample_hessian_out,
)
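The functools.partial calls in this hunk bind the preallocated buffers as keyword arguments once, so the solver keeps calling a plain function of the coefficients while every evaluation writes into the same arrays. A small standalone sketch of that binding pattern, with illustrative names rather than the scikit-learn objects:

import functools
import numpy as np

def loss_gradient(w, X, y, per_sample_gradient_out=None):
    if per_sample_gradient_out is None:
        per_sample_gradient_out = np.empty_like(y)
    # The per-sample residuals are written into the bound buffer in place.
    np.subtract(X @ w, y, out=per_sample_gradient_out)
    loss = 0.5 * per_sample_gradient_out @ per_sample_gradient_out
    grad = X.T @ per_sample_gradient_out
    return loss, grad

X = np.arange(6.0).reshape(3, 2)
y = np.array([1.0, 2.0, 3.0])
buf = np.empty_like(y)
func = functools.partial(loss_gradient, per_sample_gradient_out=buf)
loss, grad = func(np.zeros(2), X, y)  # buf is reused on this and every later call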


coefs = list()
n_iter = np.zeros(len(Cs), dtype=np.int32)
for i, C in enumerate(Cs):