
ENH Support float32 in SGDClassifier and SGDRegressor #25587


Merged
merged 14 commits on Feb 27, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -85,6 +85,7 @@ sklearn/utils/_seq_dataset.pxd
sklearn/utils/_weight_vector.pyx
sklearn/utils/_weight_vector.pxd
sklearn/linear_model/_sag_fast.pyx
sklearn/linear_model/_sgd_fast.pyx
sklearn/metrics/_dist_metrics.pyx
sklearn/metrics/_dist_metrics.pxd
sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd
7 changes: 7 additions & 0 deletions doc/whats_new/v1.3.rst
@@ -159,6 +159,13 @@ Changelog
- |Enhancement| Added the parameter `fill_value` to :class:`impute.IterativeImputer`.
:pr:`25232` by :user:`Thijs van Weezel <ValueInvestorThijs>`.

:mod:`sklearn.linear_model`
...........................

- |Enhancement| :class:`SGDClassifier`, :class:`SGDRegressor` and
:class:`SGDOneClassSVM` now preserve dtype for `numpy.float32`.
:pr:`25587` by :user:`Omar Salman <OmarManzoor>`

:mod:`sklearn.metrics`
......................

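The changelog entry above describes the user-visible effect. A minimal sketch of what it means in practice, assuming the merged behavior: fitted attributes such as `coef_` keep the input's `float32` dtype instead of being upcast to `float64`. The toy data and parameter values here are illustrative only.

```python
import numpy as np
from sklearn.linear_model import SGDRegressor

# Illustrative toy data; the point is only the dtype of the fitted attributes.
rng = np.random.RandomState(0)
X = rng.rand(100, 5).astype(np.float32)
y = rng.rand(100).astype(np.float32)

reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=0).fit(X, y)

# With this PR the coefficients stay in float32; previously they were upcast to float64.
print(reg.coef_.dtype)  # expected: float32
```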
2 changes: 1 addition & 1 deletion setup.py
@@ -308,7 +308,7 @@ def check_package_status(package, min_version):
],
"linear_model": [
{"sources": ["_cd_fast.pyx"], "include_np": True},
{"sources": ["_sgd_fast.pyx"], "include_np": True},
{"sources": ["_sgd_fast.pyx.tp"], "include_np": True},
{"sources": ["_sag_fast.pyx.tp"], "include_np": True},
],
"manifold": [
sklearn/linear_model/_sgd_fast.pyx.tp (renamed from sklearn/linear_model/_sgd_fast.pyx)
@@ -1,22 +1,50 @@
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Mathieu Blondel (partial_fit support)
# Rob Zinkov (passive-aggressive)
# Lars Buitinck
#
# License: BSD 3 clause
{{py:

"""
Template file to easily generate fused types consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).

Generated file: _sgd_fast.pyx

Each relevant function is duplicated for the dtypes float and double.
The keywords between double braces are substituted in setup.py.

Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
Mathieu Blondel (partial_fit support)
Rob Zinkov (passive-aggressive)
Lars Buitinck

License: BSD 3 clause
"""

# The dtypes are defined as follows (name_suffix, c_type, np_type)
dtypes = [
("64", "double", "np.float64"),
("32", "float", "np.float32"),
]

}}
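For readers unfamiliar with the `.pyx.tp` mechanism: the `{{py: ...}}` block above defines the `dtypes` list, and each `{{for ...}}`/`{{endfor}}` pair below is expanded once per entry when the build generates `_sgd_fast.pyx` from this template. A rough, self-contained sketch of that expansion using Cython's bundled Tempita (the template text here is illustrative, not the actual build code):

```python
from Cython import Tempita  # Tempita ships with Cython

template = Tempita.Template(
    "{{py:\n"
    "dtypes = [('64', 'double', 'np.float64'), ('32', 'float', 'np.float32')]\n"
    "}}\n"
    "{{for name_suffix, c_type, np_type in dtypes}}\n"
    "cdef {{c_type}} example_{{name_suffix}}  # one copy per dtype\n"
    "{{endfor}}\n"
)

# Renders roughly one `cdef double example_64` line and one `cdef float example_32`
# line, mirroring how _plain_sgd64 / _plain_sgd32 are generated below.
print(template.substitute())
```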

#------------------------------------------------------------------------------

"""
SGD implementation
WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp
"""

from cython cimport floating
import numpy as np
from time import time

from libc.math cimport exp, log, pow, fabs
cimport numpy as cnp
from numpy.math cimport INFINITY
cdef extern from "_sgd_fast_helpers.h":
bint skl_isfinite(double) nogil
bint skl_isfinite32(float) nogil
bint skl_isfinite64(double) nogil

from ..utils._weight_vector cimport WeightVector64 as WeightVector
from ..utils._seq_dataset cimport SequentialDataset64 as SequentialDataset
from ..utils._weight_vector cimport WeightVector32, WeightVector64
@thomasjpfan (Member) commented on Feb 27, 2023:

When I put on my "numerical computing on a CPU" hat, I tend to think that float32 input dtypes should have float64 accumulators. Concretely, WeightVector32 would keep its internal statistics as float64 and upcast the data as it processes the float32 input. In any case, I think this PR can be accepted as is, and any updates to WeightVector32 can be done as a follow-up.

@jjerphan @jeremiedbb What do you think?

XREF: PyTorch uses a float64 accumulator for float32 inputs on CPUs.

Member reply:

I think using a float64 accumulator on CPU is the best thing to do. We do it in several places within scikit-learn. SciPy also does this for its distance metric implementations.

Member reply:

I opened #25721 to force WeightVector32 to use a float64 accumulator.
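As a side note on why the float64 accumulator matters, here is a minimal NumPy illustration (not part of this PR): once a float32 running sum grows past 2**24, adding a small float32 increment can be rounded away entirely, whereas a float64 accumulator still resolves it.

```python
import numpy as np

acc32 = np.float32(2**24)   # 16777216.0, where float32 spacing between values becomes 2.0
acc64 = np.float64(2**24)

print(acc32 + np.float32(1.0) == acc32)  # True: the float32 update is lost to rounding
print(acc64 + np.float64(1.0) == acc64)  # False: float64 keeps the update
```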

from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64

cnp.import_array()

@@ -372,37 +400,48 @@ cdef class SquaredEpsilonInsensitive(Regression):
def __reduce__(self):
return SquaredEpsilonInsensitive, (self.epsilon,)


def _plain_sgd(const double[::1] weights,
double intercept,
const double[::1] average_weights,
double average_intercept,
LossFunction loss,
int penalty_type,
double alpha, double C,
double l1_ratio,
SequentialDataset dataset,
const unsigned char[::1] validation_mask,
bint early_stopping, validation_score_cb,
int n_iter_no_change,
unsigned int max_iter, double tol, int fit_intercept,
int verbose, bint shuffle, cnp.uint32_t seed,
double weight_pos, double weight_neg,
int learning_rate, double eta0,
double power_t,
bint one_class,
double t=1.0,
double intercept_decay=1.0,
int average=0):
{{for name_suffix, c_type, np_type in dtypes}}

def _plain_sgd{{name_suffix}}(
const {{c_type}}[::1] weights,
double intercept,
const {{c_type}}[::1] average_weights,
double average_intercept,
LossFunction loss,
int penalty_type,
double alpha,
double C,
double l1_ratio,
SequentialDataset{{name_suffix}} dataset,
const unsigned char[::1] validation_mask,
bint early_stopping,
validation_score_cb,
int n_iter_no_change,
unsigned int max_iter,
double tol,
int fit_intercept,
int verbose,
bint shuffle,
cnp.uint32_t seed,
double weight_pos,
double weight_neg,
int learning_rate,
double eta0,
double power_t,
bint one_class,
double t=1.0,
double intercept_decay=1.0,
int average=0,
):
"""SGD for generic loss functions and penalties with optional averaging

Parameters
----------
weights : ndarray[double, ndim=1]
weights : ndarray[{{c_type}}, ndim=1]
The allocated vector of weights.
intercept : double
The initial intercept.
average_weights : ndarray[double, ndim=1]
average_weights : ndarray[{{c_type}}, ndim=1]
The average weights as computed for ASGD. Should be None if average
is 0.
average_intercept : double
@@ -489,8 +528,8 @@ def _plain_sgd(const double[::1] weights,
cdef Py_ssize_t n_samples = dataset.n_samples
cdef Py_ssize_t n_features = weights.shape[0]

cdef WeightVector w = WeightVector(weights, average_weights)
cdef double *x_data_ptr = NULL
cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights)
cdef {{c_type}} *x_data_ptr = NULL
cdef int *x_ind_ptr = NULL

# helper variables
@@ -505,9 +544,9 @@ def _plain_sgd(const double[::1] weights,
cdef double score = 0.0
cdef double best_loss = INFINITY
cdef double best_score = -INFINITY
cdef double y = 0.0
cdef double sample_weight
cdef double class_weight = 1.0
cdef {{c_type}} y = 0.0
cdef {{c_type}} sample_weight
cdef {{c_type}} class_weight = 1.0
cdef unsigned int count = 0
cdef unsigned int train_count = n_samples - np.sum(validation_mask)
cdef unsigned int epoch = 0
@@ -520,10 +559,10 @@ def _plain_sgd(const double[::1] weights,
cdef long long sample_index

# q vector is only used for L1 regularization
cdef double[::1] q = None
cdef double * q_data_ptr = NULL
cdef {{c_type}}[::1] q = None
cdef {{c_type}} * q_data_ptr = NULL
if penalty_type == L1 or penalty_type == ELASTICNET:
q = np.zeros((n_features,), dtype=np.float64, order="c")
q = np.zeros((n_features,), dtype={{np_type}}, order="c")
q_data_ptr = &q[0]
cdef double u = 0.0

@@ -627,7 +666,7 @@ def _plain_sgd(const double[::1] weights,

if penalty_type == L1 or penalty_type == ELASTICNET:
u += (l1_ratio * eta * alpha)
l1penalty(w, q_data_ptr, x_ind_ptr, xnnz, u)
l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)

t += 1
count += 1
@@ -694,15 +733,28 @@ def _plain_sgd(const double[::1] weights,
epoch + 1
)

{{endfor}}


cdef inline bint skl_isfinite(floating w) noexcept nogil:
if floating is float:
return skl_isfinite32(w)
else:
return skl_isfinite64(w)

cdef bint any_nonfinite(const double *w, int n) noexcept nogil:

cdef inline bint any_nonfinite(const floating *w, int n) noexcept nogil:
for i in range(n):
if not skl_isfinite(w[i]):
return True
return 0


cdef double sqnorm(double * x_data_ptr, int * x_ind_ptr, int xnnz) noexcept nogil:
cdef inline double sqnorm(
floating * x_data_ptr,
int * x_ind_ptr,
int xnnz,
) noexcept nogil:
cdef double x_norm = 0.0
cdef int j
cdef double z
@@ -712,8 +764,15 @@ cdef double sqnorm(double * x_data_ptr, int * x_ind_ptr, int xnnz) noexcept nogil:
return x_norm


cdef void l1penalty(WeightVector w, double * q_data_ptr,
int *x_ind_ptr, int xnnz, double u) noexcept nogil:
{{for name_suffix, c_type, np_type in dtypes}}

cdef void l1penalty{{name_suffix}}(
WeightVector{{name_suffix}} w,
{{c_type}} * q_data_ptr,
int *x_ind_ptr,
int xnnz,
double u,
) noexcept nogil:
"""Apply the L1 penalty to each updated feature

This implements the truncated gradient approach by
@@ -723,7 +782,7 @@ cdef void l1penalty(WeightVector w, double * q_data_ptr,
cdef int j = 0
cdef int idx = 0
cdef double wscale = w.wscale
cdef double *w_data_ptr = w.w_data_ptr
cdef {{c_type}} *w_data_ptr = w.w_data_ptr
for j in range(xnnz):
idx = x_ind_ptr[j]
z = w_data_ptr[idx]
@@ -736,3 +795,5 @@ cdef void l1penalty(WeightVector w, double * q_data_ptr,
0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))

q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)

{{endfor}}
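With the `{{for}}` loops above, the generated `_sgd_fast.pyx` exposes `_plain_sgd32` and `_plain_sgd64` as Python-callable functions (the `cdef` helpers such as `l1penalty{{name_suffix}}` are specialized too but stay C-level), so the Python-level caller has to pick the specialization matching the input dtype. A hypothetical sketch of that selection; the helper below is illustrative only and is not scikit-learn's actual dispatch code, which lives in the estimator module (e.g. `_stochastic_gradient.py`):

```python
import numpy as np
from sklearn.linear_model import _sgd_fast  # requires a build that includes this PR


def select_plain_sgd(weights: np.ndarray):
    # Hypothetical helper: return the Tempita-generated specialization that
    # matches the dtype of the weight vector.
    if weights.dtype == np.float32:
        return _sgd_fast._plain_sgd32
    return _sgd_fast._plain_sgd64


w32 = np.zeros(5, dtype=np.float32)
print(select_plain_sgd(w32).__name__)  # expected: _plain_sgd32
```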