FIX Better support large or read-only datasets in decomposition.DictionaryLearning #25172

Merged
8 changes: 8 additions & 0 deletions doc/whats_new/v1.2.rst
@@ -30,6 +30,14 @@ Changelog
   The ARFF specs requires to ignore the leading space.
   :pr:`25312` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+:mod:`sklearn.decomposition`
+............................
+
+- |Fix| :class:`decomposition.DictionaryLearning` better supports read-only NumPy
+  arrays. In particular, it better supports large datasets which are memory-mapped
+  when it is used with coordinate descent algorithms (i.e. when `fit_algorithm='cd'`).
+  :pr:`25172` by :user:`Julien Jerphanion <jjerphan>`.
+
 :mod:`sklearn.ensemble`
 .......................
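The entry above describes the user-facing effect. As a minimal sketch (not taken from the PR), this is the scenario that previously could fail with `ValueError: buffer source array is read-only` and is expected to work after this fix:

```python
import numpy as np
from sklearn.decomposition import DictionaryLearning

# A read-only array stands in for a large memory-mapped dataset;
# joblib produces similar buffers when it memmaps inputs for workers.
X = np.random.RandomState(0).randn(100, 30)
X.setflags(write=False)

dict_learner = DictionaryLearning(
    n_components=10,
    fit_algorithm="cd",
    transform_algorithm="lasso_cd",
    random_state=0,
)
# Both steps go through the coordinate descent solvers and should now
# accept the read-only buffer.
code = dict_learner.fit_transform(X)
```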
7 changes: 7 additions & 0 deletions sklearn/decomposition/_dict_learning.py
@@ -174,6 +174,13 @@ def _sparse_encode(
         )
 
         if init is not None:
+            # In some workflows using coordinate descent algorithms:
+            # - users might provide NumPy arrays with read-only buffers
+            # - `joblib` might memmap arrays, making their buffers read-only
+            # TODO: move this handling (which is currently too broad)
+            # closer to the actual private function that needs buffers to be writable.
+            if not init.flags["WRITEABLE"]:
+                init = np.array(init)
             clf.coef_ = init
 
         clf.fit(dictionary.T, X.T, check_input=check_input)
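The added guard relies on plain NumPy semantics: `np.array` copies by default, and the copy owns a writable buffer. An illustrative snippet (not code from the PR):

```python
import numpy as np

init = np.zeros((5, 10))
init.setflags(write=False)  # simulate a read-only (e.g. memory-mapped) buffer
assert not init.flags["WRITEABLE"]

copy = np.array(init)  # np.array copies by default...
assert copy.flags["WRITEABLE"]  # ...and the copy is writable
```

The copy is needed because the warm-start coefficients in `clf.coef_` are updated in place by the coordinate descent solver, so `init` must be writable when handed to the estimator.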
27 changes: 27 additions & 0 deletions sklearn/decomposition/tests/test_dict_learning.py
@@ -5,6 +5,10 @@
 from functools import partial
 import itertools
 
+from joblib import Parallel
+
+import sklearn
+
 from sklearn.base import clone
 
 from sklearn.exceptions import ConvergenceWarning
@@ -990,6 +994,29 @@ def test_get_feature_names_out(estimator):
     )
 
 
+def test_cd_work_on_joblib_memmapped_data(monkeypatch):
+    monkeypatch.setattr(
+        sklearn.decomposition._dict_learning,
+        "Parallel",
+        partial(Parallel, max_nbytes=100),
+    )
+
+    rng = np.random.RandomState(0)
+    X_train = rng.randn(10, 10)
+
+    dict_learner = DictionaryLearning(
+        n_components=5,
+        random_state=0,
+        n_jobs=2,
+        fit_algorithm="cd",
+        max_iter=50,
+        verbose=True,
+    )
+
+    # This must run and complete without error.
+    dict_learner.fit(X_train)
+
+
 # TODO(1.4) remove
 def test_minibatch_dictionary_learning_warns_and_ignore_n_iter():
     """Check that we always raise a warning when `n_iter` is set even if it is
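The monkeypatch in this test leans on joblib's automatic memmapping: with a process-based backend, `joblib.Parallel` dumps any argument larger than `max_nbytes` to disk and reloads it in the workers as a read-only `numpy.memmap` (the default `mmap_mode` is `'r'`). Setting `max_nbytes=100` forces even the small 10×10 array down that path, so the test exercises the read-only-buffer case with `n_jobs=2`. A standalone sketch of the mechanism, independent of scikit-learn:

```python
import numpy as np
from joblib import Parallel, delayed


def buffer_is_writeable(arr):
    # Runs in a worker process: reports whether the received array's
    # buffer is writable.
    return arr.flags["WRITEABLE"]


data = np.zeros((10, 10))  # 800 bytes > max_nbytes=100 -> memmapped
flags = Parallel(n_jobs=2, max_nbytes=100)(
    delayed(buffer_is_writeable)(data) for _ in range(4)
)
print(flags)  # expected: [False, False, False, False]
```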
30 changes: 16 additions & 14 deletions sklearn/linear_model/_cd_fast.pyx
@@ -90,13 +90,13 @@ cdef floating diff_abs_max(int n, floating* a, floating* b) nogil:
             m = d
     return m
 
-
+# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used.
 def enet_coordinate_descent(
-    floating[::1] w,
+    cnp.ndarray[floating, ndim=1, mode='c'] w,
     floating alpha,
     floating beta,
-    floating[::1, :] X,
-    floating[::1] y,
+    cnp.ndarray[floating, ndim=2, mode='fortran'] X,
+    cnp.ndarray[floating, ndim=1, mode='c'] y,
     unsigned int max_iter,
     floating tol,
     object rng,
@@ -273,16 +273,17 @@ def enet_coordinate_descent(
     return np.asarray(w), gap, tol, n_iter + 1
 
 
+# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used.

Review thread on this line:

Contributor:

Does this TODO take effect now since const fused typed memoryviews are available?

@jjerphan (Member, Author), Jan 10, 2023:

Not yet, but once #25342 is merged.

@ogrisel (Member), Jan 18, 2023:

#25342 is now merged. Shall we try that now, or is it safer not to experiment with const typed memoryviews in a bugfix release (1.2.1) and instead do that only in main (for 1.3.0)?

@jjerphan (Member, Author), Jan 18, 2023:

Provided that Cython has good coverage and test cases for const-qualified memoryviews, I think it is safe, and conceptually the best solution, to use them to resolve such issues. Yet I do not think we need them for this fix with respect to the next bugfix release (I do not want to postpone it).

Note that using such constructs might not be sufficient to fix the problem tackled by this PR, as the Cython implementations need writable buffers.

I will start investigating const-qualified fused typed memoryviews tomorrow, for this fix and for other long-standing issues.

 def sparse_enet_coordinate_descent(
-    floating [::1] w,
+    cnp.ndarray[floating, ndim=1, mode='c'] w,
     floating alpha,
     floating beta,
-    floating[::1] X_data,  # TODO: Make const after release of Cython 3 (#23147)
+    cnp.ndarray[floating, ndim=1, mode='c'] X_data,
     const int[::1] X_indices,
     const int[::1] X_indptr,
-    floating[::1] y,
-    floating[::1] sample_weight,
-    floating[::1] X_mean,
+    cnp.ndarray[floating, ndim=1, mode='c'] y,
+    cnp.ndarray[floating, ndim=1, mode='c'] sample_weight,
+    cnp.ndarray[floating, ndim=1, mode='c'] X_mean,
     unsigned int max_iter,
     floating tol,
     object rng,
@@ -564,8 +565,9 @@ def sparse_enet_coordinate_descent(
     return np.asarray(w), gap, tol, n_iter + 1
 
 
+# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used.
 def enet_coordinate_descent_gram(
-    floating[::1] w,
+    cnp.ndarray[floating, ndim=1, mode='c'] w,
     floating alpha,
     floating beta,
     cnp.ndarray[floating, ndim=2, mode='c'] Q,
@@ -630,9 +632,9 @@ def enet_coordinate_descent_gram(
     cdef UINT32_t* rand_r_state = &rand_r_state_seed
 
     cdef floating y_norm2 = np.dot(y, y)
-    cdef floating* w_ptr = <floating*>&w[0]
+    cdef floating* w_ptr = &w[0]
     cdef floating* Q_ptr = &Q[0, 0]
-    cdef floating* q_ptr = <floating*>q.data
+    cdef floating* q_ptr = &q[0]
     cdef floating* H_ptr = &H[0]
     cdef floating* XtA_ptr = &XtA[0]
     tol = tol * y_norm2
@@ -734,9 +736,9 @@
 
     return np.asarray(w), gap, tol, n_iter + 1
 
-
+# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used.
 def enet_coordinate_descent_multi_task(
-    floating[::1, :] W,
+    cnp.ndarray[floating, ndim=2, mode='fortran'] W,
     floating l1_reg,
     floating l2_reg,
     # TODO: use const qualified fused-typed memoryview when Cython 3.0 is used.
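The rationale for these signature changes: a non-const typed memoryview such as `floating[::1]` requests a writable buffer when it is bound, so passing a read-only array raises `ValueError: buffer source array is read-only`, while the `cnp.ndarray[...]` declarations accept read-only inputs. Const-qualified memoryviews (discussed in the review thread above) would be the cleaner fix once available. The read-only buffer behaviour itself can be observed from pure Python; a small illustration:

```python
import numpy as np

a = np.arange(4.0)
a.setflags(write=False)

mv = memoryview(a)  # acquiring a read-only view of the buffer succeeds
print(mv.readonly)  # True

try:
    mv[0] = 1.0  # writing through the buffer is rejected
except TypeError as exc:
    print(exc)  # cannot modify read-only memory
```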
5 changes: 1 addition & 4 deletions sklearn/linear_model/_coordinate_descent.py
@@ -30,7 +30,6 @@
     check_is_fitted,
     column_or_1d,
 )
-from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
 from ..utils.fixes import delayed
 
 # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast'
@@ -594,9 +593,7 @@ def enet_path(
                 w=coef_,
                 alpha=l1_reg,
                 beta=l2_reg,
-                X_data=ReadonlyArrayWrapper(
-                    X.data
-                ),  # TODO: Remove after release of Cython 3 (#23147)
+                X_data=X.data,
                 X_indices=X.indices,
                 X_indptr=X.indptr,
                 y=y,
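With the solver signatures above accepting read-only buffers, the `ReadonlyArrayWrapper` shim is no longer needed on this call path, and `X.data` is passed through unchanged. A hedged end-to-end sketch of the sparse path using a public estimator (assuming the input is not copied along the way):

```python
import numpy as np
from scipy import sparse
from sklearn.linear_model import ElasticNet

rng = np.random.RandomState(0)
X = sparse.random(50, 10, density=0.3, format="csc", random_state=rng)
y = rng.randn(50)

# Simulate a read-only data buffer, e.g. as produced by joblib memmapping.
X.data.setflags(write=False)

# enet_path hands X.data directly to sparse_enet_coordinate_descent;
# fitting should complete without a read-only buffer error.
ElasticNet(alpha=0.1).fit(X, y)
```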