diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index aa3ed3c055f1c..a0605d8d7b635 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -30,6 +30,14 @@ Changelog The ARFF specs requires to ignore the leading space. :pr:`25312` by :user:`Guillaume Lemaitre `. +:mod:`sklearn.decomposition` +............................ + +- |Fix| :class:`decomposition.DictionaryLearning` better supports readonly NumPy + arrays. In particular, it better supports large datasets which are memory-mapped + when it is used with coordinate descent algorithms (i.e. when `fit_algorithm='cd'`). + :pr:`25172` by :user:`Julien Jerphanion `. + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index c5003f36445a3..34bc9cd63a4be 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -174,6 +174,13 @@ def _sparse_encode( ) if init is not None: + # In some workflows using coordinate descent algorithms: + # - users might provide NumPy arrays with read-only buffers + # - `joblib` might memmap arrays making their buffer read-only + # TODO: move this handling (which is currently too broad) + # closer to the actual private function which need buffers to be writable. + if not init.flags["WRITEABLE"]: + init = np.array(init) clf.coef_ = init clf.fit(dictionary.T, X.T, check_input=check_input) diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index df54a35e968f2..7903cbd35fc5d 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -5,6 +5,10 @@ from functools import partial import itertools +from joblib import Parallel + +import sklearn + from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning @@ -990,6 +994,29 @@ def test_get_feature_names_out(estimator): ) +def test_cd_work_on_joblib_memmapped_data(monkeypatch): + monkeypatch.setattr( + sklearn.decomposition._dict_learning, + "Parallel", + partial(Parallel, max_nbytes=100), + ) + + rng = np.random.RandomState(0) + X_train = rng.randn(10, 10) + + dict_learner = DictionaryLearning( + n_components=5, + random_state=0, + n_jobs=2, + fit_algorithm="cd", + max_iter=50, + verbose=True, + ) + + # This must run and complete without error. + dict_learner.fit(X_train) + + # TODO(1.4) remove def test_minibatch_dictionary_learning_warns_and_ignore_n_iter(): """Check that we always raise a warning when `n_iter` is set even if it is diff --git a/sklearn/linear_model/_cd_fast.pyx b/sklearn/linear_model/_cd_fast.pyx index 0c28d9929be63..ba9b922461775 100644 --- a/sklearn/linear_model/_cd_fast.pyx +++ b/sklearn/linear_model/_cd_fast.pyx @@ -90,13 +90,13 @@ cdef floating diff_abs_max(int n, floating* a, floating* b) nogil: m = d return m - +# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used. def enet_coordinate_descent( - floating[::1] w, + cnp.ndarray[floating, ndim=1, mode='c'] w, floating alpha, floating beta, - floating[::1, :] X, - floating[::1] y, + cnp.ndarray[floating, ndim=2, mode='fortran'] X, + cnp.ndarray[floating, ndim=1, mode='c'] y, unsigned int max_iter, floating tol, object rng, @@ -273,16 +273,17 @@ def enet_coordinate_descent( return np.asarray(w), gap, tol, n_iter + 1 +# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used. def sparse_enet_coordinate_descent( - floating [::1] w, + cnp.ndarray[floating, ndim=1, mode='c'] w, floating alpha, floating beta, - floating[::1] X_data, # TODO: Make const after release of Cython 3 (#23147) + cnp.ndarray[floating, ndim=1, mode='c'] X_data, const int[::1] X_indices, const int[::1] X_indptr, - floating[::1] y, - floating[::1] sample_weight, - floating[::1] X_mean, + cnp.ndarray[floating, ndim=1, mode='c'] y, + cnp.ndarray[floating, ndim=1, mode='c'] sample_weight, + cnp.ndarray[floating, ndim=1, mode='c'] X_mean, unsigned int max_iter, floating tol, object rng, @@ -564,8 +565,9 @@ def sparse_enet_coordinate_descent( return np.asarray(w), gap, tol, n_iter + 1 +# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used. def enet_coordinate_descent_gram( - floating[::1] w, + cnp.ndarray[floating, ndim=1, mode='c'] w, floating alpha, floating beta, cnp.ndarray[floating, ndim=2, mode='c'] Q, @@ -630,9 +632,9 @@ def enet_coordinate_descent_gram( cdef UINT32_t* rand_r_state = &rand_r_state_seed cdef floating y_norm2 = np.dot(y, y) - cdef floating* w_ptr = &w[0] + cdef floating* w_ptr = &w[0] cdef floating* Q_ptr = &Q[0, 0] - cdef floating* q_ptr = q.data + cdef floating* q_ptr = &q[0] cdef floating* H_ptr = &H[0] cdef floating* XtA_ptr = &XtA[0] tol = tol * y_norm2 @@ -734,9 +736,9 @@ def enet_coordinate_descent_gram( return np.asarray(w), gap, tol, n_iter + 1 - +# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used. def enet_coordinate_descent_multi_task( - floating[::1, :] W, + cnp.ndarray[floating, ndim=2, mode='fortran'] W, floating l1_reg, floating l2_reg, # TODO: use const qualified fused-typed memoryview when Cython 3.0 is used. diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 4a0fe36deae2d..67fffac39d466 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -30,7 +30,6 @@ check_is_fitted, column_or_1d, ) -from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper from ..utils.fixes import delayed # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' @@ -594,9 +593,7 @@ def enet_path( w=coef_, alpha=l1_reg, beta=l2_reg, - X_data=ReadonlyArrayWrapper( - X.data - ), # TODO: Remove after release of Cython 3 (#23147) + X_data=X.data, X_indices=X.indices, X_indptr=X.indptr, y=y,