diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 23304b2673723..69b497abf0dea 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -501,6 +501,11 @@ Decomposition, manifold learning and clustering :class:`mixture.BayesianGaussianMixture`. :issue:`10740` by :user:`Erich Schubert <kno10>` and :user:`Guillaume Lemaitre <glemaitre>`. +- Fixed a bug in :class:`decomposition.SparseCoder` when running OMP sparse + coding in parallel using read-only memory-mapped data structures. :issue:`5956` + by :user:`Vighnesh Birodkar <vighneshbirodkar>` and + :user:`Olivier Grisel <ogrisel>`. + Metrics - Fixed a bug in :func:`metrics.precision_recall_fscore_support` diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 0bf62cbc8d77b..aa67adeec76f1 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -1,3 +1,4 @@ +from __future__ import division import pytest import numpy as np @@ -366,3 +367,22 @@ def test_sparse_coder_estimator(): transform_alpha=0.001).transform(X) assert_true(not np.all(code == 0)) assert_less(np.sqrt(np.sum((np.dot(code, V) - X) ** 2)), 0.1) + + +def test_sparse_coder_parallel_mmap(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/5956 + # Test that SparseCoder does not error when passing read-only + # arrays to child processes + + rng = np.random.RandomState(777) + n_components, n_features = 40, 64 + init_dict = rng.rand(n_components, n_features) + # Ensure that `data` is about 2MB. Joblib memory maps arrays + # if they are larger than 1MB. 
The 4 accounts for float32 + # data type + n_samples = int(2e6) // (4 * n_features) + data = np.random.rand(n_samples, n_features).astype(np.float32) + + sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2) + sc.fit_transform(data) diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index 298a1fa4259d9..777b915d0339a 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -191,7 +191,7 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, tol_0=None, tol=None, """ Gram = Gram.copy('F') if copy_Gram else np.asfortranarray(Gram) - if copy_Xy: + if copy_Xy or not Xy.flags.writeable: Xy = Xy.copy() min_float = np.finfo(Gram.dtype).eps @@ -491,6 +491,9 @@ def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None, Xy = Xy[:, np.newaxis] if tol is not None: norms_squared = [norms_squared] + if copy_Xy or not Xy.flags.writeable: + # Make the copy once instead of many times in _gram_omp itself. + Xy = Xy.copy() if n_nonzero_coefs is None and tol is None: n_nonzero_coefs = int(0.1 * len(Gram)) @@ -515,7 +518,7 @@ def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None, out = _gram_omp( Gram, Xy[:, k], n_nonzero_coefs, norms_squared[k] if tol is not None else None, tol, - copy_Gram=copy_Gram, copy_Xy=copy_Xy, + copy_Gram=copy_Gram, copy_Xy=False, return_path=return_path) if return_path: _, idx, coefs, n_iter = out diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index 355c2eaf697bc..d083e745f8299 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -104,6 +104,20 @@ def test_perfect_signal_recovery(): assert_array_almost_equal(gamma[:, 0], gamma_gram, decimal=2) +def test_orthogonal_mp_gram_readonly(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/5956 + idx, = gamma[:, 0].nonzero() + G_readonly = G.copy() + G_readonly.setflags(write=False) + Xy_readonly = Xy.copy() + 
Xy_readonly.setflags(write=False) + gamma_gram = orthogonal_mp_gram(G_readonly, Xy_readonly[:, 0], 5, + copy_Gram=False, copy_Xy=False) + assert_array_equal(idx, np.flatnonzero(gamma_gram)) + assert_array_almost_equal(gamma[:, 0], gamma_gram, decimal=2) + + def test_estimator(): omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs) omp.fit(X, y[:, 0])