Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,11 @@ Decomposition, manifold learning and clustering
:class:`mixture.BayesianGaussianMixture`. :issue:`10740` by :user:`Erich
Schubert <kno10>` and :user:`Guillaume Lemaitre <glemaitre>`.

- Fixed a bug in :class:`decomposition.SparseCoder` when running OMP sparse
coding in parallel using readonly memory mapped datastructures. :issue:`5956`
by :user:`Vighnesh Birodkar <vighneshbirodkar>` and
:user:`Olivier Grisel <ogrisel>`.

Metrics

- Fixed a bug in :func:`metrics.precision_recall_fscore_support`
Expand Down
20 changes: 20 additions & 0 deletions sklearn/decomposition/tests/test_dict_learning.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from __future__ import division
import pytest

import numpy as np
Expand Down Expand Up @@ -366,3 +367,22 @@ def test_sparse_coder_estimator():
transform_alpha=0.001).transform(X)
assert_true(not np.all(code == 0))
assert_less(np.sqrt(np.sum((np.dot(code, V) - X) ** 2)), 0.1)


def test_sparse_coder_parallel_mmap():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/5956
# Test that SparseCoder does not error by passing reading only
# arrays to child processes

rng = np.random.RandomState(777)
n_components, n_features = 40, 64
init_dict = rng.rand(n_components, n_features)
# Ensure that `data` is >2M. Joblib memory maps arrays
# if they are larger than 1MB. The 4 accounts for float32
# data type
n_samples = int(2e6) // (4 * n_features)
data = np.random.rand(n_samples, n_features).astype(np.float32)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively can just do data.setflags(write=False). That way having a lot of data is not necessary to generate the same problem. Did something similar in PR ( #11342 ) in case it helps.

Copy link
Member Author

@ogrisel ogrisel Jun 23, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, I believe it's not possible to use this as in this particular case it's an internal data structure Xy (inderectly derived from the data) that is being mutated.

We could write a unittest for the private function _gram_omp tough.


sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2)
sc.fit_transform(data)
7 changes: 5 additions & 2 deletions sklearn/linear_model/omp.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, tol_0=None, tol=None,
"""
Gram = Gram.copy('F') if copy_Gram else np.asfortranarray(Gram)

if copy_Xy:
if copy_Xy or not Xy.flags.writeable:
Xy = Xy.copy()

min_float = np.finfo(Gram.dtype).eps
Expand Down Expand Up @@ -491,6 +491,9 @@ def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None,
Xy = Xy[:, np.newaxis]
if tol is not None:
norms_squared = [norms_squared]
if copy_Xy or not Xy.flags.writeable:
# Make the copy once instead of many times in _gram_omp itself.
Xy = Xy.copy()

if n_nonzero_coefs is None and tol is None:
n_nonzero_coefs = int(0.1 * len(Gram))
Expand All @@ -515,7 +518,7 @@ def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None,
out = _gram_omp(
Gram, Xy[:, k], n_nonzero_coefs,
norms_squared[k] if tol is not None else None, tol,
copy_Gram=copy_Gram, copy_Xy=copy_Xy,
copy_Gram=copy_Gram, copy_Xy=False,
return_path=return_path)
if return_path:
_, idx, coefs, n_iter = out
Expand Down
14 changes: 14 additions & 0 deletions sklearn/linear_model/tests/test_omp.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,20 @@ def test_perfect_signal_recovery():
assert_array_almost_equal(gamma[:, 0], gamma_gram, decimal=2)


def test_orthogonal_mp_gram_readonly():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/5956
idx, = gamma[:, 0].nonzero()
G_readonly = G.copy()
G_readonly.setflags(write=False)
Xy_readonly = Xy.copy()
Xy_readonly.setflags(write=False)
gamma_gram = orthogonal_mp_gram(G_readonly, Xy_readonly[:, 0], 5,
copy_Gram=False, copy_Xy=False)
assert_array_equal(idx, np.flatnonzero(gamma_gram))
assert_array_almost_equal(gamma[:, 0], gamma_gram, decimal=2)


def test_estimator():
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs)
omp.fit(X, y[:, 0])
Expand Down