Skip to content

[MRG+1] Read-only data compatibility for Lasso #4775

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion examples/decomposition/plot_image_denoising.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@

###############################################################################
# Load Lena image and extract patches

lena = lena() / 256.0

# downsample for higher speed
Expand Down
13 changes: 12 additions & 1 deletion sklearn/decomposition/tests/test_dict_learning.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import numpy as np


from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_less
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import TempMemmap

from sklearn.decomposition import DictionaryLearning
from sklearn.decomposition import MiniBatchDictionaryLearning
Expand Down Expand Up @@ -60,6 +62,15 @@ def test_dict_learning_reconstruction_parallel():
assert_array_almost_equal(np.dot(code, dico.components_), X, decimal=2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PEP8: you need 2 empty lines between top-level functions



def test_dict_learning_lassocd_readonly_data():
n_components = 12
with TempMemmap(X) as X_read_only:
dico = DictionaryLearning(n_components, transform_algorithm='lasso_cd',
transform_alpha=0.001, random_state=0, n_jobs=-1)
code = dico.fit(X_read_only).transform(X_read_only)
assert_array_almost_equal(np.dot(code, dico.components_), X_read_only, decimal=2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PEP8 here two: add an empty line.



def test_dict_learning_nonzero_coefs():
n_components = 4
dico = DictionaryLearning(n_components, transform_algorithm='lars',
Expand Down Expand Up @@ -214,4 +225,4 @@ def test_sparse_coder_estimator():
code = SparseCoder(dictionary=V, transform_algorithm='lasso_lars',
transform_alpha=0.001).transform(X)
assert_true(not np.all(code == 0))
assert_less(np.sqrt(np.sum((np.dot(code, V) - X) ** 2)), 0.1)
assert_less(np.sqrt(np.sum((np.dot(code, V) - X) ** 2)), 0.1)
5,171 changes: 2,640 additions & 2,531 deletions sklearn/linear_model/cd_fast.c

Large diffs are not rendered by default.

13 changes: 8 additions & 5 deletions sklearn/linear_model/cd_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,9 @@ def enet_coordinate_descent(np.ndarray[DOUBLE, ndim=1] w,
@cython.cdivision(True)
def sparse_enet_coordinate_descent(double[:] w,
double alpha, double beta,
double[:] X_data, int[:] X_indices,
int[:] X_indptr, double[:] y,
np.ndarray[double, ndim=1] X_data,
np.ndarray[int, ndim=1] X_indices,
np.ndarray[int, ndim=1] X_indptr, np.ndarray[double, ndim=1] y,
double[:] X_mean, int max_iter,
double tol, object rng, bint random=0,
bint positive=0):
Expand Down Expand Up @@ -487,7 +488,9 @@ def sparse_enet_coordinate_descent(double[:] w,
@cython.wraparound(False)
@cython.cdivision(True)
def enet_coordinate_descent_gram(double[:] w, double alpha, double beta,
double[:, :] Q, double[:] q, double[:] y,
np.ndarray[double, ndim=2] Q,
np.ndarray[double, ndim=1] q,
np.ndarray[double, ndim=1] y,
int max_iter, double tol, object rng,
bint random=0, bint positive=0):
"""Cython version of the coordinate descent algorithm
Expand Down Expand Up @@ -628,8 +631,8 @@ def enet_coordinate_descent_gram(double[:] w, double alpha, double beta,
@cython.wraparound(False)
@cython.cdivision(True)
def enet_coordinate_descent_multi_task(double[::1, :] W, double l1_reg,
double l2_reg, double[::1, :] X,
double[:, :] Y, int max_iter,
double l2_reg, np.ndarray[double, ndim=2] X,
np.ndarray[double, ndim=2] Y, int max_iter,
double tol, object rng,
bint random=0):
"""Cython version of the coordinate descent algorithm
Expand Down
1 change: 1 addition & 0 deletions sklearn/linear_model/coordinate_descent.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
ElasticNetCV
"""
X = check_array(X, 'csc', dtype=np.float64, order='F', copy=copy_X)
y = check_array(y, 'csc', dtype=np.float64, order='F', copy=False, ensure_2d=False)
if Xy is not None:
Xy = check_array(Xy, 'csc', dtype=np.float64, order='F', copy=False,
ensure_2d=False)
Expand Down
24 changes: 24 additions & 0 deletions sklearn/linear_model/tests/test_coordinate_descent.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import TempMemmap

from sklearn.linear_model.coordinate_descent import Lasso, \
LassoCV, ElasticNet, ElasticNetCV, MultiTaskLasso, MultiTaskElasticNet, \
Expand Down Expand Up @@ -388,6 +389,29 @@ def test_multi_task_lasso_and_enet():
assert_array_almost_equal(clf.coef_[0], clf.coef_[1])


def test_lasso_readonly_data():
X = np.array([[-1], [0], [1]])
Y = np.array([-1, 0, 1]) # just a straight line
T = np.array([[2], [3], [4]]) # test sample
with TempMemmap((X, Y)) as (X, Y):
clf = Lasso(alpha=0.5)
clf.fit(X, Y)
pred = clf.predict(T)
assert_array_almost_equal(clf.coef_, [.25])
assert_array_almost_equal(pred, [0.5, 0.75, 1.])
assert_almost_equal(clf.dual_gap_, 0)


def test_multi_task_lasso_readonly_data():
X, y, X_test, y_test = build_dataset()
Y = np.c_[y, y]
with TempMemmap((X, Y)) as (X, Y):
Y = np.c_[y, y]
clf = MultiTaskLasso(alpha=1, tol=1e-8).fit(X, Y)
assert_true(0 < clf.dual_gap_ < 1e-5)
assert_array_almost_equal(clf.coef_[0], clf.coef_[1])


def test_enet_multitarget():
n_targets = 3
X, y, _, _ = build_dataset(n_samples=10, n_features=8,
Expand Down
22 changes: 3 additions & 19 deletions sklearn/linear_model/tests/test_least_angle.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
import tempfile
import shutil
import os.path as op
import warnings
from nose.tools import assert_equal

import numpy as np
Expand All @@ -16,6 +12,7 @@
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_no_warnings, assert_warns
from sklearn.utils.testing import TempMemmap
from sklearn.utils import ConvergenceWarning
from sklearn import linear_model, datasets
from sklearn.linear_model.least_angle import _lars_path_residues
Expand Down Expand Up @@ -440,19 +437,6 @@ def test_lars_path_readonly_data():
# This is a non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/4597
splitted_data = train_test_split(X, y, random_state=42)
temp_folder = tempfile.mkdtemp()
try:
fpath = op.join(temp_folder, 'data.pkl')
joblib.dump(splitted_data, fpath)
X_train, X_test, y_train, y_test = joblib.load(fpath, mmap_mode='r')

with TempMemmap(splitted_data) as (X_train, X_test, y_train, y_test):
# The following should not fail despite copy=False
_lars_path_residues(X_train, y_train, X_test, y_test, copy=False)
finally:
# try to release the mmap file handle in time to be able to delete
# the temporary folder under windows
del X_train, X_test, y_train, y_test
try:
shutil.rmtree(temp_folder)
except shutil.WindowsError:
warnings.warn("Could not delete temporary folder %s" % temp_folder)
_lars_path_residues(X_train, y_train, X_test, y_test, copy=False)
43 changes: 43 additions & 0 deletions sklearn/utils/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,20 @@
from urllib.request import urlopen
from urllib.error import HTTPError

import tempfile
import shutil
import os.path as op
import atexit

# WindowsError only exist on Windows
try:
WindowsError
except NameError:
WindowsError = None

import sklearn
from sklearn.base import BaseEstimator
from sklearn.externals import joblib

# Conveniently import all assertions in one place.
from nose.tools import assert_equal
Expand Down Expand Up @@ -697,5 +709,36 @@ def check_skip_travis():
if os.environ.get('TRAVIS') == "true":
raise SkipTest("This test needs to be skipped on Travis")


def _delete_folder(folder_path, warn=False):
"""Utility function to cleanup a temporary folder if still existing.
Copy from joblib.pool (for independance)"""
try:
if os.path.exists(folder_path):
# This can fail under windows,
# but will succeed when called by atexit
shutil.rmtree(folder_path)
except WindowsError:
if warn:
warnings.warn("Could not delete temporary folder %s" % folder_path)


class TempMemmap(object):
def __init__(self, data, mmap_mode='r'):
self.temp_folder = tempfile.mkdtemp(prefix='sklearn_testing_')
self.mmap_mode = mmap_mode
self.data = data

def __enter__(self):
fpath = op.join(self.temp_folder, 'data.pkl')
joblib.dump(self.data, fpath)
data_read_only = joblib.load(fpath, mmap_mode=self.mmap_mode)
atexit.register(lambda: _delete_folder(self.temp_folder, warn=True))
return data_read_only

def __exit__(self, exc_type, exc_val, exc_tb):
_delete_folder(self.temp_folder)


with_network = with_setup(check_skip_network)
with_travis = with_setup(check_skip_travis)