Skip to content

Check joblib memmap alignment fix #22607

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions build_tools/azure/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ python_environment_install() {
}

scikit_learn_install() {
# install joblib from branch with memmap alignment fix
python -m pip install https://github.com/lesteve/joblib/archive/refs/heads/memmap-align.zip

setup_ccache
show_installed_libraries

Expand Down
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,10 @@ def check_package_status(package, min_version):


def setup_package():
install_requires = min_deps.tag_to_packages["install"].copy()
install_requires.append(
"joblib @ https://github.com/lesteve/joblib/archive/refs/heads/memmap-align.zip"
)
metadata = dict(
name=DISTNAME,
maintainer=MAINTAINER,
Expand Down Expand Up @@ -264,7 +268,7 @@ def setup_package():
],
cmdclass=cmdclass,
python_requires=">=3.7",
install_requires=min_deps.tag_to_packages["install"],
install_requires=install_requires,
package_data={"": ["*.pxd"]},
**extra_setuptools_args,
)
Expand Down
6 changes: 3 additions & 3 deletions sklearn/_loss/tests/test_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,10 +275,10 @@ def test_loss_dtype(
out2 = np.empty_like(raw_prediction, dtype=dtype_out)

if readonly_memmap:
y_true = create_memmap_backed_data(y_true, aligned=True)
raw_prediction = create_memmap_backed_data(raw_prediction, aligned=True)
y_true = create_memmap_backed_data(y_true)
raw_prediction = create_memmap_backed_data(raw_prediction)
if sample_weight is not None:
sample_weight = create_memmap_backed_data(sample_weight, aligned=True)
sample_weight = create_memmap_backed_data(sample_weight)

loss.loss(
y_true=y_true,
Expand Down
4 changes: 2 additions & 2 deletions sklearn/utils/_readonly_array_wrapper.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,12 @@ cdef class ReadonlyArrayWrapper:
PyBuffer_Release(buffer)


def _test_sum(NUM_TYPES[::1] x):
def _test_sum(NUM_TYPES[:] x):
"""This function is for testing only.

As this function does not modify x, we would like to define it as

_test_sum(const NUM_TYPES[::1] x)
_test_sum(const NUM_TYPES[:] x)

which is not possible as fused typed const memoryviews aren't
supported in Cython<3.0.
Expand Down
2 changes: 1 addition & 1 deletion sklearn/utils/_show_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def _get_deps_info():

"""
deps = [
"pip",
"setuptools",
"pip",
"sklearn",
"numpy",
"scipy",
Expand Down
25 changes: 4 additions & 21 deletions sklearn/utils/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,36 +529,19 @@ def __exit__(self, exc_type, exc_val, exc_tb):
_delete_folder(self.temp_folder)


def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False):
def create_memmap_backed_data(data, mmap_mode="r", return_folder=False):
"""
Parameters
----------
data
mmap_mode : str, default='r'
return_folder : bool, default=False
aligned : bool, default=False
If True, if input is a single numpy array and if the input array is aligned,
the memory mapped array will also be aligned. This is a workaround for
https://github.com/joblib/joblib/issues/563.
"""
temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_")
atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))
if aligned:
if isinstance(data, np.ndarray) and data.flags.aligned:
# https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
filename = op.join(temp_folder, "data.dat")
fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape)
fp[:] = data[:] # write data to memmap array
fp.flush()
memmap_backed_data = np.memmap(
filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape
)
else:
raise ValueError("If aligned=True, input must be a single numpy array.")
else:
filename = op.join(temp_folder, "data.pkl")
joblib.dump(data, filename)
memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
filename = op.join(temp_folder, "data.pkl")
joblib.dump(data, filename)
memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
result = (
memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder)
)
Expand Down
15 changes: 1 addition & 14 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
from ..model_selection import ShuffleSplit
from ..model_selection._validation import _safe_split
from ..metrics.pairwise import rbf_kernel, linear_kernel, pairwise_distances
from ..utils.fixes import threadpool_info
from ..utils.validation import check_is_fitted

from . import shuffle
Expand Down Expand Up @@ -2089,19 +2088,7 @@ def check_classifiers_one_label(name, classifier_orig):


def _create_memmap_backed_data(numpy_arrays):
# OpenBLAS is known to segfault with unaligned data on the Prescott architecture
# See: https://github.com/scipy/scipy/issues/14886
has_prescott_openblas = any(
True
for info in threadpool_info()
if info["internal_api"] == "openblas"
# Prudently assume Prescott might be the architecture if it is unknown.
and info.get("architecture", "prescott").lower() == "prescott"
)
return [
create_memmap_backed_data(array, aligned=has_prescott_openblas)
for array in numpy_arrays
]
return [create_memmap_backed_data(array) for array in numpy_arrays]


@ignore_warnings # Warnings are raised by decision function
Expand Down
8 changes: 1 addition & 7 deletions sklearn/utils/tests/test_readonly_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,7 @@ def _readonly_array_copy(x):
return y


def _create_memmap_backed_data(data):
return create_memmap_backed_data(
data, mmap_mode="r", return_folder=False, aligned=True
)


@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data])
@pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data])
@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_readonly_array_wrapper(readonly, dtype):
"""Test that ReadonlyArrayWrapper allows working with fused-typed memoryviews."""
Expand Down
46 changes: 8 additions & 38 deletions sklearn/utils/tests/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

from sklearn.utils.deprecation import deprecated
from sklearn.utils.metaestimators import available_if, if_delegate_has_method
from sklearn.utils._readonly_array_wrapper import _test_sum
from sklearn.utils._testing import (
assert_raises,
assert_warns,
Expand Down Expand Up @@ -677,59 +676,30 @@ def test_tempmemmap(monkeypatch):
assert registration_counter.nb_calls == 2


@pytest.mark.parametrize("aligned", [False, True])
def test_create_memmap_backed_data(monkeypatch, aligned):
def test_create_memmap_backed_data(monkeypatch):
registration_counter = RegistrationCounter()
monkeypatch.setattr(atexit, "register", registration_counter)

input_array = np.ones(3)
data = create_memmap_backed_data(input_array, aligned=aligned)
data = create_memmap_backed_data(input_array)
check_memmap(input_array, data)
assert registration_counter.nb_calls == 1

data, folder = create_memmap_backed_data(
input_array, return_folder=True, aligned=aligned
)
data, folder = create_memmap_backed_data(input_array, return_folder=True)
check_memmap(input_array, data)
assert folder == os.path.dirname(data.filename)
assert registration_counter.nb_calls == 2

mmap_mode = "r+"
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode, aligned=aligned)
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
check_memmap(input_array, data, mmap_mode)
assert registration_counter.nb_calls == 3

input_list = [input_array, input_array + 1, input_array + 2]
if aligned:
with pytest.raises(
ValueError, match="If aligned=True, input must be a single numpy array."
):
create_memmap_backed_data(input_list, aligned=True)
else:
mmap_data_list = create_memmap_backed_data(input_list, aligned=False)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4


@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_memmap_on_contiguous_data(dtype):
"""Test memory mapped array on contiguous memoryview."""
x = np.arange(10).astype(dtype)
assert x.flags["C_CONTIGUOUS"]
assert x.flags["ALIGNED"]

# _test_sum consumes contiguous arrays
# def _test_sum(NUM_TYPES[::1] x):
sum_origin = _test_sum(x)

# now on memory mapped data
# aligned=True to avoid https://github.com/joblib/joblib/issues/563
# without alignment, this can produce segmentation faults, see
# https://github.com/scikit-learn/scikit-learn/pull/21654
x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True)
sum_mmap = _test_sum(x_mmap)
assert sum_mmap == pytest.approx(sum_origin, rel=1e-11)
mmap_data_list = create_memmap_backed_data(input_list)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4


@pytest.mark.parametrize(
Expand Down