Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 50 additions & 11 deletions sklearn/utils/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import re
import contextlib
from collections.abc import Iterable
from collections.abc import Sequence

import scipy as sp
from functools import wraps
Expand Down Expand Up @@ -60,6 +61,7 @@
check_is_fitted,
check_X_y,
)
from sklearn.utils.fixes import threadpool_info


__all__ = [
Expand Down Expand Up @@ -602,6 +604,38 @@ def __exit__(self, exc_type, exc_val, exc_tb):
_delete_folder(self.temp_folder)


def _create_memmap_backed_array(array, filename, mmap_mode):
    """Write ``array`` to ``filename`` and reopen the file as a ``np.memmap``.

    Parameters
    ----------
    array : ndarray
        Array whose dtype, shape and values are written to disk.
    filename : str
        Path of the file backing the memmap. It is opened with mode ``"w+"``
        for the initial write.
    mmap_mode : str
        Mode forwarded to ``np.memmap`` when the file is reopened.

    Returns
    -------
    memmap_backed_array : np.memmap
        Memmap over ``filename`` with the same dtype and shape as ``array``.
    """
    # https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
    fp = np.memmap(filename, dtype=array.dtype, mode="w+", shape=array.shape)
    # Ellipsis assignment copies every element and, unlike ``array[:]``, does
    # not raise IndexError when ``array`` is 0-dimensional.
    fp[...] = array
    # Make sure the values reach the file before it is mapped a second time.
    fp.flush()
    return np.memmap(filename, dtype=array.dtype, mode=mmap_mode, shape=array.shape)


def _create_aligned_memmap_backed_arrays(data, mmap_mode, folder):
    """Create memmap-backed copies of an array or of a sequence of arrays.

    Parameters
    ----------
    data : ndarray or sequence of ndarray
        Input to copy. A single array is stored in ``data.dat``; the arrays
        of a sequence are stored in ``data0.dat``, ``data1.dat``, ...
    mmap_mode : str
        Mode passed on to ``_create_memmap_backed_array``.
    folder : str
        Directory in which the backing files are created.

    Returns
    -------
    np.memmap or list of np.memmap
        One memmap for a single array, otherwise a list with one memmap per
        input array, in input order.

    Raises
    ------
    ValueError
        If ``data`` is neither an array nor a sequence made only of arrays.
    """
    # Single array: one backing file.
    if isinstance(data, np.ndarray):
        return _create_memmap_backed_array(
            data, op.join(folder, "data.dat"), mmap_mode
        )

    is_array_sequence = isinstance(data, Sequence) and all(
        isinstance(item, np.ndarray) for item in data
    )
    if not is_array_sequence:
        raise ValueError(
            "When creating aligned memmap-backed arrays, input must be a single array or a"
            " sequence of arrays"
        )

    # Sequence of arrays: one backing file per element, numbered by position.
    memmapped = []
    for index, array in enumerate(data):
        target = op.join(folder, f"data{index}.dat")
        memmapped.append(_create_memmap_backed_array(array, target, mmap_mode))
    return memmapped


def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False):
"""
Parameters
Expand All @@ -616,18 +650,23 @@ def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=
"""
temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_")
atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))
# OpenBLAS is known to segfault with unaligned data on the Prescott
# architecture so force aligned=True on Prescott. For more details, see:
# https://github.com/scipy/scipy/issues/14886
has_prescott_openblas = any(
True
for info in threadpool_info()
if info["internal_api"] == "openblas"
# Prudently assume Prescott might be the architecture if it is unknown.
and info.get("architecture", "prescott").lower() == "prescott"
)
if has_prescott_openblas:
aligned = True

if aligned:
if isinstance(data, np.ndarray) and data.flags.aligned:
# https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
filename = op.join(temp_folder, "data.dat")
fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape)
fp[:] = data[:] # write data to memmap array
fp.flush()
memmap_backed_data = np.memmap(
filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape
)
else:
raise ValueError("If aligned=True, input must be a single numpy array.")
memmap_backed_data = _create_aligned_memmap_backed_arrays(
data, mmap_mode, temp_folder
)
else:
filename = op.join(temp_folder, "data.pkl")
joblib.dump(data, filename)
Expand Down
21 changes: 2 additions & 19 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@
from ..model_selection import ShuffleSplit
from ..model_selection._validation import _safe_split
from ..metrics.pairwise import rbf_kernel, linear_kernel, pairwise_distances
from ..utils.fixes import threadpool_info
from ..utils.fixes import sp_version
from ..utils.fixes import parse_version
from ..utils.validation import check_is_fitted
Expand Down Expand Up @@ -2120,22 +2119,6 @@ def check_classifiers_one_label(name, classifier_orig):
assert_array_equal(classifier.predict(X_test), y, err_msg=error_string_predict)


def _create_memmap_backed_data(numpy_arrays):
    """Return a memmap-backed version of each array in ``numpy_arrays``.

    Each array is passed to ``create_memmap_backed_data``; ``aligned`` is set
    to True when an OpenBLAS entry reported by ``threadpool_info()`` is, or
    may be, running on the Prescott architecture.
    """

    def _may_be_prescott_openblas(info):
        # Only OpenBLAS entries are relevant here.
        if info["internal_api"] != "openblas":
            return False
        # Prudently assume Prescott might be the architecture if it is unknown.
        return info.get("architecture", "prescott").lower() == "prescott"

    # OpenBLAS is known to segfault with unaligned data on the Prescott architecture
    # See: https://github.com/scipy/scipy/issues/14886
    force_aligned = any(
        _may_be_prescott_openblas(info) for info in threadpool_info()
    )

    memmapped = []
    for array in numpy_arrays:
        memmapped.append(create_memmap_backed_data(array, aligned=force_aligned))
    return memmapped


@ignore_warnings # Warnings are raised by decision function
def check_classifiers_train(
name, classifier_orig, readonly_memmap=False, X_dtype="float64"
Expand All @@ -2153,7 +2136,7 @@ def check_classifiers_train(
X_b -= X_b.min()

if readonly_memmap:
X_m, y_m, X_b, y_b = _create_memmap_backed_data([X_m, y_m, X_b, y_b])
X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b])

problems = [(X_b, y_b)]
tags = _safe_tags(classifier_orig)
Expand Down Expand Up @@ -2814,7 +2797,7 @@ def check_regressors_train(
y_ = y

if readonly_memmap:
X, y, y_ = _create_memmap_backed_data([X, y, y_])
X, y, y_ = create_memmap_backed_data([X, y, y_])

if not hasattr(regressor, "alphas") and hasattr(regressor, "alpha"):
# linear regressors need to set alpha, but not generalized CV ones
Expand Down
23 changes: 13 additions & 10 deletions sklearn/utils/tests/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,16 +702,19 @@ def test_create_memmap_backed_data(monkeypatch, aligned):
assert registration_counter.nb_calls == 3

input_list = [input_array, input_array + 1, input_array + 2]
if aligned:
with pytest.raises(
ValueError, match="If aligned=True, input must be a single numpy array."
):
create_memmap_backed_data(input_list, aligned=True)
else:
mmap_data_list = create_memmap_backed_data(input_list, aligned=False)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4
mmap_data_list = create_memmap_backed_data(input_list, aligned=aligned)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4

with pytest.raises(
ValueError,
match=(
"When creating aligned memmap-backed arrays, input must be a single array"
" or a sequence of arrays"
),
):
create_memmap_backed_data([input_array, "not-an-array"], aligned=True)


@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
Expand Down