Skip to content

Commit 79c21c5

Browse files
authored
TST Make sure memmap are aligned when OpenBLAS detects Prescott architecture (#23994)
1 parent 7da7ba6 commit 79c21c5

File tree

3 files changed

+65
-40
lines changed

3 files changed

+65
-40
lines changed

sklearn/utils/_testing.py

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import re
2323
import contextlib
2424
from collections.abc import Iterable
25+
from collections.abc import Sequence
2526

2627
import scipy as sp
2728
from functools import wraps
@@ -60,6 +61,7 @@
6061
check_is_fitted,
6162
check_X_y,
6263
)
64+
from sklearn.utils.fixes import threadpool_info
6365

6466

6567
__all__ = [
@@ -602,6 +604,38 @@ def __exit__(self, exc_type, exc_val, exc_tb):
602604
_delete_folder(self.temp_folder)
603605

604606

607+
def _create_memmap_backed_array(array, filename, mmap_mode):
    """Write ``array`` to ``filename`` and return it re-opened as a memmap.

    Parameters
    ----------
    array : ndarray
        Array whose content, dtype and shape are written to disk.
    filename : str
        Path of the raw data file to create (opened with mode ``"w+"``).
    mmap_mode : str
        Mode passed to ``np.memmap`` when re-opening the file, e.g. ``"r"``.

    Returns
    -------
    memmap_backed_array : np.memmap
        A memmap over ``filename`` with the same dtype and shape as ``array``.
    """
    # https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
    fp = np.memmap(filename, dtype=array.dtype, mode="w+", shape=array.shape)
    fp[:] = array[:]  # write array to memmap array
    # Push the written data to disk before the file is mapped a second time.
    fp.flush()
    memmap_backed_array = np.memmap(
        filename, dtype=array.dtype, mode=mmap_mode, shape=array.shape
    )
    return memmap_backed_array
616+
617+
618+
def _create_aligned_memmap_backed_arrays(data, mmap_mode, folder):
    """Create memmap-backed copies of one array or of a sequence of arrays.

    Each array is written to its own raw file inside ``folder`` through
    ``_create_memmap_backed_array``.
    NOTE(review): presumably the result is aligned because each file is
    mapped from its start rather than from inside a pickle -- confirm.

    Parameters
    ----------
    data : ndarray or sequence of ndarray
        The array(s) to back with memory-mapped files.
    mmap_mode : str
        Mode forwarded to ``_create_memmap_backed_array``.
    folder : str
        Directory in which the data files are created.

    Returns
    -------
    np.memmap or list of np.memmap
        A single memmap when ``data`` is an array, otherwise a list holding
        one memmap per input array, in the same order.

    Raises
    ------
    ValueError
        If ``data`` is neither an array nor a sequence made only of arrays.
    """
    if isinstance(data, np.ndarray):
        filename = op.join(folder, "data.dat")
        return _create_memmap_backed_array(data, filename, mmap_mode)

    if isinstance(data, Sequence) and all(
        isinstance(each, np.ndarray) for each in data
    ):
        # One file per array so that every array starts at offset 0 of its file.
        return [
            _create_memmap_backed_array(
                array, op.join(folder, f"data{index}.dat"), mmap_mode
            )
            for index, array in enumerate(data)
        ]

    raise ValueError(
        "When creating aligned memmap-backed arrays, input must be a single array or a"
        " sequence of arrays"
    )
637+
638+
605639
def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False):
606640
"""
607641
Parameters
@@ -616,18 +650,23 @@ def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=
616650
"""
617651
temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_")
618652
atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))
653+
# OpenBLAS is known to segfault with unaligned data on the Prescott
654+
# architecture so force aligned=True on Prescott. For more details, see:
655+
# https://github.com/scipy/scipy/issues/14886
656+
has_prescott_openblas = any(
657+
True
658+
for info in threadpool_info()
659+
if info["internal_api"] == "openblas"
660+
# Prudently assume Prescott might be the architecture if it is unknown.
661+
and info.get("architecture", "prescott").lower() == "prescott"
662+
)
663+
if has_prescott_openblas:
664+
aligned = True
665+
619666
if aligned:
620-
if isinstance(data, np.ndarray) and data.flags.aligned:
621-
# https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
622-
filename = op.join(temp_folder, "data.dat")
623-
fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape)
624-
fp[:] = data[:] # write data to memmap array
625-
fp.flush()
626-
memmap_backed_data = np.memmap(
627-
filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape
628-
)
629-
else:
630-
raise ValueError("If aligned=True, input must be a single numpy array.")
667+
memmap_backed_data = _create_aligned_memmap_backed_arrays(
668+
data, mmap_mode, temp_folder
669+
)
631670
else:
632671
filename = op.join(temp_folder, "data.pkl")
633672
joblib.dump(data, filename)

sklearn/utils/estimator_checks.py

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@
5454
from ..model_selection import ShuffleSplit
5555
from ..model_selection._validation import _safe_split
5656
from ..metrics.pairwise import rbf_kernel, linear_kernel, pairwise_distances
57-
from ..utils.fixes import threadpool_info
5857
from ..utils.fixes import sp_version
5958
from ..utils.fixes import parse_version
6059
from ..utils.validation import check_is_fitted
@@ -2120,22 +2119,6 @@ def check_classifiers_one_label(name, classifier_orig):
21202119
assert_array_equal(classifier.predict(X_test), y, err_msg=error_string_predict)
21212120

21222121

2123-
def _create_memmap_backed_data(numpy_arrays):
2124-
# OpenBLAS is known to segfault with unaligned data on the Prescott architecture
2125-
# See: https://github.com/scipy/scipy/issues/14886
2126-
has_prescott_openblas = any(
2127-
True
2128-
for info in threadpool_info()
2129-
if info["internal_api"] == "openblas"
2130-
# Prudently assume Prescott might be the architecture if it is unknown.
2131-
and info.get("architecture", "prescott").lower() == "prescott"
2132-
)
2133-
return [
2134-
create_memmap_backed_data(array, aligned=has_prescott_openblas)
2135-
for array in numpy_arrays
2136-
]
2137-
2138-
21392122
@ignore_warnings # Warnings are raised by decision function
21402123
def check_classifiers_train(
21412124
name, classifier_orig, readonly_memmap=False, X_dtype="float64"
@@ -2153,7 +2136,7 @@ def check_classifiers_train(
21532136
X_b -= X_b.min()
21542137

21552138
if readonly_memmap:
2156-
X_m, y_m, X_b, y_b = _create_memmap_backed_data([X_m, y_m, X_b, y_b])
2139+
X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b])
21572140

21582141
problems = [(X_b, y_b)]
21592142
tags = _safe_tags(classifier_orig)
@@ -2814,7 +2797,7 @@ def check_regressors_train(
28142797
y_ = y
28152798

28162799
if readonly_memmap:
2817-
X, y, y_ = _create_memmap_backed_data([X, y, y_])
2800+
X, y, y_ = create_memmap_backed_data([X, y, y_])
28182801

28192802
if not hasattr(regressor, "alphas") and hasattr(regressor, "alpha"):
28202803
# linear regressors need to set alpha, but not generalized CV ones

sklearn/utils/tests/test_testing.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -702,16 +702,19 @@ def test_create_memmap_backed_data(monkeypatch, aligned):
702702
assert registration_counter.nb_calls == 3
703703

704704
input_list = [input_array, input_array + 1, input_array + 2]
705-
if aligned:
706-
with pytest.raises(
707-
ValueError, match="If aligned=True, input must be a single numpy array."
708-
):
709-
create_memmap_backed_data(input_list, aligned=True)
710-
else:
711-
mmap_data_list = create_memmap_backed_data(input_list, aligned=False)
712-
for input_array, data in zip(input_list, mmap_data_list):
713-
check_memmap(input_array, data)
714-
assert registration_counter.nb_calls == 4
705+
mmap_data_list = create_memmap_backed_data(input_list, aligned=aligned)
706+
for input_array, data in zip(input_list, mmap_data_list):
707+
check_memmap(input_array, data)
708+
assert registration_counter.nb_calls == 4
709+
710+
with pytest.raises(
711+
ValueError,
712+
match=(
713+
"When creating aligned memmap-backed arrays, input must be a single array"
714+
" or a sequence of arrays"
715+
),
716+
):
717+
create_memmap_backed_data([input_array, "not-an-array"], aligned=True)
715718

716719

717720
@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])

0 commit comments

Comments
 (0)