From d81af67b61645d3825cec81225a8bddf6bffe0cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 25 Jul 2022 16:22:29 +0200 Subject: [PATCH 1/4] TST Make sure memmap are aligned when OpenBLAS detects Prescott architecture --- sklearn/utils/_testing.py | 60 +++++++++++++++++++++++------ sklearn/utils/estimator_checks.py | 21 +--------- sklearn/utils/tests/test_testing.py | 23 ++++++----- 3 files changed, 64 insertions(+), 40 deletions(-) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index a3ff844083998..2217fd10237ef 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -60,6 +60,7 @@ check_is_fitted, check_X_y, ) +from sklearn.utils.fixes import threadpool_info __all__ = [ @@ -602,6 +603,38 @@ def __exit__(self, exc_type, exc_val, exc_tb): _delete_folder(self.temp_folder) +def _create_memmap_backed_array(array, filename, mmap_mode): + # https://numpy.org/doc/stable/reference/generated/numpy.memmap.html + fp = np.memmap(filename, dtype=array.dtype, mode="w+", shape=array.shape) + fp[:] = array[:] # write array to memmap array + fp.flush() + memmap_backed_array = np.memmap( + filename, dtype=array.dtype, mode=mmap_mode, shape=array.shape + ) + return memmap_backed_array + + +def _create_aligned_memmap_backed_arrays(data, mmap_mode, folder): + if isinstance(data, np.ndarray): + filename = op.join(folder, "data.dat") + return _create_memmap_backed_array(data, filename, mmap_mode) + + if isinstance(data, Iterable) and all( + isinstance(each, np.ndarray) for each in data + ): + return [ + _create_memmap_backed_array( + array, op.join(folder, f"data{index}.dat"), mmap_mode + ) + for index, array in enumerate(data) + ] + + raise ValueError( + "When creating aligned memmap-backed arrays, input must be a single array or a" + " iterable of arrays" + ) + + def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False): """ Parameters @@ -616,18 +649,23 @@ def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned= """ temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_") atexit.register(functools.partial(_delete_folder, temp_folder, warn=True)) + # OpenBLAS is known to segfault with unaligned data on the Prescott + # architecture so force aligned=True on Prescott. For more details, see: + # https://github.com/scipy/scipy/issues/14886 + has_prescott_openblas = any( + True + for info in threadpool_info() + if info["internal_api"] == "openblas" + # Prudently assume Prescott might be the architecture if it is unknown. + and info.get("architecture", "prescott").lower() == "prescott" + ) + if has_prescott_openblas: + aligned = True + if aligned: - if isinstance(data, np.ndarray) and data.flags.aligned: - # https://numpy.org/doc/stable/reference/generated/numpy.memmap.html - filename = op.join(temp_folder, "data.dat") - fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape) - fp[:] = data[:] # write data to memmap array - fp.flush() - memmap_backed_data = np.memmap( - filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape - ) - else: - raise ValueError("If aligned=True, input must be a single numpy array.") + memmap_backed_data = _create_aligned_memmap_backed_arrays( + data, mmap_mode, temp_folder + ) else: filename = op.join(temp_folder, "data.pkl") joblib.dump(data, filename) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 69857117b4ab2..b33119e071328 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -54,7 +54,6 @@ from ..model_selection import ShuffleSplit from ..model_selection._validation import _safe_split from ..metrics.pairwise import rbf_kernel, linear_kernel, pairwise_distances -from ..utils.fixes import threadpool_info from ..utils.fixes import sp_version from ..utils.fixes import parse_version from ..utils.validation import check_is_fitted @@ -2120,22 +2119,6 @@ def check_classifiers_one_label(name, classifier_orig): assert_array_equal(classifier.predict(X_test), y, err_msg=error_string_predict) -def _create_memmap_backed_data(numpy_arrays): - # OpenBLAS is known to segfault with unaligned data on the Prescott architecture - # See: https://github.com/scipy/scipy/issues/14886 - has_prescott_openblas = any( - True - for info in threadpool_info() - if info["internal_api"] == "openblas" - # Prudently assume Prescott might be the architecture if it is unknown. - and info.get("architecture", "prescott").lower() == "prescott" - ) - return [ - create_memmap_backed_data(array, aligned=has_prescott_openblas) - for array in numpy_arrays - ] - - @ignore_warnings # Warnings are raised by decision function def check_classifiers_train( name, classifier_orig, readonly_memmap=False, X_dtype="float64" @@ -2153,7 +2136,7 @@ def check_classifiers_train( X_b -= X_b.min() if readonly_memmap: - X_m, y_m, X_b, y_b = _create_memmap_backed_data([X_m, y_m, X_b, y_b]) + X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b]) problems = [(X_b, y_b)] tags = _safe_tags(classifier_orig) @@ -2814,7 +2797,7 @@ def check_regressors_train( y_ = y if readonly_memmap: - X, y, y_ = _create_memmap_backed_data([X, y, y_]) + X, y, y_ = create_memmap_backed_data([X, y, y_]) if not hasattr(regressor, "alphas") and hasattr(regressor, "alpha"): # linear regressors need to set alpha, but not generalized CV ones diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index fca7a07b14c19..8bee96b5fa95d 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -702,16 +702,19 @@ def test_create_memmap_backed_data(monkeypatch, aligned): assert registration_counter.nb_calls == 3 input_list = [input_array, input_array + 1, input_array + 2] - if aligned: - with pytest.raises( - ValueError, match="If aligned=True, input must be a single numpy array." - ): - create_memmap_backed_data(input_list, aligned=True) - else: - mmap_data_list = create_memmap_backed_data(input_list, aligned=False) - for input_array, data in zip(input_list, mmap_data_list): - check_memmap(input_array, data) - assert registration_counter.nb_calls == 4 + mmap_data_list = create_memmap_backed_data(input_list, aligned=aligned) + for input_array, data in zip(input_list, mmap_data_list): + check_memmap(input_array, data) + assert registration_counter.nb_calls == 4 + + with pytest.raises( + ValueError, + match=( + "When creating aligned memmap-backed arrays, input must be a single array" + " or a iterable of arrays" + ), + ): + create_memmap_backed_data([input_array, "not-an-array"], aligned=True) @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) From 7ec6a65ba343d02afdb4ca67634f14900dd16924 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 25 Jul 2022 13:33:42 -0400 Subject: [PATCH 2/4] CI Run [nogil] From 22a96676e3fd9fc9d0627ebafbaea7a3899c2779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 26 Jul 2022 11:48:57 +0200 Subject: [PATCH 3/4] Allow sequence of arrays --- sklearn/utils/_testing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 2217fd10237ef..4851322197d7b 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -22,6 +22,7 @@ import re import contextlib from collections.abc import Iterable +from collections.abc import Sequence import scipy as sp from functools import wraps @@ -619,7 +620,7 @@ def _create_aligned_memmap_backed_arrays(data, mmap_mode, folder): filename = op.join(folder, "data.dat") return _create_memmap_backed_array(data, filename, mmap_mode) - if isinstance(data, Iterable) and all( + if isinstance(data, Sequence) and all( isinstance(each, np.ndarray) for each in data ): return [ @@ -631,7 +632,7 @@ def _create_aligned_memmap_backed_arrays(data, mmap_mode, folder): raise ValueError( "When creating aligned memmap-backed arrays, input must be a single array or a" - " iterable of arrays" + " sequence of arrays" ) From 7dcd7a3e422a6c3997785d0512594388e868e852 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 26 Jul 2022 12:34:54 +0200 Subject: [PATCH 4/4] fix test --- sklearn/utils/tests/test_testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index 8bee96b5fa95d..75f35a3dea83c 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -711,7 +711,7 @@ def test_create_memmap_backed_data(monkeypatch, aligned): ValueError, match=( "When creating aligned memmap-backed arrays, input must be a single array" - " or a iterable of arrays" + " or a sequence of arrays" ), ): create_memmap_backed_data([input_array, "not-an-array"], aligned=True)