diff --git a/sklearn/utils/_readonly_array_wrapper.pyx b/sklearn/utils/_readonly_array_wrapper.pyx index 55ac82f9d80fd..2c81330df2eb0 100644 --- a/sklearn/utils/_readonly_array_wrapper.pyx +++ b/sklearn/utils/_readonly_array_wrapper.pyx @@ -48,12 +48,12 @@ cdef class ReadonlyArrayWrapper: PyBuffer_Release(buffer) -def _test_sum(NUM_TYPES[:] x): +def _test_sum(NUM_TYPES[::1] x): """This function is for testing only. As this function does not modify x, we would like to define it as - _test_sum(const NUM_TYPES[:] x) + _test_sum(const NUM_TYPES[::1] x) which is not possible as fused typed const memoryviews aren't supported in Cython<3.0. diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 1e4ecdd53e136..1724063be2f43 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -520,19 +520,36 @@ def __exit__(self, exc_type, exc_val, exc_tb): _delete_folder(self.temp_folder) -def create_memmap_backed_data(data, mmap_mode="r", return_folder=False): +def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False): """ Parameters ---------- data mmap_mode : str, default='r' return_folder : bool, default=False + aligned : bool, default=False + If True and the input is a single aligned numpy array, the memory mapped + array will also be aligned; otherwise a ValueError is raised. This is a + workaround for https://github.com/joblib/joblib/issues/563. 
""" temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_") atexit.register(functools.partial(_delete_folder, temp_folder, warn=True)) - filename = op.join(temp_folder, "data.pkl") - joblib.dump(data, filename) - memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode) + if aligned: + if isinstance(data, np.ndarray) and data.flags.aligned: + # https://numpy.org/doc/stable/reference/generated/numpy.memmap.html + filename = op.join(temp_folder, "data.dat") + fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape) + fp[:] = data[:] # write data to memmap array + fp.flush() + memmap_backed_data = np.memmap( + filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape + ) + else: + raise ValueError("If aligned=True, input must be a single numpy array.") + else: + filename = op.join(temp_folder, "data.pkl") + joblib.dump(data, filename) + memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode) result = ( memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder) ) diff --git a/sklearn/utils/tests/test_readonly_wrapper.py b/sklearn/utils/tests/test_readonly_wrapper.py index c385e1db51cd3..38163cc2461ce 100644 --- a/sklearn/utils/tests/test_readonly_wrapper.py +++ b/sklearn/utils/tests/test_readonly_wrapper.py @@ -13,7 +13,13 @@ def _readonly_array_copy(x): return y -@pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data]) +def _create_memmap_backed_data(data): + return create_memmap_backed_data( + data, mmap_mode="r", return_folder=False, aligned=True + ) + + +@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data]) @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_readonly_array_wrapper(readonly, dtype): """Test that ReadonlyWrapper allows working with fused-typed.""" diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index e9e9252bd5f0f..a3a42aeb4c83f 100644 --- 
a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -12,6 +12,7 @@ from sklearn.utils.deprecation import deprecated from sklearn.utils.metaestimators import available_if, if_delegate_has_method +from sklearn.utils._readonly_array_wrapper import _test_sum from sklearn.utils._testing import ( assert_raises, assert_warns, @@ -680,30 +681,59 @@ def test_tempmemmap(monkeypatch): assert registration_counter.nb_calls == 2 -def test_create_memmap_backed_data(monkeypatch): +@pytest.mark.parametrize("aligned", [False, True]) +def test_create_memmap_backed_data(monkeypatch, aligned): registration_counter = RegistrationCounter() monkeypatch.setattr(atexit, "register", registration_counter) input_array = np.ones(3) - data = create_memmap_backed_data(input_array) + data = create_memmap_backed_data(input_array, aligned=aligned) check_memmap(input_array, data) assert registration_counter.nb_calls == 1 - data, folder = create_memmap_backed_data(input_array, return_folder=True) + data, folder = create_memmap_backed_data( + input_array, return_folder=True, aligned=aligned + ) check_memmap(input_array, data) assert folder == os.path.dirname(data.filename) assert registration_counter.nb_calls == 2 mmap_mode = "r+" - data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode) + data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode, aligned=aligned) check_memmap(input_array, data, mmap_mode) assert registration_counter.nb_calls == 3 input_list = [input_array, input_array + 1, input_array + 2] - mmap_data_list = create_memmap_backed_data(input_list) - for input_array, data in zip(input_list, mmap_data_list): - check_memmap(input_array, data) - assert registration_counter.nb_calls == 4 + if aligned: + with pytest.raises( + ValueError, match="If aligned=True, input must be a single numpy array." 
+ ): + create_memmap_backed_data(input_list, aligned=True) + else: + mmap_data_list = create_memmap_backed_data(input_list, aligned=False) + for input_array, data in zip(input_list, mmap_data_list): + check_memmap(input_array, data) + assert registration_counter.nb_calls == 4 + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) +def test_memmap_on_contiguous_data(dtype): + """Test memory mapped array on contiguous memoryview.""" + x = np.arange(10).astype(dtype) + assert x.flags["C_CONTIGUOUS"] + assert x.flags["ALIGNED"] + + # _test_sum consumes contiguous arrays + # def _test_sum(NUM_TYPES[::1] x): + sum_origin = _test_sum(x) + + # now on memory mapped data + # aligned=True to avoid https://github.com/joblib/joblib/issues/563 + # without alignment, this can produce segmentation faults, see + # https://github.com/scikit-learn/scikit-learn/pull/21654 + x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True) + sum_mmap = _test_sum(x_mmap) + assert sum_mmap == pytest.approx(sum_origin, rel=1e-11) @pytest.mark.parametrize(