diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index ff89358c0c1f6..dc4b563627d64 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -140,6 +140,9 @@ python_environment_install() { } scikit_learn_install() { + # install joblib from branch with memmap alignment fix + python -m pip install https://github.com/lesteve/joblib/archive/refs/heads/memmap-align.zip + setup_ccache show_installed_libraries diff --git a/setup.py b/setup.py index 99633008c8dfc..30eefbd6c2c7c 100755 --- a/setup.py +++ b/setup.py @@ -230,6 +230,10 @@ def check_package_status(package, min_version): def setup_package(): + install_requires = min_deps.tag_to_packages["install"].copy() + install_requires.append( + "joblib @ https://github.com/lesteve/joblib/archive/refs/heads/memmap-align.zip" + ) metadata = dict( name=DISTNAME, maintainer=MAINTAINER, @@ -264,7 +268,7 @@ def setup_package(): ], cmdclass=cmdclass, python_requires=">=3.7", - install_requires=min_deps.tag_to_packages["install"], + install_requires=install_requires, package_data={"": ["*.pxd"]}, **extra_setuptools_args, ) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 9a56f12ea0039..79ec7dfb47d48 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -275,10 +275,10 @@ def test_loss_dtype( out2 = np.empty_like(raw_prediction, dtype=dtype_out) if readonly_memmap: - y_true = create_memmap_backed_data(y_true, aligned=True) - raw_prediction = create_memmap_backed_data(raw_prediction, aligned=True) + y_true = create_memmap_backed_data(y_true) + raw_prediction = create_memmap_backed_data(raw_prediction) if sample_weight is not None: - sample_weight = create_memmap_backed_data(sample_weight, aligned=True) + sample_weight = create_memmap_backed_data(sample_weight) loss.loss( y_true=y_true, diff --git a/sklearn/utils/_readonly_array_wrapper.pyx b/sklearn/utils/_readonly_array_wrapper.pyx index 2c81330df2eb0..55ac82f9d80fd 100644 --- a/sklearn/utils/_readonly_array_wrapper.pyx +++ b/sklearn/utils/_readonly_array_wrapper.pyx @@ -48,12 +48,12 @@ cdef class ReadonlyArrayWrapper: PyBuffer_Release(buffer) -def _test_sum(NUM_TYPES[::1] x): +def _test_sum(NUM_TYPES[:] x): """This function is for testing only. As this function does not modify x, we would like to define it as - _test_sum(const NUM_TYPES[::1] x) + _test_sum(const NUM_TYPES[:] x) which is not possible as fused typed const memoryviews aren't supported in Cython<3.0. diff --git a/sklearn/utils/_show_versions.py b/sklearn/utils/_show_versions.py index ca01fd1909f57..df1d506fa0766 100644 --- a/sklearn/utils/_show_versions.py +++ b/sklearn/utils/_show_versions.py @@ -44,8 +44,8 @@ def _get_deps_info(): """ deps = [ - "pip", "setuptools", + "pip", "sklearn", "numpy", "scipy", diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index ac84bf058df8c..f890c8f8cefc1 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -529,36 +529,19 @@ def __exit__(self, exc_type, exc_val, exc_tb): _delete_folder(self.temp_folder) -def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False): +def create_memmap_backed_data(data, mmap_mode="r", return_folder=False): """ Parameters ---------- data mmap_mode : str, default='r' return_folder : bool, default=False - aligned : bool, default=False - If True, if input is a single numpy array and if the input array is aligned, - the memory mapped array will also be aligned. This is a workaround for - https://github.com/joblib/joblib/issues/563. """ temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_") atexit.register(functools.partial(_delete_folder, temp_folder, warn=True)) - if aligned: - if isinstance(data, np.ndarray) and data.flags.aligned: - # https://numpy.org/doc/stable/reference/generated/numpy.memmap.html - filename = op.join(temp_folder, "data.dat") - fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape) - fp[:] = data[:] # write data to memmap array - fp.flush() - memmap_backed_data = np.memmap( - filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape - ) - else: - raise ValueError("If aligned=True, input must be a single numpy array.") - else: - filename = op.join(temp_folder, "data.pkl") - joblib.dump(data, filename) - memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode) + filename = op.join(temp_folder, "data.pkl") + joblib.dump(data, filename) + memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode) result = ( memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder) ) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 4230ab3532b76..cf58168fb562a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -52,7 +52,6 @@ from ..model_selection import ShuffleSplit from ..model_selection._validation import _safe_split from ..metrics.pairwise import rbf_kernel, linear_kernel, pairwise_distances -from ..utils.fixes import threadpool_info from ..utils.validation import check_is_fitted from . import shuffle @@ -2089,19 +2088,7 @@ def check_classifiers_one_label(name, classifier_orig): def _create_memmap_backed_data(numpy_arrays): - # OpenBLAS is known to segfault with unaligned data on the Prescott architecture - # See: https://github.com/scipy/scipy/issues/14886 - has_prescott_openblas = any( - True - for info in threadpool_info() - if info["internal_api"] == "openblas" - # Prudently assume Prescott might be the architecture if it is unknown. - and info.get("architecture", "prescott").lower() == "prescott" - ) - return [ - create_memmap_backed_data(array, aligned=has_prescott_openblas) - for array in numpy_arrays - ] + return [create_memmap_backed_data(array) for array in numpy_arrays] @ignore_warnings # Warnings are raised by decision function diff --git a/sklearn/utils/tests/test_readonly_wrapper.py b/sklearn/utils/tests/test_readonly_wrapper.py index 38163cc2461ce..c385e1db51cd3 100644 --- a/sklearn/utils/tests/test_readonly_wrapper.py +++ b/sklearn/utils/tests/test_readonly_wrapper.py @@ -13,13 +13,7 @@ def _readonly_array_copy(x): return y -def _create_memmap_backed_data(data): - return create_memmap_backed_data( - data, mmap_mode="r", return_folder=False, aligned=True - ) - - -@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data]) +@pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data]) @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_readonly_array_wrapper(readonly, dtype): """Test that ReadonlyWrapper allows working with fused-typed.""" diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index ea4831fb02400..b14edbe402606 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -12,7 +12,6 @@ from sklearn.utils.deprecation import deprecated from sklearn.utils.metaestimators import available_if, if_delegate_has_method -from sklearn.utils._readonly_array_wrapper import _test_sum from sklearn.utils._testing import ( assert_raises, assert_warns, @@ -677,59 +676,30 @@ def test_tempmemmap(monkeypatch): assert registration_counter.nb_calls == 2 -@pytest.mark.parametrize("aligned", [False, True]) -def test_create_memmap_backed_data(monkeypatch, aligned): +def test_create_memmap_backed_data(monkeypatch): registration_counter = RegistrationCounter() monkeypatch.setattr(atexit, "register", registration_counter) input_array = np.ones(3) - data = create_memmap_backed_data(input_array, aligned=aligned) + data = create_memmap_backed_data(input_array) check_memmap(input_array, data) assert registration_counter.nb_calls == 1 - data, folder = create_memmap_backed_data( - input_array, return_folder=True, aligned=aligned - ) + data, folder = create_memmap_backed_data(input_array, return_folder=True) check_memmap(input_array, data) assert folder == os.path.dirname(data.filename) assert registration_counter.nb_calls == 2 mmap_mode = "r+" - data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode, aligned=aligned) + data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode) check_memmap(input_array, data, mmap_mode) assert registration_counter.nb_calls == 3 input_list = [input_array, input_array + 1, input_array + 2] - if aligned: - with pytest.raises( - ValueError, match="If aligned=True, input must be a single numpy array." - ): - create_memmap_backed_data(input_list, aligned=True) - else: - mmap_data_list = create_memmap_backed_data(input_list, aligned=False) - for input_array, data in zip(input_list, mmap_data_list): - check_memmap(input_array, data) - assert registration_counter.nb_calls == 4 - - -@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) -def test_memmap_on_contiguous_data(dtype): - """Test memory mapped array on contigous memoryview.""" - x = np.arange(10).astype(dtype) - assert x.flags["C_CONTIGUOUS"] - assert x.flags["ALIGNED"] - - # _test_sum consumes contiguous arrays - # def _test_sum(NUM_TYPES[::1] x): - sum_origin = _test_sum(x) - - # now on memory mapped data - # aligned=True so avoid https://github.com/joblib/joblib/issues/563 - # without alignment, this can produce segmentation faults, see - # https://github.com/scikit-learn/scikit-learn/pull/21654 - x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True) - sum_mmap = _test_sum(x_mmap) - assert sum_mmap == pytest.approx(sum_origin, rel=1e-11) + mmap_data_list = create_memmap_backed_data(input_list) + for input_array, data in zip(input_list, mmap_data_list): + check_memmap(input_array, data) + assert registration_counter.nb_calls == 4 @pytest.mark.parametrize(