diff --git a/.github/meson_actions/action.yml b/.github/meson_actions/action.yml new file mode 100644 index 000000000000..aff70da169bc --- /dev/null +++ b/.github/meson_actions/action.yml @@ -0,0 +1,29 @@ +name: MesonBuildTest +description: "checkout repo, build, and test numpy" +runs: + using: composite + steps: + - name: Install dependencies + shell: bash + run: pip install -r build_requirements.txt + - name: Build + shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"' + env: + TERM: xterm-256color + run: + spin build -- ${MESON_ARGS[@]} + - name: Check build-internal dependencies + shell: bash + run: + ninja -C build -t missingdeps + - name: Check installed test and stub files + shell: bash + run: + python tools/check_installed_files.py $(find ./build-install -path '*/site-packages/numpy') + - name: Test + shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"' + env: + TERM: xterm-256color + run: | + pip install pytest pytest-xdist hypothesis typing_extensions + spin test -j auto diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 928018b13905..b0a24d7730a1 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -49,7 +49,7 @@ jobs: if: "github.repository == 'numpy/numpy'" runs-on: ubuntu-latest env: - WITHOUT_SIMD: 1 + MESON_ARGS: "-Dallow-noblas=true -Dcpu-baseline=none -Dcpu-dispatch=none" steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3 with: @@ -58,7 +58,7 @@ jobs: - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 with: python-version: ${{ env.PYTHON_VERSION }} - - uses: ./.github/actions + - uses: ./.github/meson_actions basic: needs: [smoke_test] @@ -122,7 +122,7 @@ jobs: runs-on: ubuntu-latest if: github.event_name != 'push' env: - WITHOUT_OPTIMIZATIONS: 1 + MESON_ARGS: "-Dallow-noblas=true -Ddisable-optimization=true" steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3 with: @@ -131,14 +131,14 @@ jobs: - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 with: python-version: ${{ env.PYTHON_VERSION }} - - uses: ./.github/actions + - uses: ./.github/meson_actions with_baseline_only: needs: [smoke_test] runs-on: ubuntu-latest if: github.event_name != 'push' env: - CPU_DISPATCH: "none" + MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=none" steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3 with: @@ -147,14 +147,14 @@ jobs: - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 with: python-version: ${{ env.PYTHON_VERSION }} - - uses: ./.github/actions + - uses: ./.github/meson_actions without_avx512: needs: [smoke_test] runs-on: ubuntu-latest if: github.event_name != 'push' env: - CPU_DISPATCH: "max -xop -fma4 -avx512f -avx512cd -avx512_knl -avx512_knm -avx512_skx -avx512_clx -avx512_cnl -avx512_icl" + MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=SSSE3,SSE41,POPCNT,SSE42,AVX,F16C,AVX2,FMA3" steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3 with: @@ -163,14 +163,14 @@ jobs: - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 with: python-version: ${{ env.PYTHON_VERSION }} - - uses: ./.github/actions + - uses: ./.github/meson_actions without_avx512_avx2_fma3: needs: [smoke_test] runs-on: ubuntu-latest if: github.event_name != 'push' env: - CPU_DISPATCH: "SSSE3 SSE41 POPCNT SSE42 AVX F16C" + MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=SSSE3,SSE41,POPCNT,SSE42,AVX,F16C" steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3 with: @@ -179,7 +179,7 @@ jobs: - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 with: python-version: ${{ env.PYTHON_VERSION }} - - uses: ./.github/actions + - uses: ./.github/meson_actions debug: needs: [smoke_test] diff --git a/MANIFEST.in b/MANIFEST.in index ab6ecd518e1b..4803b39131e1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -21,6 +21,8 @@ recursive-include numpy/random *.pyx *.pxd *.pyx.in *.pxd.in include numpy/py.typed include numpy/random/include/* include numpy/*.pxd +# Meson CPU Dispatcher +recursive-include meson_cpu *.build *.in # Add build support that should go in sdist, but not go in bdist/be installed # Note that sub-directories that don't have __init__ are apparently not # included by 'recursive-include', so list those separately diff --git a/build_requirements.txt b/build_requirements.txt index 3627f1b91685..e7e776a7de89 100644 --- a/build_requirements.txt +++ b/build_requirements.txt @@ -1,5 +1,5 @@ -meson-python>=0.10.0 -Cython +meson-python>=0.13.1 +Cython>=3.0 wheel==0.38.1 ninja spin==0.4 diff --git a/doc/source/user/quickstart.rst b/doc/source/user/quickstart.rst index 783d5a447df9..bc6c3b3818d2 100644 --- a/doc/source/user/quickstart.rst +++ b/doc/source/user/quickstart.rst @@ -517,7 +517,7 @@ and other Python sequences. >>> for i in a: ... print(i**(1 / 3.)) ... - 9.999999999999998 + 9.999999999999998 # may vary 1.0 9.999999999999998 3.0 diff --git a/meson.build b/meson.build index 8bfe987715d1..33d0e7b462ef 100644 --- a/meson.build +++ b/meson.build @@ -6,7 +6,7 @@ project( # See `numpy/__init__.py` version: '1.26.0.dev0', license: 'BSD-3', - meson_version: '>= 1.1.0', + meson_version: '>=1.2.99', # version in vendored-meson is 1.2.99 default_options: [ 'buildtype=debugoptimized', 'b_ndebug=if-release', @@ -80,4 +80,5 @@ else meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif +subdir('meson_cpu') subdir('numpy') diff --git a/meson_cpu/arm/meson.build b/meson_cpu/arm/meson.build new file mode 100644 index 000000000000..f968b2e99682 --- /dev/null +++ b/meson_cpu/arm/meson.build @@ -0,0 +1,58 @@ +source_root = meson.project_source_root() +mod_features = import('features') +NEON = mod_features.new( + 'NEON', 1, + test_code: files(source_root + '/numpy/distutils/checks/cpu_neon.c')[0] +) +NEON_FP16 = mod_features.new( + 'NEON_FP16', 2, implies: NEON, + test_code: files(source_root + '/numpy/distutils/checks/cpu_neon_fp16.c')[0] +) +# FMA +NEON_VFPV4 = mod_features.new( + 'NEON_VFPV4', 3, implies: NEON_FP16, + test_code: files(source_root + '/numpy/distutils/checks/cpu_neon_vfpv4.c')[0] +) +# Advanced SIMD +ASIMD = mod_features.new( + 'ASIMD', 4, implies: NEON_VFPV4, detect: {'val': 'ASIMD', 'match': 'NEON.*'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_asimd.c')[0] +) +cpu_family = host_machine.cpu_family() +if cpu_family == 'aarch64' + # hardware baseline + NEON.update(implies: [NEON_FP16, NEON_VFPV4, ASIMD]) + NEON_FP16.update(implies: [NEON, NEON_VFPV4, ASIMD]) + NEON_VFPV4.update(implies: [NEON, NEON_FP16, ASIMD]) +elif cpu_family == 'arm' + NEON.update(args: '-mfpu=neon') + NEON_FP16.update(args: ['-mfp16-format=ieee', {'val': '-mfpu=neon-fp16', 'match': '-mfpu=.*'}]) + NEON_VFPV4.update(args: [{'val': '-mfpu=neon-vfpv4', 'match': '-mfpu=.*'}]) + ASIMD.update(args: [ + {'val': '-mfpu=neon-fp-armv8', 'match': '-mfpu=.*'}, + '-march=armv8-a+simd' + ]) +endif +# ARMv8.2 half-precision & vector arithm +ASIMDHP = mod_features.new( + 'ASIMDHP', 5, implies: ASIMD, + args: {'val': '-march=armv8.2-a+fp16', 'match': '-march=.*', 'mfilter': '\+.*'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_asimdhp.c')[0] +) +## ARMv8.2 dot product +ASIMDDP = mod_features.new( + 'ASIMDDP', 6, implies: ASIMD, + args: {'val': '-march=armv8.2-a+dotprod', 'match': '-march=.*', 'mfilter': '\+.*'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_asimddp.c')[0] +) +## ARMv8.2 Single & half-precision Multiply +ASIMDFHM = mod_features.new( + 'ASIMDFHM', 7, implies: ASIMDHP, + args: {'val': '-march=armv8.2-a+fp16fml', 'match': '-march=.*', 'mfilter': '\+.*'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_asimdfhm.c')[0] +) +# TODO: Add support for MSVC +ARM_FEATURES = { + 'NEON': NEON, 'NEON_FP16': NEON_FP16, 'NEON_VFPV4': NEON_VFPV4, + 'ASIMD': ASIMD, 'ASIMDHP': ASIMDHP, 'ASIMDFHM': ASIMDFHM +} diff --git a/meson_cpu/main_config.h.in b/meson_cpu/main_config.h.in new file mode 100644 index 000000000000..c7c13b2c7eb1 --- /dev/null +++ b/meson_cpu/main_config.h.in @@ -0,0 +1,351 @@ +/* + * Main configuration header of the CPU dispatcher. + * + * This header is autogenerated by the Meson build script located at `meson_cpu/meson.build`. + * It provides a set of utilities that are required for the runtime dispatching process. + * + * The most important macros in this header are: + * - @ref @P@CPU_DISPATCH_DECLARE: Used to declare the dispatched functions and variables. + * - @ref @P@CPU_DISPATCH_CURFX: Used to define the dispatched functions with target-specific suffixes. + * - @ref @P@CPU_DISPATCH_CALL: Used for runtime dispatching of the exported functions and variables. + */ +#ifndef @P@_CPU_DISPATCHER_CONF_H_ +#define @P@_CPU_DISPATCHER_CONF_H_ +/// This definition is required to provides comptablity with NumPy distutils +#define @P@_CPU_MESON_BUILD +/** + * @def @P@WITH_CPU_BASELINE + * Enabled baseline features names as a single string where each is separated by a single space. + * For example: "SSE SSE2 SSE3" + * Required for logging purposes only. + */ +#define @P@WITH_CPU_BASELINE "@WITH_CPU_BASELINE@" +/** + * @def @P@WITH_CPU_BASELINE_N + * Number of enabled baseline features. + */ +#define @P@WITH_CPU_BASELINE_N @WITH_CPU_BASELINE_N@ +/** + * @def @P@WITH_CPU_DISPATCH + * Dispatched features names as a single string where each is separated by a single space. + */ +#define @P@WITH_CPU_DISPATCH "@WITH_CPU_DISPATCH@" +/** + * @def @P@WITH_CPU_DISPATCH_N + * Number of enabled dispatched features. + */ +#define @P@WITH_CPU_DISPATCH_N @WITH_CPU_DISPATCH_N@ +// Expand a macro, used by the following macros +#define @P@_CPU_EXPAND(X) X +#define @P@_CPU_CAT__(a, b) a ## b +#define @P@_CPU_CAT_(a, b) @P@_CPU_CAT__(a, b) +#define @P@_CPU_CAT(a, b) @P@_CPU_CAT_(a, b) + +/** + * @def @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...) + * Call each enabled baseline feature sorted by lowest interest + * using preprocessor callback without testing whiher the + * feature is supported by CPU or not. + * + * Required for logging purposes only, for example, generating + * a Python list to hold the information of the enabled features. + * + * Unwrapped Version: + * @code + * #define @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...) \ + * @P@_CPU_EXPAND(EXEC_CB(SSE, __VA_ARGS__)) \ + * @P@_CPU_EXPAND(EXEC_CB(SSE2, __VA_ARGS__)) \ + * @P@_CPU_EXPAND(EXEC_CB(SSE3, __VA_ARGS__)) + * @endcode + * + * @param EXEC_CB The preprocessor callback to be called for each enabled baseline feature. + * @param ... Additional arguments to be passed to the preprocessor callback. + */ +#define @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...) \ +@WITH_CPU_BASELINE_CALL@ + +/** + * @def @P@WITH_CPU_DISPATCH_CALL(EXEC_CB, ...) + * Similar to the above but for enabled dispatched features. + * + * @param EXEC_CB The preprocessor callback to be called for each enabled dispatched feature. + * @param ... Additional arguments to be passed to the preprocessor callback. + */ +#define @P@WITH_CPU_DISPATCH_CALL(EXEC_CB, ...) \ +@WITH_CPU_DISPATCH_CALL@ + +/* + * Defines the default behavior for the configurable macros derived from the configuration header + * that is generated by the meson function `mod_features.multi_targets()`. + * + * Note: Providing fallback in case of optimization disabled is no longer needed for meson + * since we always guarantee having configuration headers. + * + * However, it is still needed for compatibility with Numpy distutils. + */ +#ifndef @P@DISABLE_OPTIMIZATION + #define @P@MTARGETS_CONF_BASELINE(CB, ...) \ + &&"Expected config header that generated by mod_features.multi_targets()"; + #define @P@MTARGETS_CONF_DISPATCH(TEST_FEATURE_CB, CB, ...) \ + &&"Expected config header that generated by mod_features.multi_targets()"; +#else + #define @P@MTARGETS_CONF_BASELINE(CB, ...) @P@_CPU_EXPAND(CB(__VA_ARGS__)) + #define @P@MTARGETS_CONF_DISPATCH(CHK, CB, ...) +#endif +/** + * @def @P@CPU_DISPATCH_CURFX(NAME) + * + * Returns `NAME` suffixed with "_" + "the current target" during compiling + * the generated static libraries that are derived from the Meson function + * `mod_features.multi_targets()`. + * + * It also returns `NAME` as-is without any suffix when it comes to the baseline features or + * in case if the optimization is disabled. + * + * Note: `mod_features.multi_targets()` provides a unique target name within the compiler #definition + * called `@P@MTARGETS_CURRENT` on each generated library based on the specified features + * within its parameter 'dispatch:'. + * + * For example: + * + * @code + * # from meson + * mod_features.multi_targets( + * 'arithmetic.dispatch.h', 'arithmetic.c', + * baseline: [SSE3], dispatch: [AVX512_SKX, AVX2], + * prefix: '@P@' + * ) + * @code + * + * @code + * void @P@CPU_DISPATCH_CURFX(add)(const int *src0, const int *src1, int *dst) + * { + * #ifdef @P@HAVE_AVX512F // one of the implied feature of AVX512_SKX + * // code + * #elif defined(@P@HAVE_AVX2) + * // code + * #elif defined(@P@HAVE_SSE3) + * // CODE + * #else + * // Fallback code in case of features enabled + * #endif + * } + * @endif + * + * // Unwrapped version : + * void add_AVX512_SKX(const int *src0, const int *src1, int *dst) + * {...} + * void add_AVX2(const int *src0, const int *src1, int *dst) + * {...} + * // baseline + * void add(const int *src0, const int *src1, int *dst) + * {...} + * @endcode + * + * @param NAME The base name of the dispatched function or variable. + */ +#ifdef @P@MTARGETS_CURRENT + // '@P@MTARGETS_CURRENT': only defined by the dispatchable sources + #define @P@CPU_DISPATCH_CURFX(NAME) @P@_CPU_CAT(@P@_CPU_CAT(NAME, _), @P@MTARGETS_CURRENT) +#else + #define @P@CPU_DISPATCH_CURFX(NAME) @P@_CPU_EXPAND(NAME) +#endif + +/** + * @def @P@CPU_DISPATCH_DECLARE(...) + * + * Provides forward declarations for the exported variables and functions + * based on the enabled baseline and dispatched features. + * + * This macro requires include the config file that been generated + * by meson function `mod_features.multi_targets()` to determine the enabled + * baseline and dispatched features. + * + * For example: + * + * @code + * # from meson + * mod_features.multi_targets( + * 'arithmetic.dispatch.h', 'arithmetic.c', + * baseline: [SSE3], dispatch: [AVX512_SKX, AVX2], + * prefix: '@P@' + * ) + * @code + * + * @code + * // from C + * #include "arithmetic.dispatch.h" + * @P@CPU_DISPATCH_DECLARE(void add, (const int *src0, const int *src1, int *dst)) + * + * // Unwrapped version: + * void add_AVX512_SKX(const int *src0, const int *src1, int *dst); + * void add_AVX2(const int *src0, const int *src1, int *dst); + * void add(const int *src0, const int *src1, int *dst); // baseline + * @endcode + * + * @param ... The function or variable prototype to be declared, + * with the target-specific suffix added automatically. + */ +#define @P@CPU_DISPATCH_DECLARE(...) \ + @P@MTARGETS_CONF_DISPATCH(@P@CPU_DISPATCH_DECLARE_CHK_, @P@CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__) \ + @P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_DECLARE_BASE_CB_, __VA_ARGS__) + +// Preprocessor callbacks +#define @P@CPU_DISPATCH_DECLARE_CB_(DUMMY, TARGET_NAME, LEFT, ...) \ + @P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; +#define @P@CPU_DISPATCH_DECLARE_BASE_CB_(LEFT, ...) \ + LEFT __VA_ARGS__; +// Dummy CPU runtime checking +#define @P@CPU_DISPATCH_DECLARE_CHK_(FEATURE_NAME) + +/** + * @def @P@CPU_DISPATCH_DECLARE_XB(LEFT, ...) + * + * Same as `@P@CPU_DISPATCH_DECLARE` but exclude the baseline declaration even + * if it was enabled within `mod_features.multi_targets()`. + */ +#define @P@CPU_DISPATCH_DECLARE_XB(...) \ + @P@MTARGETS_CONF_DISPATCH(@P@CPU_DISPATCH_DECLARE_CHK_, @P@CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__) + +/** + * @def @P@CPU_DISPATCH_CALL(...) + * + * Helper macro used for runtime dispatching of the exported functions and variables + * within the meson `mod_features.multi_targets()` function. + * + * This macro dispatches only one symbol based on the order of the specified features within the meson function + * `mod_features.multi_targets()`. For example, if `mod_features.multi_targets()` is called with + * `dispatch: [features_highest_1, features_highest_2]`, the macro will test each enabled feature against + * the CPU at runtime. Once it fails, it will move to the next order until falling back to the baseline. + * + * Similar to `@P@CPU_DISPATCH_DECLARE`, this macro requires including the config file that has been generated + * by the meson function `mod_features.multi_targets()` to determine the enabled baseline and dispatched features. + * + * Example usage: + * + * @code + * # from meson + * mod_features.multi_targets( + * 'arithmetic.dispatch.h', 'arithmetic.c', + * baseline: [SSE3], dispatch: [AVX512_SKX, AVX2], + * prefix: '@P@' + * ) + * @endcode + * + * @code + * // from C + * #include "arithmetic.dispatch.h" + * + * // Example 1: + * @P@CPU_DISPATCH_CALL(add, (src0, src1, dst)); + * + * // Unwrapped version: + * @P@CPU_HAVE(AVX512_SKX) ? add_AVX512_SKX(src0, src1, dst) : + * (@P@CPU_HAVE(AVX2) ? add_AVX2(src0, src1, dst) : + * add(src0, src1, dst); // baseline + * + * // Example 2: + * typedef void (*func_type)(const int*, const int*, int*); + * func_type func = @P@CPU_DISPATCH_CALL(add); + * + * // Unwrapped version: + * func_type func2 = @P@CPU_HAVE(AVX512_SKX) ? add_AVX512_SKX : + * (@P@CPU_HAVE(AVX2) ? add_AVX2 : + * add; // baseline + * + * // Example 3: + * func_type func3; + * @P@CPU_DISPATCH_CALL(func3 = add); + * + * // Unwrapped version: + * func_type func2 = @P@CPU_HAVE(AVX512_SKX) ? func3 = add_AVX512_SKX : + * (@P@CPU_HAVE(AVX2) ? func3 = add_AVX2 : + * func3 = add; // baseline + * + * @endcode + * + * @param ... The function or variable prototype to be called or assigned, + * with the target-specific suffix added automatically. + */ +#define @P@CPU_DISPATCH_CALL(...) \ + @P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_CB_, __VA_ARGS__) \ + @P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_CALL_BASE_CB_, __VA_ARGS__) +// Preprocessor callbacks +#define @P@CPU_DISPATCH_CALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \ + (TESTED_FEATURES) ? (@P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : +#define @P@CPU_DISPATCH_CALL_BASE_CB_(LEFT, ...) \ + (LEFT __VA_ARGS__) + +/** + * @def @P@CPU_DISPATCH_CALL_XB(LEFT, ...) + * + * Same as `@P@CPU_DISPATCH_CALL` but exclude the baseline call even + * if it was provided within meson `mod_features.multi_targets()`. + * + * Note: This macro returns void + */ +#define @P@CPU_DISPATCH_CALL_XB_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \ + (TESTED_FEATURES) ? (void) (@P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : +#define @P@CPU_DISPATCH_CALL_XB(...) \ + @P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_XB_CB_, __VA_ARGS__) \ + ((void) 0 /* discarded expression value */) + +/** + * Macro @P@CPU_DISPATCH_CALL_ALL(...) + * + * Same as `@P@CPU_DISPATCH_CALL` but dispatching all the required optimizations for + * the exported functions and variables instead of highest interested one. + * Returns void. + */ +#define @P@CPU_DISPATCH_CALL_ALL(...) \ + (@P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \ + @P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__)) +// Preprocessor callbacks +#define @P@CPU_DISPATCH_CALL_ALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \ + ((TESTED_FEATURES) ? (@P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : (void) 0), +#define @P@CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \ + ( LEFT __VA_ARGS__ ) + +// Brings the headers files of enabled CPU features +#ifdef @P@HAVE_SSE + #include +#endif +#ifdef @P@HAVE_SSE2 + #include +#endif +#ifdef @P@HAVE_SSE3 + #include +#endif +#ifdef @P@HAVE_SSSE3 + #include +#endif +#ifdef @P@HAVE_SSE41 + #include +#endif +#ifdef @P@HAVE_POPCNT + #ifdef _MSC_VER + #include + #else + #include + #endif +#endif +#ifdef @P@HAVE_AVX + #include +#endif + +#if defined(@P@HAVE_XOP) || defined(@P@HAVE_FMA4) + #include +#endif + +#ifdef @P@HAVE_VSX + #include +#endif + +#ifdef @P@HAVE_VX + #include +#endif + +#ifdef @P@HAVE_NEON + #include +#endif +#endif // @P@_CPU_DISPATCHER_CONF_H_ diff --git a/meson_cpu/meson.build b/meson_cpu/meson.build new file mode 100644 index 000000000000..b99638bfc24f --- /dev/null +++ b/meson_cpu/meson.build @@ -0,0 +1,307 @@ +# The CPU Dispatcher implementation. +# +# This script handles the CPU dispatcher and requires the Meson module +# 'features'. +# +# The CPU dispatcher script is responsible for three main tasks: +# +# 1. Defining the enabled baseline and dispatched features by parsing build +# options or compiler arguments, including detection of native flags. +# +# 2. Specifying the baseline arguments and definitions across all sources. +# +# 3. Generating the main configuration file, which contains information about +# the enabled features, along with a collection of C macros necessary for +# runtime dispatching. For more details, see the template file +# `main_config.h.in`. +# +# This script exposes the following variables: +# +# - `CPU_BASELINE`: A set of CPU feature objects obtained from +# `mod_features.new()`, representing the minimum CPU features +# specified by the build option `-Dcpu-baseline`. +# +# - `CPU_BASELINE_NAMES`: A set of enabled CPU feature names, representing the +# minimum CPU features specified by the build option +# `-Dcpu-baseline`. +# +# - `CPU_DISPATCH_NAMES`: A set of enabled CPU feature names, representing the +# additional CPU features that can be dispatched at +# runtime, specified by the build option +# `-Dcpu-dispatch`. +# +# - `CPU_FEATURES`: A dictionary containing all supported CPU feature objects. +# +# Additionally, this script exposes a set of variables that represent each +# supported feature to be used within the Meson function +# `mod_features.multi_targets()`. + +# Prefix used by all macros and features definitions +CPU_CONF_PREFIX = 'NPY_' +# main configuration name +CPU_CONF_CONFIG = 'npy_cpu_dispatch_config.h' + +if get_option('disable-optimization') + add_project_arguments('-D' + CPU_CONF_PREFIX + 'DISABLE_OPTIMIZATION', language: ['c', 'cpp']) + CPU_CONF_BASELINE = 'none' + CPU_CONF_DISPATCH = 'none' +else + baseline_detect = false + c_args = get_option('c_args') + foreach arg : c_args + foreach carch : ['-march', '-mcpu', '-xhost', '/QxHost'] + if arg.contains(carch) + message('Appending option "detect" to "cpu-baseline" due to detecting global architecture c_arg "' + arg + '"') + baseline_detect = true + break + endif + endforeach + if baseline_detect + break + endif + endforeach + # The required minimal set of required CPU features. + CPU_CONF_BASELINE = get_option('cpu-baseline') + if baseline_detect + CPU_CONF_BASELINE += '+detect' + endif + # The required dispatched set of additional CPU features. + CPU_CONF_DISPATCH = get_option('cpu-dispatch') +endif + +# Initialize the CPU features Export the X86 features objects 'SSE', 'AVX', +# etc. plus a dictionary "X86_FEATURES" which maps to each object by its name +subdir('x86') +subdir('ppc64') +subdir('s390x') +subdir('arm') + +CPU_FEATURES = {} +CPU_FEATURES += ARM_FEATURES +CPU_FEATURES += X86_FEATURES +CPU_FEATURES += PPC64_FEATURES +CPU_FEATURES += S390X_FEATURES + +# Parse the requsted baseline (CPU_CONF_BASELINE) and dispatch features +# (CPU_CONF_DISPATCH). +cpu_family = host_machine.cpu_family() +# Used by build option 'min' +min_features = { + 'x86': [SSE2], + 'x86_64': [SSE3], + 'ppc64': [], + 's390x': [], + 'arm': [], + 'aarch64': [ASIMD] +}.get(cpu_family, []) +if host_machine.endian() == 'little' and cpu_family == 'ppc64' + min_features = [VSX2] +endif + +# Used by build option 'max' +max_features_dict = { + 'x86': X86_FEATURES, + 'x86_64': X86_FEATURES, + 'ppc64': PPC64_FEATURES, + 's390x': S390X_FEATURES, + 'arm': ARM_FEATURES, + 'aarch64': ARM_FEATURES, +}.get(cpu_family, []) +max_features = [] +foreach fet_name, fet_obj : max_features_dict + max_features += [fet_obj] +endforeach + +parse_options = { + 'cpu-baseline': CPU_CONF_BASELINE, + 'cpu-dispatch': CPU_CONF_DISPATCH +} +parse_result = { + 'cpu-baseline': [], + 'cpu-dispatch': [] +} +mod_features = import('features') +foreach opt_name, conf : parse_options + # no support for regex :(? + tokens = conf.replace(',', ' ').replace('+', ' + ').replace('-', ' - ').strip().to_upper().split() + result = [] + ignored = [] + # append is the default + append = true + foreach tok : tokens + if tok == '+' + append = true + continue + elif tok == '-' + append = false + continue + elif tok == 'NONE' + continue + elif tok == 'NATIVE' + if not is_variable('cpu_native_features') + compiler_id = meson.get_compiler('c').get_id() + native_flags = { + 'intel': '-xHost', + 'intel-cl': '/QxHost', + # FIXME: Add support for fcc(-mcpu=a64fx) compiler + }.get(compiler_id, '-march=native') + test_native = mod_features.test( + max_features, anyfet: true, + force_args: [native_flags] + '-DDETECT_FEATURES' + ) + if not test_native[0] + error('Option "native" doesn\'t support compiler', compiler_id) + endif + cpu_native_features = [] + foreach fet_name : test_native[1].get('features') + cpu_native_features += CPU_FEATURES[fet_name] + endforeach + endif + accumulate = cpu_native_features + elif tok == 'DETECT' + if not is_variable('cpu_detect_features') + test_detect = mod_features.test( + max_features, anyfet: true, + force_args: ['-DDETECT_FEATURES'] + get_option('c_args') + ) + cpu_detect_features = [] + foreach fet_name : test_detect[1].get('features') + cpu_detect_features += CPU_FEATURES[fet_name] + endforeach + endif + accumulate = cpu_detect_features + elif tok == 'MIN' + accumulate = min_features + elif tok == 'MAX' + accumulate = max_features + elif tok in CPU_FEATURES + tokobj = CPU_FEATURES[tok] + if tokobj not in max_features + ignored += tok + continue + endif + accumulate = [tokobj] + else + error('Invalid token "'+tok+'" within option --'+opt_name) + endif + if append + foreach fet : accumulate + if fet not in result + result += fet + endif + endforeach + else + filterd = [] + foreach fet : result + if fet not in accumulate + filterd += fet + endif + endforeach + result = filterd + endif # append + endforeach # tok : tokens + if ignored.length() > 0 + message( + 'During parsing ' + opt_name + + ': The following CPU features were ignored due to platform ' + + 'incompatibility or lack of support:\n"' + ' '.join(ignored) + '"' + ) + endif + if result.length() > 0 + parse_result += {opt_name: mod_features.implicit_c(result)} + endif +endforeach # opt_name, conf : parse_options + +# Test the baseline and dispatch features and set their flags and #definitions +# across all sources. +# +# It is important to know that this test enables the maximum supported features +# by the platform depending on the required features. +# +# For example, if the user specified `--cpu-baseline=avx512_skx`, and the +# compiler doesn't support it, but still supports any of the implied features, +# then we enable the maximum supported implied features, e.g., AVX2, which can +# be done by specifying `anyfet: true` to the test function. +if parse_result['cpu-baseline'].length() > 0 + baseline = mod_features.test(parse_result['cpu-baseline'], anyfet: true)[1] + baseline_args = baseline['args'] + foreach baseline_fet : baseline['defines'] + baseline_args += ['-D' + CPU_CONF_PREFIX + 'HAVE_' + baseline_fet] + endforeach + add_project_arguments(baseline_args, language: ['c', 'cpp']) +else + baseline = {} +endif +# The name of the baseline features including its implied features. +CPU_BASELINE_NAMES = baseline.get('features', []) +CPU_BASELINE = [] +foreach fet_name : CPU_BASELINE_NAMES + CPU_BASELINE += [CPU_FEATURES[fet_name]] +endforeach +# Loop over all initialized features and disable any feature that is not part +# of the requested baseline and dispatch features to avoid it enabled by +# import('feature').multi_targets +foreach fet_name, fet_obj : CPU_FEATURES + if fet_obj in parse_result['cpu-dispatch'] or fet_name in CPU_BASELINE_NAMES + continue + endif + fet_obj.update(disable: 'Not part of the requsted features') +endforeach + +CPU_DISPATCH_NAMES = [] +foreach fet_obj : parse_result['cpu-dispatch'] + # skip baseline features + if fet_obj.get('name') in CPU_BASELINE_NAMES + continue + endif + fet_test = mod_features.test(fet_obj) + if not fet_test[0] + continue + endif + CPU_DISPATCH_NAMES += [fet_obj.get('name')] +endforeach +# Generate main configuration header see 'main_config.h.in' for more +# clarification. +main_config = { + 'P': CPU_CONF_PREFIX, + 'WITH_CPU_BASELINE': ' '.join(CPU_BASELINE_NAMES), + 'WITH_CPU_BASELINE_N': CPU_BASELINE_NAMES.length(), + 'WITH_CPU_DISPATCH': ' '.join(CPU_DISPATCH_NAMES), + 'WITH_CPU_DISPATCH_N': CPU_DISPATCH_NAMES.length(), +} +clines = [] +macro_tpl = '@0@_CPU_EXPAND(EXEC_CB(@1@, __VA_ARGS__)) \\' +foreach fet : CPU_BASELINE_NAMES + clines += macro_tpl.format(CPU_CONF_PREFIX, fet) +endforeach +main_config += {'WITH_CPU_BASELINE_CALL': '\n'.join(clines)} +clines = [] +foreach fet : CPU_DISPATCH_NAMES + clines += macro_tpl.format(CPU_CONF_PREFIX, fet) +endforeach +main_config += {'WITH_CPU_DISPATCH_CALL': '\n'.join(clines)} + +configure_file( + input : 'main_config.h.in', + output : CPU_CONF_CONFIG, + configuration : configuration_data(main_config) +) +add_project_arguments( + '-I' + meson.current_build_dir(), + language: ['c', 'cpp'] +) + +message( +''' +CPU Optimization Options + baseline: + Requested : @0@ + Enabled : @1@ + dispatch: + Requested : @2@ + Enabled : @3@ +'''.format( + CPU_CONF_BASELINE, ' '.join(CPU_BASELINE_NAMES), + CPU_CONF_DISPATCH, ' '.join(CPU_DISPATCH_NAMES) + ) +) diff --git a/meson_cpu/ppc64/meson.build b/meson_cpu/ppc64/meson.build new file mode 100644 index 000000000000..d14b23703fe3 --- /dev/null +++ b/meson_cpu/ppc64/meson.build @@ -0,0 +1,38 @@ +source_root = meson.project_source_root() +mod_features = import('features') +compiler_id = meson.get_compiler('c').get_id() + +VSX = mod_features.new( + 'VSX', 1, args: '-mvsx', + test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx.c')[0], + extra_tests: { + 'VSX_ASM': files(source_root + '/numpy/distutils/checks/extra_vsx_asm.c')[0] + } +) +if compiler_id == 'clang' + VSX.update(args: ['-mvsx', '-maltivec']) +endif +VSX2 = mod_features.new( + 'VSX2', 2, implies: VSX, args: {'val': '-mcpu=power8', 'match': '.*vsx'}, + detect: {'val': 'VSX2', 'match': 'VSX'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx2.c')[0], +) +# VSX2 is hardware baseline feature on ppc64le since the first little-endian +# support was part of Power8 +if host_machine.endian() == 'little' + VSX.update(implies: VSX2) +endif +VSX3 = mod_features.new( + 'VSX3', 3, implies: VSX2, args: {'val': '-mcpu=power9', 'match': '.*[mcpu=|vsx].*'}, + detect: {'val': 'VSX3', 'match': 'VSX.*'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx3.c')[0], +) +VSX4 = mod_features.new( + 'VSX4', 4, implies: VSX3, args: {'val': '-mcpu=power10', 'match': '.*[mcpu=|vsx].*'}, + detect: {'val': 'VSX4', 'match': 'VSX.*'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx3.c')[0], + extra_tests: { + 'VSX4_MMA': files(source_root + '/numpy/distutils/checks/extra_vsx4_mma.c')[0] + } +) +PPC64_FEATURES = {'VSX': VSX, 'VSX2': VSX2, 'VSX3': VSX3, 'VSX4': VSX4} diff --git a/meson_cpu/s390x/meson.build b/meson_cpu/s390x/meson.build new file mode 100644 index 000000000000..a69252d1607c --- /dev/null +++ b/meson_cpu/s390x/meson.build @@ -0,0 +1,18 @@ +source_root = meson.project_source_root() +mod_features = import('features') + +VX = mod_features.new( + 'VX', 1, args: ['-mzvector', '-march=arch11'], + test_code: files(source_root + '/numpy/distutils/checks/cpu_vx.c')[0], +) +VXE = mod_features.new( + 'VXE', 2, implies: VX, args: {'val': '-march=arch12', 'match': '-march=.*'}, + detect: {'val': 'VXE', 'match': 'VX'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_vxe.c')[0], +) +VXE2 = mod_features.new( + 'VXE2', 3, implies: VXE, args: {'val': '-march=arch13', 'match': '-march=.*'}, + detect: {'val': 'VXE2', 'match': 'VX.*'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_vxe2.c')[0], +) +S390X_FEATURES = {'VX': VX, 'VXE': VXE, 'VXE2': VXE2} diff --git a/meson_cpu/x86/meson.build b/meson_cpu/x86/meson.build new file mode 100644 index 000000000000..caf6bf09c14e --- /dev/null +++ b/meson_cpu/x86/meson.build @@ -0,0 +1,227 @@ +source_root = meson.project_source_root() +mod_features = import('features') + +SSE = mod_features.new( + 'SSE', 1, args: '-msse', + test_code: files(source_root + '/numpy/distutils/checks/cpu_sse.c')[0] +) +SSE2 = mod_features.new( + 'SSE2', 2, implies: SSE, + args: '-msse2', + test_code: files(source_root + '/numpy/distutils/checks/cpu_sse2.c')[0] +) +# enabling SSE without SSE2 is useless also it's non-optional for x86_64 +SSE.update(implies: SSE2) +SSE3 = mod_features.new( + 'SSE3', 3, implies: SSE2, + args: '-msse3', + test_code: files(source_root + '/numpy/distutils/checks/cpu_sse3.c')[0] +) +SSSE3 = mod_features.new( + 'SSSE3', 4, implies: SSE3, + args: '-mssse3', + test_code: files(source_root + '/numpy/distutils/checks/cpu_ssse3.c')[0] +) +SSE41 = mod_features.new( + 'SSE41', 5, implies: SSSE3, + args: '-msse4.1', + test_code: files(source_root + '/numpy/distutils/checks/cpu_sse41.c')[0] +) +POPCNT = mod_features.new( + 'POPCNT', 6, implies: SSE41, + args: '-mpopcnt', + test_code: files(source_root + '/numpy/distutils/checks/cpu_popcnt.c')[0] +) +SSE42 = mod_features.new( + 'SSE42', 7, implies: POPCNT, args: '-msse4.2', + test_code: files(source_root + '/numpy/distutils/checks/cpu_sse42.c')[0] +) +# 7-20 left as margin for any extra features +AVX = mod_features.new( + 'AVX', 20, implies: SSE42, args: '-mavx', + detect: {'val': 'AVX', 'match': '.*SSE.*'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx.c')[0] +) +XOP = mod_features.new( + 'XOP', 21, implies: AVX, args: '-mxop', + test_code: files(source_root + '/numpy/distutils/checks/cpu_xop.c')[0] +) +FMA4 = mod_features.new( + 'FMA4', 22, implies: AVX, args: '-mfma4', + test_code: files(source_root + '/numpy/distutils/checks/cpu_fma4.c')[0] +) +# x86 half-precision +F16C = mod_features.new( + 'F16C', 23, implies: AVX, args: '-mf16c', + test_code: files(source_root + '/numpy/distutils/checks/cpu_f16c.c')[0] +) +FMA3 = mod_features.new( + 'FMA3', 24, implies: F16C, args: '-mfma', + test_code: files(source_root + '/numpy/distutils/checks/cpu_fma3.c')[0] +) +AVX2 = mod_features.new( + 'AVX2', 25, implies: F16C, args: '-mavx2', + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx2.c')[0] +) +# 25-40 left as margin for any extra features +AVX512F = mod_features.new( + 'AVX512F', 40, implies: [FMA3, AVX2], + # Disables mmx because of stack corruption that may happen during mask + # conversions. + # TODO (seiko2plus): provide more clarification + args: ['-mno-mmx', '-mavx512f'], + detect: {'val': 'AVX512F', 'match': '.*'}, + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512f.c')[0], + extra_tests: { + 'AVX512F_REDUCE': files(source_root + '/numpy/distutils/checks/extra_avx512f_reduce.c')[0] + } +) +AVX512CD = mod_features.new( + 'AVX512CD', 41, implies: AVX512F, args: '-mavx512cd', + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512cd.c')[0] +) +AVX512_KNL = mod_features.new( + 'AVX512_KNL', 42, implies: AVX512CD, args: ['-mavx512er', '-mavx512pf'], + group: ['AVX512ER', 'AVX512PF'], + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_knl.c')[0] +) +AVX512_KNM = mod_features.new( + 'AVX512_KNM', 43, implies: AVX512_KNL, + args: ['-mavx5124fmaps', '-mavx5124vnniw', '-mavx512vpopcntdq'], + group: ['AVX5124FMAPS', 'AVX5124VNNIW', 'AVX512VPOPCNTDQ'], + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_knm.c')[0] +) +AVX512_SKX = mod_features.new( + 'AVX512_SKX', 50, implies: AVX512CD, + args: ['-mavx512vl', '-mavx512bw', '-mavx512dq'], + group: ['AVX512VL', 'AVX512BW', 'AVX512DQ'], + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_skx.c')[0], + extra_tests: { + 'AVX512BW_MASK': files(source_root + '/numpy/distutils/checks/extra_avx512bw_mask.c')[0], + 'AVX512DQ_MASK': files(source_root + '/numpy/distutils/checks/extra_avx512dq_mask.c')[0] + } +) +AVX512_CLX = mod_features.new( + 'AVX512_CLX', 51, implies: AVX512_SKX, args: '-mavx512vnni', + group: ['AVX512VNNI'], + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_clx.c')[0] +) +AVX512_CNL = mod_features.new( + 'AVX512_CNL', 52, implies: AVX512_SKX, + args: ['-mavx512ifma', '-mavx512vbmi'], + group: ['AVX512IFMA', 'AVX512VBMI'], + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_cnl.c')[0] +) +AVX512_ICL = mod_features.new( + 'AVX512_ICL', 53, implies: [AVX512_CLX, AVX512_CNL], + args: ['-mavx512vbmi2', '-mavx512bitalg', '-mavx512vpopcntdq'], + group: ['AVX512VBMI2', 'AVX512BITALG', 'AVX512VPOPCNTDQ'], + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_icl.c')[0] +) +# TODO add support for zen4 +AVX512_SPR = mod_features.new( + 'AVX512_SPR', 55, implies: AVX512_ICL, + args: ['-mavx512fp16'], + group: ['AVX512FP16'], + test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_spr.c')[0] +) + +# Specializations for non unix-like compilers +# ------------------------------------------- +cpu_family = host_machine.cpu_family() +compiler_id = meson.get_compiler('c').get_id() +if compiler_id not in ['gcc', 'clang'] + AVX512_SPR.update(disable: compiler_id + ' compiler does not support it') +endif + +# Common specializations between both Intel compilers (unix-like and msvc-like) +if compiler_id in ['intel', 'intel-cl'] + # POPCNT, and F16C don't own private FLAGS however the compiler still + # provides ISA capability for them. + POPCNT.update(args: '') + F16C.update(args: '') + # Intel compilers don't support the following features independently + FMA3.update(implies: [F16C, AVX2]) + AVX2.update(implies: [F16C, FMA3]) + AVX512F.update(implies: [AVX2, AVX512CD, AVX512_SKX]) + AVX512CD.update(implies: [AVX512F, AVX512_SKX]) + XOP.update(disable: 'Intel Compiler does not support it') + FMA4.update(disable: 'Intel Compiler does not support it') +endif + +if compiler_id == 'intel-cl' + foreach fet : [SSE, SSE2, SSE3, SSSE3, AVX] + fet.update(args: {'val': '/arch:' + fet.get('name'), 'match': '/arch:.*'}) + endforeach + SSE41.update(args: {'val': '/arch:SSE4.1', 'match': '/arch:.*'}) + SSE42.update(args: {'val': '/arch:SSE4.2', 'match': '/arch:.*'}) + FMA3.update(args: {'val': '/arch:CORE-AVX2', 'match': '/arch:.*'}) + AVX2.update(args: {'val': '/arch:CORE-AVX2', 'match': '/arch:.*'}) + AVX512F.update(args: {'val': '/Qx:COMMON-AVX512', 'match': '/arch:.*'}) + AVX512CD.update(args: {'val': '/Qx:COMMON-AVX512', 'match': '/arch:.*'}) + AVX512_KNL.update(args: {'val': '/Qx:KNL', 'match': '/[arch|Qx]:.*'}) + AVX512_KNM.update(args: {'val': '/Qx:KNM', 'match': '/[arch|Qx]:.*'}) + AVX512_SKX.update(args: {'val': '/Qx:SKYLAKE-AVX512', 'match': '/[arch|Qx]:.*'}) + AVX512_CLX.update(args: {'val': '/Qx:CASCADELAKE', 'match': '/[arch|Qx]:.*'}) + AVX512_CNL.update(args: {'val': '/Qx:CANNONLAKE', 'match': '/[arch|Qx]:.*'}) + AVX512_ICL.update(args: {'val': '/Qx:ICELAKE-CLIENT', 'match': '/[arch|Qx]:.*'}) +endif + +if compiler_id == 'intel' + clear_m = '^(-mcpu=|-march=)' + clear_any = '^(-mcpu=|-march=|-x[A-Z0-9\-])' + FMA3.update(args: {'val': '-march=core-avx2', 'match': clear_m}) + AVX2.update(args: {'val': '-march=core-avx2', 'match': clear_m}) + AVX512F.update(args: {'val': '-march=common-avx512', 'match': clear_m}) + AVX512CD.update(args: {'val': '-march=common-avx512', 'match': clear_m}) + AVX512_KNL.update(args: {'val': '-xKNL', 'match': clear_any}) + AVX512_KNM.update(args: {'val': '-xKNM', 'match': clear_any}) + AVX512_SKX.update(args: {'val': '-xSKYLAKE-AVX512', 'match': clear_any}) + AVX512_CLX.update(args: {'val': '-xCASCADELAKE', 'match': clear_any}) + AVX512_CNL.update(args: {'val': '-xCANNONLAKE', 'match': clear_any}) + AVX512_ICL.update(args: {'val': '-xICELAKE-CLIENT', 'match': clear_any}) +endif + +if compiler_id == 'msvc' + # MSVC compiler doesn't support the following features + foreach fet : [AVX512_KNL, AVX512_KNM] + fet.update(disable: compiler_id + ' compiler does not support it') + endforeach + # The following features don't own private FLAGS, however the compiler still + # provides ISA capability for them. + foreach fet : [ + SSE3, SSSE3, SSE41, POPCNT, SSE42, AVX, F16C, XOP, FMA4, + AVX512F, AVX512CD, AVX512_CLX, AVX512_CNL, + AVX512_ICL + ] + fet.update(args: '') + endforeach + # MSVC compiler doesn't support the following features independently + FMA3.update(implies: [F16C, AVX2]) + AVX2.update(implies: [F16C, FMA3]) + AVX512F.update(implies: [AVX2, AVX512CD, AVX512_SKX]) + AVX512CD.update(implies: [AVX512F, AVX512_SKX]) + clear_arch = '/arch:.*' + # only available on 32-bit. Its enabled by default on 64-bit mode + foreach fet : [SSE, SSE2] + if cpu_family == 'x86' + fet.update(args: {'val': '/arch:' + fet.get('name'), 'match': clear_arch}) + else + fet.update(args: '') + endif + endforeach + FMA3.update(args: {'val': '/arch:AVX2', 'match': clear_arch}) + AVX2.update(args: {'val': '/arch:AVX2', 'match': clear_arch}) + AVX512_SKX.update(args: {'val': '/arch:AVX512', 'match': clear_arch}) +endif + +X86_FEATURES = { + 'SSE': SSE, 'SSE2': SSE2, 'SSE3': SSE3, 'SSSE3': SSSE3, + 'SSE41': SSE41, 'POPCNT': POPCNT, 'SSE42': SSE42, 'AVX': AVX, + 'XOP': XOP, 'FMA4': FMA4, 'F16C': F16C, 'FMA3': FMA3, + 'AVX2': AVX2, 'AVX512F': AVX512F, 'AVX512CD': AVX512CD, + 'AVX512_KNL': AVX512_KNL, 'AVX512_KNM': AVX512_KNM, + 'AVX512_SKX': AVX512_SKX, 'AVX512_CLX': AVX512_CLX, + 'AVX512_CNL': AVX512_CNL, 'AVX512_ICL': AVX512_ICL, + 'AVX512_SPR': AVX512_SPR +} diff --git a/meson_options.txt b/meson_options.txt index 7ce4eefacd89..8b1fad6c4041 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -1,7 +1,7 @@ option('blas', type: 'string', value: 'openblas', - description: 'option for BLAS library switching') + description: 'Option for BLAS library switching') option('lapack', type: 'string', value: 'openblas', - description: 'option for LAPACK library switching') + description: 'Option for LAPACK library switching') option('allow-noblas', type: 'boolean', value: false, description: 'If set to true, allow building with (slow!) internal fallback routines') option('use-ilp64', type: 'boolean', value: false, @@ -12,8 +12,22 @@ option('disable-svml', type: 'boolean', value: false, description: 'Disable building against SVML') option('disable-threading', type: 'boolean', value: false, description: 'Disable threading support (see `NPY_ALLOW_THREADS` docs)') -# TODO: flip value to 'false' once we have `npy_cpu_dispatch_config.h` & co. -option('disable-simd-optimizations', type: 'boolean', value: true, - description: 'Disable SIMD features beyond the baseline ones') +option('disable-optimization', type: 'boolean', value: false, + description: 'Disable CPU optimized code (dispatch,simd,unroll...)') +option('cpu-baseline', type: 'string', value: 'min', + description: 'Minimal set of required CPU features') +option('cpu-dispatch', type: 'string', value: 'max -xop -fma4', + description: 'Dispatched set of additional CPU features') +option('test-simd', type: 'array', + value: [ + 'BASELINE', 'SSE2', 'SSE42', 'XOP', 'FMA4', + 'AVX2', 'FMA3', 'AVX2,FMA3', 'AVX512F', 'AVX512_SKX', + 'VSX', 'VSX2', 'VSX3', 'VSX4', + 'NEON', 'ASIMD', + 'VX', 'VXE', 'VXE2', + ], + description: 'Specify a list of CPU features to be tested against NumPy SIMD interface') +option('test-simd-args', type: 'string', value: '', + description: 'Extra args to be passed to the `_simd` module that is used for testing the NumPy SIMD interface') option('relaxed-strides-debug', type: 'boolean', value: false, description: 'Enable relaxed strides debug mode (see `NPY_RELAXED_STRIDES_DEBUG` docs)') diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 17760efa2fc0..ccc060aacb96 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -84,6 +84,7 @@ cdata.set('NPY_API_VERSION', C_API_VERSION) use_svml = ( host_machine.system() == 'linux' and host_machine.cpu_family() == 'x86_64' and + ('AVX512_SKX' in CPU_DISPATCH_NAMES or 'AVX512_SKX' in CPU_BASELINE_NAMES) and not get_option('disable-svml') ) if use_svml @@ -291,9 +292,6 @@ endforeach # SSE headers only enabled automatically on amd64/x32 builds optional_headers = [ - 'xmmintrin.h', # SSE - 'emmintrin.h', # SSE2 - 'immintrin.h', # AVX 'features.h', # for glibc version linux 'xlocale.h', # see GH#8367 'dlfcn.h', # dladdr @@ -322,6 +320,15 @@ optional_function_attributes = [ # endif #endforeach +# Max possible optimization flags. We pass this flags to all our dispatch-able +# (multi_targets) sources. +compiler_id = cc.get_id() +max_opt = { + 'msvc': ['/O2'], + 'intel-cl': ['/O3'], +}.get(compiler_id, ['-O3']) +max_opt = cc.has_multi_arguments(max_opt) ? max_opt : [] + # Optional GCC compiler builtins and their call arguments. # If given, a required header and definition name (HAVE_ prepended) # Call arguments are required as the compiler will do strict signature checking @@ -513,12 +520,6 @@ if cc.get_id() == 'msvc' staticlib_cflags += '-d2VolatileMetadata-' endif endif -# TODO: change to "feature" option in meson_options.txt? See -# https://mesonbuild.com/Build-options.html#build-options -if get_option('disable-simd-optimizations') - staticlib_cflags += '-DNPY_DISABLE_OPTIMIZATION' - staticlib_cppflags += '-DNPY_DISABLE_OPTIMIZATION' -endif npy_math_internal_h = custom_target( output: 'npy_math_internal.h', @@ -626,19 +627,10 @@ src_ufunc_api = custom_target('__ufunc_api', # Set common build flags for C and C++ code # ----------------------------------------- - -# TODO: change to "feature" option in meson_options.txt? See -# https://mesonbuild.com/Build-options.html#build-options -disable_simd_optimizations = [] -if get_option('disable-simd-optimizations') - disable_simd_optimizations = '-DNPY_DISABLE_OPTIMIZATION' -endif - # Common build flags c_args_common = [ '-DNPY_INTERNAL_BUILD', '-DHAVE_NPY_CONFIG_H', - disable_simd_optimizations, cflags_large_file_support, ] @@ -667,11 +659,9 @@ np_core_dep = declare_dependency( '.', 'include', 'src/common', - ], - compile_args: disable_simd_optimizations + ] ) - # Build multiarray_tests module # ----------------------------- py.extension_module('_multiarray_tests', @@ -691,15 +681,30 @@ py.extension_module('_multiarray_tests', subdir: 'numpy/core', ) +_umath_tests_mtargets = mod_features.multi_targets( + '_umath_tests.dispatch.h', + 'src/umath/_umath_tests.dispatch.c', + dispatch: [ + AVX2, SSE41, SSE2, + ASIMDHP, ASIMD, NEON, + VSX3, VSX2, VSX, + VXE, VX, + ], + baseline: CPU_BASELINE, + prefix: 'NPY_', + dependencies: [py_dep, np_core_dep] +) + test_modules_src = [ ['_umath_tests', [ src_file.process('src/umath/_umath_tests.c.src'), - 'src/umath/_umath_tests.dispatch.c', 'src/common/npy_cpu_features.c', - ]], - ['_rational_tests', 'src/umath/_rational_tests.c'], - ['_struct_ufunc_tests', 'src/umath/_struct_ufunc_tests.c'], - ['_operand_flag_tests', 'src/umath/_operand_flag_tests.c'], + ], + _umath_tests_mtargets.static_lib('_umath_tests_mtargets') + ], + ['_rational_tests', 'src/umath/_rational_tests.c', []], + ['_struct_ufunc_tests', 'src/umath/_struct_ufunc_tests.c', []], + ['_operand_flag_tests', 'src/umath/_operand_flag_tests.c', []], ] foreach gen: test_modules_src py.extension_module(gen[0], @@ -709,7 +714,261 @@ foreach gen: test_modules_src dependencies: np_core_dep, install: true, subdir: 'numpy/core', + link_with: gen[2], + ) +endforeach + +# Build multiarray dispatch-able sources +# -------------------------------------- +multiarray_gen_headers = [ + src_file.process('src/multiarray/arraytypes.h.src'), + src_file.process('src/common/npy_sort.h.src'), +] +foreach gen_mtargets : [ + [ + 'argfunc.dispatch.h', + src_file.process('src/multiarray/argfunc.dispatch.c.src'), + [ + AVX512_SKX, AVX2, XOP, SSE42, SSE2, + VSX2, + ASIMD, NEON, + VXE, VX + ] + ], +] + mtargets = mod_features.multi_targets( + gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1], + dispatch: gen_mtargets[2], + baseline: CPU_BASELINE, + prefix: 'NPY_', + dependencies: [py_dep, np_core_dep], + c_args: c_args_common + max_opt, + cpp_args: cpp_args_common + max_opt, + include_directories: [ + 'include', + 'src/common', + 'src/multiarray', + 'src/npymath', + 'src/umath' + ] + ) + if not is_variable('multiarray_umath_mtargets') + multiarray_umath_mtargets = mtargets + else + multiarray_umath_mtargets.extend(mtargets) + endif +endforeach + +# Build npysort dispatch-able sources +# ----------------------------------- +foreach gen_mtargets : [ + [ + 'simd_qsort.dispatch.h', + 'src/npysort/simd_qsort.dispatch.cpp', + [AVX512_SKX] + ], + [ + 'simd_qsort_16bit.dispatch.h', + 'src/npysort/simd_qsort_16bit.dispatch.cpp', + [AVX512_SPR, AVX512_ICL] + ], +] + mtargets = mod_features.multi_targets( + gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1], + dispatch: gen_mtargets[2], + # baseline: CPU_BASELINE, it doesn't provide baseline fallback + prefix: 'NPY_', + dependencies: [py_dep, np_core_dep], + c_args: c_args_common + max_opt, + cpp_args: cpp_args_common + max_opt, + include_directories: [ + 'include', + 'src/common', + 'src/multiarray', + 'src/npymath', + 'src/umath' + ] ) + if not is_variable('multiarray_umath_mtargets') + multiarray_umath_mtargets = mtargets + else + multiarray_umath_mtargets.extend(mtargets) + endif +endforeach + +# Build umath dispatch-able sources +# --------------------------------- +mod_features = import('features') +umath_gen_headers = [ + src_file.process('src/umath/loops.h.src'), + src_file.process('src/umath/loops_utils.h.src'), +] + +foreach gen_mtargets : [ + [ + 'loops_arithm_fp.dispatch.h', + src_file.process('src/umath/loops_arithm_fp.dispatch.c.src'), + [ + [AVX2, FMA3], SSE2, + ASIMD, NEON, + VSX3, VSX2, + VXE, VX, + ] + ], + [ + 'loops_arithmetic.dispatch.h', + src_file.process('src/umath/loops_arithmetic.dispatch.c.src'), + [ + AVX512_SKX, AVX512F, AVX2, SSE41, SSE2, + NEON, + VSX4, VSX2, + VX, + ] + ], + [ + 'loops_comparison.dispatch.h', + src_file.process('src/umath/loops_comparison.dispatch.c.src'), + [ + AVX512_SKX, AVX512F, AVX2, SSE42, SSE2, + VSX3, VSX2, + NEON, + VXE, VX, + ] + ], + [ + 'loops_exponent_log.dispatch.h', + src_file.process('src/umath/loops_exponent_log.dispatch.c.src'), + # Enabling SIMD on clang-cl raises spurious FP exceptions + # TODO (seiko2plus): debug spurious FP exceptions for single-precision log/exp + compiler_id == 'clang-cl' ? [] : [ + AVX512_SKX, AVX512F, [AVX2, FMA3] + ] + ], + [ + 'loops_hyperbolic.dispatch.h', + src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'), + [ + AVX512_SKX, [AVX2, FMA3], + VSX4, VSX2, + NEON_VFPV4, + VXE, VX + ] + ], + [ + 'loops_logical.dispatch.h', + src_file.process('src/umath/loops_logical.dispatch.c.src'), + [ + ASIMD, NEON, + AVX512_SKX, AVX2, SSE2, + VSX2, + VX, + ] + ], + [ + 'loops_minmax.dispatch.h', + src_file.process('src/umath/loops_minmax.dispatch.c.src'), + [ + ASIMD, NEON, + AVX512_SKX, AVX2, SSE2, + VSX2, + VXE, VX, + ] + ], + [ + 'loops_modulo.dispatch.h', + src_file.process('src/umath/loops_modulo.dispatch.c.src'), + [ + VSX4 + ] + ], + [ + 'loops_trigonometric.dispatch.h', + src_file.process('src/umath/loops_trigonometric.dispatch.c.src'), + # Enabling SIMD on clang-cl raises spurious FP exceptions + # TODO (seiko2plus): debug spurious FP exceptions for single-precision sin/cos + compiler_id == 'clang-cl' ? [] : [ + AVX512F, [AVX2, FMA3], + VSX4, VSX3, VSX2, + NEON_VFPV4, + VXE2, VXE + ] + ], + [ + 'loops_umath_fp.dispatch.h', + src_file.process('src/umath/loops_umath_fp.dispatch.c.src'), + [AVX512_SKX] + ], + [ + 'loops_unary.dispatch.h', + src_file.process('src/umath/loops_unary.dispatch.c.src'), + [ + ASIMD, NEON, + AVX512_SKX, AVX2, SSE2, + VSX2, + VXE, VX + ] + ], + [ + 'loops_unary_fp.dispatch.h', + src_file.process('src/umath/loops_unary_fp.dispatch.c.src'), + [ + SSE41, SSE2, + VSX2, + ASIMD, NEON, + VXE, VX + ] + ], + [ + 'loops_unary_fp_le.dispatch.h', + src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'), + [ + SSE41, SSE2, + VSX2, + ASIMD, NEON, + ] + ], + [ + 'loops_unary_complex.dispatch.h', + src_file.process('src/umath/loops_unary_complex.dispatch.c.src'), + [ + AVX512F, [AVX2, FMA3], SSE2, + ASIMD, NEON, + VSX3, VSX2, + VXE, VX, + ] + ], + [ + 'loops_autovec.dispatch.h', + src_file.process('src/umath/loops_autovec.dispatch.c.src'), + [ + AVX2, SSE2, + NEON, + VSX2, + VX, + ] + ], +] + mtargets = mod_features.multi_targets( + gen_mtargets[0], umath_gen_headers + gen_mtargets[1], + dispatch: gen_mtargets[2], + baseline: CPU_BASELINE, + prefix: 'NPY_', + dependencies: [py_dep, np_core_dep], + c_args: c_args_common + max_opt, + cpp_args: cpp_args_common + max_opt, + include_directories: [ + 'include', + 'src/common', + 'src/multiarray', + 'src/npymath', + 'src/umath' + ] + ) + if not is_variable('multiarray_umath_mtargets') + multiarray_umath_mtargets = mtargets + else + multiarray_umath_mtargets.extend(mtargets) + endif endforeach # Build _multiarray_umath module @@ -733,12 +992,10 @@ if have_blas ] endif -src_multiarray = [ +src_multiarray = multiarray_gen_headers + [ 'src/multiarray/abstractdtypes.c', 'src/multiarray/alloc.c', - src_file.process('src/multiarray/argfunc.dispatch.c.src'), 'src/multiarray/arrayobject.c', - src_file.process('src/multiarray/arraytypes.h.src'), 'src/multiarray/array_coercion.c', 'src/multiarray/array_method.c', 'src/multiarray/array_assign_scalar.c', @@ -792,9 +1049,6 @@ src_multiarray = [ 'src/multiarray/typeinfo.c', 'src/multiarray/usertypes.c', 'src/multiarray/vdot.c', - src_file.process('src/common/npy_sort.h.src'), - 'src/npysort/simd_qsort.dispatch.cpp', - 'src/npysort/simd_qsort_16bit.dispatch.cpp', 'src/npysort/quicksort.cpp', 'src/npysort/mergesort.cpp', 'src/npysort/timsort.cpp', @@ -817,26 +1071,9 @@ src_multiarray = [ 'src/npymath/arm64_exports.c', ] -src_umath = [ +src_umath = umath_gen_headers + [ src_file.process('src/umath/funcs.inc.src'), - src_file.process('src/umath/loops.h.src'), - src_file.process('src/umath/loops_utils.h.src'), src_file.process('src/umath/loops.c.src'), - src_file.process('src/umath/loops_arithm_fp.dispatch.c.src'), - src_file.process('src/umath/loops_arithmetic.dispatch.c.src'), - src_file.process('src/umath/loops_comparison.dispatch.c.src'), - src_file.process('src/umath/loops_exponent_log.dispatch.c.src'), - src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'), - src_file.process('src/umath/loops_logical.dispatch.c.src'), - src_file.process('src/umath/loops_minmax.dispatch.c.src'), - src_file.process('src/umath/loops_modulo.dispatch.c.src'), - src_file.process('src/umath/loops_trigonometric.dispatch.c.src'), - src_file.process('src/umath/loops_umath_fp.dispatch.c.src'), - src_file.process('src/umath/loops_unary.dispatch.c.src'), - src_file.process('src/umath/loops_unary_fp.dispatch.c.src'), - src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'), - src_file.process('src/umath/loops_unary_complex.dispatch.c.src'), - src_file.process('src/umath/loops_autovec.dispatch.c.src'), src_file.process('src/umath/matmul.c.src'), src_file.process('src/umath/matmul.h.src'), 'src/umath/ufunc_type_resolution.c', @@ -863,52 +1100,24 @@ src_umath = [ # may be able to avoid the accuracy regressions in SVML. svml_objects = [] if use_svml - svml_objects += [ - 'src/umath/svml/linux/avx512/svml_z0_acos_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_acos_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_acosh_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_acosh_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_asin_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_asin_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_asinh_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_asinh_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_atan2_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_atan2_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_atan_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_atan_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_atanh_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_atanh_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_cbrt_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_cbrt_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_cos_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_cos_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_cosh_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_cosh_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_exp2_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_exp2_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_exp_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_exp_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_expm1_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_expm1_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_log10_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_log10_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_log1p_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_log1p_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_log2_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_log2_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_log_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_log_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_pow_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_pow_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_sin_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_sin_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_sinh_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_sinh_s_la.s', - 'src/umath/svml/linux/avx512/svml_z0_tan_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_tan_s_la.s', - # 'src/umath/svml/linux/avx512/svml_z0_tanh_d_la.s', - 'src/umath/svml/linux/avx512/svml_z0_tanh_s_la.s', + foreach svml_func : [ + 'acos', 'acosh', 'asin', + 'asinh', 'atan2', + 'atan', 'atanh', + 'cbrt', 'cos', + 'cosh', 'exp2', + 'exp', 'expm1', + 'log10', 'log1p', + 'log2', 'log', + 'pow', 'sin', 'sinh', 'tan', + 'tanh' ] + foreach svml_sfx : ['d_la', 's_la', 'd_ha', 's_la'] + svml_objects += [ + 'src/umath/svml/linux/avx512/svml_z0_'+svml_func+'_'+svml_sfx+'.s' + ] + endforeach + endforeach endif py.extension_module('_multiarray_umath', @@ -934,26 +1143,60 @@ py.extension_module('_multiarray_umath', 'src/umath', ], dependencies: blas_dep, - link_with: npymath_lib, + link_with: [npymath_lib, multiarray_umath_mtargets.static_lib('_multiarray_umath_mtargets')], install: true, subdir: 'numpy/core', ) # Build SIMD module # ----------------- +_simd_dispatch = [] +_simd_baseline = [] +foreach target : get_option('test-simd') + target = target.strip().to_upper().split(',') + mfeatures = [] + foreach fet_name : target + if fet_name == 'BASELINE' + _simd_baseline = CPU_BASELINE + break + endif + if fet_name not in CPU_FEATURES + error('Expected a valid feature name, got('+fet_name+')') + endif + mfeatures += CPU_FEATURES[fet_name] + endforeach + _simd_dispatch += [mfeatures] +endforeach -py.extension_module('_simd', +_simd_mtargets = mod_features.multi_targets( + '_simd.dispatch.h', [ - 'src/common/npy_cpu_features.c', - 'src/_simd/_simd.c', src_file.process('src/_simd/_simd_inc.h.src'), src_file.process('src/_simd/_simd_data.inc.src'), src_file.process('src/_simd/_simd.dispatch.c.src'), ], + # Skip validating the order of `_simd_dispatch` because we execute all these + # features, not just the highest interest one. The sorting doesn't matter + # here, given the nature of this testing unit. + keep_sort: true, + dispatch: _simd_dispatch, + baseline: _simd_baseline, + prefix: 'NPY_', + dependencies: [py_dep, np_core_dep], + include_directories: ['src/_simd', 'src/npymath'], + c_args: c_args_common, + cpp_args: cpp_args_common, +) + +py.extension_module('_simd', + [ + 'src/common/npy_cpu_features.c', + 'src/_simd/_simd.c', + ], c_args: c_args_common, include_directories: ['src/_simd', 'src/npymath'], dependencies: np_core_dep, - link_with: npymath_lib, + link_with: [npymath_lib, _simd_mtargets.static_lib('_simd_mtargets')], install: true, subdir: 'numpy/core', ) diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c index 52b66e7652a8..5a113fe57876 100644 --- a/numpy/core/src/_simd/_simd.c +++ b/numpy/core/src/_simd/_simd.c @@ -85,9 +85,13 @@ PyMODINIT_FUNC PyInit__simd(void) goto err; \ } \ } - - NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY) - NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY) + #ifdef NPY__CPU_MESON_BUILD + NPY_MTARGETS_CONF_DISPATCH(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY) + NPY_MTARGETS_CONF_BASELINE(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY) + #else + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY) + NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY) + #endif return m; err: Py_DECREF(m); diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index f532c9e022f7..51f5ddd54b22 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -919,7 +919,9 @@ NPY_CPU_DISPATCH_CURFX(simd_create_module)(void) { static struct PyModuleDef defs = { .m_base = PyModuleDef_HEAD_INIT, - #ifdef NPY__CPU_TARGET_CURRENT + #if defined(NPY_MTARGETS_CURRENT) // meson build + .m_name = "numpy.core._simd." NPY_TOSTRING(NPY_MTARGETS_CURRENT), + #elif defined(NPY__CPU_TARGET_CURRENT) .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT), #else .m_name = "numpy.core._simd.baseline", diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h index 4d5addec809e..699f8536f6a2 100644 --- a/numpy/core/src/common/npy_cpu_dispatch.h +++ b/numpy/core/src/common/npy_cpu_dispatch.h @@ -43,6 +43,7 @@ #endif #endif #endif // !NPY_DISABLE_OPTIMIZATION +#ifndef NPY__CPU_MESON_BUILD /** * Macro NPY_CPU_DISPATCH_CURFX(NAME) * @@ -261,5 +262,5 @@ ((TESTED_FEATURES) ? (NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : (void) 0), #define NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \ ( LEFT __VA_ARGS__ ) - +#endif // NPY__CPU_MESON_BUILD #endif // NUMPY_CORE_SRC_COMMON_NPY_CPU_DISPATCH_H_ diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 72a87eac1715..357b136d25cd 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -321,7 +321,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) { npyv_f32 m = npyv_mul_f32(a, b); - #if NPY_HAVE_SSE3 + #ifdef NPY_HAVE_SSE3 return _mm_addsub_ps(m, c); #else const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); @@ -331,7 +331,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { npyv_f64 m = npyv_mul_f64(a, b); - #if NPY_HAVE_SSE3 + #ifdef NPY_HAVE_SSE3 return _mm_addsub_pd(m, c); #else const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);