From ee6c23319a204c0ff9ae42b6c6212ca69cf06f1e Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Fri, 11 Aug 2023 22:27:02 +0400
Subject: [PATCH] BLD, SIMD: The meson CPU dispatcher implementation  (#23096)

Almost gives the same functionality as Distutils/CCompiler Opt,
with a few changes to the way we specify the targets. Also, it
abandons the idea of wrapping the dispatchable sources, instead it
counts on static libraries to enable different paths and flags.
---
 .github/meson_actions/action.yml            |  29 ++
 .github/workflows/build_test.yml            |  20 +-
 MANIFEST.in                                 |   2 +
 build_requirements.txt                      |   4 +-
 doc/source/user/quickstart.rst              |   2 +-
 meson.build                                 |   3 +-
 meson_cpu/arm/meson.build                   |  58 +++
 meson_cpu/main_config.h.in                  | 351 ++++++++++++++++
 meson_cpu/meson.build                       | 307 ++++++++++++++
 meson_cpu/ppc64/meson.build                 |  38 ++
 meson_cpu/s390x/meson.build                 |  18 +
 meson_cpu/x86/meson.build                   | 227 ++++++++++
 meson_options.txt                           |  24 +-
 numpy/core/meson.build                      | 443 +++++++++++++++-----
 numpy/core/src/_simd/_simd.c                |  10 +-
 numpy/core/src/_simd/_simd.dispatch.c.src   |   4 +-
 numpy/core/src/common/npy_cpu_dispatch.h    |   3 +-
 numpy/core/src/common/simd/sse/arithmetic.h |   4 +-
 18 files changed, 1421 insertions(+), 126 deletions(-)
 create mode 100644 .github/meson_actions/action.yml
 create mode 100644 meson_cpu/arm/meson.build
 create mode 100644 meson_cpu/main_config.h.in
 create mode 100644 meson_cpu/meson.build
 create mode 100644 meson_cpu/ppc64/meson.build
 create mode 100644 meson_cpu/s390x/meson.build
 create mode 100644 meson_cpu/x86/meson.build

diff --git a/.github/meson_actions/action.yml b/.github/meson_actions/action.yml
new file mode 100644
index 000000000000..aff70da169bc
--- /dev/null
+++ b/.github/meson_actions/action.yml
@@ -0,0 +1,29 @@
+name: MesonBuildTest
+description: "checkout repo, build, and test numpy"
+runs:
+  using: composite
+  steps:
+  - name: Install dependencies
+    shell: bash
+    run: pip install -r build_requirements.txt
+  - name: Build
+    shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"'
+    env:
+      TERM: xterm-256color
+    run:
+      spin build -- ${MESON_ARGS[@]}
+  - name: Check build-internal dependencies
+    shell: bash
+    run:
+      ninja -C build -t missingdeps
+  - name: Check installed test and stub files
+    shell: bash
+    run:
+      python tools/check_installed_files.py $(find ./build-install -path '*/site-packages/numpy')
+  - name: Test
+    shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"'
+    env:
+      TERM: xterm-256color
+    run: |
+      pip install pytest pytest-xdist hypothesis typing_extensions
+      spin test -j auto
diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml
index 928018b13905..b0a24d7730a1 100644
--- a/.github/workflows/build_test.yml
+++ b/.github/workflows/build_test.yml
@@ -49,7 +49,7 @@ jobs:
     if: "github.repository == 'numpy/numpy'"
     runs-on: ubuntu-latest
     env:
-      WITHOUT_SIMD: 1
+      MESON_ARGS: "-Dallow-noblas=true -Dcpu-baseline=none -Dcpu-dispatch=none"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -58,7 +58,7 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   basic:
     needs: [smoke_test]
@@ -122,7 +122,7 @@ jobs:
     runs-on: ubuntu-latest
     if: github.event_name != 'push'
     env:
-      WITHOUT_OPTIMIZATIONS: 1
+      MESON_ARGS: "-Dallow-noblas=true -Ddisable-optimization=true"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -131,14 +131,14 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   with_baseline_only:
     needs: [smoke_test]
     runs-on: ubuntu-latest
     if: github.event_name != 'push'
     env:
-      CPU_DISPATCH: "none"
+      MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=none"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -147,14 +147,14 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   without_avx512:
     needs: [smoke_test]
     runs-on: ubuntu-latest
     if: github.event_name != 'push'
     env:
-      CPU_DISPATCH: "max -xop -fma4 -avx512f -avx512cd -avx512_knl -avx512_knm -avx512_skx -avx512_clx -avx512_cnl -avx512_icl"
+      MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=SSSE3,SSE41,POPCNT,SSE42,AVX,F16C,AVX2,FMA3"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -163,14 +163,14 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   without_avx512_avx2_fma3:
     needs: [smoke_test]
     runs-on: ubuntu-latest
     if: github.event_name != 'push'
     env:
-      CPU_DISPATCH: "SSSE3 SSE41 POPCNT SSE42 AVX F16C"
+      MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=SSSE3,SSE41,POPCNT,SSE42,AVX,F16C"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -179,7 +179,7 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   debug:
     needs: [smoke_test]
diff --git a/MANIFEST.in b/MANIFEST.in
index ab6ecd518e1b..4803b39131e1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -21,6 +21,8 @@ recursive-include numpy/random *.pyx *.pxd *.pyx.in *.pxd.in
 include numpy/py.typed
 include numpy/random/include/*
 include numpy/*.pxd
+# Meson CPU Dispatcher
+recursive-include meson_cpu *.build *.in
 # Add build support that should go in sdist, but not go in bdist/be installed
 # Note that sub-directories that don't have __init__ are apparently not
 # included by 'recursive-include', so list those separately
diff --git a/build_requirements.txt b/build_requirements.txt
index 3627f1b91685..e7e776a7de89 100644
--- a/build_requirements.txt
+++ b/build_requirements.txt
@@ -1,5 +1,5 @@
-meson-python>=0.10.0
-Cython
+meson-python>=0.13.1
+Cython>=3.0
 wheel==0.38.1
 ninja
 spin==0.4
diff --git a/doc/source/user/quickstart.rst b/doc/source/user/quickstart.rst
index 783d5a447df9..bc6c3b3818d2 100644
--- a/doc/source/user/quickstart.rst
+++ b/doc/source/user/quickstart.rst
@@ -517,7 +517,7 @@ and other Python sequences.
     >>> for i in a:
     ...     print(i**(1 / 3.))
     ...
-    9.999999999999998
+    9.999999999999998  # may vary
     1.0
     9.999999999999998
     3.0
diff --git a/meson.build b/meson.build
index 8bfe987715d1..33d0e7b462ef 100644
--- a/meson.build
+++ b/meson.build
@@ -6,7 +6,7 @@ project(
   # See `numpy/__init__.py`
   version: '1.26.0.dev0',
   license: 'BSD-3',
-  meson_version: '>= 1.1.0',
+  meson_version: '>=1.2.99',  # version in vendored-meson is 1.2.99
   default_options: [
     'buildtype=debugoptimized',
     'b_ndebug=if-release',
@@ -80,4 +80,5 @@ else
     meson.add_dist_script(py, versioneer, '-o', '_version_meson.py')
 endif
 
+subdir('meson_cpu')
 subdir('numpy')
diff --git a/meson_cpu/arm/meson.build b/meson_cpu/arm/meson.build
new file mode 100644
index 000000000000..f968b2e99682
--- /dev/null
+++ b/meson_cpu/arm/meson.build
@@ -0,0 +1,58 @@
+source_root = meson.project_source_root()
+mod_features = import('features')
+NEON = mod_features.new(
+  'NEON', 1,
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_neon.c')[0]
+)
+NEON_FP16 = mod_features.new(
+  'NEON_FP16', 2, implies: NEON,
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_neon_fp16.c')[0]
+)
+# FMA
+NEON_VFPV4 = mod_features.new(
+  'NEON_VFPV4', 3, implies: NEON_FP16,
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_neon_vfpv4.c')[0]
+)
+# Advanced SIMD
+ASIMD = mod_features.new(
+  'ASIMD', 4, implies: NEON_VFPV4, detect: {'val': 'ASIMD', 'match': 'NEON.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_asimd.c')[0]
+)
+cpu_family = host_machine.cpu_family()
+if cpu_family == 'aarch64'
+  # hardware baseline
+  NEON.update(implies: [NEON_FP16, NEON_VFPV4, ASIMD])
+  NEON_FP16.update(implies: [NEON, NEON_VFPV4, ASIMD])
+  NEON_VFPV4.update(implies: [NEON, NEON_FP16, ASIMD])
+elif cpu_family == 'arm'
+  NEON.update(args: '-mfpu=neon')
+  NEON_FP16.update(args: ['-mfp16-format=ieee', {'val': '-mfpu=neon-fp16', 'match': '-mfpu=.*'}])
+  NEON_VFPV4.update(args: [{'val': '-mfpu=neon-vfpv4', 'match': '-mfpu=.*'}])
+  ASIMD.update(args: [
+    {'val': '-mfpu=neon-fp-armv8', 'match': '-mfpu=.*'},
+    '-march=armv8-a+simd'
+  ])
+endif
+# ARMv8.2 half-precision & vector arithm
+ASIMDHP = mod_features.new(
+  'ASIMDHP', 5, implies: ASIMD,
+  args: {'val': '-march=armv8.2-a+fp16', 'match': '-march=.*', 'mfilter': '\+.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_asimdhp.c')[0]
+)
+## ARMv8.2 dot product
+ASIMDDP = mod_features.new(
+  'ASIMDDP', 6, implies: ASIMD,
+  args: {'val': '-march=armv8.2-a+dotprod', 'match': '-march=.*', 'mfilter': '\+.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_asimddp.c')[0]
+)
+## ARMv8.2 Single & half-precision Multiply
+ASIMDFHM = mod_features.new(
+  'ASIMDFHM', 7, implies: ASIMDHP,
+  args: {'val': '-march=armv8.2-a+fp16fml', 'match': '-march=.*', 'mfilter': '\+.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_asimdfhm.c')[0]
+)
+# TODO: Add support for MSVC
+ARM_FEATURES = {
+  'NEON': NEON, 'NEON_FP16': NEON_FP16, 'NEON_VFPV4': NEON_VFPV4,
+  'ASIMD': ASIMD, 'ASIMDHP': ASIMDHP, 'ASIMDFHM': ASIMDFHM
+}
diff --git a/meson_cpu/main_config.h.in b/meson_cpu/main_config.h.in
new file mode 100644
index 000000000000..c7c13b2c7eb1
--- /dev/null
+++ b/meson_cpu/main_config.h.in
@@ -0,0 +1,351 @@
+/*
+ * Main configuration header of the CPU dispatcher.
+ *
+ * This header is autogenerated by the Meson build script located at `meson_cpu/meson.build`.
+ * It provides a set of utilities that are required for the runtime dispatching process.
+ *
+ * The most important macros in this header are:
+ *   - @ref @P@CPU_DISPATCH_DECLARE: Used to declare the dispatched functions and variables.
+ *   - @ref @P@CPU_DISPATCH_CURFX: Used to define the dispatched functions with target-specific suffixes.
+ *   - @ref @P@CPU_DISPATCH_CALL: Used for runtime dispatching of the exported functions and variables.
+ */
+#ifndef @P@_CPU_DISPATCHER_CONF_H_
+#define @P@_CPU_DISPATCHER_CONF_H_
+/// This definition is required to provides comptablity with NumPy distutils
+#define @P@_CPU_MESON_BUILD
+/**
+ * @def @P@WITH_CPU_BASELINE
+ * Enabled baseline features names as a single string where each is separated by a single space.
+ * For example: "SSE SSE2 SSE3"
+ * Required for logging purposes only.
+ */
+#define @P@WITH_CPU_BASELINE "@WITH_CPU_BASELINE@"
+/**
+ * @def @P@WITH_CPU_BASELINE_N
+ * Number of enabled baseline features.
+ */
+#define @P@WITH_CPU_BASELINE_N @WITH_CPU_BASELINE_N@
+/**
+ * @def @P@WITH_CPU_DISPATCH
+ * Dispatched features names as a single string where each is separated by a single space.
+ */
+#define @P@WITH_CPU_DISPATCH "@WITH_CPU_DISPATCH@"
+/**
+ * @def @P@WITH_CPU_DISPATCH_N
+ * Number of enabled dispatched features.
+ */
+#define @P@WITH_CPU_DISPATCH_N @WITH_CPU_DISPATCH_N@
+// Expand a macro, used by the following macros
+#define @P@_CPU_EXPAND(X) X
+#define @P@_CPU_CAT__(a, b) a ## b
+#define @P@_CPU_CAT_(a, b) @P@_CPU_CAT__(a, b)
+#define @P@_CPU_CAT(a, b) @P@_CPU_CAT_(a, b)
+
+/**
+ * @def @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...)
+ * Call each enabled baseline feature sorted by lowest interest
+ * using preprocessor callback without testing whiher the
+ * feature is supported by CPU or not.
+ *
+ * Required for logging purposes only, for example, generating
+ * a Python list to hold the information of the enabled features.
+ *
+ * Unwrapped Version:
+ * @code
+ * #define @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...) \
+ *     @P@_CPU_EXPAND(EXEC_CB(SSE, __VA_ARGS__))   \
+ *     @P@_CPU_EXPAND(EXEC_CB(SSE2, __VA_ARGS__))  \
+ *     @P@_CPU_EXPAND(EXEC_CB(SSE3, __VA_ARGS__))
+ * @endcode
+ *
+ * @param EXEC_CB The preprocessor callback to be called for each enabled baseline feature.
+ * @param ... Additional arguments to be passed to the preprocessor callback.
+ */
+#define @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...) \
+@WITH_CPU_BASELINE_CALL@
+
+/**
+ * @def @P@WITH_CPU_DISPATCH_CALL(EXEC_CB, ...)
+ * Similar to the above but for enabled dispatched features.
+ *
+ * @param EXEC_CB The preprocessor callback to be called for each enabled dispatched feature.
+ * @param ... Additional arguments to be passed to the preprocessor callback.
+ */
+#define @P@WITH_CPU_DISPATCH_CALL(EXEC_CB, ...) \
+@WITH_CPU_DISPATCH_CALL@
+
+/*
+ * Defines the default behavior for the configurable macros derived from the configuration header
+ * that is generated by the meson function `mod_features.multi_targets()`.
+ *
+ * Note: Providing fallback in case of optimization disabled is no longer needed for meson
+ * since we always guarantee having configuration headers.
+ *
+ * However, it is still needed for compatibility with Numpy distutils.
+ */
+#ifndef @P@DISABLE_OPTIMIZATION
+    #define @P@MTARGETS_CONF_BASELINE(CB, ...) \
+         &&"Expected config header that generated by mod_features.multi_targets()";
+    #define @P@MTARGETS_CONF_DISPATCH(TEST_FEATURE_CB, CB, ...) \
+         &&"Expected config header that generated by mod_features.multi_targets()";
+#else
+    #define @P@MTARGETS_CONF_BASELINE(CB, ...) @P@_CPU_EXPAND(CB(__VA_ARGS__))
+    #define @P@MTARGETS_CONF_DISPATCH(CHK, CB, ...)
+#endif
+/**
+ * @def @P@CPU_DISPATCH_CURFX(NAME)
+ *
+ * Returns `NAME` suffixed with "_" + "the current target" during compiling
+ * the generated static libraries that are derived from the Meson function
+ * `mod_features.multi_targets()`.
+ *
+ * It also returns `NAME` as-is without any suffix when it comes to the baseline features or
+ * in case if the optimization is disabled.
+ *
+ * Note: `mod_features.multi_targets()` provides a unique target name within the compiler #definition
+ * called `@P@MTARGETS_CURRENT` on each generated library based on the specified features
+ * within its parameter 'dispatch:'.
+ *
+ * For example:
+ *
+ * @code
+ * # from meson
+ * mod_features.multi_targets(
+ *  'arithmetic.dispatch.h', 'arithmetic.c',
+ *   baseline: [SSE3], dispatch: [AVX512_SKX, AVX2],
+ *   prefix: '@P@'
+ * )
+ * @code
+ *
+ * @code
+ * void @P@CPU_DISPATCH_CURFX(add)(const int *src0, const int *src1, int *dst)
+ * {
+ * #ifdef @P@HAVE_AVX512F // one of the implied feature of AVX512_SKX
+ *   // code
+ * #elif defined(@P@HAVE_AVX2)
+ *   // code
+ * #elif defined(@P@HAVE_SSE3)
+ *   // CODE
+ * #else
+ *   // Fallback code in case of features enabled
+ * #endif
+ * }
+ * @endif
+ *
+ * // Unwrapped version :
+ * void add_AVX512_SKX(const int *src0, const int *src1, int *dst)
+ * {...}
+ * void add_AVX2(const int *src0, const int *src1, int *dst)
+ * {...}
+ * // baseline
+ * void add(const int *src0, const int *src1, int *dst)
+ * {...}
+ * @endcode
+ *
+ * @param NAME The base name of the dispatched function or variable.
+ */
+#ifdef @P@MTARGETS_CURRENT
+    // '@P@MTARGETS_CURRENT': only defined by the dispatchable sources
+    #define @P@CPU_DISPATCH_CURFX(NAME) @P@_CPU_CAT(@P@_CPU_CAT(NAME, _), @P@MTARGETS_CURRENT)
+#else
+    #define @P@CPU_DISPATCH_CURFX(NAME) @P@_CPU_EXPAND(NAME)
+#endif
+
+/**
+ * @def @P@CPU_DISPATCH_DECLARE(...)
+ *
+ * Provides forward declarations for the exported variables and functions
+ * based on the enabled baseline and dispatched features.
+ *
+ * This macro requires include the config file that been generated
+ * by meson function `mod_features.multi_targets()` to determine the enabled
+ * baseline and dispatched features.
+ *
+ * For example:
+ *
+ * @code
+ * # from meson
+ * mod_features.multi_targets(
+ *  'arithmetic.dispatch.h', 'arithmetic.c',
+ *   baseline: [SSE3], dispatch: [AVX512_SKX, AVX2],
+ *   prefix: '@P@'
+ * )
+ * @code
+ *
+ * @code
+ * // from C
+ * #include "arithmetic.dispatch.h"
+ * @P@CPU_DISPATCH_DECLARE(void add, (const int *src0, const int *src1, int *dst))
+ *
+ * // Unwrapped version:
+ * void add_AVX512_SKX(const int *src0, const int *src1, int *dst);
+ * void add_AVX2(const int *src0, const int *src1, int *dst);
+ * void add(const int *src0, const int *src1, int *dst); // baseline
+ * @endcode
+ *
+ * @param ... The function or variable prototype to be declared,
+ *            with the target-specific suffix added automatically.
+ */
+#define @P@CPU_DISPATCH_DECLARE(...) \
+    @P@MTARGETS_CONF_DISPATCH(@P@CPU_DISPATCH_DECLARE_CHK_, @P@CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__) \
+    @P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_DECLARE_BASE_CB_, __VA_ARGS__)
+
+// Preprocessor callbacks
+#define @P@CPU_DISPATCH_DECLARE_CB_(DUMMY, TARGET_NAME, LEFT, ...) \
+    @P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__;
+#define @P@CPU_DISPATCH_DECLARE_BASE_CB_(LEFT, ...) \
+    LEFT __VA_ARGS__;
+// Dummy CPU runtime checking
+#define @P@CPU_DISPATCH_DECLARE_CHK_(FEATURE_NAME)
+
+/**
+ * @def @P@CPU_DISPATCH_DECLARE_XB(LEFT, ...)
+ *
+ * Same as `@P@CPU_DISPATCH_DECLARE` but exclude the baseline declaration even
+ * if it was enabled within `mod_features.multi_targets()`.
+ */
+#define @P@CPU_DISPATCH_DECLARE_XB(...) \
+    @P@MTARGETS_CONF_DISPATCH(@P@CPU_DISPATCH_DECLARE_CHK_, @P@CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__)
+
+/**
+ * @def @P@CPU_DISPATCH_CALL(...)
+ *
+ * Helper macro used for runtime dispatching of the exported functions and variables
+ * within the meson `mod_features.multi_targets()` function.
+ *
+ * This macro dispatches only one symbol based on the order of the specified features within the meson function
+ * `mod_features.multi_targets()`. For example, if `mod_features.multi_targets()` is called with
+ * `dispatch: [features_highest_1, features_highest_2]`, the macro will test each enabled feature against
+ * the CPU at runtime. Once it fails, it will move to the next order until falling back to the baseline.
+ *
+ * Similar to `@P@CPU_DISPATCH_DECLARE`, this macro requires including the config file that has been generated
+ * by the meson function `mod_features.multi_targets()` to determine the enabled baseline and dispatched features.
+ *
+ * Example usage:
+ *
+ * @code
+ * # from meson
+ * mod_features.multi_targets(
+ *   'arithmetic.dispatch.h', 'arithmetic.c',
+ *   baseline: [SSE3], dispatch: [AVX512_SKX, AVX2],
+ *   prefix: '@P@'
+ * )
+ * @endcode
+ *
+ * @code
+ * // from C
+ * #include "arithmetic.dispatch.h"
+ *
+ * // Example 1:
+ * @P@CPU_DISPATCH_CALL(add, (src0, src1, dst));
+ *
+ * // Unwrapped version:
+ * @P@CPU_HAVE(AVX512_SKX) ? add_AVX512_SKX(src0, src1, dst) :
+ *     (@P@CPU_HAVE(AVX2) ? add_AVX2(src0, src1, dst) :
+ *         add(src0, src1, dst); // baseline
+ *
+ * // Example 2:
+ * typedef void (*func_type)(const int*, const int*, int*);
+ * func_type func = @P@CPU_DISPATCH_CALL(add);
+ *
+ * // Unwrapped version:
+ * func_type func2 = @P@CPU_HAVE(AVX512_SKX) ? add_AVX512_SKX :
+ *                     (@P@CPU_HAVE(AVX2) ? add_AVX2 :
+ *                         add; // baseline
+ *
+ * // Example 3:
+ * func_type func3;
+ * @P@CPU_DISPATCH_CALL(func3 = add);
+ *
+ * // Unwrapped version:
+ * func_type func2 = @P@CPU_HAVE(AVX512_SKX) ? func3 = add_AVX512_SKX :
+ *                     (@P@CPU_HAVE(AVX2) ? func3 = add_AVX2 :
+ *                         func3 = add; // baseline
+ *
+ * @endcode
+ *
+ * @param ... The function or variable prototype to be called or assigned,
+ *            with the target-specific suffix added automatically.
+ */
+#define @P@CPU_DISPATCH_CALL(...) \
+    @P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_CB_, __VA_ARGS__) \
+    @P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_CALL_BASE_CB_, __VA_ARGS__)
+// Preprocessor callbacks
+#define @P@CPU_DISPATCH_CALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+    (TESTED_FEATURES) ? (@P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) :
+#define @P@CPU_DISPATCH_CALL_BASE_CB_(LEFT, ...) \
+    (LEFT __VA_ARGS__)
+
+/**
+ * @def @P@CPU_DISPATCH_CALL_XB(LEFT, ...)
+ *
+ * Same as `@P@CPU_DISPATCH_CALL` but exclude the baseline call even
+ * if it was provided within meson `mod_features.multi_targets()`.
+ *
+ * Note: This macro returns void
+ */
+#define @P@CPU_DISPATCH_CALL_XB_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+    (TESTED_FEATURES) ? (void) (@P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) :
+#define @P@CPU_DISPATCH_CALL_XB(...) \
+    @P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_XB_CB_, __VA_ARGS__) \
+    ((void) 0 /* discarded expression value */)
+
+/**
+ * Macro @P@CPU_DISPATCH_CALL_ALL(...)
+ *
+ * Same as `@P@CPU_DISPATCH_CALL` but dispatching all the required optimizations for
+ * the exported functions and variables instead of highest interested one.
+ * Returns void.
+ */
+#define @P@CPU_DISPATCH_CALL_ALL(...) \
+    (@P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \
+    @P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__))
+// Preprocessor callbacks
+#define @P@CPU_DISPATCH_CALL_ALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+    ((TESTED_FEATURES) ? (@P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : (void) 0),
+#define @P@CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \
+    ( LEFT __VA_ARGS__ )
+
+// Brings the headers files of enabled CPU features
+#ifdef @P@HAVE_SSE
+    #include <xmmintrin.h>
+#endif
+#ifdef @P@HAVE_SSE2
+    #include <emmintrin.h>
+#endif
+#ifdef @P@HAVE_SSE3
+    #include <pmmintrin.h>
+#endif
+#ifdef @P@HAVE_SSSE3
+    #include <tmmintrin.h>
+#endif
+#ifdef @P@HAVE_SSE41
+    #include <smmintrin.h>
+#endif
+#ifdef @P@HAVE_POPCNT
+    #ifdef _MSC_VER
+        #include <nmmintrin.h>
+    #else
+        #include <popcntintrin.h>
+    #endif
+#endif
+#ifdef @P@HAVE_AVX
+    #include <immintrin.h>
+#endif
+
+#if defined(@P@HAVE_XOP) || defined(@P@HAVE_FMA4)
+    #include <x86intrin.h>
+#endif
+
+#ifdef @P@HAVE_VSX
+    #include <altivec.h>
+#endif
+
+#ifdef @P@HAVE_VX
+    #include <vecintrin.h>
+#endif
+
+#ifdef @P@HAVE_NEON
+    #include <arm_neon.h>
+#endif
+#endif // @P@_CPU_DISPATCHER_CONF_H_
diff --git a/meson_cpu/meson.build b/meson_cpu/meson.build
new file mode 100644
index 000000000000..b99638bfc24f
--- /dev/null
+++ b/meson_cpu/meson.build
@@ -0,0 +1,307 @@
+# The CPU Dispatcher implementation.
+#
+# This script handles the CPU dispatcher and requires the Meson module
+# 'features'.
+#
+# The CPU dispatcher script is responsible for three main tasks:
+#
+# 1. Defining the enabled baseline and dispatched features by parsing build
+#    options or compiler arguments, including detection of native flags.
+#
+# 2. Specifying the baseline arguments and definitions across all sources.
+#
+# 3. Generating the main configuration file, which contains information about
+#    the enabled features, along with a collection of C macros necessary for
+#    runtime dispatching. For more details, see the template file
+#    `main_config.h.in`.
+#
+# This script exposes the following variables:
+#
+# - `CPU_BASELINE`: A set of CPU feature objects obtained from
+#                   `mod_features.new()`, representing the minimum CPU features
+#                   specified by the build option `-Dcpu-baseline`.
+#
+# - `CPU_BASELINE_NAMES`: A set of enabled CPU feature names, representing the
+#                         minimum CPU features specified by the build option
+#                         `-Dcpu-baseline`.
+#
+# - `CPU_DISPATCH_NAMES`: A set of enabled CPU feature names, representing the
+#                         additional CPU features that can be dispatched at
+#                         runtime, specified by the build option
+#                         `-Dcpu-dispatch`.
+#
+# - `CPU_FEATURES`: A dictionary containing all supported CPU feature objects.
+#
+# Additionally, this script exposes a set of variables that represent each
+# supported feature to be used within the Meson function
+# `mod_features.multi_targets()`.
+
+# Prefix used by all macros and features definitions
+CPU_CONF_PREFIX = 'NPY_'
+# main configuration name
+CPU_CONF_CONFIG = 'npy_cpu_dispatch_config.h'
+
+if get_option('disable-optimization')
+  add_project_arguments('-D' + CPU_CONF_PREFIX + 'DISABLE_OPTIMIZATION', language: ['c', 'cpp'])
+  CPU_CONF_BASELINE = 'none'
+  CPU_CONF_DISPATCH = 'none'
+else
+  baseline_detect = false
+  c_args = get_option('c_args')
+  foreach arg : c_args
+    foreach carch : ['-march', '-mcpu', '-xhost', '/QxHost']
+      if arg.contains(carch)
+        message('Appending option "detect" to "cpu-baseline" due to detecting global architecture c_arg "' + arg + '"')
+        baseline_detect = true
+        break
+      endif
+    endforeach
+    if baseline_detect
+      break
+    endif
+  endforeach
+  # The required minimal set of required CPU features.
+  CPU_CONF_BASELINE = get_option('cpu-baseline')
+  if baseline_detect
+    CPU_CONF_BASELINE += '+detect'
+  endif
+  # The required dispatched set of additional CPU features.
+  CPU_CONF_DISPATCH = get_option('cpu-dispatch')
+endif
+
+# Initialize the CPU features Export the X86 features objects 'SSE', 'AVX',
+# etc. plus a dictionary "X86_FEATURES" which maps to each object by its name
+subdir('x86')
+subdir('ppc64')
+subdir('s390x')
+subdir('arm')
+
+CPU_FEATURES = {}
+CPU_FEATURES += ARM_FEATURES
+CPU_FEATURES += X86_FEATURES
+CPU_FEATURES += PPC64_FEATURES
+CPU_FEATURES += S390X_FEATURES
+
+# Parse the requsted baseline (CPU_CONF_BASELINE) and dispatch features
+# (CPU_CONF_DISPATCH).
+cpu_family = host_machine.cpu_family()
+# Used by build option 'min'
+min_features = {
+  'x86': [SSE2],
+  'x86_64': [SSE3],
+  'ppc64': [],
+  's390x': [],
+  'arm': [],
+  'aarch64': [ASIMD]
+}.get(cpu_family, [])
+if host_machine.endian() == 'little' and cpu_family == 'ppc64'
+  min_features = [VSX2]
+endif
+
+# Used by build option 'max'
+max_features_dict = {
+  'x86': X86_FEATURES,
+  'x86_64': X86_FEATURES,
+  'ppc64': PPC64_FEATURES,
+  's390x': S390X_FEATURES,
+  'arm': ARM_FEATURES,
+  'aarch64': ARM_FEATURES,
+}.get(cpu_family, [])
+max_features = []
+foreach fet_name, fet_obj : max_features_dict
+  max_features += [fet_obj]
+endforeach
+
+parse_options = {
+  'cpu-baseline': CPU_CONF_BASELINE,
+  'cpu-dispatch': CPU_CONF_DISPATCH
+}
+parse_result = {
+  'cpu-baseline': [],
+  'cpu-dispatch': []
+}
+mod_features = import('features')
+foreach opt_name, conf : parse_options
+  # no support for regex :(?
+  tokens = conf.replace(',', ' ').replace('+', ' + ').replace('-', ' - ').strip().to_upper().split()
+  result = []
+  ignored = []
+  # append is the default
+  append = true
+  foreach tok : tokens
+    if tok == '+'
+      append = true
+      continue
+    elif tok == '-'
+      append = false
+      continue
+    elif tok == 'NONE'
+      continue
+    elif tok == 'NATIVE'
+      if not is_variable('cpu_native_features')
+        compiler_id = meson.get_compiler('c').get_id()
+        native_flags = {
+          'intel': '-xHost',
+          'intel-cl': '/QxHost',
+          # FIXME: Add support for fcc(-mcpu=a64fx) compiler
+        }.get(compiler_id, '-march=native')
+        test_native = mod_features.test(
+          max_features, anyfet: true,
+          force_args: [native_flags] + '-DDETECT_FEATURES'
+        )
+        if not test_native[0]
+          error('Option "native" doesn\'t support compiler', compiler_id)
+        endif
+        cpu_native_features = []
+        foreach fet_name : test_native[1].get('features')
+          cpu_native_features += CPU_FEATURES[fet_name]
+        endforeach
+      endif
+      accumulate = cpu_native_features
+    elif tok == 'DETECT'
+      if not is_variable('cpu_detect_features')
+        test_detect = mod_features.test(
+          max_features, anyfet: true,
+          force_args: ['-DDETECT_FEATURES'] + get_option('c_args')
+        )
+        cpu_detect_features = []
+        foreach fet_name : test_detect[1].get('features')
+          cpu_detect_features += CPU_FEATURES[fet_name]
+        endforeach
+      endif
+      accumulate = cpu_detect_features
+    elif tok == 'MIN'
+      accumulate = min_features
+    elif tok == 'MAX'
+      accumulate = max_features
+    elif tok in CPU_FEATURES
+      tokobj = CPU_FEATURES[tok]
+      if tokobj not in max_features
+        ignored += tok
+        continue
+      endif
+      accumulate = [tokobj]
+    else
+      error('Invalid token "'+tok+'" within option --'+opt_name)
+    endif
+    if append
+      foreach fet : accumulate
+        if fet not in result
+          result += fet
+        endif
+      endforeach
+    else
+      filterd = []
+      foreach fet : result
+        if fet not in accumulate
+          filterd += fet
+        endif
+      endforeach
+      result = filterd
+    endif # append
+  endforeach # tok : tokens
+  if ignored.length() > 0
+    message(
+      'During parsing ' + opt_name +
+      ': The following CPU features were ignored due to platform ' +
+      'incompatibility or lack of support:\n"' + ' '.join(ignored) + '"'
+    )
+  endif
+  if result.length() > 0
+    parse_result += {opt_name: mod_features.implicit_c(result)}
+  endif
+endforeach # opt_name, conf : parse_options
+
+# Test the baseline and dispatch features and set their flags and #definitions
+# across all sources.
+#
+# It is important to know that this test enables the maximum supported features
+# by the platform depending on the required features.
+#
+# For example, if the user specified `--cpu-baseline=avx512_skx`, and the
+# compiler doesn't support it, but still supports any of the implied features,
+# then we enable the maximum supported implied features, e.g., AVX2, which can
+# be done by specifying `anyfet: true` to the test function.
+if parse_result['cpu-baseline'].length() > 0
+    baseline = mod_features.test(parse_result['cpu-baseline'], anyfet: true)[1]
+    baseline_args = baseline['args']
+    foreach baseline_fet : baseline['defines']
+        baseline_args += ['-D' + CPU_CONF_PREFIX + 'HAVE_' + baseline_fet]
+    endforeach
+    add_project_arguments(baseline_args, language: ['c', 'cpp'])
+else
+    baseline = {}
+endif
+# The name of the baseline features including its implied features.
+CPU_BASELINE_NAMES = baseline.get('features', [])
+CPU_BASELINE = []
+foreach fet_name : CPU_BASELINE_NAMES
+  CPU_BASELINE += [CPU_FEATURES[fet_name]]
+endforeach
+# Loop over all initialized features and disable any feature that is not part
+# of the requested baseline and dispatch features to avoid it enabled by
+# import('feature').multi_targets
+foreach fet_name, fet_obj : CPU_FEATURES
+  if fet_obj in parse_result['cpu-dispatch'] or fet_name in CPU_BASELINE_NAMES
+    continue
+  endif
+  fet_obj.update(disable: 'Not part of the requsted features')
+endforeach
+
+CPU_DISPATCH_NAMES = []
+foreach fet_obj : parse_result['cpu-dispatch']
+  # skip baseline features
+  if fet_obj.get('name') in CPU_BASELINE_NAMES
+    continue
+  endif
+  fet_test = mod_features.test(fet_obj)
+  if not fet_test[0]
+    continue
+  endif
+  CPU_DISPATCH_NAMES += [fet_obj.get('name')]
+endforeach
+# Generate main configuration header see 'main_config.h.in' for more
+# clarification.
+main_config = {
+  'P': CPU_CONF_PREFIX,
+  'WITH_CPU_BASELINE': ' '.join(CPU_BASELINE_NAMES),
+  'WITH_CPU_BASELINE_N': CPU_BASELINE_NAMES.length(),
+  'WITH_CPU_DISPATCH': ' '.join(CPU_DISPATCH_NAMES),
+  'WITH_CPU_DISPATCH_N': CPU_DISPATCH_NAMES.length(),
+}
+clines = []
+macro_tpl = '@0@_CPU_EXPAND(EXEC_CB(@1@, __VA_ARGS__)) \\'
+foreach fet : CPU_BASELINE_NAMES
+  clines += macro_tpl.format(CPU_CONF_PREFIX, fet)
+endforeach
+main_config += {'WITH_CPU_BASELINE_CALL': '\n'.join(clines)}
+clines = []
+foreach fet : CPU_DISPATCH_NAMES
+  clines += macro_tpl.format(CPU_CONF_PREFIX, fet)
+endforeach
+main_config += {'WITH_CPU_DISPATCH_CALL': '\n'.join(clines)}
+
+configure_file(
+  input : 'main_config.h.in',
+  output : CPU_CONF_CONFIG,
+  configuration : configuration_data(main_config)
+)
+add_project_arguments(
+  '-I' + meson.current_build_dir(),
+  language: ['c', 'cpp']
+)
+
+message(
+'''
+CPU Optimization Options
+  baseline:
+    Requested : @0@
+    Enabled   : @1@
+  dispatch:
+    Requested : @2@
+    Enabled   : @3@
+'''.format(
+    CPU_CONF_BASELINE, ' '.join(CPU_BASELINE_NAMES),
+    CPU_CONF_DISPATCH, ' '.join(CPU_DISPATCH_NAMES)
+  )
+)
diff --git a/meson_cpu/ppc64/meson.build b/meson_cpu/ppc64/meson.build
new file mode 100644
index 000000000000..d14b23703fe3
--- /dev/null
+++ b/meson_cpu/ppc64/meson.build
@@ -0,0 +1,38 @@
+source_root = meson.project_source_root()
+mod_features = import('features')
+compiler_id = meson.get_compiler('c').get_id()
+
+VSX = mod_features.new(
+  'VSX', 1, args: '-mvsx',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx.c')[0],
+  extra_tests: {
+    'VSX_ASM': files(source_root + '/numpy/distutils/checks/extra_vsx_asm.c')[0]
+  }
+)
+if compiler_id == 'clang'
+  VSX.update(args: ['-mvsx', '-maltivec'])
+endif
+VSX2 = mod_features.new(
+  'VSX2', 2, implies: VSX, args: {'val': '-mcpu=power8', 'match': '.*vsx'},
+  detect: {'val': 'VSX2', 'match': 'VSX'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx2.c')[0],
+)
+# VSX2 is hardware baseline feature on ppc64le since the first little-endian
+# support was part of Power8
+if host_machine.endian() == 'little'
+  VSX.update(implies: VSX2)
+endif
+VSX3 = mod_features.new(
+  'VSX3', 3, implies: VSX2, args: {'val': '-mcpu=power9', 'match': '.*[mcpu=|vsx].*'},
+  detect: {'val': 'VSX3', 'match': 'VSX.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx3.c')[0],
+)
+VSX4 = mod_features.new(
+  'VSX4', 4, implies: VSX3, args: {'val': '-mcpu=power10', 'match': '.*[mcpu=|vsx].*'},
+  detect: {'val': 'VSX4', 'match': 'VSX.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx3.c')[0],
+  extra_tests: {
+    'VSX4_MMA': files(source_root + '/numpy/distutils/checks/extra_vsx4_mma.c')[0]
+  }
+)
+PPC64_FEATURES = {'VSX': VSX, 'VSX2': VSX2, 'VSX3': VSX3, 'VSX4': VSX4}
diff --git a/meson_cpu/s390x/meson.build b/meson_cpu/s390x/meson.build
new file mode 100644
index 000000000000..a69252d1607c
--- /dev/null
+++ b/meson_cpu/s390x/meson.build
@@ -0,0 +1,18 @@
+source_root = meson.project_source_root()
+mod_features = import('features')
+
+VX = mod_features.new(
+  'VX', 1, args: ['-mzvector', '-march=arch11'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vx.c')[0],
+)
+VXE = mod_features.new(
+  'VXE', 2, implies: VX, args: {'val': '-march=arch12', 'match': '-march=.*'},
+  detect: {'val': 'VXE', 'match': 'VX'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vxe.c')[0],
+)
+VXE2 = mod_features.new(
+  'VXE2', 3, implies: VXE, args: {'val': '-march=arch13', 'match': '-march=.*'},
+  detect: {'val': 'VXE2', 'match': 'VX.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vxe2.c')[0],
+)
+S390X_FEATURES = {'VX': VX, 'VXE': VXE, 'VXE2': VXE2}
diff --git a/meson_cpu/x86/meson.build b/meson_cpu/x86/meson.build
new file mode 100644
index 000000000000..caf6bf09c14e
--- /dev/null
+++ b/meson_cpu/x86/meson.build
@@ -0,0 +1,227 @@
+source_root = meson.project_source_root()
+mod_features = import('features')
+
+SSE = mod_features.new(
+  'SSE', 1,  args: '-msse',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse.c')[0]
+)
+SSE2 = mod_features.new(
+  'SSE2', 2, implies: SSE,
+  args: '-msse2',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse2.c')[0]
+)
+# enabling SSE without SSE2 is useless also it's non-optional for x86_64
+SSE.update(implies: SSE2)
+SSE3 = mod_features.new(
+  'SSE3', 3, implies: SSE2,
+  args: '-msse3',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse3.c')[0]
+)
+SSSE3 = mod_features.new(
+  'SSSE3', 4, implies: SSE3,
+  args: '-mssse3',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_ssse3.c')[0]
+)
+SSE41 = mod_features.new(
+  'SSE41', 5, implies: SSSE3,
+  args: '-msse4.1',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse41.c')[0]
+)
+POPCNT = mod_features.new(
+  'POPCNT', 6, implies: SSE41,
+  args: '-mpopcnt',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_popcnt.c')[0]
+)
+SSE42 = mod_features.new(
+  'SSE42', 7, implies: POPCNT, args: '-msse4.2',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse42.c')[0]
+)
+# 7-20 left as margin for any extra features
+AVX = mod_features.new(
+  'AVX', 20, implies: SSE42, args: '-mavx',
+  detect: {'val': 'AVX', 'match': '.*SSE.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx.c')[0]
+)
+XOP = mod_features.new(
+  'XOP', 21, implies: AVX, args: '-mxop',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_xop.c')[0]
+)
+FMA4 = mod_features.new(
+  'FMA4', 22, implies: AVX, args: '-mfma4',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_fma4.c')[0]
+)
+# x86 half-precision
+F16C = mod_features.new(
+  'F16C', 23, implies: AVX, args: '-mf16c',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_f16c.c')[0]
+)
+FMA3 = mod_features.new(
+  'FMA3', 24, implies: F16C, args: '-mfma',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_fma3.c')[0]
+)
+AVX2 = mod_features.new(
+  'AVX2', 25, implies: F16C, args: '-mavx2',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx2.c')[0]
+)
+# 25-40 left as margin for any extra features
+AVX512F = mod_features.new(
+  'AVX512F', 40, implies: [FMA3, AVX2],
+  # Disables mmx because of stack corruption that may happen during mask
+  # conversions.
+  # TODO (seiko2plus): provide more clarification
+  args: ['-mno-mmx', '-mavx512f'],
+  detect: {'val': 'AVX512F', 'match': '.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512f.c')[0],
+  extra_tests: {
+    'AVX512F_REDUCE': files(source_root + '/numpy/distutils/checks/extra_avx512f_reduce.c')[0]
+  }
+)
+AVX512CD = mod_features.new(
+  'AVX512CD', 41, implies: AVX512F, args: '-mavx512cd',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512cd.c')[0]
+)
+AVX512_KNL = mod_features.new(
+  'AVX512_KNL', 42, implies: AVX512CD, args: ['-mavx512er', '-mavx512pf'],
+  group: ['AVX512ER', 'AVX512PF'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_knl.c')[0]
+)
+AVX512_KNM = mod_features.new(
+  'AVX512_KNM', 43, implies: AVX512_KNL,
+  args: ['-mavx5124fmaps', '-mavx5124vnniw', '-mavx512vpopcntdq'],
+  group: ['AVX5124FMAPS', 'AVX5124VNNIW', 'AVX512VPOPCNTDQ'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_knm.c')[0]
+)
+AVX512_SKX = mod_features.new(
+  'AVX512_SKX', 50, implies: AVX512CD,
+  args: ['-mavx512vl', '-mavx512bw', '-mavx512dq'],
+  group: ['AVX512VL', 'AVX512BW', 'AVX512DQ'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_skx.c')[0],
+  extra_tests: {
+    'AVX512BW_MASK': files(source_root + '/numpy/distutils/checks/extra_avx512bw_mask.c')[0],
+    'AVX512DQ_MASK': files(source_root + '/numpy/distutils/checks/extra_avx512dq_mask.c')[0]
+  }
+)
+AVX512_CLX = mod_features.new(
+  'AVX512_CLX', 51, implies: AVX512_SKX, args: '-mavx512vnni',
+  group: ['AVX512VNNI'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_clx.c')[0]
+)
+AVX512_CNL = mod_features.new(
+  'AVX512_CNL', 52, implies: AVX512_SKX,
+  args: ['-mavx512ifma', '-mavx512vbmi'],
+  group: ['AVX512IFMA', 'AVX512VBMI'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_cnl.c')[0]
+)
+AVX512_ICL = mod_features.new(
+  'AVX512_ICL', 53, implies: [AVX512_CLX, AVX512_CNL],
+  args: ['-mavx512vbmi2', '-mavx512bitalg', '-mavx512vpopcntdq'],
+  group: ['AVX512VBMI2', 'AVX512BITALG', 'AVX512VPOPCNTDQ'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_icl.c')[0]
+)
+# TODO add support for zen4
+AVX512_SPR = mod_features.new(
+  'AVX512_SPR', 55, implies: AVX512_ICL,
+  args: ['-mavx512fp16'],
+  group: ['AVX512FP16'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_spr.c')[0]
+)
+
+# Specializations for non unix-like compilers
+# -------------------------------------------
+cpu_family = host_machine.cpu_family()
+compiler_id = meson.get_compiler('c').get_id()
+if compiler_id not in ['gcc', 'clang']
+  AVX512_SPR.update(disable: compiler_id + ' compiler does not support it')
+endif
+
+# Common specializations between both Intel compilers (unix-like and msvc-like)
+if compiler_id in ['intel', 'intel-cl']
+  # POPCNT, and F16C don't own private FLAGS however the compiler still
+  # provides ISA capability for them.
+  POPCNT.update(args: '')
+  F16C.update(args: '')
+  # Intel compilers don't support the following features independently
+  FMA3.update(implies: [F16C, AVX2])
+  AVX2.update(implies: [F16C, FMA3])
+  AVX512F.update(implies: [AVX2, AVX512CD, AVX512_SKX])
+  AVX512CD.update(implies: [AVX512F, AVX512_SKX])
+  XOP.update(disable: 'Intel Compiler does not support it')
+  FMA4.update(disable: 'Intel Compiler does not support it')
+endif
+
+if compiler_id == 'intel-cl'
+  foreach fet : [SSE, SSE2, SSE3, SSSE3, AVX]
+    fet.update(args: {'val': '/arch:' + fet.get('name'), 'match': '/arch:.*'})
+  endforeach
+  SSE41.update(args: {'val': '/arch:SSE4.1', 'match': '/arch:.*'})
+  SSE42.update(args: {'val': '/arch:SSE4.2', 'match': '/arch:.*'})
+  FMA3.update(args: {'val': '/arch:CORE-AVX2', 'match': '/arch:.*'})
+  AVX2.update(args: {'val': '/arch:CORE-AVX2', 'match': '/arch:.*'})
+  AVX512F.update(args: {'val': '/Qx:COMMON-AVX512', 'match': '/arch:.*'})
+  AVX512CD.update(args: {'val': '/Qx:COMMON-AVX512', 'match': '/arch:.*'})
+  AVX512_KNL.update(args: {'val': '/Qx:KNL', 'match': '/[arch|Qx]:.*'})
+  AVX512_KNM.update(args: {'val': '/Qx:KNM', 'match': '/[arch|Qx]:.*'})
+  AVX512_SKX.update(args: {'val': '/Qx:SKYLAKE-AVX512', 'match': '/[arch|Qx]:.*'})
+  AVX512_CLX.update(args: {'val': '/Qx:CASCADELAKE', 'match': '/[arch|Qx]:.*'})
+  AVX512_CNL.update(args: {'val': '/Qx:CANNONLAKE', 'match': '/[arch|Qx]:.*'})
+  AVX512_ICL.update(args: {'val': '/Qx:ICELAKE-CLIENT', 'match': '/[arch|Qx]:.*'})
+endif
+
+if compiler_id == 'intel'
+  clear_m = '^(-mcpu=|-march=)'
+  clear_any = '^(-mcpu=|-march=|-x[A-Z0-9\-])'
+  FMA3.update(args: {'val': '-march=core-avx2', 'match': clear_m})
+  AVX2.update(args: {'val': '-march=core-avx2', 'match': clear_m})
+  AVX512F.update(args: {'val': '-march=common-avx512', 'match': clear_m})
+  AVX512CD.update(args: {'val': '-march=common-avx512', 'match': clear_m})
+  AVX512_KNL.update(args: {'val': '-xKNL', 'match': clear_any})
+  AVX512_KNM.update(args: {'val': '-xKNM', 'match': clear_any})
+  AVX512_SKX.update(args: {'val': '-xSKYLAKE-AVX512', 'match': clear_any})
+  AVX512_CLX.update(args: {'val': '-xCASCADELAKE', 'match': clear_any})
+  AVX512_CNL.update(args: {'val': '-xCANNONLAKE', 'match': clear_any})
+  AVX512_ICL.update(args: {'val': '-xICELAKE-CLIENT', 'match': clear_any})
+endif
+
+if compiler_id == 'msvc'
+  # MSVC compiler doesn't support the following features
+  foreach fet : [AVX512_KNL, AVX512_KNM]
+    fet.update(disable: compiler_id + ' compiler does not support it')
+  endforeach
+  # The following features don't own private FLAGS, however the compiler still
+  # provides ISA capability for them.
+  foreach fet : [
+    SSE3, SSSE3, SSE41, POPCNT, SSE42, AVX, F16C, XOP, FMA4,
+    AVX512F, AVX512CD, AVX512_CLX, AVX512_CNL,
+    AVX512_ICL
+  ]
+    fet.update(args: '')
+  endforeach
+  # MSVC compiler doesn't support the following features independently
+  FMA3.update(implies: [F16C, AVX2])
+  AVX2.update(implies: [F16C, FMA3])
+  AVX512F.update(implies: [AVX2, AVX512CD, AVX512_SKX])
+  AVX512CD.update(implies: [AVX512F, AVX512_SKX])
+  clear_arch = '/arch:.*'
+  # only available on 32-bit. Its enabled by default on 64-bit mode
+  foreach fet : [SSE, SSE2]
+    if cpu_family == 'x86'
+      fet.update(args: {'val': '/arch:' + fet.get('name'), 'match': clear_arch})
+    else
+      fet.update(args: '')
+    endif
+  endforeach
+  FMA3.update(args: {'val': '/arch:AVX2', 'match': clear_arch})
+  AVX2.update(args: {'val': '/arch:AVX2', 'match': clear_arch})
+  AVX512_SKX.update(args: {'val': '/arch:AVX512', 'match': clear_arch})
+endif
+
+X86_FEATURES = {
+ 'SSE': SSE, 'SSE2': SSE2, 'SSE3': SSE3, 'SSSE3': SSSE3,
+ 'SSE41': SSE41, 'POPCNT': POPCNT, 'SSE42': SSE42, 'AVX': AVX,
+ 'XOP': XOP, 'FMA4': FMA4, 'F16C': F16C, 'FMA3': FMA3,
+ 'AVX2': AVX2, 'AVX512F': AVX512F, 'AVX512CD': AVX512CD,
+ 'AVX512_KNL': AVX512_KNL, 'AVX512_KNM': AVX512_KNM,
+ 'AVX512_SKX': AVX512_SKX, 'AVX512_CLX': AVX512_CLX,
+ 'AVX512_CNL': AVX512_CNL, 'AVX512_ICL': AVX512_ICL,
+ 'AVX512_SPR': AVX512_SPR
+}
diff --git a/meson_options.txt b/meson_options.txt
index 7ce4eefacd89..8b1fad6c4041 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -1,7 +1,7 @@
 option('blas', type: 'string', value: 'openblas',
-        description: 'option for BLAS library switching')
+        description: 'Option for BLAS library switching')
 option('lapack', type: 'string', value: 'openblas',
-        description: 'option for LAPACK library switching')
+        description: 'Option for LAPACK library switching')
 option('allow-noblas', type: 'boolean', value: false,
         description: 'If set to true, allow building with (slow!) internal fallback routines')
 option('use-ilp64', type: 'boolean', value: false,
@@ -12,8 +12,22 @@ option('disable-svml', type: 'boolean', value: false,
         description: 'Disable building against SVML')
 option('disable-threading', type: 'boolean', value: false,
         description: 'Disable threading support (see `NPY_ALLOW_THREADS` docs)')
-# TODO: flip value to 'false' once we have `npy_cpu_dispatch_config.h` & co.
-option('disable-simd-optimizations', type: 'boolean', value: true,
-        description: 'Disable SIMD features beyond the baseline ones')
+option('disable-optimization', type: 'boolean', value: false,
+        description: 'Disable CPU optimized code (dispatch,simd,unroll...)')
+option('cpu-baseline', type: 'string', value: 'min',
+        description: 'Minimal set of required CPU features')
+option('cpu-dispatch', type: 'string', value: 'max -xop -fma4',
+        description: 'Dispatched set of additional CPU features')
+option('test-simd', type: 'array',
+        value: [
+          'BASELINE', 'SSE2', 'SSE42', 'XOP', 'FMA4',
+          'AVX2', 'FMA3', 'AVX2,FMA3', 'AVX512F', 'AVX512_SKX',
+          'VSX', 'VSX2', 'VSX3', 'VSX4',
+          'NEON', 'ASIMD',
+          'VX', 'VXE', 'VXE2',
+        ],
+        description: 'Specify a list of CPU features to be tested against NumPy SIMD interface')
+option('test-simd-args', type: 'string', value: '',
+        description: 'Extra args to be passed to the `_simd` module that is used for testing the NumPy SIMD interface')
 option('relaxed-strides-debug', type: 'boolean', value: false,
         description: 'Enable relaxed strides debug mode (see `NPY_RELAXED_STRIDES_DEBUG` docs)')
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 17760efa2fc0..ccc060aacb96 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -84,6 +84,7 @@ cdata.set('NPY_API_VERSION', C_API_VERSION)
 use_svml = (
   host_machine.system() == 'linux' and
   host_machine.cpu_family() == 'x86_64' and
+  ('AVX512_SKX' in CPU_DISPATCH_NAMES or 'AVX512_SKX' in CPU_BASELINE_NAMES) and
   not get_option('disable-svml')
 )
 if use_svml
@@ -291,9 +292,6 @@ endforeach
 
 # SSE headers only enabled automatically on amd64/x32 builds
 optional_headers = [
-  'xmmintrin.h',  # SSE
-  'emmintrin.h',  # SSE2
-  'immintrin.h',  # AVX
   'features.h',   # for glibc version linux
   'xlocale.h',    # see GH#8367
   'dlfcn.h',      # dladdr
@@ -322,6 +320,15 @@ optional_function_attributes = [
 #  endif
 #endforeach
 
+# Max possible optimization flags. We pass this flags to all our dispatch-able
+# (multi_targets) sources.
+compiler_id = cc.get_id()
+max_opt = {
+  'msvc': ['/O2'],
+  'intel-cl': ['/O3'],
+}.get(compiler_id, ['-O3'])
+max_opt = cc.has_multi_arguments(max_opt) ? max_opt : []
+
 # Optional GCC compiler builtins and their call arguments.
 # If given, a required header and definition name (HAVE_ prepended)
 # Call arguments are required as the compiler will do strict signature checking
@@ -513,12 +520,6 @@ if cc.get_id() == 'msvc'
      staticlib_cflags +=  '-d2VolatileMetadata-'
   endif
 endif
-# TODO: change to "feature" option in meson_options.txt? See
-# https://mesonbuild.com/Build-options.html#build-options
-if get_option('disable-simd-optimizations')
-  staticlib_cflags += '-DNPY_DISABLE_OPTIMIZATION'
-  staticlib_cppflags += '-DNPY_DISABLE_OPTIMIZATION'
-endif
 
 npy_math_internal_h = custom_target(
   output: 'npy_math_internal.h',
@@ -626,19 +627,10 @@ src_ufunc_api = custom_target('__ufunc_api',
 
 # Set common build flags for C and C++ code
 # -----------------------------------------
-
-# TODO: change to "feature" option in meson_options.txt? See
-# https://mesonbuild.com/Build-options.html#build-options
-disable_simd_optimizations = []
-if get_option('disable-simd-optimizations')
-  disable_simd_optimizations = '-DNPY_DISABLE_OPTIMIZATION'
-endif
-
 # Common build flags
 c_args_common = [
   '-DNPY_INTERNAL_BUILD',
   '-DHAVE_NPY_CONFIG_H',
-  disable_simd_optimizations,
   cflags_large_file_support,
 ]
 
@@ -667,11 +659,9 @@ np_core_dep = declare_dependency(
     '.',
     'include',
     'src/common',
-  ],
-  compile_args: disable_simd_optimizations
+  ]
 )
 
-
 # Build multiarray_tests module
 # -----------------------------
 py.extension_module('_multiarray_tests',
@@ -691,15 +681,30 @@ py.extension_module('_multiarray_tests',
   subdir: 'numpy/core',
 )
 
+_umath_tests_mtargets = mod_features.multi_targets(
+  '_umath_tests.dispatch.h',
+  'src/umath/_umath_tests.dispatch.c',
+  dispatch: [
+    AVX2, SSE41, SSE2,
+    ASIMDHP, ASIMD, NEON,
+    VSX3, VSX2, VSX,
+    VXE, VX,
+  ],
+  baseline: CPU_BASELINE,
+  prefix: 'NPY_',
+  dependencies: [py_dep, np_core_dep]
+)
+
 test_modules_src = [
   ['_umath_tests', [
       src_file.process('src/umath/_umath_tests.c.src'),
-      'src/umath/_umath_tests.dispatch.c',
       'src/common/npy_cpu_features.c',
-    ]],
-  ['_rational_tests', 'src/umath/_rational_tests.c'],
-  ['_struct_ufunc_tests', 'src/umath/_struct_ufunc_tests.c'],
-  ['_operand_flag_tests', 'src/umath/_operand_flag_tests.c'],
+    ],
+    _umath_tests_mtargets.static_lib('_umath_tests_mtargets')
+  ],
+  ['_rational_tests', 'src/umath/_rational_tests.c', []],
+  ['_struct_ufunc_tests', 'src/umath/_struct_ufunc_tests.c', []],
+  ['_operand_flag_tests', 'src/umath/_operand_flag_tests.c', []],
 ]
 foreach gen: test_modules_src
   py.extension_module(gen[0],
@@ -709,7 +714,261 @@ foreach gen: test_modules_src
     dependencies: np_core_dep,
     install: true,
     subdir: 'numpy/core',
+    link_with: gen[2],
+  )
+endforeach
+
+# Build multiarray dispatch-able sources
+# --------------------------------------
+multiarray_gen_headers = [
+  src_file.process('src/multiarray/arraytypes.h.src'),
+  src_file.process('src/common/npy_sort.h.src'),
+]
+foreach gen_mtargets : [
+  [
+    'argfunc.dispatch.h',
+    src_file.process('src/multiarray/argfunc.dispatch.c.src'),
+    [
+      AVX512_SKX, AVX2, XOP, SSE42, SSE2,
+      VSX2,
+      ASIMD, NEON,
+      VXE, VX
+    ]
+  ],
+]
+  mtargets = mod_features.multi_targets(
+    gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1],
+    dispatch: gen_mtargets[2],
+    baseline: CPU_BASELINE,
+    prefix: 'NPY_',
+    dependencies: [py_dep, np_core_dep],
+    c_args: c_args_common + max_opt,
+    cpp_args: cpp_args_common + max_opt,
+    include_directories: [
+      'include',
+      'src/common',
+      'src/multiarray',
+      'src/npymath',
+      'src/umath'
+    ]
+  )
+  if not is_variable('multiarray_umath_mtargets')
+    multiarray_umath_mtargets = mtargets
+  else
+    multiarray_umath_mtargets.extend(mtargets)
+  endif
+endforeach
+
+# Build npysort dispatch-able sources
+# -----------------------------------
+foreach gen_mtargets : [
+  [
+    'simd_qsort.dispatch.h',
+    'src/npysort/simd_qsort.dispatch.cpp',
+    [AVX512_SKX]
+  ],
+  [
+    'simd_qsort_16bit.dispatch.h',
+    'src/npysort/simd_qsort_16bit.dispatch.cpp',
+    [AVX512_SPR, AVX512_ICL]
+  ],
+]
+  mtargets = mod_features.multi_targets(
+    gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1],
+    dispatch: gen_mtargets[2],
+    # baseline: CPU_BASELINE, it doesn't provide baseline fallback
+    prefix: 'NPY_',
+    dependencies: [py_dep, np_core_dep],
+    c_args: c_args_common + max_opt,
+    cpp_args: cpp_args_common + max_opt,
+    include_directories: [
+      'include',
+      'src/common',
+      'src/multiarray',
+      'src/npymath',
+      'src/umath'
+    ]
   )
+  if not is_variable('multiarray_umath_mtargets')
+    multiarray_umath_mtargets = mtargets
+  else
+    multiarray_umath_mtargets.extend(mtargets)
+  endif
+endforeach
+
+# Build umath dispatch-able sources
+# ---------------------------------
+mod_features = import('features')
+umath_gen_headers = [
+  src_file.process('src/umath/loops.h.src'),
+  src_file.process('src/umath/loops_utils.h.src'),
+]
+
+foreach gen_mtargets : [
+  [
+    'loops_arithm_fp.dispatch.h',
+    src_file.process('src/umath/loops_arithm_fp.dispatch.c.src'),
+    [
+      [AVX2, FMA3], SSE2,
+      ASIMD, NEON,
+      VSX3, VSX2,
+      VXE, VX,
+    ]
+  ],
+  [
+    'loops_arithmetic.dispatch.h',
+    src_file.process('src/umath/loops_arithmetic.dispatch.c.src'),
+    [
+      AVX512_SKX, AVX512F, AVX2, SSE41, SSE2,
+      NEON,
+      VSX4, VSX2,
+      VX,
+    ]
+  ],
+  [
+    'loops_comparison.dispatch.h',
+    src_file.process('src/umath/loops_comparison.dispatch.c.src'),
+    [
+      AVX512_SKX, AVX512F, AVX2, SSE42, SSE2,
+      VSX3, VSX2,
+      NEON,
+      VXE, VX,
+    ]
+  ],
+  [
+    'loops_exponent_log.dispatch.h',
+    src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
+    # Enabling SIMD on clang-cl raises spurious FP exceptions
+    # TODO (seiko2plus): debug spurious FP exceptions for single-precision log/exp
+    compiler_id == 'clang-cl' ? [] : [
+      AVX512_SKX, AVX512F, [AVX2, FMA3]
+    ]
+  ],
+  [
+    'loops_hyperbolic.dispatch.h',
+    src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'),
+    [
+      AVX512_SKX, [AVX2, FMA3],
+      VSX4, VSX2,
+      NEON_VFPV4,
+      VXE, VX
+    ]
+  ],
+  [
+    'loops_logical.dispatch.h',
+    src_file.process('src/umath/loops_logical.dispatch.c.src'),
+    [
+      ASIMD, NEON,
+      AVX512_SKX, AVX2, SSE2,
+      VSX2,
+      VX,
+    ]
+  ],
+  [
+    'loops_minmax.dispatch.h',
+    src_file.process('src/umath/loops_minmax.dispatch.c.src'),
+    [
+      ASIMD, NEON,
+      AVX512_SKX, AVX2, SSE2,
+      VSX2,
+      VXE, VX,
+    ]
+  ],
+  [
+    'loops_modulo.dispatch.h',
+    src_file.process('src/umath/loops_modulo.dispatch.c.src'),
+    [
+      VSX4
+    ]
+  ],
+  [
+    'loops_trigonometric.dispatch.h',
+    src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
+    # Enabling SIMD on clang-cl raises spurious FP exceptions
+    # TODO (seiko2plus): debug spurious FP exceptions for single-precision sin/cos
+    compiler_id == 'clang-cl' ? [] : [
+      AVX512F, [AVX2, FMA3],
+      VSX4, VSX3, VSX2,
+      NEON_VFPV4,
+      VXE2, VXE
+    ]
+  ],
+  [
+    'loops_umath_fp.dispatch.h',
+    src_file.process('src/umath/loops_umath_fp.dispatch.c.src'),
+    [AVX512_SKX]
+  ],
+  [
+    'loops_unary.dispatch.h',
+    src_file.process('src/umath/loops_unary.dispatch.c.src'),
+    [
+      ASIMD, NEON,
+      AVX512_SKX, AVX2, SSE2,
+      VSX2,
+      VXE, VX
+    ]
+  ],
+  [
+    'loops_unary_fp.dispatch.h',
+    src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
+    [
+      SSE41, SSE2,
+      VSX2,
+      ASIMD, NEON,
+      VXE, VX
+    ]
+  ],
+  [
+    'loops_unary_fp_le.dispatch.h',
+    src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
+    [
+      SSE41, SSE2,
+      VSX2,
+      ASIMD, NEON,
+    ]
+  ],
+  [
+    'loops_unary_complex.dispatch.h',
+    src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
+    [
+      AVX512F, [AVX2, FMA3], SSE2,
+      ASIMD, NEON,
+      VSX3, VSX2,
+      VXE, VX,
+    ]
+  ],
+  [
+    'loops_autovec.dispatch.h',
+    src_file.process('src/umath/loops_autovec.dispatch.c.src'),
+    [
+      AVX2, SSE2,
+      NEON,
+      VSX2,
+      VX,
+    ]
+  ],
+]
+  mtargets = mod_features.multi_targets(
+    gen_mtargets[0], umath_gen_headers + gen_mtargets[1],
+    dispatch: gen_mtargets[2],
+    baseline: CPU_BASELINE,
+    prefix: 'NPY_',
+    dependencies: [py_dep, np_core_dep],
+    c_args: c_args_common + max_opt,
+    cpp_args: cpp_args_common + max_opt,
+    include_directories: [
+      'include',
+      'src/common',
+      'src/multiarray',
+      'src/npymath',
+      'src/umath'
+    ]
+  )
+  if not is_variable('multiarray_umath_mtargets')
+    multiarray_umath_mtargets = mtargets
+  else
+    multiarray_umath_mtargets.extend(mtargets)
+  endif
 endforeach
 
 # Build _multiarray_umath module
@@ -733,12 +992,10 @@ if have_blas
   ]
 endif
 
-src_multiarray = [
+src_multiarray = multiarray_gen_headers + [
   'src/multiarray/abstractdtypes.c',
   'src/multiarray/alloc.c',
-  src_file.process('src/multiarray/argfunc.dispatch.c.src'),
   'src/multiarray/arrayobject.c',
-  src_file.process('src/multiarray/arraytypes.h.src'),
   'src/multiarray/array_coercion.c',
   'src/multiarray/array_method.c',
   'src/multiarray/array_assign_scalar.c',
@@ -792,9 +1049,6 @@ src_multiarray = [
   'src/multiarray/typeinfo.c',
   'src/multiarray/usertypes.c',
   'src/multiarray/vdot.c',
-  src_file.process('src/common/npy_sort.h.src'),
-  'src/npysort/simd_qsort.dispatch.cpp',
-  'src/npysort/simd_qsort_16bit.dispatch.cpp',
   'src/npysort/quicksort.cpp',
   'src/npysort/mergesort.cpp',
   'src/npysort/timsort.cpp',
@@ -817,26 +1071,9 @@ src_multiarray = [
   'src/npymath/arm64_exports.c',
 ]
 
-src_umath = [
+src_umath = umath_gen_headers + [
   src_file.process('src/umath/funcs.inc.src'),
-  src_file.process('src/umath/loops.h.src'),
-  src_file.process('src/umath/loops_utils.h.src'),
   src_file.process('src/umath/loops.c.src'),
-  src_file.process('src/umath/loops_arithm_fp.dispatch.c.src'),
-  src_file.process('src/umath/loops_arithmetic.dispatch.c.src'),
-  src_file.process('src/umath/loops_comparison.dispatch.c.src'),
-  src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
-  src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'),
-  src_file.process('src/umath/loops_logical.dispatch.c.src'),
-  src_file.process('src/umath/loops_minmax.dispatch.c.src'),
-  src_file.process('src/umath/loops_modulo.dispatch.c.src'),
-  src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
-  src_file.process('src/umath/loops_umath_fp.dispatch.c.src'),
-  src_file.process('src/umath/loops_unary.dispatch.c.src'),
-  src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
-  src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
-  src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
-  src_file.process('src/umath/loops_autovec.dispatch.c.src'),
   src_file.process('src/umath/matmul.c.src'),
   src_file.process('src/umath/matmul.h.src'),
   'src/umath/ufunc_type_resolution.c',
@@ -863,52 +1100,24 @@ src_umath = [
 # may be able to avoid the accuracy regressions in SVML.
 svml_objects = []
 if use_svml
-  svml_objects += [
-    'src/umath/svml/linux/avx512/svml_z0_acos_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_acos_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_acosh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_acosh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_asin_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_asin_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_asinh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_asinh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atan2_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atan2_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atan_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atan_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atanh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atanh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cbrt_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cbrt_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cos_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cos_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cosh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cosh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_exp2_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_exp2_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_exp_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_exp_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_expm1_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_expm1_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log10_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log10_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log1p_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log1p_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log2_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log2_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_pow_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_pow_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_sin_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_sin_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_sinh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_sinh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_tan_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_tan_s_la.s',
-    # 'src/umath/svml/linux/avx512/svml_z0_tanh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_tanh_s_la.s',
+  foreach svml_func : [
+    'acos', 'acosh', 'asin',
+    'asinh', 'atan2',
+    'atan',  'atanh',
+    'cbrt', 'cos',
+    'cosh', 'exp2',
+    'exp', 'expm1',
+    'log10', 'log1p',
+    'log2', 'log',
+    'pow', 'sin', 'sinh', 'tan',
+    'tanh'
   ]
+    foreach svml_sfx : ['d_la', 's_la', 'd_ha', 's_la']
+      svml_objects += [
+        'src/umath/svml/linux/avx512/svml_z0_'+svml_func+'_'+svml_sfx+'.s'
+      ]
+    endforeach
+  endforeach
 endif
 
 py.extension_module('_multiarray_umath',
@@ -934,26 +1143,60 @@ py.extension_module('_multiarray_umath',
     'src/umath',
   ],
   dependencies: blas_dep,
-  link_with: npymath_lib,
+  link_with: [npymath_lib, multiarray_umath_mtargets.static_lib('_multiarray_umath_mtargets')],
   install: true,
   subdir: 'numpy/core',
 )
 
 # Build SIMD module
 # -----------------
+_simd_dispatch = []
+_simd_baseline = []
+foreach target : get_option('test-simd')
+  target = target.strip().to_upper().split(',')
+  mfeatures = []
+  foreach fet_name : target
+    if fet_name == 'BASELINE'
+      _simd_baseline = CPU_BASELINE
+      break
+    endif
+    if fet_name not in CPU_FEATURES
+      error('Expected a valid feature name, got('+fet_name+')')
+    endif
+    mfeatures += CPU_FEATURES[fet_name]
+  endforeach
+  _simd_dispatch += [mfeatures]
+endforeach
 
-py.extension_module('_simd',
+_simd_mtargets = mod_features.multi_targets(
+  '_simd.dispatch.h',
   [
-    'src/common/npy_cpu_features.c',
-    'src/_simd/_simd.c',
     src_file.process('src/_simd/_simd_inc.h.src'),
     src_file.process('src/_simd/_simd_data.inc.src'),
     src_file.process('src/_simd/_simd.dispatch.c.src'),
   ],
+  # Skip validating the order of `_simd_dispatch` because we execute all these
+  # features, not just the highest interest one. The sorting doesn't matter
+  # here, given the nature of this testing unit.
+  keep_sort: true,
+  dispatch: _simd_dispatch,
+  baseline: _simd_baseline,
+  prefix: 'NPY_',
+  dependencies: [py_dep, np_core_dep],
+  include_directories: ['src/_simd', 'src/npymath'],
+  c_args: c_args_common,
+  cpp_args: cpp_args_common,
+)
+
+py.extension_module('_simd',
+  [
+    'src/common/npy_cpu_features.c',
+    'src/_simd/_simd.c',
+  ],
   c_args: c_args_common,
   include_directories: ['src/_simd', 'src/npymath'],
   dependencies: np_core_dep,
-  link_with: npymath_lib,
+  link_with: [npymath_lib, _simd_mtargets.static_lib('_simd_mtargets')],
   install: true,
   subdir: 'numpy/core',
 )
diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c
index 52b66e7652a8..5a113fe57876 100644
--- a/numpy/core/src/_simd/_simd.c
+++ b/numpy/core/src/_simd/_simd.c
@@ -85,9 +85,13 @@ PyMODINIT_FUNC PyInit__simd(void)
                 goto err;                                                      \
             }                                                                  \
         }
-
-    NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
-    NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+    #ifdef NPY__CPU_MESON_BUILD
+        NPY_MTARGETS_CONF_DISPATCH(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
+        NPY_MTARGETS_CONF_BASELINE(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+    #else
+        NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
+        NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+    #endif
     return m;
 err:
     Py_DECREF(m);
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index f532c9e022f7..51f5ddd54b22 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -919,7 +919,9 @@ NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
 {
     static struct PyModuleDef defs = {
         .m_base = PyModuleDef_HEAD_INIT,
-    #ifdef NPY__CPU_TARGET_CURRENT
+    #if defined(NPY_MTARGETS_CURRENT) // meson build
+        .m_name = "numpy.core._simd." NPY_TOSTRING(NPY_MTARGETS_CURRENT),
+    #elif defined(NPY__CPU_TARGET_CURRENT)
         .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT),
     #else
         .m_name = "numpy.core._simd.baseline",
diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h
index 4d5addec809e..699f8536f6a2 100644
--- a/numpy/core/src/common/npy_cpu_dispatch.h
+++ b/numpy/core/src/common/npy_cpu_dispatch.h
@@ -43,6 +43,7 @@
         #endif
     #endif
 #endif // !NPY_DISABLE_OPTIMIZATION
+#ifndef NPY__CPU_MESON_BUILD
 /**
  * Macro NPY_CPU_DISPATCH_CURFX(NAME)
  *
@@ -261,5 +262,5 @@
     ((TESTED_FEATURES) ? (NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : (void) 0),
 #define NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \
     ( LEFT __VA_ARGS__ )
-
+#endif // NPY__CPU_MESON_BUILD
 #endif  // NUMPY_CORE_SRC_COMMON_NPY_CPU_DISPATCH_H_
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 72a87eac1715..357b136d25cd 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -321,7 +321,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
     {
         npyv_f32 m = npyv_mul_f32(a, b);
-    #if NPY_HAVE_SSE3
+    #ifdef NPY_HAVE_SSE3
         return _mm_addsub_ps(m, c);
     #else
         const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
@@ -331,7 +331,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
     {
         npyv_f64 m = npyv_mul_f64(a, b);
-    #if NPY_HAVE_SSE3
+    #ifdef NPY_HAVE_SSE3
         return _mm_addsub_pd(m, c);
     #else
         const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);