diff --git a/.github/meson_actions/action.yml b/.github/meson_actions/action.yml
new file mode 100644
index 000000000000..aff70da169bc
--- /dev/null
+++ b/.github/meson_actions/action.yml
@@ -0,0 +1,29 @@
+name: MesonBuildTest
+description: "checkout repo, build, and test numpy"
+runs:
+  using: composite
+  steps:
+  - name: Install dependencies
+    shell: bash
+    run: pip install -r build_requirements.txt
+  - name: Build
+    shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"'
+    env:
+      TERM: xterm-256color
+    run:
+      spin build -- ${MESON_ARGS[@]}
+  - name: Check build-internal dependencies
+    shell: bash
+    run:
+      ninja -C build -t missingdeps
+  - name: Check installed test and stub files
+    shell: bash
+    run:
+      python tools/check_installed_files.py $(find ./build-install -path '*/site-packages/numpy')
+  - name: Test
+    shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"'
+    env:
+      TERM: xterm-256color
+    run: |
+      pip install pytest pytest-xdist hypothesis typing_extensions
+      spin test -j auto
diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml
index 928018b13905..b0a24d7730a1 100644
--- a/.github/workflows/build_test.yml
+++ b/.github/workflows/build_test.yml
@@ -49,7 +49,7 @@ jobs:
     if: "github.repository == 'numpy/numpy'"
     runs-on: ubuntu-latest
     env:
-      WITHOUT_SIMD: 1
+      MESON_ARGS: "-Dallow-noblas=true -Dcpu-baseline=none -Dcpu-dispatch=none"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -58,7 +58,7 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   basic:
     needs: [smoke_test]
@@ -122,7 +122,7 @@ jobs:
     runs-on: ubuntu-latest
     if: github.event_name != 'push'
     env:
-      WITHOUT_OPTIMIZATIONS: 1
+      MESON_ARGS: "-Dallow-noblas=true -Ddisable-optimization=true"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -131,14 +131,14 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   with_baseline_only:
     needs: [smoke_test]
     runs-on: ubuntu-latest
     if: github.event_name != 'push'
     env:
-      CPU_DISPATCH: "none"
+      MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=none"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -147,14 +147,14 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   without_avx512:
     needs: [smoke_test]
     runs-on: ubuntu-latest
     if: github.event_name != 'push'
     env:
-      CPU_DISPATCH: "max -xop -fma4 -avx512f -avx512cd -avx512_knl -avx512_knm -avx512_skx -avx512_clx -avx512_cnl -avx512_icl"
+      MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=SSSE3,SSE41,POPCNT,SSE42,AVX,F16C,AVX2,FMA3"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -163,14 +163,14 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   without_avx512_avx2_fma3:
     needs: [smoke_test]
     runs-on: ubuntu-latest
     if: github.event_name != 'push'
     env:
-      CPU_DISPATCH: "SSSE3 SSE41 POPCNT SSE42 AVX F16C"
+      MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=SSSE3,SSE41,POPCNT,SSE42,AVX,F16C"
     steps:
     - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
       with:
@@ -179,7 +179,7 @@ jobs:
     - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-    - uses: ./.github/actions
+    - uses: ./.github/meson_actions
 
   debug:
     needs: [smoke_test]
diff --git a/MANIFEST.in b/MANIFEST.in
index ab6ecd518e1b..4803b39131e1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -21,6 +21,8 @@ recursive-include numpy/random *.pyx *.pxd *.pyx.in *.pxd.in
 include numpy/py.typed
 include numpy/random/include/*
 include numpy/*.pxd
+# Meson CPU Dispatcher
+recursive-include meson_cpu *.build *.in
 # Add build support that should go in sdist, but not go in bdist/be installed
 # Note that sub-directories that don't have __init__ are apparently not
 # included by 'recursive-include', so list those separately
diff --git a/build_requirements.txt b/build_requirements.txt
index 3627f1b91685..e7e776a7de89 100644
--- a/build_requirements.txt
+++ b/build_requirements.txt
@@ -1,5 +1,5 @@
-meson-python>=0.10.0
-Cython
+meson-python>=0.13.1
+Cython>=3.0
 wheel==0.38.1
 ninja
 spin==0.4
diff --git a/doc/source/user/quickstart.rst b/doc/source/user/quickstart.rst
index 783d5a447df9..bc6c3b3818d2 100644
--- a/doc/source/user/quickstart.rst
+++ b/doc/source/user/quickstart.rst
@@ -517,7 +517,7 @@ and other Python sequences.
     >>> for i in a:
     ...     print(i**(1 / 3.))
     ...
-    9.999999999999998
+    9.999999999999998  # may vary
     1.0
     9.999999999999998
     3.0
diff --git a/meson.build b/meson.build
index 8bfe987715d1..33d0e7b462ef 100644
--- a/meson.build
+++ b/meson.build
@@ -6,7 +6,7 @@ project(
   # See `numpy/__init__.py`
   version: '1.26.0.dev0',
   license: 'BSD-3',
-  meson_version: '>= 1.1.0',
+  meson_version: '>=1.2.99',  # version in vendored-meson is 1.2.99
   default_options: [
     'buildtype=debugoptimized',
     'b_ndebug=if-release',
@@ -80,4 +80,5 @@ else
     meson.add_dist_script(py, versioneer, '-o', '_version_meson.py')
 endif
 
+subdir('meson_cpu')
 subdir('numpy')
diff --git a/meson_cpu/arm/meson.build b/meson_cpu/arm/meson.build
new file mode 100644
index 000000000000..f968b2e99682
--- /dev/null
+++ b/meson_cpu/arm/meson.build
@@ -0,0 +1,58 @@
+source_root = meson.project_source_root()
+mod_features = import('features')
+NEON = mod_features.new(
+  'NEON', 1,
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_neon.c')[0]
+)
+NEON_FP16 = mod_features.new(
+  'NEON_FP16', 2, implies: NEON,
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_neon_fp16.c')[0]
+)
+# FMA
+NEON_VFPV4 = mod_features.new(
+  'NEON_VFPV4', 3, implies: NEON_FP16,
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_neon_vfpv4.c')[0]
+)
+# Advanced SIMD
+ASIMD = mod_features.new(
+  'ASIMD', 4, implies: NEON_VFPV4, detect: {'val': 'ASIMD', 'match': 'NEON.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_asimd.c')[0]
+)
+cpu_family = host_machine.cpu_family()
+if cpu_family == 'aarch64'
+  # hardware baseline
+  NEON.update(implies: [NEON_FP16, NEON_VFPV4, ASIMD])
+  NEON_FP16.update(implies: [NEON, NEON_VFPV4, ASIMD])
+  NEON_VFPV4.update(implies: [NEON, NEON_FP16, ASIMD])
+elif cpu_family == 'arm'
+  NEON.update(args: '-mfpu=neon')
+  NEON_FP16.update(args: ['-mfp16-format=ieee', {'val': '-mfpu=neon-fp16', 'match': '-mfpu=.*'}])
+  NEON_VFPV4.update(args: [{'val': '-mfpu=neon-vfpv4', 'match': '-mfpu=.*'}])
+  ASIMD.update(args: [
+    {'val': '-mfpu=neon-fp-armv8', 'match': '-mfpu=.*'},
+    '-march=armv8-a+simd'
+  ])
+endif
+# ARMv8.2 half-precision & vector arithm
+ASIMDHP = mod_features.new(
+  'ASIMDHP', 5, implies: ASIMD,
+  args: {'val': '-march=armv8.2-a+fp16', 'match': '-march=.*', 'mfilter': '\+.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_asimdhp.c')[0]
+)
+## ARMv8.2 dot product
+ASIMDDP = mod_features.new(
+  'ASIMDDP', 6, implies: ASIMD,
+  args: {'val': '-march=armv8.2-a+dotprod', 'match': '-march=.*', 'mfilter': '\+.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_asimddp.c')[0]
+)
+## ARMv8.2 Single & half-precision Multiply
+ASIMDFHM = mod_features.new(
+  'ASIMDFHM', 7, implies: ASIMDHP,
+  args: {'val': '-march=armv8.2-a+fp16fml', 'match': '-march=.*', 'mfilter': '\+.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_asimdfhm.c')[0]
+)
+# TODO: Add support for MSVC
+ARM_FEATURES = {
+  'NEON': NEON, 'NEON_FP16': NEON_FP16, 'NEON_VFPV4': NEON_VFPV4,
+  'ASIMD': ASIMD, 'ASIMDHP': ASIMDHP, 'ASIMDFHM': ASIMDFHM
+}
diff --git a/meson_cpu/main_config.h.in b/meson_cpu/main_config.h.in
new file mode 100644
index 000000000000..c7c13b2c7eb1
--- /dev/null
+++ b/meson_cpu/main_config.h.in
@@ -0,0 +1,351 @@
+/*
+ * Main configuration header of the CPU dispatcher.
+ *
+ * This header is autogenerated by the Meson build script located at `meson_cpu/meson.build`.
+ * It provides a set of utilities that are required for the runtime dispatching process.
+ *
+ * The most important macros in this header are:
+ *   - @ref @P@CPU_DISPATCH_DECLARE: Used to declare the dispatched functions and variables.
+ *   - @ref @P@CPU_DISPATCH_CURFX: Used to define the dispatched functions with target-specific suffixes.
+ *   - @ref @P@CPU_DISPATCH_CALL: Used for runtime dispatching of the exported functions and variables.
+ */
+#ifndef @P@_CPU_DISPATCHER_CONF_H_
+#define @P@_CPU_DISPATCHER_CONF_H_
+/// This definition is required to provides comptablity with NumPy distutils
+#define @P@_CPU_MESON_BUILD
+/**
+ * @def @P@WITH_CPU_BASELINE
+ * Enabled baseline features names as a single string where each is separated by a single space.
+ * For example: "SSE SSE2 SSE3"
+ * Required for logging purposes only.
+ */
+#define @P@WITH_CPU_BASELINE "@WITH_CPU_BASELINE@"
+/**
+ * @def @P@WITH_CPU_BASELINE_N
+ * Number of enabled baseline features.
+ */
+#define @P@WITH_CPU_BASELINE_N @WITH_CPU_BASELINE_N@
+/**
+ * @def @P@WITH_CPU_DISPATCH
+ * Dispatched features names as a single string where each is separated by a single space.
+ */
+#define @P@WITH_CPU_DISPATCH "@WITH_CPU_DISPATCH@"
+/**
+ * @def @P@WITH_CPU_DISPATCH_N
+ * Number of enabled dispatched features.
+ */
+#define @P@WITH_CPU_DISPATCH_N @WITH_CPU_DISPATCH_N@
+// Expand a macro, used by the following macros
+#define @P@_CPU_EXPAND(X) X
+#define @P@_CPU_CAT__(a, b) a ## b
+#define @P@_CPU_CAT_(a, b) @P@_CPU_CAT__(a, b)
+#define @P@_CPU_CAT(a, b) @P@_CPU_CAT_(a, b)
+
+/**
+ * @def @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...)
+ * Call each enabled baseline feature sorted by lowest interest
+ * using preprocessor callback without testing whiher the
+ * feature is supported by CPU or not.
+ *
+ * Required for logging purposes only, for example, generating
+ * a Python list to hold the information of the enabled features.
+ *
+ * Unwrapped Version:
+ * @code
+ * #define @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...) \
+ *     @P@_CPU_EXPAND(EXEC_CB(SSE, __VA_ARGS__))   \
+ *     @P@_CPU_EXPAND(EXEC_CB(SSE2, __VA_ARGS__))  \
+ *     @P@_CPU_EXPAND(EXEC_CB(SSE3, __VA_ARGS__))
+ * @endcode
+ *
+ * @param EXEC_CB The preprocessor callback to be called for each enabled baseline feature.
+ * @param ... Additional arguments to be passed to the preprocessor callback.
+ */
+#define @P@WITH_CPU_BASELINE_CALL(EXEC_CB, ...) \
+@WITH_CPU_BASELINE_CALL@
+
+/**
+ * @def @P@WITH_CPU_DISPATCH_CALL(EXEC_CB, ...)
+ * Similar to the above but for enabled dispatched features.
+ *
+ * @param EXEC_CB The preprocessor callback to be called for each enabled dispatched feature.
+ * @param ... Additional arguments to be passed to the preprocessor callback.
+ */
+#define @P@WITH_CPU_DISPATCH_CALL(EXEC_CB, ...) \
+@WITH_CPU_DISPATCH_CALL@
+
+/*
+ * Defines the default behavior for the configurable macros derived from the configuration header
+ * that is generated by the meson function `mod_features.multi_targets()`.
+ *
+ * Note: Providing fallback in case of optimization disabled is no longer needed for meson
+ * since we always guarantee having configuration headers.
+ *
+ * However, it is still needed for compatibility with Numpy distutils.
+ */
+#ifndef @P@DISABLE_OPTIMIZATION
+    #define @P@MTARGETS_CONF_BASELINE(CB, ...) \
+         &&"Expected config header that generated by mod_features.multi_targets()";
+    #define @P@MTARGETS_CONF_DISPATCH(TEST_FEATURE_CB, CB, ...) \
+         &&"Expected config header that generated by mod_features.multi_targets()";
+#else
+    #define @P@MTARGETS_CONF_BASELINE(CB, ...) @P@_CPU_EXPAND(CB(__VA_ARGS__))
+    #define @P@MTARGETS_CONF_DISPATCH(CHK, CB, ...)
+#endif
+/**
+ * @def @P@CPU_DISPATCH_CURFX(NAME)
+ *
+ * Returns `NAME` suffixed with "_" + "the current target" during compiling
+ * the generated static libraries that are derived from the Meson function
+ * `mod_features.multi_targets()`.
+ *
+ * It also returns `NAME` as-is without any suffix when it comes to the baseline features or
+ * in case if the optimization is disabled.
+ *
+ * Note: `mod_features.multi_targets()` provides a unique target name within the compiler #definition
+ * called `@P@MTARGETS_CURRENT` on each generated library based on the specified features
+ * within its parameter 'dispatch:'.
+ *
+ * For example:
+ *
+ * @code
+ * # from meson
+ * mod_features.multi_targets(
+ *  'arithmetic.dispatch.h', 'arithmetic.c',
+ *   baseline: [SSE3], dispatch: [AVX512_SKX, AVX2],
+ *   prefix: '@P@'
+ * )
+ * @code
+ *
+ * @code
+ * void @P@CPU_DISPATCH_CURFX(add)(const int *src0, const int *src1, int *dst)
+ * {
+ * #ifdef @P@HAVE_AVX512F // one of the implied feature of AVX512_SKX
+ *   // code
+ * #elif defined(@P@HAVE_AVX2)
+ *   // code
+ * #elif defined(@P@HAVE_SSE3)
+ *   // CODE
+ * #else
+ *   // Fallback code in case of features enabled
+ * #endif
+ * }
+ * @endif
+ *
+ * // Unwrapped version :
+ * void add_AVX512_SKX(const int *src0, const int *src1, int *dst)
+ * {...}
+ * void add_AVX2(const int *src0, const int *src1, int *dst)
+ * {...}
+ * // baseline
+ * void add(const int *src0, const int *src1, int *dst)
+ * {...}
+ * @endcode
+ *
+ * @param NAME The base name of the dispatched function or variable.
+ */
+#ifdef @P@MTARGETS_CURRENT
+    // '@P@MTARGETS_CURRENT': only defined by the dispatchable sources
+    #define @P@CPU_DISPATCH_CURFX(NAME) @P@_CPU_CAT(@P@_CPU_CAT(NAME, _), @P@MTARGETS_CURRENT)
+#else
+    #define @P@CPU_DISPATCH_CURFX(NAME) @P@_CPU_EXPAND(NAME)
+#endif
+
+/**
+ * @def @P@CPU_DISPATCH_DECLARE(...)
+ *
+ * Provides forward declarations for the exported variables and functions
+ * based on the enabled baseline and dispatched features.
+ *
+ * This macro requires include the config file that been generated
+ * by meson function `mod_features.multi_targets()` to determine the enabled
+ * baseline and dispatched features.
+ *
+ * For example:
+ *
+ * @code
+ * # from meson
+ * mod_features.multi_targets(
+ *  'arithmetic.dispatch.h', 'arithmetic.c',
+ *   baseline: [SSE3], dispatch: [AVX512_SKX, AVX2],
+ *   prefix: '@P@'
+ * )
+ * @code
+ *
+ * @code
+ * // from C
+ * #include "arithmetic.dispatch.h"
+ * @P@CPU_DISPATCH_DECLARE(void add, (const int *src0, const int *src1, int *dst))
+ *
+ * // Unwrapped version:
+ * void add_AVX512_SKX(const int *src0, const int *src1, int *dst);
+ * void add_AVX2(const int *src0, const int *src1, int *dst);
+ * void add(const int *src0, const int *src1, int *dst); // baseline
+ * @endcode
+ *
+ * @param ... The function or variable prototype to be declared,
+ *            with the target-specific suffix added automatically.
+ */
+#define @P@CPU_DISPATCH_DECLARE(...) \
+    @P@MTARGETS_CONF_DISPATCH(@P@CPU_DISPATCH_DECLARE_CHK_, @P@CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__) \
+    @P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_DECLARE_BASE_CB_, __VA_ARGS__)
+
+// Preprocessor callbacks
+#define @P@CPU_DISPATCH_DECLARE_CB_(DUMMY, TARGET_NAME, LEFT, ...) \
+    @P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__;
+#define @P@CPU_DISPATCH_DECLARE_BASE_CB_(LEFT, ...) \
+    LEFT __VA_ARGS__;
+// Dummy CPU runtime checking
+#define @P@CPU_DISPATCH_DECLARE_CHK_(FEATURE_NAME)
+
+/**
+ * @def @P@CPU_DISPATCH_DECLARE_XB(LEFT, ...)
+ *
+ * Same as `@P@CPU_DISPATCH_DECLARE` but exclude the baseline declaration even
+ * if it was enabled within `mod_features.multi_targets()`.
+ */
+#define @P@CPU_DISPATCH_DECLARE_XB(...) \
+    @P@MTARGETS_CONF_DISPATCH(@P@CPU_DISPATCH_DECLARE_CHK_, @P@CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__)
+
+/**
+ * @def @P@CPU_DISPATCH_CALL(...)
+ *
+ * Helper macro used for runtime dispatching of the exported functions and variables
+ * within the meson `mod_features.multi_targets()` function.
+ *
+ * This macro dispatches only one symbol based on the order of the specified features within the meson function
+ * `mod_features.multi_targets()`. For example, if `mod_features.multi_targets()` is called with
+ * `dispatch: [features_highest_1, features_highest_2]`, the macro will test each enabled feature against
+ * the CPU at runtime. Once it fails, it will move to the next order until falling back to the baseline.
+ *
+ * Similar to `@P@CPU_DISPATCH_DECLARE`, this macro requires including the config file that has been generated
+ * by the meson function `mod_features.multi_targets()` to determine the enabled baseline and dispatched features.
+ *
+ * Example usage:
+ *
+ * @code
+ * # from meson
+ * mod_features.multi_targets(
+ *   'arithmetic.dispatch.h', 'arithmetic.c',
+ *   baseline: [SSE3], dispatch: [AVX512_SKX, AVX2],
+ *   prefix: '@P@'
+ * )
+ * @endcode
+ *
+ * @code
+ * // from C
+ * #include "arithmetic.dispatch.h"
+ *
+ * // Example 1:
+ * @P@CPU_DISPATCH_CALL(add, (src0, src1, dst));
+ *
+ * // Unwrapped version:
+ * @P@CPU_HAVE(AVX512_SKX) ? add_AVX512_SKX(src0, src1, dst) :
+ *     (@P@CPU_HAVE(AVX2) ? add_AVX2(src0, src1, dst) :
+ *         add(src0, src1, dst); // baseline
+ *
+ * // Example 2:
+ * typedef void (*func_type)(const int*, const int*, int*);
+ * func_type func = @P@CPU_DISPATCH_CALL(add);
+ *
+ * // Unwrapped version:
+ * func_type func2 = @P@CPU_HAVE(AVX512_SKX) ? add_AVX512_SKX :
+ *                     (@P@CPU_HAVE(AVX2) ? add_AVX2 :
+ *                         add; // baseline
+ *
+ * // Example 3:
+ * func_type func3;
+ * @P@CPU_DISPATCH_CALL(func3 = add);
+ *
+ * // Unwrapped version:
+ * func_type func2 = @P@CPU_HAVE(AVX512_SKX) ? func3 = add_AVX512_SKX :
+ *                     (@P@CPU_HAVE(AVX2) ? func3 = add_AVX2 :
+ *                         func3 = add; // baseline
+ *
+ * @endcode
+ *
+ * @param ... The function or variable prototype to be called or assigned,
+ *            with the target-specific suffix added automatically.
+ */
+#define @P@CPU_DISPATCH_CALL(...) \
+    @P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_CB_, __VA_ARGS__) \
+    @P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_CALL_BASE_CB_, __VA_ARGS__)
+// Preprocessor callbacks
+#define @P@CPU_DISPATCH_CALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+    (TESTED_FEATURES) ? (@P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) :
+#define @P@CPU_DISPATCH_CALL_BASE_CB_(LEFT, ...) \
+    (LEFT __VA_ARGS__)
+
+/**
+ * @def @P@CPU_DISPATCH_CALL_XB(LEFT, ...)
+ *
+ * Same as `@P@CPU_DISPATCH_CALL` but exclude the baseline call even
+ * if it was provided within meson `mod_features.multi_targets()`.
+ *
+ * Note: This macro returns void
+ */
+#define @P@CPU_DISPATCH_CALL_XB_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+    (TESTED_FEATURES) ? (void) (@P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) :
+#define @P@CPU_DISPATCH_CALL_XB(...) \
+    @P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_XB_CB_, __VA_ARGS__) \
+    ((void) 0 /* discarded expression value */)
+
+/**
+ * Macro @P@CPU_DISPATCH_CALL_ALL(...)
+ *
+ * Same as `@P@CPU_DISPATCH_CALL` but dispatching all the required optimizations for
+ * the exported functions and variables instead of highest interested one.
+ * Returns void.
+ */
+#define @P@CPU_DISPATCH_CALL_ALL(...) \
+    (@P@MTARGETS_CONF_DISPATCH(@P@CPU_HAVE, @P@CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \
+    @P@MTARGETS_CONF_BASELINE(@P@CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__))
+// Preprocessor callbacks
+#define @P@CPU_DISPATCH_CALL_ALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+    ((TESTED_FEATURES) ? (@P@_CPU_CAT(@P@_CPU_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : (void) 0),
+#define @P@CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \
+    ( LEFT __VA_ARGS__ )
+
+// Brings the headers files of enabled CPU features
+#ifdef @P@HAVE_SSE
+    #include <xmmintrin.h>
+#endif
+#ifdef @P@HAVE_SSE2
+    #include <emmintrin.h>
+#endif
+#ifdef @P@HAVE_SSE3
+    #include <pmmintrin.h>
+#endif
+#ifdef @P@HAVE_SSSE3
+    #include <tmmintrin.h>
+#endif
+#ifdef @P@HAVE_SSE41
+    #include <smmintrin.h>
+#endif
+#ifdef @P@HAVE_POPCNT
+    #ifdef _MSC_VER
+        #include <nmmintrin.h>
+    #else
+        #include <popcntintrin.h>
+    #endif
+#endif
+#ifdef @P@HAVE_AVX
+    #include <immintrin.h>
+#endif
+
+#if defined(@P@HAVE_XOP) || defined(@P@HAVE_FMA4)
+    #include <x86intrin.h>
+#endif
+
+#ifdef @P@HAVE_VSX
+    #include <altivec.h>
+#endif
+
+#ifdef @P@HAVE_VX
+    #include <vecintrin.h>
+#endif
+
+#ifdef @P@HAVE_NEON
+    #include <arm_neon.h>
+#endif
+#endif // @P@_CPU_DISPATCHER_CONF_H_
diff --git a/meson_cpu/meson.build b/meson_cpu/meson.build
new file mode 100644
index 000000000000..b99638bfc24f
--- /dev/null
+++ b/meson_cpu/meson.build
@@ -0,0 +1,307 @@
+# The CPU Dispatcher implementation.
+#
+# This script handles the CPU dispatcher and requires the Meson module
+# 'features'.
+#
+# The CPU dispatcher script is responsible for three main tasks:
+#
+# 1. Defining the enabled baseline and dispatched features by parsing build
+#    options or compiler arguments, including detection of native flags.
+#
+# 2. Specifying the baseline arguments and definitions across all sources.
+#
+# 3. Generating the main configuration file, which contains information about
+#    the enabled features, along with a collection of C macros necessary for
+#    runtime dispatching. For more details, see the template file
+#    `main_config.h.in`.
+#
+# This script exposes the following variables:
+#
+# - `CPU_BASELINE`: A set of CPU feature objects obtained from
+#                   `mod_features.new()`, representing the minimum CPU features
+#                   specified by the build option `-Dcpu-baseline`.
+#
+# - `CPU_BASELINE_NAMES`: A set of enabled CPU feature names, representing the
+#                         minimum CPU features specified by the build option
+#                         `-Dcpu-baseline`.
+#
+# - `CPU_DISPATCH_NAMES`: A set of enabled CPU feature names, representing the
+#                         additional CPU features that can be dispatched at
+#                         runtime, specified by the build option
+#                         `-Dcpu-dispatch`.
+#
+# - `CPU_FEATURES`: A dictionary containing all supported CPU feature objects.
+#
+# Additionally, this script exposes a set of variables that represent each
+# supported feature to be used within the Meson function
+# `mod_features.multi_targets()`.
+
+# Prefix used by all macros and features definitions
+CPU_CONF_PREFIX = 'NPY_'
+# main configuration name
+CPU_CONF_CONFIG = 'npy_cpu_dispatch_config.h'
+
+if get_option('disable-optimization')
+  add_project_arguments('-D' + CPU_CONF_PREFIX + 'DISABLE_OPTIMIZATION', language: ['c', 'cpp'])
+  CPU_CONF_BASELINE = 'none'
+  CPU_CONF_DISPATCH = 'none'
+else
+  baseline_detect = false
+  c_args = get_option('c_args')
+  foreach arg : c_args
+    foreach carch : ['-march', '-mcpu', '-xhost', '/QxHost']
+      if arg.contains(carch)
+        message('Appending option "detect" to "cpu-baseline" due to detecting global architecture c_arg "' + arg + '"')
+        baseline_detect = true
+        break
+      endif
+    endforeach
+    if baseline_detect
+      break
+    endif
+  endforeach
+  # The required minimal set of required CPU features.
+  CPU_CONF_BASELINE = get_option('cpu-baseline')
+  if baseline_detect
+    CPU_CONF_BASELINE += '+detect'
+  endif
+  # The required dispatched set of additional CPU features.
+  CPU_CONF_DISPATCH = get_option('cpu-dispatch')
+endif
+
+# Initialize the CPU features Export the X86 features objects 'SSE', 'AVX',
+# etc. plus a dictionary "X86_FEATURES" which maps to each object by its name
+subdir('x86')
+subdir('ppc64')
+subdir('s390x')
+subdir('arm')
+
+CPU_FEATURES = {}
+CPU_FEATURES += ARM_FEATURES
+CPU_FEATURES += X86_FEATURES
+CPU_FEATURES += PPC64_FEATURES
+CPU_FEATURES += S390X_FEATURES
+
+# Parse the requsted baseline (CPU_CONF_BASELINE) and dispatch features
+# (CPU_CONF_DISPATCH).
+cpu_family = host_machine.cpu_family()
+# Used by build option 'min'
+min_features = {
+  'x86': [SSE2],
+  'x86_64': [SSE3],
+  'ppc64': [],
+  's390x': [],
+  'arm': [],
+  'aarch64': [ASIMD]
+}.get(cpu_family, [])
+if host_machine.endian() == 'little' and cpu_family == 'ppc64'
+  min_features = [VSX2]
+endif
+
+# Used by build option 'max'
+max_features_dict = {
+  'x86': X86_FEATURES,
+  'x86_64': X86_FEATURES,
+  'ppc64': PPC64_FEATURES,
+  's390x': S390X_FEATURES,
+  'arm': ARM_FEATURES,
+  'aarch64': ARM_FEATURES,
+}.get(cpu_family, [])
+max_features = []
+foreach fet_name, fet_obj : max_features_dict
+  max_features += [fet_obj]
+endforeach
+
+parse_options = {
+  'cpu-baseline': CPU_CONF_BASELINE,
+  'cpu-dispatch': CPU_CONF_DISPATCH
+}
+parse_result = {
+  'cpu-baseline': [],
+  'cpu-dispatch': []
+}
+mod_features = import('features')
+foreach opt_name, conf : parse_options
+  # no support for regex :(?
+  tokens = conf.replace(',', ' ').replace('+', ' + ').replace('-', ' - ').strip().to_upper().split()
+  result = []
+  ignored = []
+  # append is the default
+  append = true
+  foreach tok : tokens
+    if tok == '+'
+      append = true
+      continue
+    elif tok == '-'
+      append = false
+      continue
+    elif tok == 'NONE'
+      continue
+    elif tok == 'NATIVE'
+      if not is_variable('cpu_native_features')
+        compiler_id = meson.get_compiler('c').get_id()
+        native_flags = {
+          'intel': '-xHost',
+          'intel-cl': '/QxHost',
+          # FIXME: Add support for fcc(-mcpu=a64fx) compiler
+        }.get(compiler_id, '-march=native')
+        test_native = mod_features.test(
+          max_features, anyfet: true,
+          force_args: [native_flags] + '-DDETECT_FEATURES'
+        )
+        if not test_native[0]
+          error('Option "native" doesn\'t support compiler', compiler_id)
+        endif
+        cpu_native_features = []
+        foreach fet_name : test_native[1].get('features')
+          cpu_native_features += CPU_FEATURES[fet_name]
+        endforeach
+      endif
+      accumulate = cpu_native_features
+    elif tok == 'DETECT'
+      if not is_variable('cpu_detect_features')
+        test_detect = mod_features.test(
+          max_features, anyfet: true,
+          force_args: ['-DDETECT_FEATURES'] + get_option('c_args')
+        )
+        cpu_detect_features = []
+        foreach fet_name : test_detect[1].get('features')
+          cpu_detect_features += CPU_FEATURES[fet_name]
+        endforeach
+      endif
+      accumulate = cpu_detect_features
+    elif tok == 'MIN'
+      accumulate = min_features
+    elif tok == 'MAX'
+      accumulate = max_features
+    elif tok in CPU_FEATURES
+      tokobj = CPU_FEATURES[tok]
+      if tokobj not in max_features
+        ignored += tok
+        continue
+      endif
+      accumulate = [tokobj]
+    else
+      error('Invalid token "'+tok+'" within option --'+opt_name)
+    endif
+    if append
+      foreach fet : accumulate
+        if fet not in result
+          result += fet
+        endif
+      endforeach
+    else
+      filterd = []
+      foreach fet : result
+        if fet not in accumulate
+          filterd += fet
+        endif
+      endforeach
+      result = filterd
+    endif # append
+  endforeach # tok : tokens
+  if ignored.length() > 0
+    message(
+      'During parsing ' + opt_name +
+      ': The following CPU features were ignored due to platform ' +
+      'incompatibility or lack of support:\n"' + ' '.join(ignored) + '"'
+    )
+  endif
+  if result.length() > 0
+    parse_result += {opt_name: mod_features.implicit_c(result)}
+  endif
+endforeach # opt_name, conf : parse_options
+
+# Test the baseline and dispatch features and set their flags and #definitions
+# across all sources.
+#
+# It is important to know that this test enables the maximum supported features
+# by the platform depending on the required features.
+#
+# For example, if the user specified `--cpu-baseline=avx512_skx`, and the
+# compiler doesn't support it, but still supports any of the implied features,
+# then we enable the maximum supported implied features, e.g., AVX2, which can
+# be done by specifying `anyfet: true` to the test function.
+if parse_result['cpu-baseline'].length() > 0
+    baseline = mod_features.test(parse_result['cpu-baseline'], anyfet: true)[1]
+    baseline_args = baseline['args']
+    foreach baseline_fet : baseline['defines']
+        baseline_args += ['-D' + CPU_CONF_PREFIX + 'HAVE_' + baseline_fet]
+    endforeach
+    add_project_arguments(baseline_args, language: ['c', 'cpp'])
+else
+    baseline = {}
+endif
+# The name of the baseline features including its implied features.
+CPU_BASELINE_NAMES = baseline.get('features', [])
+CPU_BASELINE = []
+foreach fet_name : CPU_BASELINE_NAMES
+  CPU_BASELINE += [CPU_FEATURES[fet_name]]
+endforeach
+# Loop over all initialized features and disable any feature that is not part
+# of the requested baseline and dispatch features to avoid it enabled by
+# import('feature').multi_targets
+foreach fet_name, fet_obj : CPU_FEATURES
+  if fet_obj in parse_result['cpu-dispatch'] or fet_name in CPU_BASELINE_NAMES
+    continue
+  endif
+  fet_obj.update(disable: 'Not part of the requsted features')
+endforeach
+
+CPU_DISPATCH_NAMES = []
+foreach fet_obj : parse_result['cpu-dispatch']
+  # skip baseline features
+  if fet_obj.get('name') in CPU_BASELINE_NAMES
+    continue
+  endif
+  fet_test = mod_features.test(fet_obj)
+  if not fet_test[0]
+    continue
+  endif
+  CPU_DISPATCH_NAMES += [fet_obj.get('name')]
+endforeach
+# Generate main configuration header see 'main_config.h.in' for more
+# clarification.
+main_config = {
+  'P': CPU_CONF_PREFIX,
+  'WITH_CPU_BASELINE': ' '.join(CPU_BASELINE_NAMES),
+  'WITH_CPU_BASELINE_N': CPU_BASELINE_NAMES.length(),
+  'WITH_CPU_DISPATCH': ' '.join(CPU_DISPATCH_NAMES),
+  'WITH_CPU_DISPATCH_N': CPU_DISPATCH_NAMES.length(),
+}
+clines = []
+macro_tpl = '@0@_CPU_EXPAND(EXEC_CB(@1@, __VA_ARGS__)) \\'
+foreach fet : CPU_BASELINE_NAMES
+  clines += macro_tpl.format(CPU_CONF_PREFIX, fet)
+endforeach
+main_config += {'WITH_CPU_BASELINE_CALL': '\n'.join(clines)}
+clines = []
+foreach fet : CPU_DISPATCH_NAMES
+  clines += macro_tpl.format(CPU_CONF_PREFIX, fet)
+endforeach
+main_config += {'WITH_CPU_DISPATCH_CALL': '\n'.join(clines)}
+
+configure_file(
+  input : 'main_config.h.in',
+  output : CPU_CONF_CONFIG,
+  configuration : configuration_data(main_config)
+)
+add_project_arguments(
+  '-I' + meson.current_build_dir(),
+  language: ['c', 'cpp']
+)
+
+message(
+'''
+CPU Optimization Options
+  baseline:
+    Requested : @0@
+    Enabled   : @1@
+  dispatch:
+    Requested : @2@
+    Enabled   : @3@
+'''.format(
+    CPU_CONF_BASELINE, ' '.join(CPU_BASELINE_NAMES),
+    CPU_CONF_DISPATCH, ' '.join(CPU_DISPATCH_NAMES)
+  )
+)
diff --git a/meson_cpu/ppc64/meson.build b/meson_cpu/ppc64/meson.build
new file mode 100644
index 000000000000..d14b23703fe3
--- /dev/null
+++ b/meson_cpu/ppc64/meson.build
@@ -0,0 +1,38 @@
+source_root = meson.project_source_root()
+mod_features = import('features')
+compiler_id = meson.get_compiler('c').get_id()
+
+VSX = mod_features.new(
+  'VSX', 1, args: '-mvsx',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx.c')[0],
+  extra_tests: {
+    'VSX_ASM': files(source_root + '/numpy/distutils/checks/extra_vsx_asm.c')[0]
+  }
+)
+if compiler_id == 'clang'
+  VSX.update(args: ['-mvsx', '-maltivec'])
+endif
+VSX2 = mod_features.new(
+  'VSX2', 2, implies: VSX, args: {'val': '-mcpu=power8', 'match': '.*vsx'},
+  detect: {'val': 'VSX2', 'match': 'VSX'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx2.c')[0],
+)
+# VSX2 is hardware baseline feature on ppc64le since the first little-endian
+# support was part of Power8
+if host_machine.endian() == 'little'
+  VSX.update(implies: VSX2)
+endif
+VSX3 = mod_features.new(
+  'VSX3', 3, implies: VSX2, args: {'val': '-mcpu=power9', 'match': '.*[mcpu=|vsx].*'},
+  detect: {'val': 'VSX3', 'match': 'VSX.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx3.c')[0],
+)
+VSX4 = mod_features.new(
+  'VSX4', 4, implies: VSX3, args: {'val': '-mcpu=power10', 'match': '.*[mcpu=|vsx].*'},
+  detect: {'val': 'VSX4', 'match': 'VSX.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vsx3.c')[0],
+  extra_tests: {
+    'VSX4_MMA': files(source_root + '/numpy/distutils/checks/extra_vsx4_mma.c')[0]
+  }
+)
+PPC64_FEATURES = {'VSX': VSX, 'VSX2': VSX2, 'VSX3': VSX3, 'VSX4': VSX4}
diff --git a/meson_cpu/s390x/meson.build b/meson_cpu/s390x/meson.build
new file mode 100644
index 000000000000..a69252d1607c
--- /dev/null
+++ b/meson_cpu/s390x/meson.build
@@ -0,0 +1,18 @@
+source_root = meson.project_source_root()
+mod_features = import('features')
+
+VX = mod_features.new(
+  'VX', 1, args: ['-mzvector', '-march=arch11'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vx.c')[0],
+)
+VXE = mod_features.new(
+  'VXE', 2, implies: VX, args: {'val': '-march=arch12', 'match': '-march=.*'},
+  detect: {'val': 'VXE', 'match': 'VX'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vxe.c')[0],
+)
+VXE2 = mod_features.new(
+  'VXE2', 3, implies: VXE, args: {'val': '-march=arch13', 'match': '-march=.*'},
+  detect: {'val': 'VXE2', 'match': 'VX.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_vxe2.c')[0],
+)
+S390X_FEATURES = {'VX': VX, 'VXE': VXE, 'VXE2': VXE2}
diff --git a/meson_cpu/x86/meson.build b/meson_cpu/x86/meson.build
new file mode 100644
index 000000000000..caf6bf09c14e
--- /dev/null
+++ b/meson_cpu/x86/meson.build
@@ -0,0 +1,227 @@
+source_root = meson.project_source_root()
+mod_features = import('features')
+
+SSE = mod_features.new(
+  'SSE', 1,  args: '-msse',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse.c')[0]
+)
+SSE2 = mod_features.new(
+  'SSE2', 2, implies: SSE,
+  args: '-msse2',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse2.c')[0]
+)
+# enabling SSE without SSE2 is useless also it's non-optional for x86_64
+SSE.update(implies: SSE2)
+SSE3 = mod_features.new(
+  'SSE3', 3, implies: SSE2,
+  args: '-msse3',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse3.c')[0]
+)
+SSSE3 = mod_features.new(
+  'SSSE3', 4, implies: SSE3,
+  args: '-mssse3',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_ssse3.c')[0]
+)
+SSE41 = mod_features.new(
+  'SSE41', 5, implies: SSSE3,
+  args: '-msse4.1',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse41.c')[0]
+)
+POPCNT = mod_features.new(
+  'POPCNT', 6, implies: SSE41,
+  args: '-mpopcnt',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_popcnt.c')[0]
+)
+SSE42 = mod_features.new(
+  'SSE42', 7, implies: POPCNT, args: '-msse4.2',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse42.c')[0]
+)
+# 7-20 left as margin for any extra features
+AVX = mod_features.new(
+  'AVX', 20, implies: SSE42, args: '-mavx',
+  detect: {'val': 'AVX', 'match': '.*SSE.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx.c')[0]
+)
+XOP = mod_features.new(
+  'XOP', 21, implies: AVX, args: '-mxop',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_xop.c')[0]
+)
+FMA4 = mod_features.new(
+  'FMA4', 22, implies: AVX, args: '-mfma4',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_fma4.c')[0]
+)
+# x86 half-precision
+F16C = mod_features.new(
+  'F16C', 23, implies: AVX, args: '-mf16c',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_f16c.c')[0]
+)
+FMA3 = mod_features.new(
+  'FMA3', 24, implies: F16C, args: '-mfma',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_fma3.c')[0]
+)
+AVX2 = mod_features.new(
+  'AVX2', 25, implies: F16C, args: '-mavx2',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx2.c')[0]
+)
+# 25-40 left as margin for any extra features
+AVX512F = mod_features.new(
+  'AVX512F', 40, implies: [FMA3, AVX2],
+  # Disables mmx because of stack corruption that may happen during mask
+  # conversions.
+  # TODO (seiko2plus): provide more clarification
+  args: ['-mno-mmx', '-mavx512f'],
+  detect: {'val': 'AVX512F', 'match': '.*'},
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512f.c')[0],
+  extra_tests: {
+    'AVX512F_REDUCE': files(source_root + '/numpy/distutils/checks/extra_avx512f_reduce.c')[0]
+  }
+)
+AVX512CD = mod_features.new(
+  'AVX512CD', 41, implies: AVX512F, args: '-mavx512cd',
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512cd.c')[0]
+)
+AVX512_KNL = mod_features.new(
+  'AVX512_KNL', 42, implies: AVX512CD, args: ['-mavx512er', '-mavx512pf'],
+  group: ['AVX512ER', 'AVX512PF'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_knl.c')[0]
+)
+AVX512_KNM = mod_features.new(
+  'AVX512_KNM', 43, implies: AVX512_KNL,
+  args: ['-mavx5124fmaps', '-mavx5124vnniw', '-mavx512vpopcntdq'],
+  group: ['AVX5124FMAPS', 'AVX5124VNNIW', 'AVX512VPOPCNTDQ'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_knm.c')[0]
+)
+AVX512_SKX = mod_features.new(
+  'AVX512_SKX', 50, implies: AVX512CD,
+  args: ['-mavx512vl', '-mavx512bw', '-mavx512dq'],
+  group: ['AVX512VL', 'AVX512BW', 'AVX512DQ'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_skx.c')[0],
+  extra_tests: {
+    'AVX512BW_MASK': files(source_root + '/numpy/distutils/checks/extra_avx512bw_mask.c')[0],
+    'AVX512DQ_MASK': files(source_root + '/numpy/distutils/checks/extra_avx512dq_mask.c')[0]
+  }
+)
+AVX512_CLX = mod_features.new(
+  'AVX512_CLX', 51, implies: AVX512_SKX, args: '-mavx512vnni',
+  group: ['AVX512VNNI'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_clx.c')[0]
+)
+AVX512_CNL = mod_features.new(
+  'AVX512_CNL', 52, implies: AVX512_SKX,
+  args: ['-mavx512ifma', '-mavx512vbmi'],
+  group: ['AVX512IFMA', 'AVX512VBMI'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_cnl.c')[0]
+)
+AVX512_ICL = mod_features.new(
+  'AVX512_ICL', 53, implies: [AVX512_CLX, AVX512_CNL],
+  args: ['-mavx512vbmi2', '-mavx512bitalg', '-mavx512vpopcntdq'],
+  group: ['AVX512VBMI2', 'AVX512BITALG', 'AVX512VPOPCNTDQ'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_icl.c')[0]
+)
+# TODO add support for zen4
+AVX512_SPR = mod_features.new(
+  'AVX512_SPR', 55, implies: AVX512_ICL,
+  args: ['-mavx512fp16'],
+  group: ['AVX512FP16'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_spr.c')[0]
+)
+
+# Specializations for non unix-like compilers
+# -------------------------------------------
+cpu_family = host_machine.cpu_family()
+compiler_id = meson.get_compiler('c').get_id()
+if compiler_id not in ['gcc', 'clang']
+  AVX512_SPR.update(disable: compiler_id + ' compiler does not support it')
+endif
+
+# Common specializations between both Intel compilers (unix-like and msvc-like)
+if compiler_id in ['intel', 'intel-cl']
+  # POPCNT, and F16C don't own private FLAGS however the compiler still
+  # provides ISA capability for them.
+  POPCNT.update(args: '')
+  F16C.update(args: '')
+  # Intel compilers don't support the following features independently
+  FMA3.update(implies: [F16C, AVX2])
+  AVX2.update(implies: [F16C, FMA3])
+  AVX512F.update(implies: [AVX2, AVX512CD, AVX512_SKX])
+  AVX512CD.update(implies: [AVX512F, AVX512_SKX])
+  XOP.update(disable: 'Intel Compiler does not support it')
+  FMA4.update(disable: 'Intel Compiler does not support it')
+endif
+
+if compiler_id == 'intel-cl'
+  foreach fet : [SSE, SSE2, SSE3, SSSE3, AVX]
+    fet.update(args: {'val': '/arch:' + fet.get('name'), 'match': '/arch:.*'})
+  endforeach
+  SSE41.update(args: {'val': '/arch:SSE4.1', 'match': '/arch:.*'})
+  SSE42.update(args: {'val': '/arch:SSE4.2', 'match': '/arch:.*'})
+  FMA3.update(args: {'val': '/arch:CORE-AVX2', 'match': '/arch:.*'})
+  AVX2.update(args: {'val': '/arch:CORE-AVX2', 'match': '/arch:.*'})
+  AVX512F.update(args: {'val': '/Qx:COMMON-AVX512', 'match': '/arch:.*'})
+  AVX512CD.update(args: {'val': '/Qx:COMMON-AVX512', 'match': '/arch:.*'})
+  AVX512_KNL.update(args: {'val': '/Qx:KNL', 'match': '/[arch|Qx]:.*'})
+  AVX512_KNM.update(args: {'val': '/Qx:KNM', 'match': '/[arch|Qx]:.*'})
+  AVX512_SKX.update(args: {'val': '/Qx:SKYLAKE-AVX512', 'match': '/[arch|Qx]:.*'})
+  AVX512_CLX.update(args: {'val': '/Qx:CASCADELAKE', 'match': '/[arch|Qx]:.*'})
+  AVX512_CNL.update(args: {'val': '/Qx:CANNONLAKE', 'match': '/[arch|Qx]:.*'})
+  AVX512_ICL.update(args: {'val': '/Qx:ICELAKE-CLIENT', 'match': '/[arch|Qx]:.*'})
+endif
+
+if compiler_id == 'intel'
+  clear_m = '^(-mcpu=|-march=)'
+  clear_any = '^(-mcpu=|-march=|-x[A-Z0-9\-])'
+  FMA3.update(args: {'val': '-march=core-avx2', 'match': clear_m})
+  AVX2.update(args: {'val': '-march=core-avx2', 'match': clear_m})
+  AVX512F.update(args: {'val': '-march=common-avx512', 'match': clear_m})
+  AVX512CD.update(args: {'val': '-march=common-avx512', 'match': clear_m})
+  AVX512_KNL.update(args: {'val': '-xKNL', 'match': clear_any})
+  AVX512_KNM.update(args: {'val': '-xKNM', 'match': clear_any})
+  AVX512_SKX.update(args: {'val': '-xSKYLAKE-AVX512', 'match': clear_any})
+  AVX512_CLX.update(args: {'val': '-xCASCADELAKE', 'match': clear_any})
+  AVX512_CNL.update(args: {'val': '-xCANNONLAKE', 'match': clear_any})
+  AVX512_ICL.update(args: {'val': '-xICELAKE-CLIENT', 'match': clear_any})
+endif
+
+if compiler_id == 'msvc'
+  # MSVC compiler doesn't support the following features
+  foreach fet : [AVX512_KNL, AVX512_KNM]
+    fet.update(disable: compiler_id + ' compiler does not support it')
+  endforeach
+  # The following features don't own private FLAGS, however the compiler still
+  # provides ISA capability for them.
+  foreach fet : [
+    SSE3, SSSE3, SSE41, POPCNT, SSE42, AVX, F16C, XOP, FMA4,
+    AVX512F, AVX512CD, AVX512_CLX, AVX512_CNL,
+    AVX512_ICL
+  ]
+    fet.update(args: '')
+  endforeach
+  # MSVC compiler doesn't support the following features independently
+  FMA3.update(implies: [F16C, AVX2])
+  AVX2.update(implies: [F16C, FMA3])
+  AVX512F.update(implies: [AVX2, AVX512CD, AVX512_SKX])
+  AVX512CD.update(implies: [AVX512F, AVX512_SKX])
+  clear_arch = '/arch:.*'
+  # only available on 32-bit. Its enabled by default on 64-bit mode
+  foreach fet : [SSE, SSE2]
+    if cpu_family == 'x86'
+      fet.update(args: {'val': '/arch:' + fet.get('name'), 'match': clear_arch})
+    else
+      fet.update(args: '')
+    endif
+  endforeach
+  FMA3.update(args: {'val': '/arch:AVX2', 'match': clear_arch})
+  AVX2.update(args: {'val': '/arch:AVX2', 'match': clear_arch})
+  AVX512_SKX.update(args: {'val': '/arch:AVX512', 'match': clear_arch})
+endif
+
+X86_FEATURES = {
+ 'SSE': SSE, 'SSE2': SSE2, 'SSE3': SSE3, 'SSSE3': SSSE3,
+ 'SSE41': SSE41, 'POPCNT': POPCNT, 'SSE42': SSE42, 'AVX': AVX,
+ 'XOP': XOP, 'FMA4': FMA4, 'F16C': F16C, 'FMA3': FMA3,
+ 'AVX2': AVX2, 'AVX512F': AVX512F, 'AVX512CD': AVX512CD,
+ 'AVX512_KNL': AVX512_KNL, 'AVX512_KNM': AVX512_KNM,
+ 'AVX512_SKX': AVX512_SKX, 'AVX512_CLX': AVX512_CLX,
+ 'AVX512_CNL': AVX512_CNL, 'AVX512_ICL': AVX512_ICL,
+ 'AVX512_SPR': AVX512_SPR
+}
diff --git a/meson_options.txt b/meson_options.txt
index 7ce4eefacd89..8b1fad6c4041 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -1,7 +1,7 @@
 option('blas', type: 'string', value: 'openblas',
-        description: 'option for BLAS library switching')
+        description: 'Option for BLAS library switching')
 option('lapack', type: 'string', value: 'openblas',
-        description: 'option for LAPACK library switching')
+        description: 'Option for LAPACK library switching')
 option('allow-noblas', type: 'boolean', value: false,
         description: 'If set to true, allow building with (slow!) internal fallback routines')
 option('use-ilp64', type: 'boolean', value: false,
@@ -12,8 +12,22 @@ option('disable-svml', type: 'boolean', value: false,
         description: 'Disable building against SVML')
 option('disable-threading', type: 'boolean', value: false,
         description: 'Disable threading support (see `NPY_ALLOW_THREADS` docs)')
-# TODO: flip value to 'false' once we have `npy_cpu_dispatch_config.h` & co.
-option('disable-simd-optimizations', type: 'boolean', value: true,
-        description: 'Disable SIMD features beyond the baseline ones')
+option('disable-optimization', type: 'boolean', value: false,
+        description: 'Disable CPU optimized code (dispatch,simd,unroll...)')
+option('cpu-baseline', type: 'string', value: 'min',
+        description: 'Minimal set of required CPU features')
+option('cpu-dispatch', type: 'string', value: 'max -xop -fma4',
+        description: 'Dispatched set of additional CPU features')
+option('test-simd', type: 'array',
+        value: [
+          'BASELINE', 'SSE2', 'SSE42', 'XOP', 'FMA4',
+          'AVX2', 'FMA3', 'AVX2,FMA3', 'AVX512F', 'AVX512_SKX',
+          'VSX', 'VSX2', 'VSX3', 'VSX4',
+          'NEON', 'ASIMD',
+          'VX', 'VXE', 'VXE2',
+        ],
+        description: 'Specify a list of CPU features to be tested against NumPy SIMD interface')
+option('test-simd-args', type: 'string', value: '',
+        description: 'Extra args to be passed to the `_simd` module that is used for testing the NumPy SIMD interface')
 option('relaxed-strides-debug', type: 'boolean', value: false,
         description: 'Enable relaxed strides debug mode (see `NPY_RELAXED_STRIDES_DEBUG` docs)')
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 17760efa2fc0..ccc060aacb96 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -84,6 +84,7 @@ cdata.set('NPY_API_VERSION', C_API_VERSION)
 use_svml = (
   host_machine.system() == 'linux' and
   host_machine.cpu_family() == 'x86_64' and
+  ('AVX512_SKX' in CPU_DISPATCH_NAMES or 'AVX512_SKX' in CPU_BASELINE_NAMES) and
   not get_option('disable-svml')
 )
 if use_svml
@@ -291,9 +292,6 @@ endforeach
 
 # SSE headers only enabled automatically on amd64/x32 builds
 optional_headers = [
-  'xmmintrin.h',  # SSE
-  'emmintrin.h',  # SSE2
-  'immintrin.h',  # AVX
   'features.h',   # for glibc version linux
   'xlocale.h',    # see GH#8367
   'dlfcn.h',      # dladdr
@@ -322,6 +320,15 @@ optional_function_attributes = [
 #  endif
 #endforeach
 
+# Max possible optimization flags. We pass this flags to all our dispatch-able
+# (multi_targets) sources.
+compiler_id = cc.get_id()
+max_opt = {
+  'msvc': ['/O2'],
+  'intel-cl': ['/O3'],
+}.get(compiler_id, ['-O3'])
+max_opt = cc.has_multi_arguments(max_opt) ? max_opt : []
+
 # Optional GCC compiler builtins and their call arguments.
 # If given, a required header and definition name (HAVE_ prepended)
 # Call arguments are required as the compiler will do strict signature checking
@@ -513,12 +520,6 @@ if cc.get_id() == 'msvc'
      staticlib_cflags +=  '-d2VolatileMetadata-'
   endif
 endif
-# TODO: change to "feature" option in meson_options.txt? See
-# https://mesonbuild.com/Build-options.html#build-options
-if get_option('disable-simd-optimizations')
-  staticlib_cflags += '-DNPY_DISABLE_OPTIMIZATION'
-  staticlib_cppflags += '-DNPY_DISABLE_OPTIMIZATION'
-endif
 
 npy_math_internal_h = custom_target(
   output: 'npy_math_internal.h',
@@ -626,19 +627,10 @@ src_ufunc_api = custom_target('__ufunc_api',
 
 # Set common build flags for C and C++ code
 # -----------------------------------------
-
-# TODO: change to "feature" option in meson_options.txt? See
-# https://mesonbuild.com/Build-options.html#build-options
-disable_simd_optimizations = []
-if get_option('disable-simd-optimizations')
-  disable_simd_optimizations = '-DNPY_DISABLE_OPTIMIZATION'
-endif
-
 # Common build flags
 c_args_common = [
   '-DNPY_INTERNAL_BUILD',
   '-DHAVE_NPY_CONFIG_H',
-  disable_simd_optimizations,
   cflags_large_file_support,
 ]
 
@@ -667,11 +659,9 @@ np_core_dep = declare_dependency(
     '.',
     'include',
     'src/common',
-  ],
-  compile_args: disable_simd_optimizations
+  ]
 )
 
-
 # Build multiarray_tests module
 # -----------------------------
 py.extension_module('_multiarray_tests',
@@ -691,15 +681,30 @@ py.extension_module('_multiarray_tests',
   subdir: 'numpy/core',
 )
 
+_umath_tests_mtargets = mod_features.multi_targets(
+  '_umath_tests.dispatch.h',
+  'src/umath/_umath_tests.dispatch.c',
+  dispatch: [
+    AVX2, SSE41, SSE2,
+    ASIMDHP, ASIMD, NEON,
+    VSX3, VSX2, VSX,
+    VXE, VX,
+  ],
+  baseline: CPU_BASELINE,
+  prefix: 'NPY_',
+  dependencies: [py_dep, np_core_dep]
+)
+
 test_modules_src = [
   ['_umath_tests', [
       src_file.process('src/umath/_umath_tests.c.src'),
-      'src/umath/_umath_tests.dispatch.c',
       'src/common/npy_cpu_features.c',
-    ]],
-  ['_rational_tests', 'src/umath/_rational_tests.c'],
-  ['_struct_ufunc_tests', 'src/umath/_struct_ufunc_tests.c'],
-  ['_operand_flag_tests', 'src/umath/_operand_flag_tests.c'],
+    ],
+    _umath_tests_mtargets.static_lib('_umath_tests_mtargets')
+  ],
+  ['_rational_tests', 'src/umath/_rational_tests.c', []],
+  ['_struct_ufunc_tests', 'src/umath/_struct_ufunc_tests.c', []],
+  ['_operand_flag_tests', 'src/umath/_operand_flag_tests.c', []],
 ]
 foreach gen: test_modules_src
   py.extension_module(gen[0],
@@ -709,7 +714,261 @@ foreach gen: test_modules_src
     dependencies: np_core_dep,
     install: true,
     subdir: 'numpy/core',
+    link_with: gen[2],
+  )
+endforeach
+
+# Build multiarray dispatch-able sources
+# --------------------------------------
+multiarray_gen_headers = [
+  src_file.process('src/multiarray/arraytypes.h.src'),
+  src_file.process('src/common/npy_sort.h.src'),
+]
+foreach gen_mtargets : [
+  [
+    'argfunc.dispatch.h',
+    src_file.process('src/multiarray/argfunc.dispatch.c.src'),
+    [
+      AVX512_SKX, AVX2, XOP, SSE42, SSE2,
+      VSX2,
+      ASIMD, NEON,
+      VXE, VX
+    ]
+  ],
+]
+  mtargets = mod_features.multi_targets(
+    gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1],
+    dispatch: gen_mtargets[2],
+    baseline: CPU_BASELINE,
+    prefix: 'NPY_',
+    dependencies: [py_dep, np_core_dep],
+    c_args: c_args_common + max_opt,
+    cpp_args: cpp_args_common + max_opt,
+    include_directories: [
+      'include',
+      'src/common',
+      'src/multiarray',
+      'src/npymath',
+      'src/umath'
+    ]
+  )
+  if not is_variable('multiarray_umath_mtargets')
+    multiarray_umath_mtargets = mtargets
+  else
+    multiarray_umath_mtargets.extend(mtargets)
+  endif
+endforeach
+
+# Build npysort dispatch-able sources
+# -----------------------------------
+foreach gen_mtargets : [
+  [
+    'simd_qsort.dispatch.h',
+    'src/npysort/simd_qsort.dispatch.cpp',
+    [AVX512_SKX]
+  ],
+  [
+    'simd_qsort_16bit.dispatch.h',
+    'src/npysort/simd_qsort_16bit.dispatch.cpp',
+    [AVX512_SPR, AVX512_ICL]
+  ],
+]
+  mtargets = mod_features.multi_targets(
+    gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1],
+    dispatch: gen_mtargets[2],
+    # baseline: CPU_BASELINE, it doesn't provide baseline fallback
+    prefix: 'NPY_',
+    dependencies: [py_dep, np_core_dep],
+    c_args: c_args_common + max_opt,
+    cpp_args: cpp_args_common + max_opt,
+    include_directories: [
+      'include',
+      'src/common',
+      'src/multiarray',
+      'src/npymath',
+      'src/umath'
+    ]
   )
+  if not is_variable('multiarray_umath_mtargets')
+    multiarray_umath_mtargets = mtargets
+  else
+    multiarray_umath_mtargets.extend(mtargets)
+  endif
+endforeach
+
+# Build umath dispatch-able sources
+# ---------------------------------
+mod_features = import('features')
+umath_gen_headers = [
+  src_file.process('src/umath/loops.h.src'),
+  src_file.process('src/umath/loops_utils.h.src'),
+]
+
+foreach gen_mtargets : [
+  [
+    'loops_arithm_fp.dispatch.h',
+    src_file.process('src/umath/loops_arithm_fp.dispatch.c.src'),
+    [
+      [AVX2, FMA3], SSE2,
+      ASIMD, NEON,
+      VSX3, VSX2,
+      VXE, VX,
+    ]
+  ],
+  [
+    'loops_arithmetic.dispatch.h',
+    src_file.process('src/umath/loops_arithmetic.dispatch.c.src'),
+    [
+      AVX512_SKX, AVX512F, AVX2, SSE41, SSE2,
+      NEON,
+      VSX4, VSX2,
+      VX,
+    ]
+  ],
+  [
+    'loops_comparison.dispatch.h',
+    src_file.process('src/umath/loops_comparison.dispatch.c.src'),
+    [
+      AVX512_SKX, AVX512F, AVX2, SSE42, SSE2,
+      VSX3, VSX2,
+      NEON,
+      VXE, VX,
+    ]
+  ],
+  [
+    'loops_exponent_log.dispatch.h',
+    src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
+    # Enabling SIMD on clang-cl raises spurious FP exceptions
+    # TODO (seiko2plus): debug spurious FP exceptions for single-precision log/exp
+    compiler_id == 'clang-cl' ? [] : [
+      AVX512_SKX, AVX512F, [AVX2, FMA3]
+    ]
+  ],
+  [
+    'loops_hyperbolic.dispatch.h',
+    src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'),
+    [
+      AVX512_SKX, [AVX2, FMA3],
+      VSX4, VSX2,
+      NEON_VFPV4,
+      VXE, VX
+    ]
+  ],
+  [
+    'loops_logical.dispatch.h',
+    src_file.process('src/umath/loops_logical.dispatch.c.src'),
+    [
+      ASIMD, NEON,
+      AVX512_SKX, AVX2, SSE2,
+      VSX2,
+      VX,
+    ]
+  ],
+  [
+    'loops_minmax.dispatch.h',
+    src_file.process('src/umath/loops_minmax.dispatch.c.src'),
+    [
+      ASIMD, NEON,
+      AVX512_SKX, AVX2, SSE2,
+      VSX2,
+      VXE, VX,
+    ]
+  ],
+  [
+    'loops_modulo.dispatch.h',
+    src_file.process('src/umath/loops_modulo.dispatch.c.src'),
+    [
+      VSX4
+    ]
+  ],
+  [
+    'loops_trigonometric.dispatch.h',
+    src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
+    # Enabling SIMD on clang-cl raises spurious FP exceptions
+    # TODO (seiko2plus): debug spurious FP exceptions for single-precision sin/cos
+    compiler_id == 'clang-cl' ? [] : [
+      AVX512F, [AVX2, FMA3],
+      VSX4, VSX3, VSX2,
+      NEON_VFPV4,
+      VXE2, VXE
+    ]
+  ],
+  [
+    'loops_umath_fp.dispatch.h',
+    src_file.process('src/umath/loops_umath_fp.dispatch.c.src'),
+    [AVX512_SKX]
+  ],
+  [
+    'loops_unary.dispatch.h',
+    src_file.process('src/umath/loops_unary.dispatch.c.src'),
+    [
+      ASIMD, NEON,
+      AVX512_SKX, AVX2, SSE2,
+      VSX2,
+      VXE, VX
+    ]
+  ],
+  [
+    'loops_unary_fp.dispatch.h',
+    src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
+    [
+      SSE41, SSE2,
+      VSX2,
+      ASIMD, NEON,
+      VXE, VX
+    ]
+  ],
+  [
+    'loops_unary_fp_le.dispatch.h',
+    src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
+    [
+      SSE41, SSE2,
+      VSX2,
+      ASIMD, NEON,
+    ]
+  ],
+  [
+    'loops_unary_complex.dispatch.h',
+    src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
+    [
+      AVX512F, [AVX2, FMA3], SSE2,
+      ASIMD, NEON,
+      VSX3, VSX2,
+      VXE, VX,
+    ]
+  ],
+  [
+    'loops_autovec.dispatch.h',
+    src_file.process('src/umath/loops_autovec.dispatch.c.src'),
+    [
+      AVX2, SSE2,
+      NEON,
+      VSX2,
+      VX,
+    ]
+  ],
+]
+  mtargets = mod_features.multi_targets(
+    gen_mtargets[0], umath_gen_headers + gen_mtargets[1],
+    dispatch: gen_mtargets[2],
+    baseline: CPU_BASELINE,
+    prefix: 'NPY_',
+    dependencies: [py_dep, np_core_dep],
+    c_args: c_args_common + max_opt,
+    cpp_args: cpp_args_common + max_opt,
+    include_directories: [
+      'include',
+      'src/common',
+      'src/multiarray',
+      'src/npymath',
+      'src/umath'
+    ]
+  )
+  if not is_variable('multiarray_umath_mtargets')
+    multiarray_umath_mtargets = mtargets
+  else
+    multiarray_umath_mtargets.extend(mtargets)
+  endif
 endforeach
 
 # Build _multiarray_umath module
@@ -733,12 +992,10 @@ if have_blas
   ]
 endif
 
-src_multiarray = [
+src_multiarray = multiarray_gen_headers + [
   'src/multiarray/abstractdtypes.c',
   'src/multiarray/alloc.c',
-  src_file.process('src/multiarray/argfunc.dispatch.c.src'),
   'src/multiarray/arrayobject.c',
-  src_file.process('src/multiarray/arraytypes.h.src'),
   'src/multiarray/array_coercion.c',
   'src/multiarray/array_method.c',
   'src/multiarray/array_assign_scalar.c',
@@ -792,9 +1049,6 @@ src_multiarray = [
   'src/multiarray/typeinfo.c',
   'src/multiarray/usertypes.c',
   'src/multiarray/vdot.c',
-  src_file.process('src/common/npy_sort.h.src'),
-  'src/npysort/simd_qsort.dispatch.cpp',
-  'src/npysort/simd_qsort_16bit.dispatch.cpp',
   'src/npysort/quicksort.cpp',
   'src/npysort/mergesort.cpp',
   'src/npysort/timsort.cpp',
@@ -817,26 +1071,9 @@ src_multiarray = [
   'src/npymath/arm64_exports.c',
 ]
 
-src_umath = [
+src_umath = umath_gen_headers + [
   src_file.process('src/umath/funcs.inc.src'),
-  src_file.process('src/umath/loops.h.src'),
-  src_file.process('src/umath/loops_utils.h.src'),
   src_file.process('src/umath/loops.c.src'),
-  src_file.process('src/umath/loops_arithm_fp.dispatch.c.src'),
-  src_file.process('src/umath/loops_arithmetic.dispatch.c.src'),
-  src_file.process('src/umath/loops_comparison.dispatch.c.src'),
-  src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
-  src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'),
-  src_file.process('src/umath/loops_logical.dispatch.c.src'),
-  src_file.process('src/umath/loops_minmax.dispatch.c.src'),
-  src_file.process('src/umath/loops_modulo.dispatch.c.src'),
-  src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
-  src_file.process('src/umath/loops_umath_fp.dispatch.c.src'),
-  src_file.process('src/umath/loops_unary.dispatch.c.src'),
-  src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
-  src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
-  src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
-  src_file.process('src/umath/loops_autovec.dispatch.c.src'),
   src_file.process('src/umath/matmul.c.src'),
   src_file.process('src/umath/matmul.h.src'),
   'src/umath/ufunc_type_resolution.c',
@@ -863,52 +1100,24 @@ src_umath = [
 # may be able to avoid the accuracy regressions in SVML.
 svml_objects = []
 if use_svml
-  svml_objects += [
-    'src/umath/svml/linux/avx512/svml_z0_acos_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_acos_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_acosh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_acosh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_asin_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_asin_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_asinh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_asinh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atan2_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atan2_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atan_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atan_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atanh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_atanh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cbrt_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cbrt_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cos_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cos_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cosh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_cosh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_exp2_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_exp2_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_exp_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_exp_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_expm1_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_expm1_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log10_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log10_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log1p_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log1p_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log2_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log2_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_log_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_pow_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_pow_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_sin_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_sin_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_sinh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_sinh_s_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_tan_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_tan_s_la.s',
-    # 'src/umath/svml/linux/avx512/svml_z0_tanh_d_la.s',
-    'src/umath/svml/linux/avx512/svml_z0_tanh_s_la.s',
+  foreach svml_func : [
+    'acos', 'acosh', 'asin',
+    'asinh', 'atan2',
+    'atan',  'atanh',
+    'cbrt', 'cos',
+    'cosh', 'exp2',
+    'exp', 'expm1',
+    'log10', 'log1p',
+    'log2', 'log',
+    'pow', 'sin', 'sinh', 'tan',
+    'tanh'
   ]
+    foreach svml_sfx : ['d_la', 's_la', 'd_ha', 's_la']
+      svml_objects += [
+        'src/umath/svml/linux/avx512/svml_z0_'+svml_func+'_'+svml_sfx+'.s'
+      ]
+    endforeach
+  endforeach
 endif
 
 py.extension_module('_multiarray_umath',
@@ -934,26 +1143,60 @@ py.extension_module('_multiarray_umath',
     'src/umath',
   ],
   dependencies: blas_dep,
-  link_with: npymath_lib,
+  link_with: [npymath_lib, multiarray_umath_mtargets.static_lib('_multiarray_umath_mtargets')],
   install: true,
   subdir: 'numpy/core',
 )
 
 # Build SIMD module
 # -----------------
+_simd_dispatch = []
+_simd_baseline = []
+foreach target : get_option('test-simd')
+  target = target.strip().to_upper().split(',')
+  mfeatures = []
+  foreach fet_name : target
+    if fet_name == 'BASELINE'
+      _simd_baseline = CPU_BASELINE
+      break
+    endif
+    if fet_name not in CPU_FEATURES
+      error('Expected a valid feature name, got('+fet_name+')')
+    endif
+    mfeatures += CPU_FEATURES[fet_name]
+  endforeach
+  _simd_dispatch += [mfeatures]
+endforeach
 
-py.extension_module('_simd',
+_simd_mtargets = mod_features.multi_targets(
+  '_simd.dispatch.h',
   [
-    'src/common/npy_cpu_features.c',
-    'src/_simd/_simd.c',
     src_file.process('src/_simd/_simd_inc.h.src'),
     src_file.process('src/_simd/_simd_data.inc.src'),
     src_file.process('src/_simd/_simd.dispatch.c.src'),
   ],
+  # Skip validating the order of `_simd_dispatch` because we execute all these
+  # features, not just the highest interest one. The sorting doesn't matter
+  # here, given the nature of this testing unit.
+  keep_sort: true,
+  dispatch: _simd_dispatch,
+  baseline: _simd_baseline,
+  prefix: 'NPY_',
+  dependencies: [py_dep, np_core_dep],
+  include_directories: ['src/_simd', 'src/npymath'],
+  c_args: c_args_common,
+  cpp_args: cpp_args_common,
+)
+
+py.extension_module('_simd',
+  [
+    'src/common/npy_cpu_features.c',
+    'src/_simd/_simd.c',
+  ],
   c_args: c_args_common,
   include_directories: ['src/_simd', 'src/npymath'],
   dependencies: np_core_dep,
-  link_with: npymath_lib,
+  link_with: [npymath_lib, _simd_mtargets.static_lib('_simd_mtargets')],
   install: true,
   subdir: 'numpy/core',
 )
diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c
index 52b66e7652a8..5a113fe57876 100644
--- a/numpy/core/src/_simd/_simd.c
+++ b/numpy/core/src/_simd/_simd.c
@@ -85,9 +85,13 @@ PyMODINIT_FUNC PyInit__simd(void)
                 goto err;                                                      \
             }                                                                  \
         }
-
-    NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
-    NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+    #ifdef NPY__CPU_MESON_BUILD
+        NPY_MTARGETS_CONF_DISPATCH(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
+        NPY_MTARGETS_CONF_BASELINE(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+    #else
+        NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
+        NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+    #endif
     return m;
 err:
     Py_DECREF(m);
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index f532c9e022f7..51f5ddd54b22 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -919,7 +919,9 @@ NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
 {
     static struct PyModuleDef defs = {
         .m_base = PyModuleDef_HEAD_INIT,
-    #ifdef NPY__CPU_TARGET_CURRENT
+    #if defined(NPY_MTARGETS_CURRENT) // meson build
+        .m_name = "numpy.core._simd." NPY_TOSTRING(NPY_MTARGETS_CURRENT),
+    #elif defined(NPY__CPU_TARGET_CURRENT)
         .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT),
     #else
         .m_name = "numpy.core._simd.baseline",
diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h
index 4d5addec809e..699f8536f6a2 100644
--- a/numpy/core/src/common/npy_cpu_dispatch.h
+++ b/numpy/core/src/common/npy_cpu_dispatch.h
@@ -43,6 +43,7 @@
         #endif
     #endif
 #endif // !NPY_DISABLE_OPTIMIZATION
+#ifndef NPY__CPU_MESON_BUILD
 /**
  * Macro NPY_CPU_DISPATCH_CURFX(NAME)
  *
@@ -261,5 +262,5 @@
     ((TESTED_FEATURES) ? (NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : (void) 0),
 #define NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \
     ( LEFT __VA_ARGS__ )
-
+#endif // NPY__CPU_MESON_BUILD
 #endif  // NUMPY_CORE_SRC_COMMON_NPY_CPU_DISPATCH_H_
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 72a87eac1715..357b136d25cd 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -321,7 +321,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
     {
         npyv_f32 m = npyv_mul_f32(a, b);
-    #if NPY_HAVE_SSE3
+    #ifdef NPY_HAVE_SSE3
         return _mm_addsub_ps(m, c);
     #else
         const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
@@ -331,7 +331,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
     {
         npyv_f64 m = npyv_mul_f64(a, b);
-    #if NPY_HAVE_SSE3
+    #ifdef NPY_HAVE_SSE3
         return _mm_addsub_pd(m, c);
     #else
         const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);