Merge pull request #20976 from rafaelcfsousa/p10_enh_intdiv

mattip · web-flow · commit b97e7d585de4 · 2022-02-10T12:07:53.000+02:00
ENH,BENCH: Optimize floor_divide for VSX4/Power10
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
@@ -150,6 +150,19 @@ def setup(self, dtype, divisor):
     def time_floor_divide_int(self, dtype, divisor):
         self.x // divisor
 
+class CustomArrayFloorDivideInt(Benchmark):
+    params = (np.sctypes['int'] + np.sctypes['uint'], [100, 10000, 1000000])
+    param_names = ['dtype', 'size']
+
+    def setup(self, dtype, size):
+        iinfo = np.iinfo(dtype)
+        self.x = np.random.randint(
+                    iinfo.min, iinfo.max, size=size, dtype=dtype)
+        self.y = np.random.randint(2, 32, size=size, dtype=dtype)
+
+    def time_floor_divide_int(self, dtype, size):
+        self.x // self.y
+
 
 class Scalar(Benchmark):
     def setup(self):
diff --git a/doc/release/upcoming_changes/20821.improvement.rst b/doc/release/upcoming_changes/20821.improvement.rst
@@ -0,0 +1,5 @@
+Add support for VSX4/Power10
+----------------------------------------------
+With VSX4/Power10 enablement, the new instructions available in 
+Power ISA 3.1 can be used to accelerate some NumPy operations, 
+e.g., floor_divide, modulo, etc.
diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h
@@ -46,9 +46,6 @@
  *  - For 64-bit division on Aarch64 and IBM/Power, we fall-back to the scalar division
  *    since emulating multiply-high is expensive and both architectures have very fast dividers.
  *
- ** TODO:
- *   - Add support for Power10(VSX4)
- *
  ***************************************************************
  ** Figure 4.1: Unsigned division by run–time invariant divisor
  ***************************************************************
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -97,9 +97,6 @@
 /***************************
  * Integer Division
  ***************************/
-/***
- * TODO: Add support for VSX4(Power10)
- */
 // See simd/intdiv.h for more clarification
 // divide each unsigned 8-bit element by a precomputed divisor
 NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
@@ -172,6 +169,10 @@ NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
 // divide each unsigned 32-bit element by a precomputed divisor
 NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
 {
+#if defined(NPY_HAVE_VSX4)
+    // high part of unsigned multiplication
+    npyv_u32 mulhi    = vec_mulh(a, divisor.val[0]);
+#else
 #if defined(__GNUC__) && __GNUC__ < 8
     // Doubleword integer wide multiplication supported by GCC 8+
     npyv_u64 mul_even, mul_odd;
@@ -184,6 +185,7 @@ NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
 #endif
     // high part of unsigned multiplication
     npyv_u32 mulhi    = vec_mergeo((npyv_u32)mul_even, (npyv_u32)mul_odd);
+#endif
     // floor(x/d)     = (((a-mulhi) >> sh1) + mulhi) >> sh2
     npyv_u32 q        = vec_sub(a, mulhi);
              q        = vec_sr(q, divisor.val[1]);
@@ -194,6 +196,10 @@ NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
 // divide each signed 32-bit element by a precomputed divisor (round towards zero)
 NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
 {
+#if defined(NPY_HAVE_VSX4)
+    // high part of signed multiplication
+    npyv_s32 mulhi    = vec_mulh(a, divisor.val[0]);
+#else
 #if defined(__GNUC__) && __GNUC__ < 8
     // Doubleword integer wide multiplication supported by GCC8+
     npyv_s64 mul_even, mul_odd;
@@ -206,6 +212,7 @@ NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
 #endif
     // high part of signed multiplication
     npyv_s32 mulhi    = vec_mergeo((npyv_s32)mul_even, (npyv_s32)mul_odd);
+#endif
     // q              = ((a + mulhi) >> sh1) - XSIGN(a)
     // trunc(a/d)     = (q ^ dsign) - dsign
     npyv_s32 q        = vec_sra(vec_add(a, mulhi), (npyv_u32)divisor.val[1]);
@@ -216,8 +223,12 @@ NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
 // divide each unsigned 64-bit element by a precomputed divisor
 NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
 {
+#if defined(NPY_HAVE_VSX4)
+    return vec_div(a, divisor.val[0]);
+#else
     const npy_uint64 d = vec_extract(divisor.val[0], 0);
     return npyv_set_u64(vec_extract(a, 0) / d, vec_extract(a, 1) / d);
+#endif
 }
 // divide each signed 64-bit element by a precomputed divisor (round towards zero)
 NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -1,7 +1,7 @@
 /*@targets
  ** $maxopt baseline
  ** sse2 sse41 avx2 avx512f avx512_skx
- ** vsx2
+ ** vsx2 vsx4
  ** neon
  **/
 #define _UMATHMODULE
@@ -101,6 +101,82 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 }
 /**end repeat**/
 
+#if defined(NPY_HAVE_VSX4)
+/*
+ * As Power10 only supports integer vector division for data of 32 bits or
+ * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer
+ * vector division instruction, and then, convert the result back to npyv_u8.
+ */
+NPY_FINLINE npyv_u8 vsx4_div_u8(npyv_u8 a, npyv_u8 b)
+{
+    npyv_u16x2 a_expand = npyv_expand_u16_u8(a);
+    npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]);
+    npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]);
+    npyv_u16x2 b_expand = npyv_expand_u16_u8(b);
+    npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]);
+    npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]);
+    npyv_u32 divhi1 = vec_div(ahi.val[0], bhi.val[0]);
+    npyv_u32 divlo1 = vec_div(ahi.val[1], bhi.val[1]);
+    npyv_u32 divhi2 = vec_div(alo.val[0], blo.val[0]);
+    npyv_u32 divlo2 = vec_div(alo.val[1], blo.val[1]);
+    npyv_u16 reshi = (npyv_u16)vec_pack(divhi1, divlo1);
+    npyv_u16 reslo = (npyv_u16)vec_pack(divhi2, divlo2);
+    npyv_u8 res = (npyv_u8)vec_pack(reshi, reslo);
+    return res;
+}
+
+NPY_FINLINE npyv_u16 vsx4_div_u16(npyv_u16 a, npyv_u16 b)
+{
+    npyv_u32x2 a_expand = npyv_expand_u32_u16(a);
+    npyv_u32x2 b_expand = npyv_expand_u32_u16(b);
+    npyv_u32 divhi = vec_div(a_expand.val[0], b_expand.val[0]);
+    npyv_u32 divlo = vec_div(a_expand.val[1], b_expand.val[1]);
+    npyv_u16 res = (npyv_u16)vec_pack(divhi, divlo);
+    return res;
+}
+
+#define vsx4_div_u32 vec_div
+
+/**begin repeat
+ * Unsigned types
+ * #sfx    = u8, u16, u32#
+ * #len    = 8,  16,  32#
+ */
+static NPY_INLINE void
+vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
+{
+    npyv_lanetype_@sfx@ *src1  = (npyv_lanetype_@sfx@ *) args[0];
+    npyv_lanetype_@sfx@ *src2  = (npyv_lanetype_@sfx@ *) args[1];
+    npyv_lanetype_@sfx@ *dst   = (npyv_lanetype_@sfx@ *) args[2];
+    const int vstep            = npyv_nlanes_@sfx@;
+    const npyv_@sfx@ zero      = npyv_zero_@sfx@();
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst += vstep) {
+        npyv_@sfx@ a = npyv_load_@sfx@(src1);
+        npyv_@sfx@ b = npyv_load_@sfx@(src2);
+        npyv_@sfx@ c = vsx4_div_@sfx@(a, b);
+        npyv_store_@sfx@(dst, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, zero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {
+        const npyv_lanetype_@sfx@ a = *src1;
+        const npyv_lanetype_@sfx@ b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst = 0;
+        } else{
+            *dst = a / b;
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat**/
+#endif // NPY_HAVE_VSX4
+
 /**begin repeat
  * Unsigned types
  * #sfx    = u8, u16, u32, u64#
@@ -128,7 +204,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
     npyv_cleanup();
 }
 /**end repeat**/
-#endif
+#endif // NPY_SIMD
 
 /********************************************************************************
  ** Defining ufunc inner functions
@@ -203,6 +279,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
  *  #type  = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
  *  #TYPE  = UBYTE,     USHORT,     UINT,     ULONG,     ULONGLONG#
  *  #STYPE = BYTE,      SHORT,      INT,      LONG,      LONGLONG#
+ *  #vector = 1, 1, 1, 0, 0#
  */
 #undef TO_SIMD_SFX
 #if 0
@@ -218,8 +295,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
  * because emulating multiply-high on these architectures is going to be expensive comparing
  * to the native scalar dividers.
  * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
- * Power10(VSX4) is an exception here since it has native support for integer vector division,
- * note neither infrastructure nor NPYV has supported VSX4 yet.
+ * Power10(VSX4) is an exception here since it has native support for integer vector division.
  */
 #if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
     #undef TO_SIMD_SFX
@@ -240,6 +316,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
         *((@type@ *)iop1) = io1;
     }
 #if NPY_SIMD && defined(TO_SIMD_SFX)
+#if defined(NPY_HAVE_VSX4) && @vector@
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
     // for contiguous block of memory, divisor is a scalar and not 0
     else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
              (*(@type@ *)args[1]) != 0) {