Skip to content

Commit b97e7d5

Browse files
authored
Merge pull request #20976 from rafaelcfsousa/p10_enh_intdiv
ENH,BENCH: Optimize floor_divide for VSX4/Power10
2 parents d7929b3 + eed9c36 commit b97e7d5

File tree

5 files changed

+118
-10
lines changed

5 files changed

+118
-10
lines changed

benchmarks/benchmarks/bench_ufunc.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,19 @@ def setup(self, dtype, divisor):
150150
def time_floor_divide_int(self, dtype, divisor):
151151
self.x // divisor
152152

153+
class CustomArrayFloorDivideInt(Benchmark):
154+
params = (np.sctypes['int'] + np.sctypes['uint'], [100, 10000, 1000000])
155+
param_names = ['dtype', 'size']
156+
157+
def setup(self, dtype, size):
158+
iinfo = np.iinfo(dtype)
159+
self.x = np.random.randint(
160+
iinfo.min, iinfo.max, size=size, dtype=dtype)
161+
self.y = np.random.randint(2, 32, size=size, dtype=dtype)
162+
163+
def time_floor_divide_int(self, dtype, size):
164+
self.x // self.y
165+
153166

154167
class Scalar(Benchmark):
155168
def setup(self):
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Add support for VSX4/Power10
2+
----------------------------------------------
3+
With VSX4/Power10 enablement, the new instructions available in
4+
Power ISA 3.1 can be used to accelerate some NumPy operations,
5+
e.g., floor_divide, modulo, etc.

numpy/core/src/common/simd/intdiv.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,6 @@
4646
* - For 64-bit division on Aarch64 and IBM/Power, we fall-back to the scalar division
4747
* since emulating multiply-high is expensive and both architectures have very fast dividers.
4848
*
49-
** TODO:
50-
* - Add support for Power10(VSX4)
51-
*
5249
***************************************************************
5350
** Figure 4.1: Unsigned division by run–time invariant divisor
5451
***************************************************************

numpy/core/src/common/simd/vsx/arithmetic.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,6 @@
9797
/***************************
9898
* Integer Division
9999
***************************/
100-
/***
101-
* TODO: Add support for VSX4(Power10)
102-
*/
103100
// See simd/intdiv.h for more clarification
104101
// divide each unsigned 8-bit element by a precomputed divisor
105102
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
@@ -172,6 +169,10 @@ NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
172169
// divide each unsigned 32-bit element by a precomputed divisor
173170
NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
174171
{
172+
#if defined(NPY_HAVE_VSX4)
173+
// high part of unsigned multiplication
174+
npyv_u32 mulhi = vec_mulh(a, divisor.val[0]);
175+
#else
175176
#if defined(__GNUC__) && __GNUC__ < 8
176177
// Doubleword integer wide multiplication supported by GCC 8+
177178
npyv_u64 mul_even, mul_odd;
@@ -184,6 +185,7 @@ NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
184185
#endif
185186
// high part of unsigned multiplication
186187
npyv_u32 mulhi = vec_mergeo((npyv_u32)mul_even, (npyv_u32)mul_odd);
188+
#endif
187189
// floor(x/d) = (((a-mulhi) >> sh1) + mulhi) >> sh2
188190
npyv_u32 q = vec_sub(a, mulhi);
189191
q = vec_sr(q, divisor.val[1]);
@@ -194,6 +196,10 @@ NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
194196
// divide each signed 32-bit element by a precomputed divisor (round towards zero)
195197
NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
196198
{
199+
#if defined(NPY_HAVE_VSX4)
200+
// high part of signed multiplication
201+
npyv_s32 mulhi = vec_mulh(a, divisor.val[0]);
202+
#else
197203
#if defined(__GNUC__) && __GNUC__ < 8
198204
// Doubleword integer wide multiplication supported by GCC8+
199205
npyv_s64 mul_even, mul_odd;
@@ -206,6 +212,7 @@ NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
206212
#endif
207213
// high part of signed multiplication
208214
npyv_s32 mulhi = vec_mergeo((npyv_s32)mul_even, (npyv_s32)mul_odd);
215+
#endif
209216
// q = ((a + mulhi) >> sh1) - XSIGN(a)
210217
// trunc(a/d) = (q ^ dsign) - dsign
211218
npyv_s32 q = vec_sra(vec_add(a, mulhi), (npyv_u32)divisor.val[1]);
@@ -216,8 +223,12 @@ NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
216223
// divide each unsigned 64-bit element by a precomputed divisor
217224
NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
218225
{
226+
#if defined(NPY_HAVE_VSX4)
227+
return vec_div(a, divisor.val[0]);
228+
#else
219229
const npy_uint64 d = vec_extract(divisor.val[0], 0);
220230
return npyv_set_u64(vec_extract(a, 0) / d, vec_extract(a, 1) / d);
231+
#endif
221232
}
222233
// divide each signed 64-bit element by a precomputed divisor (round towards zero)
223234
NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)

numpy/core/src/umath/loops_arithmetic.dispatch.c.src

Lines changed: 86 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*@targets
22
** $maxopt baseline
33
** sse2 sse41 avx2 avx512f avx512_skx
4-
** vsx2
4+
** vsx2 vsx4
55
** neon
66
**/
77
#define _UMATHMODULE
@@ -101,6 +101,82 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
101101
}
102102
/**end repeat**/
103103

104+
#if defined(NPY_HAVE_VSX4)
105+
/*
106+
* As Power10 only supports integer vector division for data of 32 bits or
107+
* greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer
108+
* vector division instruction, and then, convert the result back to npyv_u8.
109+
*/
110+
NPY_FINLINE npyv_u8 vsx4_div_u8(npyv_u8 a, npyv_u8 b)
111+
{
112+
npyv_u16x2 a_expand = npyv_expand_u16_u8(a);
113+
npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]);
114+
npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]);
115+
npyv_u16x2 b_expand = npyv_expand_u16_u8(b);
116+
npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]);
117+
npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]);
118+
npyv_u32 divhi1 = vec_div(ahi.val[0], bhi.val[0]);
119+
npyv_u32 divlo1 = vec_div(ahi.val[1], bhi.val[1]);
120+
npyv_u32 divhi2 = vec_div(alo.val[0], blo.val[0]);
121+
npyv_u32 divlo2 = vec_div(alo.val[1], blo.val[1]);
122+
npyv_u16 reshi = (npyv_u16)vec_pack(divhi1, divlo1);
123+
npyv_u16 reslo = (npyv_u16)vec_pack(divhi2, divlo2);
124+
npyv_u8 res = (npyv_u8)vec_pack(reshi, reslo);
125+
return res;
126+
}
127+
128+
NPY_FINLINE npyv_u16 vsx4_div_u16(npyv_u16 a, npyv_u16 b)
129+
{
130+
npyv_u32x2 a_expand = npyv_expand_u32_u16(a);
131+
npyv_u32x2 b_expand = npyv_expand_u32_u16(b);
132+
npyv_u32 divhi = vec_div(a_expand.val[0], b_expand.val[0]);
133+
npyv_u32 divlo = vec_div(a_expand.val[1], b_expand.val[1]);
134+
npyv_u16 res = (npyv_u16)vec_pack(divhi, divlo);
135+
return res;
136+
}
137+
138+
#define vsx4_div_u32 vec_div
139+
140+
/**begin repeat
141+
* Unsigned types
142+
* #sfx = u8, u16, u32#
143+
* #len = 8, 16, 32#
144+
*/
145+
static NPY_INLINE void
146+
vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
147+
{
148+
npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
149+
npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
150+
npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2];
151+
const int vstep = npyv_nlanes_@sfx@;
152+
const npyv_@sfx@ zero = npyv_zero_@sfx@();
153+
154+
for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
155+
dst += vstep) {
156+
npyv_@sfx@ a = npyv_load_@sfx@(src1);
157+
npyv_@sfx@ b = npyv_load_@sfx@(src2);
158+
npyv_@sfx@ c = vsx4_div_@sfx@(a, b);
159+
npyv_store_@sfx@(dst, c);
160+
if (NPY_UNLIKELY(vec_any_eq(b, zero))) {
161+
npy_set_floatstatus_divbyzero();
162+
}
163+
}
164+
165+
for (; len > 0; --len, ++src1, ++src2, ++dst) {
166+
const npyv_lanetype_@sfx@ a = *src1;
167+
const npyv_lanetype_@sfx@ b = *src2;
168+
if (NPY_UNLIKELY(b == 0)) {
169+
npy_set_floatstatus_divbyzero();
170+
*dst = 0;
171+
} else{
172+
*dst = a / b;
173+
}
174+
}
175+
npyv_cleanup();
176+
}
177+
/**end repeat**/
178+
#endif // NPY_HAVE_VSX4
179+
104180
/**begin repeat
105181
* Unsigned types
106182
* #sfx = u8, u16, u32, u64#
@@ -128,7 +204,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
128204
npyv_cleanup();
129205
}
130206
/**end repeat**/
131-
#endif
207+
#endif // NPY_SIMD
132208

133209
/********************************************************************************
134210
** Defining ufunc inner functions
@@ -203,6 +279,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
203279
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
204280
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
205281
* #STYPE = BYTE, SHORT, INT, LONG, LONGLONG#
282+
* #vector = 1, 1, 1, 0, 0#
206283
*/
207284
#undef TO_SIMD_SFX
208285
#if 0
@@ -218,8 +295,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
218295
* because emulating multiply-high on these architectures is going to be expensive comparing
219296
* to the native scalar dividers.
220297
* Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
221-
* Power10(VSX4) is an exception here since it has native support for integer vector division,
222-
* note neither infrastructure nor NPYV has supported VSX4 yet.
298+
* Power10(VSX4) is an exception here since it has native support for integer vector division.
223299
*/
224300
#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
225301
#undef TO_SIMD_SFX
@@ -240,6 +316,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
240316
*((@type@ *)iop1) = io1;
241317
}
242318
#if NPY_SIMD && defined(TO_SIMD_SFX)
319+
#if defined(NPY_HAVE_VSX4) && @vector@
320+
// both arguments are arrays of the same size
321+
else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
322+
TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
323+
}
324+
#endif
243325
// for contiguous block of memory, divisor is a scalar and not 0
244326
else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
245327
(*(@type@ *)args[1]) != 0) {

0 commit comments

Comments
 (0)