/*@targets
** $maxopt baseline
** sse2 sse41 avx2 avx512f avx512_skx
- ** vsx2
+ ** vsx2 vsx4
** neon
**/
#define _UMATHMODULE
@@ -101,6 +101,82 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
}
/**end repeat**/

+ #if defined(NPY_HAVE_VSX4)
+ /*
+  * As Power10 only supports integer vector division for data of 32 bits or
+  * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer
+  * vector division instruction, and then convert the result back to npyv_u8.
+  */
+ NPY_FINLINE npyv_u8 vsx4_div_u8(npyv_u8 a, npyv_u8 b)
+ {
+     npyv_u16x2 a_expand = npyv_expand_u16_u8(a);
+     npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]);
+     npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]);
+     npyv_u16x2 b_expand = npyv_expand_u16_u8(b);
+     npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]);
+     npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]);
+     npyv_u32 divhi1 = vec_div(ahi.val[0], bhi.val[0]);
+     npyv_u32 divlo1 = vec_div(ahi.val[1], bhi.val[1]);
+     npyv_u32 divhi2 = vec_div(alo.val[0], blo.val[0]);
+     npyv_u32 divlo2 = vec_div(alo.val[1], blo.val[1]);
+     npyv_u16 reshi = (npyv_u16)vec_pack(divhi1, divlo1);
+     npyv_u16 reslo = (npyv_u16)vec_pack(divhi2, divlo2);
+     npyv_u8 res = (npyv_u8)vec_pack(reshi, reslo);
+     return res;
+ }
+
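+ /*
+  * 16-bit lanes need only one widening step: expand u16 to u32, divide with
+  * vec_div, then pack the two halves back into a single u16 vector.
+  */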
+ NPY_FINLINE npyv_u16 vsx4_div_u16(npyv_u16 a, npyv_u16 b)
+ {
+     npyv_u32x2 a_expand = npyv_expand_u32_u16(a);
+     npyv_u32x2 b_expand = npyv_expand_u32_u16(b);
+     npyv_u32 divhi = vec_div(a_expand.val[0], b_expand.val[0]);
+     npyv_u32 divlo = vec_div(a_expand.val[1], b_expand.val[1]);
+     npyv_u16 res = (npyv_u16)vec_pack(divhi, divlo);
+     return res;
+ }
+
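+ // 32-bit lanes divide natively on Power10, so u32 maps directly onto vec_div.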
+ #define vsx4_div_u32 vec_div
+
+ /**begin repeat
+  * Unsigned types
+  * #sfx = u8, u16, u32#
+  * #len = 8, 16, 32#
+  */
+ static NPY_INLINE void
+ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
+ {
+     npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
+     npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
+     npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2];
+     const int vstep = npyv_nlanes_@sfx@;
+     const npyv_@sfx@ zero = npyv_zero_@sfx@();
+
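+     // Process whole vectors; a zero anywhere in the divisor raises the divide-by-zero flag.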
+     for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+            dst += vstep) {
+         npyv_@sfx@ a = npyv_load_@sfx@(src1);
+         npyv_@sfx@ b = npyv_load_@sfx@(src2);
+         npyv_@sfx@ c = vsx4_div_@sfx@(a, b);
+         npyv_store_@sfx@(dst, c);
+         if (NPY_UNLIKELY(vec_any_eq(b, zero))) {
+             npy_set_floatstatus_divbyzero();
+         }
+     }
+
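+     // Handle the remaining tail elements scalarly; division by zero stores 0 and raises the flag.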
+     for (; len > 0; --len, ++src1, ++src2, ++dst) {
+         const npyv_lanetype_@sfx@ a = *src1;
+         const npyv_lanetype_@sfx@ b = *src2;
+         if (NPY_UNLIKELY(b == 0)) {
+             npy_set_floatstatus_divbyzero();
+             *dst = 0;
+         } else {
+             *dst = a / b;
+         }
+     }
+     npyv_cleanup();
+ }
+ /**end repeat**/
+ #endif // NPY_HAVE_VSX4
+
/**begin repeat
* Unsigned types
* #sfx = u8, u16, u32, u64#
@@ -128,7 +204,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
npyv_cleanup();
}
/**end repeat**/
- #endif
+ #endif // NPY_SIMD

/********************************************************************************
** Defining ufunc inner functions
@@ -203,6 +279,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
* #STYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #vector = 1, 1, 1, 0, 0#
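+ * vector is 1 for the types that get the VSX4 contiguous vector-division path (u8, u16, u32)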
*/
#undef TO_SIMD_SFX
#if 0
@@ -218,8 +295,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
* because emulating multiply-high on these architectures is going to be expensive comparing
* to the native scalar dividers.
* Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
- * Power10(VSX4) is an exception here since it has native support for integer vector division,
- * note neither infrastructure nor NPYV has supported VSX4 yet.
+ * Power10(VSX4) is an exception here since it has native support for integer vector division.
*/
#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
#undef TO_SIMD_SFX
@@ -240,6 +316,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
*((@type@ *)iop1) = io1;
}
#if NPY_SIMD && defined(TO_SIMD_SFX)
+ #if defined(NPY_HAVE_VSX4) && @vector@
+     // both arguments are arrays of the same size
+     else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
+         TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+     }
+ #endif
// for contiguous block of memory, divisor is a scalar and not 0
else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
(*(@type@ *)args[1]) != 0) {