Skip to content

Commit cd2e52c

Browse files
authored
Merge pull request #13134 from r-devulap/logexp-simd
ENH: Use AVX for float32 implementation of np.exp & np.log
2 parents 31e71d7 + d3125fa commit cd2e52c

File tree

12 files changed

+591
-1
lines changed

12 files changed

+591
-1
lines changed

doc/release/1.17.0-notes.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,12 @@ but with this change, you can do::
191191

192192
thereby saving a level of indentation
193193

194+
`numpy.exp and numpy.log` speed up for float32 implementation
195+
-------------------------------------------------------------
196+
float32 implementation of numpy.exp and numpy.log now benefit from AVX2/AVX512
197+
instruction set which are detected during runtime. numpy.exp has a max ulp
198+
error of 2.52 and numpy.log has a max ulp error or 3.83.
199+
194200
Improve performance of ``np.pad``
195201
---------------------------------
196202
The performance of the function has been improved for most cases by filling in

numpy/core/code_generators/generate_umath.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -697,6 +697,7 @@ def english_upper(s):
697697
Ufunc(1, 1, None,
698698
docstrings.get('numpy.core.umath.exp'),
699699
None,
700+
TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
700701
TD(inexact, f='exp', astype={'e':'f'}),
701702
TD(P, f='exp'),
702703
),
@@ -718,6 +719,7 @@ def english_upper(s):
718719
Ufunc(1, 1, None,
719720
docstrings.get('numpy.core.umath.log'),
720721
None,
722+
TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
721723
TD(inexact, f='log', astype={'e':'f'}),
722724
TD(P, f='log'),
723725
),

numpy/core/include/numpy/npy_common.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,20 @@
4646
#endif
4747
#if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
4848
#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
49+
#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
50+
#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
4951
#else
5052
#define NPY_GCC_TARGET_AVX2
5153
#endif
5254

55+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined HAVE_LINK_AVX512F
56+
#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
57+
#elif defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
58+
#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
59+
#else
60+
#define NPY_GCC_TARGET_AVX512F
61+
#endif
62+
5363
/*
5464
* mark an argument (starting from 1) that must not be NULL and is not checked
5565
* DO NOT USE IF FUNCTION CHECKS FOR NULL!! the compiler will remove the check
@@ -68,6 +78,13 @@
6878
#define NPY_HAVE_SSE2_INTRINSICS
6979
#endif
7080

81+
#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX2
82+
#define NPY_HAVE_AVX2_INTRINSICS
83+
#endif
84+
85+
#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX512F
86+
#define NPY_HAVE_AVX512F_INTRINSICS
87+
#endif
7188
/*
7289
* give a hint to the compiler which branch is more likely or unlikely
7390
* to occur, e.g. rare error cases:

numpy/core/include/numpy/npy_math.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,38 @@ NPY_INLINE static float __npy_nzerof(void)
113113
#define NPY_SQRT2l 1.414213562373095048801688724209698079L /* sqrt(2) */
114114
#define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */
115115

116+
/*
117+
* Constants used in vector implementation of exp(x)
118+
*/
119+
#define NPY_RINT_CVT_MAGICf 0x1.800000p+23f
120+
#define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f
121+
#define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f
122+
#define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f
123+
#define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f
124+
#define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f
125+
#define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f
126+
#define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f
127+
#define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f
128+
#define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f
129+
#define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f
130+
#define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f
131+
132+
/*
133+
* Constants used in vector implementation of log(x)
134+
*/
135+
#define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f
136+
#define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f
137+
#define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f
138+
#define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f
139+
#define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f
140+
#define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f
141+
#define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f
142+
#define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f
143+
#define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f
144+
#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
145+
#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
146+
#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
147+
116148
/*
117149
* C99 double math funcs
118150
*/

numpy/core/setup.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,11 @@ def check_funcs(funcs_name):
171171
if config.check_gcc_function_attribute(dec, fn):
172172
moredefs.append((fname2def(fn), 1))
173173

174+
for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS:
175+
if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code,
176+
header):
177+
moredefs.append((fname2def(fn), 1))
178+
174179
for fn in OPTIONAL_VARIABLE_ATTRIBUTES:
175180
if config.check_gcc_variable_attribute(fn):
176181
m = fn.replace("(", "_").replace(")", "_")

numpy/core/setup_common.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ def check_api_version(apiversion, codegen_dir):
118118
# sse headers only enabled automatically on amd64/x32 builds
119119
"xmmintrin.h", # SSE
120120
"emmintrin.h", # SSE2
121+
"immintrin.h", # AVX
121122
"features.h", # for glibc version linux
122123
"xlocale.h", # see GH#8367
123124
"dlfcn.h", # dladdr
@@ -149,6 +150,8 @@ def check_api_version(apiversion, codegen_dir):
149150
"stdio.h", "LINK_AVX"),
150151
("__asm__ volatile", '"vpand %ymm1, %ymm2, %ymm3"',
151152
"stdio.h", "LINK_AVX2"),
153+
("__asm__ volatile", '"vpaddd %zmm1, %zmm2, %zmm3"',
154+
"stdio.h", "LINK_AVX512F"),
152155
("__asm__ volatile", '"xgetbv"', "stdio.h", "XGETBV"),
153156
]
154157

@@ -165,6 +168,23 @@ def check_api_version(apiversion, codegen_dir):
165168
'attribute_target_avx'),
166169
('__attribute__((target ("avx2")))',
167170
'attribute_target_avx2'),
171+
('__attribute__((target ("avx512f")))',
172+
'attribute_target_avx512f'),
173+
]
174+
175+
# function attributes with intrinsics
176+
# To ensure your compiler can compile avx intrinsics with just the attributes
177+
# gcc 4.8.4 support attributes but not with intrisics
178+
# tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
179+
# function name will be converted to HAVE_<upper-case-name> preprocessor macro
180+
OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))',
181+
'attribute_target_avx2_with_intrinsics',
182+
'__m256 temp = _mm256_set1_ps(1.0)',
183+
'immintrin.h'),
184+
('__attribute__((target("avx512f")))',
185+
'attribute_target_avx512f_with_intrinsics',
186+
'__m512 temp = _mm512_set1_ps(1.0)',
187+
'immintrin.h'),
168188
]
169189

170190
# variable attributes tested via "int %s a" % attribute

numpy/core/src/umath/cpuid.c

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#define XCR_XFEATURE_ENABLED_MASK 0x0
1212
#define XSTATE_SSE 0x2
1313
#define XSTATE_YMM 0x4
14+
#define XSTATE_ZMM 0x70
1415

1516
/*
1617
* verify the OS supports avx instructions
@@ -33,6 +34,19 @@ int os_avx_support(void)
3334
#endif
3435
}
3536

37+
static NPY_INLINE
38+
int os_avx512_support(void)
39+
{
40+
#if HAVE_XGETBV
41+
unsigned int eax, edx;
42+
unsigned int ecx = XCR_XFEATURE_ENABLED_MASK;
43+
unsigned int xcr0 = XSTATE_ZMM | XSTATE_YMM | XSTATE_SSE;
44+
__asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx));
45+
return (eax & xcr0) == xcr0;
46+
#else
47+
return 0;
48+
#endif
49+
}
3650

3751
/*
3852
* Primitive cpu feature detect function
@@ -42,7 +56,14 @@ NPY_NO_EXPORT int
4256
npy_cpu_supports(const char * feature)
4357
{
4458
#ifdef HAVE___BUILTIN_CPU_SUPPORTS
45-
if (strcmp(feature, "avx2") == 0) {
59+
if (strcmp(feature, "avx512f") == 0) {
60+
#if defined(__GNUC__) && (__GNUC__ < 5)
61+
return 0;
62+
#else
63+
return __builtin_cpu_supports("avx512f") && os_avx512_support();
64+
#endif
65+
}
66+
else if (strcmp(feature, "avx2") == 0) {
4667
return __builtin_cpu_supports("avx2") && os_avx_support();
4768
}
4869
else if (strcmp(feature, "avx") == 0) {

numpy/core/src/umath/loops.c.src

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1569,6 +1569,55 @@ NPY_NO_EXPORT void
15691569

15701570
/**end repeat**/
15711571

1572+
/**begin repeat
1573+
* #func = exp, log#
1574+
* #scalarf = npy_expf, npy_logf#
1575+
*/
1576+
1577+
NPY_NO_EXPORT NPY_GCC_OPT_3 void
1578+
FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
1579+
{
1580+
UNARY_LOOP {
1581+
const npy_float in1 = *(npy_float *)ip1;
1582+
*(npy_float *)op1 = @scalarf@(in1);
1583+
}
1584+
}
1585+
1586+
/**end repeat**/
1587+
1588+
/**begin repeat
1589+
* #isa = avx512f, avx2#
1590+
* #ISA = AVX512F, AVX2#
1591+
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
1592+
*/
1593+
1594+
/**begin repeat1
1595+
* #func = exp, log#
1596+
* #scalarf = npy_expf, npy_logf#
1597+
*/
1598+
1599+
NPY_NO_EXPORT NPY_GCC_OPT_3 void
1600+
FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
1601+
{
1602+
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
1603+
@ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
1604+
#else
1605+
/*
1606+
* This is the path it would take if ISA was runtime detected, but not
1607+
* compiled for. It fixes the error on clang6.0 which fails to compile
1608+
* AVX512F version. Not sure if I like this idea, if during runtime it
1609+
* detects AXV512F, it will end up running the scalar version instead
1610+
* of AVX2.
1611+
*/
1612+
UNARY_LOOP {
1613+
const npy_float in1 = *(npy_float *)ip1;
1614+
*(npy_float *)op1 = @scalarf@(in1);
1615+
}
1616+
#endif
1617+
}
1618+
1619+
/**end repeat1**/
1620+
/**end repeat**/
15721621

15731622
/**begin repeat
15741623
* Float types

numpy/core/src/umath/loops.h.src

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,22 @@ NPY_NO_EXPORT void
177177
@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
178178
/**end repeat**/
179179

180+
/**begin repeat
181+
* #func = exp, log#
182+
*/
183+
NPY_NO_EXPORT void
184+
FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
185+
186+
/**begin repeat1
187+
* #isa = avx512f, avx2#
188+
*/
189+
190+
NPY_NO_EXPORT void
191+
FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
192+
193+
/**end repeat1**/
194+
/**end repeat**/
195+
180196
/**begin repeat
181197
* Float types
182198
* #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#

0 commit comments

Comments
 (0)