Merge pull request #13134 from r-devulap/logexp-simd

mattip · web-flow · commit cd2e52ce9dc5 · 2019-04-20T20:20:46.000+03:00
ENH: Use AVX for float32 implementation of np.exp & np.log
diff --git a/doc/release/1.17.0-notes.rst b/doc/release/1.17.0-notes.rst
@@ -191,6 +191,12 @@ but with this change, you can do::
 
 thereby saving a level of indentation
 
+``numpy.exp`` and ``numpy.log`` speed up for float32 implementation
+-------------------------------------------------------------------
+The float32 implementations of numpy.exp and numpy.log now benefit from the
+AVX2/AVX512 instruction sets, which are detected at runtime. numpy.exp has a
+max ulp error of 2.52 and numpy.log has a max ulp error of 3.83.
+
 Improve performance of ``np.pad``
 ---------------------------------
 The performance of the function has been improved for most cases by filling in
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
@@ -697,6 +697,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.exp'),
           None,
+          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='exp', astype={'e':'f'}),
           TD(P, f='exp'),
           ),
@@ -718,6 +719,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.log'),
           None,
+          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='log', astype={'e':'f'}),
           TD(P, f='log'),
           ),
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
@@ -46,10 +46,20 @@
 #endif
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
 #define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
+#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
+#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
 #else
 #define NPY_GCC_TARGET_AVX2
 #endif
 
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined HAVE_LINK_AVX512F
+#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
+#elif defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
+#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
+#else
+#define NPY_GCC_TARGET_AVX512F
+#endif
+
 /*
  * mark an argument (starting from 1) that must not be NULL and is not checked
  * DO NOT USE IF FUNCTION CHECKS FOR NULL!! the compiler will remove the check
@@ -68,6 +78,13 @@
 #define NPY_HAVE_SSE2_INTRINSICS
 #endif
 
+#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX2
+#define NPY_HAVE_AVX2_INTRINSICS
+#endif
+
+#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX512F
+#define NPY_HAVE_AVX512F_INTRINSICS
+#endif
 /*
  * give a hint to the compiler which branch is more likely or unlikely
  * to occur, e.g. rare error cases:
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
@@ -113,6 +113,38 @@ NPY_INLINE static float __npy_nzerof(void)
 #define NPY_SQRT2l    1.414213562373095048801688724209698079L /* sqrt(2) */
 #define NPY_SQRT1_2l  0.707106781186547524400844362104849039L /* 1/sqrt(2) */
 
+/* 
+ * Constants used in vector implementation of exp(x) 
+ */
+#define NPY_RINT_CVT_MAGICf 0x1.800000p+23f
+#define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f
+#define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f
+#define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f                                 
+#define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f                                 
+#define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f                                 
+#define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f                                 
+#define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f                                 
+#define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f                                 
+#define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f                                 
+#define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f                                
+#define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f  
+
+/* 
+ * Constants used in vector implementation of log(x) 
+ */
+#define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f                          
+#define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f                          
+#define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f                          
+#define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f                          
+#define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f                          
+#define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f                          
+#define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f                          
+#define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f                          
+#define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f                          
+#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f                          
+#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f                          
+#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f 
+
 /*
  * C99 double math funcs
  */
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
@@ -171,6 +171,11 @@ def check_funcs(funcs_name):
         if config.check_gcc_function_attribute(dec, fn):
             moredefs.append((fname2def(fn), 1))
 
+    for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS:
+        if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code,
+                                                               header):
+            moredefs.append((fname2def(fn), 1))
+
     for fn in OPTIONAL_VARIABLE_ATTRIBUTES:
         if config.check_gcc_variable_attribute(fn):
             m = fn.replace("(", "_").replace(")", "_")
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
@@ -118,6 +118,7 @@ def check_api_version(apiversion, codegen_dir):
 # sse headers only enabled automatically on amd64/x32 builds
                 "xmmintrin.h",  # SSE
                 "emmintrin.h",  # SSE2
+                "immintrin.h",  # AVX
                 "features.h",  # for glibc version linux
                 "xlocale.h",  # see GH#8367
                 "dlfcn.h", # dladdr
@@ -149,6 +150,8 @@ def check_api_version(apiversion, codegen_dir):
                         "stdio.h", "LINK_AVX"),
                        ("__asm__ volatile", '"vpand %ymm1, %ymm2, %ymm3"',
                         "stdio.h", "LINK_AVX2"),
+                       ("__asm__ volatile", '"vpaddd %zmm1, %zmm2, %zmm3"',
+                        "stdio.h", "LINK_AVX512F"),
                        ("__asm__ volatile", '"xgetbv"', "stdio.h", "XGETBV"),
                        ]
 
@@ -165,6 +168,23 @@ def check_api_version(apiversion, codegen_dir):
                                  'attribute_target_avx'),
                                 ('__attribute__((target ("avx2")))',
                                  'attribute_target_avx2'),
+                                ('__attribute__((target ("avx512f")))',
+                                 'attribute_target_avx512f'),
+                                ]
+
+# function attributes with intrinsics
+# Checks that the compiler can compile AVX intrinsics given only the attribute:
+# gcc 4.8.4 supports the attributes, but not together with intrinsics.
+# tested via "#include<%s> int %s %s(void *){%s; return 0;};" % (header, attribute, name, code)
+# function name will be converted to HAVE_<upper-case-name> preprocessor macro
+OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))',
+                                'attribute_target_avx2_with_intrinsics',
+                                '__m256 temp = _mm256_set1_ps(1.0)',
+                                'immintrin.h'),
+                                ('__attribute__((target("avx512f")))',
+                                'attribute_target_avx512f_with_intrinsics',
+                                '__m512 temp = _mm512_set1_ps(1.0)',
+                                'immintrin.h'),
                                 ]
 
 # variable attributes tested via "int %s a" % attribute
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
@@ -11,6 +11,7 @@
 #define XCR_XFEATURE_ENABLED_MASK 0x0
 #define XSTATE_SSE 0x2
 #define XSTATE_YMM 0x4
+#define XSTATE_ZMM 0xE0  /* XCR0 bits 5-7: opmask, ZMM_Hi256, Hi16_ZMM (0x70 wrongly tested bit 4, BNDCSR) */
 
 /*
  * verify the OS supports avx instructions
@@ -33,6 +34,19 @@ int os_avx_support(void)
 #endif
 }
 
+static NPY_INLINE
+int os_avx512_support(void)  /* returns 1 if every XCR0 bit in the mask below is set, else 0 */
+{
+#if HAVE_XGETBV
+    unsigned int eax, edx;
+    unsigned int ecx = XCR_XFEATURE_ENABLED_MASK;  /* register index handed to xgetbv in ECX */
+    unsigned int xcr0 = XSTATE_ZMM | XSTATE_YMM | XSTATE_SSE;  /* XCR0 bits that must all be set */
+    __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx));  /* outputs go to EAX and EDX; only EAX is tested */
+    return (eax & xcr0) == xcr0;
+#else
+    return 0;  /* built without HAVE_XGETBV: report no support */
+#endif
+}
 
 /*
  * Primitive cpu feature detect function
@@ -42,7 +56,14 @@ NPY_NO_EXPORT int
 npy_cpu_supports(const char * feature)
 {
 #ifdef HAVE___BUILTIN_CPU_SUPPORTS
-    if (strcmp(feature, "avx2") == 0) {
+    if (strcmp(feature, "avx512f") == 0) {
+#if defined(__GNUC__) && (__GNUC__ < 5)
+        return 0;
+#else
+        return __builtin_cpu_supports("avx512f") && os_avx512_support();
+#endif
+    }
+    else if (strcmp(feature, "avx2") == 0) {
         return __builtin_cpu_supports("avx2") && os_avx_support();
     }
     else if (strcmp(feature, "avx") == 0) {
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
@@ -1569,6 +1569,55 @@ NPY_NO_EXPORT void
 
 /**end repeat**/
 
+/**begin repeat
+ *  #func = exp, log#
+ *  #scalarf = npy_expf, npy_logf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+	const npy_float in1 = *(npy_float *)ip1;
+	*(npy_float *)op1 = @scalarf@(in1);
+    }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * #isa = avx512f, avx2#
+ * #ISA = AVX512F, AVX2#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ *  #func = exp, log#
+ *  #scalarf = npy_expf, npy_logf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+    @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
+#else
+    /*
+     * This is the path it would take if ISA was runtime detected, but not
+     * compiled for. It fixes the error on clang6.0 which fails to compile
+     * AVX512F version. Not sure if I like this idea, if during runtime it
+     * detects AVX512F, it will end up running the scalar version instead
+     * of AVX2.
+     */
+    UNARY_LOOP {
+	const npy_float in1 = *(npy_float *)ip1;
+	*(npy_float *)op1 = @scalarf@(in1);
+    }
+#endif
+}
+
+/**end repeat1**/
+/**end repeat**/
 
 /**begin repeat
  * Float types
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
@@ -177,6 +177,22 @@ NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
 /**end repeat**/
 
+/**begin repeat
+ *  #func = exp, log#
+ */
+NPY_NO_EXPORT void
+FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #isa = avx512f, avx2#
+ */
+
+NPY_NO_EXPORT void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat1**/
+/**end repeat**/
+
 /**begin repeat
  * Float types
  *  #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
diff --git a/numpy/distutils/command/autodist.py b/numpy/distutils/command/autodist.py
diff --git a/numpy/distutils/command/config.py b/numpy/distutils/command/config.py