Skip to content

Commit 519338a

Browse files
nathan-bossartchiranmoyFujitsuRamaKishanMalladi
committed
Optimize popcount functions with ARM SVE intrinsics.
This commit introduces SVE implementations of pg_popcount{32,64}. Unlike the Neon versions, we need an additional configure-time check to determine if the compiler supports SVE intrinsics, and we need a runtime check to determine if the current CPU supports SVE instructions. Our testing showed that the SVE implementations are much faster for larger inputs and are comparable to the status quo for smaller inputs. Author: "Devanga.Susmitha@fujitsu.com" <Devanga.Susmitha@fujitsu.com> Co-authored-by: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com> Co-authored-by: "Malladi, Rama" <ramamalladi@hotmail.com> Reviewed-by: John Naylor <johncnaylorls@gmail.com> Reviewed-by: Kirill Reshke <reshkekirill@gmail.com> Discussion: https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com Discussion: https://postgr.es/m/OSZPR01MB84990A9A02A3515C6E85A65B8B2A2%40OSZPR01MB8499.jpnprd01.prod.outlook.com
1 parent 3c8e463 commit 519338a

File tree

7 files changed

+475
-6
lines changed

7 files changed

+475
-6
lines changed

config/c-compiler.m4

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,3 +708,55 @@ if test x"$Ac_cachevar" = x"yes"; then
708708
fi
709709
undefine([Ac_cachevar])dnl
710710
])# PGAC_AVX512_POPCNT_INTRINSICS
711+
712+
# PGAC_SVE_POPCNT_INTRINSICS
713+
# --------------------------
714+
# Check if the compiler supports the SVE popcount instructions using the
715+
# svptrue_b64, svdup_u64, svcntb, svld1_u64, svld1_u8, svadd_u64_x,
716+
# svcnt_u64_x, svcnt_u8_x, svaddv_u64, svaddv_u8, svwhilelt_b8_s32,
717+
# svand_n_u64_x, and svand_n_u8_x intrinsic functions.
718+
#
719+
# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics.
720+
AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS],
721+
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl
722+
AC_CACHE_CHECK([for svcnt_x], [Ac_cachevar],
723+
[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>
724+
725+
char buf[128];
726+
727+
#if defined(__has_attribute) && __has_attribute (target)
728+
__attribute__((target("arch=armv8-a+sve")))
729+
#endif
730+
static int popcount_test(void)
731+
{
732+
svbool_t pred = svptrue_b64();
733+
svuint8_t vec8;
734+
svuint64_t accum1 = svdup_u64(0),
735+
accum2 = svdup_u64(0),
736+
vec64;
737+
char *p = buf;
738+
uint64_t popcnt,
739+
mask = 0x5555555555555555;
740+
741+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
742+
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
743+
p += svcntb();
744+
745+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
746+
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
747+
p += svcntb();
748+
749+
popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
750+
751+
pred = svwhilelt_b8_s32(0, sizeof(buf));
752+
vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
753+
return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
754+
}]],
755+
[return popcount_test();])],
756+
[Ac_cachevar=yes],
757+
[Ac_cachevar=no])])
758+
if test x"$Ac_cachevar" = x"yes"; then
759+
pgac_sve_popcnt_intrinsics=yes
760+
fi
761+
undefine([Ac_cachevar])dnl
762+
])# PGAC_SVE_POPCNT_INTRINSICS

configure

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17517,6 +17517,77 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
1751717517
fi
1751817518
fi
1751917519

17520+
# Check for SVE popcount intrinsics
17521+
#
17522+
if test x"$host_cpu" = x"aarch64"; then
17523+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x" >&5
17524+
$as_echo_n "checking for svcnt_x... " >&6; }
17525+
if ${pgac_cv_sve_popcnt_intrinsics+:} false; then :
17526+
$as_echo_n "(cached) " >&6
17527+
else
17528+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17529+
/* end confdefs.h. */
17530+
#include <arm_sve.h>
17531+
17532+
char buf[128];
17533+
17534+
#if defined(__has_attribute) && __has_attribute (target)
17535+
__attribute__((target("arch=armv8-a+sve")))
17536+
#endif
17537+
static int popcount_test(void)
17538+
{
17539+
svbool_t pred = svptrue_b64();
17540+
svuint8_t vec8;
17541+
svuint64_t accum1 = svdup_u64(0),
17542+
accum2 = svdup_u64(0),
17543+
vec64;
17544+
char *p = buf;
17545+
uint64_t popcnt,
17546+
mask = 0x5555555555555555;
17547+
17548+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
17549+
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
17550+
p += svcntb();
17551+
17552+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
17553+
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
17554+
p += svcntb();
17555+
17556+
popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
17557+
17558+
pred = svwhilelt_b8_s32(0, sizeof(buf));
17559+
vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
17560+
return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
17561+
}
17562+
int
17563+
main ()
17564+
{
17565+
return popcount_test();
17566+
;
17567+
return 0;
17568+
}
17569+
_ACEOF
17570+
if ac_fn_c_try_link "$LINENO"; then :
17571+
pgac_cv_sve_popcnt_intrinsics=yes
17572+
else
17573+
pgac_cv_sve_popcnt_intrinsics=no
17574+
fi
17575+
rm -f core conftest.err conftest.$ac_objext \
17576+
conftest$ac_exeext conftest.$ac_ext
17577+
fi
17578+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sve_popcnt_intrinsics" >&5
17579+
$as_echo "$pgac_cv_sve_popcnt_intrinsics" >&6; }
17580+
if test x"$pgac_cv_sve_popcnt_intrinsics" = x"yes"; then
17581+
pgac_sve_popcnt_intrinsics=yes
17582+
fi
17583+
17584+
if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
17585+
17586+
$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
17587+
17588+
fi
17589+
fi
17590+
1752017591
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
1752117592
#
1752217593
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5

configure.ac

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2070,6 +2070,15 @@ if test x"$host_cpu" = x"x86_64"; then
20702070
fi
20712071
fi
20722072

2073+
# Check for SVE popcount intrinsics
2074+
#
2075+
if test x"$host_cpu" = x"aarch64"; then
2076+
PGAC_SVE_POPCNT_INTRINSICS()
2077+
if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
2078+
AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE popcount instructions with a runtime check.])
2079+
fi
2080+
fi
2081+
20732082
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
20742083
#
20752084
PGAC_SSE42_CRC32_INTRINSICS()

meson.build

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2297,6 +2297,54 @@ int main(void)
22972297
endif
22982298

22992299

2300+
###############################################################
2301+
# Check for the availability of SVE popcount intrinsics.
2302+
###############################################################
2303+
2304+
if host_cpu == 'aarch64'
2305+
2306+
prog = '''
2307+
#include <arm_sve.h>
2308+
2309+
char buf[128];
2310+
2311+
#if defined(__has_attribute) && __has_attribute (target)
2312+
__attribute__((target("arch=armv8-a+sve")))
2313+
#endif
2314+
int main(void)
2315+
{
2316+
svbool_t pred = svptrue_b64();
2317+
svuint8_t vec8;
2318+
svuint64_t accum1 = svdup_u64(0),
2319+
accum2 = svdup_u64(0),
2320+
vec64;
2321+
char *p = buf;
2322+
uint64_t popcnt,
2323+
mask = 0x5555555555555555;
2324+
2325+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
2326+
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
2327+
p += svcntb();
2328+
2329+
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
2330+
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
2331+
p += svcntb();
2332+
2333+
popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
2334+
2335+
pred = svwhilelt_b8_s32(0, sizeof(buf));
2336+
vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
2337+
return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
2338+
}
2339+
'''
2340+
2341+
if cc.links(prog, name: 'SVE popcount', args: test_c_args)
2342+
cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1)
2343+
endif
2344+
2345+
endif
2346+
2347+
23002348
###############################################################
23012349
# Select CRC-32C implementation.
23022350
#

src/include/pg_config.h.in

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,9 @@
712712
/* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
713713
#undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
714714

715+
/* Define to 1 to use SVE popcount instructions with a runtime check. */
716+
#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
717+
715718
/* Define to build with systemd support. (--with-systemd) */
716719
#undef USE_SYSTEMD
717720

src/include/port/pg_bitutils.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,23 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
324324
extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
325325
#endif
326326

327+
#elif POPCNT_AARCH64
328+
/* Use the Neon version of pg_popcount{32,64} without function pointer. */
329+
extern int pg_popcount32(uint32 word);
330+
extern int pg_popcount64(uint64 word);
331+
332+
/*
333+
* We can try to use an SVE-optimized pg_popcount() on some systems.  For that,
334+
* we do use a function pointer.
335+
*/
336+
#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
337+
extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
338+
extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
339+
#else
340+
extern uint64 pg_popcount_optimized(const char *buf, int bytes);
341+
extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
342+
#endif
343+
327344
#else
328345
/* Use a portable implementation -- no need for a function pointer. */
329346
extern int pg_popcount32(uint32 word);

0 commit comments

Comments
 (0)