Skip to content

Commit 792752a

Browse files
Optimize pg_popcount() with AVX-512 instructions.
Presently, pg_popcount() processes data in 32-bit or 64-bit chunks when possible. Newer hardware that supports AVX-512 instructions can use 512-bit chunks, which provides a nice speedup, especially for larger buffers. This commit introduces the infrastructure required to detect compiler and CPU support for the required AVX-512 intrinsic functions, and it adds a new pg_popcount() implementation that uses these functions. If CPU support for this optimized implementation is detected at runtime, a function pointer is updated so that it is used by subsequent calls to pg_popcount(). Most of the existing in-tree calls to pg_popcount() should benefit from these instructions, and calls with smaller buffers should at least not regress compared to v16. The new infrastructure introduced by this commit can also be used to optimize visibilitymap_count(), but that is left for a follow-up commit. Co-authored-by: Paul Amonson, Ants Aasma Reviewed-by: Matthias van de Meent, Tom Lane, Noah Misch, Akash Shankaran, Alvaro Herrera, Andres Freund, David Rowley Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
1 parent 158f581 commit 792752a

15 files changed

+696
-3
lines changed

config/c-compiler.m4

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -694,3 +694,61 @@ if test x"$Ac_cachevar" = x"yes"; then
694694
fi
695695
undefine([Ac_cachevar])dnl
696696
])# PGAC_LOONGARCH_CRC32C_INTRINSICS
697+
698+
# PGAC_XSAVE_INTRINSICS
699+
# ---------------------
700+
# Check if the compiler supports the XSAVE instructions using the _xgetbv
701+
# intrinsic function.
702+
#
703+
# An optional compiler flag can be passed as argument (e.g., -mxsave). If the
704+
# intrinsic is supported, sets pgac_xsave_intrinsics and CFLAGS_XSAVE.
705+
AC_DEFUN([PGAC_XSAVE_INTRINSICS],
706+
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl
707+
AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar],
708+
[pgac_save_CFLAGS=$CFLAGS
709+
CFLAGS="$pgac_save_CFLAGS $1"
710+
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
711+
[return _xgetbv(0) & 0xe0;])],
712+
[Ac_cachevar=yes],
713+
[Ac_cachevar=no])
714+
CFLAGS="$pgac_save_CFLAGS"])
715+
if test x"$Ac_cachevar" = x"yes"; then
716+
CFLAGS_XSAVE="$1"
717+
pgac_xsave_intrinsics=yes
718+
fi
719+
undefine([Ac_cachevar])dnl
720+
])# PGAC_XSAVE_INTRINSICS
721+
722+
# PGAC_AVX512_POPCNT_INTRINSICS
723+
# -----------------------------
724+
# Check if the compiler supports the AVX-512 popcount instructions using the
725+
# _mm512_setzero_si512, _mm512_maskz_loadu_epi8, _mm512_popcnt_epi64,
726+
# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
727+
#
728+
# Optional compiler flags can be passed as argument (e.g., -mavx512vpopcntdq
729+
# -mavx512bw). If the intrinsics are supported, sets
730+
# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT.
731+
AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
732+
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
733+
AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
734+
[pgac_save_CFLAGS=$CFLAGS
735+
CFLAGS="$pgac_save_CFLAGS $1"
736+
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
737+
[const char buf@<:@sizeof(__m512i)@:>@;
738+
PG_INT64_TYPE popcnt = 0;
739+
__m512i accum = _mm512_setzero_si512();
740+
const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
741+
const __m512i cnt = _mm512_popcnt_epi64(val);
742+
accum = _mm512_add_epi64(accum, cnt);
743+
popcnt = _mm512_reduce_add_epi64(accum);
744+
/* return computed value, to prevent the above being optimized away */
745+
return popcnt == 0;])],
746+
[Ac_cachevar=yes],
747+
[Ac_cachevar=no])
748+
CFLAGS="$pgac_save_CFLAGS"])
749+
if test x"$Ac_cachevar" = x"yes"; then
750+
CFLAGS_POPCNT="$1"
751+
pgac_avx512_popcnt_intrinsics=yes
752+
fi
753+
undefine([Ac_cachevar])dnl
754+
])# PGAC_AVX512_POPCNT_INTRINSICS

configure

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,9 @@ MSGFMT_FLAGS
647647
MSGFMT
648648
PG_CRC32C_OBJS
649649
CFLAGS_CRC
650+
PG_POPCNT_OBJS
651+
CFLAGS_POPCNT
652+
CFLAGS_XSAVE
650653
LIBOBJS
651654
OPENSSL
652655
ZSTD
@@ -17404,6 +17407,40 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h
1740417407

1740517408
fi
1740617409

17410+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5
17411+
$as_echo_n "checking for __get_cpuid_count... " >&6; }
17412+
if ${pgac_cv__get_cpuid_count+:} false; then :
17413+
$as_echo_n "(cached) " >&6
17414+
else
17415+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17416+
/* end confdefs.h. */
17417+
#include <cpuid.h>
17418+
int
17419+
main ()
17420+
{
17421+
unsigned int exx[4] = {0, 0, 0, 0};
17422+
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
17423+
17424+
;
17425+
return 0;
17426+
}
17427+
_ACEOF
17428+
if ac_fn_c_try_link "$LINENO"; then :
17429+
pgac_cv__get_cpuid_count="yes"
17430+
else
17431+
pgac_cv__get_cpuid_count="no"
17432+
fi
17433+
rm -f core conftest.err conftest.$ac_objext \
17434+
conftest$ac_exeext conftest.$ac_ext
17435+
fi
17436+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5
17437+
$as_echo "$pgac_cv__get_cpuid_count" >&6; }
17438+
if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
17439+
17440+
$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h
17441+
17442+
fi
17443+
1740717444
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5
1740817445
$as_echo_n "checking for __cpuid... " >&6; }
1740917446
if ${pgac_cv__cpuid+:} false; then :
@@ -17438,6 +17475,221 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
1743817475

1743917476
fi
1744017477

17478+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5
17479+
$as_echo_n "checking for __cpuidex... " >&6; }
17480+
if ${pgac_cv__cpuidex+:} false; then :
17481+
$as_echo_n "(cached) " >&6
17482+
else
17483+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17484+
/* end confdefs.h. */
17485+
#include <intrin.h>
17486+
int
17487+
main ()
17488+
{
17489+
unsigned int exx[4] = {0, 0, 0, 0};
17490+
__get_cpuidex(exx[0], 7, 0);
17491+
17492+
;
17493+
return 0;
17494+
}
17495+
_ACEOF
17496+
if ac_fn_c_try_link "$LINENO"; then :
17497+
pgac_cv__cpuidex="yes"
17498+
else
17499+
pgac_cv__cpuidex="no"
17500+
fi
17501+
rm -f core conftest.err conftest.$ac_objext \
17502+
conftest$ac_exeext conftest.$ac_ext
17503+
fi
17504+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5
17505+
$as_echo "$pgac_cv__cpuidex" >&6; }
17506+
if test x"$pgac_cv__cpuidex" = x"yes"; then
17507+
17508+
$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
17509+
17510+
fi
17511+
17512+
# Check for XSAVE intrinsics
17513+
#
17514+
CFLAGS_XSAVE=""
17515+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=" >&5
17516+
$as_echo_n "checking for _xgetbv with CFLAGS=... " >&6; }
17517+
if ${pgac_cv_xsave_intrinsics_+:} false; then :
17518+
$as_echo_n "(cached) " >&6
17519+
else
17520+
pgac_save_CFLAGS=$CFLAGS
17521+
CFLAGS="$pgac_save_CFLAGS "
17522+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17523+
/* end confdefs.h. */
17524+
#include <immintrin.h>
17525+
int
17526+
main ()
17527+
{
17528+
return _xgetbv(0) & 0xe0;
17529+
;
17530+
return 0;
17531+
}
17532+
_ACEOF
17533+
if ac_fn_c_try_link "$LINENO"; then :
17534+
pgac_cv_xsave_intrinsics_=yes
17535+
else
17536+
pgac_cv_xsave_intrinsics_=no
17537+
fi
17538+
rm -f core conftest.err conftest.$ac_objext \
17539+
conftest$ac_exeext conftest.$ac_ext
17540+
CFLAGS="$pgac_save_CFLAGS"
17541+
fi
17542+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics_" >&5
17543+
$as_echo "$pgac_cv_xsave_intrinsics_" >&6; }
17544+
if test x"$pgac_cv_xsave_intrinsics_" = x"yes"; then
17545+
CFLAGS_XSAVE=""
17546+
pgac_xsave_intrinsics=yes
17547+
fi
17548+
17549+
if test x"$pgac_xsave_intrinsics" != x"yes"; then
17550+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=-mxsave" >&5
17551+
$as_echo_n "checking for _xgetbv with CFLAGS=-mxsave... " >&6; }
17552+
if ${pgac_cv_xsave_intrinsics__mxsave+:} false; then :
17553+
$as_echo_n "(cached) " >&6
17554+
else
17555+
pgac_save_CFLAGS=$CFLAGS
17556+
CFLAGS="$pgac_save_CFLAGS -mxsave"
17557+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17558+
/* end confdefs.h. */
17559+
#include <immintrin.h>
17560+
int
17561+
main ()
17562+
{
17563+
return _xgetbv(0) & 0xe0;
17564+
;
17565+
return 0;
17566+
}
17567+
_ACEOF
17568+
if ac_fn_c_try_link "$LINENO"; then :
17569+
pgac_cv_xsave_intrinsics__mxsave=yes
17570+
else
17571+
pgac_cv_xsave_intrinsics__mxsave=no
17572+
fi
17573+
rm -f core conftest.err conftest.$ac_objext \
17574+
conftest$ac_exeext conftest.$ac_ext
17575+
CFLAGS="$pgac_save_CFLAGS"
17576+
fi
17577+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics__mxsave" >&5
17578+
$as_echo "$pgac_cv_xsave_intrinsics__mxsave" >&6; }
17579+
if test x"$pgac_cv_xsave_intrinsics__mxsave" = x"yes"; then
17580+
CFLAGS_XSAVE="-mxsave"
17581+
pgac_xsave_intrinsics=yes
17582+
fi
17583+
17584+
fi
17585+
if test x"$pgac_xsave_intrinsics" = x"yes"; then
17586+
17587+
$as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
17588+
17589+
fi
17590+
17591+
17592+
# Check for AVX-512 popcount intrinsics
17593+
#
17594+
CFLAGS_POPCNT=""
17595+
PG_POPCNT_OBJS=""
17596+
if test x"$host_cpu" = x"x86_64"; then
17597+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5
17598+
$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
17599+
if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
17600+
$as_echo_n "(cached) " >&6
17601+
else
17602+
pgac_save_CFLAGS=$CFLAGS
17603+
CFLAGS="$pgac_save_CFLAGS "
17604+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17605+
/* end confdefs.h. */
17606+
#include <immintrin.h>
17607+
int
17608+
main ()
17609+
{
17610+
const char buf[sizeof(__m512i)];
17611+
PG_INT64_TYPE popcnt = 0;
17612+
__m512i accum = _mm512_setzero_si512();
17613+
const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
17614+
const __m512i cnt = _mm512_popcnt_epi64(val);
17615+
accum = _mm512_add_epi64(accum, cnt);
17616+
popcnt = _mm512_reduce_add_epi64(accum);
17617+
/* return computed value, to prevent the above being optimized away */
17618+
return popcnt == 0;
17619+
;
17620+
return 0;
17621+
}
17622+
_ACEOF
17623+
if ac_fn_c_try_link "$LINENO"; then :
17624+
pgac_cv_avx512_popcnt_intrinsics_=yes
17625+
else
17626+
pgac_cv_avx512_popcnt_intrinsics_=no
17627+
fi
17628+
rm -f core conftest.err conftest.$ac_objext \
17629+
conftest$ac_exeext conftest.$ac_ext
17630+
CFLAGS="$pgac_save_CFLAGS"
17631+
fi
17632+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5
17633+
$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
17634+
if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
17635+
CFLAGS_POPCNT=""
17636+
pgac_avx512_popcnt_intrinsics=yes
17637+
fi
17638+
17639+
if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
17640+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512bw" >&5
17641+
$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512bw... " >&6; }
17642+
if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw+:} false; then :
17643+
$as_echo_n "(cached) " >&6
17644+
else
17645+
pgac_save_CFLAGS=$CFLAGS
17646+
CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512bw"
17647+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17648+
/* end confdefs.h. */
17649+
#include <immintrin.h>
17650+
int
17651+
main ()
17652+
{
17653+
const char buf[sizeof(__m512i)];
17654+
PG_INT64_TYPE popcnt = 0;
17655+
__m512i accum = _mm512_setzero_si512();
17656+
const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
17657+
const __m512i cnt = _mm512_popcnt_epi64(val);
17658+
accum = _mm512_add_epi64(accum, cnt);
17659+
popcnt = _mm512_reduce_add_epi64(accum);
17660+
/* return computed value, to prevent the above being optimized away */
17661+
return popcnt == 0;
17662+
;
17663+
return 0;
17664+
}
17665+
_ACEOF
17666+
if ac_fn_c_try_link "$LINENO"; then :
17667+
pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=yes
17668+
else
17669+
pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=no
17670+
fi
17671+
rm -f core conftest.err conftest.$ac_objext \
17672+
conftest$ac_exeext conftest.$ac_ext
17673+
CFLAGS="$pgac_save_CFLAGS"
17674+
fi
17675+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&5
17676+
$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&6; }
17677+
if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" = x"yes"; then
17678+
CFLAGS_POPCNT="-mavx512vpopcntdq -mavx512bw"
17679+
pgac_avx512_popcnt_intrinsics=yes
17680+
fi
17681+
17682+
fi
17683+
if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
17684+
PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
17685+
17686+
$as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
17687+
17688+
fi
17689+
fi
17690+
17691+
17692+
1744117693
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
1744217694
#
1744317695
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used

configure.ac

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2052,6 +2052,17 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then
20522052
AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.])
20532053
fi
20542054

2055+
AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count],
2056+
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <cpuid.h>],
2057+
[[unsigned int exx[4] = {0, 0, 0, 0};
2058+
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
2059+
]])],
2060+
[pgac_cv__get_cpuid_count="yes"],
2061+
[pgac_cv__get_cpuid_count="no"])])
2062+
if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
2063+
AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.])
2064+
fi
2065+
20552066
AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid],
20562067
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
20572068
[[unsigned int exx[4] = {0, 0, 0, 0};
@@ -2063,6 +2074,46 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
20632074
AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
20642075
fi
20652076

2077+
AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex],
2078+
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
2079+
[[unsigned int exx[4] = {0, 0, 0, 0};
2080+
__get_cpuidex(exx[0], 7, 0);
2081+
]])],
2082+
[pgac_cv__cpuidex="yes"],
2083+
[pgac_cv__cpuidex="no"])])
2084+
if test x"$pgac_cv__cpuidex" = x"yes"; then
2085+
AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
2086+
fi
2087+
2088+
# Check for XSAVE intrinsics
2089+
#
2090+
CFLAGS_XSAVE=""
2091+
PGAC_XSAVE_INTRINSICS([])
2092+
if test x"$pgac_xsave_intrinsics" != x"yes"; then
2093+
PGAC_XSAVE_INTRINSICS([-mxsave])
2094+
fi
2095+
if test x"$pgac_xsave_intrinsics" = x"yes"; then
2096+
AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.])
2097+
fi
2098+
AC_SUBST(CFLAGS_XSAVE)
2099+
2100+
# Check for AVX-512 popcount intrinsics
2101+
#
2102+
CFLAGS_POPCNT=""
2103+
PG_POPCNT_OBJS=""
2104+
if test x"$host_cpu" = x"x86_64"; then
2105+
PGAC_AVX512_POPCNT_INTRINSICS([])
2106+
if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
2107+
PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512bw])
2108+
fi
2109+
if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
2110+
PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
2111+
AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 popcount instructions with a runtime check.])
2112+
fi
2113+
fi
2114+
AC_SUBST(CFLAGS_POPCNT)
2115+
AC_SUBST(PG_POPCNT_OBJS)
2116+
20662117
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
20672118
#
20682119
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used

0 commit comments

Comments
 (0)