Commit 6be53c2

Optimize popcount functions with ARM Neon intrinsics.
This commit introduces Neon implementations of pg_popcount{32,64}, pg_popcount(), and pg_popcount_masked(). As in simd.h, we assume that all available AArch64 hardware supports Neon, so we don't need any new configure-time or runtime checks. Some compilers already emit Neon instructions for these functions, but our hand-rolled implementations for pg_popcount() and pg_popcount_masked() performed better in testing, likely due to better instruction-level parallelism.

Author: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com>
Reviewed-by: John Naylor <johncnaylorls@gmail.com>
Discussion: https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
1 parent 51a0382 commit 6be53c2
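To illustrate the instruction-level-parallelism point in the message above: a compiler expanding __builtin_popcountll() typically produces a loop with a single accumulator, so each addition waits on the previous one, while the hand-rolled functions keep several independent accumulators in flight. Here is a minimal scalar sketch of that difference (illustrative only, not part of the commit; popcount_naive/popcount_ilp are hypothetical names, and __builtin_popcountll assumes GCC or Clang):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Naive form: one accumulator, so every add depends on the previous one. */
static uint64_t
popcount_naive(const char *buf, size_t bytes)
{
    uint64_t    cnt = 0;

    for (; bytes >= sizeof(uint64_t); bytes -= sizeof(uint64_t))
    {
        uint64_t    word;

        memcpy(&word, buf, sizeof(word));
        cnt += (uint64_t) __builtin_popcountll(word);
        buf += sizeof(word);
    }
    return cnt;                 /* tail bytes (< 8) omitted for brevity */
}

/* Multi-accumulator form: four independent chains the CPU can overlap. */
static uint64_t
popcount_ilp(const char *buf, size_t bytes)
{
    uint64_t    c0 = 0, c1 = 0, c2 = 0, c3 = 0;

    for (; bytes >= 4 * sizeof(uint64_t); bytes -= 4 * sizeof(uint64_t))
    {
        uint64_t    w[4];

        memcpy(w, buf, sizeof(w));
        c0 += (uint64_t) __builtin_popcountll(w[0]);
        c1 += (uint64_t) __builtin_popcountll(w[1]);
        c2 += (uint64_t) __builtin_popcountll(w[2]);
        c3 += (uint64_t) __builtin_popcountll(w[3]);
        buf += sizeof(w);
    }
    return c0 + c1 + c2 + c3 + popcount_naive(buf, bytes);
}

int
main(void)
{
    char        buf[64];

    memset(buf, 0xAA, sizeof(buf)); /* 4 set bits per byte, 256 total */
    printf("naive=%llu ilp=%llu\n",
           (unsigned long long) popcount_naive(buf, sizeof(buf)),
           (unsigned long long) popcount_ilp(buf, sizeof(buf)));
    return 0;
}

The Neon functions in this commit apply the same idea with four 128-bit accumulators per loop iteration.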

File tree

5 files changed: +235 −6 lines changed

  src/include/port/pg_bitutils.h
  src/port/Makefile
  src/port/meson.build
  src/port/pg_bitutils.c
  src/port/pg_popcount_aarch64.c

src/include/port/pg_bitutils.h

Lines changed: 9 additions & 0 deletions
@@ -298,6 +298,15 @@ pg_ceil_log2_64(uint64 num)
 #endif
 #endif
 
+/*
+ * On AArch64, we can use Neon instructions if the compiler provides access to
+ * them (as indicated by __ARM_NEON). As in simd.h, we assume that all
+ * available 64-bit hardware has Neon support.
+ */
+#if defined(__aarch64__) && defined(__ARM_NEON)
+#define POPCNT_AARCH64 1
+#endif
+
 #ifdef TRY_POPCNT_X86_64
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
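The gate added above can be sanity-checked outside the tree. A small standalone sketch (illustrative only; neon_popcount64 is a hypothetical name, and the cross-check against __builtin_popcountll assumes GCC or Clang on AArch64) that mirrors the __aarch64__/__ARM_NEON test and the Neon byte-count reduction used later in this commit:

#include <stdint.h>
#include <stdio.h>

#if defined(__aarch64__) && defined(__ARM_NEON)
#include <arm_neon.h>

/* Count set bits in a 64-bit word: per-byte counts, then a horizontal add. */
static int
neon_popcount64(uint64_t word)
{
    return vaddv_u8(vcnt_u8(vld1_u8((const uint8_t *) &word)));
}
#endif

int
main(void)
{
#if defined(__aarch64__) && defined(__ARM_NEON)
    uint64_t    word = 0xF0F0F0F0CAFEBABEULL;

    /* The builtin and the Neon expression should agree. */
    printf("neon=%d builtin=%d\n",
           neon_popcount64(word), __builtin_popcountll(word));
#else
    puts("Neon not available at compile time");
#endif
    return 0;
}

On compilers that already expand the builtin to Neon, both branches should produce essentially the same code, which is the point made by the comment in pg_popcount_aarch64.c below.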

src/port/Makefile

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ OBJS = \
 	path.o \
 	pg_bitutils.o \
 	pg_localeconv_r.o \
+	pg_popcount_aarch64.o \
 	pg_popcount_avx512.o \
 	pg_strong_random.o \
 	pgcheckdir.o \

src/port/meson.build

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ pgport_sources = [
   'path.c',
   'pg_bitutils.c',
   'pg_localeconv_r.c',
+  'pg_popcount_aarch64.c',
   'pg_popcount_avx512.c',
   'pg_strong_random.c',
   'pgcheckdir.c',

src/port/pg_bitutils.c

Lines changed: 16 additions & 6 deletions
@@ -103,10 +103,15 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
 static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
+#endif
 
 #ifdef TRY_POPCNT_X86_64
 static bool pg_popcount_available(void);
@@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
 
 #endif /* TRY_POPCNT_X86_64 */
 
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
 
 /*
  * pg_popcount32_slow
@@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
 	return popcnt;
 }
 
-#ifndef TRY_POPCNT_X86_64
+#endif /* ! POPCNT_AARCH64 */
+
+#if !defined(TRY_POPCNT_X86_64) && !defined(POPCNT_AARCH64)
 
 /*
- * When the POPCNT instruction is not available, there's no point in using
+ * When special CPU instructions are not available, there's no point in using
  * function pointers to vary the implementation between the fast and slow
- * method. We instead just make these actual external functions when
- * TRY_POPCNT_X86_64 is not defined. The compiler should be able to inline
- * the slow versions here.
+ * method. We instead just make these actual external functions. The compiler
+ * should be able to inline the slow versions here.
  */
 int
 pg_popcount32(uint32 word)
@@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
 	return pg_popcount_masked_slow(buf, bytes, mask);
 }
 
-#endif /* !TRY_POPCNT_X86_64 */
+#endif /* ! TRY_POPCNT_X86_64 && ! POPCNT_AARCH64 */
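For context on the comment change in the last hunk: when TRY_POPCNT_X86_64 is defined, pg_bitutils.c exposes pg_popcount32 and friends as function pointers so a runtime check can pick between the _fast (POPCNT) and _slow variants; with POPCNT_AARCH64, or with neither macro, plain functions suffice. Below is a simplified, self-contained sketch of that function-pointer dispatch idea (hypothetical names, not the actual PostgreSQL code; cpu_has_popcnt is a stub for the real runtime probe, and __builtin_popcount assumes GCC or Clang):

#include <stdio.h>

/*
 * The exported symbol is a function pointer that starts out pointing at a
 * "choose" routine; the first call performs the runtime check and repoints
 * it at the selected implementation.
 */
typedef int (*popcount32_fn) (unsigned int word);

static int popcount32_choose(unsigned int word);
static int popcount32_slow(unsigned int word);
static int popcount32_fast(unsigned int word);

popcount32_fn pg_popcount32_ptr = popcount32_choose;

static int
cpu_has_popcnt(void)
{
    return 1;                   /* stand-in for the real CPUID probe */
}

static int
popcount32_slow(unsigned int word)
{
    int         n = 0;

    for (; word != 0; word &= word - 1) /* clear lowest set bit */
        n++;
    return n;
}

static int
popcount32_fast(unsigned int word)
{
    return __builtin_popcount(word);
}

static int
popcount32_choose(unsigned int word)
{
    pg_popcount32_ptr = cpu_has_popcnt() ? popcount32_fast : popcount32_slow;
    return pg_popcount32_ptr(word);
}

int
main(void)
{
    printf("%d\n", pg_popcount32_ptr(0xF0F0F0F0u));    /* prints 16 */
    return 0;
}

Since Neon is assumed to be always available on AArch64, POPCNT_AARCH64 builds skip this indirection entirely.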

src/port/pg_popcount_aarch64.c

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_aarch64.c
+ *    Holds the AArch64 popcount implementations.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/port/pg_popcount_aarch64.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_bitutils.h"
+
+#ifdef POPCNT_AARCH64
+
+#include <arm_neon.h>
+
+/*
+ * pg_popcount32
+ *    Return number of 1 bits in word
+ */
+int
+pg_popcount32(uint32 word)
+{
+    return pg_popcount64((uint64) word);
+}
+
+/*
+ * pg_popcount64
+ *    Return number of 1 bits in word
+ */
+int
+pg_popcount64(uint64 word)
+{
+    /*
+     * For some compilers, __builtin_popcountl() already emits Neon
+     * instructions. The line below should compile to the same code on those
+     * systems.
+     */
+    return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
+}
+
+/*
+ * pg_popcount_optimized
+ *    Returns number of 1 bits in buf
+ */
+uint64
+pg_popcount_optimized(const char *buf, int bytes)
+{
+    uint8x16_t  vec;
+    uint64x2_t  accum1 = vdupq_n_u64(0),
+                accum2 = vdupq_n_u64(0),
+                accum3 = vdupq_n_u64(0),
+                accum4 = vdupq_n_u64(0);
+    uint32      bytes_per_iteration = 4 * sizeof(uint8x16_t);
+    uint64      popcnt = 0;
+
+    /*
+     * For better instruction-level parallelism, each loop iteration operates
+     * on a block of four registers.
+     */
+    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+    {
+        vec = vld1q_u8((const uint8 *) buf);
+        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        vec = vld1q_u8((const uint8 *) buf);
+        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        vec = vld1q_u8((const uint8 *) buf);
+        accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        vec = vld1q_u8((const uint8 *) buf);
+        accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+    }
+
+    /*
+     * If enough data remains, do another iteration on a block of two
+     * registers.
+     */
+    bytes_per_iteration = 2 * sizeof(uint8x16_t);
+    if (bytes >= bytes_per_iteration)
+    {
+        vec = vld1q_u8((const uint8 *) buf);
+        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        vec = vld1q_u8((const uint8 *) buf);
+        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        bytes -= bytes_per_iteration;
+    }
+
+    /*
+     * Add the accumulators.
+     */
+    popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
+    popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
+
+    /*
+     * Process remaining 8-byte blocks.
+     */
+    for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
+    {
+        popcnt += pg_popcount64(*((uint64 *) buf));
+        buf += sizeof(uint64);
+    }
+
+    /*
+     * Process any remaining data byte-by-byte.
+     */
+    while (bytes--)
+        popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+    return popcnt;
+}
+
+/*
+ * pg_popcount_masked_optimized
+ *    Returns number of 1 bits in buf after applying the mask to each byte
+ */
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+    uint8x16_t  vec,
+                maskv = vdupq_n_u8(mask);
+    uint64x2_t  accum1 = vdupq_n_u64(0),
+                accum2 = vdupq_n_u64(0),
+                accum3 = vdupq_n_u64(0),
+                accum4 = vdupq_n_u64(0);
+    uint32      bytes_per_iteration = 4 * sizeof(uint8x16_t);
+    uint64      popcnt = 0,
+                mask64 = ~UINT64CONST(0) / 0xFF * mask;
+
+    /*
+     * For better instruction-level parallelism, each loop iteration operates
+     * on a block of four registers.
+     */
+    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+    {
+        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+        accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+        accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+    }
+
+    /*
+     * If enough data remains, do another iteration on a block of two
+     * registers.
+     */
+    bytes_per_iteration = 2 * sizeof(uint8x16_t);
+    if (bytes >= bytes_per_iteration)
+    {
+        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+        buf += sizeof(uint8x16_t);
+
+        bytes -= bytes_per_iteration;
+    }
+
+    /*
+     * Add the accumulators.
+     */
+    popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
+    popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
+
+    /*
+     * Process remaining 8-byte blocks.
+     */
+    for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
+    {
+        popcnt += pg_popcount64(*((uint64 *) buf) & mask64);
+        buf += sizeof(uint64);
+    }
+
+    /*
+     * Process any remaining data byte-by-byte.
+     */
+    while (bytes--)
+        popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+    return popcnt;
+}
+
+#endif /* POPCNT_AARCH64 */
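Two details of the new file may be worth unpacking. In the vector loops, vcntq_u8 yields a per-byte bit count (at most 8), vpaddlq_u8 and vpaddlq_u16 pairwise-widen those counts into 16- and 32-bit lanes, and vpadalq_u32 accumulates them into 64-bit lanes, so only the 64-bit accumulators grow across iterations. Separately, mask64 = ~UINT64CONST(0) / 0xFF * mask replicates the one-byte mask into every byte of a 64-bit word (~0/0xFF is 0x0101010101010101), letting the 8-byte tail loop apply the same mask as the vector loop. A scalar sketch of both points (illustrative only; UINT64_C stands in for PostgreSQL's UINT64CONST, and __builtin_popcount assumes GCC or Clang):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* Replicate a one-byte mask into every byte of a 64-bit word. */
    uint8_t     mask = 0x0F;
    uint64_t    mask64 = ~UINT64_C(0) / 0xFF * mask;

    printf("mask64 = 0x%016llx\n", (unsigned long long) mask64);
    /* prints 0x0f0f0f0f0f0f0f0f */

    /*
     * Scalar picture of the widening-accumulate chain: per-byte counts
     * (each at most 8) are summed pairwise into 16-bit lanes, then into
     * 32-bit lanes, and finally into a 64-bit total, so the narrow
     * intermediate sums stay small within each step.
     */
    uint8_t     bytes[8] = {0xFF, 0xFF, 0x01, 0x00, 0xAA, 0x55, 0x0F, 0xF0};
    uint16_t    pair_sums[4];
    uint32_t    quad_sums[2];
    uint64_t    total;

    for (int i = 0; i < 4; i++)
        pair_sums[i] = (uint16_t) (__builtin_popcount(bytes[2 * i]) +
                                   __builtin_popcount(bytes[2 * i + 1]));
    for (int i = 0; i < 2; i++)
        quad_sums[i] = (uint32_t) pair_sums[2 * i] + pair_sums[2 * i + 1];
    total = (uint64_t) quad_sums[0] + quad_sums[1];

    printf("total = %llu\n", (unsigned long long) total);  /* prints 33 */
    return 0;
}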
