Commit 41c51f0

Optimize visibilitymap_count() with AVX-512 instructions.
Commit 792752a added infrastructure for using AVX-512 intrinsic functions, and this commit uses that infrastructure to optimize visibilitymap_count(). Specifically, a new pg_popcount_masked() function is introduced that applies a bitmask to every byte in the buffer prior to calculating the population count, which is used to filter out the all-visible or all-frozen bits as needed. Platforms without AVX-512 support should also see a nice speedup due to the reduced number of calls to a function pointer.

Co-authored-by: Ants Aasma
Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
1 parent 792752a commit 41c51f0

File tree

4 files changed: +225, -20 lines changed
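
Before the per-file diffs, a minimal standalone C sketch (illustrative only, not part of the commit) of the masking idea: each heap block gets a 2-bit pair in the visibility map, so ANDing every byte with 0x55 keeps only the all-visible bits and 0xaa keeps only the all-frozen bits before counting. The helper below is a plain byte-at-a-time loop standing in for pg_popcount_masked(); it uses the GCC/Clang __builtin_popcount() instead of PostgreSQL's lookup table and dispatch machinery.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the masks added to visibilitymap.c */
#define VISIBLE_MASK8 0x55      /* lower bit of each 2-bit pair */
#define FROZEN_MASK8  0xaa      /* upper bit of each 2-bit pair */

/* Portable masked popcount, one byte at a time (no dispatch, no AVX-512) */
static uint64_t
popcount_masked(const unsigned char *buf, int bytes, unsigned char mask)
{
    uint64_t popcnt = 0;

    while (bytes--)
        popcnt += (uint64_t) __builtin_popcount(*buf++ & mask);
    return popcnt;
}

int
main(void)
{
    /* Toy 2-byte "visibility map" covering 8 heap blocks, 2 bits per block */
    unsigned char map[2] = {0x57, 0x0c};

    /* Prints 5 all-visible blocks and 2 all-frozen blocks */
    printf("all-visible: %llu\n",
           (unsigned long long) popcount_masked(map, 2, VISIBLE_MASK8));
    printf("all-frozen:  %llu\n",
           (unsigned long long) popcount_masked(map, 2, FROZEN_MASK8));
    return 0;
}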

src/backend/access/heap/visibilitymap.c

Lines changed: 5 additions & 20 deletions
@@ -119,10 +119,8 @@
 #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
 
 /* Masks for counting subsets of bits in the visibility map. */
-#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each
-                                                        * bit pair */
-#define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa)  /* The upper bit of each
-                                                        * bit pair */
+#define VISIBLE_MASK8 (0x55)    /* The lower bit of each bit pair */
+#define FROZEN_MASK8 (0xaa)     /* The upper bit of each bit pair */
 
 /* prototypes for internal routines */
 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
@@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 {
     Buffer mapBuffer;
     uint64 *map;
-    int i;
 
     /*
      * Read till we fall off the end of the map. We assume that any extra
@@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
         */
        map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
 
-       StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0,
-                        "unsupported MAPSIZE");
-       if (all_frozen == NULL)
-       {
-           for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-               nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-       }
-       else
-       {
-           for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-           {
-               nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-               nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
-           }
-       }
+       nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
+       if (all_frozen)
+           nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
 
        ReleaseBuffer(mapBuffer);
    }

src/include/port/pg_bitutils.h

Lines changed: 34 additions & 0 deletions
@@ -303,6 +303,7 @@ pg_ceil_log2_64(uint64 num)
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
 
 /*
  * We can also try to use the AVX-512 popcount instruction on some systems.
@@ -313,13 +314,15 @@ extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 extern bool pg_popcount_avx512_available(void);
 extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 extern int pg_popcount32(uint32 word);
 extern int pg_popcount64(uint64 word);
 extern uint64 pg_popcount_optimized(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
 
 #endif                          /* TRY_POPCNT_FAST */
 
@@ -357,6 +360,37 @@ pg_popcount(const char *buf, int bytes)
     return pg_popcount_optimized(buf, bytes);
 }
 
+/*
+ * Returns the number of 1-bits in buf after applying the mask to each byte.
+ *
+ * Similar to pg_popcount(), we only take on the function pointer overhead when
+ * it's likely to be faster.
+ */
+static inline uint64
+pg_popcount_masked(const char *buf, int bytes, bits8 mask)
+{
+    /*
+     * We set the threshold to the point at which we'll first use special
+     * instructions in the optimized version.
+     */
+#if SIZEOF_VOID_P >= 8
+    int threshold = 8;
+#else
+    int threshold = 4;
+#endif
+
+    if (bytes < threshold)
+    {
+        uint64 popcnt = 0;
+
+        while (bytes--)
+            popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+        return popcnt;
+    }
+
+    return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
 /*
  * Rotate the bits of "word" to the right/left by n bits.
  */

src/port/pg_bitutils.c

Lines changed: 126 additions & 0 deletions
@@ -106,19 +106,23 @@ const uint8 pg_number_of_ones[256] = {
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
+static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
 
 #ifdef TRY_POPCNT_FAST
 static bool pg_popcount_available(void);
 static int pg_popcount32_choose(uint32 word);
 static int pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
 static inline int pg_popcount32_fast(uint32 word);
 static inline int pg_popcount64_fast(uint64 word);
 static uint64 pg_popcount_fast(const char *buf, int bytes);
+static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
 
 int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
 #endif                          /* TRY_POPCNT_FAST */
 
 #ifdef TRY_POPCNT_FAST
@@ -156,17 +160,22 @@ choose_popcount_functions(void)
         pg_popcount32 = pg_popcount32_fast;
         pg_popcount64 = pg_popcount64_fast;
         pg_popcount_optimized = pg_popcount_fast;
+        pg_popcount_masked_optimized = pg_popcount_masked_fast;
     }
     else
     {
         pg_popcount32 = pg_popcount32_slow;
         pg_popcount64 = pg_popcount64_slow;
         pg_popcount_optimized = pg_popcount_slow;
+        pg_popcount_masked_optimized = pg_popcount_masked_slow;
     }
 
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
     if (pg_popcount_avx512_available())
+    {
         pg_popcount_optimized = pg_popcount_avx512;
+        pg_popcount_masked_optimized = pg_popcount_masked_avx512;
+    }
 #endif
 }
 
@@ -191,6 +200,13 @@ pg_popcount_choose(const char *buf, int bytes)
     return pg_popcount_optimized(buf, bytes);
 }
 
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+    choose_popcount_functions();
+    return pg_popcount_masked(buf, bytes, mask);
+}
+
 /*
  * pg_popcount32_fast
  *      Return the number of 1 bits set in word
@@ -271,6 +287,56 @@ pg_popcount_fast(const char *buf, int bytes)
     return popcnt;
 }
 
+/*
+ * pg_popcount_masked_fast
+ *      Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+static uint64
+pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+{
+    uint64 popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+    /* Process in 64-bit chunks if the buffer is aligned */
+    uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+    if (buf == (const char *) TYPEALIGN(8, buf))
+    {
+        const uint64 *words = (const uint64 *) buf;
+
+        while (bytes >= 8)
+        {
+            popcnt += pg_popcount64_fast(*words++ & maskv);
+            bytes -= 8;
+        }
+
+        buf = (const char *) words;
+    }
+#else
+    /* Process in 32-bit chunks if the buffer is aligned. */
+    uint32 maskv = ~((uint32) 0) / 0xFF * mask;
+
+    if (buf == (const char *) TYPEALIGN(4, buf))
+    {
+        const uint32 *words = (const uint32 *) buf;
+
+        while (bytes >= 4)
+        {
+            popcnt += pg_popcount32_fast(*words++ & maskv);
+            bytes -= 4;
+        }
+
+        buf = (const char *) words;
+    }
+#endif
+
+    /* Process any remaining bytes */
+    while (bytes--)
+        popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+    return popcnt;
+}
+
 #endif                          /* TRY_POPCNT_FAST */
 
 
@@ -370,6 +436,56 @@ pg_popcount_slow(const char *buf, int bytes)
     return popcnt;
 }
 
+/*
+ * pg_popcount_masked_slow
+ *      Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+static uint64
+pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
+{
+    uint64 popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+    /* Process in 64-bit chunks if the buffer is aligned */
+    uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+    if (buf == (const char *) TYPEALIGN(8, buf))
+    {
+        const uint64 *words = (const uint64 *) buf;
+
+        while (bytes >= 8)
+        {
+            popcnt += pg_popcount64_slow(*words++ & maskv);
+            bytes -= 8;
+        }
+
+        buf = (const char *) words;
+    }
+#else
+    /* Process in 32-bit chunks if the buffer is aligned. */
+    uint32 maskv = ~((uint32) 0) / 0xFF * mask;
+
+    if (buf == (const char *) TYPEALIGN(4, buf))
+    {
+        const uint32 *words = (const uint32 *) buf;
+
+        while (bytes >= 4)
+        {
+            popcnt += pg_popcount32_slow(*words++ & maskv);
+            bytes -= 4;
+        }
+
+        buf = (const char *) words;
+    }
+#endif
+
+    /* Process any remaining bytes */
+    while (bytes--)
+        popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+    return popcnt;
+}
+
 #ifndef TRY_POPCNT_FAST
 
 /*
@@ -401,4 +517,14 @@ pg_popcount_optimized(const char *buf, int bytes)
     return pg_popcount_slow(buf, bytes);
 }
 
+/*
+ * pg_popcount_masked_optimized
+ *      Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+    return pg_popcount_masked_slow(buf, bytes, mask);
+}
+
 #endif                          /* !TRY_POPCNT_FAST */
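
A side note on the maskv computation that appears in both pg_popcount_masked_fast() and pg_popcount_masked_slow(): dividing an all-ones word by 0xFF yields 0x01 in every byte, so multiplying by the 8-bit mask broadcasts it across the whole word, letting the mask be applied 8 (or 4) bytes at a time. A small standalone check of that identity (illustrative only, not part of the commit):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
    uint8_t mask = 0x55;        /* e.g. VISIBLE_MASK8 */

    /* ~0 / 0xFF == 0x0101010101010101; multiplying repeats mask in every byte */
    uint64_t maskv64 = ~UINT64_C(0) / 0xFF * mask;
    uint32_t maskv32 = ~UINT32_C(0) / 0xFF * mask;

    assert(maskv64 == UINT64_C(0x5555555555555555));
    assert(maskv32 == UINT32_C(0x55555555));
    return 0;
}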

src/port/pg_popcount_avx512.c

Lines changed: 60 additions & 0 deletions
@@ -78,4 +78,64 @@ pg_popcount_avx512(const char *buf, int bytes)
     return _mm512_reduce_add_epi64(accum);
 }
 
+/*
+ * pg_popcount_masked_avx512
+ *      Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+uint64
+pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
+{
+    __m512i val,
+            vmasked,
+            cnt;
+    __m512i accum = _mm512_setzero_si512();
+    const char *final;
+    int tail_idx;
+    __mmask64 bmask = ~UINT64CONST(0);
+    const __m512i maskv = _mm512_set1_epi8(mask);
+
+    /*
+     * Align buffer down to avoid double load overhead from unaligned access.
+     * Calculate a mask to ignore preceding bytes. Find start offset of final
+     * iteration and ensure it is not empty.
+     */
+    bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
+    tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
+    final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
+    buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
+
+    /*
+     * Iterate through all but the final iteration. Starting from the second
+     * iteration, the mask is ignored.
+     */
+    if (buf < final)
+    {
+        val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
+        vmasked = _mm512_and_si512(val, maskv);
+        cnt = _mm512_popcnt_epi64(vmasked);
+        accum = _mm512_add_epi64(accum, cnt);
+
+        buf += sizeof(__m512i);
+        bmask = ~UINT64CONST(0);
+
+        for (; buf < final; buf += sizeof(__m512i))
+        {
+            val = _mm512_load_si512((const __m512i *) buf);
+            vmasked = _mm512_and_si512(val, maskv);
+            cnt = _mm512_popcnt_epi64(vmasked);
+            accum = _mm512_add_epi64(accum, cnt);
+        }
+    }
+
+    /* Final iteration needs to ignore bytes that are not within the length */
+    bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
+
+    val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
+    vmasked = _mm512_and_si512(val, maskv);
+    cnt = _mm512_popcnt_epi64(vmasked);
+    accum = _mm512_add_epi64(accum, cnt);
+
+    return _mm512_reduce_add_epi64(accum);
+}
+
 #endif                          /* TRY_POPCNT_FAST */
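
The AVX-512 variant above never reads byte by byte: it aligns the buffer down to a 64-byte lane and uses a per-byte load mask (__mmask64) to zero out the bytes before the start and past the end of the buffer. A scalar illustration of how those masks are formed, using a hypothetical buffer address and length chosen only for this example (illustrative, not part of the commit):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
    const int lane = 64;        /* sizeof(__m512i) */
    uintptr_t buf = 0x1003;     /* hypothetical address, 3 bytes past a lane boundary */
    int bytes = 20;             /* hypothetical length; fits within one lane */

    /* Drop the bytes of the lane that precede the buffer */
    uint64_t bmask = ~UINT64_C(0) << (buf % lane);

    /* Offset of the last valid byte within its lane, plus one */
    int tail_idx = (int) ((buf + bytes - 1) % lane) + 1;    /* = 23 */

    /* Drop the bytes of the lane that lie past the end of the buffer */
    bmask &= ~UINT64_C(0) >> (lane - tail_idx);

    /* Exactly the 20 bytes at lane offsets 3..22 remain enabled */
    assert(bmask == (((UINT64_C(1) << 20) - 1) << 3));
    return 0;
}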
