Skip to content

Commit cc4826d

Browse files
Inline pg_popcount{32,64} into pg_popcount().
On some systems, calls to pg_popcount{32,64} are indirected through a function pointer. This commit converts pg_popcount() to a function pointer on those systems so that we can inline the appropriate pg_popcount{32,64} implementations into each of the pg_popcount() implementations. Since pg_popcount() may call pg_popcount{32,64} several times, this can significantly improve its performance. Suggested-by: David Rowley Reviewed-by: David Rowley Discussion: https://postgr.es/m/CAApHDvrb7MJRB6JuKLDEY2x_LKdFHwVbogpjZBCX547i5%2BrXOQ%40mail.gmail.com
1 parent b7e2121 commit cc4826d

File tree

2 files changed

+121
-39
lines changed

2 files changed

+121
-39
lines changed

src/include/port/pg_bitutils.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -302,17 +302,16 @@ pg_ceil_log2_64(uint64 num)
302302
/* Attempt to use the POPCNT instruction, but perform a runtime check first */
303303
extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
304304
extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
305+
extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
305306

306307
#else
307308
/* Use a portable implementation -- no need for a function pointer. */
308309
extern int pg_popcount32(uint32 word);
309310
extern int pg_popcount64(uint64 word);
311+
extern uint64 pg_popcount(const char *buf, int bytes);
310312

311313
#endif /* TRY_POPCNT_FAST */
312314

313-
/* Count the number of one-bits in a byte array */
314-
extern uint64 pg_popcount(const char *buf, int bytes);
315-
316315
/*
317316
* Rotate the bits of "word" to the right/left by n bits.
318317
*/

src/port/pg_bitutils.c

Lines changed: 119 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -103,18 +103,22 @@ const uint8 pg_number_of_ones[256] = {
103103
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
104104
};
105105

106-
static int pg_popcount32_slow(uint32 word);
107-
static int pg_popcount64_slow(uint64 word);
106+
static inline int pg_popcount32_slow(uint32 word);
107+
static inline int pg_popcount64_slow(uint64 word);
108+
static uint64 pg_popcount_slow(const char *buf, int bytes);
108109

109110
#ifdef TRY_POPCNT_FAST
110111
static bool pg_popcount_available(void);
111112
static int pg_popcount32_choose(uint32 word);
112113
static int pg_popcount64_choose(uint64 word);
113-
static int pg_popcount32_fast(uint32 word);
114-
static int pg_popcount64_fast(uint64 word);
114+
static uint64 pg_popcount_choose(const char *buf, int bytes);
115+
static inline int pg_popcount32_fast(uint32 word);
116+
static inline int pg_popcount64_fast(uint64 word);
117+
static uint64 pg_popcount_fast(const char *buf, int bytes);
115118

116119
int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
117120
int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
121+
uint64 (*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose;
118122
#endif /* TRY_POPCNT_FAST */
119123

120124
#ifdef TRY_POPCNT_FAST
@@ -151,11 +155,13 @@ pg_popcount32_choose(uint32 word)
151155
{
152156
pg_popcount32 = pg_popcount32_fast;
153157
pg_popcount64 = pg_popcount64_fast;
158+
pg_popcount = pg_popcount_fast;
154159
}
155160
else
156161
{
157162
pg_popcount32 = pg_popcount32_slow;
158163
pg_popcount64 = pg_popcount64_slow;
164+
pg_popcount = pg_popcount_slow;
159165
}
160166

161167
return pg_popcount32(word);
@@ -168,21 +174,42 @@ pg_popcount64_choose(uint64 word)
168174
{
169175
pg_popcount32 = pg_popcount32_fast;
170176
pg_popcount64 = pg_popcount64_fast;
177+
pg_popcount = pg_popcount_fast;
171178
}
172179
else
173180
{
174181
pg_popcount32 = pg_popcount32_slow;
175182
pg_popcount64 = pg_popcount64_slow;
183+
pg_popcount = pg_popcount_slow;
176184
}
177185

178186
return pg_popcount64(word);
179187
}
180188

189+
static uint64
190+
pg_popcount_choose(const char *buf, int bytes)
191+
{
192+
if (pg_popcount_available())
193+
{
194+
pg_popcount32 = pg_popcount32_fast;
195+
pg_popcount64 = pg_popcount64_fast;
196+
pg_popcount = pg_popcount_fast;
197+
}
198+
else
199+
{
200+
pg_popcount32 = pg_popcount32_slow;
201+
pg_popcount64 = pg_popcount64_slow;
202+
pg_popcount = pg_popcount_slow;
203+
}
204+
205+
return pg_popcount(buf, bytes);
206+
}
207+
181208
/*
182209
* pg_popcount32_fast
183210
* Return the number of 1 bits set in word
184211
*/
185-
static int
212+
static inline int
186213
pg_popcount32_fast(uint32 word)
187214
{
188215
#ifdef _MSC_VER
@@ -199,7 +226,7 @@ __asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
199226
* pg_popcount64_fast
200227
* Return the number of 1 bits set in word
201228
*/
202-
static int
229+
static inline int
203230
pg_popcount64_fast(uint64 word)
204231
{
205232
#ifdef _MSC_VER
@@ -212,14 +239,60 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
212239
#endif
213240
}
214241

242+
/*
243+
* pg_popcount_fast
244+
* Returns the number of 1-bits in buf
245+
*/
246+
static uint64
247+
pg_popcount_fast(const char *buf, int bytes)
248+
{
249+
uint64 popcnt = 0;
250+
251+
#if SIZEOF_VOID_P >= 8
252+
/* Process in 64-bit chunks if the buffer is aligned. */
253+
if (buf == (const char *) TYPEALIGN(8, buf))
254+
{
255+
const uint64 *words = (const uint64 *) buf;
256+
257+
while (bytes >= 8)
258+
{
259+
popcnt += pg_popcount64_fast(*words++);
260+
bytes -= 8;
261+
}
262+
263+
buf = (const char *) words;
264+
}
265+
#else
266+
/* Process in 32-bit chunks if the buffer is aligned. */
267+
if (buf == (const char *) TYPEALIGN(4, buf))
268+
{
269+
const uint32 *words = (const uint32 *) buf;
270+
271+
while (bytes >= 4)
272+
{
273+
popcnt += pg_popcount32_fast(*words++);
274+
bytes -= 4;
275+
}
276+
277+
buf = (const char *) words;
278+
}
279+
#endif
280+
281+
/* Process any remaining bytes */
282+
while (bytes--)
283+
popcnt += pg_number_of_ones[(unsigned char) *buf++];
284+
285+
return popcnt;
286+
}
287+
215288
#endif /* TRY_POPCNT_FAST */
216289

217290

218291
/*
219292
* pg_popcount32_slow
220293
* Return the number of 1 bits set in word
221294
*/
222-
static int
295+
static inline int
223296
pg_popcount32_slow(uint32 word)
224297
{
225298
#ifdef HAVE__BUILTIN_POPCOUNT
@@ -241,7 +314,7 @@ pg_popcount32_slow(uint32 word)
241314
* pg_popcount64_slow
242315
* Return the number of 1 bits set in word
243316
*/
244-
static int
317+
static inline int
245318
pg_popcount64_slow(uint64 word)
246319
{
247320
#ifdef HAVE__BUILTIN_POPCOUNT
@@ -265,35 +338,12 @@ pg_popcount64_slow(uint64 word)
265338
#endif /* HAVE__BUILTIN_POPCOUNT */
266339
}
267340

268-
#ifndef TRY_POPCNT_FAST
269-
270341
/*
271-
* When the POPCNT instruction is not available, there's no point in using
272-
* function pointers to vary the implementation between the fast and slow
273-
* method. We instead just make these actual external functions when
274-
* TRY_POPCNT_FAST is not defined. The compiler should be able to inline
275-
* the slow versions here.
276-
*/
277-
int
278-
pg_popcount32(uint32 word)
279-
{
280-
return pg_popcount32_slow(word);
281-
}
282-
283-
int
284-
pg_popcount64(uint64 word)
285-
{
286-
return pg_popcount64_slow(word);
287-
}
288-
289-
#endif /* !TRY_POPCNT_FAST */
290-
291-
/*
292-
* pg_popcount
342+
* pg_popcount_slow
293343
* Returns the number of 1-bits in buf
294344
*/
295-
uint64
296-
pg_popcount(const char *buf, int bytes)
345+
static uint64
346+
pg_popcount_slow(const char *buf, int bytes)
297347
{
298348
uint64 popcnt = 0;
299349

@@ -305,7 +355,7 @@ pg_popcount(const char *buf, int bytes)
305355

306356
while (bytes >= 8)
307357
{
308-
popcnt += pg_popcount64(*words++);
358+
popcnt += pg_popcount64_slow(*words++);
309359
bytes -= 8;
310360
}
311361

@@ -319,7 +369,7 @@ pg_popcount(const char *buf, int bytes)
319369

320370
while (bytes >= 4)
321371
{
322-
popcnt += pg_popcount32(*words++);
372+
popcnt += pg_popcount32_slow(*words++);
323373
bytes -= 4;
324374
}
325375

@@ -333,3 +383,36 @@ pg_popcount(const char *buf, int bytes)
333383

334384
return popcnt;
335385
}
386+
387+
#ifndef TRY_POPCNT_FAST
388+
389+
/*
390+
* When the POPCNT instruction is not available, there's no point in using
391+
* function pointers to vary the implementation between the fast and slow
392+
* method. We instead just make these actual external functions when
393+
* TRY_POPCNT_FAST is not defined. The compiler should be able to inline
394+
* the slow versions here.
395+
*/
396+
int
397+
pg_popcount32(uint32 word)
398+
{
399+
return pg_popcount32_slow(word);
400+
}
401+
402+
int
403+
pg_popcount64(uint64 word)
404+
{
405+
return pg_popcount64_slow(word);
406+
}
407+
408+
/*
409+
* pg_popcount
410+
* Returns the number of 1-bits in buf
411+
*/
412+
uint64
413+
pg_popcount(const char *buf, int bytes)
414+
{
415+
return pg_popcount_slow(buf, bytes);
416+
}
417+
418+
#endif /* !TRY_POPCNT_FAST */

0 commit comments

Comments
 (0)