/*-------------------------------------------------------------------------
 *
 * pg_lfind.h
 *	  Optimized linear search routines.
 *
 * Copyright (c) 2022, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/include/port/pg_lfind.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef PG_LFIND_H
#define PG_LFIND_H

#include "port/simd.h"

| 18 | +/* |
| 19 | + * pg_lfind32 |
| 20 | + * |
| 21 | + * Return true if there is an element in 'base' that equals 'key', otherwise |
| 22 | + * return false. |
| 23 | + */ |
| 24 | +static inline bool |
| 25 | +pg_lfind32(uint32 key, uint32 *base, uint32 nelem) |
| 26 | +{ |
| 27 | + uint32 i = 0; |
| 28 | + |
| 29 | + /* Use SIMD intrinsics where available. */ |
| 30 | +#ifdef USE_SSE2 |
| 31 | + |
| 32 | + /* |
| 33 | + * A 16-byte register only has four 4-byte lanes. For better |
| 34 | + * instruction-level parallelism, each loop iteration operates on a block |
| 35 | + * of four registers. Testing has showed this is ~40% faster than using a |
| 36 | + * block of two registers. |
| 37 | + */ |
| 38 | + const __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */ |
| 39 | + uint32 iterations = nelem & ~0xF; /* round down to multiple of 16 */ |
| 40 | + |
| 41 | +#if defined(USE_ASSERT_CHECKING) |
| 42 | + bool assert_result = false; |
| 43 | + |
| 44 | + /* pre-compute the result for assert checking */ |
| 45 | + for (i = 0; i < nelem; i++) |
| 46 | + { |
| 47 | + if (key == base[i]) |
| 48 | + { |
| 49 | + assert_result = true; |
| 50 | + break; |
| 51 | + } |
| 52 | + } |
| 53 | +#endif |
| 54 | + |
| 55 | + for (i = 0; i < iterations; i += 16) |
| 56 | + { |
| 57 | + /* load the next block into 4 registers holding 4 values each */ |
| 58 | + const __m128i vals1 = _mm_loadu_si128((__m128i *) & base[i]); |
| 59 | + const __m128i vals2 = _mm_loadu_si128((__m128i *) & base[i + 4]); |
| 60 | + const __m128i vals3 = _mm_loadu_si128((__m128i *) & base[i + 8]); |
| 61 | + const __m128i vals4 = _mm_loadu_si128((__m128i *) & base[i + 12]); |
| 62 | + |
| 63 | + /* compare each value to the key */ |
| 64 | + const __m128i result1 = _mm_cmpeq_epi32(keys, vals1); |
| 65 | + const __m128i result2 = _mm_cmpeq_epi32(keys, vals2); |
| 66 | + const __m128i result3 = _mm_cmpeq_epi32(keys, vals3); |
| 67 | + const __m128i result4 = _mm_cmpeq_epi32(keys, vals4); |
| 68 | + |
| 69 | + /* combine the results into a single variable */ |
| 70 | + const __m128i tmp1 = _mm_or_si128(result1, result2); |
| 71 | + const __m128i tmp2 = _mm_or_si128(result3, result4); |
| 72 | + const __m128i result = _mm_or_si128(tmp1, tmp2); |
| 73 | + |
| 74 | + /* see if there was a match */ |
| 75 | + if (_mm_movemask_epi8(result) != 0) |
| 76 | + { |
| 77 | +#if defined(USE_ASSERT_CHECKING) |
| 78 | + Assert(assert_result == true); |
| 79 | +#endif |
| 80 | + return true; |
| 81 | + } |
| 82 | + } |
| 83 | +#endif /* USE_SSE2 */ |
| 84 | + |
| 85 | + /* Process the remaining elements one at a time. */ |
| 86 | + for (; i < nelem; i++) |
| 87 | + { |
| 88 | + if (key == base[i]) |
| 89 | + { |
| 90 | +#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING) |
| 91 | + Assert(assert_result == true); |
| 92 | +#endif |
| 93 | + return true; |
| 94 | + } |
| 95 | + } |
| 96 | + |
| 97 | +#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING) |
| 98 | + Assert(assert_result == false); |
| 99 | +#endif |
| 100 | + return false; |
| 101 | +} |

#endif							/* PG_LFIND_H */