Skip to content

Commit f8f19f7

Browse files
committed
Abstract some more architecture-specific details away from SIMD functionality
Add a typedef to represent vectors containing four 32-bit integers, and add functions operating on them. Also separate out saturating subtraction into its own function. The motivation for this is to prepare for a future commit to add ARM NEON support. Nathan Bossart Reviewed by John Naylor and Tom Lane Discussion: https://www.postgresql.org/message-id/flat/CAFBsxsEyR9JkfbPcDXBRYEfdfC__OkwVGdwEAgY4Rv0cvw35EA%40mail.gmail.com#aba7a64b11503494ffd8dd27067626a9
1 parent c6e0fe1 commit f8f19f7

File tree

2 files changed

+121
-43
lines changed

2 files changed

+121
-43
lines changed

src/include/port/pg_lfind.h

Lines changed: 39 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -91,16 +91,19 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
9191
{
9292
uint32 i = 0;
9393

94-
#ifdef USE_SSE2
94+
#ifndef USE_NO_SIMD
9595

9696
/*
97-
* A 16-byte register only has four 4-byte lanes. For better
98-
* instruction-level parallelism, each loop iteration operates on a block
99-
* of four registers. Testing has showed this is ~40% faster than using a
100-
* block of two registers.
97+
* For better instruction-level parallelism, each loop iteration operates
98+
* on a block of four registers. Testing for SSE2 has shown this is ~40%
99+
* faster than using a block of two registers.
101100
*/
102-
const __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */
103-
uint32 iterations = nelem & ~0xF; /* round down to multiple of 16 */
101+
const Vector32 keys = vector32_broadcast(key); /* load copies of key */
102+
const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
103+
const uint32 nelem_per_iteration = 4 * nelem_per_vector;
104+
105+
/* round down to multiple of elements per iteration */
106+
const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
104107

105108
#if defined(USE_ASSERT_CHECKING)
106109
bool assert_result = false;
@@ -116,49 +119,59 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
116119
}
117120
#endif
118121

119-
for (i = 0; i < iterations; i += 16)
122+
for (i = 0; i < tail_idx; i += nelem_per_iteration)
120123
{
121-
/* load the next block into 4 registers holding 4 values each */
122-
const __m128i vals1 = _mm_loadu_si128((__m128i *) & base[i]);
123-
const __m128i vals2 = _mm_loadu_si128((__m128i *) & base[i + 4]);
124-
const __m128i vals3 = _mm_loadu_si128((__m128i *) & base[i + 8]);
125-
const __m128i vals4 = _mm_loadu_si128((__m128i *) & base[i + 12]);
124+
Vector32 vals1,
125+
vals2,
126+
vals3,
127+
vals4,
128+
result1,
129+
result2,
130+
result3,
131+
result4,
132+
tmp1,
133+
tmp2,
134+
result;
135+
136+
/* load the next block into 4 registers */
137+
vector32_load(&vals1, &base[i]);
138+
vector32_load(&vals2, &base[i + nelem_per_vector]);
139+
vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
140+
vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
126141

127142
/* compare each value to the key */
128-
const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
129-
const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
130-
const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
131-
const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
143+
result1 = vector32_eq(keys, vals1);
144+
result2 = vector32_eq(keys, vals2);
145+
result3 = vector32_eq(keys, vals3);
146+
result4 = vector32_eq(keys, vals4);
132147

133148
/* combine the results into a single variable */
134-
const __m128i tmp1 = _mm_or_si128(result1, result2);
135-
const __m128i tmp2 = _mm_or_si128(result3, result4);
136-
const __m128i result = _mm_or_si128(tmp1, tmp2);
149+
tmp1 = vector32_or(result1, result2);
150+
tmp2 = vector32_or(result3, result4);
151+
result = vector32_or(tmp1, tmp2);
137152

138153
/* see if there was a match */
139-
if (_mm_movemask_epi8(result) != 0)
154+
if (vector8_is_highbit_set((Vector8) result))
140155
{
141-
#if defined(USE_ASSERT_CHECKING)
142156
Assert(assert_result == true);
143-
#endif
144157
return true;
145158
}
146159
}
147-
#endif /* USE_SSE2 */
160+
#endif /* ! USE_NO_SIMD */
148161

149162
/* Process the remaining elements one at a time. */
150163
for (; i < nelem; i++)
151164
{
152165
if (key == base[i])
153166
{
154-
#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
167+
#ifndef USE_NO_SIMD
155168
Assert(assert_result == true);
156169
#endif
157170
return true;
158171
}
159172
}
160173

161-
#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
174+
#ifndef USE_NO_SIMD
162175
Assert(assert_result == false);
163176
#endif
164177
return false;

src/include/port/simd.h

Lines changed: 82 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,22 +31,32 @@
3131
#include <emmintrin.h>
3232
#define USE_SSE2
3333
typedef __m128i Vector8;
34+
typedef __m128i Vector32;
3435

3536
#else
3637
/*
3738
* If no SIMD instructions are available, we can in some cases emulate vector
38-
* operations using bitwise operations on unsigned integers.
39+
* operations using bitwise operations on unsigned integers. Note that many
40+
* of the functions in this file presently do not have non-SIMD
41+
* implementations. In particular, none of the functions involving Vector32
42+
* are implemented without SIMD since it's likely not worthwhile to represent
43+
* two 32-bit integers using a uint64.
3944
*/
4045
#define USE_NO_SIMD
4146
typedef uint64 Vector8;
4247
#endif
4348

44-
4549
/* load/store operations */
4650
static inline void vector8_load(Vector8 *v, const uint8 *s);
51+
#ifndef USE_NO_SIMD
52+
static inline void vector32_load(Vector32 *v, const uint32 *s);
53+
#endif
4754

4855
/* assignment operations */
4956
static inline Vector8 vector8_broadcast(const uint8 c);
57+
#ifndef USE_NO_SIMD
58+
static inline Vector32 vector32_broadcast(const uint32 c);
59+
#endif
5060

5161
/* element-wise comparisons to a scalar */
5262
static inline bool vector8_has(const Vector8 v, const uint8 c);
@@ -56,14 +66,21 @@ static inline bool vector8_is_highbit_set(const Vector8 v);
5666

5767
/* arithmetic operations */
5868
static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
59-
60-
/* Different semantics for SIMD architectures. */
6169
#ifndef USE_NO_SIMD
70+
static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
71+
static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
72+
#endif
6273

63-
/* comparisons between vectors */
74+
/*
75+
* comparisons between vectors
76+
*
77+
* Note: These return a vector rather than a boolean, which is why we don't
78+
* have non-SIMD implementations.
79+
*/
80+
#ifndef USE_NO_SIMD
6481
static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
65-
66-
#endif /* ! USE_NO_SIMD */
82+
static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
83+
#endif
6784

6885
/*
6986
* Load a chunk of memory into the given vector.
@@ -78,6 +95,15 @@ vector8_load(Vector8 *v, const uint8 *s)
7895
#endif
7996
}
8097

98+
#ifndef USE_NO_SIMD
99+
static inline void
100+
vector32_load(Vector32 *v, const uint32 *s)
101+
{
102+
#ifdef USE_SSE2
103+
*v = _mm_loadu_si128((const __m128i *) s);
104+
#endif
105+
}
106+
#endif /* ! USE_NO_SIMD */
81107

82108
/*
83109
* Create a vector with all elements set to the same value.
@@ -92,6 +118,16 @@ vector8_broadcast(const uint8 c)
92118
#endif
93119
}
94120

121+
#ifndef USE_NO_SIMD
122+
static inline Vector32
123+
vector32_broadcast(const uint32 c)
124+
{
125+
#ifdef USE_SSE2
126+
return _mm_set1_epi32(c);
127+
#endif
128+
}
129+
#endif /* ! USE_NO_SIMD */
130+
95131
/*
96132
* Return true if any elements in the vector are equal to the given scalar.
97133
*/
@@ -118,7 +154,7 @@ vector8_has(const Vector8 v, const uint8 c)
118154
/* any bytes in v equal to c will evaluate to zero via XOR */
119155
result = vector8_has_zero(v ^ vector8_broadcast(c));
120156
#elif defined(USE_SSE2)
121-
result = _mm_movemask_epi8(_mm_cmpeq_epi8(v, vector8_broadcast(c)));
157+
result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
122158
#endif
123159

124160
Assert(assert_result == result);
@@ -133,8 +169,8 @@ vector8_has_zero(const Vector8 v)
133169
{
134170
#if defined(USE_NO_SIMD)
135171
/*
136-
* We cannot call vector8_has() here, because that would lead to a circular
137-
* definition.
172+
* We cannot call vector8_has() here, because that would lead to a
173+
* circular definition.
138174
*/
139175
return vector8_has_le(v, 0);
140176
#elif defined(USE_SSE2)
@@ -150,9 +186,6 @@ static inline bool
150186
vector8_has_le(const Vector8 v, const uint8 c)
151187
{
152188
bool result = false;
153-
#if defined(USE_SSE2)
154-
__m128i sub;
155-
#endif
156189

157190
/* pre-compute the result for assert checking */
158191
#ifdef USE_ASSERT_CHECKING
@@ -194,10 +227,10 @@ vector8_has_le(const Vector8 v, const uint8 c)
194227

195228
/*
196229
* Use saturating subtraction to find bytes <= c, which will present as
197-
* NUL bytes in 'sub'.
230+
* NUL bytes. This approach is a workaround for the lack of unsigned
231+
* comparison instructions on some architectures.
198232
*/
199-
sub = _mm_subs_epu8(v, vector8_broadcast(c));
200-
result = vector8_has_zero(sub);
233+
result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
201234
#endif
202235

203236
Assert(assert_result == result);
@@ -230,22 +263,54 @@ vector8_or(const Vector8 v1, const Vector8 v2)
230263
#endif
231264
}
232265

266+
#ifndef USE_NO_SIMD
267+
static inline Vector32
268+
vector32_or(const Vector32 v1, const Vector32 v2)
269+
{
270+
#ifdef USE_SSE2
271+
return _mm_or_si128(v1, v2);
272+
#endif
273+
}
274+
#endif /* ! USE_NO_SIMD */
233275

234-
/* Different semantics for SIMD architectures. */
276+
/*
277+
* Return the result of subtracting the respective elements of the input
278+
* vectors using saturation (i.e., if the operation would yield a value less
279+
* than zero, zero is returned instead). For more information on saturation
280+
* arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
281+
*/
235282
#ifndef USE_NO_SIMD
283+
static inline Vector8
284+
vector8_ssub(const Vector8 v1, const Vector8 v2)
285+
{
286+
#ifdef USE_SSE2
287+
return _mm_subs_epu8(v1, v2);
288+
#endif
289+
}
290+
#endif /* ! USE_NO_SIMD */
236291

237292
/*
238293
* Return a vector with all bits set in each lane where the corresponding
239294
* lanes in the inputs are equal.
240295
*/
296+
#ifndef USE_NO_SIMD
241297
static inline Vector8
242298
vector8_eq(const Vector8 v1, const Vector8 v2)
243299
{
244300
#ifdef USE_SSE2
245301
return _mm_cmpeq_epi8(v1, v2);
246302
#endif
247303
}
304+
#endif /* ! USE_NO_SIMD */
248305

306+
#ifndef USE_NO_SIMD
307+
static inline Vector32
308+
vector32_eq(const Vector32 v1, const Vector32 v2)
309+
{
310+
#ifdef USE_SSE2
311+
return _mm_cmpeq_epi32(v1, v2);
312+
#endif
313+
}
249314
#endif /* ! USE_NO_SIMD */
250315

251316
#endif /* SIMD_H */

0 commit comments

Comments
 (0)