Skip to content

Commit 169568c

Browse files
committed
Use map() to interleave instructions for parallelism
1 parent 9cc4ddf commit 169568c

File tree

7 files changed

+44
-51
lines changed

7 files changed

+44
-51
lines changed

src/arm64/simd_input.h

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,16 +69,12 @@ struct simd_input<Architecture::ARM64> {
6969

7070
really_inline uint64_t eq(uint8_t m) {
7171
const uint8x16_t mask = vmovq_n_u8(m);
72-
return this->map([&](uint8x16_t chunk) {
73-
return vceqq_u8(chunk, mask);
74-
}).to_bitmask();
72+
return this->MAP_BITMASK( vceqq_u8(chunk, mask) );
7573
}
7674

7775
really_inline uint64_t lteq(uint8_t m) {
7876
const uint8x16_t mask = vmovq_n_u8(m);
79-
return this->map([&](uint8x16_t chunk) {
80-
return vcleq_u8(chunk, mask);
81-
}).to_bitmask();
77+
return this->MAP_BITMASK( vcleq_u8(chunk, mask) );
8278
}
8379

8480
}; // struct simd_input

src/arm64/stage1_find_marks.h

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,10 @@ really_inline void find_whitespace_and_structurals(
3939
});
4040

4141
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
42-
structurals = v.map([&](auto chunk) {
43-
return vtstq_u8(chunk, structural_shufti_mask);
44-
}).to_bitmask();
42+
structurals = v.MAP_BITMASK( vtstq_u8(chunk, structural_shufti_mask) );
4543

4644
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
47-
whitespace = v.map([&](auto chunk) {
48-
return vtstq_u8(chunk, whitespace_shufti_mask);
49-
}).to_bitmask();
45+
whitespace = v.MAP_BITMASK( vtstq_u8(chunk, whitespace_shufti_mask) );
5046
}
5147

5248
#include "generic/stage1_find_marks_flatten.h"

src/haswell/simd_input.h

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ struct simd_input<Architecture::HASWELL> {
1818
this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
1919
}
2020

21-
really_inline simd_input(__m256i i0, __m256i i1) {
22-
this->lo = i0;
23-
this->hi = i1;
21+
really_inline simd_input(__m256i a_lo, __m256i a_hi) {
22+
this->lo = a_lo;
23+
this->hi = a_hi;
2424
}
2525

2626
template <typename F>
@@ -32,23 +32,19 @@ struct simd_input<Architecture::HASWELL> {
3232
}
3333

3434
really_inline uint64_t to_bitmask() {
35-
uint64_t r0 = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
36-
uint64_t r1 = _mm256_movemask_epi8(this->hi);
37-
return r0 | (r1 << 32);
35+
uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
36+
uint64_t r_hi = _mm256_movemask_epi8(this->hi);
37+
return r_lo | (r_hi << 32);
3838
}
3939

4040
really_inline uint64_t eq(uint8_t m) {
4141
const __m256i mask = _mm256_set1_epi8(m);
42-
return this->map([&] (auto chunk) {
43-
return _mm256_cmpeq_epi8(chunk, mask);
44-
}).to_bitmask();
42+
return this->MAP_BITMASK( _mm256_cmpeq_epi8(chunk, mask) );
4543
}
4644

4745
really_inline uint64_t lteq(uint8_t m) {
4846
const __m256i maxval = _mm256_set1_epi8(m);
49-
return this->map([&] (auto chunk) {
50-
return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval);
51-
}).to_bitmask();
47+
return this->MAP_BITMASK( _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval) );
5248
}
5349

5450
}; // struct simd_input

src/haswell/stage1_find_marks.h

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
5353
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_linefeed));
5454
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_tab));
5555
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_carriage));
56+
return space;
5657
}).to_bitmask();
5758
// end of naive approach
5859

@@ -69,15 +70,14 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
6970
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
7071
const __m256i struct_mask = _mm256_set1_epi8(32);
7172

72-
whitespace = in.map([&](auto chunk) {
73-
return _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk));
74-
}).to_bitmask();
75-
structurals = in.map([&](auto chunk) {
76-
__m256i struct_r1 = _mm256_add_epi8(struct_offset, chunk);
77-
__m256i struct_r2 = _mm256_or_si256(chunk, struct_mask);
78-
__m256i struct_r3 = _mm256_shuffle_epi8(structural_table, struct_r1);
79-
return _mm256_cmpeq_epi8(struct_r2, struct_r3);
80-
}).to_bitmask();
73+
whitespace = in.MAP_BITMASK( _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk)) );
74+
auto struct_r1 = in.MAP_CHUNKS( _mm256_add_epi8(struct_offset, chunk) );
75+
auto struct_r2 = in.MAP_CHUNKS( _mm256_or_si256(chunk, struct_mask) );
76+
auto struct_r3 = struct_r1.MAP_CHUNKS( _mm256_shuffle_epi8(structural_table, chunk) );
77+
structurals = simd_input<ARCHITECTURE>(
78+
_mm256_cmpeq_epi8(struct_r2.lo, struct_r3.lo),
79+
_mm256_cmpeq_epi8(struct_r2.hi, struct_r3.hi)
80+
).to_bitmask();
8181

8282
#endif // else SIMDJSON_NAIVE_STRUCTURAL
8383
}

src/simd_input.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,23 @@
88

99
namespace simdjson {
1010

11-
template <Architecture>
11+
template <Architecture T>
1212
struct simd_input {
1313
simd_input(const uint8_t *ptr);
14+
// Map through each simd register in this input, producing another simd_input.
15+
template <typename F>
16+
really_inline simd_input<T> map(F const& map_chunk);
17+
// turn this bytemask (usually the result of a simd comparison operation) into a bitmask.
18+
uint64_t to_bitmask();
1419
// a straightforward comparison of a mask against input.
1520
uint64_t eq(uint8_t m);
1621
// find all values less than or equal to m (using unsigned arithmetic)
1722
uint64_t lteq(uint8_t m);
1823
}; // struct simd_input
1924

25+
#define MAP_CHUNKS(EXPR) map([&](auto chunk) { return EXPR; })
26+
#define MAP_BITMASK(EXPR) map([&](auto chunk) { return EXPR; }).to_bitmask()
27+
2028
} // namespace simdjson
2129

2230
#endif

src/westmere/simd_input.h

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,24 +42,20 @@ struct simd_input<Architecture::WESTMERE> {
4242

4343
really_inline uint64_t to_bitmask() {
4444
uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(this->v0));
45-
uint64_t r1 = _mm_movemask_epi8(this->v0);
45+
uint64_t r1 = _mm_movemask_epi8(this->v1);
4646
uint64_t r2 = _mm_movemask_epi8(this->v2);
4747
uint64_t r3 = _mm_movemask_epi8(this->v3);
4848
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
4949
}
5050

5151
really_inline uint64_t eq(uint8_t m) {
5252
const __m128i mask = _mm_set1_epi8(m);
53-
return this->map([&](auto chunk) {
54-
return _mm_cmpeq_epi8(chunk, mask);
55-
}).to_bitmask();
53+
return this->MAP_BITMASK( _mm_cmpeq_epi8(chunk, mask) );
5654
}
5755

5856
really_inline uint64_t lteq(uint8_t m) {
5957
const __m128i maxval = _mm_set1_epi8(m);
60-
return this->map([&](auto chunk) {
61-
return _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval);
62-
}).to_bitmask();
58+
return this->MAP_BITMASK( _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval) );
6359
}
6460

6561
}; // struct simd_input

src/westmere/stage1_find_marks.h

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,17 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
2828
const __m128i struct_offset = _mm_set1_epi8(0xd4u);
2929
const __m128i struct_mask = _mm_set1_epi8(32);
3030

31-
whitespace = in.map([&](auto chunk) {
32-
return _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk));
33-
}).to_bitmask();
34-
35-
structurals = in.map([&](auto chunk) {
36-
__m128i struct_r1 = _mm_add_epi8(struct_offset, chunk);
37-
__m128i struct_r2 = _mm_or_si128(chunk, struct_mask);
38-
__m128i struct_r3 = _mm_shuffle_epi8(structural_table, struct_r1);
39-
return _mm_cmpeq_epi8(struct_r2, struct_r3);
40-
}).to_bitmask();
31+
whitespace = in.MAP_BITMASK( _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk)) );
32+
33+
auto r1 = in.MAP_CHUNKS( _mm_add_epi8(struct_offset, chunk) );
34+
auto r2 = in.MAP_CHUNKS( _mm_or_si128(chunk, struct_mask) );
35+
auto r3 = r1.MAP_CHUNKS( _mm_shuffle_epi8(structural_table, chunk) );
36+
structurals = simd_input<ARCHITECTURE>(
37+
_mm_cmpeq_epi8(r2.v0, r3.v0),
38+
_mm_cmpeq_epi8(r2.v1, r3.v1),
39+
_mm_cmpeq_epi8(r2.v2, r3.v2),
40+
_mm_cmpeq_epi8(r2.v3, r3.v3)
41+
).to_bitmask();
4142
}
4243

4344
#include "generic/stage1_find_marks_flatten.h"

0 commit comments

Comments
 (0)