Skip to content

Commit b012225

Browse files
committed
Genericize bitmask building to make algorithms clearer
1 parent 2060cf8 commit b012225

File tree

4 files changed

+90
-136
lines changed

4 files changed

+90
-136
lines changed

src/haswell/simd_input.h

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,25 @@ struct simd_input<Architecture::HASWELL> {
1818
this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
1919
}
2020

21+
template <typename F>
22+
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
23+
uint64_t r0 = static_cast<uint32_t>(_mm256_movemask_epi8(chunk_to_mask(this->lo)));
24+
uint64_t r1 = _mm256_movemask_epi8(chunk_to_mask(this->hi));
25+
return r0 | (r1 << 32);
26+
}
27+
2128
really_inline uint64_t eq(uint8_t m) {
2229
const __m256i mask = _mm256_set1_epi8(m);
23-
__m256i cmp_res_0 = _mm256_cmpeq_epi8(this->lo, mask);
24-
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
25-
__m256i cmp_res_1 = _mm256_cmpeq_epi8(this->hi, mask);
26-
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
27-
return res_0 | (res_1 << 32);
30+
return this->build_bitmask([&] (auto chunk) {
31+
return _mm256_cmpeq_epi8(chunk, mask);
32+
});
2833
}
2934

3035
really_inline uint64_t lteq(uint8_t m) {
3136
const __m256i maxval = _mm256_set1_epi8(m);
32-
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->lo), maxval);
33-
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
34-
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->hi), maxval);
35-
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
36-
return res_0 | (res_1 << 32);
37+
return this->build_bitmask([&] (auto chunk) {
38+
return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, chunk), maxval);
39+
});
3740
}
3841

3942
}; // struct simd_input

src/haswell/stage1_find_marks.h

Lines changed: 52 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -25,77 +25,60 @@ static really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTUR
2525
uint64_t &whitespace, uint64_t &structurals) {
2626

2727
#ifdef SIMDJSON_NAIVE_STRUCTURAL
28-
// You should never need this naive approach, but it can be useful
29-
// for research purposes
30-
const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
31-
__m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace);
32-
__m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace);
33-
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
34-
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_brace));
35-
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_brace));
36-
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
37-
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_open_bracket));
38-
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_open_bracket));
39-
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
40-
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_bracket));
41-
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_bracket));
42-
const __m256i mask_column = _mm256_set1_epi8(0x3a);
43-
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_column));
44-
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_column));
45-
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
46-
struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_comma));
47-
struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_comma));
48-
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
49-
uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi);
50-
structurals = (structural_res_0 | (structural_res_1 << 32));
51-
52-
const __m256i mask_space = _mm256_set1_epi8(0x20);
53-
__m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space);
54-
__m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space);
55-
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
56-
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_linefeed));
57-
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_linefeed));
58-
const __m256i mask_tab = _mm256_set1_epi8(0x09);
59-
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_tab));
60-
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_tab));
61-
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
62-
space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_carriage));
63-
space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_carriage));
64-
65-
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(space_lo));
66-
uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi);
67-
whitespace = (ws_res_0 | (ws_res_1 << 32));
68-
// end of naive approach
28+
29+
// You should never need this naive approach, but it can be useful
30+
// for research purposes
31+
const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
32+
const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
33+
const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
34+
const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
35+
const __m256i mask_column = _mm256_set1_epi8(0x3a);
36+
const __m256i mask_comma = _mm256_set1_epi8(0x2c);
37+
structurals = in->build_bitmask([&](auto in) {
38+
__m256i structurals = _mm256_cmpeq_epi8(in, mask_open_brace);
39+
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_brace));
40+
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_open_bracket));
41+
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_close_bracket));
42+
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_column));
43+
structurals = _mm256_or_si256(structurals, _mm256_cmpeq_epi8(in, mask_comma));
44+
return structurals;
45+
});
46+
47+
const __m256i mask_space = _mm256_set1_epi8(0x20);
48+
const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
49+
const __m256i mask_tab = _mm256_set1_epi8(0x09);
50+
const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
51+
whitespace = in->build_bitmask([&](auto in) {
52+
__m256i space = _mm256_cmpeq_epi8(in, mask_space);
53+
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_linefeed));
54+
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_tab));
55+
space = _mm256_or_si256(space, _mm256_cmpeq_epi8(in, mask_carriage));
56+
});
57+
// end of naive approach
6958

7059
#else // SIMDJSON_NAIVE_STRUCTURAL
71-
// clang-format off
72-
const __m256i structural_table =
73-
_mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
74-
44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
75-
const __m256i white_table = _mm256_setr_epi8(
76-
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
77-
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
78-
// clang-format on
79-
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
80-
const __m256i struct_mask = _mm256_set1_epi8(32);
81-
82-
__m256i lo_white = _mm256_cmpeq_epi8(in.lo, _mm256_shuffle_epi8(white_table, in.lo));
83-
__m256i hi_white = _mm256_cmpeq_epi8(in.hi, _mm256_shuffle_epi8(white_table, in.hi));
84-
uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_white));
85-
uint64_t ws_res_1 = _mm256_movemask_epi8(hi_white);
86-
whitespace = (ws_res_0 | (ws_res_1 << 32));
87-
__m256i lo_struct_r1 = _mm256_add_epi8(struct_offset, in.lo);
88-
__m256i hi_struct_r1 = _mm256_add_epi8(struct_offset, in.hi);
89-
__m256i lo_struct_r2 = _mm256_or_si256(in.lo, struct_mask);
90-
__m256i hi_struct_r2 = _mm256_or_si256(in.hi, struct_mask);
91-
__m256i lo_struct_r3 = _mm256_shuffle_epi8(structural_table, lo_struct_r1);
92-
__m256i hi_struct_r3 = _mm256_shuffle_epi8(structural_table, hi_struct_r1);
93-
__m256i lo_struct = _mm256_cmpeq_epi8(lo_struct_r2, lo_struct_r3);
94-
__m256i hi_struct = _mm256_cmpeq_epi8(hi_struct_r2, hi_struct_r3);
95-
96-
uint64_t structural_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_struct));
97-
uint64_t structural_res_1 = _mm256_movemask_epi8(hi_struct);
98-
structurals = (structural_res_0 | (structural_res_1 << 32));
60+
61+
// clang-format off
62+
const __m256i structural_table =
63+
_mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
64+
44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
65+
const __m256i white_table = _mm256_setr_epi8(
66+
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
67+
32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
68+
// clang-format on
69+
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
70+
const __m256i struct_mask = _mm256_set1_epi8(32);
71+
72+
whitespace = in.build_bitmask([&](auto chunk) {
73+
return _mm256_cmpeq_epi8(chunk, _mm256_shuffle_epi8(white_table, chunk));
74+
});
75+
structurals = in.build_bitmask([&](auto chunk) {
76+
__m256i struct_r1 = _mm256_add_epi8(struct_offset, chunk);
77+
__m256i struct_r2 = _mm256_or_si256(chunk, struct_mask);
78+
__m256i struct_r3 = _mm256_shuffle_epi8(structural_table, struct_r1);
79+
return _mm256_cmpeq_epi8(struct_r2, struct_r3);
80+
});
81+
9982
#endif // else SIMDJSON_NAIVE_STRUCTURAL
10083
}
10184

src/westmere/simd_input.h

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,30 +22,27 @@ struct simd_input<Architecture::WESTMERE> {
2222
this->v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
2323
}
2424

25+
template <typename F>
26+
really_inline uint64_t build_bitmask(F const& chunk_to_mask) {
27+
uint64_t r0 = static_cast<uint32_t>(_mm_movemask_epi8(chunk_to_mask(this->v0)));
28+
uint64_t r1 = _mm_movemask_epi8(chunk_to_mask(this->v1));
29+
uint64_t r2 = _mm_movemask_epi8(chunk_to_mask(this->v2));
30+
uint64_t r3 = _mm_movemask_epi8(chunk_to_mask(this->v3));
31+
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
32+
}
33+
2534
really_inline uint64_t eq(uint8_t m) {
2635
const __m128i mask = _mm_set1_epi8(m);
27-
__m128i cmp_res_0 = _mm_cmpeq_epi8(this->v0, mask);
28-
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
29-
__m128i cmp_res_1 = _mm_cmpeq_epi8(this->v1, mask);
30-
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
31-
__m128i cmp_res_2 = _mm_cmpeq_epi8(this->v2, mask);
32-
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
33-
__m128i cmp_res_3 = _mm_cmpeq_epi8(this->v3, mask);
34-
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
35-
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
36+
return this->build_bitmask([&](auto chunk) {
37+
return _mm_cmpeq_epi8(chunk, mask);
38+
});
3639
}
3740

3841
really_inline uint64_t lteq(uint8_t m) {
3942
const __m128i maxval = _mm_set1_epi8(m);
40-
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v0), maxval);
41-
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
42-
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v1), maxval);
43-
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
44-
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v2), maxval);
45-
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
46-
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v3), maxval);
47-
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
48-
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
43+
return this->build_bitmask([&](auto chunk) {
44+
return _mm_cmpeq_epi8(_mm_max_epu8(maxval, chunk), maxval);
45+
});
4946
}
5047

5148
}; // struct simd_input

src/westmere/stage1_find_marks.h

Lines changed: 10 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -28,45 +28,16 @@ static really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTUR
2828
const __m128i struct_offset = _mm_set1_epi8(0xd4u);
2929
const __m128i struct_mask = _mm_set1_epi8(32);
3030

31-
__m128i white0 = _mm_cmpeq_epi8(in.v0, _mm_shuffle_epi8(white_table, in.v0));
32-
__m128i white1 = _mm_cmpeq_epi8(in.v1, _mm_shuffle_epi8(white_table, in.v1));
33-
__m128i white2 = _mm_cmpeq_epi8(in.v2, _mm_shuffle_epi8(white_table, in.v2));
34-
__m128i white3 = _mm_cmpeq_epi8(in.v3, _mm_shuffle_epi8(white_table, in.v3));
35-
uint64_t ws_res_0 = _mm_movemask_epi8(white0);
36-
uint64_t ws_res_1 = _mm_movemask_epi8(white1);
37-
uint64_t ws_res_2 = _mm_movemask_epi8(white2);
38-
uint64_t ws_res_3 = _mm_movemask_epi8(white3);
39-
40-
whitespace =
41-
(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48));
42-
43-
__m128i struct1_r1 = _mm_add_epi8(struct_offset, in.v0);
44-
__m128i struct2_r1 = _mm_add_epi8(struct_offset, in.v1);
45-
__m128i struct3_r1 = _mm_add_epi8(struct_offset, in.v2);
46-
__m128i struct4_r1 = _mm_add_epi8(struct_offset, in.v3);
47-
48-
__m128i struct1_r2 = _mm_or_si128(in.v0, struct_mask);
49-
__m128i struct2_r2 = _mm_or_si128(in.v1, struct_mask);
50-
__m128i struct3_r2 = _mm_or_si128(in.v2, struct_mask);
51-
__m128i struct4_r2 = _mm_or_si128(in.v3, struct_mask);
52-
53-
__m128i struct1_r3 = _mm_shuffle_epi8(structural_table, struct1_r1);
54-
__m128i struct2_r3 = _mm_shuffle_epi8(structural_table, struct2_r1);
55-
__m128i struct3_r3 = _mm_shuffle_epi8(structural_table, struct3_r1);
56-
__m128i struct4_r3 = _mm_shuffle_epi8(structural_table, struct4_r1);
57-
58-
__m128i struct1 = _mm_cmpeq_epi8(struct1_r2, struct1_r3);
59-
__m128i struct2 = _mm_cmpeq_epi8(struct2_r2, struct2_r3);
60-
__m128i struct3 = _mm_cmpeq_epi8(struct3_r2, struct3_r3);
61-
__m128i struct4 = _mm_cmpeq_epi8(struct4_r2, struct4_r3);
62-
63-
uint64_t structural_res_0 = _mm_movemask_epi8(struct1);
64-
uint64_t structural_res_1 = _mm_movemask_epi8(struct2);
65-
uint64_t structural_res_2 = _mm_movemask_epi8(struct3);
66-
uint64_t structural_res_3 = _mm_movemask_epi8(struct4);
67-
68-
structurals = (structural_res_0 | (structural_res_1 << 16) |
69-
(structural_res_2 << 32) | (structural_res_3 << 48));
31+
whitespace = in.build_bitmask([&](auto chunk) {
32+
return _mm_cmpeq_epi8(chunk, _mm_shuffle_epi8(white_table, chunk));
33+
});
34+
35+
structurals = in.build_bitmask([&](auto chunk) {
36+
__m128i struct_r1 = _mm_add_epi8(struct_offset, chunk);
37+
__m128i struct_r2 = _mm_or_si128(chunk, struct_mask);
38+
__m128i struct_r3 = _mm_shuffle_epi8(structural_table, struct_r1);
39+
return _mm_cmpeq_epi8(struct_r2, struct_r3);
40+
});
7041
}
7142

7243
#include "generic/stage1_find_marks_flatten.h"

0 commit comments

Comments
 (0)