
Commit aef3f4b

Merge pull request simdjson#296 from lemire/wide_mask
Genericize bitmask building to make algorithms clearer
2 parents 2060cf8 + bf80838 commit aef3f4b

13 files changed, 380 additions and 416 deletions

Makefile

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-REFERENCE_VERSION = v0.2.1
+REFERENCE_VERSION = master
 
 .SUFFIXES:
 #

scripts/checkperf.sh

Lines changed: 1 addition & 0 deletions
@@ -27,4 +27,5 @@ make parse
 make perfdiff
 
 echo "Running perfdiff:"
+echo ./perfdiff \"$current/parse -t $perftests\" \"$reference/parse -t $perftests\"
 ./perfdiff "$current/parse -t $perftests" "$reference/parse -t $perftests"

singleheader/amalgamation_demo.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-/* auto-generated on Sun Aug 18 15:06:50 DST 2019. Do not edit! */
+/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
 
 #include <iostream>
 #include "simdjson.h"

singleheader/simdjson.cpp

Lines changed: 166 additions & 218 deletions
Large diffs are not rendered by default.

singleheader/simdjson.h

Lines changed: 6 additions & 2 deletions
@@ -1,4 +1,4 @@
-/* auto-generated on Sun Aug 18 15:06:50 DST 2019. Do not edit! */
+/* auto-generated on Fri Aug 23 11:02:39 DST 2019. Do not edit! */
 /* begin file include/simdjson/simdjson_version.h */
 // /include/simdjson/simdjson_version.h automatically generated by release.py,
 // do not change by hand
@@ -36438,13 +36438,17 @@ class ParsedJson::BasicIterator {
   // (in case of repeated keys, this only finds the first one).
   // We seek the key using C's strcmp so if your JSON strings contain
   // NULL chars, this would trigger a false positive: if you expect that
-  // to be the case, take extra precautions.
+  // to be the case, take extra precautions.
+  // Furthermore, we do the comparison character-by-character
+  // without taking into account Unicode equivalence.
   inline bool move_to_key(const char *key);
   // when at {, go one level deep, looking for a given key
   // if successful, we are left pointing at the value,
   // if not, we are still pointing at the object ({)
   // (in case of repeated keys, this only finds the first one).
   // The string we search for can contain NULL values.
+  // Furthermore, we do the comparison character-by-character
+  // without taking into account Unicode equivalence.
   inline bool move_to_key(const char *key, uint32_t length);
 
   // when at a key location within an object, this moves to the accompanying
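The new comments document byte-wise key matching. A minimal usage sketch of move_to_key follows, assuming the simdjson::ParsedJson::Iterator and build_parsed_json entry points of this vintage (those names are not shown in this diff, so treat them as assumptions):

#include <cstring>
#include <cstdio>
#include "simdjson.h"

int main() {
  const char *json = "{\"name\":\"simdjson\"}";
  simdjson::ParsedJson pj = simdjson::build_parsed_json(json, std::strlen(json));
  if (!pj.is_valid()) { return 1; }
  simdjson::ParsedJson::Iterator it(pj);
  // Matching is byte-by-byte: no Unicode normalization is applied, and the
  // strcmp-based overload stops at the first NUL byte; prefer the
  // (key, length) overload when keys may contain embedded NULs.
  if (it.move_to_key("name") && it.is_string()) {
    std::printf("%s\n", it.get_string());
  }
  return 0;
}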

src/arm64/simd_input.h

Lines changed: 37 additions & 10 deletions
@@ -46,22 +46,49 @@ struct simd_input<Architecture::ARM64> {
     this->i3 = vld1q_u8(ptr + 48);
   }
 
+  really_inline simd_input(uint8x16_t a0, uint8x16_t a1, uint8x16_t a2, uint8x16_t a3) {
+    this->i0 = a0;
+    this->i1 = a1;
+    this->i2 = a2;
+    this->i3 = a3;
+  }
+
+  template <typename F>
+  really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
+    return simd_input<Architecture::ARM64>(
+      map_chunk(this->i0),
+      map_chunk(this->i1),
+      map_chunk(this->i2),
+      map_chunk(this->i3)
+    );
+  }
+
+  template <typename F>
+  really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) {
+    return simd_input<Architecture::ARM64>(
+      map_chunk(this->i0, b.i0),
+      map_chunk(this->i1, b.i1),
+      map_chunk(this->i2, b.i2),
+      map_chunk(this->i3, b.i3)
+    );
+  }
+
+  really_inline uint64_t to_bitmask() {
+    return neon_movemask_bulk(this->i0, this->i1, this->i2, this->i3);
+  }
+
   really_inline uint64_t eq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->map( [&](auto a) {
+      return vceqq_u8(a, mask);
+    }).to_bitmask();
   }
 
   really_inline uint64_t lteq(uint8_t m) {
     const uint8x16_t mask = vmovq_n_u8(m);
-    uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
-    uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
-    uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
-    uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
-    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+    return this->map( [&](auto a) {
+      return vcleq_u8(a, mask);
+    }).to_bitmask();
   }
 
 }; // struct simd_input
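The new map()/to_bitmask() pair replaces the repeated four-chunk boilerplate with a single expression. As an illustration of the pattern (a hypothetical free function, not part of this commit, equivalent to calling in.eq('\\')):

// Hypothetical sketch built on the members added above.
really_inline uint64_t backslash_mask(simd_input<Architecture::ARM64> in) {
  const uint8x16_t mask = vmovq_n_u8('\\');
  return in.map([&](auto chunk) {
    return vceqq_u8(chunk, mask);  // 0xFF where the byte equals '\', else 0x00
  }).to_bitmask();                 // collapse the four 16-byte results into 64 bits
}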

src/arm64/stage1_find_marks.h

Lines changed: 15 additions & 39 deletions
@@ -12,7 +12,7 @@
 
 namespace simdjson::arm64 {
 
-static really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
 
 #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
   return vmull_p64(-1ULL, quote_bits);
@@ -21,52 +21,28 @@ static really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
 #endif
 }
 
-static really_inline void find_whitespace_and_structurals(
+really_inline void find_whitespace_and_structurals(
     simd_input<ARCHITECTURE> in, uint64_t &whitespace,
     uint64_t &structurals) {
   const uint8x16_t low_nibble_mask =
       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
   const uint8x16_t high_nibble_mask =
       (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
-  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
   const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
 
-  uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
-  uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
-  uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
-  uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
-  uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
-
-  uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
-  uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
-  uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
-  uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
-  uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
-
-  uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
-  uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
-  uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
-  uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
-  uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
-
-  uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
-  uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
-  uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
-  uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
-  uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
-
-  uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
-  uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
-  uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
-  uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
-  structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
-
-  uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
-  uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
-  whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
+  auto v = in.map([&](auto chunk) {
+    uint8x16_t nib_lo = vandq_u8(chunk, low_nib_and_mask);
+    uint8x16_t nib_hi = vshrq_n_u8(chunk, 4);
+    uint8x16_t shuf_lo = vqtbl1q_u8(low_nibble_mask, nib_lo);
+    uint8x16_t shuf_hi = vqtbl1q_u8(high_nibble_mask, nib_hi);
+    return vandq_u8(shuf_lo, shuf_hi);
+  });
+
+  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
+  structurals = MAP_BITMASK( v, vtstq_u8(_v, structural_shufti_mask) );
+
+  const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
+  whitespace = MAP_BITMASK( v, vtstq_u8(_v, whitespace_shufti_mask) );
 }
 
 #include "generic/stage1_find_marks_flatten.h"

src/generic/stage1_find_marks_flatten.h

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 // This is just a naive implementation. It should be normally
 // disable, but can be used for research purposes to compare
 // again our optimized version.
-static really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
+really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
   uint32_t *out_ptr = base_ptr + base;
   idx -= 64;
   while (bits != 0) {
@@ -26,7 +26,7 @@ static really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint3
 // base_ptr[base] incrementing base as we go
 // will potentially store extra values beyond end of valid bits, so base_ptr
 // needs to be large enough to handle this
-static really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
+really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
   // In some instances, the next branch is expensive because it is mispredicted.
   // Unfortunately, in other cases,
   // it helps tremendously.
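For context, flatten_bits expands a 64-bit structural mask into byte indices appended at base_ptr[base]. The first hunk above shows only the opening lines of the naive variant; a plausible completion, assuming simdjson's trailing_zeroes() helper (illustration only, not the code elided from the hunk):

// Naive sketch: write the absolute position of every set bit of `bits`.
// `idx` arrives pointing one past the 64-byte block, hence the idx -= 64.
really_inline void flatten_bits_naive(uint32_t *base_ptr, uint32_t &base,
                                      uint32_t idx, uint64_t bits) {
  uint32_t *out_ptr = base_ptr + base;
  idx -= 64;
  while (bits != 0) {
    *out_ptr = idx + trailing_zeroes(bits); // index of the lowest set bit
    bits = bits & (bits - 1);               // clear that bit
    out_ptr++;
  }
  base = out_ptr - base_ptr;
}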

src/haswell/simd_input.h

Lines changed: 33 additions & 10 deletions
@@ -18,22 +18,45 @@ struct simd_input<Architecture::HASWELL> {
     this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
   }
 
+  really_inline simd_input(__m256i a_lo, __m256i a_hi) {
+    this->lo = a_lo;
+    this->hi = a_hi;
+  }
+
+  template <typename F>
+  really_inline simd_input<Architecture::HASWELL> map(F const& map_chunk) {
+    return simd_input<Architecture::HASWELL>(
+      map_chunk(this->lo),
+      map_chunk(this->hi)
+    );
+  }
+
+  template <typename F>
+  really_inline simd_input<Architecture::HASWELL> map(simd_input<Architecture::HASWELL> b, F const& map_chunk) {
+    return simd_input<Architecture::HASWELL>(
+      map_chunk(this->lo, b.lo),
+      map_chunk(this->hi, b.hi)
+    );
+  }
+
+  really_inline uint64_t to_bitmask() {
+    uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
+    uint64_t r_hi = _mm256_movemask_epi8(this->hi);
+    return r_lo | (r_hi << 32);
+  }
+
   really_inline uint64_t eq(uint8_t m) {
     const __m256i mask = _mm256_set1_epi8(m);
-    __m256i cmp_res_0 = _mm256_cmpeq_epi8(this->lo, mask);
-    uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
-    __m256i cmp_res_1 = _mm256_cmpeq_epi8(this->hi, mask);
-    uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
-    return res_0 | (res_1 << 32);
+    return this->map( [&](auto a) {
+      return _mm256_cmpeq_epi8(a, mask);
+    }).to_bitmask();
   }
 
   really_inline uint64_t lteq(uint8_t m) {
     const __m256i maxval = _mm256_set1_epi8(m);
-    __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->lo), maxval);
-    uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
-    __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->hi), maxval);
-    uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
-    return res_0 | (res_1 << 32);
+    return this->map( [&](auto a) {
+      return _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, a), maxval);
+    }).to_bitmask();
   }
 
 }; // struct simd_input
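Two details of the Haswell version are easy to miss. _mm256_movemask_epi8 returns a signed int, so the low half is cast through uint32_t before widening; otherwise sign extension would set the upper 32 bits (the high half needs no cast because the subsequent left shift by 32 discards any extended bits). And because AVX2 has no unsigned byte comparison, lteq computes a <= m as max(m, a) == m. A scalar model of the combining step, with a hypothetical helper name (illustration only):

#include <cstdint>

// Scalar model of to_bitmask()'s combining step. mask_lo/mask_hi stand in for
// the raw signed results of _mm256_movemask_epi8 on the two 32-byte halves.
static inline uint64_t combine_masks(int mask_lo, int mask_hi) {
  uint64_t lo = static_cast<uint32_t>(mask_lo); // zero-extend: bytes 0..31
  uint64_t hi = static_cast<uint32_t>(mask_hi); // bytes 32..63
  return lo | (hi << 32);
}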
