Skip to content

Commit f7e8936

Browse files
jkeiser authored and lemire committed
Use simd_input generic methods for utf8 checking (simdjson#301)
* Use generic each/reduce in simdutf8check
* Remove macros from generic simd_input uses
* Use array instead of members to store simd registers
* Default local checkperf to clone from .
1 parent 5765c81 commit f7e8936

12 files changed

+154
-111
lines changed

.drone.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ steps:
1414
- make amalgamate
1515
- name: checkperf
1616
image: gcc:8
17+
environment:
18+
CHECKPERF_REPOSITORY: https://github.com/lemire/simdjson
1719
commands:
1820
- make checkperf
1921
---
@@ -33,6 +35,8 @@ steps:
3335
- make amalgamate
3436
- name: checkperf
3537
image: gcc:8
38+
environment:
39+
CHECKPERF_REPOSITORY: https://github.com/lemire/simdjson
3640
commands:
3741
- make checkperf
3842
---

scripts/checkperf.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
set -e
44
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
55

6+
# Fall back to the current directory as the clone source when the caller has not set a repository.
# The space before the closing bracket is required: without it, a non-empty value swallows the
# bracket and `[` aborts with "missing ]" instead of evaluating the test.
if [ -z "$CHECKPERF_REPOSITORY" ]; then CHECKPERF_REPOSITORY=.; fi
7+
68
# Arguments: checkperf.sh <branch> <test json files>
79
if [ -z "$1" ]; then reference_branch="master"; else reference_branch=$1; shift; fi
810
if [ -z "$*" ]; then perftests="jsonexamples/twitter.json"; else perftests=$*; fi
@@ -13,7 +15,7 @@ current=$SCRIPTPATH/..
1315
reference=$current/benchbranch/$reference_branch
1416
rm -rf $reference
1517
mkdir -p $reference
16-
git clone --depth 1 -b $reference_branch https://github.com/lemire/simdjson $reference
18+
git clone --depth 1 -b $reference_branch $CHECKPERF_REPOSITORY $reference
1719
cd $reference
1820
make parse
1921

src/arm64/simd_input.h

Lines changed: 42 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
#ifdef IS_ARM64
77

8-
namespace simdjson {
8+
namespace simdjson::arm64 {
99

1010
really_inline uint16_t neon_movemask(uint8x16_t input) {
1111
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
@@ -32,49 +32,68 @@ really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
3232
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
3333
}
3434

35+
} // namespace simdjson::arm64
36+
37+
namespace simdjson {
38+
39+
using namespace simdjson::arm64;
40+
3541
template <>
3642
struct simd_input<Architecture::ARM64> {
37-
uint8x16_t i0;
38-
uint8x16_t i1;
39-
uint8x16_t i2;
40-
uint8x16_t i3;
43+
uint8x16_t chunks[4];
4144

4245
really_inline simd_input(const uint8_t *ptr) {
43-
this->i0 = vld1q_u8(ptr + 0);
44-
this->i1 = vld1q_u8(ptr + 16);
45-
this->i2 = vld1q_u8(ptr + 32);
46-
this->i3 = vld1q_u8(ptr + 48);
46+
this->chunks[0] = vld1q_u8(ptr + 0*16);
47+
this->chunks[1] = vld1q_u8(ptr + 1*16);
48+
this->chunks[2] = vld1q_u8(ptr + 2*16);
49+
this->chunks[3] = vld1q_u8(ptr + 3*16);
4750
}
4851

49-
really_inline simd_input(uint8x16_t a0, uint8x16_t a1, uint8x16_t a2, uint8x16_t a3) {
50-
this->i0 = a0;
51-
this->i1 = a1;
52-
this->i2 = a2;
53-
this->i3 = a3;
52+
really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) {
53+
this->chunks[0] = chunk0;
54+
this->chunks[1] = chunk1;
55+
this->chunks[2] = chunk2;
56+
this->chunks[3] = chunk3;
57+
}
58+
59+
template <typename F>
60+
really_inline void each(F const& each_chunk)
61+
{
62+
each_chunk(this->chunks[0]);
63+
each_chunk(this->chunks[1]);
64+
each_chunk(this->chunks[2]);
65+
each_chunk(this->chunks[3]);
5466
}
5567

5668
template <typename F>
5769
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
5870
return simd_input<Architecture::ARM64>(
59-
map_chunk(this->i0),
60-
map_chunk(this->i1),
61-
map_chunk(this->i2),
62-
map_chunk(this->i3)
71+
map_chunk(this->chunks[0]),
72+
map_chunk(this->chunks[1]),
73+
map_chunk(this->chunks[2]),
74+
map_chunk(this->chunks[3])
6375
);
6476
}
6577

6678
template <typename F>
6779
really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) {
6880
return simd_input<Architecture::ARM64>(
69-
map_chunk(this->i0, b.i0),
70-
map_chunk(this->i1, b.i1),
71-
map_chunk(this->i2, b.i2),
72-
map_chunk(this->i3, b.i3)
81+
map_chunk(this->chunks[0], b.chunks[0]),
82+
map_chunk(this->chunks[1], b.chunks[1]),
83+
map_chunk(this->chunks[2], b.chunks[2]),
84+
map_chunk(this->chunks[3], b.chunks[3])
7385
);
7486
}
7587

88+
template <typename F>
89+
really_inline uint8x16_t reduce(F const& reduce_pair) {
90+
uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]);
91+
uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]);
92+
return reduce_pair(r01, r23);
93+
}
94+
7695
really_inline uint64_t to_bitmask() {
77-
return neon_movemask_bulk(this->i0, this->i1, this->i2, this->i3);
96+
return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]);
7897
}
7998

8099
really_inline uint64_t eq(uint8_t m) {

src/arm64/simdutf8check.h

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -181,11 +181,11 @@ check_utf8_bytes(int8x16_t current_bytes, struct processed_utf_bytes *previous,
181181
really_inline bool check_ascii_neon(simd_input<Architecture::ARM64> in) {
182182
// checking if the most significant bit is always equal to 0.
183183
uint8x16_t high_bit = vdupq_n_u8(0x80);
184-
uint8x16_t t0 = vorrq_u8(in.i0, in.i1);
185-
uint8x16_t t1 = vorrq_u8(in.i2, in.i3);
186-
uint8x16_t t3 = vorrq_u8(t0, t1);
187-
uint8x16_t t4 = vandq_u8(t3, high_bit);
188-
uint64x2_t v64 = vreinterpretq_u64_u8(t4);
184+
uint8x16_t any_bits_on = in.reduce([&](auto a, auto b) {
185+
return vorrq_u8(a, b);
186+
});
187+
uint8x16_t high_bit_on = vandq_u8(any_bits_on, high_bit);
188+
uint64x2_t v64 = vreinterpretq_u64_u8(high_bit_on);
189189
uint32x2_t v32 = vqmovn_u64(v64);
190190
uint64x1_t result = vreinterpret_u64_u32(v32);
191191
return vget_lane_u64(result, 0) == 0;
@@ -215,14 +215,9 @@ struct utf8_checker<Architecture::ARM64> {
215215
this->has_error);
216216
} else {
217217
// it is not ascii so we have to do heavy work
218-
this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0),
219-
&(this->previous), &(this->has_error));
220-
this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1),
221-
&(this->previous), &(this->has_error));
222-
this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2),
223-
&(this->previous), &(this->has_error));
224-
this->previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3),
225-
&(this->previous), &(this->has_error));
218+
in.each([&](auto _in) {
219+
this->previous = check_utf8_bytes(vreinterpretq_s8_u8(_in), &(this->previous), &(this->has_error));
220+
});
226221
}
227222
}
228223

src/arm64/stage1_find_marks.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,14 @@ really_inline void find_whitespace_and_structurals(
3939
});
4040

4141
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
42-
structurals = MAP_BITMASK( v, vtstq_u8(_v, structural_shufti_mask) );
42+
structurals = v.map([&](auto _v) {
43+
return vtstq_u8(_v, structural_shufti_mask);
44+
}).to_bitmask();
4345

4446
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
45-
whitespace = MAP_BITMASK( v, vtstq_u8(_v, whitespace_shufti_mask) );
47+
whitespace = v.map([&](auto _v) {
48+
return vtstq_u8(_v, whitespace_shufti_mask);
49+
}).to_bitmask();
4650
}
4751

4852
#include "generic/stage1_find_marks_flatten.h"

src/haswell/simd_input.h

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,38 +10,51 @@ namespace simdjson {
1010

1111
template <>
1212
struct simd_input<Architecture::HASWELL> {
13-
__m256i lo;
14-
__m256i hi;
13+
__m256i chunks[2];
1514

16-
really_inline simd_input(const uint8_t *ptr) {
17-
this->lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
18-
this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
15+
really_inline simd_input(const uint8_t *ptr)
16+
{
17+
this->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32));
18+
this->chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32));
1919
}
2020

21-
really_inline simd_input(__m256i a_lo, __m256i a_hi) {
22-
this->lo = a_lo;
23-
this->hi = a_hi;
21+
really_inline simd_input(__m256i chunk0, __m256i chunk1)
22+
{
23+
this->chunks[0] = chunk0;
24+
this->chunks[1] = chunk1;
25+
}
26+
27+
template <typename F>
28+
really_inline void each(F const& each_chunk)
29+
{
30+
each_chunk(this->chunks[0]);
31+
each_chunk(this->chunks[1]);
2432
}
2533

2634
template <typename F>
2735
really_inline simd_input<Architecture::HASWELL> map(F const& map_chunk) {
2836
return simd_input<Architecture::HASWELL>(
29-
map_chunk(this->lo),
30-
map_chunk(this->hi)
37+
map_chunk(this->chunks[0]),
38+
map_chunk(this->chunks[1])
3139
);
3240
}
3341

3442
template <typename F>
3543
really_inline simd_input<Architecture::HASWELL> map(simd_input<Architecture::HASWELL> b, F const& map_chunk) {
3644
return simd_input<Architecture::HASWELL>(
37-
map_chunk(this->lo, b.lo),
38-
map_chunk(this->hi, b.hi)
45+
map_chunk(this->chunks[0], b.chunks[0]),
46+
map_chunk(this->chunks[1], b.chunks[1])
3947
);
4048
}
4149

50+
template <typename F>
51+
really_inline __m256i reduce(F const& reduce_pair) {
52+
return reduce_pair(this->chunks[0], this->chunks[1]);
53+
}
54+
4255
really_inline uint64_t to_bitmask() {
43-
uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->lo));
44-
uint64_t r_hi = _mm256_movemask_epi8(this->hi);
56+
uint64_t r_lo = static_cast<uint32_t>(_mm256_movemask_epi8(this->chunks[0]));
57+
uint64_t r_hi = _mm256_movemask_epi8(this->chunks[1]);
4558
return r_lo | (r_hi << 32);
4659
}
4760

src/haswell/simdutf8check.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,10 @@ struct utf8_checker<Architecture::HASWELL> {
215215

216216
really_inline void check_next_input(simd_input<Architecture::HASWELL> in) {
217217
__m256i high_bit = _mm256_set1_epi8(0x80u);
218-
if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) {
218+
__m256i any_bits_on = in.reduce([&](auto a, auto b) {
219+
return _mm256_or_si256(a, b);
220+
});
221+
if ((_mm256_testz_si256(any_bits_on, high_bit)) == 1) {
219222
// it is ascii, we just check continuation
220223
this->has_error = _mm256_or_si256(
221224
_mm256_cmpgt_epi8(this->previous.carried_continuations,
@@ -225,10 +228,9 @@ struct utf8_checker<Architecture::HASWELL> {
225228
this->has_error);
226229
} else {
227230
// it is not ascii so we have to do heavy work
228-
this->previous =
229-
avx_check_utf8_bytes(in.lo, &(this->previous), &(this->has_error));
230-
this->previous =
231-
avx_check_utf8_bytes(in.hi, &(this->previous), &(this->has_error));
231+
in.each([&](auto _in) {
232+
this->previous = avx_check_utf8_bytes(_in, &(this->previous), &(this->has_error));
233+
});
232234
}
233235
}
234236

src/haswell/stage1_find_marks.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
7070
const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
7171
const __m256i struct_mask = _mm256_set1_epi8(32);
7272

73-
whitespace = MAP_BITMASK( in, _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in)) );
73+
whitespace = in.map([&](auto _in) {
74+
return _mm256_cmpeq_epi8(_in, _mm256_shuffle_epi8(white_table, _in));
75+
}).to_bitmask();
7476

7577
structurals = in.map([&](auto _in) {
7678
const __m256i r1 = _mm256_add_epi8(struct_offset, _in);

src/simd_input.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,24 @@
44
#include "simdjson/common_defs.h"
55
#include "simdjson/portability.h"
66
#include "simdjson/simdjson.h"
7-
#include <cassert>
87

98
namespace simdjson {
109

1110
template <Architecture T>
1211
struct simd_input {
1312
simd_input(const uint8_t *ptr);
13+
// Run an operation on each chunk.
14+
template <typename F>
15+
really_inline void each(F const& each_chunk);
1416
// Map through each simd register in this input, producing another simd_input.
1517
template <typename F>
1618
really_inline simd_input<T> map(F const& map_chunk);
1719
// Map through each simd register across two inputs, producing a single simd_input.
1820
template <typename F>
1921
really_inline simd_input<T> map(simd_input<T> b, F const& map_chunk);
22+
// Run a horizontal operation like "sum" across the whole input
23+
// template <typename F>
24+
// really_inline simd<T> reduce(F const& reduce_pair);
2025
// turn this bytemask (usually the result of a simd comparison operation) into a bitmask.
2126
uint64_t to_bitmask();
2227
// a straightforward comparison of a mask against input.
@@ -25,11 +30,6 @@ struct simd_input {
2530
uint64_t lteq(uint8_t m);
2631
}; // struct simd_input
2732

28-
#define MAP_CHUNKS(A, EXPR) A.map([&](auto _##A) { return (EXPR); })
29-
#define MAP_BITMASK(A, EXPR) MAP_CHUNKS(A, EXPR).to_bitmask()
30-
#define MAP_CHUNKS2(A, B, EXPR) A.map((B), [&](auto _##A, auto _##B) { return (EXPR); })
31-
#define MAP_BITMASK2(A, B, EXPR) MAP_CHUNKS2(A, B, EXPR).to_bitmask()
32-
3333
} // namespace simdjson
3434

3535
#endif

0 commit comments

Comments
 (0)