@@ -14,38 +14,72 @@ using namespace simd;
14
14
struct json_character_block {
15
15
static simdjson_really_inline json_character_block classify (const simd::simd8x64<uint8_t >& in);
16
16
// ASCII white-space ('\r','\n','\t',' ')
17
- simdjson_really_inline uint64_t whitespace () const { return _whitespace; }
17
+ simdjson_really_inline uint64_t whitespace () const ;
18
18
// non-quote structural characters (comma, colon, braces, brackets)
19
- simdjson_really_inline uint64_t op () const { return _op; }
19
+ simdjson_really_inline uint64_t op () const ;
20
20
// neither a structural character nor a white-space, so letters, numbers and quotes
21
- simdjson_really_inline uint64_t scalar () { return ~( op () | whitespace ()); }
21
+ simdjson_really_inline uint64_t scalar () const ;
22
22
23
23
uint64_t _whitespace; // ASCII white-space ('\r','\n','\t',' ')
24
24
uint64_t _op; // structural characters (comma, colon, braces, brackets but not quotes)
25
25
};
26
26
27
+ simdjson_really_inline uint64_t json_character_block::whitespace () const { return _whitespace; }
28
+ simdjson_really_inline uint64_t json_character_block::op () const { return _op; }
29
+ simdjson_really_inline uint64_t json_character_block::scalar () const { return ~(op () | whitespace ()); }
30
+
27
31
// This identifies structural characters (comma, colon, braces, brackets),
28
32
// and ASCII white-space ('\r','\n','\t',' ').
29
33
simdjson_really_inline json_character_block json_character_block::classify (const simd::simd8x64<uint8_t >& in) {
30
34
// These lookups rely on the fact that anything < 128 will match the lower 4 bits, which is why
31
35
// we can't use the generic lookup_16.
32
- auto whitespace_table = simd8<uint8_t >::repeat_16 (' ' , 100 , 100 , 100 , 17 , 100 , 113 , 2 , 100 , ' \t ' , ' \n ' , 112 , 100 , ' \r ' , 100 , 100 );
33
- auto op_table = simd8<uint8_t >::repeat_16 (' ,' , ' }' , 0 , 0 , 0xc0u , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , ' :' , ' {' );
34
-
35
- // We compute whitespace and op separately. If the code later only use one or the
36
+ const auto whitespace_table = simd8<uint8_t >::repeat_16 (' ' , 100 , 100 , 100 , 17 , 100 , 113 , 2 , 100 , ' \t ' , ' \n ' , 112 , 100 , ' \r ' , 100 , 100 );
37
+
38
+ // The 6 operators (:,[]{}) have these values:
39
+ //
40
+ // , 2C
41
+ // : 3A
42
+ // [ 5B
43
+ // { 7B
44
+ // ] 5D
45
+ // } 7D
46
+ //
47
+ // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character are unique.
48
+ // We exploit this, using a simd 4-bit lookup to tell us which character to match against, and then
49
+ // match it (against | 0x20).
50
+ //
51
+ // To prevent recognizing other characters, everything else gets compared with 0, which cannot
52
+ // match due to the | 0x20.
53
+ //
54
+ // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
55
+ // and :. This gets caught in stage 2, which checks the actual character to ensure the right
56
+ // operators are in the right places.
57
+ const auto op_table = simd8<uint8_t >::repeat_16 (
58
+ 0 , 0 , 0 , 0 ,
59
+ 0 , 0 , 0 , 0 ,
60
+ 0 , 0 , ' :' , ' {' , // : = 3A, [ = 5B, { = 7B
61
+ ' ,' , ' }' , 0 , 0 // , = 2C, ] = 5D, } = 7D
62
+ );
63
+
64
+ // We compute whitespace and op separately. If later code only uses one or the
36
65
// other, given the fact that all functions are aggressively inlined, we can
37
66
// hope that useless computations will be omitted. This is notably the case when
38
67
// minifying (we only need whitespace).
39
68
40
- uint64_t whitespace = simd8x64<bool >(
41
- in.chunks [0 ] == simd8<uint8_t >(_mm256_shuffle_epi8 (whitespace_table, in.chunks [0 ])),
42
- in.chunks [1 ] == simd8<uint8_t >(_mm256_shuffle_epi8 (whitespace_table, in.chunks [1 ]))
43
- ).to_bitmask ();
69
+ const uint64_t whitespace = in.eq ({
70
+ _mm256_shuffle_epi8 (whitespace_table, in.chunks [0 ]),
71
+ _mm256_shuffle_epi8 (whitespace_table, in.chunks [1 ])
72
+ });
73
+ // Turn [ and ] into { and }
74
+ const simd8x64<uint8_t > curlified{
75
+ in.chunks [0 ] | 0x20 ,
76
+ in.chunks [1 ] | 0x20
77
+ };
78
+ const uint64_t op = curlified.eq ({
79
+ _mm256_shuffle_epi8 (op_table, in.chunks [0 ]),
80
+ _mm256_shuffle_epi8 (op_table, in.chunks [1 ])
81
+ });
44
82
45
- uint64_t op = simd8x64<bool >(
46
- (in.chunks [0 ] | 32 ) == simd8<uint8_t >(_mm256_shuffle_epi8 (op_table, in.chunks [0 ]-' ,' )),
47
- (in.chunks [1 ] | 32 ) == simd8<uint8_t >(_mm256_shuffle_epi8 (op_table, in.chunks [1 ]-' ,' ))
48
- ).to_bitmask ();
49
83
return { whitespace, op };
50
84
}
51
85
0 commit comments