@@ -14,25 +14,54 @@ using namespace simd;
14
14
struct json_character_block {
15
15
static simdjson_really_inline json_character_block classify (const simd::simd8x64<uint8_t >& in);
16
16
// ASCII white-space ('\r','\n','\t',' ')
17
- simdjson_really_inline uint64_t whitespace () const { return _whitespace; }
17
+ simdjson_really_inline uint64_t whitespace () const ;
18
18
// non-quote structural characters (comma, colon, braces, brackets)
19
- simdjson_really_inline uint64_t op () const { return _op; }
19
+ simdjson_really_inline uint64_t op () const ;
20
20
// neither a structural character nor a white-space, so letters, numbers and quotes
21
- simdjson_really_inline uint64_t scalar () { return ~( op () | whitespace ()); }
21
+ simdjson_really_inline uint64_t scalar () const ;
22
22
23
23
uint64_t _whitespace; // ASCII white-space ('\r','\n','\t',' ')
24
24
uint64_t _op; // structural characters (comma, colon, braces, brackets but not quotes)
25
25
};
26
26
27
+ simdjson_really_inline uint64_t json_character_block::whitespace () const { return _whitespace; }
28
+ simdjson_really_inline uint64_t json_character_block::op () const { return _op; }
29
+ simdjson_really_inline uint64_t json_character_block::scalar () const { return ~(op () | whitespace ()); }
30
+
27
31
// This identifies structural characters (comma, colon, braces, brackets),
28
32
// and ASCII white-space ('\r','\n','\t',' ').
29
33
simdjson_really_inline json_character_block json_character_block::classify (const simd::simd8x64<uint8_t >& in) {
30
34
// These lookups rely on the fact that anything <= 127 will match the lower 4 bits, which is why
31
35
// we can't use the generic lookup_16.
32
36
auto whitespace_table = simd8<uint8_t >::repeat_16 (' ' , 100 , 100 , 100 , 17 , 100 , 113 , 2 , 100 , ' \t ' , ' \n ' , 112 , 100 , ' \r ' , 100 , 100 );
33
- auto op_table = simd8<uint8_t >::repeat_16 (' ,' , ' }' , 0 , 0 , 0xc0u , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , ' :' , ' {' );
34
37
35
- // We compute whitespace and op separately. If the code later only use one or the
38
+ // The 6 operators (:,[]{}) have these values:
39
+ //
40
+ // , 2C
41
+ // : 3A
42
+ // [ 5B
43
+ // { 7B
44
+ // ] 5D
45
+ // } 7D
46
+ //
47
+ // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character are unique.
48
+ // We exploit this, using a simd 4-bit lookup to tell us which character to match against, and then
49
+ // match it (against the input | 0x20).
50
+ //
51
+ // To prevent recognizing other characters, everything else gets compared with 0, which cannot
52
+ // match due to the | 0x20.
53
+ //
54
+ // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
55
+ // and :. This gets caught in stage 2, which checks the actual character to ensure the right
56
+ // operators are in the right places.
57
+ auto op_table = simd8<uint8_t >::repeat_16 (
58
+ 0 , 0 , 0 , 0 ,
59
+ 0 , 0 , 0 , 0 ,
60
+ 0 , 0 , ' :' , ' {' , // : = 3A, [ = 5B, { = 7B
61
+ ' ,' , ' }' , 0 , 0 // , = 2C, ] = 5D, } = 7D
62
+ );
63
+
64
+ // We compute whitespace and op separately. If later code only uses one or the
36
65
// other, given the fact that all functions are aggressively inlined, we can
37
66
// hope that useless computations will be omitted. This is notably the case when
38
67
// minifying (we only need whitespace).
@@ -43,8 +72,8 @@ simdjson_really_inline json_character_block json_character_block::classify(const
43
72
).to_bitmask ();
44
73
45
74
uint64_t op = simd8x64<bool >(
46
- (in.chunks [0 ] | 32 ) == simd8<uint8_t >(_mm256_shuffle_epi8 (op_table, in.chunks [0 ]- ' , ' )),
47
- (in.chunks [1 ] | 32 ) == simd8<uint8_t >(_mm256_shuffle_epi8 (op_table, in.chunks [1 ]- ' , ' ))
75
+ (in.chunks [0 ] | 0x20 ) == simd8<uint8_t >(_mm256_shuffle_epi8 (op_table, in.chunks [0 ])),
76
+ (in.chunks [1 ] | 0x20 ) == simd8<uint8_t >(_mm256_shuffle_epi8 (op_table, in.chunks [1 ]))
48
77
).to_bitmask ();
49
78
return { whitespace, op };
50
79
}
0 commit comments