Skip to content

Commit 7d7bec8

Browse files
committed
Remove lookup_lower_4_bits
It's only a coincidence that it works in current uses: it doesn't do what the name says. Particularly, if the high bit is 1 it will yield 0 even if the lower 4 bits would yield something else.
1 parent c5504ef commit 7d7bec8

File tree

5 files changed

+94
-60
lines changed

5 files changed

+94
-60
lines changed

src/arm64/simd.h

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,16 @@ namespace simdjson::arm64::simd {
9191
v0, v1, v2, v3, v4, v5, v6, v7,
9292
v8, v9, v10,v11,v12,v13,v14,v15
9393
}) {}
94+
// Repeat 16 values as many times as necessary (usually for lookup tables)
95+
really_inline static simd8<uint8_t> repeat_16(
96+
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
97+
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
98+
) {
99+
return simd8<uint8_t>(
100+
v0, v1, v2, v3, v4, v5, v6, v7,
101+
v8, v9, v10,v11,v12,v13,v14,v15
102+
);
103+
}
94104

95105
// Store to array
96106
really_inline void store(uint8_t dst[16]) { return vst1q_u8(dst, *this); }
@@ -119,7 +129,7 @@ namespace simdjson::arm64::simd {
119129
template<int N>
120130
really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
121131

122-
// Perform a lookup assuming no value is larger than 16
132+
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
123133
template<typename L>
124134
really_inline simd8<L> lookup_16(
125135
L replace0, L replace1, L replace2, L replace3,
@@ -135,23 +145,9 @@ namespace simdjson::arm64::simd {
135145
return lookup_table.apply_lookup_16_to(*this);
136146
}
137147

138-
// Perform a lookup of the lower 4 bits
139-
template<typename L>
140-
really_inline simd8<L> lookup_lower_4_bits(
141-
L replace0, L replace1, L replace2, L replace3,
142-
L replace4, L replace5, L replace6, L replace7,
143-
L replace8, L replace9, L replace10, L replace11,
144-
L replace12, L replace13, L replace14, L replace15) const {
145-
return (*this & 0xF).lookup_16(
146-
replace0, replace1, replace2, replace3,
147-
replace4, replace5, replace6, replace7,
148-
replace8, replace9, replace10, replace11,
149-
replace12, replace13, replace14, replace15
150-
);
151-
}
152-
153-
really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<uint8_t> original) {
154-
return vqtbl1q_u8(*this, original);
148+
template<typename T>
149+
really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
150+
return vqtbl1q_u8(*this, simd8<uint8_t>(original));
155151
}
156152
};
157153

@@ -183,6 +179,16 @@ namespace simdjson::arm64::simd {
183179
v0, v1, v2, v3, v4, v5, v6, v7,
184180
v8, v9, v10,v11,v12,v13,v14,v15
185181
}) {}
182+
// Repeat 16 values as many times as necessary (usually for lookup tables)
183+
really_inline static simd8<int8_t> repeat_16(
184+
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
185+
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
186+
) {
187+
return simd8<int8_t>(
188+
v0, v1, v2, v3, v4, v5, v6, v7,
189+
v8, v9, v10,v11,v12,v13,v14,v15
190+
);
191+
}
186192

187193
// Store to array
188194
really_inline void store(int8_t dst[16]) { return vst1q_s8(dst, *this); }
@@ -223,7 +229,8 @@ namespace simdjson::arm64::simd {
223229
);
224230
}
225231

226-
really_inline simd8<int8_t> apply_lookup_16_to(const simd8<uint8_t> original) {
232+
template<typename T>
233+
really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
227234
return vqtbl1q_s8(*this, original);
228235
}
229236
};

src/haswell/simd.h

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -90,39 +90,23 @@ namespace simdjson::haswell::simd {
9090
really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *this; }
9191
really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *this; }
9292

93-
// Perform a lookup of the lower 4 bits
93+
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
9494
template<typename L>
95-
really_inline simd8<L> lookup_lower_4_bits(
96-
L replace0, L replace1, L replace2, L replace3,
97-
L replace4, L replace5, L replace6, L replace7,
98-
L replace8, L replace9, L replace10, L replace11,
99-
L replace12, L replace13, L replace14, L replace15) const {
100-
simd8<L> lookup_table(
101-
replace0, replace1, replace2, replace3,
102-
replace4, replace5, replace6, replace7,
103-
replace8, replace9, replace10, replace11,
104-
replace12, replace13, replace14, replace15,
105-
replace0, replace1, replace2, replace3,
106-
replace4, replace5, replace6, replace7,
107-
replace8, replace9, replace10, replace11,
108-
replace12, replace13, replace14, replace15
109-
);
95+
really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
11096
return _mm256_shuffle_epi8(lookup_table, *this);
11197
}
112-
113-
// Perform a lookup assuming the value is between 0 and 16
11498
template<typename L>
11599
really_inline simd8<L> lookup_16(
116100
L replace0, L replace1, L replace2, L replace3,
117101
L replace4, L replace5, L replace6, L replace7,
118102
L replace8, L replace9, L replace10, L replace11,
119103
L replace12, L replace13, L replace14, L replace15) const {
120-
return lookup_lower_4_bits(
104+
return lookup_16(simd8<L>::repeat_16(
121105
replace0, replace1, replace2, replace3,
122106
replace4, replace5, replace6, replace7,
123107
replace8, replace9, replace10, replace11,
124108
replace12, replace13, replace14, replace15
125-
);
109+
));
126110
}
127111
};
128112

@@ -147,6 +131,18 @@ namespace simdjson::haswell::simd {
147131
v16,v17,v18,v19,v20,v21,v22,v23,
148132
v24,v25,v26,v27,v28,v29,v30,v31
149133
)) {}
134+
// Repeat 16 values as many times as necessary (usually for lookup tables)
135+
really_inline static simd8<int8_t> repeat_16(
136+
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
137+
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
138+
) {
139+
return simd8<int8_t>(
140+
v0, v1, v2, v3, v4, v5, v6, v7,
141+
v8, v9, v10,v11,v12,v13,v14,v15,
142+
v0, v1, v2, v3, v4, v5, v6, v7,
143+
v8, v9, v10,v11,v12,v13,v14,v15
144+
);
145+
}
150146

151147
// Order-sensitive comparisons
152148
really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
@@ -175,6 +171,18 @@ namespace simdjson::haswell::simd {
175171
v16,v17,v18,v19,v20,v21,v22,v23,
176172
v24,v25,v26,v27,v28,v29,v30,v31
177173
)) {}
174+
// Repeat 16 values as many times as necessary (usually for lookup tables)
175+
really_inline static simd8<uint8_t> repeat_16(
176+
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
177+
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
178+
) {
179+
return simd8<uint8_t>(
180+
v0, v1, v2, v3, v4, v5, v6, v7,
181+
v8, v9, v10,v11,v12,v13,v14,v15,
182+
v0, v1, v2, v3, v4, v5, v6, v7,
183+
v8, v9, v10,v11,v12,v13,v14,v15
184+
);
185+
}
178186

179187
// Saturated math
180188
really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }

src/haswell/stage1_find_marks.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,18 @@ really_inline void find_whitespace_and_operators(
1818
const simd::simd8x64<uint8_t> in,
1919
uint64_t &whitespace, uint64_t &op) {
2020

21+
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
22+
// we can't use the generic lookup_16.
23+
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
24+
auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
25+
2126
whitespace = in.map([&](simd8<uint8_t> _in) {
22-
return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
27+
return _in == simd8<uint8_t>(_mm256_shuffle_epi8(whitespace_table, _in));
2328
}).to_bitmask();
2429

2530
op = in.map([&](simd8<uint8_t> _in) {
26-
return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
31+
// | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
32+
return (_in | 32) == simd8<uint8_t>(_mm256_shuffle_epi8(op_table, _in-','));
2733
}).to_bitmask();
2834
}
2935

src/westmere/simd.h

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -91,36 +91,23 @@ namespace simdjson::westmere::simd {
9191
really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *this; }
9292
really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *this; }
9393

94-
// Perform a lookup of the lower 4 bits
94+
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
9595
template<typename L>
96-
really_inline simd8<L> lookup_lower_4_bits(
97-
L replace0, L replace1, L replace2, L replace3,
98-
L replace4, L replace5, L replace6, L replace7,
99-
L replace8, L replace9, L replace10, L replace11,
100-
L replace12, L replace13, L replace14, L replace15) const {
101-
102-
simd8<L> lookup_table(
103-
replace0, replace1, replace2, replace3,
104-
replace4, replace5, replace6, replace7,
105-
replace8, replace9, replace10, replace11,
106-
replace12, replace13, replace14, replace15
107-
);
96+
really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
10897
return _mm_shuffle_epi8(lookup_table, *this);
10998
}
110-
111-
// Perform a lookup assuming the value is between 0 and 16
11299
template<typename L>
113100
really_inline simd8<L> lookup_16(
114101
L replace0, L replace1, L replace2, L replace3,
115102
L replace4, L replace5, L replace6, L replace7,
116103
L replace8, L replace9, L replace10, L replace11,
117104
L replace12, L replace13, L replace14, L replace15) const {
118-
return lookup_lower_4_bits(
105+
return lookup_16(simd8<L>::repeat_16(
119106
replace0, replace1, replace2, replace3,
120107
replace4, replace5, replace6, replace7,
121108
replace8, replace9, replace10, replace11,
122109
replace12, replace13, replace14, replace15
123-
);
110+
));
124111
}
125112
};
126113

@@ -141,6 +128,16 @@ namespace simdjson::westmere::simd {
141128
v0, v1, v2, v3, v4, v5, v6, v7,
142129
v8, v9, v10,v11,v12,v13,v14,v15
143130
)) {}
131+
// Repeat 16 values as many times as necessary (usually for lookup tables)
132+
really_inline static simd8<int8_t> repeat_16(
133+
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
134+
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
135+
) {
136+
return simd8<int8_t>(
137+
v0, v1, v2, v3, v4, v5, v6, v7,
138+
v8, v9, v10,v11,v12,v13,v14,v15
139+
);
140+
}
144141

145142
// Order-sensitive comparisons
146143
really_inline simd8<int8_t> max(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
@@ -165,6 +162,16 @@ namespace simdjson::westmere::simd {
165162
v0, v1, v2, v3, v4, v5, v6, v7,
166163
v8, v9, v10,v11,v12,v13,v14,v15
167164
)) {}
165+
// Repeat 16 values as many times as necessary (usually for lookup tables)
166+
really_inline static simd8<uint8_t> repeat_16(
167+
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
168+
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
169+
) {
170+
return simd8<uint8_t>(
171+
v0, v1, v2, v3, v4, v5, v6, v7,
172+
v8, v9, v10,v11,v12,v13,v14,v15
173+
);
174+
}
168175

169176
// Saturated math
170177
really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }

src/westmere/stage1_find_marks.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,18 @@ really_inline void find_whitespace_and_operators(
1818
const simd8x64<uint8_t> in,
1919
uint64_t &whitespace, uint64_t &op) {
2020

21+
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
22+
// we can't use the generic lookup_16.
23+
auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
24+
auto op_table = simd8<uint8_t>::repeat_16(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
25+
2126
whitespace = in.map([&](simd8<uint8_t> _in) {
22-
return _in == _in.lookup_lower_4_bits<uint8_t>(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
27+
return _in == simd8<uint8_t>(_mm_shuffle_epi8(whitespace_table, _in));
2328
}).to_bitmask();
2429

2530
op = in.map([&](simd8<uint8_t> _in) {
26-
return (_in | 32) == (_in+0xd4u).lookup_lower_4_bits<uint8_t>(',', '}', 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{');
31+
// | 32 handles the fact that { } and [ ] are exactly 32 bytes apart
32+
return (_in | 32) == simd8<uint8_t>(_mm_shuffle_epi8(op_table, _in-','));
2733
}).to_bitmask();
2834
}
2935

0 commit comments

Comments
 (0)