Skip to content

Commit f2e9040

Browse files
committed
Simplify flatten_bits
- Add directly to base instead of storing variable - Don't modify base_ptr after beginning of function - Eliminate base variable and increment base_ptr instead
1 parent a65ecd9 commit f2e9040

File tree

3 files changed

+68
-74
lines changed

3 files changed

+68
-74
lines changed

src/generic/stage1_find_marks.h

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ really_inline uint64_t finalize_structurals(
117117

118118
// Find structural bits in a 64-byte chunk.
119119
really_inline void find_structural_bits_64(
120-
const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base,
120+
const uint8_t *buf, size_t idx, uint32_t *&base_ptr,
121121
uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
122122
uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
123123
uint64_t &error_mask,
@@ -137,7 +137,7 @@ really_inline void find_structural_bits_64(
137137
/* take the previous iterations structural bits, not our current
138138
* iteration,
139139
* and flatten */
140-
flatten_bits(base_ptr, base, idx, structurals);
140+
flatten_bits(base_ptr, idx, structurals);
141141

142142
uint64_t whitespace;
143143
find_whitespace_and_structurals(in, whitespace, structurals);
@@ -156,7 +156,6 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
156156
return simdjson::CAPACITY;
157157
}
158158
uint32_t *base_ptr = pj.structural_indexes;
159-
uint32_t base = 0;
160159
utf8_checker<ARCHITECTURE> utf8_state;
161160

162161
/* we have padded the input out to 64 byte multiple with the remainder
@@ -190,7 +189,7 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
190189
code points < 0x20) */
191190

192191
for (; idx < lenminus64; idx += 64) {
193-
find_structural_bits_64(&buf[idx], idx, base_ptr, base,
192+
find_structural_bits_64(&buf[idx], idx, base_ptr,
194193
prev_iter_ends_odd_backslash,
195194
prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
196195
structurals, error_mask, utf8_state);
@@ -202,7 +201,7 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
202201
uint8_t tmp_buf[64];
203202
memset(tmp_buf, 0x20, 64);
204203
memcpy(tmp_buf, buf + idx, len - idx);
205-
find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base,
204+
find_structural_bits_64(&tmp_buf[0], idx, base_ptr,
206205
prev_iter_ends_odd_backslash,
207206
prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
208207
structurals, error_mask, utf8_state);
@@ -216,24 +215,24 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
216215

217216
/* finally, flatten out the remaining structurals from the last iteration
218217
*/
219-
flatten_bits(base_ptr, base, idx, structurals);
218+
flatten_bits(base_ptr, idx, structurals);
220219

221-
pj.n_structural_indexes = base;
220+
pj.n_structural_indexes = base_ptr - pj.structural_indexes;
222221
/* a valid JSON file cannot have zero structural indexes - we should have
223222
* found something */
224223
if (pj.n_structural_indexes == 0u) {
225224
return simdjson::EMPTY;
226225
}
227-
if (base_ptr[pj.n_structural_indexes - 1] > len) {
226+
if (pj.structural_indexes[pj.n_structural_indexes - 1] > len) {
228227
return simdjson::UNEXPECTED_ERROR;
229228
}
230-
if (len != base_ptr[pj.n_structural_indexes - 1]) {
229+
if (len != pj.structural_indexes[pj.n_structural_indexes - 1]) {
231230
/* the string might not be NULL terminated, but we add a virtual NULL
232231
* ending character. */
233-
base_ptr[pj.n_structural_indexes++] = len;
232+
pj.structural_indexes[pj.n_structural_indexes++] = len;
234233
}
235234
/* make it safe to dereference one beyond this array */
236-
base_ptr[pj.n_structural_indexes] = 0;
235+
pj.structural_indexes[pj.n_structural_indexes] = 0;
237236
if (error_mask) {
238237
return simdjson::UNESCAPED_CHARS;
239238
}

src/generic/stage1_find_marks_flatten.h

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,14 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx
2626
// base_ptr[base] incrementing base as we go
2727
// will potentially store extra values beyond end of valid bits, so base_ptr
2828
// needs to be large enough to handle this
29-
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
29+
really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
3030
// In some instances, the next branch is expensive because it is mispredicted.
3131
// Unfortunately, in other cases,
3232
// it helps tremendously.
3333
if (bits == 0)
3434
return;
3535
uint32_t cnt = hamming(bits);
36-
uint32_t next_base = base + cnt;
3736
idx -= 64;
38-
base_ptr += base;
3937
{
4038
base_ptr[0] = idx + trailing_zeroes(bits);
4139
bits = bits & (bits - 1);
@@ -53,37 +51,36 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx
5351
bits = bits & (bits - 1);
5452
base_ptr[7] = idx + trailing_zeroes(bits);
5553
bits = bits & (bits - 1);
56-
base_ptr += 8;
5754
}
5855
// We hope that the next branch is easily predicted.
5956
if (cnt > 8) {
60-
base_ptr[0] = idx + trailing_zeroes(bits);
57+
base_ptr[8] = idx + trailing_zeroes(bits);
6158
bits = bits & (bits - 1);
62-
base_ptr[1] = idx + trailing_zeroes(bits);
59+
base_ptr[9] = idx + trailing_zeroes(bits);
6360
bits = bits & (bits - 1);
64-
base_ptr[2] = idx + trailing_zeroes(bits);
61+
base_ptr[10] = idx + trailing_zeroes(bits);
6562
bits = bits & (bits - 1);
66-
base_ptr[3] = idx + trailing_zeroes(bits);
63+
base_ptr[11] = idx + trailing_zeroes(bits);
6764
bits = bits & (bits - 1);
68-
base_ptr[4] = idx + trailing_zeroes(bits);
65+
base_ptr[12] = idx + trailing_zeroes(bits);
6966
bits = bits & (bits - 1);
70-
base_ptr[5] = idx + trailing_zeroes(bits);
67+
base_ptr[13] = idx + trailing_zeroes(bits);
7168
bits = bits & (bits - 1);
72-
base_ptr[6] = idx + trailing_zeroes(bits);
69+
base_ptr[14] = idx + trailing_zeroes(bits);
7370
bits = bits & (bits - 1);
74-
base_ptr[7] = idx + trailing_zeroes(bits);
71+
base_ptr[15] = idx + trailing_zeroes(bits);
7572
bits = bits & (bits - 1);
76-
base_ptr += 8;
7773
}
7874
if (cnt > 16) { // unluckly: we rarely get here
7975
// since it means having one structural or pseudo-structral element
8076
// every 4 characters (possible with inputs like "","","",...).
77+
uint32_t i = 16;
8178
do {
82-
base_ptr[0] = idx + trailing_zeroes(bits);
79+
base_ptr[i] = idx + trailing_zeroes(bits);
8380
bits = bits & (bits - 1);
84-
base_ptr++;
85-
} while (bits != 0);
81+
i++;
82+
} while (i < cnt);
8683
}
87-
base = next_base;
84+
base_ptr += cnt;
8885
}
8986
#endif // SIMDJSON_NAIVE_FLATTEN

src/haswell/stage1_find_marks.h

Lines changed: 44 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -89,65 +89,63 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
8989
// base_ptr[base] incrementing base as we go
9090
// will potentially store extra values beyond end of valid bits, so base_ptr
9191
// needs to be large enough to handle this
92-
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, uint32_t idx, uint64_t bits) {
92+
really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits) {
9393
// In some instances, the next branch is expensive because it is mispredicted.
9494
// Unfortunately, in other cases,
9595
// it helps tremendously.
9696
if (bits == 0)
9797
return;
9898
uint32_t cnt = _mm_popcnt_u64(bits);
99-
uint32_t next_base = base + cnt;
10099
idx -= 64;
101-
base_ptr += base;
102100
{
103-
base_ptr[0] = idx + trailing_zeroes(bits);
104-
bits = _blsr_u64(bits);
105-
base_ptr[1] = idx + trailing_zeroes(bits);
106-
bits = _blsr_u64(bits);
107-
base_ptr[2] = idx + trailing_zeroes(bits);
108-
bits = _blsr_u64(bits);
109-
base_ptr[3] = idx + trailing_zeroes(bits);
110-
bits = _blsr_u64(bits);
111-
base_ptr[4] = idx + trailing_zeroes(bits);
112-
bits = _blsr_u64(bits);
113-
base_ptr[5] = idx + trailing_zeroes(bits);
114-
bits = _blsr_u64(bits);
115-
base_ptr[6] = idx + trailing_zeroes(bits);
116-
bits = _blsr_u64(bits);
117-
base_ptr[7] = idx + trailing_zeroes(bits);
118-
bits = _blsr_u64(bits);
119-
base_ptr += 8;
101+
base_ptr[0] = idx + trailing_zeroes(bits);
102+
bits = _blsr_u64(bits);
103+
base_ptr[1] = idx + trailing_zeroes(bits);
104+
bits = _blsr_u64(bits);
105+
base_ptr[2] = idx + trailing_zeroes(bits);
106+
bits = _blsr_u64(bits);
107+
base_ptr[3] = idx + trailing_zeroes(bits);
108+
bits = _blsr_u64(bits);
109+
base_ptr[4] = idx + trailing_zeroes(bits);
110+
bits = _blsr_u64(bits);
111+
base_ptr[5] = idx + trailing_zeroes(bits);
112+
bits = _blsr_u64(bits);
113+
base_ptr[6] = idx + trailing_zeroes(bits);
114+
bits = _blsr_u64(bits);
115+
base_ptr[7] = idx + trailing_zeroes(bits);
116+
bits = _blsr_u64(bits);
120117
}
121118
// We hope that the next branch is easily predicted.
122119
if (cnt > 8) {
123-
base_ptr[0] = idx + trailing_zeroes(bits);
124-
bits = _blsr_u64(bits);
125-
base_ptr[1] = idx + trailing_zeroes(bits);
126-
bits = _blsr_u64(bits);
127-
base_ptr[2] = idx + trailing_zeroes(bits);
128-
bits = _blsr_u64(bits);
129-
base_ptr[3] = idx + trailing_zeroes(bits);
130-
bits = _blsr_u64(bits);
131-
base_ptr[4] = idx + trailing_zeroes(bits);
132-
bits = _blsr_u64(bits);
133-
base_ptr[5] = idx + trailing_zeroes(bits);
134-
bits = _blsr_u64(bits);
135-
base_ptr[6] = idx + trailing_zeroes(bits);
136-
bits = _blsr_u64(bits);
137-
base_ptr[7] = idx + trailing_zeroes(bits);
138-
bits = _blsr_u64(bits);
139-
base_ptr += 8;
120+
base_ptr[8] = idx + trailing_zeroes(bits);
121+
bits = _blsr_u64(bits);
122+
base_ptr[9] = idx + trailing_zeroes(bits);
123+
bits = _blsr_u64(bits);
124+
base_ptr[10] = idx + trailing_zeroes(bits);
125+
bits = _blsr_u64(bits);
126+
base_ptr[11] = idx + trailing_zeroes(bits);
127+
bits = _blsr_u64(bits);
128+
base_ptr[12] = idx + trailing_zeroes(bits);
129+
bits = _blsr_u64(bits);
130+
base_ptr[13] = idx + trailing_zeroes(bits);
131+
bits = _blsr_u64(bits);
132+
base_ptr[14] = idx + trailing_zeroes(bits);
133+
bits = _blsr_u64(bits);
134+
base_ptr[15] = idx + trailing_zeroes(bits);
135+
bits = _blsr_u64(bits);
140136
}
141-
if (cnt > 16) { // unluckly: we rarely get here
142-
// since it means having one structural or pseudo-structral element
143-
// every 4 characters (possible with inputs like "","","",...).
144-
do {
145-
base_ptr[0] = idx + trailing_zeroes(bits);
146-
bits = _blsr_u64(bits);
147-
base_ptr++;
148-
} while (bits != 0);
137+
if (cnt > 16) {
138+
// unluckly: this loop will rarely ever trigger
139+
// since it means having one structural or pseudo-structral element
140+
// every 4 characters (possible with inputs like "","","",...).
141+
uint32_t i = 16;
142+
do {
143+
base_ptr[i] = idx + trailing_zeroes(bits);
144+
bits = _blsr_u64(bits);
145+
i++;
146+
} while (i < cnt);
149147
}
150-
base = next_base;
148+
base_ptr += cnt;
151149
}
152150

153151
#include "generic/stage1_find_marks.h"

0 commit comments

Comments
 (0)