Skip to content

Commit 156fe7b

Browse files
committed
De-unroll the flatten_bits loops
1 parent f2e9040 commit 156fe7b

File tree

2 files changed

+31
-72
lines changed

2 files changed

+31
-72
lines changed

src/generic/stage1_find_marks_flatten.h

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -34,46 +34,26 @@ really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits
3434
return;
3535
uint32_t cnt = hamming(bits);
3636
idx -= 64;
37-
{
38-
base_ptr[0] = idx + trailing_zeroes(bits);
39-
bits = bits & (bits - 1);
40-
base_ptr[1] = idx + trailing_zeroes(bits);
41-
bits = bits & (bits - 1);
42-
base_ptr[2] = idx + trailing_zeroes(bits);
43-
bits = bits & (bits - 1);
44-
base_ptr[3] = idx + trailing_zeroes(bits);
45-
bits = bits & (bits - 1);
46-
base_ptr[4] = idx + trailing_zeroes(bits);
47-
bits = bits & (bits - 1);
48-
base_ptr[5] = idx + trailing_zeroes(bits);
49-
bits = bits & (bits - 1);
50-
base_ptr[6] = idx + trailing_zeroes(bits);
51-
bits = bits & (bits - 1);
52-
base_ptr[7] = idx + trailing_zeroes(bits);
37+
38+
// Do the first 8 all together
39+
for (int i=0; i<8; i++) {
40+
base_ptr[i] = idx + trailing_zeroes(bits);
5341
bits = bits & (bits - 1);
5442
}
55-
// We hope that the next branch is easily predicted.
43+
44+
// Do the next 8 all together (we hope in most cases it won't happen at all
45+
// and the branch is easily predicted).
5646
if (cnt > 8) {
57-
base_ptr[8] = idx + trailing_zeroes(bits);
58-
bits = bits & (bits - 1);
59-
base_ptr[9] = idx + trailing_zeroes(bits);
60-
bits = bits & (bits - 1);
61-
base_ptr[10] = idx + trailing_zeroes(bits);
62-
bits = bits & (bits - 1);
63-
base_ptr[11] = idx + trailing_zeroes(bits);
64-
bits = bits & (bits - 1);
65-
base_ptr[12] = idx + trailing_zeroes(bits);
66-
bits = bits & (bits - 1);
67-
base_ptr[13] = idx + trailing_zeroes(bits);
68-
bits = bits & (bits - 1);
69-
base_ptr[14] = idx + trailing_zeroes(bits);
70-
bits = bits & (bits - 1);
71-
base_ptr[15] = idx + trailing_zeroes(bits);
72-
bits = bits & (bits - 1);
47+
for (int i=8; i<16; i++) {
48+
base_ptr[i] = idx + trailing_zeroes(bits);
49+
bits = bits & (bits - 1);
50+
}
7351
}
74-
if (cnt > 16) { // unluckly: we rarely get here
75-
// since it means having one structural or pseudo-structral element
76-
// every 4 characters (possible with inputs like "","","",...).
52+
53+
// Most files don't have 16+ structurals per block, so we take several basically guaranteed
54+
// branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
55+
// or the start of a value ("abc" true 123) every 4 characters.
56+
if (cnt > 16) {
7757
uint32_t i = 16;
7858
do {
7959
base_ptr[i] = idx + trailing_zeroes(bits);

src/haswell/stage1_find_marks.h

Lines changed: 15 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -97,47 +97,26 @@ really_inline void flatten_bits(uint32_t *&base_ptr, uint32_t idx, uint64_t bits
9797
return;
9898
uint32_t cnt = _mm_popcnt_u64(bits);
9999
idx -= 64;
100-
{
101-
base_ptr[0] = idx + trailing_zeroes(bits);
102-
bits = _blsr_u64(bits);
103-
base_ptr[1] = idx + trailing_zeroes(bits);
104-
bits = _blsr_u64(bits);
105-
base_ptr[2] = idx + trailing_zeroes(bits);
106-
bits = _blsr_u64(bits);
107-
base_ptr[3] = idx + trailing_zeroes(bits);
108-
bits = _blsr_u64(bits);
109-
base_ptr[4] = idx + trailing_zeroes(bits);
110-
bits = _blsr_u64(bits);
111-
base_ptr[5] = idx + trailing_zeroes(bits);
112-
bits = _blsr_u64(bits);
113-
base_ptr[6] = idx + trailing_zeroes(bits);
114-
bits = _blsr_u64(bits);
115-
base_ptr[7] = idx + trailing_zeroes(bits);
100+
101+
// Do the first 8 all together
102+
for (int i=0; i<8; i++) {
103+
base_ptr[i] = idx + trailing_zeroes(bits);
116104
bits = _blsr_u64(bits);
117105
}
118-
// We hope that the next branch is easily predicted.
106+
107+
// Do the next 8 all together (we hope in most cases it won't happen at all
108+
// and the branch is easily predicted).
119109
if (cnt > 8) {
120-
base_ptr[8] = idx + trailing_zeroes(bits);
121-
bits = _blsr_u64(bits);
122-
base_ptr[9] = idx + trailing_zeroes(bits);
123-
bits = _blsr_u64(bits);
124-
base_ptr[10] = idx + trailing_zeroes(bits);
125-
bits = _blsr_u64(bits);
126-
base_ptr[11] = idx + trailing_zeroes(bits);
127-
bits = _blsr_u64(bits);
128-
base_ptr[12] = idx + trailing_zeroes(bits);
129-
bits = _blsr_u64(bits);
130-
base_ptr[13] = idx + trailing_zeroes(bits);
131-
bits = _blsr_u64(bits);
132-
base_ptr[14] = idx + trailing_zeroes(bits);
133-
bits = _blsr_u64(bits);
134-
base_ptr[15] = idx + trailing_zeroes(bits);
135-
bits = _blsr_u64(bits);
110+
for (int i=8; i<16; i++) {
111+
base_ptr[i] = idx + trailing_zeroes(bits);
112+
bits = _blsr_u64(bits);
113+
}
136114
}
115+
116+
// Most files don't have 16+ structurals per block, so we take several basically guaranteed
117+
// branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
118+
// or the start of a value ("abc" true 123) every four characters.
137119
if (cnt > 16) {
138-
// unluckly: this loop will rarely ever trigger
139-
// since it means having one structural or pseudo-structral element
140-
// every 4 characters (possible with inputs like "","","",...).
141120
uint32_t i = 16;
142121
do {
143122
base_ptr[i] = idx + trailing_zeroes(bits);

0 commit comments

Comments
 (0)