@@ -89,65 +89,63 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
89
89
// base_ptr[base] incrementing base as we go
90
90
// will potentially store extra values beyond end of valid bits, so base_ptr
91
91
// needs to be large enough to handle this
92
- really_inline void flatten_bits (uint32_t *base_ptr, uint32_t &base , uint32_t idx, uint64_t bits) {
92
+ really_inline void flatten_bits (uint32_t *&base_ptr , uint32_t idx, uint64_t bits) {
93
93
// In some instances, the next branch is expensive because it is mispredicted.
94
94
// Unfortunately, in other cases,
95
95
// it helps tremendously.
96
96
if (bits == 0 )
97
97
return ;
98
98
uint32_t cnt = _mm_popcnt_u64 (bits);
99
- uint32_t next_base = base + cnt;
100
99
idx -= 64 ;
101
- base_ptr += base;
102
100
{
103
- base_ptr[0 ] = idx + trailing_zeroes (bits);
104
- bits = _blsr_u64 (bits);
105
- base_ptr[1 ] = idx + trailing_zeroes (bits);
106
- bits = _blsr_u64 (bits);
107
- base_ptr[2 ] = idx + trailing_zeroes (bits);
108
- bits = _blsr_u64 (bits);
109
- base_ptr[3 ] = idx + trailing_zeroes (bits);
110
- bits = _blsr_u64 (bits);
111
- base_ptr[4 ] = idx + trailing_zeroes (bits);
112
- bits = _blsr_u64 (bits);
113
- base_ptr[5 ] = idx + trailing_zeroes (bits);
114
- bits = _blsr_u64 (bits);
115
- base_ptr[6 ] = idx + trailing_zeroes (bits);
116
- bits = _blsr_u64 (bits);
117
- base_ptr[7 ] = idx + trailing_zeroes (bits);
118
- bits = _blsr_u64 (bits);
119
- base_ptr += 8 ;
101
+ base_ptr[0 ] = idx + trailing_zeroes (bits);
102
+ bits = _blsr_u64 (bits);
103
+ base_ptr[1 ] = idx + trailing_zeroes (bits);
104
+ bits = _blsr_u64 (bits);
105
+ base_ptr[2 ] = idx + trailing_zeroes (bits);
106
+ bits = _blsr_u64 (bits);
107
+ base_ptr[3 ] = idx + trailing_zeroes (bits);
108
+ bits = _blsr_u64 (bits);
109
+ base_ptr[4 ] = idx + trailing_zeroes (bits);
110
+ bits = _blsr_u64 (bits);
111
+ base_ptr[5 ] = idx + trailing_zeroes (bits);
112
+ bits = _blsr_u64 (bits);
113
+ base_ptr[6 ] = idx + trailing_zeroes (bits);
114
+ bits = _blsr_u64 (bits);
115
+ base_ptr[7 ] = idx + trailing_zeroes (bits);
116
+ bits = _blsr_u64 (bits);
120
117
}
121
118
// We hope that the next branch is easily predicted.
122
119
if (cnt > 8 ) {
123
- base_ptr[0 ] = idx + trailing_zeroes (bits);
124
- bits = _blsr_u64 (bits);
125
- base_ptr[1 ] = idx + trailing_zeroes (bits);
126
- bits = _blsr_u64 (bits);
127
- base_ptr[2 ] = idx + trailing_zeroes (bits);
128
- bits = _blsr_u64 (bits);
129
- base_ptr[3 ] = idx + trailing_zeroes (bits);
130
- bits = _blsr_u64 (bits);
131
- base_ptr[4 ] = idx + trailing_zeroes (bits);
132
- bits = _blsr_u64 (bits);
133
- base_ptr[5 ] = idx + trailing_zeroes (bits);
134
- bits = _blsr_u64 (bits);
135
- base_ptr[6 ] = idx + trailing_zeroes (bits);
136
- bits = _blsr_u64 (bits);
137
- base_ptr[7 ] = idx + trailing_zeroes (bits);
138
- bits = _blsr_u64 (bits);
139
- base_ptr += 8 ;
120
+ base_ptr[8 ] = idx + trailing_zeroes (bits);
121
+ bits = _blsr_u64 (bits);
122
+ base_ptr[9 ] = idx + trailing_zeroes (bits);
123
+ bits = _blsr_u64 (bits);
124
+ base_ptr[10 ] = idx + trailing_zeroes (bits);
125
+ bits = _blsr_u64 (bits);
126
+ base_ptr[11 ] = idx + trailing_zeroes (bits);
127
+ bits = _blsr_u64 (bits);
128
+ base_ptr[12 ] = idx + trailing_zeroes (bits);
129
+ bits = _blsr_u64 (bits);
130
+ base_ptr[13 ] = idx + trailing_zeroes (bits);
131
+ bits = _blsr_u64 (bits);
132
+ base_ptr[14 ] = idx + trailing_zeroes (bits);
133
+ bits = _blsr_u64 (bits);
134
+ base_ptr[15 ] = idx + trailing_zeroes (bits);
135
+ bits = _blsr_u64 (bits);
140
136
}
141
- if (cnt > 16 ) { // unluckly: we rarely get here
142
- // since it means having one structural or pseudo-structral element
143
- // every 4 characters (possible with inputs like "","","",...).
144
- do {
145
- base_ptr[0 ] = idx + trailing_zeroes (bits);
146
- bits = _blsr_u64 (bits);
147
- base_ptr++;
148
- } while (bits != 0 );
137
+ if (cnt > 16 ) {
138
+ // unlucky: this loop will rarely ever trigger
139
+ // since it means having one structural or pseudo-structural element
140
+ // every 4 characters (possible with inputs like "","","",...).
141
+ uint32_t i = 16 ;
142
+ do {
143
+ base_ptr[i] = idx + trailing_zeroes (bits);
144
+ bits = _blsr_u64 (bits);
145
+ i++;
146
+ } while (i < cnt);
149
147
}
150
- base = next_base ;
148
+ base_ptr += cnt ;
151
149
}
152
150
153
151
#include " generic/stage1_find_marks.h"
0 commit comments