Skip to content

Commit 90cc141

Browse files
authored
Merge pull request simdjson#1018 from simdjson/jkeiser/simplify-integer-parse
Remove some branches from number parsing
2 parents d13ce67 + 6797a6a commit 90cc141

File tree

6 files changed

+101
-127
lines changed

6 files changed

+101
-127
lines changed

src/arm64/numberparsing.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ namespace arm64 {
2121

2222
// we don't have SSE, so let us use a scalar function
2323
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
24-
static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
24+
static really_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
2525
uint64_t val;
2626
memcpy(&val, chars, sizeof(uint64_t));
2727
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;

src/fallback/numberparsing.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,16 @@ void found_float(double result, const uint8_t *buf);
1616

1717
namespace simdjson {
1818
namespace fallback {
19-
static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
19+
static really_inline uint32_t parse_eight_digits_unrolled(const char *chars) {
2020
uint32_t result = 0;
2121
for (int i=0;i<8;i++) {
2222
result = result*10 + (chars[i] - '0');
2323
}
2424
return result;
2525
}
26+
static really_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
27+
return parse_eight_digits_unrolled((const char *)chars);
28+
}
2629

2730
#define SWAR_NUMBER_PARSING
2831

src/generic/stage2/numberparsing.h

Lines changed: 84 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -199,9 +199,9 @@ really_inline double compute_float_64(int64_t power, uint64_t i, bool negative,
199199
return d;
200200
}
201201

202-
static bool parse_float_strtod(const char *ptr, double *outDouble) {
202+
static bool parse_float_strtod(const uint8_t *ptr, double *outDouble) {
203203
char *endptr;
204-
*outDouble = strtod(ptr, &endptr);
204+
*outDouble = strtod((const char *)ptr, &endptr);
205205
// Some libraries will set errno = ERANGE when the value is subnormal,
206206
// yet we may want to be able to parse subnormal values.
207207
// However, we do not want to tolerate NAN or infinite values.
@@ -222,22 +222,16 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
222222
// a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json)
223223
// will flat out throw an exception.
224224
//
225-
if ((endptr == ptr) || (!std::isfinite(*outDouble))) {
225+
if ((endptr == (const char *)ptr) || (!std::isfinite(*outDouble))) {
226226
return false;
227227
}
228228
return true;
229229
}
230230

231-
really_inline bool is_integer(char c) {
232-
return (c >= '0' && c <= '9');
233-
// this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
234-
}
235-
236-
237231
// check quickly whether the next 8 chars are made of digits
238232
// at a glance, it looks better than Mula's
239233
// http://0x80.pl/articles/swar-digits-validate.html
240-
really_inline bool is_made_of_eight_digits_fast(const char *chars) {
234+
really_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
241235
uint64_t val;
242236
// this can read up to 7 bytes beyond the buffer size, but we require
243237
// SIMDJSON_PADDING of padding
@@ -253,28 +247,34 @@ really_inline bool is_made_of_eight_digits_fast(const char *chars) {
253247
}
254248

255249
template<typename W>
256-
bool slow_float_parsing(UNUSED const char * src, W writer) {
250+
bool slow_float_parsing(UNUSED const uint8_t * src, W writer) {
257251
double d;
258252
if (parse_float_strtod(src, &d)) {
259-
WRITE_DOUBLE(d, (const uint8_t *)src, writer);
253+
WRITE_DOUBLE(d, src, writer);
260254
return true;
261255
}
262-
return INVALID_NUMBER((const uint8_t *)src);
256+
return INVALID_NUMBER(src);
257+
}
258+
259+
template<typename I>
260+
NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
261+
really_inline bool parse_digit(const uint8_t c, I &i) {
262+
const uint8_t digit = static_cast<uint8_t>(c - '0');
263+
if (digit > 9) {
264+
return false;
265+
}
266+
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
267+
i = 10 * i + digit; // might overflow, we will handle the overflow later
268+
return true;
263269
}
264270

265-
really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) {
271+
really_inline bool parse_decimal(UNUSED const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
266272
// we continue with the fiction that we have an integer. If the
267273
// floating point number is representable as x * 10^z for some integer
268274
// z that fits in 53 bits, then we will be able to convert back the
269275
// the integer into a float in a lossless manner.
270-
const char *const first_after_period = p;
271-
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
272-
273-
unsigned char digit = static_cast<unsigned char>(*p - '0');
274-
++p;
275-
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
276-
// cheaper than arbitrary mult.
277-
// we will handle the overflow later
276+
const uint8_t *const first_after_period = p;
277+
278278
#ifdef SWAR_NUMBER_PARSING
279279
// this helps if we have lots of decimals!
280280
// this turns out to be frequent enough.
@@ -283,61 +283,51 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
283283
p += 8;
284284
}
285285
#endif
286-
while (is_integer(*p)) {
287-
digit = static_cast<unsigned char>(*p - '0');
288-
++p;
289-
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
290-
// because we have parse_highprecision_float later.
291-
}
286+
// Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
287+
if (parse_digit(*p, i)) { ++p; }
288+
while (parse_digit(*p, i)) { p++; }
292289
exponent = first_after_period - p;
290+
// Decimal without digits (123.) is illegal
291+
if (exponent == 0) {
292+
return INVALID_NUMBER(src);
293+
}
293294
return true;
294295
}
295296

296-
really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) {
297-
bool neg_exp = false;
298-
if ('-' == *p) {
299-
neg_exp = true;
300-
++p;
301-
} else if ('+' == *p) {
302-
++p;
303-
}
297+
really_inline bool parse_exponent(UNUSED const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
298+
// Exp Sign: -123.456e[-]78
299+
bool neg_exp = ('-' == *p);
300+
if (neg_exp || '+' == *p) { p++; } // Skip + as well
304301

305-
// e[+-] must be followed by a number
306-
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
307-
unsigned char digit = static_cast<unsigned char>(*p - '0');
308-
int64_t exp_number = digit;
309-
p++;
310-
if (is_integer(*p)) {
311-
digit = static_cast<unsigned char>(*p - '0');
312-
exp_number = 10 * exp_number + digit;
313-
++p;
314-
}
315-
if (is_integer(*p)) {
316-
digit = static_cast<unsigned char>(*p - '0');
317-
exp_number = 10 * exp_number + digit;
318-
++p;
319-
}
320-
while (is_integer(*p)) {
321-
// we need to check for overflows; we refuse to parse this
322-
if (exp_number > 0x100000000) { return INVALID_NUMBER(src); }
323-
digit = static_cast<unsigned char>(*p - '0');
324-
exp_number = 10 * exp_number + digit;
325-
++p;
326-
}
302+
// Exponent: -123.456e-[78]
303+
auto start_exp = p;
304+
int64_t exp_number = 0;
305+
while (parse_digit(*p, exp_number)) { ++p; }
327306
exponent += (neg_exp ? -exp_number : exp_number);
307+
308+
// If there were no digits, it's an error.
309+
// If there were more than 18 digits, we may have overflowed the integer.
310+
if (unlikely(p == start_exp || p > start_exp+18)) {
311+
// Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow
312+
while (*start_exp == '0') { start_exp++; }
313+
// 19 digits could overflow int64_t and is kind of absurd anyway. We don't
314+
// support exponents smaller than -9,999,999,999,999,999,999 and bigger
315+
// than 9,999,999,999,999,999,999.
316+
if (p == start_exp || p > start_exp+18) { return INVALID_NUMBER(src); }
317+
}
328318
return true;
329319
}
330320

331321
template<typename W>
332-
really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const char * start_digits, int digit_count, int64_t exponent, W &writer) {
322+
really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, int digit_count, int64_t exponent, W &writer) {
333323
// If we frequently had to deal with long strings of digits,
334324
// we could extend our code by using a 128-bit integer instead
335325
// of a 64-bit integer. However, this is uncommon in practice.
336326
// digit count is off by 1 because of the decimal (assuming there was one).
337327
if (unlikely((digit_count-1 >= 19))) { // this is uncommon
338328
// It is possible that the integer had an overflow.
339329
// We have to handle the case where we have 0.0000somenumber.
340-
const char *start = start_digits;
330+
const uint8_t *start = start_digits;
341331
while ((*start == '0') || (*start == '.')) {
342332
start++;
343333
}
@@ -351,7 +341,7 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
351341
// 10000000000000000000000000000000000000000000e+308
352342
// 3.1415926535897932384626433832795028841971693993751
353343
//
354-
bool success = slow_float_parsing((const char *) src, writer);
344+
bool success = slow_float_parsing(src, writer);
355345
// The number was already written, but we made a copy of the writer
356346
// when we passed it to the parse_large_integer() function, so
357347
writer.skip_double();
@@ -364,7 +354,7 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
364354
if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
365355
// this is almost never going to get called!!!
366356
// we start anew, going slowly!!!
367-
bool success = slow_float_parsing((const char *) src, writer);
357+
bool success = slow_float_parsing(src, writer);
368358
// The number was already written, but we made a copy of the writer when we passed it to the
369359
// slow_float_parsing() function, so we have to skip those tape spots now that we've returned
370360
writer.skip_double();
@@ -374,12 +364,23 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
374364
double d = compute_float_64(exponent, i, negative, &success);
375365
if (!success) {
376366
// we are almost never going to get here.
377-
if (!parse_float_strtod((const char *)src, &d)) { return INVALID_NUMBER(src); }
367+
if (!parse_float_strtod(src, &d)) { return INVALID_NUMBER(src); }
378368
}
379369
WRITE_DOUBLE(d, src, writer);
380370
return true;
381371
}
382372

373+
// for performance analysis, it is sometimes useful to skip parsing
374+
#ifdef SIMDJSON_SKIPNUMBERPARSING
375+
376+
template<typename W>
377+
really_inline bool parse_number(const uint8_t *const, W &writer) {
378+
writer.append_s64(0); // always write zero
379+
return true; // always succeeds
380+
}
381+
382+
#else
383+
383384
// parse the number at src
384385
// define JSON_TEST_NUMBERS for unit testing
385386
//
@@ -390,48 +391,25 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
390391
//
391392
// Our objective is accurate parsing (ULP of 0) at high speed.
392393
template<typename W>
393-
really_inline bool parse_number(UNUSED const uint8_t *const src,
394-
UNUSED bool found_minus,
395-
W &writer) {
396-
#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
397-
// useful to skip parsing
398-
writer.append_s64(0); // always write zero
399-
return true; // always succeeds
400-
#else
401-
const char *p = reinterpret_cast<const char *>(src);
402-
bool negative = false;
403-
if (found_minus) {
404-
++p;
405-
negative = true;
406-
// a negative sign must be followed by an integer
407-
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
408-
}
409-
const char *const start_digits = p;
394+
really_inline bool parse_number(const uint8_t *const src, W &writer) {
410395

411-
uint64_t i; // an unsigned int avoids signed overflows (which are bad)
412-
if (*p == '0') {
413-
++p;
414-
if (is_integer(*p)) { return INVALID_NUMBER(src); } // 0 cannot be followed by an integer
415-
i = 0;
416-
} else {
417-
// NOTE: This is a redundant check--either we're negative, in which case we checked whether this
418-
// is a digit above, or the caller already determined we start with a digit. But removing this
419-
// check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448
420-
// Please do try yourself, or think of ways to explain it--we'd love to understand :)
421-
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // must start with an integer
422-
unsigned char digit = static_cast<unsigned char>(*p - '0');
423-
i = digit;
424-
p++;
425-
// the is_made_of_eight_digits_fast routine is unlikely to help here because
426-
// we rarely see large integer parts like 123456789
427-
while (is_integer(*p)) {
428-
digit = static_cast<unsigned char>(*p - '0');
429-
// a multiplication by 10 is cheaper than an arbitrary integer
430-
// multiplication
431-
i = 10 * i + digit; // might overflow, we will handle the overflow later
432-
++p;
433-
}
434-
}
396+
//
397+
// Check for minus sign
398+
//
399+
bool negative = (*src == '-');
400+
const uint8_t *p = src + negative;
401+
402+
//
403+
// Parse the integer part.
404+
//
405+
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
406+
const uint8_t *const start_digits = p;
407+
uint64_t i = 0;
408+
while (parse_digit(*p, i)) { p++; }
409+
410+
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
411+
int digit_count = int(p - start_digits);
412+
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
435413

436414
//
437415
// Handle floats if there is a . or e (or both)
@@ -442,8 +420,8 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
442420
is_float = true;
443421
++p;
444422
if (!parse_decimal(src, p, i, exponent)) { return false; }
423+
digit_count = int(p - start_digits); // used later to guard against overflows
445424
}
446-
int digit_count = int(p - start_digits); // used later to guard against overflows
447425
if (('e' == *p) || ('E' == *p)) {
448426
is_float = true;
449427
++p;
@@ -492,9 +470,9 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
492470
WRITE_INTEGER(negative ? 0 - i : i, src, writer);
493471
}
494472
return is_structural_or_whitespace(*p);
473+
}
495474

496475
#endif // SIMDJSON_SKIPNUMBERPARSING
497-
}
498476

499477
} // namespace numberparsing
500478
} // namespace stage2

src/generic/stage2/structural_parser.h

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -159,17 +159,17 @@ struct structural_parser : structural_iterator {
159159
return false;
160160
}
161161

162-
WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
162+
WARN_UNUSED really_inline bool parse_number(const uint8_t *src) {
163163
log_value("number");
164-
bool succeeded = numberparsing::parse_number(src, found_minus, tape);
164+
bool succeeded = numberparsing::parse_number(src, tape);
165165
if (!succeeded) { log_error("Invalid number"); }
166166
return !succeeded;
167167
}
168-
WARN_UNUSED really_inline bool parse_number(bool found_minus) {
169-
return parse_number(current(), found_minus);
168+
WARN_UNUSED really_inline bool parse_number() {
169+
return parse_number(current());
170170
}
171171

172-
really_inline bool parse_number_with_space_terminated_copy(const bool is_negative) {
172+
really_inline bool parse_number_with_space_terminated_copy() {
173173
/**
174174
* We need to make a copy to make sure that the string is space terminated.
175175
* This is not about padding the input, which should already padded up
@@ -190,7 +190,7 @@ struct structural_parser : structural_iterator {
190190
memcpy(copy, buf, parser.len);
191191
memset(copy + parser.len, ' ', SIMDJSON_PADDING);
192192
size_t idx = *current_structural;
193-
bool result = parse_number(&copy[idx], is_negative); // parse_number does not throw
193+
bool result = parse_number(&copy[idx]); // parse_number does not throw
194194
free(copy);
195195
return result;
196196
}
@@ -214,12 +214,10 @@ struct structural_parser : structural_iterator {
214214
FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
215215
tape.append(0, internal::tape_type::NULL_VALUE);
216216
return continue_state;
217+
case '-':
217218
case '0': case '1': case '2': case '3': case '4':
218219
case '5': case '6': case '7': case '8': case '9':
219-
FAIL_IF( parse_number(false) );
220-
return continue_state;
221-
case '-':
222-
FAIL_IF( parse_number(true) );
220+
FAIL_IF( parse_number() );
223221
return continue_state;
224222
case '{':
225223
FAIL_IF( start_object(continue_state) );
@@ -375,18 +373,13 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
375373
FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
376374
parser.tape.append(0, internal::tape_type::NULL_VALUE);
377375
goto finish;
376+
case '-':
378377
case '0': case '1': case '2': case '3': case '4':
379378
case '5': case '6': case '7': case '8': case '9':
380379
// Next line used to be an interesting functional programming exercise with
381380
// a lambda that gets passed to another function via a closure. This would confuse the
382381
// clangcl compiler under Visual Studio 2019 (recent release).
383-
{ if(parser.parse_number_with_space_terminated_copy(false)) { goto error; }}
384-
goto finish;
385-
case '-':
386-
// Next line used to be an interesting functional programming exercise with
387-
// a lambda that gets passed to another function via a closure. This would confuse the
388-
// clangcl compiler under Visual Studio 2019 (recent release).
389-
{ if(parser.parse_number_with_space_terminated_copy(true)) { goto error; }}
382+
FAIL_IF(parser.parse_number_with_space_terminated_copy());
390383
goto finish;
391384
default:
392385
parser.log_error("Document starts with a non-value character");

0 commit comments

Comments
 (0)