@@ -268,9 +268,9 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
268
268
// z that fits in 53 bits, then we will be able to convert back the
269
269
// integer into a float in a lossless manner.
270
270
const char *const first_after_period = p;
271
- if (!is_integer (*p)) { return INVALID_NUMBER (src); } // There must be at least one digit after the .
272
271
273
272
unsigned char digit = static_cast <unsigned char >(*p - ' 0' );
273
+ if (digit > 9 ) { return INVALID_NUMBER (src); } // There must be at least one digit after the .
274
274
++p;
275
275
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
276
276
// cheaper than arbitrary mult.
@@ -283,16 +283,36 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
283
283
p += 8 ;
284
284
}
285
285
#endif
286
- while ( is_integer (*p)) {
287
- digit = static_cast < unsigned char >(*p - ' 0 ' );
286
+ digit = static_cast < unsigned char >(*p - ' 0 ' );
287
+ while ( digit <= 9 ) {
288
288
++p;
289
289
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
290
290
// because we have parse_highprecision_float later.
291
+ digit = static_cast <unsigned char >(*p - ' 0' );
291
292
}
292
293
exponent = first_after_period - p;
293
294
return true ;
294
295
}
295
296
297
// Tries to consume one decimal digit and append it to a running integer.
//
//   c - the character to examine
//   i - the value accumulated so far; updated in place to 10 * i + digit
//       when c is a digit, left untouched otherwise
//
// Returns true if c is an ASCII digit ('0'..'9'), false otherwise.
//
// The accumulation itself is not overflow-checked here: callers are expected
// to detect or tolerate an overflowing value afterwards.
template <typename I>
really_inline bool parse_digit(const char c, I &i) {
  // Subtracting '0' and narrowing to unsigned char maps every non-digit to a
  // value above 9, so a single comparison is enough to validate c.
  const unsigned char digit = static_cast<unsigned char>(c - '0');
  if (digit > 9) { return false; }
  // a multiplication by 10 is cheaper than an arbitrary integer
  // multiplication
  i = 10 * i + digit; // might overflow, we will handle the overflow later
  return true;
}
309
// Reads the leading digit of a number and uses it to initialize i.
//
//   c - the character to examine
//   i - receives (c - '0') reduced to an unsigned char; it is written
//       unconditionally, so it only holds a meaningful digit when the
//       function returns true
//
// Returns true if c is an ASCII digit ('0'..'9'), false otherwise.
template <typename I>
really_inline bool parse_first_digit(const char c, I &i) {
  // Non-digits wrap around to a value above 9 once narrowed to unsigned char,
  // so the range check below covers both sides of '0'..'9'.
  const unsigned char digit = static_cast<unsigned char>(c - '0');
  i = digit;
  return digit <= 9;
}
315
+
296
316
really_inline bool parse_exponent (UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) {
297
317
bool neg_exp = false ;
298
318
if (' -' == *p) {
@@ -303,26 +323,15 @@ really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&
303
323
}
304
324
305
325
// e[+-] must be followed by a number
306
- if (!is_integer (*p)) { return INVALID_NUMBER (src); }
307
- unsigned char digit = static_cast <unsigned char >(*p - ' 0' );
308
- int64_t exp_number = digit;
309
- p++;
310
- if (is_integer (*p)) {
311
- digit = static_cast <unsigned char >(*p - ' 0' );
312
- exp_number = 10 * exp_number + digit;
313
- ++p;
314
- }
315
- if (is_integer (*p)) {
316
- digit = static_cast <unsigned char >(*p - ' 0' );
317
- exp_number = 10 * exp_number + digit;
326
+ int64_t exp_number;
327
+ if (!parse_first_digit (*p, exp_number)) { return INVALID_NUMBER (src); }
328
+ ++p;
329
+ if (parse_digit (*p, exp_number)) { ++p; }
330
+ if (parse_digit (*p, exp_number)) { ++p; }
331
+ while (parse_digit (*p, exp_number)) {
318
332
++p;
319
- }
320
- while (is_integer (*p)) {
321
333
// we need to check for overflows; we refuse to parse this
322
334
if (exp_number > 0x100000000 ) { return INVALID_NUMBER (src); }
323
- digit = static_cast <unsigned char >(*p - ' 0' );
324
- exp_number = 10 * exp_number + digit;
325
- ++p;
326
335
}
327
336
exponent += (neg_exp ? -exp_number : exp_number);
328
337
return true ;
@@ -403,34 +412,23 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
403
412
if (found_minus) {
404
413
++p;
405
414
negative = true ;
406
- // a negative sign must be followed by an integer
407
- if (!is_integer (*p)) { return INVALID_NUMBER (src); }
408
415
}
416
+
417
+ //
418
+ // Parse the integer part.
419
+ //
409
420
const char *const start_digits = p;
421
+ uint64_t i;
422
+ if (!parse_first_digit (*p, i)) { return INVALID_NUMBER (src); }
423
+ ++p;
410
424
411
- uint64_t i; // an unsigned int avoids signed overflows (which are bad)
412
- if (*p == ' 0' ) {
413
- ++p;
414
- if (is_integer (*p)) { return INVALID_NUMBER (src); } // 0 cannot be followed by an integer
415
- i = 0 ;
425
+ if (i == 0 ) {
426
+ // If the integer starts with 0, just check that there are no more digits.
427
+ if (static_cast <unsigned char >(*p - ' 0' ) <= 9 ) { return INVALID_NUMBER (src); } // 0 cannot be followed by an integer
416
428
} else {
417
- // NOTE: This is a redundant check--either we're negative, in which case we checked whether this
418
- // is a digit above, or the caller already determined we start with a digit. But removing this
419
- // check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448
420
- // Please do try yourself, or think of ways to explain it--we'd love to understand :)
421
- if (!is_integer (*p)) { return INVALID_NUMBER (src); } // must start with an integer
422
- unsigned char digit = static_cast <unsigned char >(*p - ' 0' );
423
- i = digit;
424
- p++;
425
- // the is_made_of_eight_digits_fast routine is unlikely to help here because
426
- // we rarely see large integer parts like 123456789
427
- while (is_integer (*p)) {
428
- digit = static_cast <unsigned char >(*p - ' 0' );
429
- // a multiplication by 10 is cheaper than an arbitrary integer
430
- // multiplication
431
- i = 10 * i + digit; // might overflow, we will handle the overflow later
432
- ++p;
433
- }
429
+ // Integer starts with 1-9. Parse the rest of the integer
430
+ // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
431
+ while (parse_digit (*p, i)) { p++; }
434
432
}
435
433
436
434
//
0 commit comments