@@ -199,9 +199,9 @@ really_inline double compute_float_64(int64_t power, uint64_t i, bool negative,
199
199
return d;
200
200
}
201
201
202
- static bool parse_float_strtod (const char *ptr, double *outDouble) {
202
+ static bool parse_float_strtod (const uint8_t *ptr, double *outDouble) {
203
203
char *endptr;
204
- *outDouble = strtod (ptr, &endptr);
204
+ *outDouble = strtod (( const char *) ptr, &endptr);
205
205
// Some libraries will set errno = ERANGE when the value is subnormal,
206
206
// yet we may want to be able to parse subnormal values.
207
207
// However, we do not want to tolerate NAN or infinite values.
@@ -222,22 +222,16 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
222
222
// a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json)
223
223
// will flat out throw an exception.
224
224
//
225
- if ((endptr == ptr) || (!std::isfinite (*outDouble))) {
225
+ if ((endptr == ( const char *) ptr) || (!std::isfinite (*outDouble))) {
226
226
return false ;
227
227
}
228
228
return true ;
229
229
}
230
230
231
- really_inline bool is_integer (char c) {
232
- return (c >= ' 0' && c <= ' 9' );
233
- // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
234
- }
235
-
236
-
237
231
// check quickly whether the next 8 chars are made of digits
238
232
// at a glance, it looks better than Mula's
239
233
// http://0x80.pl/articles/swar-digits-validate.html
240
- really_inline bool is_made_of_eight_digits_fast (const char *chars) {
234
+ really_inline bool is_made_of_eight_digits_fast (const uint8_t *chars) {
241
235
uint64_t val;
242
236
// this can read up to 7 bytes beyond the buffer size, but we require
243
237
// SIMDJSON_PADDING of padding
@@ -253,28 +247,34 @@ really_inline bool is_made_of_eight_digits_fast(const char *chars) {
253
247
}
254
248
255
249
template <typename W>
256
- bool slow_float_parsing (UNUSED const char * src, W writer) {
250
+ bool slow_float_parsing (UNUSED const uint8_t * src, W writer) {
257
251
double d;
258
252
if (parse_float_strtod (src, &d)) {
259
- WRITE_DOUBLE (d, ( const uint8_t *) src, writer);
253
+ WRITE_DOUBLE (d, src, writer);
260
254
return true ;
261
255
}
262
- return INVALID_NUMBER ((const uint8_t *)src);
256
+ return INVALID_NUMBER (src);
257
+ }
258
+
259
+ template <typename I>
260
+ NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
261
+ really_inline bool parse_digit (const uint8_t c, I &i) {
262
+ const uint8_t digit = static_cast <uint8_t >(c - ' 0' );
263
+ if (digit > 9 ) {
264
+ return false ;
265
+ }
266
+ // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
267
+ i = 10 * i + digit; // might overflow, we will handle the overflow later
268
+ return true ;
263
269
}
264
270
265
- really_inline bool parse_decimal (UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) {
271
+ really_inline bool parse_decimal (UNUSED const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
266
272
// we continue with the fiction that we have an integer. If the
267
273
// floating point number is representable as x * 10^z for some integer
268
274
// z that fits in 53 bits, then we will be able to convert back the
269
275
// integer into a float in a lossless manner.
270
- const char *const first_after_period = p;
271
- if (!is_integer (*p)) { return INVALID_NUMBER (src); } // There must be at least one digit after the .
272
-
273
- unsigned char digit = static_cast <unsigned char >(*p - ' 0' );
274
- ++p;
275
- i = i * 10 + digit; // might overflow + multiplication by 10 is likely
276
- // cheaper than arbitrary mult.
277
- // we will handle the overflow later
276
+ const uint8_t *const first_after_period = p;
277
+
278
278
#ifdef SWAR_NUMBER_PARSING
279
279
// this helps if we have lots of decimals!
280
280
// this turns out to be frequent enough.
@@ -283,61 +283,51 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
283
283
p += 8 ;
284
284
}
285
285
#endif
286
- while (is_integer (*p)) {
287
- digit = static_cast <unsigned char >(*p - ' 0' );
288
- ++p;
289
- i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
290
- // because we have parse_highprecision_float later.
291
- }
286
+ // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
287
+ if (parse_digit (*p, i)) { ++p; }
288
+ while (parse_digit (*p, i)) { p++; }
292
289
exponent = first_after_period - p;
290
+ // Decimal without digits (123.) is illegal
291
+ if (exponent == 0 ) {
292
+ return INVALID_NUMBER (src);
293
+ }
293
294
return true ;
294
295
}
295
296
296
- really_inline bool parse_exponent (UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) {
297
- bool neg_exp = false ;
298
- if (' -' == *p) {
299
- neg_exp = true ;
300
- ++p;
301
- } else if (' +' == *p) {
302
- ++p;
303
- }
297
+ really_inline bool parse_exponent (UNUSED const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
298
+ // Exp Sign: -123.456e[-]78
299
+ bool neg_exp = (' -' == *p);
300
+ if (neg_exp || ' +' == *p) { p++; } // Skip + as well
304
301
305
- // e[+-] must be followed by a number
306
- if (!is_integer (*p)) { return INVALID_NUMBER (src); }
307
- unsigned char digit = static_cast <unsigned char >(*p - ' 0' );
308
- int64_t exp_number = digit;
309
- p++;
310
- if (is_integer (*p)) {
311
- digit = static_cast <unsigned char >(*p - ' 0' );
312
- exp_number = 10 * exp_number + digit;
313
- ++p;
314
- }
315
- if (is_integer (*p)) {
316
- digit = static_cast <unsigned char >(*p - ' 0' );
317
- exp_number = 10 * exp_number + digit;
318
- ++p;
319
- }
320
- while (is_integer (*p)) {
321
- // we need to check for overflows; we refuse to parse this
322
- if (exp_number > 0x100000000 ) { return INVALID_NUMBER (src); }
323
- digit = static_cast <unsigned char >(*p - ' 0' );
324
- exp_number = 10 * exp_number + digit;
325
- ++p;
326
- }
302
+ // Exponent: -123.456e-[78]
303
+ auto start_exp = p;
304
+ int64_t exp_number = 0 ;
305
+ while (parse_digit (*p, exp_number)) { ++p; }
327
306
exponent += (neg_exp ? -exp_number : exp_number);
307
+
308
+ // If there were no digits, it's an error.
309
+ // If there were more than 18 digits, we may have overflowed the integer.
310
+ if (unlikely (p == start_exp || p > start_exp+18 )) {
311
+ // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow
312
+ while (*start_exp == ' 0' ) { start_exp++; }
313
+ // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
314
+ // support exponents smaller than -9,999,999,999,999,999,999 and bigger
315
+ // than 9,999,999,999,999,999,999.
316
+ if (p == start_exp || p > start_exp+18 ) { return INVALID_NUMBER (src); }
317
+ }
328
318
return true ;
329
319
}
330
320
331
321
template <typename W>
332
- really_inline bool write_float (const uint8_t *const src, bool negative, uint64_t i, const char * start_digits, int digit_count, int64_t exponent, W &writer) {
322
+ really_inline bool write_float (const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, int digit_count, int64_t exponent, W &writer) {
333
323
// If we frequently had to deal with long strings of digits,
334
324
// we could extend our code by using a 128-bit integer instead
335
325
// of a 64-bit integer. However, this is uncommon in practice.
336
326
// digit count is off by 1 because of the decimal (assuming there was one).
337
327
if (unlikely ((digit_count-1 >= 19 ))) { // this is uncommon
338
328
// It is possible that the integer had an overflow.
339
329
// We have to handle the case where we have 0.0000somenumber.
340
- const char *start = start_digits;
330
+ const uint8_t *start = start_digits;
341
331
while ((*start == ' 0' ) || (*start == ' .' )) {
342
332
start++;
343
333
}
@@ -351,7 +341,7 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
351
341
// 10000000000000000000000000000000000000000000e+308
352
342
// 3.1415926535897932384626433832795028841971693993751
353
343
//
354
- bool success = slow_float_parsing (( const char *) src, writer);
344
+ bool success = slow_float_parsing (src, writer);
355
345
// The number was already written, but we made a copy of the writer
356
346
// when we passed it to the slow_float_parsing() function, so we have to skip those tape spots now
357
347
writer.skip_double ();
@@ -364,7 +354,7 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
364
354
if (unlikely (exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
365
355
// this is almost never going to get called!!!
366
356
// we start anew, going slowly!!!
367
- bool success = slow_float_parsing (( const char *) src, writer);
357
+ bool success = slow_float_parsing (src, writer);
368
358
// The number was already written, but we made a copy of the writer when we passed it to the
369
359
// slow_float_parsing() function, so we have to skip those tape spots now that we've returned
370
360
writer.skip_double ();
@@ -374,12 +364,23 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
374
364
double d = compute_float_64 (exponent, i, negative, &success);
375
365
if (!success) {
376
366
// we are almost never going to get here.
377
- if (!parse_float_strtod (( const char *) src, &d)) { return INVALID_NUMBER (src); }
367
+ if (!parse_float_strtod (src, &d)) { return INVALID_NUMBER (src); }
378
368
}
379
369
WRITE_DOUBLE (d, src, writer);
380
370
return true ;
381
371
}
382
372
373
+ // for performance analysis, it is sometimes useful to skip parsing
374
+ #ifdef SIMDJSON_SKIPNUMBERPARSING
375
+
376
+ template <typename W>
377
+ really_inline bool parse_number (const uint8_t *const , W &writer) {
378
+ writer.append_s64 (0 ); // always write zero
379
+ return true ; // always succeeds
380
+ }
381
+
382
+ #else
383
+
383
384
// parse the number at src
384
385
// define JSON_TEST_NUMBERS for unit testing
385
386
//
@@ -390,48 +391,25 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
390
391
//
391
392
// Our objective is accurate parsing (ULP of 0) at high speed.
392
393
template <typename W>
393
- really_inline bool parse_number (UNUSED const uint8_t *const src,
394
- UNUSED bool found_minus,
395
- W &writer) {
396
- #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
397
- // useful to skip parsing
398
- writer.append_s64 (0 ); // always write zero
399
- return true ; // always succeeds
400
- #else
401
- const char *p = reinterpret_cast <const char *>(src);
402
- bool negative = false ;
403
- if (found_minus) {
404
- ++p;
405
- negative = true ;
406
- // a negative sign must be followed by an integer
407
- if (!is_integer (*p)) { return INVALID_NUMBER (src); }
408
- }
409
- const char *const start_digits = p;
394
+ really_inline bool parse_number (const uint8_t *const src, W &writer) {
410
395
411
- uint64_t i; // an unsigned int avoids signed overflows (which are bad)
412
- if (*p == ' 0' ) {
413
- ++p;
414
- if (is_integer (*p)) { return INVALID_NUMBER (src); } // 0 cannot be followed by an integer
415
- i = 0 ;
416
- } else {
417
- // NOTE: This is a redundant check--either we're negative, in which case we checked whether this
418
- // is a digit above, or the caller already determined we start with a digit. But removing this
419
- // check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448
420
- // Please do try yourself, or think of ways to explain it--we'd love to understand :)
421
- if (!is_integer (*p)) { return INVALID_NUMBER (src); } // must start with an integer
422
- unsigned char digit = static_cast <unsigned char >(*p - ' 0' );
423
- i = digit;
424
- p++;
425
- // the is_made_of_eight_digits_fast routine is unlikely to help here because
426
- // we rarely see large integer parts like 123456789
427
- while (is_integer (*p)) {
428
- digit = static_cast <unsigned char >(*p - ' 0' );
429
- // a multiplication by 10 is cheaper than an arbitrary integer
430
- // multiplication
431
- i = 10 * i + digit; // might overflow, we will handle the overflow later
432
- ++p;
433
- }
434
- }
396
+ //
397
+ // Check for minus sign
398
+ //
399
+ bool negative = (*src == ' -' );
400
+ const uint8_t *p = src + negative;
401
+
402
+ //
403
+ // Parse the integer part.
404
+ //
405
+ // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
406
+ const uint8_t *const start_digits = p;
407
+ uint64_t i = 0 ;
408
+ while (parse_digit (*p, i)) { p++; }
409
+
410
+ // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
411
+ int digit_count = int (p - start_digits);
412
+ if (digit_count == 0 || (' 0' == *start_digits && digit_count > 1 )) { return INVALID_NUMBER (src); }
435
413
436
414
//
437
415
// Handle floats if there is a . or e (or both)
@@ -442,8 +420,8 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
442
420
is_float = true ;
443
421
++p;
444
422
if (!parse_decimal (src, p, i, exponent)) { return false ; }
423
+ digit_count = int (p - start_digits); // used later to guard against overflows
445
424
}
446
- int digit_count = int (p - start_digits); // used later to guard against overflows
447
425
if ((' e' == *p) || (' E' == *p)) {
448
426
is_float = true ;
449
427
++p;
@@ -492,9 +470,9 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
492
470
WRITE_INTEGER (negative ? 0 - i : i, src, writer);
493
471
}
494
472
return is_structural_or_whitespace (*p);
473
+ }
495
474
496
475
#endif // SIMDJSON_SKIPNUMBERPARSING
497
- }
498
476
499
477
} // namespace numberparsing
500
478
} // namespace stage2
0 commit comments