Skip to content

Commit fc0102b

Browse files
committed
Use common parse_digit() function in int parsing
1 parent 62a3963 commit fc0102b

File tree

1 file changed

+42
-44
lines changed

1 file changed

+42
-44
lines changed

src/generic/stage2/numberparsing.h

Lines changed: 42 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -268,9 +268,9 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
268268
// z that fits in 53 bits, then we will be able to convert back the
269269
// integer into a float in a lossless manner.
270270
const char *const first_after_period = p;
271-
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
272271

273272
unsigned char digit = static_cast<unsigned char>(*p - '0');
273+
if (digit > 9) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
274274
++p;
275275
i = i * 10 + digit; // might overflow + multiplication by 10 is likely
276276
// cheaper than arbitrary mult.
@@ -283,16 +283,36 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
283283
p += 8;
284284
}
285285
#endif
286-
while (is_integer(*p)) {
287-
digit = static_cast<unsigned char>(*p - '0');
286+
digit = static_cast<unsigned char>(*p - '0');
287+
while (digit <= 9) {
288288
++p;
289289
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
290290
// because we have parse_highprecision_float later.
291+
digit = static_cast<unsigned char>(*p - '0');
291292
}
292293
exponent = first_after_period - p;
293294
return true;
294295
}
295296

297+
// Accumulates one decimal digit into a running integer value.
//
// c: the character to examine.
// i: the running value; updated in place only when c is a digit.
//
// Returns true and sets i = 10 * i + (digit value of c) when c is in '0'..'9'.
// Returns false and leaves i untouched otherwise.
// No overflow check is performed here; the caller is responsible for it.
template<typename I>
298+
really_inline bool parse_digit(const char c, I &i) {
299+
// Subtracting '0' and narrowing to unsigned char maps '0'..'9' to 0..9 and
// every other character to a value above 9, so one comparison is enough.
const unsigned char digit = static_cast<unsigned char>(c - '0');
300+
if (digit <= 9) {
301+
// a multiplication by 10 is cheaper than an arbitrary integer
302+
// multiplication
303+
i = 10 * i + digit; // might overflow, we will handle the overflow later
304+
return true;
305+
} else {
306+
return false;
307+
}
308+
}
309+
// Initializes a running integer value from the first character of a number.
//
// c: the character to examine.
// i: receives the computed digit value; any previous content is overwritten.
//
// Returns true when c is in '0'..'9', false otherwise.
// Note that i is assigned unconditionally: when this returns false, i holds
// a value greater than 9 and is not a valid digit.
template<typename I>
310+
really_inline bool parse_first_digit(const char c, I &i) {
311+
// Characters outside '0'..'9' wrap to a value above 9 after the cast.
const unsigned char digit = static_cast<unsigned char>(c - '0');
312+
i = digit;
313+
return digit <= 9;
314+
}
315+
296316
really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) {
297317
bool neg_exp = false;
298318
if ('-' == *p) {
@@ -303,26 +323,15 @@ really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&
303323
}
304324

305325
// e[+-] must be followed by a number
306-
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
307-
unsigned char digit = static_cast<unsigned char>(*p - '0');
308-
int64_t exp_number = digit;
309-
p++;
310-
if (is_integer(*p)) {
311-
digit = static_cast<unsigned char>(*p - '0');
312-
exp_number = 10 * exp_number + digit;
313-
++p;
314-
}
315-
if (is_integer(*p)) {
316-
digit = static_cast<unsigned char>(*p - '0');
317-
exp_number = 10 * exp_number + digit;
326+
int64_t exp_number;
327+
if (!parse_first_digit(*p, exp_number)) { return INVALID_NUMBER(src); }
328+
++p;
329+
if (parse_digit(*p, exp_number)) { ++p; }
330+
if (parse_digit(*p, exp_number)) { ++p; }
331+
while (parse_digit(*p, exp_number)) {
318332
++p;
319-
}
320-
while (is_integer(*p)) {
321333
// we need to check for overflows; we refuse to parse this
322334
if (exp_number > 0x100000000) { return INVALID_NUMBER(src); }
323-
digit = static_cast<unsigned char>(*p - '0');
324-
exp_number = 10 * exp_number + digit;
325-
++p;
326335
}
327336
exponent += (neg_exp ? -exp_number : exp_number);
328337
return true;
@@ -403,34 +412,23 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
403412
if (found_minus) {
404413
++p;
405414
negative = true;
406-
// a negative sign must be followed by an integer
407-
if (!is_integer(*p)) { return INVALID_NUMBER(src); }
408415
}
416+
417+
//
418+
// Parse the integer part.
419+
//
409420
const char *const start_digits = p;
421+
uint64_t i;
422+
if (!parse_first_digit(*p, i)) { return INVALID_NUMBER(src); }
423+
++p;
410424

411-
uint64_t i; // an unsigned int avoids signed overflows (which are bad)
412-
if (*p == '0') {
413-
++p;
414-
if (is_integer(*p)) { return INVALID_NUMBER(src); } // 0 cannot be followed by an integer
415-
i = 0;
425+
if (i == 0) {
426+
// If the integer starts with 0, just check that there are no more digits.
427+
if (static_cast<unsigned char>(*p - '0') <= 9) { return INVALID_NUMBER(src); } // 0 cannot be followed by an integer
416428
} else {
417-
// NOTE: This is a redundant check--either we're negative, in which case we checked whether this
418-
// is a digit above, or the caller already determined we start with a digit. But removing this
419-
// check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448
420-
// Please do try yourself, or think of ways to explain it--we'd love to understand :)
421-
if (!is_integer(*p)) { return INVALID_NUMBER(src); } // must start with an integer
422-
unsigned char digit = static_cast<unsigned char>(*p - '0');
423-
i = digit;
424-
p++;
425-
// the is_made_of_eight_digits_fast routine is unlikely to help here because
426-
// we rarely see large integer parts like 123456789
427-
while (is_integer(*p)) {
428-
digit = static_cast<unsigned char>(*p - '0');
429-
// a multiplication by 10 is cheaper than an arbitrary integer
430-
// multiplication
431-
i = 10 * i + digit; // might overflow, we will handle the overflow later
432-
++p;
433-
}
429+
// Integer starts with 1-9. Parse the rest of the integer
430+
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
431+
while (parse_digit(*p, i)) { p++; }
434432
}
435433

436434
//

0 commit comments

Comments
 (0)