Sync up int parsing with integer branch

jkeiser · jkeiser · commit a50ac7a701b6 · 2020-07-10T14:54:03.000-07:00
diff --git a/include/simdjson/internal/numberparsing.h b/include/simdjson/internal/numberparsing.h
@@ -16,135 +16,105 @@ namespace {
 
 using namespace internal::logger;
 
-// Integers 19 digits or more: 10,000,000,000,000,000,000 to 18,446,744,073,709,551,615
-simdjson_result<uint64_t> convert_large_unsigned(const uint8_t *buf, int digits, uint64_t magnitude) {
-  log_event("  (large unsigned)", buf);
-  assert(digits >= 19);
-  if (digits > 19) {
-    log_error("20+ digits", buf);
-    return NUMBER_OUT_OF_RANGE;
+template<typename I> // I is the accumulator's integer type (the callers in this diff pass uint64_t)
+NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
+really_inline bool parse_digit(const uint8_t c, I &i) { // true: c was '0'-'9' and was appended to i; false: i is untouched
+  const uint8_t digit = static_cast<uint8_t>(c - '0'); // bytes below '0' wrap around to values above 9
+  if (digit > 9) {
+    return false;
   }
-  if (!is_structural_or_whitespace(buf[digits])) {
-    log_error("followed by non-ws/struct", buf);
-    return NUMBER_OUT_OF_RANGE;
-  }
-  if (buf[0] != '1') {
-    log_error("greater than 2e18", buf);
-    return NUMBER_OUT_OF_RANGE;
-  }
-
-  // We have 19 digits and a leading 1.
-  // 19,999,999,999,999,999,999 is the biggest number the user could have written.
-  // 18,446,744,073,709,551,615 is the biggest number uint64_t could store.
-  //  1,553,255,926,290,448,383 is the overflow of the biggest number we could store.
-  // 10,000,000,000,000,000,000 is the smallest number the user could have written.
-  // We assume that an overflow is lower than that.
-  if (magnitude < 10000000000000000000ULL) {
-    log_error("19-digit overflow", buf);
-    return NUMBER_OUT_OF_RANGE;
-  }
-  return magnitude;
-}
-
-// Integers 18 digits or more: 1,000,000,000,000,000,000 to  9,223,372,036,854,775,807
-//                        and -1,000,000,000,000,000,000 to -9,223,372,036,854,775,808
-simdjson_result<int64_t> convert_large_integer(const uint8_t *buf, int digits, uint64_t magnitude, bool negative) {
-  log_event("  (large integer)", buf);
-  assert(digits >= 18);
-  if (digits > 18) {
-    log_error("19+ digits", buf);
-    return NUMBER_OUT_OF_RANGE;
-  }
-  if (!is_structural_or_whitespace(buf[digits])) {
-    log_error("followed by non-ws/struct", buf);
-    return NUMBER_OUT_OF_RANGE;
-  }
-
-  // The number cannot have actually overflowed since it's stored in an unsigned integer;
-  // we just have to check whether it's bigger than INT64_MAX
-
-  // C++ can't reliably negate uint64_t INT64_MIN, it seems
-  if (negative && magnitude == (uint64_t(INT64_MAX)+1)) {
-    log_event("  (INT64_MIN)", buf);
-    return INT64_MIN;
-  }
-  if (magnitude > uint64_t(INT64_MAX)) {
-    log_error("18-digit overflow", buf);
-    return NUMBER_OUT_OF_RANGE;
-  }
-  return negative ? -static_cast<int64_t>(magnitude) : static_cast<int64_t>(magnitude);
-
+  // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
+  i = 10 * i + digit; // might overflow; not detected here, the callers check for it afterwards
+  return true;
 }
 
-} // namespace {}
+}; // namespace {}
 
 // Parse any number from 0 to 18,446,744,073,709,551,615
-really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const buf) noexcept {
-  // Parse the first digit
-  uint64_t magnitude = buf[0] - '0';
-  int digits = 1;
-  if (magnitude > 0) { // 0 cannot be followed by other digits
-    if (magnitude > 9) { // First thing is not a digit at all
-      log_error("non-digit start", buf);
-      return INCORRECT_TYPE;
-    }
-
-    // Parse remaining digits
-    while (1) {
-      uint8_t digit = static_cast<uint8_t>(buf[digits] - '0');
-      if (digit > 9) { break; }
-      magnitude = magnitude * 10 + digit;
-      digits++;
-    }
-
-    // Check for massive numbers
-    if (unlikely(digits >= 19)) {
-      return convert_large_unsigned(buf, digits, magnitude);
-    }
+really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
+  // Parse the integer part: accumulate the ASCII digits starting at src into i, leaving p on the
+  // first non-digit byte. The first digit is consumed on its own so a leading zero can be detected.
+  // i may wrap around for very long inputs; the digit-count checks further down catch that.
+  uint64_t i = 0;
+  const uint8_t *p = src;
+  p += parse_digit(*p, i);
+  bool leading_zero = (i == 0);
+  while (parse_digit(*p, i)) { p++; }
+
+  //
+  // Check for errors. digit_count is the number of digit bytes consumed (p only advances from src).
+  //
+  auto digit_count = p - src;
+  if ( !is_structural_or_whitespace(*p) || // . or e
+        digit_count == 0                || // no digits
+       (leading_zero && digit_count != 1)  // 0123 (zero must be solo)
+  ) {
+    return NUMBER_ERROR;
   }
-
-  // Next character can't be . or e--it must be whitespace, comma, end array or end bracket
-  if (!is_structural_or_whitespace(buf[digits])) {
-    log_error("followed by non-ws/struct", buf);
-    return INCORRECT_TYPE;
+  // Overflow checks. Up to 19 digits always fit in a uint64_t, so only 20+ digits need care.
+  if (digit_count > 20) { return NUMBER_OUT_OF_RANGE; }
+  if (digit_count == 20) {
+    // Positive overflow check:
+    // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
+    //   biggest uint64_t.
+    // - A 20 digit number starting with 1 may or may not fit, so its accumulated value is checked.
+    //   (A 20 digit number starting with 0 was already rejected above as a leading zero.)
+    // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
+    //   than 1,553,255,926,290,448,384.
+    // - That is smaller than the smallest possible 20-digit number the user could write:
+    //   10,000,000,000,000,000,000.
+    // - INT64_MAX (9,223,372,036,854,775,807) lies between those two values, so a result that is
+    //   less than or equal to INT64_MAX can only have come from a wrapped (overflowed) sum.
+    //
+    if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return NUMBER_OUT_OF_RANGE; }
   }
-  return magnitude;
+
+  //
+  // Return the number.
+  //
+  return i;
 }
 
 // Parse any number from  -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
-really_inline simdjson_result<int64_t> parse_integer(const uint8_t * buf) noexcept {
-  bool negative = (buf[0] == '-');
-  if (negative) { buf++; }
-
-  // Parse the first digit
-  uint64_t magnitude = buf[0] - '0';
-  int digits = 1;
-  if (magnitude > 0) { // 0 cannot be followed by other digits
-    if (magnitude > 9) { // First thing is not a digit at all
-      log_error("non-digit start", buf);
-      return INCORRECT_TYPE;
-    }
-
-    // Parse remaining digits
-    while (1) {
-      uint8_t digit = static_cast<uint8_t>(buf[digits] - '0');
-      if (digit > 9) { break; }
-      magnitude = magnitude * 10 + digit;
-      digits++;
-    }
-
-    // Check for massive numbers
-    if (unlikely(digits >= 19)) {
-      return convert_large_integer(buf, digits, magnitude, negative);
-    }
+really_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
+  // Parses an optional '-' followed by decimal digits at src into a signed 64-bit result.
+  // Check for minus sign and step past it, so that src points at the first digit from here on.
+  //
+  bool negative = (*src == '-');
+  src += negative;
+
+  // Parse the integer part: accumulate the digits into the unsigned magnitude i, leaving p on the
+  // first non-digit byte. The first digit is consumed on its own so a leading zero can be detected.
+  //
+  uint64_t i = 0;
+  const uint8_t *p = src;
+  p += parse_digit(*p, i);
+  bool leading_zero = (i == 0);
+  while (parse_digit(*p, i)) { p++; }
+
+  //
+  // Check for errors. digit_count is the number of digit bytes consumed (the sign is not counted).
+  //
+  auto digit_count = p - src;
+  if ( !is_structural_or_whitespace(*p) || // . or e
+        digit_count == 0                || // no digits
+       (leading_zero && digit_count != 1)  // 0123 (zero must be solo)
+  ) {
+    return NUMBER_ERROR;
   }
-
-  // Next character can't be . or e--it must be whitespace, comma, end array or end bracket
-  if (!is_structural_or_whitespace(buf[digits])) {
-    log_error("followed by non-ws/struct", buf);
-    return INCORRECT_TYPE;
+  // Overflow checks. Fewer than 19 digits always fit; 19 digits cannot wrap i but may exceed int64_t.
+  if (digit_count > 19) { return NUMBER_OUT_OF_RANGE; }
+  if (digit_count == 19) {
+    // The magnitude of INT64_MIN (INT64_MAX+1) has no positive int64_t counterpart. Special case it.
+    if (negative && i == uint64_t(INT64_MAX)+1) { return INT64_MIN; }
+    // Anything above INT64_MAX is either invalid or INT64_MIN.
+    if (i > uint64_t(INT64_MAX)) { return NUMBER_OUT_OF_RANGE; }
   }
-  return magnitude;
+
+  // Return the number. 0 - i is evaluated in uint64_t (wraps modulo 2^64, no signed negation).
+  // NOTE(review): presumably converted to int64_t by simdjson_result's constructor - confirm.
+  //
+  return negative ? 0 - i : i;
 }
 
 // really_inline simdjson_result<double> parse_double(const uint8_t * buf) noexcept {