Merge pull request simdjson#1018 from simdjson/jkeiser/simplify-integer-parse

jkeiser · web-flow · commit 90cc1411da8c · 2020-07-16T12:21:43.000-07:00
Remove some branches from number parsing
diff --git a/src/arm64/numberparsing.h b/src/arm64/numberparsing.h
@@ -21,7 +21,7 @@ namespace arm64 {
 
 // we don't have SSE, so let us use a scalar function
 // credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
-static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
+static really_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
   uint64_t val;
   memcpy(&val, chars, sizeof(uint64_t));
   val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
diff --git a/src/fallback/numberparsing.h b/src/fallback/numberparsing.h
@@ -16,13 +16,16 @@ void found_float(double result, const uint8_t *buf);
 
 namespace simdjson {
 namespace fallback {
-static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
+static really_inline uint32_t parse_eight_digits_unrolled(const char *chars) {
   uint32_t result = 0;
   for (int i=0;i<8;i++) {
     result = result*10 + (chars[i] - '0');
   }
   return result;
 }
+static really_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
+  return parse_eight_digits_unrolled((const char *)chars);
+}
 
 #define SWAR_NUMBER_PARSING
 
diff --git a/src/generic/stage2/numberparsing.h b/src/generic/stage2/numberparsing.h
@@ -199,9 +199,9 @@ really_inline double compute_float_64(int64_t power, uint64_t i, bool negative,
   return d;
 }
 
-static bool parse_float_strtod(const char *ptr, double *outDouble) {
+static bool parse_float_strtod(const uint8_t *ptr, double *outDouble) {
   char *endptr;
-  *outDouble = strtod(ptr, &endptr);
+  *outDouble = strtod((const char *)ptr, &endptr);
   // Some libraries will set errno = ERANGE when the value is subnormal,
   // yet we may want to be able to parse subnormal values.
   // However, we do not want to tolerate NAN or infinite values.
@@ -222,22 +222,16 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
   // a float that does not fit in binary64. JSON for Modern C++ (nlohmann/json)
   // will flat out throw an exception.
   //
-  if ((endptr == ptr) || (!std::isfinite(*outDouble))) {
+  if ((endptr == (const char *)ptr) || (!std::isfinite(*outDouble))) {
     return false;
   }
   return true;
 }
 
-really_inline bool is_integer(char c) {
-  return (c >= '0' && c <= '9');
-  // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
-}
-
-
 // check quickly whether the next 8 chars are made of digits
 // at a glance, it looks better than Mula's
 // http://0x80.pl/articles/swar-digits-validate.html
-really_inline bool is_made_of_eight_digits_fast(const char *chars) {
+really_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
   uint64_t val;
   // this can read up to 7 bytes beyond the buffer size, but we require
   // SIMDJSON_PADDING of padding
@@ -253,28 +247,34 @@ really_inline bool is_made_of_eight_digits_fast(const char *chars) {
 }
 
 template<typename W>
-bool slow_float_parsing(UNUSED const char * src, W writer) {
+bool slow_float_parsing(UNUSED const uint8_t * src, W writer) {
   double d;
   if (parse_float_strtod(src, &d)) {
-    WRITE_DOUBLE(d, (const uint8_t *)src, writer);
+    WRITE_DOUBLE(d, src, writer);
     return true;
   }
-  return INVALID_NUMBER((const uint8_t *)src);
+  return INVALID_NUMBER(src);
+}
+
+template<typename I>
+NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
+really_inline bool parse_digit(const uint8_t c, I &i) {
+  const uint8_t digit = static_cast<uint8_t>(c - '0');
+  if (digit > 9) {
+    return false;
+  }
+  // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
+  i = 10 * i + digit; // might overflow, we will handle the overflow later
+  return true;
 }
 
-really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p, uint64_t &i, int64_t &exponent) {
+really_inline bool parse_decimal(UNUSED const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
   // we continue with the fiction that we have an integer. If the
   // floating point number is representable as x * 10^z for some integer
   // z that fits in 53 bits, then we will be able to convert back the
   // the integer into a float in a lossless manner.
-  const char *const first_after_period = p;
-  if (!is_integer(*p)) { return INVALID_NUMBER(src); } // There must be at least one digit after the .
-
-  unsigned char digit = static_cast<unsigned char>(*p - '0');
-  ++p;
-  i = i * 10 + digit; // might overflow + multiplication by 10 is likely
-                      // cheaper than arbitrary mult.
-  // we will handle the overflow later
+  const uint8_t *const first_after_period = p;
+
 #ifdef SWAR_NUMBER_PARSING
   // this helps if we have lots of decimals!
   // this turns out to be frequent enough.
@@ -283,61 +283,51 @@ really_inline bool parse_decimal(UNUSED const uint8_t *const src, const char *&p
     p += 8;
   }
 #endif
-  while (is_integer(*p)) {
-    digit = static_cast<unsigned char>(*p - '0');
-    ++p;
-    i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
-                        // because we have parse_highprecision_float later.
-  }
+  // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
+  if (parse_digit(*p, i)) { ++p; }
+  while (parse_digit(*p, i)) { p++; }
   exponent = first_after_period - p;
+  // Decimal without digits (123.) is illegal
+  if (exponent == 0) {
+    return INVALID_NUMBER(src);
+  }
   return true;
 }
 
-really_inline bool parse_exponent(UNUSED const uint8_t *const src, const char *&p, int64_t &exponent) {
-  bool neg_exp = false;
-  if ('-' == *p) {
-    neg_exp = true;
-    ++p;
-  } else if ('+' == *p) {
-    ++p;
-  }
+really_inline bool parse_exponent(UNUSED const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
+  // Exp Sign: -123.456e[-]78
+  bool neg_exp = ('-' == *p);
+  if (neg_exp || '+' == *p) { p++; } // Skip + as well
 
-  // e[+-] must be followed by a number
-  if (!is_integer(*p)) { return INVALID_NUMBER(src); }
-  unsigned char digit = static_cast<unsigned char>(*p - '0');
-  int64_t exp_number = digit;
-  p++;
-  if (is_integer(*p)) {
-    digit = static_cast<unsigned char>(*p - '0');
-    exp_number = 10 * exp_number + digit;
-    ++p;
-  }
-  if (is_integer(*p)) {
-    digit = static_cast<unsigned char>(*p - '0');
-    exp_number = 10 * exp_number + digit;
-    ++p;
-  }
-  while (is_integer(*p)) {
-    // we need to check for overflows; we refuse to parse this
-    if (exp_number > 0x100000000) { return INVALID_NUMBER(src); }
-    digit = static_cast<unsigned char>(*p - '0');
-    exp_number = 10 * exp_number + digit;
-    ++p;
-  }
+  // Exponent: -123.456e-[78]
+  auto start_exp = p;
+  int64_t exp_number = 0;
+  while (parse_digit(*p, exp_number)) { ++p; }
   exponent += (neg_exp ? -exp_number : exp_number);
+
+  // If there were no digits, it's an error.
+  // If there were more than 18 digits, we may have overflowed the integer.
+  if (unlikely(p == start_exp || p > start_exp+18)) {
+    // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow
+    while (*start_exp == '0') { start_exp++; }
+    // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
+    // support exponents smaller than -9,999,999,999,999,999,999 and bigger
+    // than 9,999,999,999,999,999,999.
+    if (p == start_exp || p > start_exp+18) { return INVALID_NUMBER(src); }
+  }
   return true;
 }
 
 template<typename W>
-really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const char * start_digits, int digit_count, int64_t exponent, W &writer) {
+really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, int digit_count, int64_t exponent, W &writer) {
   // If we frequently had to deal with long strings of digits,
   // we could extend our code by using a 128-bit integer instead
   // of a 64-bit integer. However, this is uncommon in practice.
   // digit count is off by 1 because of the decimal (assuming there was one).
   if (unlikely((digit_count-1 >= 19))) { // this is uncommon
     // It is possible that the integer had an overflow.
     // We have to handle the case where we have 0.0000somenumber.
-    const char *start = start_digits;
+    const uint8_t *start = start_digits;
     while ((*start == '0') || (*start == '.')) {
       start++;
     }
@@ -351,7 +341,7 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
       // 10000000000000000000000000000000000000000000e+308
       // 3.1415926535897932384626433832795028841971693993751
       //
-      bool success = slow_float_parsing((const char *) src, writer);
+      bool success = slow_float_parsing(src, writer);
       // The number was already written, but we made a copy of the writer
       // when we passed it to the parse_large_integer() function, so
       writer.skip_double();
@@ -364,7 +354,7 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
   if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) {
     // this is almost never going to get called!!!
     // we start anew, going slowly!!!
-    bool success = slow_float_parsing((const char *) src, writer);
+    bool success = slow_float_parsing(src, writer);
     // The number was already written, but we made a copy of the writer when we passed it to the
     // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
     writer.skip_double();
@@ -374,12 +364,23 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
   double d = compute_float_64(exponent, i, negative, &success);
   if (!success) {
     // we are almost never going to get here.
-    if (!parse_float_strtod((const char *)src, &d)) { return INVALID_NUMBER(src); }
+    if (!parse_float_strtod(src, &d)) { return INVALID_NUMBER(src); }
   }
   WRITE_DOUBLE(d, src, writer);
   return true;
 }
 
+// for performance analysis, it is sometimes  useful to skip parsing
+#ifdef SIMDJSON_SKIPNUMBERPARSING
+
+template<typename W>
+really_inline bool parse_number(const uint8_t *const, W &writer) {
+  writer.append_s64(0);        // always write zero
+  return true;                 // always succeeds
+}
+
+#else
+
 // parse the number at src
 // define JSON_TEST_NUMBERS for unit testing
 //
@@ -390,48 +391,25 @@ really_inline bool write_float(const uint8_t *const src, bool negative, uint64_t
 //
 // Our objective is accurate parsing (ULP of 0) at high speed.
 template<typename W>
-really_inline bool parse_number(UNUSED const uint8_t *const src,
-                                UNUSED bool found_minus,
-                                W &writer) {
-#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
-                                  // useful to skip parsing
-  writer.append_s64(0);        // always write zero
-  return true;                    // always succeeds
-#else
-  const char *p = reinterpret_cast<const char *>(src);
-  bool negative = false;
-  if (found_minus) {
-    ++p;
-    negative = true;
-    // a negative sign must be followed by an integer
-    if (!is_integer(*p)) { return INVALID_NUMBER(src); }
-  }
-  const char *const start_digits = p;
+really_inline bool parse_number(const uint8_t *const src, W &writer) {
 
-  uint64_t i;      // an unsigned int avoids signed overflows (which are bad)
-  if (*p == '0') {
-    ++p;
-    if (is_integer(*p)) { return INVALID_NUMBER(src); } // 0 cannot be followed by an integer
-    i = 0;
-  } else {
-    // NOTE: This is a redundant check--either we're negative, in which case we checked whether this
-    // is a digit above, or the caller already determined we start with a digit. But removing this
-    // check seems to make things slower: https://github.com/simdjson/simdjson/pull/990#discussion_r448512448
-    // Please do try yourself, or think of ways to explain it--we'd love to understand :)
-    if (!is_integer(*p)) { return INVALID_NUMBER(src); } // must start with an integer
-    unsigned char digit = static_cast<unsigned char>(*p - '0');
-    i = digit;
-    p++;
-    // the is_made_of_eight_digits_fast routine is unlikely to help here because
-    // we rarely see large integer parts like 123456789
-    while (is_integer(*p)) {
-      digit = static_cast<unsigned char>(*p - '0');
-      // a multiplication by 10 is cheaper than an arbitrary integer
-      // multiplication
-      i = 10 * i + digit; // might overflow, we will handle the overflow later
-      ++p;
-    }
-  }
+  //
+  // Check for minus sign
+  //
+  bool negative = (*src == '-');
+  const uint8_t *p = src + negative;
+
+  //
+  // Parse the integer part.
+  //
+  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
+  const uint8_t *const start_digits = p;
+  uint64_t i = 0;
+  while (parse_digit(*p, i)) { p++; }
+
+  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
+  int digit_count = int(p - start_digits);
+  if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
 
   //
   // Handle floats if there is a . or e (or both)
@@ -442,8 +420,8 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     is_float = true;
     ++p;
     if (!parse_decimal(src, p, i, exponent)) { return false; }
+    digit_count = int(p - start_digits); // used later to guard against overflows
   }
-  int digit_count = int(p - start_digits); // used later to guard against overflows
   if (('e' == *p) || ('E' == *p)) {
     is_float = true;
     ++p;
@@ -492,9 +470,9 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     WRITE_INTEGER(negative ? 0 - i : i, src, writer);
   }
   return is_structural_or_whitespace(*p);
+}
 
 #endif // SIMDJSON_SKIPNUMBERPARSING
-}
 
 } // namespace numberparsing
 } // namespace stage2
diff --git a/src/generic/stage2/structural_parser.h b/src/generic/stage2/structural_parser.h
@@ -159,17 +159,17 @@ struct structural_parser : structural_iterator {
     return false;
   }
 
-  WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
+  WARN_UNUSED really_inline bool parse_number(const uint8_t *src) {
     log_value("number");
-    bool succeeded = numberparsing::parse_number(src, found_minus, tape);
+    bool succeeded = numberparsing::parse_number(src, tape);
     if (!succeeded) { log_error("Invalid number"); }
     return !succeeded;
   }
-  WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(current(), found_minus);
+  WARN_UNUSED really_inline bool parse_number() {
+    return parse_number(current());
   }
 
-  really_inline bool parse_number_with_space_terminated_copy(const bool is_negative) {
+  really_inline bool parse_number_with_space_terminated_copy() {
     /**
     * We need to make a copy to make sure that the string is space terminated.
     * This is not about padding the input, which should already padded up
@@ -190,7 +190,7 @@ struct structural_parser : structural_iterator {
     memcpy(copy, buf, parser.len);
     memset(copy + parser.len, ' ', SIMDJSON_PADDING);
     size_t idx = *current_structural;
-    bool result = parse_number(&copy[idx], is_negative); // parse_number does not throw
+    bool result = parse_number(&copy[idx]); // parse_number does not throw
     free(copy);
     return result;
   }
@@ -214,12 +214,10 @@ struct structural_parser : structural_iterator {
       FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
       tape.append(0, internal::tape_type::NULL_VALUE);
       return continue_state;
+    case '-':
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
-      FAIL_IF( parse_number(false) );
-      return continue_state;
-    case '-':
-      FAIL_IF( parse_number(true) );
+      FAIL_IF( parse_number() );
       return continue_state;
     case '{':
       FAIL_IF( start_object(continue_state) );
@@ -375,18 +373,13 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
     FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
     parser.tape.append(0, internal::tape_type::NULL_VALUE);
     goto finish;
+  case '-':
   case '0': case '1': case '2': case '3': case '4':
   case '5': case '6': case '7': case '8': case '9':
     // Next line used to be an interesting functional programming exercise with
     // a lambda that gets passed to another function via a closure. This would confuse the
     // clangcl compiler under Visual Studio 2019 (recent release).
-    { if(parser.parse_number_with_space_terminated_copy(false)) { goto error; }}
-    goto finish;
-  case '-':
-    // Next line used to be an interesting functional programming exercise with
-    // a lambda that gets passed to another function via a closure. This would confuse the
-    // clangcl compiler under Visual Studio 2019 (recent release).
-    { if(parser.parse_number_with_space_terminated_copy(true)) { goto error; }}
+    FAIL_IF(parser.parse_number_with_space_terminated_copy());
     goto finish;
   default:
     parser.log_error("Document starts with a non-value character");
diff --git a/src/haswell/numberparsing.h b/src/haswell/numberparsing.h
diff --git a/src/westmere/numberparsing.h b/src/westmere/numberparsing.h