Skip to content

Commit a50ac7a

Browse files
committed
Sync up int parsing with integer branch
1 parent c7cbe6c commit a50ac7a

File tree

1 file changed

+87
-117
lines changed

1 file changed

+87
-117
lines changed

include/simdjson/internal/numberparsing.h

Lines changed: 87 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -16,135 +16,105 @@ namespace {
1616

1717
using namespace internal::logger;
1818

19-
// Integers 19 digits or more: 10,000,000,000,000,000,000 to 18,446,744,073,709,551,615
20-
simdjson_result<uint64_t> convert_large_unsigned(const uint8_t *buf, int digits, uint64_t magnitude) {
21-
log_event(" (large unsigned)", buf);
22-
assert(digits >= 19);
23-
if (digits > 19) {
24-
log_error("20+ digits", buf);
25-
return NUMBER_OUT_OF_RANGE;
19+
template<typename I>
20+
NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
21+
really_inline bool parse_digit(const uint8_t c, I &i) {
22+
const uint8_t digit = static_cast<uint8_t>(c - '0');
23+
if (digit > 9) {
24+
return false;
2625
}
27-
if (!is_structural_or_whitespace(buf[digits])) {
28-
log_error("followed by non-ws/struct", buf);
29-
return NUMBER_OUT_OF_RANGE;
30-
}
31-
if (buf[0] != '1') {
32-
log_error("greater than 2e18", buf);
33-
return NUMBER_OUT_OF_RANGE;
34-
}
35-
36-
// We have 19 digits and a leading 1.
37-
// 19,999,999,999,999,999,999 is the biggest number the user could have written.
38-
// 18,446,744,073,709,551,615 is the biggest number uint64_t could store.
39-
// 1,553,255,926,290,448,383 is the overflow of the biggest number we could store.
40-
// 10,000,000,000,000,000,000 is the smallest number the user could have written.
41-
// We assume that an overflow is lower than that.
42-
if (magnitude < 10000000000000000000ULL) {
43-
log_error("19-digit overflow", buf);
44-
return NUMBER_OUT_OF_RANGE;
45-
}
46-
return magnitude;
47-
}
48-
49-
// Integers 18 digits or more: 1,000,000,000,000,000,000 to 9,223,372,036,854,775,807
50-
// and -1,000,000,000,000,000,000 to -9,223,372,036,854,775,808
51-
simdjson_result<int64_t> convert_large_integer(const uint8_t *buf, int digits, uint64_t magnitude, bool negative) {
52-
log_event(" (large integer)", buf);
53-
assert(digits >= 18);
54-
if (digits > 18) {
55-
log_error("19+ digits", buf);
56-
return NUMBER_OUT_OF_RANGE;
57-
}
58-
if (!is_structural_or_whitespace(buf[digits])) {
59-
log_error("followed by non-ws/struct", buf);
60-
return NUMBER_OUT_OF_RANGE;
61-
}
62-
63-
// The number cannot have actually overflowed since it's stored in an unsigned integer;
64-
// we just have to check whether it's bigger than INT64_MAX
65-
66-
// C++ can't reliably negate uint64_t INT64_MIN, it seems
67-
if (negative && magnitude == (uint64_t(INT64_MAX)+1)) {
68-
log_event(" (INT64_MIN)", buf);
69-
return INT64_MIN;
70-
}
71-
if (magnitude > uint64_t(INT64_MAX)) {
72-
log_error("18-digit overflow", buf);
73-
return NUMBER_OUT_OF_RANGE;
74-
}
75-
return negative ? -static_cast<int64_t>(magnitude) : static_cast<int64_t>(magnitude);
76-
26+
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
27+
i = 10 * i + digit; // might overflow, we will handle the overflow later
28+
return true;
7729
}
7830

79-
} // namespace {}
31+
}; // namespace {}
8032

8133
// Parse any number from 0 to 18,446,744,073,709,551,615
82-
really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const buf) noexcept {
83-
// Parse the first digit
84-
uint64_t magnitude = buf[0] - '0';
85-
int digits = 1;
86-
if (magnitude > 0) { // 0 cannot be followed by other digits
87-
if (magnitude > 9) { // First thing is not a digit at all
88-
log_error("non-digit start", buf);
89-
return INCORRECT_TYPE;
90-
}
91-
92-
// Parse remaining digits
93-
while (1) {
94-
uint8_t digit = static_cast<uint8_t>(buf[digits] - '0');
95-
if (digit > 9) { break; }
96-
magnitude = magnitude * 10 + digit;
97-
digits++;
98-
}
99-
100-
// Check for massive numbers
101-
if (unlikely(digits >= 19)) {
102-
return convert_large_unsigned(buf, digits, magnitude);
103-
}
34+
really_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
35+
//
36+
// Parse the integer part.
37+
//
38+
uint64_t i = 0;
39+
const uint8_t *p = src;
40+
p += parse_digit(*p, i);
41+
bool leading_zero = (i == 0);
42+
while (parse_digit(*p, i)) { p++; }
43+
44+
//
45+
// Check for errors
46+
//
47+
auto digit_count = p - src;
48+
if ( !is_structural_or_whitespace(*p) || // . or e
49+
digit_count == 0 || // no digits
50+
(leading_zero && digit_count != 1) // 0123 (zero must be solo)
51+
) {
52+
return NUMBER_ERROR;
10453
}
105-
106-
// Next character can't be . or e--it must be whitespace, comma, end array or end bracket
107-
if (!is_structural_or_whitespace(buf[digits])) {
108-
log_error("followed by non-ws/struct", buf);
109-
return INCORRECT_TYPE;
54+
// Overflow checks
55+
if (digit_count > 20) { return NUMBER_OUT_OF_RANGE; }
56+
if (digit_count == 20) {
57+
// Positive overflow check:
58+
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
59+
// biggest uint64_t.
60+
// - A 20 digit number starting with 1 has overflowed if the parsed value is less than or equal to INT64_MAX.
61+
// If we got here, it's a 20 digit number starting with the digit "1".
62+
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
63+
// than 1,553,255,926,290,448,384.
64+
// - That is smaller than the smallest possible 20-digit number the user could write:
65+
// 10,000,000,000,000,000,000.
66+
// - Therefore, if the number is positive and lower than that, it's overflow.
67+
// - Any overflowed value is therefore also less than or equal to 9,223,372,036,854,775,807 (INT64_MAX).
68+
//
69+
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return NUMBER_OUT_OF_RANGE; }
11070
}
111-
return magnitude;
71+
72+
//
73+
// Return the number.
74+
//
75+
return i;
11276
}
11377

11478
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
115-
really_inline simdjson_result<int64_t> parse_integer(const uint8_t * buf) noexcept {
116-
bool negative = (buf[0] == '-');
117-
if (negative) { buf++; }
118-
119-
// Parse the first digit
120-
uint64_t magnitude = buf[0] - '0';
121-
int digits = 1;
122-
if (magnitude > 0) { // 0 cannot be followed by other digits
123-
if (magnitude > 9) { // First thing is not a digit at all
124-
log_error("non-digit start", buf);
125-
return INCORRECT_TYPE;
126-
}
127-
128-
// Parse remaining digits
129-
while (1) {
130-
uint8_t digit = static_cast<uint8_t>(buf[digits] - '0');
131-
if (digit > 9) { break; }
132-
magnitude = magnitude * 10 + digit;
133-
digits++;
134-
}
135-
136-
// Check for massive numbers
137-
if (unlikely(digits >= 19)) {
138-
return convert_large_integer(buf, digits, magnitude, negative);
139-
}
79+
really_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
80+
//
81+
// Check for minus sign
82+
//
83+
bool negative = (*src == '-');
84+
src += negative;
85+
86+
//
87+
// Parse the integer part.
88+
//
89+
uint64_t i = 0;
90+
const uint8_t *p = src;
91+
p += parse_digit(*p, i);
92+
bool leading_zero = (i == 0);
93+
while (parse_digit(*p, i)) { p++; }
94+
95+
//
96+
// Check for errors
97+
//
98+
auto digit_count = p - src;
99+
if ( !is_structural_or_whitespace(*p) || // . or e
100+
digit_count == 0 || // no digits
101+
(leading_zero && digit_count != 1) // 0123 (zero must be solo)
102+
) {
103+
return NUMBER_ERROR;
140104
}
141-
142-
// Next character can't be . or e--it must be whitespace, comma, end array or end bracket
143-
if (!is_structural_or_whitespace(buf[digits])) {
144-
log_error("followed by non-ws/struct", buf);
145-
return INCORRECT_TYPE;
105+
// Overflow checks
106+
if (digit_count > 19) { return NUMBER_OUT_OF_RANGE; }
107+
if (digit_count == 19) {
108+
// C++ can't reliably negate uint64_t INT64_MIN, it seems. Special case it.
109+
if (negative && i == uint64_t(INT64_MAX)+1) { return INT64_MIN; }
110+
// Anything above INT64_MAX is either invalid or INT64_MIN.
111+
if (i > uint64_t(INT64_MAX)) { return NUMBER_OUT_OF_RANGE; }
146112
}
147-
return magnitude;
113+
114+
//
115+
// Return the number.
116+
//
117+
return negative ? 0 - i : i;
148118
}
149119

150120
// really_inline simdjson_result<double> parse_double(const uint8_t * buf) noexcept {

0 commit comments

Comments
 (0)