@@ -16,135 +16,105 @@ namespace {
16
16
17
17
using namespace internal ::logger;
18
18
19
- // Integers 19 digits or more: 10,000,000,000,000,000,000 to 18,446,744,073,709,551,615
20
- simdjson_result<uint64_t > convert_large_unsigned (const uint8_t *buf, int digits, uint64_t magnitude) {
21
- log_event (" (large unsigned)" , buf);
22
- assert (digits >= 19 );
23
- if (digits > 19 ) {
24
- log_error (" 20+ digits" , buf);
25
- return NUMBER_OUT_OF_RANGE;
19
+ template <typename I>
20
+ NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
21
+ really_inline bool parse_digit (const uint8_t c, I &i) {
22
+ const uint8_t digit = static_cast <uint8_t >(c - ' 0' );
23
+ if (digit > 9 ) {
24
+ return false ;
26
25
}
27
- if (!is_structural_or_whitespace (buf[digits])) {
28
- log_error (" followed by non-ws/struct" , buf);
29
- return NUMBER_OUT_OF_RANGE;
30
- }
31
- if (buf[0 ] != ' 1' ) {
32
- log_error (" greater than 2e18" , buf);
33
- return NUMBER_OUT_OF_RANGE;
34
- }
35
-
36
- // We have 19 digits and a leading 1.
37
- // 19,999,999,999,999,999,999 is the biggest number the user could have written.
38
- // 18,446,744,073,709,551,615 is the biggest number uint64_t could store.
39
- // 1,553,255,926,290,448,383 is the overflow of the biggest number we could store.
40
- // 10,000,000,000,000,000,000 is the smallest number the user could have written.
41
- // We assume that an overflow is lower than that.
42
- if (magnitude < 10000000000000000000ULL ) {
43
- log_error (" 19-digit overflow" , buf);
44
- return NUMBER_OUT_OF_RANGE;
45
- }
46
- return magnitude;
47
- }
48
-
49
- // Integers 18 digits or more: 1,000,000,000,000,000,000 to 9,223,372,036,854,775,807
50
- // and -1,000,000,000,000,000,000 to -9,223,372,036,854,775,808
51
- simdjson_result<int64_t > convert_large_integer (const uint8_t *buf, int digits, uint64_t magnitude, bool negative) {
52
- log_event (" (large integer)" , buf);
53
- assert (digits >= 18 );
54
- if (digits > 18 ) {
55
- log_error (" 19+ digits" , buf);
56
- return NUMBER_OUT_OF_RANGE;
57
- }
58
- if (!is_structural_or_whitespace (buf[digits])) {
59
- log_error (" followed by non-ws/struct" , buf);
60
- return NUMBER_OUT_OF_RANGE;
61
- }
62
-
63
- // The number cannot have actually overflowed since it's stored in an unsigned integer;
64
- // we just have to check whether it's bigger than INT64_MAX
65
-
66
- // C++ can't reliably negate uint64_t INT64_MIN, it seems
67
- if (negative && magnitude == (uint64_t (INT64_MAX)+1 )) {
68
- log_event (" (INT64_MIN)" , buf);
69
- return INT64_MIN;
70
- }
71
- if (magnitude > uint64_t (INT64_MAX)) {
72
- log_error (" 18-digit overflow" , buf);
73
- return NUMBER_OUT_OF_RANGE;
74
- }
75
- return negative ? -static_cast <int64_t >(magnitude) : static_cast <int64_t >(magnitude);
76
-
26
+ // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
27
+ i = 10 * i + digit; // might overflow, we will handle the overflow later
28
+ return true ;
77
29
}
78
30
79
- } // namespace {}
31
+ }; // namespace {}
80
32
81
33
// Parse any number from 0 to 18,446,744,073,709,551,615
82
- really_inline simdjson_result<uint64_t > parse_unsigned (const uint8_t * const buf) noexcept {
83
- // Parse the first digit
84
- uint64_t magnitude = buf[0 ] - ' 0' ;
85
- int digits = 1 ;
86
- if (magnitude > 0 ) { // 0 cannot be followed by other digits
87
- if (magnitude > 9 ) { // First thing is not a digit at all
88
- log_error (" non-digit start" , buf);
89
- return INCORRECT_TYPE;
90
- }
91
-
92
- // Parse remaining digits
93
- while (1 ) {
94
- uint8_t digit = static_cast <uint8_t >(buf[digits] - ' 0' );
95
- if (digit > 9 ) { break ; }
96
- magnitude = magnitude * 10 + digit;
97
- digits++;
98
- }
99
-
100
- // Check for massive numbers
101
- if (unlikely (digits >= 19 )) {
102
- return convert_large_unsigned (buf, digits, magnitude);
103
- }
34
+ really_inline simdjson_result<uint64_t > parse_unsigned (const uint8_t * const src) noexcept {
35
+ //
36
+ // Parse the integer part.
37
+ //
38
+ uint64_t i = 0 ;
39
+ const uint8_t *p = src;
40
+ p += parse_digit (*p, i);
41
+ bool leading_zero = (i == 0 );
42
+ while (parse_digit (*p, i)) { p++; }
43
+
44
+ //
45
+ // Check for errors
46
+ //
47
+ auto digit_count = src - p;
48
+ if ( !is_structural_or_whitespace (*p) || // . or e
49
+ digit_count == 0 || // no digits
50
+ (leading_zero && digit_count != 1 ) // 0123 (zero must be solo)
51
+ ) {
52
+ return NUMBER_ERROR;
104
53
}
105
-
106
- // Next character can't be . or e--it must be whitespace, comma, end array or end bracket
107
- if (!is_structural_or_whitespace (buf[digits])) {
108
- log_error (" followed by non-ws/struct" , buf);
109
- return INCORRECT_TYPE;
54
+ // Overflow checks
55
+ if (digit_count > 20 ) { return NUMBER_OUT_OF_RANGE; }
56
+ if (digit_count == 20 ) {
57
+ // Positive overflow check:
58
+ // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
59
+ // biggest uint64_t.
60
+ // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
61
+ // If we got here, it's a 20 digit number starting with the digit "1".
62
+ // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
63
+ // than 1,553,255,926,290,448,384.
64
+ // - That is smaller than the smallest possible 20-digit number the user could write:
65
+ // 10,000,000,000,000,000,000.
66
+ // - Therefore, if the number is positive and lower than that, it's overflow.
67
+ // - The value we are looking at is less than or equal to 9,223,372,036,854,775,808 (INT64_MAX).
68
+ //
69
+ if (src[0 ] != uint8_t (' 1' ) || i <= uint64_t (INT64_MAX)) { return NUMBER_OUT_OF_RANGE; }
110
70
}
111
- return magnitude;
71
+
72
+ //
73
+ // Return the number.
74
+ //
75
+ return i;
112
76
}
113
77
114
78
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
115
- really_inline simdjson_result<int64_t > parse_integer (const uint8_t * buf ) noexcept {
116
- bool negative = (buf[ 0 ] == ' - ' );
117
- if (negative) { buf++; }
118
-
119
- // Parse the first digit
120
- uint64_t magnitude = buf[ 0 ] - ' 0 ' ;
121
- int digits = 1 ;
122
- if (magnitude > 0 ) { // 0 cannot be followed by other digits
123
- if (magnitude > 9 ) { // First thing is not a digit at all
124
- log_error ( " non-digit start " , buf);
125
- return INCORRECT_TYPE ;
126
- }
127
-
128
- // Parse remaining digits
129
- while (1 ) {
130
- uint8_t digit = static_cast < uint8_t >(buf[digits] - ' 0 ' );
131
- if (digit > 9 ) { break ; }
132
- magnitude = magnitude * 10 + digit;
133
- digits++;
134
- }
135
-
136
- // Check for massive numbers
137
- if ( unlikely (digits >= 19 )) {
138
- return convert_large_integer (buf, digits, magnitude, negative);
139
- }
79
+ really_inline simdjson_result<int64_t > parse_integer (const uint8_t *src ) noexcept {
80
+ //
81
+ // Check for minus sign
82
+ //
83
+ bool negative = (*src == ' - ' );
84
+ src += negative ;
85
+
86
+ //
87
+ // Parse the integer part.
88
+ //
89
+ uint64_t i = 0 ;
90
+ const uint8_t *p = src;
91
+ p += parse_digit (*p, i);
92
+ bool leading_zero = (i == 0 );
93
+ while (parse_digit (*p, i)) { p++; }
94
+
95
+ //
96
+ // Check for errors
97
+ //
98
+ auto digit_count = p - src;
99
+ if ( ! is_structural_or_whitespace (*p) || // . or e
100
+ digit_count == 0 || // no digits
101
+ (leading_zero && digit_count != 1 ) // 0123 (zero must be solo)
102
+ ) {
103
+ return NUMBER_ERROR;
140
104
}
141
-
142
- // Next character can't be . or e--it must be whitespace, comma, end array or end bracket
143
- if (!is_structural_or_whitespace (buf[digits])) {
144
- log_error (" followed by non-ws/struct" , buf);
145
- return INCORRECT_TYPE;
105
+ // Overflow checks
106
+ if (digit_count > 19 ) { return NUMBER_OUT_OF_RANGE; }
107
+ if (digit_count == 19 ) {
108
+ // C++ can't reliably negate uint64_t INT64_MIN, it seems. Special case it.
109
+ if (negative && i == uint64_t (INT64_MAX)+1 ) { return INT64_MIN; }
110
+ // Anything above INT64_MAX is either invalid or INT64_MIN.
111
+ if (i > uint64_t (INT64_MAX)) { return NUMBER_OUT_OF_RANGE; }
146
112
}
147
- return magnitude;
113
+
114
+ //
115
+ // Return the number.
116
+ //
117
+ return negative ? 0 - i : i;
148
118
}
149
119
150
120
// really_inline simdjson_result<double> parse_double(const uint8_t * buf) noexcept {
0 commit comments