Skip to content

Commit c1f27fb

Browse files
saka1lemire
authored and committed
Accept large unsigned integers (simdjson#295)
* handle uint64 value in JSON
* Add integer_tests
* Add get_unsigned_integer() on ParsedJson::BasicIterator
* Write 'u' to tape when the value seems unsigned
* Add to handle 'u' element
* Brush up integer_tests.cpp
* Append tests/integer_tests in .gitignore
* Add comments to is_integer and is_unsigned_integer
1 parent 6d0fd5b commit c1f27fb

12 files changed

+177
-36
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
/tests/basictests
3333
/tests/jsoncheck
3434
/tests/pointercheck
35+
/tests/integer_tests
3536
/tools/json2json
3637
/tools/jsonstats
3738
/tools/minify

benchmark/parseandstatcompetition.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ simdjson_compute_stats(const simdjson::padded_string &p) {
7272
answer.number_count++;
7373
tape_idx++; // skipping the integer
7474
break;
75+
case 'u': // we have a long uint
76+
answer.number_count++;
77+
tape_idx++; // skipping the unsigned integer
78+
break;
7579
case 'd': // we have a double
7680
answer.number_count++;
7781
tape_idx++; // skipping the double

benchmark/statisticalmodel.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
7878
answer.integer_count++;
7979
tape_idx++; // skipping the integer
8080
break;
81+
case 'u': // we have a long uint
82+
answer.integer_count++;
83+
tape_idx++; // skipping the integer
84+
break;
8185
case 'd': // we have a double
8286
answer.float_count++;
8387
tape_idx++; // skipping the double

include/simdjson/numberparsing.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -375,15 +375,20 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
375375
found_invalid_number(buf + offset);
376376
#endif
377377
return false; // overflow
378+
} else if (i == 0x8000000000000000) {
379+
constexpr int64_t signed_answer = INT64_MIN;
380+
pj.write_tape_s64(signed_answer);
381+
#ifdef JSON_TEST_NUMBERS // for unit testing
382+
found_integer(signed_answer, buf + offset);
383+
#endif
384+
return is_structural_or_whitespace(*p);
378385
}
379386
} else {
380-
if (i >= 0x8000000000000000) {
381-
// overflows!
382387
#ifdef JSON_TEST_NUMBERS // for unit testing
383-
found_invalid_number(buf + offset);
388+
found_integer(i, buf + offset);
384389
#endif
385-
return false; // overflow
386-
}
390+
pj.write_tape_u64(i);
391+
return is_structural_or_whitespace(*p);
387392
}
388393
int64_t signed_answer =
389394
negative ? -static_cast<int64_t>(i) : static_cast<int64_t>(i);

include/simdjson/parsedjson.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,11 @@ class ParsedJson {
7979
tape[current_loc++] = *(reinterpret_cast<uint64_t *>(&i));
8080
}
8181

82+
really_inline void write_tape_u64(uint64_t i) {
83+
write_tape(0, 'u');
84+
tape[current_loc++] = i;
85+
}
86+
8287
really_inline void write_tape_double(double d) {
8388
write_tape(0, 'd');
8489
static_assert(sizeof(d) == sizeof(tape[current_loc]), "mismatch size");

include/simdjson/parsedjsoniterator.h

Lines changed: 66 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
#ifndef SIMDJSON_PARSEDJSONITERATOR_H
22
#define SIMDJSON_PARSEDJSONITERATOR_H
33

4-
#include "simdjson/parsedjson.h"
54
#include "simdjson/jsonformatutils.h"
5+
#include "simdjson/parsedjson.h"
66
#include <cstring>
77
#include <iostream>
8-
#include <limits>
98
#include <iterator>
9+
#include <limits>
1010

1111
namespace simdjson {
12-
template <size_t max_depth>
13-
class ParsedJson::BasicIterator {
12+
template <size_t max_depth> class ParsedJson::BasicIterator {
1413
// might throw InvalidJSON if ParsedJson is invalid
1514
public:
1615
explicit BasicIterator(ParsedJson &pj_);
@@ -51,6 +50,14 @@ class ParsedJson::BasicIterator {
5150
return static_cast<int64_t>(pj->tape[location + 1]);
5251
}
5352

53+
// get the value as uint64
54+
inline uint64_t get_unsigned_integer() const {
55+
if (location + 1 >= tape_length) {
56+
return 0; // default value in case of error
57+
}
58+
return pj->tape[location + 1];
59+
}
60+
5461
// get the string value at this node (NULL ended); valid only if we're at "
5562
// note that tabs, and line endings are escaped in the returned value (see
5663
// print_with_escapes) return value is valid UTF-8 It may contain NULL chars
@@ -90,10 +97,26 @@ class ParsedJson::BasicIterator {
9097

9198
inline bool is_string() const { return get_type() == '"'; }
9299

100+
// Returns true if the current type of node is a signed integer.
101+
// You can get its value with `get_integer()`.
93102
inline bool is_integer() const { return get_type() == 'l'; }
94103

104+
// Returns true if the current type of node is an unsigned integer.
105+
// You can get its value with `get_unsigned_integer()`.
106+
//
107+
// NOTE:
108+
// Only a large value, which is out of range of a 64-bit signed integer, is
109+
// represented internally as an unsigned node. On the other hand, a typical
110+
// positive integer, such as 1, 42, or 1000000, is stored as a signed node.
111+
// Be aware this function returns false for a signed node.
112+
inline bool is_unsigned_integer() const { return get_type() == 'u'; }
113+
95114
inline bool is_double() const { return get_type() == 'd'; }
96115

116+
inline bool is_number() const {
117+
return is_integer() || is_unsigned_integer() || is_double();
118+
}
119+
97120
inline bool is_true() const { return get_type() == 't'; }
98121

99122
inline bool is_false() const { return get_type() == 'f'; }
@@ -110,7 +133,7 @@ class ParsedJson::BasicIterator {
110133
// (in case of repeated keys, this only finds the first one).
111134
// We seek the key using C's strcmp so if your JSON strings contain
112135
// NULL chars, this would trigger a false positive: if you expect that
113-
// to be the case, take extra precautions.
136+
// to be the case, take extra precautions.
114137
// Furthermore, we do the comparison character-by-character
115138
// without taking into account Unicode equivalence.
116139
inline bool move_to_key(const char *key);
@@ -230,21 +253,28 @@ class ParsedJson::BasicIterator {
230253
};
231254

232255
template <size_t max_depth>
233-
WARN_UNUSED
234-
bool ParsedJson::BasicIterator<max_depth>::is_ok() const { return location < tape_length; }
256+
WARN_UNUSED bool ParsedJson::BasicIterator<max_depth>::is_ok() const {
257+
return location < tape_length;
258+
}
235259

236260
// useful for debugging purposes
237261
template <size_t max_depth>
238-
size_t ParsedJson::BasicIterator<max_depth>::get_tape_location() const { return location; }
262+
size_t ParsedJson::BasicIterator<max_depth>::get_tape_location() const {
263+
return location;
264+
}
239265

240266
// useful for debugging purposes
241267
template <size_t max_depth>
242-
size_t ParsedJson::BasicIterator<max_depth>::get_tape_length() const { return tape_length; }
268+
size_t ParsedJson::BasicIterator<max_depth>::get_tape_length() const {
269+
return tape_length;
270+
}
243271

244272
// returns the current depth (start at 1 with 0 reserved for the fictitious root
245273
// node)
246274
template <size_t max_depth>
247-
size_t ParsedJson::BasicIterator<max_depth>::get_depth() const { return depth; }
275+
size_t ParsedJson::BasicIterator<max_depth>::get_depth() const {
276+
return depth;
277+
}
248278

249279
// A scope is a series of nodes at the same depth, typically it is either an
250280
// object ({) or an array ([). The root node has type 'r'.
@@ -268,8 +298,8 @@ bool ParsedJson::BasicIterator<max_depth>::move_forward() {
268298
} else if ((current_type == ']') || (current_type == '}')) {
269299
// Leaving a scope.
270300
depth--;
271-
} else if ((current_type == 'd') || (current_type == 'l')) {
272-
// d and l types use 2 locations on the tape, not just one.
301+
} else if (is_number()) {
302+
// these types use 2 locations on the tape, not just one.
273303
location += 1;
274304
}
275305

@@ -305,7 +335,8 @@ bool ParsedJson::BasicIterator<max_depth>::move_to_key(const char *key) {
305335
}
306336

307337
template <size_t max_depth>
308-
bool ParsedJson::BasicIterator<max_depth>::move_to_key(const char *key, uint32_t length) {
338+
bool ParsedJson::BasicIterator<max_depth>::move_to_key(const char *key,
339+
uint32_t length) {
309340
if (down()) {
310341
do {
311342
assert(is_string());
@@ -339,33 +370,31 @@ bool ParsedJson::BasicIterator<max_depth>::move_to_index(uint32_t index) {
339370
return false;
340371
}
341372

342-
template <size_t max_depth>
343-
bool ParsedJson::BasicIterator<max_depth>::prev() {
373+
template <size_t max_depth> bool ParsedJson::BasicIterator<max_depth>::prev() {
344374
size_t target_location = location;
345375
to_start_scope();
346376
size_t npos = location;
347-
if(target_location == npos) {
377+
if (target_location == npos) {
348378
return false; // we were already at the start
349379
}
350380
size_t oldnpos;
351381
// we have that npos < target_location here
352382
do {
353383
oldnpos = npos;
354384
if ((current_type == '[') || (current_type == '{')) {
355-
// we need to jump
385+
// we need to jump
356386
npos = (current_val & JSON_VALUE_MASK);
357387
} else {
358388
npos = npos + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
359389
}
360-
} while(npos < target_location);
390+
} while (npos < target_location);
361391
location = oldnpos;
362392
current_val = pj->tape[location];
363393
current_type = current_val >> 56;
364394
return true;
365395
}
366396

367-
template <size_t max_depth>
368-
bool ParsedJson::BasicIterator<max_depth>::up() {
397+
template <size_t max_depth> bool ParsedJson::BasicIterator<max_depth>::up() {
369398
if (depth == 1) {
370399
return false; // don't allow moving back to root
371400
}
@@ -378,8 +407,7 @@ bool ParsedJson::BasicIterator<max_depth>::up() {
378407
return true;
379408
}
380409

381-
template <size_t max_depth>
382-
bool ParsedJson::BasicIterator<max_depth>::down() {
410+
template <size_t max_depth> bool ParsedJson::BasicIterator<max_depth>::down() {
383411
if (location + 1 >= tape_length) {
384412
return false;
385413
}
@@ -407,14 +435,13 @@ void ParsedJson::BasicIterator<max_depth>::to_start_scope() {
407435
current_type = (current_val >> 56);
408436
}
409437

410-
template <size_t max_depth>
411-
bool ParsedJson::BasicIterator<max_depth>::next() {
438+
template <size_t max_depth> bool ParsedJson::BasicIterator<max_depth>::next() {
412439
size_t npos;
413440
if ((current_type == '[') || (current_type == '{')) {
414441
// we need to jump
415442
npos = (current_val & JSON_VALUE_MASK);
416443
} else {
417-
npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
444+
npos = location + (is_number() ? 2 : 1);
418445
}
419446
uint64_t next_val = pj->tape[npos];
420447
uint8_t next_type = (next_val >> 56);
@@ -456,14 +483,17 @@ ParsedJson::BasicIterator<max_depth>::BasicIterator(ParsedJson &pj_)
456483
}
457484

458485
template <size_t max_depth>
459-
ParsedJson::BasicIterator<max_depth>::BasicIterator(const BasicIterator &o) noexcept
460-
: pj(o.pj), depth(o.depth), location(o.location), tape_length(o.tape_length),
461-
current_type(o.current_type), current_val(o.current_val) {
486+
ParsedJson::BasicIterator<max_depth>::BasicIterator(
487+
const BasicIterator &o) noexcept
488+
: pj(o.pj), depth(o.depth), location(o.location),
489+
tape_length(o.tape_length), current_type(o.current_type),
490+
current_val(o.current_val) {
462491
memcpy(depth_index, o.depth_index, (depth + 1) * sizeof(depth_index[0]));
463492
}
464493

465494
template <size_t max_depth>
466-
ParsedJson::BasicIterator<max_depth> &ParsedJson::BasicIterator<max_depth>::operator =(const BasicIterator &o) noexcept {
495+
ParsedJson::BasicIterator<max_depth> &ParsedJson::BasicIterator<max_depth>::
496+
operator=(const BasicIterator &o) noexcept {
467497
pj = o.pj;
468498
depth = o.depth;
469499
location = o.location;
@@ -475,7 +505,8 @@ ParsedJson::BasicIterator<max_depth> &ParsedJson::BasicIterator<max_depth>::oper
475505
}
476506

477507
template <size_t max_depth>
478-
bool ParsedJson::BasicIterator<max_depth>::print(std::ostream &os, bool escape_strings) const {
508+
bool ParsedJson::BasicIterator<max_depth>::print(std::ostream &os,
509+
bool escape_strings) const {
479510
if (!is_ok()) {
480511
return false;
481512
}
@@ -495,6 +526,9 @@ bool ParsedJson::BasicIterator<max_depth>::print(std::ostream &os, bool escape_s
495526
case 'l': // we have a long int
496527
os << get_integer();
497528
break;
529+
case 'u':
530+
os << get_unsigned_integer();
531+
break;
498532
case 'd':
499533
os << get_double();
500534
break;
@@ -520,7 +554,8 @@ bool ParsedJson::BasicIterator<max_depth>::print(std::ostream &os, bool escape_s
520554
}
521555

522556
template <size_t max_depth>
523-
bool ParsedJson::BasicIterator<max_depth>::move_to(const char *pointer, uint32_t length) {
557+
bool ParsedJson::BasicIterator<max_depth>::move_to(const char *pointer,
558+
uint32_t length) {
524559
char *new_pointer = nullptr;
525560
if (pointer[0] == '#') {
526561
// Converting fragment representation to string representation
File renamed without changes.

src/parsedjson.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,14 @@ bool ParsedJson::print_json(std::ostream &os) {
176176
}
177177
os << static_cast<int64_t>(tape[++tape_idx]);
178178
break;
179+
case 'u':
180+
if (tape_idx + 1 >= how_many) {
181+
delete[] in_object;
182+
delete[] in_object_idx;
183+
return false;
184+
}
185+
os << tape[++tape_idx];
186+
break;
179187
case 'd': // we have a double
180188
if (tape_idx + 1 >= how_many) {
181189
delete[] in_object;
@@ -273,6 +281,12 @@ bool ParsedJson::dump_raw_tape(std::ostream &os) {
273281
}
274282
os << "integer " << static_cast<int64_t>(tape[++tape_idx]) << "\n";
275283
break;
284+
case 'u': // we have a long uint
285+
if (tape_idx + 1 >= how_many) {
286+
return false;
287+
}
288+
os << "unsigned integer " << tape[++tape_idx] << "\n";
289+
break;
276290
case 'd': // we have a double
277291
os << "float ";
278292
if (tape_idx + 1 >= how_many) {

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ endif()
77
add_cpp_test(basictests)
88
add_cpp_test(jsoncheck)
99
add_cpp_test(pointercheck)
10+
add_cpp_test(integer_tests)
1011

1112
## This causes problems
1213
# add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)

0 commit comments

Comments
 (0)