Skip to content

Commit dc69bc2

Browse files
authored
Trying to verify recent document stream issues. (simdjson#1318)
* Trying to verify recent document stream issues.
* Adding another one.
* More thorough tests.
* Removing trailing spaces.
* Working toward exposing some issues.
* Tweaking.
1 parent 53577f1 commit dc69bc2

File tree

3 files changed

+162
-3
lines changed

3 files changed

+162
-3
lines changed

include/simdjson/dom/document_stream-inl.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,6 @@ simdjson_really_inline bool document_stream::iterator::operator!=(const document
161161

162162
inline void document_stream::start() noexcept {
163163
if (error) { return; }
164-
165164
error = parser->ensure_capacity(batch_size);
166165
if (error) { return; }
167166

include/simdjson/dom/parser.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ static constexpr size_t DEFAULT_BATCH_SIZE = 1000000;
2525
* Some adversary might try to set the batch size to 0 or 1, which might cause problems.
2626
* We set a minimum of 32B since anything smaller is highly likely to be an error. In practice,
2727
* most users will want a much larger batch size.
28+
*
29+
* All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON
30+
* document can ever span 0 or 1 byte and that very large values would create memory allocation issues.
2831
*/
2932
static constexpr size_t MINIMAL_BATCH_SIZE = 32;
3033

tests/document_stream_tests.cpp

Lines changed: 159 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,56 @@
55
#include "simdjson.h"
66
#include "test_macros.h"
77

8+
// Writes the bytes of `s` to stdout as two rows: first every byte in two-digit
// uppercase hexadecimal, then the same bytes as characters.
// Each byte occupies three columns in both rows so that the rows line up.
// In the second row, bytes with a value <= 32 or >= 127 are shown as a blank.
void print_hex(const simdjson::padded_string& s) {
  const size_t length = s.size();
  const auto bytes = s.data();

  printf("hex  : ");
  for (size_t pos = 0; pos < length; ++pos) {
    const uint8_t byte = static_cast<uint8_t>(bytes[pos]);
    printf("%02X ", byte);
  }
  printf("\n");

  printf("ascii: ");
  for (size_t pos = 0; pos < length; ++pos) {
    const uint8_t byte = static_cast<uint8_t>(bytes[pos]);
    const bool visible = (byte > 32) && (byte < 127);
    if (visible) {
      printf("%c__", byte);
    } else {
      printf(" __");
    }
  }
  printf("\n");
}
23+
24+
// Maps one base64 character to its 6-bit value:
//   'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62, '/' -> 63.
// The '=' character yields 0. Any other character yields -1.
int char_to_byte(char character) {
  if (character >= 'A' && character <= 'Z') { return character - 'A'; }
  if (character >= 'a' && character <= 'z') { return (character - 'a') + 26; }
  if (character >= '0' && character <= '9') { return (character - '0') + 52; }
  switch (character) {
    case '+': return 62;
    case '/': return 63;
    case '=': return 0;
    default:  return -1;
  }
}
40+
41+
std::string decode_base64(const std::string &src) {
42+
std::vector<uint8_t> answer;
43+
for (size_t i = 0; i < src.size(); i += 4) {
44+
int three_bytes = char_to_byte(src[i]) << 18 |
45+
char_to_byte(src[i + 1]) << 12 |
46+
char_to_byte(src[i + 2]) << 6 | char_to_byte(src[i + 3]);
47+
if (three_bytes < 0) {
48+
std::cerr << "invalid base64" << std::endl;
49+
abort();
50+
}
51+
answer.push_back(uint8_t((three_bytes & 0x00FF0000) >> 16));
52+
answer.push_back(uint8_t((three_bytes & 0x0000FF00) >> 8));
53+
answer.push_back(uint8_t(three_bytes & 0x000000FF));
54+
}
55+
return std::string(answer.begin(), answer.end());
56+
}
57+
858

959
std::string trim(const std::string s) {
1060
auto start = s.begin();
@@ -30,6 +80,108 @@ namespace document_stream_tests {
3080
simdjson::padded_string str("{}",2);
3181
simdjson::dom::document_stream s1 = parse_many_stream_return(parser, str);
3282
}
83+
84+
bool issue1307() {
85+
std::cout << "Running " << __func__ << std::endl;
86+
const simdjson::padded_string input = decode_base64("AgAMACA=");
87+
print_hex(input);
88+
for(size_t window = 0; window <= 100; window++) {
89+
simdjson::dom::parser parser;
90+
simdjson::dom::document_stream stream;
91+
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
92+
for(auto doc: stream) {
93+
auto error = doc.error();
94+
if(!error) {
95+
std::cout << "Expected an error but got " << error << std::endl;
96+
std::cout << "Window = " << window << std::endl;
97+
return false;
98+
}
99+
}
100+
}
101+
return true;
102+
}
103+
104+
bool issue1308() {
105+
std::cout << "Running " << __func__ << std::endl;
106+
const simdjson::padded_string input = decode_base64("bcdtW0E=");
107+
print_hex(input);
108+
for(size_t window = 0; window <= 100; window++) {
109+
simdjson::dom::parser parser;
110+
simdjson::dom::document_stream stream;
111+
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
112+
for(auto doc: stream) {
113+
auto error = doc.error();
114+
if(!error) {
115+
std::cout << "Expected an error but got " << error << std::endl;
116+
std::cout << "Window = " << window << std::endl;
117+
return false;
118+
}
119+
}
120+
}
121+
return true;
122+
}
123+
124+
bool issue1309() {
125+
std::cout << "Running " << __func__ << std::endl;
126+
const simdjson::padded_string input = decode_base64("CQA5OAo5CgoKCiIiXyIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiJiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiXyIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiJiIiIiIiIiIiIiIiIiIiIiLb29vb29vb29vb29vb29vz8/Pz8/Pz8/Pz8/Pz8/Pz8/Pz8/Pz8/Pz8/Pz29vb29vb29vbIiIiIiIiIiIiIiIiIiIiIiIiIiIiJiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiYiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiI=");
127+
print_hex(input);
128+
for(size_t window = 0; window <= 100; window++) {
129+
simdjson::dom::parser parser;
130+
simdjson::dom::document_stream stream;
131+
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
132+
for(auto doc: stream) {
133+
auto error = doc.error();
134+
if(!error) {
135+
std::cout << "Expected an error but got " << error << std::endl;
136+
std::cout << "Window = " << window << std::endl;
137+
return false;
138+
}
139+
}
140+
}
141+
return true;
142+
}
143+
144+
bool issue1310() {
145+
std::cout << "Running " << __func__ << std::endl;
146+
const simdjson::padded_string input = decode_base64("AwA5ICIg");
147+
print_hex(input);
148+
for(size_t window = 0; window <= 100; window++) {
149+
simdjson::dom::parser parser;
150+
simdjson::dom::document_stream stream;
151+
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
152+
for(auto doc: stream) {
153+
auto error = doc.error();
154+
if(!error) {
155+
std::cout << "Expected an error but got " << error << std::endl;
156+
std::cout << "Window = " << window << std::endl;
157+
return false;
158+
}
159+
}
160+
161+
}
162+
return true;
163+
}
164+
165+
bool issue1311() {
166+
std::cout << "Running " << __func__ << std::endl;
167+
const simdjson::padded_string input = decode_base64("NSMwW1swDPw=");
168+
print_hex(input);
169+
for(size_t window = 0; window <= 100; window++) {
170+
simdjson::dom::parser parser;
171+
simdjson::dom::document_stream stream;
172+
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
173+
for(auto doc: stream) {
174+
auto error = doc.error();
175+
if(!error) {
176+
std::cout << "Expected an error but got " << error << std::endl;
177+
std::cout << "Window = " << window << std::endl;
178+
return false;
179+
}
180+
}
181+
}
182+
return true;
183+
}
184+
33185
bool test_current_index() {
34186
std::cout << "Running " << __func__ << std::endl;
35187
std::string base1("1 ");// one JSON!
@@ -338,12 +490,17 @@ namespace document_stream_tests {
338490
}
339491

340492
bool run() {
341-
return test_naked_iterators() &&
493+
return issue1307() &&
494+
issue1308() &&
495+
issue1309() &&
496+
issue1310() &&
497+
issue1311() &&
498+
test_naked_iterators() &&
342499
test_current_index() &&
343500
single_document() &&
344501
#if SIMDJSON_EXCEPTIONS
345-
single_document_exceptions() &&
346502
issue1133() &&
503+
single_document_exceptions() &&
347504
#endif
348505
#ifdef SIMDJSON_THREADS_ENABLED
349506
threaded_disabled() &&

0 commit comments

Comments
 (0)