Skip to content

Commit d9c4191

Browse files
authored
This should fix an issue we have with unclosed strings with document_stream (simdjson#1321)
* This should fix an issue we have with unclosed strings. * Patching the fallback kernel. * Better guarding the code. * Patching the fallback.
1 parent 3fa40b8 commit d9c4191

File tree

7 files changed

+55
-26
lines changed

7 files changed

+55
-26
lines changed

.github/workflows/alpine.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,4 @@ jobs:
3434
./alpine.sh cmake --build build_for_alpine
3535
- name: test
3636
run: |
37-
./alpine.sh bash -c "cd build_for_alpine && ctest"
37+
./alpine.sh bash -c "cd build_for_alpine && ctest -E checkperf"

src/fallback/dom_parser_implementation.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ simdjson_really_inline void validate_utf8_character() {
9191
idx += 4;
9292
}
9393

94-
simdjson_really_inline void validate_string() {
94+
// Returns true if the string is unclosed.
95+
simdjson_really_inline bool validate_string() {
9596
idx++; // skip first quote
9697
while (idx < len && buf[idx] != '"') {
9798
if (buf[idx] == '\\') {
@@ -103,7 +104,8 @@ simdjson_really_inline void validate_string() {
103104
idx++;
104105
}
105106
}
106-
if (idx >= len && !partial) { error = UNCLOSED_STRING; }
107+
if (idx >= len) { return true; }
108+
return false;
107109
}
108110

109111
simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
@@ -120,12 +122,13 @@ simdjson_really_inline bool is_whitespace_or_operator(uint8_t c) {
120122
// Parse the entire input in STEP_SIZE-byte chunks.
121123
//
122124
simdjson_really_inline error_code scan() {
125+
bool unclosed_string = false;
123126
for (;idx<len;idx++) {
124127
switch (buf[idx]) {
125128
// String
126129
case '"':
127130
add_structural();
128-
validate_string();
131+
unclosed_string |= validate_string();
129132
break;
130133
// Operator
131134
case '{': case '}': case '[': case ']': case ',': case ':':
@@ -150,20 +153,19 @@ simdjson_really_inline error_code scan() {
150153
next_structural_index[1] = len;
151154
next_structural_index[2] = 0;
152155
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
156+
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
153157
parser.next_structural_index = 0;
154-
155-
if (simdjson_unlikely(parser.n_structural_indexes == 0)) {
156-
return EMPTY;
157-
}
158-
159158
if (partial) {
159+
if(unclosed_string) {
160+
parser.n_structural_indexes--;
161+
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
162+
}
160163
auto new_structural_indexes = find_next_document_index(parser);
161164
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
162165
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
163166
}
164167
parser.n_structural_indexes = new_structural_indexes;
165-
}
166-
168+
} else if(unclosed_string) { error = UNCLOSED_STRING; }
167169
return error;
168170
}
169171

src/generic/stage1/json_minifier.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& i
3232
}
3333

3434
simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
35-
error_code error = scanner.finish(false);
35+
error_code error = scanner.finish();
3636
if (error) { dst_len = 0; return error; }
3737
dst_len = dst - dst_start;
3838
return SUCCESS;

src/generic/stage1/json_scanner.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ class json_scanner {
9292
public:
9393
json_scanner() {}
9494
simdjson_really_inline json_block next(const simd::simd8x64<uint8_t>& in);
95-
simdjson_really_inline error_code finish(bool streaming);
95+
// Returns either UNCLOSED_STRING or SUCCESS
96+
simdjson_really_inline error_code finish();
9697

9798
private:
9899
// Whether the last character of the previous iteration is part of a scalar token
@@ -138,8 +139,8 @@ simdjson_really_inline json_block json_scanner::next(const simd::simd8x64<uint8_
138139
};
139140
}
140141

141-
simdjson_really_inline error_code json_scanner::finish(bool streaming) {
142-
return string_scanner.finish(streaming);
142+
simdjson_really_inline error_code json_scanner::finish() {
143+
return string_scanner.finish();
143144
}
144145

145146
} // namespace stage1

src/generic/stage1/json_string_scanner.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ struct json_string_block {
3737
class json_string_scanner {
3838
public:
3939
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
40-
simdjson_really_inline error_code finish(bool streaming);
40+
// Returns either UNCLOSED_STRING or SUCCESS
41+
simdjson_really_inline error_code finish();
4142

4243
private:
4344
// Intended to be defined by the implementation
@@ -132,8 +133,8 @@ simdjson_really_inline json_string_block json_string_scanner::next(const simd::s
132133
};
133134
}
134135

135-
simdjson_really_inline error_code json_string_scanner::finish(bool streaming) {
136-
if (prev_in_string and (not streaming)) {
136+
simdjson_really_inline error_code json_string_scanner::finish() {
137+
if (prev_in_string) {
137138
return UNCLOSED_STRING;
138139
}
139140
return SUCCESS;

src/generic/stage1/json_structural_indexer.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,8 +182,14 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
182182
// Write out the final iteration's structurals
183183
indexer.write(uint32_t(idx-64), prev_structurals);
184184

185-
error_code error = scanner.finish(partial);
186-
if (simdjson_unlikely(error != SUCCESS)) { return error; }
185+
error_code error = scanner.finish();
186+
// We deliberately break down the next expression so that it is
187+
// human readable.
188+
const bool should_we_exit = partial ?
189+
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
190+
: (error != SUCCESS); // if partial is false, we must have SUCCESS
191+
const bool have_unclosed_string = (error == UNCLOSED_STRING);
192+
if (simdjson_unlikely(should_we_exit)) { return error; }
187193

188194
if (unescaped_chars_error) {
189195
return UNESCAPED_CHARS;
@@ -216,6 +222,13 @@ simdjson_really_inline error_code json_structural_indexer::finish(dom_parser_imp
216222
return UNEXPECTED_ERROR;
217223
}
218224
if (partial) {
225+
// If we have an unclosed string, then the last structural
226+
// will be the quote and we want to make sure to omit it.
227+
if(have_unclosed_string) {
228+
parser.n_structural_indexes--;
229+
// a valid JSON file cannot have zero structural indexes - we should have found something
230+
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
231+
}
219232
auto new_structural_indexes = find_next_document_index(parser);
220233
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
221234
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.

tests/document_stream_tests.cpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -143,21 +143,33 @@ namespace document_stream_tests {
143143

144144
bool issue1310() {
145145
std::cout << "Running " << __func__ << std::endl;
146-
const simdjson::padded_string input = decode_base64("AwA5ICIg");
146+
// hex : 20 20 5B 20 33 2C 31 5D 20 22 22 22 22 22 22 22 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20
147+
// ascii: __ __[__ __3__,__1__]__ __"__"__"__"__"__"__"__ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __
148+
// We have four full documents followed by an unclosed string.
149+
const simdjson::padded_string input = decode_base64("ICBbIDMsMV0gIiIiIiIiIiAgICAgICAgICAgICAgICAg");
147150
print_hex(input);
148151
for(size_t window = 0; window <= 100; window++) {
149152
simdjson::dom::parser parser;
150153
simdjson::dom::document_stream stream;
151154
ASSERT_SUCCESS(parser.parse_many(input, window).get(stream));
155+
size_t count{0};
152156
for(auto doc: stream) {
153157
auto error = doc.error();
154-
if(!error) {
155-
std::cout << "Expected an error but got " << error << std::endl;
156-
std::cout << "Window = " << window << std::endl;
157-
return false;
158+
count++;
159+
if(count <= 4) {
160+
if(window < input.size() && error) {
161+
std::cout << "Expected no error but got " << error << std::endl;
162+
std::cout << "Window = " << window << std::endl;
163+
return false;
164+
}
165+
} else {
166+
if(!error) {
167+
std::cout << "Expected an error but got " << error << std::endl;
168+
std::cout << "Window = " << window << std::endl;
169+
return false;
170+
}
158171
}
159172
}
160-
161173
}
162174
return true;
163175
}

0 commit comments

Comments
 (0)