Skip to content

Commit 5609851

Browse files
authored
This will guard the batch_size so it cannot be so low as to accidentally burn through your CPU. (simdjson#1319)
* This will guard the batch_size so it cannot be so low as to accidentally burn through your CPU.
1 parent 68a8004 commit 5609851

File tree

4 files changed

+29
-6
lines changed

4 files changed

+29
-6
lines changed

include/simdjson/dom/document_stream-inl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ simdjson_really_inline document_stream::document_stream(
7373
: parser{&_parser},
7474
buf{_buf},
7575
len{_len},
76-
batch_size{_batch_size},
76+
batch_size{_batch_size <= MINIMAL_BATCH_SIZE ? MINIMAL_BATCH_SIZE : _batch_size},
7777
error{SUCCESS}
7878
#ifdef SIMDJSON_THREADS_ENABLED
7979
, use_thread(_parser.threaded) // we need to make a copy because _parser.threaded can change

include/simdjson/dom/parser-inl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ inline simdjson_result<document_stream> parser::load_many(const std::string &pat
8383
size_t len;
8484
auto _error = read_file(path).get(len);
8585
if (_error) { return _error; }
86+
if(batch_size < MINIMAL_BATCH_SIZE) { batch_size = MINIMAL_BATCH_SIZE; }
8687
return document_stream(*this, (const uint8_t*)loaded_bytes.get(), len, batch_size);
8788
}
8889

@@ -112,6 +113,7 @@ simdjson_really_inline simdjson_result<element> parser::parse(const padded_strin
112113
}
113114

114115
inline simdjson_result<document_stream> parser::parse_many(const uint8_t *buf, size_t len, size_t batch_size) noexcept {
116+
if(batch_size < MINIMAL_BATCH_SIZE) { batch_size = MINIMAL_BATCH_SIZE; }
115117
return document_stream(*this, buf, len, batch_size);
116118
}
117119
inline simdjson_result<document_stream> parser::parse_many(const char *buf, size_t len, size_t batch_size) noexcept {

include/simdjson/dom/parser.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ class element;
2121

2222
/** The default batch size for parser.parse_many() and parser.load_many() */
2323
static constexpr size_t DEFAULT_BATCH_SIZE = 1000000;
24+
/**
25+
* Some adversary might try to set the batch size to 0 or 1, which might cause problems.
26+
* We set a minimum of 32B since anything smaller is highly likely to be an error. In practice,
27+
* most users will want a much larger batch size.
28+
*/
29+
static constexpr size_t MINIMAL_BATCH_SIZE = 32;
2430

2531
/**
2632
* A persistent document parser.
@@ -249,7 +255,10 @@ class parser {
249255
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
250256
* spot is cache-related: small enough to fit in cache, yet big enough to
251257
* parse as many documents as possible in one tight loop.
252-
* Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet spot in our tests.
258+
* Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet
259+
* spot in our tests.
260+
* If you set the batch_size to a value smaller than simdjson::dom::MINIMAL_BATCH_SIZE
261+
* (currently 32B), it will be replaced by simdjson::dom::MINIMAL_BATCH_SIZE.
253262
* @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:
254263
* - IO_ERROR if there was an error opening or reading the file.
255264
* - MEMALLOC if the parser does not have enough capacity and memory allocation fails.

tests/document_stream_tests.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,16 @@ namespace document_stream_tests {
135135

136136
bool small_window() {
137137
std::cout << "Running " << __func__ << std::endl;
138-
auto json = R"({"error":[],"result":{"token":"xxx"}}{"error":[],"result":{"token":"xxx"}})"_padded;
138+
char input[2049];
139+
input[0] = '[';
140+
for(size_t i = 1; i < 1024; i++) {
141+
input[2*i+1]= '1';
142+
input[2*i+2]= i < 1023 ? ',' : ']';
143+
}
144+
auto json = simdjson::padded_string(input,2049);
139145
simdjson::dom::parser parser;
140146
size_t count = 0;
141-
size_t window_size = 10; // deliberately too small
147+
size_t window_size = 1024; // deliberately too small
142148
simdjson::dom::document_stream stream;
143149
ASSERT_SUCCESS( parser.parse_many(json, window_size).get(stream) );
144150
for (auto doc : stream) {
@@ -158,11 +164,17 @@ namespace document_stream_tests {
158164
#ifdef SIMDJSON_THREADS_ENABLED
159165
bool threaded_disabled() {
160166
std::cout << "Running " << __func__ << std::endl;
161-
auto json = R"({"error":[],"result":{"token":"xxx"}}{"error":[],"result":{"token":"xxx"}})"_padded;
167+
char input[2049];
168+
input[0] = '[';
169+
for(size_t i = 1; i < 1024; i++) {
170+
input[2*i+1]= '1';
171+
input[2*i+2]= i < 1023 ? ',' : ']';
172+
}
173+
auto json = simdjson::padded_string(input,2049);
162174
simdjson::dom::parser parser;
163175
parser.threaded = false;
164176
size_t count = 0;
165-
size_t window_size = 10; // deliberately too small
177+
size_t window_size = 1024; // deliberately too small
166178
simdjson::dom::document_stream stream;
167179
ASSERT_SUCCESS( parser.parse_many(json, window_size).get(stream) );
168180
for (auto doc : stream) {

0 commit comments

Comments
 (0)