Skip to content

Commit 3e5497e

Browse files
authored
Fixes issue 1170 and makes the usage of minify easier. (simdjson#1171)
* Fixes issue 1170 and makes the usage of minify easier. * This should get the fallback implementation to detect unclosed strings.
1 parent 6ecbcc7 commit 3e5497e

File tree

7 files changed

+42
-14
lines changed

7 files changed

+42
-14
lines changed

doc/basics.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ for (dom::key_value_pair field : object) {
272272
Minifying JSON strings without parsing
273273
----------------------
274274

275-
In some cases, you may have valid JSON strings that you do not wish to parse but that you wish to minify. That is, you wish to remove all unnecessary spaces. We have a fast function for this purpose (`simdjson::minify(const char * input, size_t length, const char * output, size_t& new_length)`). This function does not validate your content, and it does not parse it. It is much faster than parsing the string and re-serializing it in minified form (`simdjson::minify(parser.parse())`). Usage is relatively simple. You must pass an input pointer with a length parameter, as well as an output pointer and an output length parameter (by reference). The output length parameter is not read, but written to. The output pointer should point to a valid memory region that is slightly overallocated (by `simdjson::SIMDJSON_PADDING`) compared to the original string length. The input pointer and input length are read, but not written to.
275+
In some cases, you may have valid JSON strings that you do not wish to parse but that you wish to minify. That is, you wish to remove all unnecessary spaces. We have a fast function for this purpose (`simdjson::minify(const char * input, size_t length, char * output, size_t& new_length)`). This function does not validate your content, and it does not parse it. It is much faster than parsing the string and re-serializing it in minified form (`simdjson::minify(parser.parse())`). Usage is relatively simple. You must pass an input pointer with a length parameter, as well as an output pointer and an output length parameter (by reference). The output length parameter is not read, but written to. The output pointer should point to a valid memory region that is at least as large as the original string. The input pointer and input length are read, but not written to.
276276

277277
```C++
278278
// Starts with a valid JSON document as a string.
@@ -281,7 +281,7 @@ In some cases, you may have valid JSON strings that you do not wish to parse but
281281
size_t length = strlen(some_string);
282282
// Create a buffer to receive the minified string. Make sure that there is enough room,
283283
// i.e., at least as many bytes as the input; no extra padding is needed.
284-
std::unique_ptr<char[]> buffer{new(std::nothrow) char[length + simdjson::SIMDJSON_PADDING]};
284+
std::unique_ptr<char[]> buffer{new(std::nothrow) char[length]};
285285
size_t new_length{}; // It will receive the minified length.
286286
auto error = simdjson::minify(some_string, length, buffer.get(), new_length);
287287
// The buffer variable now has "[1,2,3,4]" and new_length has value 9.

include/simdjson/minify.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@ namespace simdjson {
1515
*
1616
* Minify the input string assuming that it represents a JSON string, does not parse or validate.
1717
* This function is much faster than parsing a JSON string and then writing a minified version of it.
18-
* However, it does not validate the input.
18+
* However, it does not validate the input. It will merely return an error in simple cases (e.g., if
19+
* there is a string that was never terminated).
1920
*
2021
*
2122
* @param buf the json document to minify.
2223
* @param len the length of the json document.
23-
* @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
24+
* @param dst the buffer to write the minified document to. *MUST* be at least len bytes long.
2425
* @param dst_len the number of bytes written. Output only.
2526
* @return the error code, or SUCCESS if there was no error.
2627
*/

src/fallback/dom_parser_implementation.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ SIMDJSON_WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_
240240
}
241241
dst_len = pos; // we intentionally do not work with a reference
242242
// for fear of aliasing
243-
return SUCCESS;
243+
return quote ? UNCLOSED_STRING : SUCCESS;
244244
}
245245

246246
// credit: based on code from Google Fuchsia (Apache Licensed)

src/generic/stage1/json_minifier.h

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ simdjson_really_inline void json_minifier::next(const simd::simd8x64<uint8_t>& i
3131
}
3232

3333
simdjson_really_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
34-
*dst = '\0';
34+
//*dst = '\0';
3535
error_code error = scanner.finish(false);
3636
if (error) { dst_len = 0; return error; }
3737
dst_len = dst - dst_start;
@@ -69,10 +69,22 @@ error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, s
6969

7070
// Index the last (remainder) block, padded with spaces
7171
uint8_t block[STEP_SIZE];
72-
if (simdjson_likely(reader.get_remainder(block)) > 0) {
72+
size_t remaining_bytes = reader.get_remainder(block);
73+
if (remaining_bytes > 0) {
74+
// We do not want to write directly to the output stream. Rather, we write
75+
// to a local buffer (for safety).
76+
uint8_t out_block[STEP_SIZE];
77+
uint8_t * const guarded_dst{minifier.dst};
78+
minifier.dst = out_block;
7379
minifier.step<STEP_SIZE>(block, reader);
80+
size_t to_write = minifier.dst - out_block;
81+
// In some cases, we could be enticed to consider the padded spaces
82+
// as part of the string. This is fine as long as we do not write more
83+
// than we consumed.
84+
if(to_write > remaining_bytes) { to_write = remaining_bytes; }
85+
memcpy(guarded_dst, out_block, to_write);
86+
minifier.dst = guarded_dst + to_write;
7487
}
75-
7688
return minifier.finish(dst, dst_len);
7789
}
7890

tests/basictests.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1290,7 +1290,7 @@ namespace validate_tests {
12901290
namespace minify_tests {
12911291

12921292
bool check_minification(const char * input, size_t length, const char * expected, size_t expected_length) {
1293-
std::unique_ptr<char[]> buffer{new(std::nothrow) char[length + simdjson::SIMDJSON_PADDING]};
1293+
std::unique_ptr<char[]> buffer{new(std::nothrow) char[length]};
12941294
if(buffer.get() == nullptr) {
12951295
std::cerr << "cannot alloc " << std::endl;
12961296
return false;
@@ -1303,11 +1303,24 @@ namespace minify_tests {
13031303
}
13041304
return true;
13051305
}
1306+
bool test_single_quote() {
1307+
std::cout << "Running " << __func__ << std::endl;
1308+
const std::string test = "\"";
1309+
char output[1];
1310+
size_t newlength;
1311+
auto e = simdjson::minify(test.data(), 1, output, newlength);
1312+
if(e) {
1313+
std::cout << "got an error (expected) : " << e << std::endl;
1314+
return true; // we have an error as expected
1315+
}
1316+
std::cerr << "This should be an error : " << e << std::endl;
1317+
return false;
1318+
}
13061319

13071320
bool test_minify() {
13081321
std::cout << "Running " << __func__ << std::endl;
13091322
const std::string test = R"({ "foo" : 1, "bar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })";
1310-
const std::string minified(R"({"foo":1,"bar":[1,2,3],"baz":{"a":1,"b":2,"c":3}})");
1323+
const std::string minified(R"({"foo":1,"bar":[1,2,3],"baz":{"a":1,"b":2,"c":3}})");
13111324
return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size());
13121325
}
13131326
bool test_minify_array() {
@@ -1323,7 +1336,8 @@ namespace minify_tests {
13231336
return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size());
13241337
}
13251338
bool run() {
1326-
return test_minify() &&
1339+
return test_single_quote() &&
1340+
test_minify() &&
13271341
test_minify_array() &&
13281342
test_minify_object();
13291343
}

tests/readme_examples.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ SIMDJSON_POP_DISABLE_WARNINGS
262262
void minify() {
263263
const char * some_string = "[ 1, 2, 3, 4] ";
264264
size_t length = strlen(some_string);
265-
std::unique_ptr<char[]> buffer{new(std::nothrow) char[length + simdjson::SIMDJSON_PADDING]};
265+
std::unique_ptr<char[]> buffer{new(std::nothrow) char[length]};
266266
size_t new_length{};
267267
auto error = simdjson::minify(some_string, length, buffer.get(), new_length);
268268
if(error != simdjson::SUCCESS) {

tools/minify.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,12 @@ int main(int argc, char *argv[]) {
6565
std::cerr << "Could not load the file " << filename << std::endl;
6666
return EXIT_FAILURE;
6767
}
68-
simdjson::padded_string copy(p.length());
68+
simdjson::padded_string copy(p.length()); // does not need to be padded after all!
6969
size_t copy_len;
7070
error = simdjson::active_implementation->minify((const uint8_t*)p.data(), p.length(), (uint8_t*)copy.data(), copy_len);
71-
if (error) { std::cerr << error << std::endl; return 1; }
71+
if (error) { std::cerr << error << std::endl; return EXIT_FAILURE; }
7272
printf("%s", copy.data());
73+
return EXIT_SUCCESS;
7374
#ifdef __cpp_exceptions
7475
} catch (const cxxopts::OptionException& e) {
7576
std::cout << "error parsing options: " << e.what() << std::endl;

0 commit comments

Comments
 (0)