Skip to content

Commit cfef4ff

Browse files
committed
Create parser.parse_many() API
1 parent b2220d6 commit cfef4ff

12 files changed

+859
-317
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/we
6565
SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/error.cpp src/jsonioutil.cpp src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h
6666
SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE)
6767

68-
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
68+
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
6969

7070
ifeq ($(SIMDJSON_TEST_AMALGAMATED_HEADERS),1)
7171
HEADERS=singleheader/simdjson.h

include/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@ set(SIMDJSON_INCLUDE
44
${SIMDJSON_INCLUDE_DIR}/simdjson/common_defs.h
55
${SIMDJSON_INCLUDE_DIR}/simdjson/compiler_check.h
66
${SIMDJSON_INCLUDE_DIR}/simdjson/document_iterator.h
7+
${SIMDJSON_INCLUDE_DIR}/simdjson/document_stream.h
78
${SIMDJSON_INCLUDE_DIR}/simdjson/document.h
89
${SIMDJSON_INCLUDE_DIR}/simdjson/error.h
910
${SIMDJSON_INCLUDE_DIR}/simdjson/implementation.h
10-
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document.h
11+
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document_stream.h
1112
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document_iterator.h
13+
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/document.h
1214
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/jsonstream.h
1315
${SIMDJSON_INCLUDE_DIR}/simdjson/internal/jsonformatutils.h
1416
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h

include/simdjson.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,20 @@
99
#include "simdjson/padded_string.h"
1010
#include "simdjson/implementation.h"
1111
#include "simdjson/document.h"
12-
#include "simdjson/jsonstream.h"
12+
#include "simdjson/document_stream.h"
1313
#include "simdjson/jsonminifier.h"
1414

1515
// Deprecated API
1616
#include "simdjson/parsedjsoniterator.h"
1717
#include "simdjson/jsonparser.h"
1818
#include "simdjson/parsedjson.h"
19+
#include "simdjson/jsonstream.h"
1920
#include "simdjson/document_iterator.h"
2021

2122
// Inline functions
2223
#include "simdjson/inline/document.h"
2324
#include "simdjson/inline/document_iterator.h"
25+
#include "simdjson/inline/document_stream.h"
2426
#include "simdjson/inline/jsonstream.h"
2527

2628
#endif // SIMDJSON_H

include/simdjson/document.h

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,13 @@ class document {
5252
class object;
5353
class key_value_pair;
5454
class parser;
55+
class stream;
5556

5657
template<typename T=element>
5758
class element_result;
5859
class doc_result;
5960
class doc_ref_result;
61+
class stream_result;
6062

6163
// Nested classes. See definitions later in file.
6264
using iterator = document_iterator<DEFAULT_MAX_DEPTH>;
@@ -315,6 +317,7 @@ class document::doc_ref_result {
315317
private:
316318
doc_ref_result(document &_doc, error_code _error) noexcept;
317319
friend class document::parser;
320+
friend class document::stream;
318321
}; // class document::doc_ref_result
319322

320323
/**
@@ -927,6 +930,255 @@ class document::parser {
927930
// We do not want to allow implicit conversion from C string to std::string.
928931
really_inline doc_ref_result parse(const char *buf) noexcept = delete;
929932

933+
/**
934+
* Parse a buffer containing many JSON documents.
935+
*
936+
* document::parser parser;
937+
* for (const document &doc : parser.parse_many(buf, len)) {
938+
* cout << std::string(doc["title"]) << endl;
939+
* }
940+
*
941+
* ### Format
942+
*
943+
* The buffer must contain a series of one or more JSON documents, concatenated into a single
944+
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
945+
* then starts parsing the next document at that point. (It does this with more parallelism and
946+
* lookahead than you might think, though.)
947+
*
948+
* Documents that consist of an object or array may omit the whitespace between them, concatenating
949+
* with no separator. Documents that consist of a single primitive (i.e. documents that are not
950+
* arrays or objects) MUST be separated with whitespace.
951+
*
952+
* ### Error Handling
953+
*
954+
* All errors are returned during iteration: if there is a global error such as memory allocation,
955+
* it will be yielded as the first result. Iteration always stops after the first error.
956+
*
957+
* As with all other simdjson methods, non-exception error handling is readily available through
958+
* the same interface, requiring you to check the error before using the document:
959+
*
960+
* document::parser parser;
961+
* for (auto [doc, error] : parser.parse_many(buf, len)) {
962+
* if (error) { cerr << error_message(error) << endl; exit(1); }
963+
* cout << std::string(doc["title"]) << endl;
964+
* }
965+
*
966+
* ### REQUIRED: Buffer Padding
967+
*
968+
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
969+
* those bytes are initialized to, as long as they are allocated.
970+
*
971+
* ### Threads
972+
*
973+
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
974+
* hood to do some lookahead.
975+
*
976+
* ### Parser Capacity
977+
*
978+
* If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
979+
* allocated, it must have a capacity at least as large as batch_size.
980+
*
981+
* @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
982+
* @param len The length of the concatenated JSON.
983+
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
984+
* spot is cache-related: small enough to fit in cache, yet big enough to
985+
* parse as many documents as possible in one tight loop.
986+
* Defaults to 1MB (1000000 bytes), which has been a reasonable sweet spot in our tests.
987+
* @return The stream. If there is an error, it will be returned during iteration. An empty input
988+
* will yield 0 documents rather than an EMPTY error. Errors:
989+
* - MEMALLOC if the parser is unallocated and memory allocation fails
990+
* - CAPACITY if the parser already has a capacity, and it is less than batch_size
991+
* - other json errors if parsing fails.
992+
*/
993+
inline stream parse_many(const uint8_t *buf, size_t len, size_t batch_size = 1000000) noexcept;
994+
995+
/**
996+
* Parse a buffer containing many JSON documents.
997+
*
998+
* document::parser parser;
999+
* for (const document &doc : parser.parse_many(buf, len)) {
1000+
* cout << std::string(doc["title"]) << endl;
1001+
* }
1002+
*
1003+
* ### Format
1004+
*
1005+
* The buffer must contain a series of one or more JSON documents, concatenated into a single
1006+
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
1007+
* then starts parsing the next document at that point. (It does this with more parallelism and
1008+
* lookahead than you might think, though.)
1009+
*
1010+
* Documents that consist of an object or array may omit the whitespace between them, concatenating
1011+
* with no separator. Documents that consist of a single primitive (i.e. documents that are not
1012+
* arrays or objects) MUST be separated with whitespace.
1013+
*
1014+
* ### Error Handling
1015+
*
1016+
* All errors are returned during iteration: if there is a global error such as memory allocation,
1017+
* it will be yielded as the first result. Iteration always stops after the first error.
1018+
*
1019+
* As with all other simdjson methods, non-exception error handling is readily available through
1020+
* the same interface, requiring you to check the error before using the document:
1021+
*
1022+
* document::parser parser;
1023+
* for (auto [doc, error] : parser.parse_many(buf, len)) {
1024+
* if (error) { cerr << error_message(error) << endl; exit(1); }
1025+
* cout << std::string(doc["title"]) << endl;
1026+
* }
1027+
*
1028+
* ### REQUIRED: Buffer Padding
1029+
*
1030+
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
1031+
* those bytes are initialized to, as long as they are allocated.
1032+
*
1033+
* ### Threads
1034+
*
1035+
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
1036+
* hood to do some lookahead.
1037+
*
1038+
* ### Parser Capacity
1039+
*
1040+
* If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
1041+
* allocated, it must have a capacity at least as large as batch_size.
1042+
*
1043+
* @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
1044+
* @param len The length of the concatenated JSON.
1045+
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
1046+
* spot is cache-related: small enough to fit in cache, yet big enough to
1047+
* parse as many documents as possible in one tight loop.
1048+
* Defaults to 1MB (1000000 bytes), which has been a reasonable sweet spot in our tests.
1049+
* @return The stream. If there is an error, it will be returned during iteration. An empty input
1050+
* will yield 0 documents rather than an EMPTY error. Errors:
1051+
* - MEMALLOC if the parser is unallocated and memory allocation fails
1052+
* - CAPACITY if the parser already has a capacity, and it is less than batch_size
1053+
* - other json errors if parsing fails
1054+
*/
1055+
inline stream parse_many(const char *buf, size_t len, size_t batch_size = 1000000) noexcept;
1056+
1057+
/**
1058+
* Parse a buffer containing many JSON documents.
1059+
*
1060+
* document::parser parser;
1061+
* for (const document &doc : parser.parse_many(buf, len)) {
1062+
* cout << std::string(doc["title"]) << endl;
1063+
* }
1064+
*
1065+
* ### Format
1066+
*
1067+
* The buffer must contain a series of one or more JSON documents, concatenated into a single
1068+
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
1069+
* then starts parsing the next document at that point. (It does this with more parallelism and
1070+
* lookahead than you might think, though.)
1071+
*
1072+
* Documents that consist of an object or array may omit the whitespace between them, concatenating
1073+
* with no separator. Documents that consist of a single primitive (i.e. documents that are not
1074+
* arrays or objects) MUST be separated with whitespace.
1075+
*
1076+
* ### Error Handling
1077+
*
1078+
* All errors are returned during iteration: if there is a global error such as memory allocation,
1079+
* it will be yielded as the first result. Iteration always stops after the first error.
1080+
*
1081+
* As with all other simdjson methods, non-exception error handling is readily available through
1082+
* the same interface, requiring you to check the error before using the document:
1083+
*
1084+
* document::parser parser;
1085+
* for (auto [doc, error] : parser.parse_many(buf, len)) {
1086+
* if (error) { cerr << error_message(error) << endl; exit(1); }
1087+
* cout << std::string(doc["title"]) << endl;
1088+
* }
1089+
*
1090+
* ### REQUIRED: Buffer Padding
1091+
*
1092+
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
1093+
* those bytes are initialized to, as long as they are allocated.
1094+
*
1095+
* ### Threads
1096+
*
1097+
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
1098+
* hood to do some lookahead.
1099+
*
1100+
* ### Parser Capacity
1101+
*
1102+
* If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
1103+
* allocated, it must have a capacity at least as large as batch_size.
1104+
*
1105+
* @param s The concatenated JSON to parse. Must have at least s.length() + SIMDJSON_PADDING allocated bytes.
1106+
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
1107+
* spot is cache-related: small enough to fit in cache, yet big enough to
1108+
* parse as many documents as possible in one tight loop.
1109+
* Defaults to 1MB (1000000 bytes), which has been a reasonable sweet spot in our tests.
1110+
* @return The stream. If there is an error, it will be returned during iteration. An empty input
1111+
* will yield 0 documents rather than an EMPTY error. Errors:
1112+
* - MEMALLOC if the parser is unallocated and memory allocation fails
1113+
* - CAPACITY if the parser already has a capacity, and it is less than batch_size
1114+
* - other json errors if parsing fails
1115+
*/
1116+
inline stream parse_many(const std::string &s, size_t batch_size = 1000000) noexcept;
1117+
1118+
/**
1119+
* Parse a buffer containing many JSON documents.
1120+
*
1121+
* document::parser parser;
1122+
* for (const document &doc : parser.parse_many(buf, len)) {
1123+
* cout << std::string(doc["title"]) << endl;
1124+
* }
1125+
*
1126+
* ### Format
1127+
*
1128+
* The buffer must contain a series of one or more JSON documents, concatenated into a single
1129+
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
1130+
* then starts parsing the next document at that point. (It does this with more parallelism and
1131+
* lookahead than you might think, though.)
1132+
*
1133+
* Documents that consist of an object or array may omit the whitespace between them, concatenating
1134+
* with no separator. Documents that consist of a single primitive (i.e. documents that are not
1135+
* arrays or objects) MUST be separated with whitespace.
1136+
*
1137+
* ### Error Handling
1138+
*
1139+
* All errors are returned during iteration: if there is a global error such as memory allocation,
1140+
* it will be yielded as the first result. Iteration always stops after the first error.
1141+
*
1142+
* As with all other simdjson methods, non-exception error handling is readily available through
1143+
* the same interface, requiring you to check the error before using the document:
1144+
*
1145+
* document::parser parser;
1146+
* for (auto [doc, error] : parser.parse_many(buf, len)) {
1147+
* if (error) { cerr << error_message(error) << endl; exit(1); }
1148+
* cout << std::string(doc["title"]) << endl;
1149+
* }
1150+
*
1151+
* ### REQUIRED: Buffer Padding
1152+
*
1153+
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
1154+
* those bytes are initialized to, as long as they are allocated.
1155+
*
1156+
* ### Threads
1157+
*
1158+
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
1159+
* hood to do some lookahead.
1160+
*
1161+
* ### Parser Capacity
1162+
*
1163+
* If the parser is unallocated, it will be auto-allocated to batch_size. If it is already
1164+
* allocated, it must have a capacity at least as large as batch_size.
1165+
*
1166+
* @param s The concatenated JSON to parse.
1167+
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
1168+
* spot is cache-related: small enough to fit in cache, yet big enough to
1169+
* parse as many documents as possible in one tight loop.
1170+
* Defaults to 1MB (1000000 bytes), which has been a reasonable sweet spot in our tests.
1171+
* @return The stream. If there is an error, it will be returned during iteration. An empty input
1172+
* will yield 0 documents rather than an EMPTY error. Errors:
1173+
* - MEMALLOC if the parser is unallocated and memory allocation fails
1174+
* - CAPACITY if the parser already has a capacity, and it is less than batch_size
1175+
* - other json errors if parsing fails
1176+
*/
1177+
inline stream parse_many(const padded_string &s, size_t batch_size = 1000000) noexcept;
1178+
1179+
// We do not want to allow implicit conversion from C string to std::string.
1180+
really_inline doc_ref_result parse_many(const char *buf, size_t batch_size = 1000000) noexcept = delete;
1181+
9301182
/**
9311183
* Current capacity: the largest document this parser can support without reallocating.
9321184
*/

0 commit comments

Comments
 (0)