Skip to content

Commit 3e39a99

Browse files
committed
Merge branch 'master' of github.com:lemire/simdjson
Conflicts: include/simdjson/jsonstream.h
2 parents 2867dc5 + c3c4376 commit 3e39a99

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+9508
-5639
lines changed

.drone.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,17 @@
11
kind: pipeline
2+
name: x64-quicktests-libc
3+
4+
platform:
5+
os: linux
6+
arch: amd64
7+
8+
steps:
9+
- name: quicktests
10+
image: conanio/clang8
11+
user: root
12+
commands: [ EXTRAFLAGS=-stdlib=libc++ make quicktests ]
13+
---
14+
kind: pipeline
215
name: x64-quicktests
316

417
platform:

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ In particular, the following contributions are invited:
66

77
- The library is focused on performance. Well-documented performance optimizations are invited.
88
- Fixes to known or newly discovered bugs are always welcome. Typically, a bug fix should come with a test demonstrating that the bug has been fixed.
9-
- The simdjson library is advanced software and maintanability and flexibility are always a concern. Specific contributions to improve maintanability and flexibility are invited.
9+
- The simdjson library is advanced software and maintainability and flexibility are always a concern. Specific contributions to improve maintainability and flexibility are invited.
1010

1111

1212

@@ -28,5 +28,5 @@ Contributors are encouraged to
2828

2929

3030

31-
Though we do not have a formal code of conduct, we will not tolerate bullying, bigotery or intimidation. Everyone is welcome to contribute.
31+
Though we do not have a formal code of conduct, we will not tolerate bullying, bigotry or intimidation. Everyone is welcome to contribute.
3232

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,15 @@ endif # ifeq ($(SANITIZE),1)
5353
endif # ifeq ($(MEMSANITIZE),1)
5454

5555
# Headers and sources
56-
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
56+
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/json_scanner.h src/generic/json_string_scanner.h src/generic/json_structural_indexer.h src/generic/json_minifier.h src/generic/buf_block_reader.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
5757
SRCHEADERS_ARM64= src/arm64/bitmanipulation.h src/arm64/bitmask.h src/arm64/intrinsics.h src/arm64/numberparsing.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h
5858
SRCHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/haswell/intrinsics.h src/haswell/numberparsing.h src/haswell/simd.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h
5959
SRCHEADERS_FALLBACK= src/fallback/bitmanipulation.h src/fallback/implementation.h src/fallback/numberparsing.h src/fallback/stage1_find_marks.h src/fallback/stage2_build_tape.h src/fallback/stringparsing.h
6060
SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/westmere/intrinsics.h src/westmere/numberparsing.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h
6161
SRCHEADERS_SRC=src/isadetection.h src/jsoncharutils.h src/simdprune_tables.h src/implementation.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document_parser_callbacks.h
6262
SRCHEADERS=$(SRCHEADERS_SRC) $(SRCHEADERS_GENERIC) $(SRCHEADERS_ARM64) $(SRCHEADERS_HASWELL) $(SRCHEADERS_WESTMERE) $(SRCHEADERS_FALLBACK)
6363

64-
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
64+
INCLUDEHEADERS=include/simdjson.h include/simdjson/common_defs.h include/simdjson/internal/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/inline/padded_string.h include/simdjson/document.h include/simdjson/inline/document.h include/simdjson/document_iterator.h include/simdjson/inline/document_iterator.h include/simdjson/document_stream.h include/simdjson/inline/document_stream.h include/simdjson/implementation.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/inline/jsonstream.h include/simdjson/portability.h include/simdjson/error.h include/simdjson/inline/error.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h
6565

6666
ifeq ($(SIMDJSON_TEST_AMALGAMATED_HEADERS),1)
6767
HEADERS=singleheader/simdjson.h

README.md

Lines changed: 47 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,44 @@
1-
# simdjson : Parsing gigabytes of JSON per second
2-
[![Build Status](https://cloud.drone.io/api/badges/lemire/simdjson/status.svg)](https://cloud.drone.io/lemire/simdjson/)
3-
[![CircleCI](https://circleci.com/gh/lemire/simdjson.svg?style=svg)](https://circleci.com/gh/lemire/simdjson)
4-
[![Build Status](https://img.shields.io/appveyor/ci/lemire/simdjson/master.svg)](https://ci.appveyor.com/project/lemire/simdjson)
1+
# simdjson : Parsing gigabytes of JSON per second
2+
3+
<img src="images/logo.png" width="10%" style="float: right">
4+
JSON is everywhere on the Internet. Servers spend a *lot* of time parsing it. We need a fresh approach. simdjson uses commonly available SIMD instructions and microparallel algorithms to parse JSON 2.5x faster than anything else out there.
5+
6+
* **Ludicrous Speed:** Over 2.5x faster than other production-grade JSON parsers.
7+
* **Delightfully Easy:** First-class, easy to use API.
8+
* **Complete Validation:** Full JSON and UTF-8 validation, with no compromises.
9+
* **Rock-Solid Reliability:** From memory allocation to error handling, simdjson's design avoids surprises.
10+
11+
This library is part of the [Awesome Modern C++](https://awesomecpp.com) list.
12+
13+
[![Build Status](https://cloud.drone.io/api/badges/simdjson/simdjson/status.svg)](https://cloud.drone.io/simdjson/simdjson)
14+
[![CircleCI](https://circleci.com/gh/simdjson/simdjson.svg?style=svg)](https://circleci.com/gh/simdjson/simdjson)
15+
[![Build status](https://ci.appveyor.com/api/projects/status/ae77wp5v3lebmu6n/branch/master?svg=true)](https://ci.appveyor.com/project/lemire/simdjson-jmmti/branch/master)
516
[![][license img]][license]
6-
[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/simdjson.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:simdjson)
717

18+
## Quick Start
819

9-
## A C++ library to see how fast we can parse JSON with complete validation.
20+
simdjson is easily consumable with a single .h and .cpp file.
1021

11-
JSON documents are everywhere on the Internet. Servers spend a lot of time parsing these documents. We want to accelerate the parsing of JSON per se using commonly available SIMD instructions as much as possible while doing full validation (including character encoding). This library is part of the [Awesome Modern C++](https://awesomecpp.com) list.
22+
0. Prerequisites: `g++` or `clang++`.
23+
1. Pull [simdjson.h](singleheader/simdjson.h) and [simdjson.cpp](singleheader/simdjson.cpp) into a directory, along with the sample file [twitter.json](jsonexamples/twitter.json).
24+
```
25+
wget https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.h https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.cpp https://raw.githubusercontent.com/simdjson/simdjson/master/jsonexamples/twitter.json
26+
```
27+
2. Create `parser.cpp`:
1228

13-
<img src="images/logo.png" width="10%">
29+
```c++
30+
#include "simdjson.h"
31+
int main(void) {
32+
simdjson::document::parser parser;
33+
simdjson::document& tweets = parser.load("twitter.json");
34+
std::cout << tweets["search_metadata"]["count"] << " results." << std::endl;
35+
}
36+
```
37+
3. `g++ -o parser parser.cpp` (or clang++)
38+
4. `./parser`
39+
```
40+
100 results.
41+
```
1442
1543
## Real-world usage
1644
@@ -110,7 +138,7 @@ be concerned with computed gotos.
110138

111139
## Thread safety
112140

113-
The simdjson library is mostly single-threaded. Thread safety is the responsability of the caller: it is unsafe to reuse a document::parser object between different threads.
141+
The simdjson library is mostly single-threaded. Thread safety is the responsibility of the caller: it is unsafe to reuse a document::parser object between different threads.
114142

115143
If you are on an x64 processor, the runtime dispatching assigns the right code path the first time that parsing is attempted. The runtime dispatching is thread-safe.
116144

@@ -136,23 +164,23 @@ All examples below use use `#include "simdjson.h"`, `#include "simdjson.cpp"` an
136164
The simplest API to get started is `document::parse()`, which allocates a new parser, parses a string, and returns the DOM. This is less efficient if you're going to read multiple documents, but as long as you're only parsing a single document, this will do just fine.
137165

138166
```c++
139-
auto [doc, error] = document::parse(string("[ 1, 2, 3 ]"));
140-
if (error) { cerr << "Error: " << error_message(error) << endl; exit(1); }
167+
auto [doc, error] = document::parse("[ 1, 2, 3 ]"_padded);
168+
if (error) { cerr << "Error: " << error << endl; exit(1); }
141169
cout << doc;
142170
```
143171
144172
If you're using exceptions, it gets even simpler (simdjson won't use exceptions internally, so you'll only pay the performance cost of exceptions in your own calling code):
145173
146174
```c++
147-
document doc = document::parse(string("[ 1, 2, 3 ]"));
148-
cout << doc;
175+
cout << document::parse("[ 1, 2, 3 ]"_padded);
149176
```
150177

151-
The simdjson library requires SIMDJSON_PADDING extra bytes at the end of a string (it doesn't matter if the bytes are initialized). The `padded_string` class is an easy way to ensure this is accomplished up front and prevent the extra allocation:
178+
If you're wondering why the examples above use `_padded`, it's because the simdjson library requires SIMDJSON_PADDING extra bytes at the end of a string (it doesn't matter if the bytes are initialized). `_padded`
179+
is a way of creating a `padded_string` class, which assures us we have enough allocation.
152180

153181
```c++
154-
document doc = document::parse(padded_string(string("[ 1, 2, 3 ]")));
155-
cout << doc;
182+
padded_string json = "[ 1, 2, 3 ]"_padded;
183+
cout << document::parse(json);
156184
```
157185

158186
You can also load from a file with `parser.load()`:
@@ -463,7 +491,7 @@ You then have access to the following methods on the resulting `simdjson::docume
463491
* `bool move_to_key(const char *key, uint32_t length)`: as above except that the target can contain NULL characters
464492
* `void move_to_value()`: when at a key location within an object, this moves to the accompanying, value (located next to it). This is equivalent but much faster than calling `next()`.
465493
* `bool move_to_index(uint32_t index)`: when at `[`, go one level deep, and advance to the given index, if successful, we are left pointing at the value, if not, we are still pointing at the array
466-
* `bool move_to(const char *pointer, uint32_t length)`: Moves the iterator to the value correspoding to the json pointer. Always search from the root of the document. If successful, we are left pointing at the value, if not, we are still pointing the same value we were pointing before the call. The json pointer follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
494+
* `bool move_to(const char *pointer, uint32_t length)`: Moves the iterator to the value corresponding to the json pointer. Always search from the root of the document. If successful, we are left pointing at the value, if not, we are still pointing the same value we were pointing before the call. The json pointer follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
467495
* `bool move_to(const std::string &pointer) `: same as above but with a std::string parameter
468496
* `bool next()`: Within a given scope (series of nodes at the same depth within either an array or an object), we move forward. Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { and [. At the object ({) or at the array ([), you can issue a "down" to visit their content. valid if we're not at the end of a scope (returns true).
469497
* `bool prev()`: Within a given scope (series of nodes at the same depth within either an
@@ -567,15 +595,15 @@ make allparsingcompetition
567595
```
568596

569597
Both the `parsingcompetition` and `allparsingcompetition` tools take a `-t` flag which produces
570-
a table-oriented output that can be conventiently parsed by other tools.
598+
a table-oriented output that can be conveniently parsed by other tools.
571599

572600

573601
## Docker
574602

575603
One can run tests and benchmarks using docker. It especially makes sense under Linux. A privileged access may be needed to get performance counters.
576604

577605
```
578-
git clone https://github.com/lemire/simdjson.git
606+
git clone https://github.com/simdjson/simdjson.git
579607
cd simdjson
580608
docker build -t simdjson .
581609
docker run --privileged -t simdjson

amalgamation.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ int main(int argc, char *argv[]) {
143143
// parse_many
144144
const char * filename2 = argv[2];
145145
for (auto result : parser.load_many(filename2)) {
146-
error = result.error;
146+
error = result.error();
147147
}
148148
if (error) {
149149
std::cout << "parse_many failed" << std::endl;

benchmark/benchmarker.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ struct benchmarker {
263263
: filename(_filename), collector(_collector), stats(NULL) {
264264
verbose() << "[verbose] loading " << filename << endl;
265265
simdjson::error_code error;
266-
std::tie(this->json, error) = padded_string::load(filename);
266+
padded_string::load(filename).tie(this->json, error);
267267
if (error) {
268268
exit_error(string("Could not load the file ") + filename);
269269
}

benchmark/minifiercompetition.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,14 @@ int main(int argc, char *argv[]) {
9898
"despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer),
9999
memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
100100
memcpy(buffer, p.data(), p.size());
101-
102-
size_t outlength = simdjson::json_minify((const uint8_t *)buffer, p.size(),
103-
(uint8_t *)buffer);
104-
if (verbose)
105-
std::cout << "json_minify length is " << outlength << std::endl;
106-
101+
size_t outlength;
107102
uint8_t *cbuffer = (uint8_t *)buffer;
108-
BEST_TIME("json_minify", simdjson::json_minify(cbuffer, p.size(), cbuffer),
103+
for (auto imple : simdjson::available_implementations) {
104+
BEST_TIME((std::string("simdjson->minify+")+imple->name()).c_str(), (imple->minify(cbuffer, p.size(), cbuffer, outlength), outlength),
109105
outlength, memcpy(buffer, p.data(), p.size()), repeat, volume,
110106
!just_data);
107+
}
108+
111109
printf("minisize = %zu, original size = %zu (minified down to %.2f percent "
112110
"of original) \n",
113111
outlength, p.size(), outlength * 100.0 / p.size());
@@ -121,8 +119,9 @@ int main(int argc, char *argv[]) {
121119
!just_data);
122120

123121
char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1);
124-
size_t minisize = simdjson::json_minify((const uint8_t *)p.data(), p.size(),
125-
(uint8_t *)mini_buffer);
122+
size_t minisize;
123+
simdjson::active_implementation->minify((const uint8_t *)p.data(), p.size(),
124+
(uint8_t *)mini_buffer, minisize);
126125
mini_buffer[minisize] = '\0';
127126

128127
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(),
@@ -171,6 +170,7 @@ int main(int argc, char *argv[]) {
171170
automated_reallocation),
172171
simdjson::SUCCESS, memcpy(buffer, mini_buffer, p.size()), repeat, volume,
173172
!just_data);
173+
174174
free(buffer);
175175
free(ast_buffer);
176176
free(mini_buffer);

benchmark/parse.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,12 @@ struct option_struct {
109109
case 'a': {
110110
const implementation *impl = simdjson::available_implementations[optarg];
111111
if (!impl) {
112-
exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a haswell, westmere or arm64");
112+
std::string exit_message = string("Unsupported option value -a ") + optarg + ": expected -a with one of ";
113+
for (auto imple : simdjson::available_implementations) {
114+
exit_message += imple->name();
115+
exit_message += " ";
116+
}
117+
exit_usage(exit_message);
113118
}
114119
simdjson::active_implementation = impl;
115120
break;

doc/tape.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,12 +84,12 @@ Simple JSON nodes are represented with one tape element:
8484
## Integer and Double values
8585

8686
Integer values are represented as two 64-bit tape elements:
87-
- The 64-bit value `('l' << 56)` followed by the 64-bit integer value litterally. Integer values are assumed to be signed 64-bit values, using two's complement notation.
88-
- The 64-bit value `('u' << 56)` followed by the 64-bit integer value litterally. Integer values are assumed to be unsigned 64-bit values.
87+
- The 64-bit value `('l' << 56)` followed by the 64-bit integer value literally. Integer values are assumed to be signed 64-bit values, using two's complement notation.
88+
- The 64-bit value `('u' << 56)` followed by the 64-bit integer value literally. Integer values are assumed to be unsigned 64-bit values.
8989

9090

9191
Float values are represented as two 64-bit tape elements:
92-
- The 64-bit value `('d' << 56)` followed by the 64-bit double value litterally in standard IEEE 754 notation.
92+
- The 64-bit value `('d' << 56)` followed by the 64-bit double value literally in standard IEEE 754 notation.
9393

9494
Performance consideration: We store numbers on the main tape because we believe that locality of reference is helpful for performance.
9595

include/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ set(SIMDJSON_INCLUDE
1616
${SIMDJSON_INCLUDE_DIR}/simdjson/inline/padded_string.h
1717
${SIMDJSON_INCLUDE_DIR}/simdjson/internal/jsonformatutils.h
1818
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h
19-
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonminifier.h
2019
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonparser.h
2120
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonstream.h
2221
${SIMDJSON_INCLUDE_DIR}/simdjson/padded_string.h

include/simdjson.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
#include "simdjson/implementation.h"
1111
#include "simdjson/document.h"
1212
#include "simdjson/document_stream.h"
13-
#include "simdjson/jsonminifier.h"
1413

1514
// Deprecated API
1615
#include "simdjson/parsedjsoniterator.h"

include/simdjson/document.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1522,6 +1522,14 @@ class document::parser {
15221522
//
15231523
size_t _max_depth;
15241524

1525+
//
1526+
// The loaded buffer (reused each time load() is called)
1527+
//
1528+
std::unique_ptr<char[], decltype(&aligned_free_char)> loaded_bytes;
1529+
1530+
// Capacity of loaded_bytes buffer.
1531+
size_t _loaded_bytes_capacity{0};
1532+
15251533
// all nodes are stored on the doc.tape using a 64-bit word.
15261534
//
15271535
// strings, double and ints are stored as
@@ -1543,6 +1551,11 @@ class document::parser {
15431551
// and auto-allocate if not.
15441552
inline error_code ensure_capacity(size_t desired_capacity) noexcept;
15451553

1554+
//
1555+
// Read the file into loaded_bytes
1556+
//
1557+
inline simdjson_result<size_t> read_file(const std::string &path) noexcept;
1558+
15461559
#if SIMDJSON_EXCEPTIONS
15471560
// Used internally to get the document
15481561
inline const document &get_document() const noexcept(false);
@@ -1555,7 +1568,7 @@ class document::parser {
15551568
/**
15561569
* Minifies a JSON element or document, printing the smallest possible valid JSON.
15571570
*
1558-
* document doc = document::parse(" [ 1 , 2 , 3 ] "_pad);
1571+
* document doc = document::parse(" [ 1 , 2 , 3 ] "_padded);
15591572
* cout << minify(doc) << endl; // prints [1,2,3]
15601573
*
15611574
*/

0 commit comments

Comments
 (0)