Skip to content

Commit f1b4a54

Browse files
authored
add fuzz element (simdjson#1204)
* add definitions for is_number and tie (by lemire) * add fuzzer for element * update fuzz documentation * fix UB in creating an empty padded string * don't bother null terminating padded_string, it is done by the std::memset already * refactor fuzz data splitting into a separate class
1 parent 58e7106 commit f1b4a54

11 files changed

+379
-106
lines changed

.github/workflows/fuzzers.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
runs-on: ubuntu-latest
1414
env:
1515
# fuzzers that use the default implementation
16-
defaultimplfuzzers: atpointer dump dump_raw_tape minify parser print_json
16+
defaultimplfuzzers: atpointer dump dump_raw_tape element minify parser print_json
1717
# fuzzers that loop over the implementations themselves
1818
implfuzzers: implementations minifyimpl utf8
1919
implementations: haswell westmere fallback

fuzz/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ if(ENABLE_FUZZING)
5353
implement_fuzzer(fuzz_atpointer)
5454
implement_fuzzer(fuzz_dump)
5555
implement_fuzzer(fuzz_dump_raw_tape)
56+
implement_fuzzer(fuzz_element)
5657
implement_fuzzer(fuzz_implementations) # parses and serializes again, compares across implementations
5758
implement_fuzzer(fuzz_minify) # minify *with* parsing
5859
implement_fuzzer(fuzz_minifyimpl) # minify *without* parsing, plus compare implementations

fuzz/FuzzUtils.h

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
#define SIMDJSON_FUZZUTILS_H
33

44
#include <cstdint>
5+
#include <vector>
6+
#include <string_view>
7+
#include <cstring> //memcpy
58

69
// view data as a byte pointer
710
template <typename T> inline const std::uint8_t* as_bytes(const T* data) {
@@ -14,4 +17,142 @@ template <typename T> inline const char* as_chars(const T* data) {
1417
}
1518

1619

20+
21+
22+
// Splits the input into strings, using a four byte separator which is human
// readable. Makes for nicer debugging of fuzz data.
// See https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#magic-separator
// for background. Note: don't use memmem, it is not standard C++.
//
// The returned views alias [Data, Data+Size); nothing is copied. The result
// always holds at least one element: input without any separator (including
// empty input) yields a single view covering everything. Adjacent, leading or
// trailing separators produce empty views.
inline std::vector<std::string_view> split(const char* Data, size_t Size) {
  using namespace std::literals;
  constexpr auto sep = "\n~~\n"sv;

  std::vector<std::string_view> ret;
  std::string_view rest(Data, Size);
  for (auto pos = rest.find(sep); pos != std::string_view::npos;
       pos = rest.find(sep)) {
    ret.push_back(rest.substr(0, pos));
    // continue searching right after the separator that was just found
    rest.remove_prefix(pos + sep.size());
  }
  // whatever follows the last separator (possibly nothing)
  ret.push_back(rest);
  return ret;
}

// Generic helper to split fuzz data into usable parts, like ints etc.
// Note that it does not throw, instead it sets the data pointer to null
// (and the size to zero) if the input is exhausted. Use the explicit bool
// conversion to find out if that has happened.
struct FuzzData {
  // data may not be null, even if size is zero.
  FuzzData(const uint8_t* data, size_t size) : Data(data), Size(size) {}

  // Returns an int in the inclusive range [Min,Max], consuming one, two or
  // four bytes of input depending on how wide the range is. If there is not
  // enough input left, Min is returned and the object is marked as exhausted.
  template <int Min, int Max>
  int getInt() {
    static_assert(Min < Max, "min must be <max");

    // make this constexpr, can't overflow because that is UB and is forbidden
    // in constexpr evaluation
    constexpr int range = (Max - Min) + 1;
    constexpr unsigned int urange = range;

    // don't use std::uniform_int_distribution, we don't want to pay for
    // over consumption of random data. Accept the slightly non-uniform
    // distribution.
    if constexpr (range < 256) {
      return Min + static_cast<int>(get<uint8_t>() % urange);
    } else if constexpr (range < 65536) {
      return Min + static_cast<int>(get<uint16_t>() % urange);
    } else {
      return Min + static_cast<int>(get<uint32_t>() % urange);
    }
  }

  // Reads sizeof(T) bytes of input and returns them reinterpreted as a T
  // (copied with memcpy, so the value depends on the host byte order).
  // If fewer bytes remain, a value-initialized T is returned and the object
  // is marked as exhausted.
  template <typename T>
  T get() {
    const auto Nbytes = sizeof(T);
    T ret{};
    if (Size < Nbytes) {
      // don't throw, signal with null instead.
      Data = nullptr;
      Size = 0;
      return ret;
    }
    std::memcpy(&ret, Data, Nbytes);
    Data += Nbytes;
    Size -= Nbytes;
    return ret;
  }

  // gets a string view with length in [Min,Max]
  // The length is drawn with getInt<Min,Max>() and that many bytes are then
  // taken from the input. A request that consumes exactly all the remaining
  // input succeeds. If the length could not be read or more bytes are asked
  // for than what remains, an empty view is returned and the object is marked
  // as exhausted.
  template <int Min, int Max>
  std::string_view get_stringview() {
    static_assert(Min >= 0, "Min must be non-negative");
    const int len = getInt<Min, Max>();
    const unsigned int ulen = static_cast<unsigned int>(len);
    // Data is null here if getInt() ran out of input; keep that failure
    // sticky instead of handing out a zero length view.
    if (Data != nullptr && ulen <= Size) {
      std::string_view ret(chardata(), ulen);
      Data += ulen;
      Size -= ulen;
      return ret;
    }

    // mark that there is too little data to fulfill the request
    Data = nullptr;
    Size = 0;

    return {};
  }

  // split the remainder of the data into string views (see split() above).
  // Returns an empty vector if there is no data left.
  std::vector<std::string_view> splitIntoStrings() {
    std::vector<std::string_view> ret;
    if (Size > 0) {
      ret = split(chardata(), Size);
      // all data consumed.
      Data += Size;
      Size = 0;
    }
    return ret;
  }

  // are we good? false once a request could not be fulfilled.
  explicit operator bool() const { return Data != nullptr; }

  // we are a URBG
  // https://en.cppreference.com/w/cpp/named_req/UniformRandomBitGenerator
  // The type G satisfies UniformRandomBitGenerator if Given
  //   T, the type named by G::result_type
  //   g, a value of type G
  //
  // The following expressions must be valid and have their specified effects
  // Expression       Return type   Requirements
  // G::result_type   T             T is an unsigned integer type
  using result_type = uint8_t;
  // G::min()         T             Returns the smallest value that G's
  //                                operator() may return. The value is
  //                                strictly less than G::max(). The function
  //                                must be constexpr.
  static constexpr result_type min() { return 0; }
  // G::max()         T             Returns the largest value that G's
  //                                operator() may return. The value is
  //                                strictly greater than G::min(). The
  //                                function must be constexpr.
  static constexpr result_type max() { return 255; }
  // g()              T             Returns a value in the closed interval
  //                                [G::min(), G::max()]. Has amortized
  //                                constant complexity.
  result_type operator()() {
    if (Size == 0) {
      // return something varying, otherwise uniform_int_distribution may get
      // stuck
      return failcount++;
    }
    const result_type ret = Data[0];
    Data++;
    Size--;
    return ret;
  }

  // returns a pointer to data as const char* to avoid those cstyle casts
  const char* chardata() const {
    return static_cast<const char*>(static_cast<const void*>(Data));
  }

  // members
  const uint8_t* Data;   // next unread byte, null once input is exhausted
  size_t Size;           // number of unread bytes
  uint8_t failcount = 0; // varying filler returned by operator() when empty
};
156+
157+
17158
#endif // SIMDJSON_FUZZUTILS_H

fuzz/Fuzzing.md

Lines changed: 43 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -9,40 +9,58 @@
99

1010
The simdjson library tries to follow [fuzzing best practises](https://google.github.io/oss-fuzz/advanced-topics/ideal-integration/#summary).
1111

12-
The simdjson library is continuously fuzzed on [oss-fuzz](https://github.com/google/oss-fuzz). In case a bug is found, the offending input is minimized and tested for reproducibility. A report with the details is automatically filed, and the contact persons at simdjson are notified via email. An issue is opened at the oss-fuzz bugtracker with restricted view access. When the bug is fixed, the issue is automatically closed.
12+
There are both "normal" fuzzers, which simply feed the API with fuzz data, and **differential** fuzzers. The differential fuzzers feed the same data to multiple implementations (haswell, westmere and fallback) and ensure the same results are achieved. This makes sure the user will always get the same answer regardless of which implementation is in use.
1313

14-
Bugs are automatically made visible to the public after a period of time. An example of a bug that was found, fixed and closed can be seen here: [oss-fuzz 18714](https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=18714).
14+
The fuzzers are used in several ways.
1515

16+
* local fuzzing - for developers testing their changes before pushing and/or during development of the fuzzers themselves.
17+
* CI fuzzing - for weeding out those easy to find bugs in pull requests, before they are merged.
18+
* oss-fuzz - heavy-duty 24/7 fuzzing provided by the Google-driven oss-fuzz project.
1619

17-
## Currently open bugs
20+
## Local fuzzing
21+
Just invoke fuzz/quick_check.sh; it will download the latest corpus from bintray (kept up to date by the CI fuzzers) and run the fuzzers for a short time. If you want to run the fuzzers for longer, modify the timeout value in the script or invoke the fuzzer directly.
1822

23+
This requires linux with clang and cmake installed (recent Debian and Ubuntu are known to work fine).
1924

20-
You can find the currently opened bugs, if any at [bugs.chromium.org](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&q=proj%3Asimdjson&can=2): make sure not to miss the "Open Issues" selector. Bugs that are fixed by follow-up commits are automatically closed.
25+
It is also possible to run the full oss-fuzz setup by following [these oss-fuzz instructions](https://google.github.io/oss-fuzz/getting-started/new-project-guide/#testing-locally) with PROJECT_NAME set to simdjson. You will need rights to run docker.
2126

22-
## Integration with oss-fuzz
27+
## Fuzzing as a CI job - x64
2328

24-
Changes to the integration with oss-fuzz are made by making pull requests against the oss-fuzz github repo. An example can be seen at [oss-fuzz pull request 3013](https://github.com/google/oss-fuzz/pull/3013).
29+
There is a CI job which builds and runs the fuzzers. This is aimed to catch the "easy to fuzz" bugs quickly, without having to wait until pull requests are merged and eventually built and run by oss-fuzz.
2530

26-
As little code as possible is kept at oss-fuzz since it is inconvenient to change. The [oss-fuzz build script](https://github.com/google/oss-fuzz/blob/b96dd54183f727a5d90c786e0fb01ec986c74d30/projects/simdjson/build.sh#L18) invokes [the script from the simdjson repo](https://github.com/simdjson/simdjson/blob/master/fuzz/ossfuzz.sh).
31+
The CI job does the following
32+
- builds a fast fuzzer, with full optimization but fewer checks, which is good at rapidly exploring the input space
33+
- builds a heavily sanitized fuzzer, which is good at detecting errors
34+
- downloads the stored corpus
35+
- runs the fast fuzzer build for a while, to grow the corpus
36+
- runs the sanitizer fuzzer for a while, using the input found by the fast fuzzer
37+
- using a reproduce build (uninstrumented), executes a subset of the test cases in the corpus through valgrind
38+
- minimizes the corpus and uploads it (if on the master branch)
39+
- stores the corpus and valgrind output as artifacts
2740

41+
The job is available under the actions tab, here is a [direct link](https://github.com/simdjson/simdjson/actions?query=workflow%3A%22Fuzz+and+run+valgrind%22).
2842

43+
The corpus will grow over time and easy to find bugs will be detected already during the pull request stage. Also, it will keep the fuzzer builds from bit rot.
2944

30-
## Fuzzing as a CI job
45+
## Fuzzing as a CI job - arm64
46+
There is also a job running the fuzzers on arm64 (see .drone.yml) to make sure the arm-specific parts are fuzzed as well. This job does not update the corpus; it just reuses what the x64 job finds.
3147

32-
There is a CI job which builds and runs the fuzzers. This is aimed to catch the "easy to fuzz" bugs quickly, without having to wait until pull requests are merged and eventually built and run by oss-fuzz.
48+
## Fuzzing on oss-fuzz
49+
The simdjson library is continuously fuzzed on [oss-fuzz](https://github.com/google/oss-fuzz). In case a bug is found, the offending input is minimized and tested for reproducibility. A report with the details is automatically filed, and the contact persons at simdjson are notified via email. An issue is opened at the oss-fuzz bugtracker with restricted view access. When the bug is fixed, the issue is automatically closed.
3350

34-
The CI job does the following
35-
- builds several variants (with/without avx, with/without sanitizers, a fast fuzzer)
36-
- downloads the stored corpus
37-
- runs the fastest fuzzer build for 30 seconds, to grow the corpus
38-
- runs each build variant for 10 seconds on each fuzzer
39-
- using a reproduce build (uninstrumented), executes all the test cases in the corpus through valgrind
40-
- minimizes the corpus and upload it (if on the master branch)
41-
- store the corpus and valgrind output as artifacts
51+
Bugs are automatically made visible to the public after a period of time. An example of a bug that was found, fixed and closed can be seen here: [oss-fuzz 18714](https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=18714).
4252

43-
The job is available under the actions tab, here is a [direct link](https://github.com/simdjson/simdjson/actions?query=workflow%3A%22Run+fuzzers+on+stored+corpus+and+test+it+with+valgrind%22).
4453

45-
The corpus will grow over time and easy to find bugs will be detected already during the pull request stage. Also, it will keep the fuzzer builds from bit rot.
54+
## Currently open bugs
55+
56+
You can find the currently open bugs (if any) at [bugs.chromium.org](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&q=proj%3Asimdjson&can=2): make sure not to miss the "Open Issues" selector. Bugs that are fixed by follow-up commits are automatically closed.
57+
58+
## Integration with oss-fuzz
59+
60+
Changes to the integration with oss-fuzz are made by making pull requests against the oss-fuzz github repo. An example can be seen at [oss-fuzz pull request 3013](https://github.com/google/oss-fuzz/pull/3013).
61+
62+
As little code as possible is kept at oss-fuzz since it is inconvenient to change. The [oss-fuzz build script](https://github.com/google/oss-fuzz/blob/b96dd54183f727a5d90c786e0fb01ec986c74d30/projects/simdjson/build.sh#L18) invokes [the script from the simdjson repo](https://github.com/simdjson/simdjson/blob/master/fuzz/ossfuzz.sh).
63+
4664

4765
## Corpus
4866

@@ -55,32 +73,13 @@ One can also grab the corpus as an artifact from the github actions job. Pick a
5573
The code coverage from fuzzing is most easily viewed on the [oss-fuzz status panel](https://oss-fuzz.com/fuzzer-stats). Viewing the coverage does not require login, but the direct link is not easy to find. Substitute the date in the URL to get a more recent link:
5674
[https://storage.googleapis.com/oss-fuzz-coverage/simdjson/reports/20200411/linux/src/simdjson/report.html](https://storage.googleapis.com/oss-fuzz-coverage/simdjson/reports/20200411/linux/src/simdjson/report.html)
5775

76+
Keeping the coverage up is a never-ending job. See [issue 368](https://github.com/simdjson/simdjson/issues/368).
5877

59-
## Running the fuzzers locally
60-
61-
This has only been tested on Linux (Debian and Ubuntu are known to work).
62-
63-
Make sure you have clang and cmake installed.
64-
The easiest way to get started is to run the following, standing in the root of the checked out repo:
65-
```
66-
fuzz/build_like_ossfuzz.sh
67-
```
68-
69-
Then invoke a fuzzer as shown by the following example:
70-
```
71-
mkdir -p out/parser
72-
build/fuzz/fuzz_parser out/parser/
73-
```
74-
75-
You can also use the more extensive fuzzer build script to get a variation of builds by using
76-
```
78+
## Reproducing
79+
To reproduce a test case, follow the local build instructions. Then invoke the fuzzer (fuzz_parser is shown as an example below) with the test case as a command line argument:
80+
```shell
7781
fuzz/build_fuzzer_variants.sh
82+
build-sanitizers/fuzz/fuzz_parser my_testcase.json
7883
```
84+
In case this does not reproduce the bug, you may want to proceed with reproducing using the oss-fuzz tools. See the instructions [here](https://google.github.io/oss-fuzz/advanced-topics/reproducing/).
7985

80-
It is also possible to run the full oss-fuzz setup by following [these oss-fuzz instructions](https://google.github.io/oss-fuzz/getting-started/new-project-guide/#testing-locally) with PROJECT_NAME set to simdjson. You will need rights to run docker.
81-
82-
## Reproducing
83-
To reproduce a test case, build the fuzzers, then invoke it with the testcase as a command line argument:
84-
```
85-
build/fuzz/fuzz_parser my_testcase.json
86-
```

fuzz/fuzz_atpointer.cpp

Lines changed: 33 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,68 +1,47 @@
1-
#include "simdjson.h"
21
#include "FuzzUtils.h"
2+
#include "simdjson.h"
33
#include <cstddef>
44
#include <cstdint>
55
#include <string>
66
#include <string_view>
77

8-
struct FuzzData {
9-
std::string_view json_pointer;
10-
std::string_view json_doc;
11-
};
12-
13-
/**
14-
* @brief split split fuzz data into a pointer and a document
15-
* @param Data
16-
* @param Size
17-
* @return
18-
*/
19-
FuzzData split(const char *Data, size_t Size) {
20-
21-
using namespace std::literals;
22-
constexpr auto sep="\n~~~\n"sv;
23-
24-
std::string_view all(Data,Size);
25-
auto pos=all.find(sep);
26-
if(pos==std::string_view::npos) {
27-
//not found.
28-
return FuzzData{std::string_view{},all};
29-
} else {
30-
return FuzzData{std::string_view{all.substr(0,pos)},all.substr(pos+sep.size())};
31-
}
32-
}
33-
348
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
359

36-
// Split data into two strings, json pointer and the document string.
37-
// Might end up with none, either or both being empty, important for
38-
// covering edge cases such as https://github.com/simdjson/simdjson/issues/1142
39-
// Inputs missing the separator line will get an empty json pointer
40-
// but the all the input put in the document string. This means
41-
// test data from other fuzzers that take json input works for this fuzzer
42-
// as well.
43-
const auto fd=split(as_chars(Data),Size);
44-
45-
simdjson::dom::parser parser;
46-
47-
// parse without exceptions, for speed
48-
auto res=parser.parse(fd.json_doc.data(),fd.json_doc.size());
49-
if(res.error())
50-
return 0;
10+
// Split data into two strings, json pointer and the document string.
11+
// Might end up with none, either or both being empty, important for
12+
// covering edge cases such as
13+
// https://github.com/simdjson/simdjson/issues/1142 Inputs missing the
14+
// separator line will get an empty json pointer but the all the input put in
15+
// the document string. This means test data from other fuzzers that take json
16+
// input works for this fuzzer as well.
17+
FuzzData fd(Data, Size);
18+
auto strings = fd.splitIntoStrings();
19+
while (strings.size() < 2) {
20+
strings.emplace_back();
21+
}
22+
assert(strings.size() >= 2);
23+
24+
simdjson::dom::parser parser;
25+
26+
// parse without exceptions, for speed
27+
auto res = parser.parse(strings[0]);
28+
if (res.error())
29+
return 0;
5130

52-
simdjson::dom::element root;
53-
if(res.get(root))
54-
return 0;
31+
simdjson::dom::element root;
32+
if (res.get(root))
33+
return 0;
5534

56-
auto maybe_leaf=root.at_pointer(fd.json_pointer);
57-
if(maybe_leaf.error())
58-
return 0;
35+
auto maybe_leaf = root.at_pointer(strings[1]);
36+
if (maybe_leaf.error())
37+
return 0;
5938

60-
simdjson::dom::element leaf;
61-
if(maybe_leaf.get(leaf))
62-
return 0;
39+
simdjson::dom::element leaf;
40+
if (maybe_leaf.get(leaf))
41+
return 0;
6342

64-
std::string_view sv;
65-
if(leaf.get_string().get(sv))
66-
return 0;
43+
std::string_view sv;
44+
if (leaf.get_string().get(sv))
6745
return 0;
46+
return 0;
6847
}

0 commit comments

Comments
 (0)