Skip to content

Commit 14039d0

Browse files
authored
Adding a new benchmark for ondemand: distinct user id (simdjson#1239)
* Adding a distinct user id benchmark * reenabling everything * Removing an unnecessary "value()". * Better tests of the examples and some fixes. * Guarding exception code.
1 parent c592da4 commit 14039d0

File tree

6 files changed

+365
-5
lines changed

6 files changed

+365
-5
lines changed

benchmark/bench_ondemand.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,8 @@ SIMDJSON_POP_DISABLE_WARNINGS
1919
#include "kostya/iter.h"
2020
#include "kostya/dom.h"
2121

22+
#include "distinctuserid/ondemand.h"
23+
#include "distinctuserid/dom.h"
24+
25+
2226
BENCHMARK_MAIN();
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
2+
#pragma once
3+
#include <vector>
4+
#include <cstdint>
5+
#include "event_counter.h"
6+
#include "json_benchmark.h"
7+
8+
9+
// Returns true when the two NUL-terminated C strings s1 and s2 have identical
// contents. Declared inline because this helper is defined in a header: without
// it, including the header from a second translation unit would produce
// duplicate-symbol link errors.
inline bool equals(const char *s1, const char *s2) { return strcmp(s1, s2) == 0; }
10+
11+
// Sorts v in ascending order and removes repeated values so that each distinct
// value remains exactly once. Declared inline because the function is defined
// in a header and must not yield multiple definitions when the header is
// included from more than one translation unit.
inline void remove_duplicates(std::vector<int64_t> &v) {
  std::sort(v.begin(), v.end());
  // std::unique only compacts adjacent duplicates, hence the sort above.
  v.erase(std::unique(v.begin(), v.end()), v.end());
}
16+
17+
//
18+
// Interface
19+
//
20+
21+
namespace distinct_user_id {
22+
// Benchmark entry point, parameterized on the implementation type T.
// Only declared here; the definition follows in the implementation section.
template<typename T> static void DistinctUserID(benchmark::State &state);
23+
} // namespace distinct_user_id
24+
25+
//
26+
// Implementation
27+
//
28+
29+
#include "dom.h"
30+
31+
32+
namespace distinct_user_id {
33+
34+
using namespace simdjson;
35+
36+
// Benchmark driver: loads twitter.json from the benchmark data directory and
// hands it to JsonBenchmark<T, Dom>. When the file cannot be loaded, the error
// is written to std::cerr and the benchmark is skipped with "error loading".
template<typename T> static void DistinctUserID(benchmark::State &state) {
  constexpr const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";

  // Load the JSON file
  padded_string json;
  const error_code load_error = padded_string::load(TWITTER_JSON).get(json);
  if (load_error) {
    std::cerr << load_error << std::endl;
    state.SkipWithError("error loading");
    return;
  }

  JsonBenchmark<T, Dom>(state, json);
}
51+
52+
} // namespace distinct_user_id

benchmark/distinctuserid/dom.h

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#pragma once
2+
3+
#if SIMDJSON_EXCEPTIONS
4+
5+
#include "distinctuserid.h"
6+
7+
namespace distinct_user_id {
8+
9+
using namespace simdjson;
10+
11+
12+
simdjson_really_inline void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::element element);
13+
void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::array array) {
14+
for (auto child : array) {
15+
simdjson_recurse(v, child);
16+
}
17+
}
18+
void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::object object) {
19+
for (auto [key, value] : object) {
20+
if((key.size() == 4) && (memcmp(key.data(), "user", 4) == 0)) {
21+
// we are in an object under the key "user"
22+
simdjson::error_code error;
23+
simdjson::dom::object child_object;
24+
simdjson::dom::object child_array;
25+
if (not (error = value.get(child_object))) {
26+
for (auto [child_key, child_value] : child_object) {
27+
if((child_key.size() == 2) && (memcmp(child_key.data(), "id", 2) == 0)) {
28+
int64_t x;
29+
if (not (error = child_value.get(x))) {
30+
v.push_back(x);
31+
}
32+
}
33+
simdjson_recurse(v, child_value);
34+
}
35+
} else if (not (error = value.get(child_array))) {
36+
simdjson_recurse(v, child_array);
37+
}
38+
// end of: we are in an object under the key "user"
39+
} else {
40+
simdjson_recurse(v, value);
41+
}
42+
}
43+
}
44+
// Dispatches on the kind of a DOM element: arrays and objects are handed to
// their dedicated overloads; an element that converts to neither is ignored.
simdjson_really_inline void simdjson_recurse(std::vector<int64_t> & v, simdjson::dom::element element) {
  simdjson::dom::array as_array;
  if (!element.get(as_array)) {
    simdjson_recurse(v, as_array);
    return;
  }

  simdjson::dom::object as_object;
  if (!element.get(as_object)) {
    simdjson_recurse(v, as_object);
  }
}
54+
55+
56+
57+
58+
59+
class Dom {
60+
public:
61+
simdjson_really_inline bool Run(const padded_string &json);
62+
simdjson_really_inline const std::vector<int64_t> &Result() { return ids; }
63+
simdjson_really_inline size_t ItemCount() { return ids.size(); }
64+
65+
private:
66+
dom::parser parser{};
67+
std::vector<int64_t> ids{};
68+
69+
};
70+
// Debug helper: writes every value of v to std::cout on one line, each value
// followed by a single space, then terminates the line with std::endl.
// Declared inline because it is defined in a header; a plain definition would
// be duplicated in every translation unit that includes the header.
inline void print_vec(const std::vector<int64_t> &v) {
  for (const int64_t value : v) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
}
76+
77+
// Runs one benchmark iteration: discards the previous results, parses json
// into a DOM tree, gathers user ids by walking the whole tree, and
// de-duplicates them. Always returns true; no error code is checked here
// (this header is compiled only when SIMDJSON_EXCEPTIONS is set).
simdjson_really_inline bool Dom::Run(const padded_string &json) {
  ids.clear();

  const dom::element root = parser.parse(json);
  simdjson_recurse(ids, root);

  remove_duplicates(ids);
  return true;
}
84+
85+
BENCHMARK_TEMPLATE(DistinctUserID, Dom);
86+
87+
} // namespace distinct_user_id
88+
89+
#endif // SIMDJSON_EXCEPTIONS

benchmark/distinctuserid/ondemand.h

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#pragma once
2+
3+
#if SIMDJSON_EXCEPTIONS
4+
5+
#include "distinctuserid.h"
6+
7+
namespace distinct_user_id {
8+
9+
using namespace simdjson;
10+
using namespace simdjson::builtin;
11+
12+
13+
class OnDemand {
14+
public:
15+
OnDemand() {
16+
if(!displayed_implementation) {
17+
std::cout << "On Demand implementation: " << builtin_implementation()->name() << std::endl;
18+
displayed_implementation = true;
19+
}
20+
}
21+
simdjson_really_inline bool Run(const padded_string &json);
22+
simdjson_really_inline const std::vector<int64_t> &Result() { return ids; }
23+
simdjson_really_inline size_t ItemCount() { return ids.size(); }
24+
25+
private:
26+
ondemand::parser parser{};
27+
std::vector<int64_t> ids{};
28+
29+
static inline bool displayed_implementation = false;
30+
};
31+
32+
// Runs one benchmark iteration with the On Demand API: for every entry of the
// document's "statuses" array, records the "id" of its "user" and, when a
// "retweeted_status" field is present, the "id" of that retweet's "user".
// The collected ids are de-duplicated before returning. Always returns true.
simdjson_really_inline bool OnDemand::Run(const padded_string &json) {
33+
// Start from an empty list so results of an earlier call do not accumulate.
ids.clear();
34+
// Walk the document, parsing as we go
35+
auto doc = parser.iterate(json);
36+
for (ondemand::object tweet : doc["statuses"]) {
37+
// We believe that all statuses have a matching
38+
// user, and we are willing to throw when they do not:
39+
//
40+
// You might think that you do not need the braces, but
41+
// you do, otherwise you will get the wrong answer. That is
42+
// because you can only have one active object or array
43+
// at a time.
44+
{
45+
ondemand::object user = tweet["user"];
46+
int64_t id = user["id"];
47+
ids.push_back(id);
48+
}
49+
// Not all tweets have a "retweeted_status", but when they do
50+
// we want to go and find the user within.
51+
auto retweet = tweet["retweeted_status"];
52+
// A lookup error is treated as "no retweet" and the tweet is skipped silently.
if(!retweet.error()) {
53+
ondemand::object retweet_content = retweet;
54+
ondemand::object reuser = retweet_content["user"];
55+
int64_t rid = reuser["id"];
56+
ids.push_back(rid);
57+
}
58+
}
59+
// The same user can appear in several tweets; keep each id only once.
remove_duplicates(ids);
60+
return true;
61+
}
62+
63+
BENCHMARK_TEMPLATE(DistinctUserID, OnDemand);
64+
65+
} // namespace distinct_user_id
66+
67+
#endif // SIMDJSON_EXCEPTIONS

doc/ondemand.md

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ auto doc = parser.iterate(json);
2929
for (auto tweet : doc["statuses"]) {
3030
std::string_view text = tweet["text"];
3131
std::string_view screen_name = tweet["user"]["screen_name"];
32+
std::string_view screen_name;
33+
{
34+
ondemand::object user = tweet["user"];
35+
screen_name = user["screen_name"];
36+
}
3237
uint64_t retweets = tweet["retweet_count"];
3338
uint64_t favorites = tweet["favorite_count"];
3439
cout << screen_name << " (" << retweets << " retweets / " << favorites << " favorites): " << text << endl;
@@ -313,6 +318,11 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
313318
rely on error chaining, so it is possible to delay error checks: we shall shortly explain error
314319
chaining more fully.
315320

321+
NOTE: You should always have such a `document` instance (here `doc`) and it should remain in scope for the duration
322+
of your parsing function. In particular, you should not use the returned document as a temporary (e.g., `auto x = parser.iterate(json).get_object();`)
323+
followed by other operations as the destruction of the `document` instance makes all of the derived instances
324+
ill-defined.
325+
316326

317327
3. We iterate over the "statuses" field using a typical C++ iterator, reading past the initial
318328
`{ "statuses": [ {`.
@@ -355,6 +365,11 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
355365
when you attempt to cast the final `simdjson_result<object>` to object. Upon casting, an exception is
356366
thrown if there was an error.
357367

368+
NOTE: while the document can be queried once for a key as if it were an object, it is not an actual object
369+
instance. If you need to treat it as an object (e.g., to query more than one key), you can cast it as
370+
such `ondemand::object root_object = doc.get_object();`.
371+
372+
358373
4. We get the `"text"` field as a string.
359374

360375
```c++
@@ -379,20 +394,28 @@ To help visualize the algorithm, we'll walk through the example C++ given at the
379394
4. We get the `"screen_name"` from the `"user"` object.
380395

381396
```c++
382-
std::string_view screen_name = tweet["user"]["screen_name"];
397+
ondemand::object user = tweet["user"];
398+
screen_name = user["screen_name"];
383399
```
384400

385401
First, `["user"]` checks whether there are any more object fields by looking for either `,` or
386402
`}`. Then it matches `"user"` and validates the `:`.
387403

388404
`["screen_name"]` then converts to object, checking for `{`, and finds `"screen_name"`.
389405

390-
To convert to string, `lemire` is written to the document's string buffer, which now has *two*
391-
string_views pointing into it, and looks like `first!\0lemire\0`.
406+
To convert the result to a usable string (i.e., the screen name `lemire`), the characters are written to the document's
407+
string buffer (after possibly unescaping them), which now has *two* string_views pointing into it, and looks like `first!\0lemire\0`.
392408

393409
Finally, the temporary user object is destroyed, causing it to skip the remainder of the object
394410
(`}`).
395411

412+
NOTE: You may only have one array or object active at any given time. An array or an object becomes
413+
active when the `ondemand::object` or `ondemand::array` is created, and it releases its 'focus' when
414+
its destructor is called. If you create an array or an object located inside a parent object or array,
415+
the child array or object becomes active while the parent becomes temporarily inactive. If you access
416+
several sibling objects or arrays, you must ensure that the destructor is called by scoping each access
417+
(see Iteration Safety section below for further details).
418+
396419
5. We get `"retweet_count"` and `"favorite_count"` as unsigned integers.
397420

398421
```c++
@@ -484,8 +507,6 @@ for(auto field : doc.get_object()) {
484507
}
485508
```
486509

487-
488-
489510
### Iteration Safety
490511

491512
The On Demand API is powerful. To compensate, we add some safeguards to ensure that it can be used without fear
@@ -501,6 +522,48 @@ in production systems:
501522
if it was `nullptr` but did not care what the actual value was--it will iterate. The destructor automates
502523
the iteration.
503524

525+
Some care is needed when using the On Demand API in scenarios where you need to access several sibling arrays or objects because
526+
only one object or array can be active at any one time. Let us consider the following example:
527+
528+
```C++
529+
ondemand::parser parser;
530+
const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded;
531+
auto doc = parser.iterate(json);
532+
ondemand::object parent = doc["parent"];
533+
// parent owns the focus
534+
ondemand::object c1 = parent["child1"];
535+
// c1 owns the focus
536+
//
537+
if(std::string_view(c1["name"]) != "John") { ... }
538+
// c2 attempts to grab the focus from parent but fails
539+
ondemand::object c2 = parent["child2"];
540+
// c2 is now in an unsafe state and the following line would be unsafe
541+
// if(std::string_view(c2["name"]) != "Daniel") { return false; }
542+
```
543+
544+
A correct usage is given by the following example:
545+
546+
```C++
547+
ondemand::parser parser;
548+
const padded_string json = R"({ "parent": {"child1": {"name": "John"} , "child2": {"name": "Daniel"}} })"_padded;
549+
auto doc = parser.iterate(json);
550+
ondemand::object parent = doc["parent"];
551+
// At this point, parent owns the focus
552+
{
553+
ondemand::object c1 = parent["child1"];
554+
// c1 grabbed the focus from parent
555+
if(std::string_view(c1["name"]) != "John") { return false; }
556+
}
557+
// c1 went out of scope, so its destructor was called and the focus
558+
// was handed back to parent.
559+
{
560+
ondemand::object c2 = parent["child2"];
561+
// c2 grabbed the focus from parent
562+
// the following is safe:
563+
if(std::string_view(c2["name"]) != "Daniel") { return false; }
564+
}
565+
```
566+
504567
### Benefits of the On Demand Approach
505568

506569
We expect that the On Demand approach has many of the performance benefits of the schema-based approach, while providing a flexibility that is similar to that of the DOM-based approach.

0 commit comments

Comments
 (0)