diff --git a/.appveyor.yml b/.appveyor.yml index ec5aa9db79..cdf8f00d4c 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -4,30 +4,40 @@ configuration: Release image: Visual Studio 2019 platform: x64 +cache: + - C:\dependencies -> dependencies\CMakeLists.txt + environment: + # Forward slash is used because this is used in CMake as is + simdjson_DEPENDENCY_CACHE_DIR: C:/dependencies + matrix: - job_name: VS2019 - CMAKE_ARGS: -A %Platform% + CMAKE_ARGS: -A %Platform% - job_name: VS2019ARM - CMAKE_ARGS: -A ARM64 -DCMAKE_CROSSCOMPILING=1 -D SIMDJSON_GOOGLE_BENCHMARKS=OFF # Does Google Benchmark builds under VS ARM? + CMAKE_ARGS: -A ARM64 -DSIMDJSON_DEVELOPER_MODE=ON -DCMAKE_CROSSCOMPILING=1 -D SIMDJSON_GOOGLE_BENCHMARKS=OFF # Does Google Benchmark builds under VS ARM? - job_name: VS2017 (Static, No Threads) image: Visual Studio 2017 - CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF - CTEST_ARGS: -E checkperf + CMAKE_ARGS: -A %Platform% -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_ENABLE_THREADS=OFF + CTEST_ARGS: -LE explicitonly - job_name: VS2019 (Win32) platform: Win32 - CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway. - CTEST_ARGS: -E checkperf + CMAKE_ARGS: -A %Platform% -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=ON -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway. 
+ CTEST_ARGS: -LE explicitonly + - job_name: VS2019 (Win32, No Exceptions) + platform: Win32 + CMAKE_ARGS: -A %Platform% -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=ON -DSIMDJSON_ENABLE_THREADS=ON -DSIMDJSON_EXCEPTIONS=OFF + CTEST_ARGS: -LE explicitonly - job_name: VS2015 image: Visual Studio 2015 - CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF - CTEST_ARGS: -E checkperf + CMAKE_ARGS: -A %Platform% -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_ENABLE_THREADS=OFF + CTEST_ARGS: -LE explicitonly build_script: - mkdir build - cd build - cmake --version - - cmake %CMAKE_ARGS% --parallel .. + - cmake %CMAKE_ARGS% .. - cmake -LH .. - cmake --build . --config %Configuration% --verbose --parallel diff --git a/.circleci/config.yml b/.circleci/config.yml index 17ed28962c..9b1f48ed79 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,5 +1,8 @@ version: 2.1 + +# We constantly run out of memory so please do not use parallelism (-j, -j4). 
+ # Reusable image / compiler definitions executors: gcc8: @@ -8,8 +11,8 @@ executors: environment: CXX: g++-8 CC: gcc-8 - BUILD_FLAGS: -j - CTEST_FLAGS: -j4 --output-on-failure + CMAKE_BUILD_FLAGS: + CTEST_FLAGS: --output-on-failure gcc9: docker: @@ -17,8 +20,8 @@ executors: environment: CXX: g++-9 CC: gcc-9 - BUILD_FLAGS: -j - CTEST_FLAGS: -j4 --output-on-failure + CMAKE_BUILD_FLAGS: + CTEST_FLAGS: --output-on-failure gcc10: docker: @@ -26,8 +29,8 @@ executors: environment: CXX: g++-10 CC: gcc-10 - BUILD_FLAGS: -j - CTEST_FLAGS: -j4 --output-on-failure + CMAKE_BUILD_FLAGS: + CTEST_FLAGS: --output-on-failure clang10: docker: @@ -35,8 +38,8 @@ executors: environment: CXX: clang++-10 CC: clang-10 - BUILD_FLAGS: -j - CTEST_FLAGS: -j4 --output-on-failure + CMAKE_BUILD_FLAGS: + CTEST_FLAGS: --output-on-failure clang9: docker: @@ -44,8 +47,8 @@ executors: environment: CXX: clang++-9 CC: clang-9 - BUILD_FLAGS: -j - CTEST_FLAGS: -j4 --output-on-failure + CMAKE_BUILD_FLAGS: + CTEST_FLAGS: --output-on-failure clang6: docker: @@ -53,11 +56,24 @@ executors: environment: CXX: clang++-6.0 CC: clang-6.0 - BUILD_FLAGS: -j - CTEST_FLAGS: -j4 --output-on-failure + CMAKE_BUILD_FLAGS: + CTEST_FLAGS: --output-on-failure # Reusable test commands (and initializer for clang 6) commands: + dependency_restore: + steps: + - restore_cache: + keys: + - cmake-cache-{{ checksum "dependencies/CMakeLists.txt" }} + + dependency_cache: + steps: + - save_cache: + key: cmake-cache-{{ checksum "dependencies/CMakeLists.txt" }} + paths: + - dependencies/.cache + install_cmake: steps: - run: apt-get update -qq @@ -68,33 +84,55 @@ commands: - checkout - run: mkdir -p build - cmake_build: + cmake_build_cache: steps: - cmake_prep - - run: | - cd build && - cmake $CMAKE_FLAGS -DCMAKE_INSTALL_PREFIX:PATH=destination .. && - make $BUILD_FLAGS all + - dependency_restore + - run: cmake -DSIMDJSON_DEVELOPER_MODE=ON $CMAKE_FLAGS -DCMAKE_INSTALL_PREFIX:PATH=destination -B build . 
+ - dependency_cache # dependencies are produced in the configure step + + cmake_build: + steps: + - cmake_build_cache + - run: cmake --build build cmake_test: steps: - cmake_build - run: | - cd build && tools/json2json -h && + cd build && + tools/json2json -h && ctest $CTEST_FLAGS -L acceptance && - ctest $CTEST_FLAGS -LE acceptance -E checkperf + ctest $CTEST_FLAGS -LE acceptance -LE explicitonly + + cmake_assert_test: + steps: + - run: | + cd build && + tools/json2json -h && + ctest $CTEST_FLAGS -L assert cmake_test_all: steps: - cmake_build - run: | - cd build && tools/json2json -h && - ctest $CTEST_FLAGS -L acceptance -LE per_implementation && - SIMDJSON_FORCE_IMPLEMENTATION=haswell ctest $CTEST_FLAGS -L per_implementation && - SIMDJSON_FORCE_IMPLEMENTATION=westmere ctest $CTEST_FLAGS -L per_implementation &&SIMDJSON_FORCE_IMPLEMENTATION=fallback ctest $CTEST_FLAGS -L per_implementation && + cd build && + tools/json2json -h && + ctest $CTEST_FLAGS -DSIMDJSON_IMPLEMENTATION="haswell;westmere;fallback" -L acceptance -LE per_implementation && + SIMDJSON_FORCE_IMPLEMENTATION=haswell ctest $CTEST_FLAGS -L per_implementation -LE explicitonly && + SIMDJSON_FORCE_IMPLEMENTATION=westmere ctest $CTEST_FLAGS -L per_implementation -LE explicitonly && + SIMDJSON_FORCE_IMPLEMENTATION=fallback ctest $CTEST_FLAGS -L per_implementation -LE explicitonly && ctest $CTEST_FLAGS -LE "acceptance|per_implementation" # Everything we haven't run yet, run now. 
+ cmake_perftest: + steps: + - cmake_build_cache + - run: | + cmake -DSIMDJSON_ENABLE_DOM_CHECKPERF=ON --build build --target checkperf && + cd build && + ctest --output-on-failure -R checkperf + # we not only want cmake to build and run tests, but we want also a successful installation from which we can build, link and run programs cmake_install_test: # this version builds, install, test and then verify from the installation steps: @@ -113,49 +151,73 @@ jobs: executor: gcc10 environment: { CMAKE_FLAGS: -DSIMDJSON_JUST_LIBRARY=ON } steps: [ cmake_build, cmake_install_test, cmake_installed_test_cxx20 ] + assert-gcc10: + description: Build the library with asserts on, install it and run tests + executor: gcc10 + environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DCMAKE_CXX_FLAGS_RELEASE=-O3 } + steps: [ cmake_test, cmake_assert_test ] + assert-clang10: + description: Build just the library, install it and do a basic test + executor: clang10 + environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DCMAKE_CXX_FLAGS_RELEASE=-O3 } + steps: [ cmake_test, cmake_assert_test ] + gcc10-perftest: + description: Build and run performance tests on GCC 10 and AVX 2 with a cmake static build, this test performance regression + executor: gcc10 + environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DBUILD_SHARED_LIBS=OFF } + steps: [ cmake_perftest ] gcc10: - description: Build and run tests on GCC 10 and AVX 2 with a cmake static build, this test performance regression + description: Build and run tests on GCC 10 and AVX 2 with a cmake static build executor: gcc10 - environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_BUILD_STATIC=ON } - steps: [ cmake_test_all, cmake_install_test, cmake_installed_test_cxx20 ] + environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DBUILD_SHARED_LIBS=OFF } + steps: [ cmake_test, cmake_install_test, cmake_installed_test_cxx20 ] clang6: description: Build and run tests on clang 6 and AVX 2 with a 
cmake static build executor: clang6 - environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_BUILD_STATIC=ON } + environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DBUILD_SHARED_LIBS=OFF } steps: [ cmake_test, cmake_install_test ] clang10: description: Build and run tests on clang 10 and AVX 2 with a cmake static build executor: clang10 - environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_BUILD_STATIC=ON } + environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DBUILD_SHARED_LIBS=OFF } steps: [ cmake_test, cmake_install_test, cmake_installed_test_cxx20 ] # libcpp libcpp-clang10: description: Build and run tests on clang 10 and AVX 2 with a cmake static build and libc++ executor: clang10 - environment: { CMAKE_FLAGS: -DSIMDJSON_USE_LIBCPP=ON -DSIMDJSON_BUILD_STATIC=ON } + environment: { CMAKE_FLAGS: -DSIMDJSON_USE_LIBCPP=ON -DBUILD_SHARED_LIBS=OFF } steps: [ cmake_test, cmake_install_test, cmake_installed_test_cxx20 ] # sanitize sanitize-gcc10: description: Build and run tests on GCC 10 and AVX 2 with a cmake sanitize build executor: gcc10 - environment: { CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_SANITIZE=ON, BUILD_FLAGS: "", CTEST_FLAGS: -j4 --output-on-failure -E checkperf } + environment: { CMAKE_FLAGS: -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS=ON -DSIMDJSON_SANITIZE=ON, CTEST_FLAGS: --output-on-failure -LE explicitonly } steps: [ cmake_test ] sanitize-clang10: description: Build and run tests on clang 10 and AVX 2 with a cmake sanitize build executor: clang10 - environment: { CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_SANITIZE=ON, CTEST_FLAGS: -j4 --output-on-failure -E checkperf } + environment: { CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DSIMDJSON_NO_FORCE_INLINING=ON -DSIMDJSON_SANITIZE=ON, CTEST_FLAGS: --output-on-failure -LE explicitonly } + steps: [ cmake_test ] + threadsanitize-gcc10: + description: Build and run tests on GCC 10 and AVX 2 with a cmake sanitize build + executor: gcc10 + 
environment: { CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DSIMDJSON_SANITIZE_THREADS=ON, CTEST_FLAGS: --output-on-failure -LE explicitonly } + steps: [ cmake_test ] + threadsanitize-clang10: + description: Build and run tests on clang 10 and AVX 2 with a cmake sanitize build + executor: clang10 + environment: { CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DSIMDJSON_NO_FORCE_INLINING=ON -DSIMDJSON_SANITIZE_THREADS=ON, CTEST_FLAGS: --output-on-failure -LE explicitonly } steps: [ cmake_test ] - # dynamic dynamic-gcc10: description: Build and run tests on GCC 10 and AVX 2 with a cmake dynamic build executor: gcc10 - environment: { CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF } + environment: { CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON } steps: [ cmake_test, cmake_install_test ] dynamic-clang10: description: Build and run tests on clang 10 and AVX 2 with a cmake dynamic build executor: clang10 - environment: { CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF } + environment: { CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON } steps: [ cmake_test, cmake_install_test ] # unthreaded @@ -188,19 +250,24 @@ jobs: # make (test and checkperf) arch-haswell-gcc10: - description: Build, run tests and check performance on GCC 7 with -march=haswell + description: Build, run tests and check performance on GCC 10 with -march=haswell executor: gcc10 environment: { CXXFLAGS: -march=haswell } steps: [ cmake_test ] arch-nehalem-gcc10: - description: Build, run tests and check performance on GCC 7 with -march=nehalem + description: Build, run tests and check performance on GCC 10 with -march=nehalem executor: gcc10 environment: { CXXFLAGS: -march=nehalem } steps: [ cmake_test ] - no-computed-goto-gcc10: - description: Build, run tests and check performance on GCC 7 with -DSIMDJSON_NO_COMPUTED_GOTO=true + sanitize-haswell-gcc10: + description: Build and run tests on GCC 10 and AVX 2 with a cmake sanitize build executor: gcc10 - environment: { CXXFLAGS: -DSIMDJSON_NO_COMPUTED_GOTO=true } + environment: { CXXFLAGS: -march=haswell, 
CMAKE_FLAGS: -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS=ON -DSIMDJSON_SANITIZE=ON, CTEST_FLAGS: --output-on-failure -LE explicitonly } + steps: [ cmake_test ] + sanitize-haswell-clang10: + description: Build and run tests on clang 10 and AVX 2 with a cmake sanitize build + executor: clang10 + environment: { CXXFLAGS: -march=haswell, CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DSIMDJSON_NO_FORCE_INLINING=ON -DSIMDJSON_SANITIZE=ON, CTEST_FLAGS: --output-on-failure -LE explicitonly } steps: [ cmake_test ] workflows: @@ -219,6 +286,8 @@ workflows: # full single-implementation tests - sanitize-gcc10 - sanitize-clang10 + - threadsanitize-gcc10 + - threadsanitize-clang10 - dynamic-gcc10 - dynamic-clang10 - unthreaded-gcc10 @@ -231,9 +300,17 @@ workflows: # quicker make single-implementation tests - arch-haswell-gcc10 - arch-nehalem-gcc10 - - no-computed-goto-gcc10 + + + # sanitized single-implementation tests + - sanitize-haswell-gcc10 + - sanitize-haswell-clang10 # testing "just the library" - justlib-gcc10 + # testing asserts + - assert-gcc10 + - assert-clang10 + # TODO add windows: https://circleci.com/docs/2.0/configuration-reference/#windows diff --git a/.cirrus.yml b/.cirrus.yml index abef7f33ed..e6cff0c37d 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -6,6 +6,11 @@ task: env: ASSUME_ALWAYS_YES: YES + simdjson_DEPENDENCY_CACHE_DIR: $HOME/.dep_cache + dep_cache: + folder: $HOME/.dep_cache + reupload_on_changes: false + fingerprint_script: cat dependencies/CMakeLists.txt setup_script: - pkg update -f - pkg install bash @@ -14,8 +19,8 @@ task: build_script: - mkdir build - cd build - - cmake .. - - make -j4 + - cmake -DSIMDJSON_BASH=OFF -DSIMDJSON_GIT=OFF .. 
+ - make test_script: - cd build - - ctest -j4 --output-on-failure -E checkperf \ No newline at end of file + - ctest --output-on-failure -LE explicitonly diff --git a/.clangd b/.clangd new file mode 100644 index 0000000000..df5af657cc --- /dev/null +++ b/.clangd @@ -0,0 +1,47 @@ +CompileFlags: + CompilationDatabase: build + Add: + - -Werror -Wall -Wextra -Weffc++ -Wsign-compare -Wshadow -Wwrite-strings -Wpointer-arith -Winit-self -Wconversion -Wno-sign-conversion + - -Wundefined-inline +Diagnostics: + Suppress: + - misc-unused-alias-decls + - misc-unused-using-decls + - misc-definitions-in-headers # TODO fix and remove these violations +--- +If: + PathMatch: + - include/.* + - src/.* + PathExclude: + - include/simdjson.h + - src/simdjson.cpp +CompileFlags: + Add: + - -Wno-unneeded-internal-declaration + - -Wno-undefined-internal # TODO fix and remove these violations + - -Wno-unused-function + - -Wno-unused-const-variable +Diagnostics: + Suppress: + - pp_including_mainfile_in_preamble + - unused-includes +--- +# Amalgamated files that require or partly define an implementation +If: + PathMatch: + - .*/(arm64|fallback|haswell|icelake|ppc64|westmere)/begin.h + - .*/generic/.* +Diagnostics: + Suppress: + - pragma_attribute_no_pop_eof +--- +# clang has a bad time detecting the push/pop together in src/ for some reason +If: + PathMatch: + - include/simdjson/.*/end.h + - src/(arm64|fallback|haswell|icelake|ppc64|westmere).cpp +Diagnostics: + Suppress: + - pragma_attribute_no_pop_eof + - pragma_attribute_stack_mismatch diff --git a/.dockerignore b/.dockerignore index 60a69cf989..6736aa1b5f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,7 +1,7 @@ * !.git !Makefile -!amalgamate.sh +!amalgamate.py !benchmark !dependencies !include @@ -12,4 +12,4 @@ !src !style !tests -!tools \ No newline at end of file +!tools diff --git a/.drone.yml b/.drone.yml index df0bfdd564..90236f4812 100644 --- a/.drone.yml +++ b/.drone.yml @@ -1,46 +1,4 @@ kind: pipeline -name: i386-gcc # we 
do not support 32-bit systems, but we run tests -platform: { os: linux, arch: amd64 } -steps: -- name: Build and Test - image: i386/ubuntu - environment: - CC: gcc - CXX: g++ - BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=ON - CTEST_FLAGS: -j4 --output-on-failure -E checkperf - commands: - - apt-get update -qq - - apt-get install -y g++ cmake gcc - - mkdir build - - cd build - - cmake $CMAKE_FLAGS .. - - cmake --build . $BUILD_FLAGS - - ctest $CTEST_FLAGS ---- -kind: pipeline -name: i386-clang # we do not support 32-bit systems, but we run tests -platform: { os: linux, arch: amd64 } -steps: -- name: Build and Test - image: i386/ubuntu - environment: - CC: clang-6.0 - CXX: clang++-6.0 - BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=ON - CTEST_FLAGS: -j4 --output-on-failure -E checkperf - commands: - - apt-get update -qq - - apt-get install -y clang++-6.0 cmake - - mkdir build - - cd build - - cmake $CMAKE_FLAGS .. - - cmake --build . $BUILD_FLAGS - - ctest $CTEST_FLAGS ---- -kind: pipeline name: gcc9 platform: { os: linux, arch: amd64 } steps: @@ -50,16 +8,18 @@ steps: CC: gcc CXX: g++ BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=ON - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_IMPLEMENTATION=icelake;haswell;westmere;fallback + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: + - echo "deb http://deb.debian.org/debian buster-backports main" >> /etc/apt/sources.list - apt-get update -qq - - apt-get install -y cmake + - apt-get -t buster-backports install -y cmake - mkdir build - cd build - cmake $CMAKE_FLAGS .. - cmake --build . 
$BUILD_FLAGS - ctest $CTEST_FLAGS -L acceptance -LE per_implementation + - SIMDJSON_FORCE_IMPLEMENTATION=icelake ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=haswell ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=westmere ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=fallback ctest $CTEST_FLAGS -L per_implementation @@ -76,14 +36,15 @@ steps: CC: clang-6.0 CXX: clang++-6.0 BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=ON - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_IMPLEMENTATION=icelake;haswell;westmere;fallback + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: - mkdir build - cd build - cmake $CMAKE_FLAGS .. - cmake --build . $BUILD_FLAGS - ctest $CTEST_FLAGS -L acceptance -LE per_implementation + - SIMDJSON_FORCE_IMPLEMENTATION=icelake ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=haswell ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=westmere ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=fallback ctest $CTEST_FLAGS -L per_implementation @@ -99,11 +60,12 @@ steps: CC: gcc CXX: g++ BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: + - echo "deb http://deb.debian.org/debian buster-backports main" >> /etc/apt/sources.list - apt-get update -qq - - apt-get install -y cmake + - apt-get -t buster-backports install -y cmake - mkdir build - cd build - cmake $CMAKE_FLAGS .. 
@@ -120,9 +82,9 @@ steps: environment: CC: clang-9 CXX: clang++-9 - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON BUILD_FLAGS: -- -j - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: - mkdir build - cd build @@ -140,16 +102,18 @@ steps: CC: gcc CXX: g++ BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=ON - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_IMPLEMENTATION=icelake;haswell;westmere;fallback + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: + - echo "deb http://deb.debian.org/debian buster-backports main" >> /etc/apt/sources.list - apt-get update -qq - - apt-get install -y cmake + - apt-get -t buster-backports install -y cmake - mkdir build - cd build - cmake $CMAKE_FLAGS .. - cmake --build . $BUILD_FLAGS - ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L acceptance -LE per_implementation + - SIMDJSON_FORCE_IMPLEMENTATION=icelake ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=haswell ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=westmere ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=fallback ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L per_implementation @@ -165,21 +129,43 @@ steps: environment: CC: clang-9 CXX: clang++-9 - CMAKE_FLAGS: -DSIMDJSON_SANITIZE=ON + CMAKE_FLAGS: -DSIMDJSON_SANITIZE=ON -DSIMDJSON_IMPLEMENTATION=icelake;haswell;westmere;fallback BUILD_FLAGS: -- -j - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: - mkdir build - cd build - cmake $CMAKE_FLAGS .. - cmake --build . 
$BUILD_FLAGS - ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L acceptance -LE per_implementation + - SIMDJSON_FORCE_IMPLEMENTATION=icelake ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=haswell ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=westmere ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L per_implementation - SIMDJSON_FORCE_IMPLEMENTATION=fallback ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -L per_implementation - ASAN_OPTIONS="detect_leaks=0" ctest $CTEST_FLAGS -LE "acceptance|per_implementation" # Everything we haven't run yet, run now. --- kind: pipeline +name: cpp20-clang11-libcpp +platform: { os: linux, arch: amd64 } +steps: +- name: Build and Test + image: pauldreik/llvm-11 + user: root + environment: + CC: clang-11 + CXX: clang++-11 + CMAKE_FLAGS: -GNinja + BUILD_FLAGS: + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly + CXXFLAGS: -std=c++20 -stdlib=libc++ + commands: + - mkdir build + - cd build + - cmake $CMAKE_FLAGS .. + - cmake --build . $BUILD_FLAGS + - ctest $CTEST_FLAGS +--- +kind: pipeline name: arm64-gcc8 platform: { os: linux, arch: arm64 } steps: @@ -189,11 +175,12 @@ steps: CC: gcc CXX: g++ BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=ON - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_IMPLEMENTATION=arm64;fallback + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: + - echo "deb http://deb.debian.org/debian buster-backports main" >> /etc/apt/sources.list - apt-get update -qq - - apt-get install -y cmake + - apt-get -t buster-backports install -y cmake - mkdir build - cd build - cmake $CMAKE_FLAGS .. 
@@ -208,16 +195,17 @@ name: arm64-clang6 platform: { os: linux, arch: arm64 } steps: - name: Build and Test - image: ubuntu:18.04 + image: debian:buster-backports environment: CC: clang-6.0 CXX: clang++-6.0 - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON BUILD_FLAGS: -- -j - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: - - apt-get update -qq - - apt-get install -y clang cmake git + - apt-get -qq update + - apt-get -t buster-backports install -y cmake + - apt-get install -y clang-6.0 git - mkdir build - cd build - cmake $CMAKE_FLAGS .. @@ -234,11 +222,12 @@ steps: CC: gcc CXX: g++ BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: + - echo "deb http://deb.debian.org/debian buster-backports main" >> /etc/apt/sources.list - apt-get update -qq - - apt-get install -y cmake + - apt-get -t buster-backports install -y cmake - mkdir build - cd build - cmake $CMAKE_FLAGS .. @@ -250,16 +239,17 @@ name: arm64-dynamic-clang6 platform: { os: linux, arch: arm64 } steps: - name: Build and Test - image: ubuntu:18.04 + image: debian:buster-backports environment: CC: clang-6.0 CXX: clang++-6.0 - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON BUILD_FLAGS: -- -j - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: - - apt-get update -qq - - apt-get install -y clang cmake git + - apt-get -qq update + - apt-get -t buster-backports install -y cmake + - apt-get install -y clang-6.0 git - mkdir build - cd build - cmake $CMAKE_FLAGS .. 
@@ -274,13 +264,15 @@ steps: image: gcc:8 environment: BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=ON - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_IMPLEMENTATION=arm64;fallback + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly CC: gcc CXX: g++ commands: + - echo "deb http://deb.debian.org/debian buster-backports main" >> /etc/apt/sources.list - apt-get update -qq - - apt-get install -y cmake libstdc++6 + - apt-get -t buster-backports install -y cmake + - apt-get install -y libstdc++6 - mkdir build - cd build - cmake $CMAKE_FLAGS .. @@ -295,16 +287,17 @@ name: arm64-sanitize-clang6 platform: { os: linux, arch: arm64 } steps: - name: Build and Test - image: ubuntu:18.04 + image: debian:buster-backports environment: CC: clang-6.0 CXX: clang++-6.0 - CMAKE_FLAGS: -DSIMDJSON_SANITIZE=ON + CMAKE_FLAGS: -DSIMDJSON_SANITIZE=ON -DSIMDJSON_IMPLEMENTATION=arm64;fallback BUILD_FLAGS: -- -j - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: - - apt-get update -qq - - apt-get install -y clang cmake git + - apt-get -qq update + - apt-get -t buster-backports install -y cmake + - apt-get install -y clang-6.0 git - mkdir build - cd build - cmake $CMAKE_FLAGS .. @@ -325,12 +318,10 @@ steps: CC: clang-9 CXX: clang++-9 BUILD_FLAGS: -- -j 4 - CMAKE_FLAGS: -GNinja -DSIMDJSON_BUILD_STATIC=ON - CTEST_FLAGS: -j4 --output-on-failure + CMAKE_FLAGS: -GNinja -DBUILD_SHARED_LIBS=OFF + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly CXXFLAGS: -stdlib=libc++ commands: - - apt-get update -qq - - apt-get install -y cmake - mkdir build - cd build - cmake $CMAKE_FLAGS .. 
@@ -348,12 +339,31 @@ steps: CC: clang-9 CXX: clang++-9 BUILD_FLAGS: -- -j - CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=ON - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=OFF + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly + CXXFLAGS: -stdlib=libc++ + commands: + - mkdir build + - cd build + - cmake $CMAKE_FLAGS .. + - cmake --build . $BUILD_FLAGS + - ctest $CTEST_FLAGS +--- +kind: pipeline +name: libcpp-clang7 +platform: { os: linux, arch: amd64 } +steps: +- name: Build and Test + image: conanio/clang7 + user: root + environment: + CC: clang-7 + CXX: clang++-7 + BUILD_FLAGS: -- -j + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=OFF + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly CXXFLAGS: -stdlib=libc++ commands: - - apt-get update -qq - - apt-get install -y cmake - mkdir build - cd build - cmake $CMAKE_FLAGS .. @@ -371,10 +381,11 @@ steps: CXX: g++ BUILD_FLAGS: -- -j CMAKE_FLAGS: -DSIMDJSON_EXCEPTIONS=OFF - CTEST_FLAGS: -j4 --output-on-failure -E checkperf + CTEST_FLAGS: -j4 --output-on-failure -LE explicitonly commands: + - echo "deb http://deb.debian.org/debian buster-backports main" >> /etc/apt/sources.list - apt-get update -qq - - apt-get install -y cmake + - apt-get -t buster-backports install -y cmake - mkdir build - cd build - cmake $CMAKE_FLAGS .. 
@@ -382,6 +393,26 @@ steps: - ctest $CTEST_FLAGS --- kind: pipeline +name: arm64-fuzz +platform: { os: linux, arch: arm64 } +steps: +- name: Build and run fuzzers shortly + image: ubuntu:20.04 + environment: + CC: clang + CXX: clang++ + DEBIAN_FRONTEND: noninteractive + ASAN_OPTIONS: detect_leaks=0 + commands: + - apt-get -qq update + - apt-get install -q -y clang cmake git wget zip ninja-build + - wget -O corpus.tar.gz https://readonly:readonly@www.pauldreik.se/fuzzdata/index.php?project=simdjson + - tar xf corpus.tar.gz && rm corpus.tar.gz + - fuzz/build_like_ossfuzz.sh + - mkdir -p common_out + - for fuzzer in build/fuzz/fuzz_* ; do echo $fuzzer;$fuzzer common_out out/* -max_total_time=40; done +--- +kind: pipeline name: stylecheck platform: { os: linux, arch: amd64 } steps: diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000..d75a448dd7 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +# https://editorconfig.org/ +root = true +# Conservatively avoid changing defaults for other file types, e.g. raw json files for test cases, +# Makefiles, etc. 
+[*.{cpp,h,md}] +charset = utf-8 +end_of_line = lf +indent_size = 2 +indent_style = space +insert_final_newline = true +tab_width = 2 +trim_trailing_whitespace = true diff --git a/.gitattributes b/.gitattributes index 8b79783fa9..aef3e3848b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,7 +3,7 @@ * text=auto # we don't want json files to be modified for this project -*.json binary +*.json binary diff=astextplain # Common settings that generally should always be used with your language specific settings @@ -78,6 +78,7 @@ .gitattributes export-ignore .gitignore export-ignore +.editorconfig export-ignore # Sources *.c text eol=lf diff=c diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 2910f12619..5efc3016d4 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -2,7 +2,7 @@ name: Bug report about: Create a report to help us improve title: '' -labels: bug +labels: bug (unverified) assignees: '' --- @@ -12,23 +12,51 @@ Before submitting an issue, please ensure that you have read the documentation: * Basics is an overview of how to use simdjson and its APIs: https://github.com/simdjson/simdjson/blob/master/doc/basics.md * Performance shows some more advanced scenarios and how to tune for them: https://github.com/simdjson/simdjson/blob/master/doc/performance.md * Contributing: https://github.com/simdjson/simdjson/blob/master/CONTRIBUTING.md +* We follow the [JSON specification as described by RFC 8259](https://www.rfc-editor.org/rfc/rfc8259.txt) (T. Bray, 2017). If you wish to support features that are not part of RFC 8259, then you should not refer to your issue as a bug. **Describe the bug** -A clear and concise description of what the bug is. +A clear and concise description of what the bug is. A bug is a failure to build with normal compiler settings or a misbehaviour: when running the code, you get a result that differs from the expected result from our documentation. 
+ +A compiler or static-analyzer warning is not a bug. It is possible with tools such as Visual Studio to require that rarely enabled warnings are considered errors. Do not report such cases as bugs. We do accept pull requests if you want to silence warnings issued by code analyzers, however. + +We are committed to providing good documentation. We accept the lack of documentation or a misleading documentation as a bug (a 'documentation bug'). + +An unexpected poor software performance can be accepted as a bug (a 'performance bug'). + +We accept the identification of an issue by a sanitizer or some checker tool (e.g., valgrind) as a bug, but you must first ensure that it is not a false positive. + +We recommend that you run your tests using different optimization levels. In particular, we recommend you run tests with the simdjson library and your code compiled in debug mode. The simdjson library then sets the SIMDJSON_DEVELOPMENT_CHECKS macro to 1, and this triggers additional checks on your code and on the internals of the library. If possible, we recommend that you run tests with sanitizers (e.g., see [No more leaks with sanitize flags in gcc and clang](https://lemire.me/blog/2016/04/20/no-more-leaks-with-sanitize-flags-in-gcc-and-clang/)). You can compile the library with sanitizers for debugging purposes (e.g., set SIMDJSON_SANITIZE to ON using CMake), but you should also turn on sanitizers on your own code. You may also use tools like valgrind or the commercial equivalent. + +Before reporting a bug, please ensure that you have read our documentation. **To Reproduce** -Steps to reproduce the behaviour: provide a code sample if possible. +Steps to reproduce the behaviour: provide a code sample if possible. Please provide a complete test with data. Remember that a bug is either a failure to build or an unexpected result when running the code. 
-**Configuration (please complete the following information if relevant):**
+If we cannot reproduce the issue, then we cannot address it. Note that a stack trace from your own program is not enough. A sample of your source code is insufficient: please provide a complete test for us to reproduce the issue. Please reduce the issue: use as small and as simple an example of the bug as possible.
+
+It should be possible to trigger the bug by using solely simdjson with our default build setup. If you can only observe the bug within some specific context, with some other software, please reduce the issue first.
+
+**simdjson release**
+
+Unless you plan to contribute to simdjson, you should only work from releases. Please be mindful that our main branch may have additional features, bugs and documentation items.
+
+It is fine to report bugs against our main branch, but if that is what you are doing, please be explicit.
+
+**Configuration (please complete the following information if relevant)**
  - OS: [e.g. Ubuntu 16.04.6 LTS]
- - Compiler [e.g. Apple clang version 11.0.3 (clang-1103.0.32.59) x86_64-apple-darwin19.4.0]
+ - Compiler* [e.g. Apple clang version 11.0.3 (clang-1103.0.32.59) x86_64-apple-darwin19.4.0]
  - Version [e.g. 22]
+ - Optimization setting (e.g., -O3)
+
+We support up-to-date 64-bit ARM and x64 FreeBSD, macOS, Windows and Linux systems. Please ensure that your configuration is supported before labelling the issue as a bug.

-We support up-to-date 64-bit ARM and x64 FreeBSD, macOS, Windows and Linux systems. Please ensure that your configuration is supported before labelling the issue as a bug. In particular, we do not support legacy 32-bit systems.
+* We do not support unreleased or experimental compilers. If you encounter an issue with a
+pre-release version of a compiler, do not report it as a bug to simdjson. However, we always
+invite contributions either in the form of an analysis or of a code contribution.
**Indicate whether you are willing or able to provide a bug fix as a pull request** -If you plan to contribute to simdjson, please read our +If you plan to contribute to simdjson, please read our guide: * CONTRIBUTING guide: https://github.com/simdjson/simdjson/blob/master/CONTRIBUTING.md and our * HACKING guide: https://github.com/simdjson/simdjson/blob/master/HACKING.md diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..3ba13e0cec --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 400c2177ed..5573d7678e 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -2,7 +2,7 @@ name: Feature request about: Suggest an idea for this project title: '' -labels: feature request +labels: '' assignees: '' --- @@ -12,6 +12,7 @@ Before submitting an issue, please ensure that you have read the documentation: * Basics is an overview of how to use simdjson and its APIs: https://github.com/simdjson/simdjson/blob/master/doc/basics.md * Performance shows some more advanced scenarios and how to tune for them: https://github.com/simdjson/simdjson/blob/master/doc/performance.md * Contributing: https://github.com/simdjson/simdjson/blob/master/CONTRIBUTING.md +* We follow the [JSON specification as described by RFC 8259](https://www.rfc-editor.org/rfc/rfc8259.txt) (T. Bray, 2017). We do not make changes to simdjson without clearly identifiable benefits, which typically means either performance improvements, bug fixes or new features. Avoid bike-shedding: we all have opinions about how to write code, but we want to focus on what makes simdjson objectively better. 
@@ -30,7 +31,7 @@ A clear and concise description of any alternative solutions or features you've **Additional context** Add any other context or screenshots about the feature request here. -** Are you willing to contribute code or documentation toward this new feature? ** -If you plan to contribute to simdjson, please read our +**Are you willing to contribute code or documentation toward this new feature?** +If you plan to contribute to simdjson, please read our * CONTRIBUTING guide: https://github.com/simdjson/simdjson/blob/master/CONTRIBUTING.md and our * HACKING guide: https://github.com/simdjson/simdjson/blob/master/HACKING.md diff --git a/.github/ISSUE_TEMPLATE/standard-issue-template.md b/.github/ISSUE_TEMPLATE/standard-issue-template.md index 99a8b995d1..74b4a55545 100644 --- a/.github/ISSUE_TEMPLATE/standard-issue-template.md +++ b/.github/ISSUE_TEMPLATE/standard-issue-template.md @@ -12,12 +12,13 @@ Before submitting an issue, please ensure that you have read the documentation: * Basics is an overview of how to use simdjson and its APIs: https://github.com/simdjson/simdjson/blob/master/doc/basics.md * Performance shows some more advanced scenarios and how to tune for them: https://github.com/simdjson/simdjson/blob/master/doc/performance.md * Contributing: https://github.com/simdjson/simdjson/blob/master/CONTRIBUTING.md +* We follow the [JSON specification as described by RFC 8259](https://www.rfc-editor.org/rfc/rfc8259.txt) (T. Bray, 2017). We do not make changes to simdjson without clearly identifiable benefits, which typically means either performance improvements, bug fixes or new features. Avoid bike-shedding: we all have opinions about how to write code, but we want to focus on what makes simdjson objectively better. Is your issue: -1. A bug report? If so, please point at a reproducible test. Indicate whether you are willing or able to provide a bug fix as a pull request. +1. A bug report? If so, please point at a reproducible test. 
Indicate whether you are willing or able to provide a bug fix as a pull request. As a matter of policy, we do not consider a compiler warning to be a bug. 2. A build issue? If so, provide all possible details regarding your system configuration. If we cannot reproduce your issue, we cannot fix it. @@ -26,6 +27,6 @@ Is your issue: 4. A documentation issue? Can you suggest an improvement? -If you plan to contribute to simdjson, please read our +If you plan to contribute to simdjson, please read our * CONTRIBUTING guide: https://github.com/simdjson/simdjson/blob/master/CONTRIBUTING.md and our * HACKING guide: https://github.com/simdjson/simdjson/blob/master/HACKING.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000..31eed92949 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,8 @@ + + +Our tests check whether you have introduced trailing white space. If such a test fails, please check the "artifacts button" above, which if you click it gives a link to a downloadable file to help you identify the issue. You can also run scripts/remove_trailing_whitespace.sh locally if you have a bash shell and the sed command available on your system. 
+
+If you plan to contribute to simdjson, please read our
+
+CONTRIBUTING guide: https://github.com/simdjson/simdjson/blob/master/CONTRIBUTING.md and our
+HACKING guide: https://github.com/simdjson/simdjson/blob/master/HACKING.md
diff --git a/.github/workflows/aarch64.yml b/.github/workflows/aarch64.yml
new file mode 100644
index 0000000000..751a394906
--- /dev/null
+++ b/.github/workflows/aarch64.yml
@@ -0,0 +1,29 @@
+name: Ubuntu aarch64 (GCC 11)
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: uraimo/run-on-arch-action@v3
+        name: Test
+        id: runcmd
+        with:
+          arch: aarch64
+          distro: ubuntu_latest
+          githubToken: ${{ github.token }}
+          install: |
+            apt-get update -q -y
+            apt-get install -y cmake make g++
+          run: |
+            cmake -DSIMDJSON_SANITIZE_UNDEFINED=ON -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_COMPETITION=OFF -B build
+            cmake --build build -j=2
+            ctest --output-on-failure --test-dir build
diff --git a/.github/workflows/alpine.yml b/.github/workflows/alpine.yml
new file mode 100644
index 0000000000..d44adaf638
--- /dev/null
+++ b/.github/workflows/alpine.yml
@@ -0,0 +1,34 @@
+name: Alpine Linux
+
+on: [push, pull_request]
+
+jobs:
+  ubuntu-build:
+    if: >-
+      ! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
+      !
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: start docker + run: | + docker run -w /src -dit --name alpine -v $PWD:/src alpine:latest + echo 'docker exec alpine "$@";' > ./alpine.sh + chmod +x ./alpine.sh + - name: install packages + run: | + ./alpine.sh apk update + ./alpine.sh apk add build-base cmake g++ linux-headers git bash + - name: cmake + run: | + ./alpine.sh cmake -DSIMDJSON_DEVELOPER_MODE=ON -B build_for_alpine + - name: build + run: | + ./alpine.sh cmake --build build_for_alpine + - name: test + run: | + ./alpine.sh bash -c "cd build_for_alpine && ctest -LE explicitonly --output-on-failure" diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml new file mode 100644 index 0000000000..c0addddaf0 --- /dev/null +++ b/.github/workflows/cifuzz.yml @@ -0,0 +1,24 @@ +name: CIFuzz +on: [pull_request] +jobs: + Fuzzing: + runs-on: ubuntu-latest + steps: + - name: Build Fuzzers + id: build + uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master + with: + oss-fuzz-project-name: 'simdjson' + dry-run: false + - name: Run Fuzzers + uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master + with: + oss-fuzz-project-name: 'simdjson' + fuzz-seconds: 600 + dry-run: false + - name: Upload Crash + uses: actions/upload-artifact@v4 + if: failure() && steps.build.outcome == 'success' + with: + name: artifacts + path: ./out/artifacts diff --git a/.github/workflows/debian.yaml b/.github/workflows/debian.yaml new file mode 100644 index 0000000000..2ca975f012 --- /dev/null +++ b/.github/workflows/debian.yaml @@ -0,0 +1,33 @@ +name: Debian + +on: [push, pull_request] + +defaults: + run: + shell: sh + +permissions: + contents: read + +jobs: + pkg-config: + runs-on: ubuntu-latest + container: + image: debian:testing + + steps: + - uses: 
actions/checkout@v4 + + - name: Install dependencies + run: | + apt -y update + apt -y --no-install-recommends install g++ cmake make pkg-config + + - name: Build and install + run: | + cmake -B build + cmake --build build + cmake --install build + + - name: Test pkg-config + run: g++ examples/quickstart/quickstart.cpp $(pkg-config --cflags --libs simdjson) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 0000000000..24df9d1a14 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,33 @@ +name: Doxygen GitHub Pages + +on: + release: + types: [created] + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +permissions: + contents: write + pages: write + id-token: write + +jobs: + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install Doxygen + run: sudo apt-get install doxygen graphviz -y + - run: mkdir docs + - name: Install theme + run: ./tools/prepare_doxygen.sh + - name: Generate Doxygen Documentation + run: doxygen + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: doc/api/html diff --git a/.github/workflows/fix-trailing-whitespace.yml b/.github/workflows/fix-trailing-whitespace.yml new file mode 100644 index 0000000000..049a8138b5 --- /dev/null +++ b/.github/workflows/fix-trailing-whitespace.yml @@ -0,0 +1,34 @@ +name: Detect trailing whitespace + +on: [push, pull_request] + +jobs: + whitespace: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - name: Remove whitespace and check the diff + run: | + set -eu + scripts/remove_trailing_whitespace.sh + git diff >whitespace.patch + cat whitespace.patch + if [ $(wc -c - + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') runs-on: ubuntu-latest env: - allfuzzers: parser dump dump_raw_tape print_json - artifactsprefix: -artifact_prefix=fuzzfailure/ + # fuzzers that change behaviour with SIMDJSON_FORCE_IMPLEMENTATION + defaultimplfuzzers: atpointer dump dump_raw_tape element minify parser print_json + # fuzzers that loop over the implementations themselves, or don't need to switch. + implfuzzers: implementations minifyimpl ndjson ondemand padded utf8 + implementations: haswell westmere fallback + UBSAN_OPTIONS: halt_on_error=1 + MAXLEN: -max_len=4000 + CLANGVERSION: 19 + # which optimization level to use for the sanitizer build (see build_fuzzer.variants.sh) + OPTLEVEL: -O3 + steps: - name: Install packages necessary for building run: | sudo apt update - sudo apt-get install --quiet ninja-build valgrind zip unzip + sudo apt-get install --quiet ninja-build valgrind zip unzip lsb-release wget software-properties-common gnupg wget https://apt.llvm.org/llvm.sh + sudo apt-get purge --auto-remove llvm python3-lldb-15 llvm-15 chmod +x llvm.sh - sudo ./llvm.sh 9 + sudo ./llvm.sh $CLANGVERSION + + - uses: actions/checkout@v4 + + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + + - uses: actions/cache@v4 + id: cache-corpus + with: + path: out/ + key: corpus-${{ github.run_id }} + restore-keys: corpus- + + - name: show statistics for the cached corpus + run: | + echo number of files in github action corpus cache: + find out -type f |wc -l - - uses: actions/checkout@v1 - name: Create and prepare the initial seed corpus run: | fuzz/build_corpus.sh mv corpus.zip seed_corpus.zip - - name: Download the corpus from the last run - run: | - wget --quiet https://dl.bintray.com/pauldreik/simdjson-fuzz-corpus/corpus/corpus.tar - tar xf corpus.tar - rm corpus.tar + mkdir seedcorpus + unzip -q -d seedcorpus seed_corpus.zip + - name: List clang versions run: | ls 
/usr/bin/clang* which clang++ clang++ --version + - name: Build all the variants - run: fuzz/build_fuzzer_variants.sh - - name: Verify that the oss-fuzz seed corpus passes without problems + run: CLANGSUFFIX=-$CLANGVERSION fuzz/build_fuzzer_variants.sh + + - name: Explore fast (release build, default implementation) run: | - mkdir seedcorpus - unzip -q -d seedcorpus seed_corpus.zip - for buildvariant in noavx withavx; do - for fuzzer in $allfuzzers; do - build-ossfuzz-$buildvariant/fuzz/fuzz_$fuzzer seedcorpus -max_total_time=1 - done + set -eux + for fuzzer in $defaultimplfuzzers $implfuzzers; do + mkdir -p out/$fuzzer # in case this is a new fuzzer, or the github action cached corpus is broken + # get input from everyone else (corpus cross pollination) + others=$(find out -type d -not -name $fuzzer -not -name out -not -name cmin) + build-fast/fuzz/fuzz_$fuzzer out/$fuzzer $others seedcorpus -max_total_time=30 $MAXLEN done - - name: Run the fastest fuzzer to explore fast + + - name: Fuzz default impl. 
fuzzers with sanitizer+asserts (good at detecting errors) run: | - for fuzzer in $allfuzzers; do - mkdir -p out/$fuzzer # in case this is a new fuzzer, or corpus.tar is broken - build-ossfuzz-fast9/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=30 $artifactsprefix || touch failed - # make sure the failing output is visible in the log - if [ -e failed ] ; then - ls fuzzfailure/* |xargs -n1 base64 - exit 1 - fi + set -eux + for fuzzer in $defaultimplfuzzers; do + # get input from everyone else (corpus cross pollination) + others=$(find out -type d -not -name $fuzzer -not -name out -not -name cmin) + for implementation in $implementations; do + export SIMDJSON_FORCE_IMPLEMENTATION=$implementation + build-sanitizers$OPTLEVEL/fuzz/fuzz_$fuzzer out/$fuzzer $others seedcorpus -max_total_time=20 $MAXLEN + done + echo now have $(ls out/$fuzzer |wc -l) files in corpus done - - name: Run the other fuzzer variants for $fuzzer, with sanitizers etc + + - name: Fuzz differential impl. fuzzers with sanitizer+asserts (good at detecting errors) run: | - set -x - for fuzzer in $allfuzzers; do - build-ossfuzz-withavx/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=20 $artifactsprefix || touch failed - build-ossfuzz-noavx/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=10 $artifactsprefix || touch failed - build-ossfuzz-noavx9/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=10 $artifactsprefix || touch failed - if [ -e failed ] ; then - # make sure the failing output is visible in the log - ls fuzzfailure/* |xargs -n1 base64 - exit 1 - fi - echo disable msan runs, it fails inside the fuzzing engine and not the fuzzed code! 
- echo build-ossfuzz-msan-noavx9/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=10 -reload=0 $artifactsprefix - echo build-ossfuzz-msan-withavx9/fuzz/fuzz_$fuzzer out/$fuzzer -max_total_time=10 -reload=0 $artifactsprefix - echo now have $(ls out/$fuzzer |wc -l) files in corpus + set -eux + for fuzzer in $implfuzzers; do + # get input from everyone else (corpus cross pollination) + others=$(find out -type d -not -name $fuzzer -not -name out -not -name cmin) + build-sanitizers$OPTLEVEL/fuzz/fuzz_$fuzzer out/$fuzzer $others seedcorpus -max_total_time=20 $MAXLEN + echo now have $(ls out/$fuzzer |wc -l) files in corpus done - - name: Minimize the corpus with the fast fuzzer + + - name: Minimize the corpus with the fast fuzzer on the default implementation run: | - for fuzzer in $allfuzzers; do + set -eux + for fuzzer in $defaultimplfuzzers $implfuzzers; do mkdir -p out/cmin/$fuzzer - build-ossfuzz-fast9/fuzz/fuzz_$fuzzer -merge=1 out/cmin/$fuzzer out/$fuzzer + # get input from everyone else (corpus cross pollination) + others=$(find out -type d -not -name $fuzzer -not -name out -not -name cmin) + build-fast/fuzz/fuzz_$fuzzer -merge=1 $MAXLEN out/cmin/$fuzzer out/$fuzzer $others seedcorpus rm -rf out/$fuzzer mv out/cmin/$fuzzer out/$fuzzer done + - name: Package the corpus into an artifact run: | - for fuzzer in $allfuzzers; do + for fuzzer in $defaultimplfuzzers $implfuzzers; do tar rf corpus.tar out/$fuzzer done + - name: Save the corpus as a github artifact - uses: actions/upload-artifact@v1 + uses: actions/upload-artifact@v4 with: name: corpus path: corpus.tar - - name: Run the corpus through valgrind (normal build) + + - name: Store the corpus externally run: | - for fuzzer in $allfuzzers; do - find out/$fuzzer -type f |sort|xargs valgrind build-plain-noavx/fuzz/fuzz_$fuzzer 2>&1|tee valgrind-$fuzzer-noavx.txt - done - - name: Run the corpus through valgrind (noavx build) + gzip --keep corpus.tar + curl -F"filedata=@corpus.tar.gz" https://simdjson:${{ 
secrets.fuzzdatapassword }}@www.pauldreik.se/fuzzdata/index.php + + # This takes a subset of the minimized corpus and run it through valgrind. It is slow, + # therefore take a "random" subset. The random selection is accomplished by sorting on filenames, + # which are hashes of the content. + - name: Run some of the minimized corpus through valgrind (replay build, default implementation) run: | - for fuzzer in $allfuzzers; do - find out/$fuzzer -type f |sort|xargs valgrind build-plain-normal/fuzz/fuzz_$fuzzer 2>&1|tee valgrind-$fuzzer-normal.txt - done + for fuzzer in $defaultimplfuzzers $implfuzzers; do + find out/$fuzzer -type f |sort|head -n200|xargs -n40 valgrind build-replay/fuzz/fuzz_$fuzzer 2>&1|tee valgrind-$fuzzer.txt + done + - name: Compress the valgrind output run: tar cf valgrind.tar valgrind-*.txt + - name: Save valgrind output as a github artifact - uses: actions/upload-artifact@v1 + uses: actions/upload-artifact@v4 + if: always() with: name: valgrindresults path: valgrind.tar - - name: Upload the corpus and results to bintray if we are on master - run: | - if [ $(git rev-parse --verify HEAD) = $(git rev-parse --verify origin/master) ] ; then - echo uploading each artifact twice, otherwise it will not be published - curl -T corpus.tar -upauldreik:${{ secrets.bintrayApiKey }} https://api.bintray.com/content/pauldreik/simdjson-fuzz-corpus/corpus/0/corpus/corpus.tar";publish=1;override=1" - curl -T corpus.tar -upauldreik:${{ secrets.bintrayApiKey }} https://api.bintray.com/content/pauldreik/simdjson-fuzz-corpus/corpus/0/corpus/corpus.tar";publish=1;override=1" - curl -T valgrind.tar -upauldreik:${{ secrets.bintrayApiKey }} https://api.bintray.com/content/pauldreik/simdjson-fuzz-corpus/corpus/0/corpus/valgrind.tar";publish=1;override=1" - curl -T valgrind.tar -upauldreik:${{ secrets.bintrayApiKey }} https://api.bintray.com/content/pauldreik/simdjson-fuzz-corpus/corpus/0/corpus/valgrind.tar";publish=1;override=1" - else - echo "not on master, won't upload 
to bintray" - fi + if-no-files-found: ignore + + - name: Archive any crashes as an artifact + uses: actions/upload-artifact@v4 + if: always() + with: + name: crashes + path: | + crash-* + leak-* + timeout-* + if-no-files-found: ignore diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml new file mode 100644 index 0000000000..09abafb54c --- /dev/null +++ b/.github/workflows/loongarch64.yml @@ -0,0 +1,50 @@ +name: LoongArch64-CI + +on: [push, pull_request] + +jobs: + loongarch64: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + platform: + - { toolchain-version: 2023.08.08 } + steps: + - uses: actions/checkout@v4 + - name: Install build requirements + run: | + sudo apt-get update -y + sudo apt-get install -y --no-install-recommends cmake + + - uses: actions/cache/restore@v4 + id: restore-cache + with: + path: /opt/cross-tools + key: loongarch64-${{ matrix.platform.toolchain-version }} + + - name: Download LoongArch64 gcc+glibc toolchain + if: ${{ !steps.restore-cache.outputs.cache-hit }} + run: | + url="https://github.com/loongson/build-tools/releases/download/${{ matrix.platform.toolchain-version }}/x86_64-cross-tools-loongarch64-gcc-libc.tar.xz" + + wget "$url" -O /tmp/toolchain.tar.xz + + mkdir -p /opt + tar -C /opt -x -f /tmp/toolchain.tar.xz + + - uses: actions/cache/save@v3 + if: ${{ !steps.restore-cache.outputs.cache-hit }} + with: + path: /opt/cross-tools + key: loongarch64-${{ matrix.platform.toolchain-version }} + + - name: setup Loongarch64 build environment + run: | + echo "/opt/cross-tools/bin" >> $GITHUB_PATH + echo "CC=loongarch64-unknown-linux-gnu-gcc" >> $GITHUB_ENV + echo "CXX=loongarch64-unknown-linux-gnu-g++" >> $GITHUB_ENV + - name: configure + run: cmake -B build -DCMAKE_SYSTEM_PROCESSOR=loongarch64 -DARCH=lonngarch64 -DCMAKE_SYSTEM_NAME=Linux -DCMAKE_C_COMPILER=loongarch64-unknown-linux-gnu-gcc -DCMAKE_CXX_COMPILER=loongarch64-unknown-linux-gnu-g++ + - name: build + run: cmake --build build diff 
--git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 0000000000..1eda9ebe7f --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,44 @@ +name: Macos + +on: [push, pull_request] + +jobs: + macos-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: macos-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake + run: | + mkdir builddebug && + cd builddebug && + cmake -DCMAKE_BUILD_TYPE=Debug -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + cd .. && + mkdir build && + cd build && + cmake -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX:PATH=destination .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + cmake --install . && + echo -e '#include \nint main(int argc,char**argv) {simdjson::dom::parser parser;simdjson::dom::element tweets = parser.load(argv[1]); }' > tmp.cpp && c++ -Idestination/include -Ldestination/lib -std=c++17 -Wl,-rpath,destination/lib -o linkandrun tmp.cpp -lsimdjson && ./linkandrun jsonexamples/twitter.json && + cd ../tests/installation_tests/find && + mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=../../../build/destination .. && cmake --build . + - name: Use cmake (shared) + run: | + mkdir buildshared && + cd buildshared && + cmake -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX:PATH=destination .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + cmake --install . 
&& + echo -e '#include \nint main(int argc,char**argv) {simdjson::dom::parser parser;simdjson::dom::element tweets = parser.load(argv[1]); }' > tmp.cpp && c++ -Idestination/include -Ldestination/lib -std=c++17 -Wl,-rpath,destination/lib -o linkandrun tmp.cpp -lsimdjson && ./linkandrun jsonexamples/twitter.json && + cd ../tests/installation_tests/find && + mkdir buildshared && cd buildshared && cmake -DCMAKE_INSTALL_PREFIX:PATH=../../../buildshared/destination .. && cmake --build . diff --git a/.github/workflows/mingw-ci.yml b/.github/workflows/mingw-ci.yml deleted file mode 100644 index 058225992e..0000000000 --- a/.github/workflows/mingw-ci.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: MinGW32-CI - -on: [push, pull_request] - -# Important: scoop will either install 32-bit GCC or 64-bit GCC, not both. - -# It is important to build static libraries because cmake is not smart enough under Windows/mingw to take care of the path. So -# with a dynamic library, you could get failures due to the fact that the EXE can't find its DLL. - -jobs: - ci: - name: windows-gcc - runs-on: windows-2016 - - env: - CMAKE_GENERATOR: Ninja # This is critical, try ' cmake -GNinja-DSIMDJSON_BUILD_STATIC=ON .. ' if using the command line - CC: gcc - CXX: g++ - - steps: # To reproduce what is below, start a powershell with administrative rights, using scoop *is* a good idea - - uses: actions/checkout@v2 - - - uses: actions/cache@v2 # we cache the scoop setup with 32-bit GCC - id: cache - with: - path: | - C:\ProgramData\scoop - key: scoop32 # static key: should be good forever - - name: Setup Windows # This should almost never run if the cache works. 
- if: steps.cache.outputs.cache-hit != 'true' - shell: powershell - run: | - Invoke-Expression (New-Object System.Net.WebClient).DownloadString('https://get.scoop.sh') - scoop install sudo --global - sudo scoop install git --global - sudo scoop install ninja --global - sudo scoop install cmake --global - sudo scoop install gcc --arch 32bit --global - $env:path - Write-Host 'Everything has been installed, you are good!' - - name: Build and Test 32-bit x86 - shell: powershell - run: | - $ENV:PATH="C:\ProgramData\scoop\shims;C:\ProgramData\scoop\apps\gcc\current\bin;C:\ProgramData\scoop\apps\ninja\current;$ENV:PATH" - g++ --version - cmake --version - ninja --version - git --version - mkdir build32 - cd build32 - cmake -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_COMPETITION=OFF -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_ENABLE_THREADS=OFF .. - cmake --build . --target parse_many_test jsoncheck basictests numberparsingcheck stringparsingcheck errortests integer_tests pointercheck --verbose - ctest . -R "(parse_many_test|jsoncheck|basictests|stringparsingcheck|numberparsingcheck|errortests|integer_tests|pointercheck)" --output-on-failure diff --git a/.github/workflows/mingw64-ci.yml b/.github/workflows/mingw64-ci.yml deleted file mode 100644 index 8b9b278508..0000000000 --- a/.github/workflows/mingw64-ci.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: MinGW64-CI - -on: [push, pull_request] - -# Important: scoop will either install 32-bit GCC or 64-bit GCC, not both. - -# It is important to build static libraries because cmake is not smart enough under Windows/mingw to take care of the path. So -# with a dynamic library, you could get failures due to the fact that the EXE can't find its DLL. - -jobs: - ci: - name: windows-gcc - runs-on: windows-2016 - - env: - CMAKE_GENERATOR: Ninja # This is critical, try ' cmake -GNinja-DSIMDJSON_BUILD_STATIC=ON .. 
' if using the command line - CC: gcc - CXX: g++ - - steps: # To reproduce what is below, start a powershell with administrative rights, using scoop *is* a good idea - - uses: actions/checkout@v2 - - - uses: actions/cache@v2 # we cache the scoop setup with 64-bit GCC - id: cache - with: - path: | - C:\ProgramData\scoop - key: scoop64 # static key: should be good forever - - name: Setup Windows # This should almost never run if the cache works. - if: steps.cache.outputs.cache-hit != 'true' - shell: powershell - run: | - Invoke-Expression (New-Object System.Net.WebClient).DownloadString('https://get.scoop.sh') - scoop install sudo --global - sudo scoop install git --global - sudo scoop install ninja --global - sudo scoop install cmake --global - sudo scoop install gcc --arch 64bit --global - $env:path - Write-Host 'Everything has been installed, you are good!' - - name: Build and Test 64-bit x64 - shell: powershell - run: | - $ENV:PATH="C:\ProgramData\scoop\shims;C:\ProgramData\scoop\apps\gcc\current\bin;C:\ProgramData\scoop\apps\ninja\current;$ENV:PATH" - g++ --version - cmake --version - ninja --version - git --version - mkdir build64 - cd build64 - cmake -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_COMPETITION=OFF -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_ENABLE_THREADS=OFF .. - cmake --build . --target parse_many_test jsoncheck basictests numberparsingcheck stringparsingcheck errortests integer_tests pointercheck --verbose - ctest . 
-R "(parse_many_test|jsoncheck|basictests|stringparsingcheck|numberparsingcheck|errortests|integer_tests|pointercheck)" --output-on-failure diff --git a/.github/workflows/msys2-clang.yml b/.github/workflows/msys2-clang.yml new file mode 100644 index 0000000000..95c6fe4a8b --- /dev/null +++ b/.github/workflows/msys2-clang.yml @@ -0,0 +1,46 @@ +name: MSYS2-CLANG-CI + +on: [push, pull_request] + + +jobs: + windows-mingw: + name: ${{ matrix.msystem }} + runs-on: windows-latest + defaults: + run: + shell: msys2 {0} + strategy: + fail-fast: false + matrix: + include: + - msystem: "MINGW64" + install: mingw-w64-x86_64-libxml2 mingw-w64-x86_64-cmake mingw-w64-x86_64-ninja mingw-w64-x86_64-clang + type: Release + - msystem: "MINGW64" + install: mingw-w64-x86_64-libxml2 mingw-w64-x86_64-cmake mingw-w64-x86_64-ninja mingw-w64-x86_64-clang + type: Debug + - msystem: "MINGW64" + install: mingw-w64-x86_64-libxml2 mingw-w64-x86_64-cmake mingw-w64-x86_64-ninja mingw-w64-x86_64-clang + type: RelWithDebInfo + env: + CMAKE_GENERATOR: Ninja + + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - uses: msys2/setup-msys2@v2 + with: + update: true + msystem: ${{ matrix.msystem }} + install: ${{ matrix.install }} + - name: Build and Test + run: | + mkdir build + cd build + cmake -DSIMDJSON_DEVELOPER_MODE=ON -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=${{ matrix.type }} -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_DO_NOT_USE_THREADS_NO_MATTER_WHAT=ON .. + cmake --build . --verbose + ctest -j4 --output-on-failure -LE explicitonly diff --git a/.github/workflows/msys2.yml b/.github/workflows/msys2.yml index a31c667cd2..41b3a92365 100644 --- a/.github/workflows/msys2.yml +++ b/.github/workflows/msys2.yml @@ -4,6 +4,9 @@ on: [push, pull_request] jobs: windows-mingw: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') name: ${{ matrix.msystem }} runs-on: windows-latest defaults: @@ -15,13 +18,22 @@ jobs: include: - msystem: "MINGW64" install: mingw-w64-x86_64-cmake mingw-w64-x86_64-ninja mingw-w64-x86_64-gcc - - msystem: "MINGW32" - install: mingw-w64-i686-cmake mingw-w64-i686-ninja mingw-w64-i686-gcc + type: Release + - msystem: "MINGW64" + install: mingw-w64-x86_64-cmake mingw-w64-x86_64-ninja mingw-w64-x86_64-gcc + type: Debug + - msystem: "MINGW64" + install: mingw-w64-x86_64-cmake mingw-w64-x86_64-ninja mingw-w64-x86_64-gcc + type: RelWithDebInfo env: CMAKE_GENERATOR: Ninja steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} - uses: msys2/setup-msys2@v2 with: update: true @@ -31,6 +43,6 @@ jobs: run: | mkdir build cd build - cmake -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_DO_NOT_USE_THREADS_NO_MATTER_WHAT=ON .. + cmake -DSIMDJSON_DEVELOPER_MODE=ON -DCMAKE_BUILD_TYPE=${{ matrix.type }} -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_DO_NOT_USE_THREADS_NO_MATTER_WHAT=ON .. cmake --build . 
--verbose - ctest -j4 --output-on-failure -E checkperf + ctest -j4 --output-on-failure -LE explicitonly diff --git a/.github/workflows/ppc64.yml b/.github/workflows/ppc64.yml new file mode 100644 index 0000000000..468079c1c1 --- /dev/null +++ b/.github/workflows/ppc64.yml @@ -0,0 +1,29 @@ +name: Ubuntu ppc64le (GCC 11) + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: uraimo/run-on-arch-action@v3 + name: Test + id: runcmd + with: + arch: ppc64le + distro: ubuntu_latest + githubToken: ${{ github.token }} + install: | + apt-get update -q -y + apt-get install -y cmake make g++ + run: | + cmake -DCMAKE_BUILD_TYPE=Release -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_COMPETITION=OFF -B build + cmake --build build -j=2 + ctest --output-on-failure --test-dir build diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml new file mode 100644 index 0000000000..9666fad024 --- /dev/null +++ b/.github/workflows/riscv64.yml @@ -0,0 +1,29 @@ +name: Ubuntu riscv64 (GCC 11) + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: uraimo/run-on-arch-action@v3 + name: Test + id: runcmd + with: + arch: riscv64 + distro: ubuntu_latest + githubToken: ${{ github.token }} + install: | + apt-get update -q -y + apt-get install -y cmake make g++ + run: | + cmake -DCMAKE_BUILD_TYPE=Release -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_COMPETITION=OFF -B build + cmake --build build -j=2 + ctest --output-on-failure --test-dir build diff --git a/.github/workflows/s390x.yml b/.github/workflows/s390x.yml new file mode 100644 index 0000000000..df9e438ee5 --- /dev/null +++ b/.github/workflows/s390x.yml @@ -0,0 +1,29 @@ +name: Ubuntu s390x (GCC 11) + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + build: + runs-on: ubuntu-latest 
+ steps: + - uses: actions/checkout@v4 + - uses: uraimo/run-on-arch-action@v3 + name: Test + id: runcmd + with: + arch: s390x + distro: ubuntu_latest + githubToken: ${{ github.token }} + install: | + apt-get update -q -y + apt-get install -y cmake make g++ + run: | + cmake -DCMAKE_BUILD_TYPE=Release -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_COMPETITION=OFF -B build + cmake --build build -j=2 + ctest --output-on-failure --test-dir build diff --git a/.github/workflows/ubuntu18.yml b/.github/workflows/ubuntu18.yml deleted file mode 100644 index c76b17f4de..0000000000 --- a/.github/workflows/ubuntu18.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Ubuntu 18.04 CI (GCC 7) - -on: [push, pull_request] - -jobs: - ubuntu-build: - runs-on: ubuntu-18.04 - steps: - - uses: actions/checkout@v2 - - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.0 - with: - cmake-version: '3.9.x' - - name: Use cmake - run: | - mkdir build && - cd build && - cmake -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_BUILD_STATIC=ON -DCMAKE_INSTALL_PREFIX:PATH=destination .. && - cmake --build . && - ctest . 
-j --output-on-failure && - make install && - echo -e '#include <simdjson.h>\nint main(int argc,char**argv) {simdjson::dom::parser parser;simdjson::dom::element tweets = parser.load(argv[1]); }' > tmp.cpp && c++ -Idestination/include -Ldestination/lib -std=c++17 -Wl,-rpath,destination/lib -o linkandrun tmp.cpp -lsimdjson && ./linkandrun jsonexamples/twitter.json diff --git a/.github/workflows/ubuntu20.yml b/.github/workflows/ubuntu20.yml deleted file mode 100644 index 12830ddcc4..0000000000 --- a/.github/workflows/ubuntu20.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Ubuntu 20.04 CI (GCC 9) - -on: [push, pull_request] - -jobs: - ubuntu-build: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.0 - with: - cmake-version: '3.9.x' - - name: Use cmake - run: | - mkdir build && - cd build && - cmake -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_BUILD_STATIC=ON -DCMAKE_INSTALL_PREFIX:PATH=destination .. && - cmake --build . && - ctest . -j --output-on-failure && - make install && - echo -e '#include <simdjson.h>\nint main(int argc,char**argv) {simdjson::dom::parser parser;simdjson::dom::element tweets = parser.load(argv[1]); }' > tmp.cpp && c++ -Idestination/include -Ldestination/lib -std=c++17 -Wl,-rpath,destination/lib -o linkandrun tmp.cpp -lsimdjson && ./linkandrun jsonexamples/twitter.json diff --git a/.github/workflows/ubuntu22-clang13.yml b/.github/workflows/ubuntu22-clang13.yml new file mode 100644 index 0000000000..780cabef28 --- /dev/null +++ b/.github/workflows/ubuntu22-clang13.yml @@ -0,0 +1,23 @@ +name: Ubuntu 22.04 CI (CLANG 13) + +on: [push, pull_request] + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake + run: | + mkdir build && + cd build && + CXX=clang++-13 cmake -DSIMDJSON_DEVELOPER_MODE=ON .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j \ No newline at end of file diff --git a/.github/workflows/ubuntu22-clang14.yml b/.github/workflows/ubuntu22-clang14.yml new file mode 100644 index 0000000000..cd2497e1b7 --- /dev/null +++ b/.github/workflows/ubuntu22-clang14.yml @@ -0,0 +1,23 @@ +name: Ubuntu 22.04 CI (CLANG 14) + +on: [push, pull_request] + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake + run: | + mkdir build && + cd build && + CXX=clang++-14 cmake -DSIMDJSON_DEVELOPER_MODE=ON .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j diff --git a/.github/workflows/ubuntu22-cxx20.yml b/.github/workflows/ubuntu22-cxx20.yml new file mode 100644 index 0000000000..ac674769a2 --- /dev/null +++ b/.github/workflows/ubuntu22-cxx20.yml @@ -0,0 +1,33 @@ +name: Ubuntu 22.04 CI (GCC 12, CXX 20) + +on: [push, pull_request] + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake + run: | + mkdir builddebug && + cd builddebug && + CXX=g++-12 cmake -DSIMDJSON_CXX_STANDARD=20 -DCMAKE_BUILD_TYPE=Debug -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + cd .. && + mkdir build && + cd build && + CXX=g++-12 cmake -DSIMDJSON_CXX_STANDARD=20 -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX:PATH=destination .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + cmake --install . && + echo -e '#include <simdjson.h>\nint main(int argc,char**argv) {simdjson::dom::parser parser;simdjson::dom::element tweets = parser.load(argv[1]); }' > tmp.cpp && c++ -Idestination/include -Ldestination/lib -std=c++17 -Wl,-rpath,destination/lib -o linkandrun tmp.cpp -lsimdjson && ./linkandrun jsonexamples/twitter.json && + cd ../tests/installation_tests/find && + mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=../../../build/destination .. && cmake --build . diff --git a/.github/workflows/ubuntu22-gcc12.yml b/.github/workflows/ubuntu22-gcc12.yml new file mode 100644 index 0000000000..18d4c20007 --- /dev/null +++ b/.github/workflows/ubuntu22-gcc12.yml @@ -0,0 +1,23 @@ +name: Ubuntu 22.04 CI (GCC 12) + +on: [push, pull_request] + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake + run: | + mkdir build && + cd build && + CXX=g++-12 cmake -DSIMDJSON_DEVELOPER_MODE=ON .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j \ No newline at end of file diff --git a/.github/workflows/ubuntu22-glibcxxassertions.yml b/.github/workflows/ubuntu22-glibcxxassertions.yml new file mode 100644 index 0000000000..94d9642c97 --- /dev/null +++ b/.github/workflows/ubuntu22-glibcxxassertions.yml @@ -0,0 +1,24 @@ +name: Ubuntu 22.04 CI GCC 12 with GLIBCXX_ASSERTIONS +on: [push, pull_request] + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Install gcc12 + run: sudo apt-get install -y g++-12 + - name: Use cmake + run: | + mkdir build && + cd build && + CXX=g++-12 cmake -DCMAKE_BUILD_TYPE=Debug -DSIMDJSON_GLIBCXX_ASSERTIONS=ON -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_DEVELOPER_MODE=ON .. && + cmake --build . && + ctest . -E avoid_ \ No newline at end of file diff --git a/.github/workflows/ubuntu22-threadsani.yml b/.github/workflows/ubuntu22-threadsani.yml new file mode 100644 index 0000000000..563712f59f --- /dev/null +++ b/.github/workflows/ubuntu22-threadsani.yml @@ -0,0 +1,24 @@ +name: Ubuntu 22.04 CI (GCC 12) with Thread Sanitizer + +on: [push, pull_request] + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake + run: | + mkdir build && + cd build && + CXX=g++-12 cmake -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_SANITIZE_THREADS=ON .. && + cmake --build . --target document_stream_tests --target ondemand_document_stream_tests --target parse_many_test && + ctest --output-on-failure -R parse_many_test && + ctest --output-on-failure -R document_stream_tests \ No newline at end of file diff --git a/.github/workflows/ubuntu22.yml b/.github/workflows/ubuntu22.yml new file mode 100644 index 0000000000..477ffb99e5 --- /dev/null +++ b/.github/workflows/ubuntu22.yml @@ -0,0 +1,47 @@ +name: Ubuntu 22.04 CI (GCC 11) + +on: [push, pull_request] + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake to build just the library + run: | + mkdir buildjustlib && + cd buildjustlib && + cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DSIMDJSON_DEVELOPER_MODE=OFF -DCMAKE_INSTALL_PREFIX:PATH=destination .. && + cmake --build . && + cmake --install . && + echo -e '#include <simdjson.h>\nint main(int argc,char**argv) {simdjson::dom::parser parser;simdjson::dom::element tweets = parser.load(argv[1]); }' > tmp.cpp && + c++ -Idestination/include -Ldestination/lib -std=c++17 -Wl,-rpath,destination/lib -o linkandrun tmp.cpp -lsimdjson && + cd ../tests/installation_tests/find && + mkdir buildjustlib && + cd buildjustlib && + cmake -DCMAKE_INSTALL_PREFIX:PATH=../../../buildjustlib/destination .. && + cmake --build . 
+ - name: Use cmake + run: | + mkdir builddebug && + cd builddebug && + cmake -DCMAKE_BUILD_TYPE=Debug -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + cd .. && + mkdir build && + cd build && + cmake -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX:PATH=destination .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + cmake --install . && + echo -e '#include <simdjson.h>\nint main(int argc,char**argv) {simdjson::dom::parser parser;simdjson::dom::element tweets = parser.load(argv[1]); }' > tmp.cpp && c++ -Idestination/include -Ldestination/lib -std=c++17 -Wl,-rpath,destination/lib -o linkandrun tmp.cpp -lsimdjson && ./linkandrun jsonexamples/twitter.json && + cd ../tests/installation_tests/find && + mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=../../../build/destination .. && cmake --build . diff --git a/.github/workflows/ubuntu24-checkperf.yml b/.github/workflows/ubuntu24-checkperf.yml new file mode 100644 index 0000000000..d9e37b83e6 --- /dev/null +++ b/.github/workflows/ubuntu24-checkperf.yml @@ -0,0 +1,29 @@ +name: Performance check on Ubuntu 20.04 CI (GCC 9) + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake + run: | + mkdir build && + cd build && + cmake -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_ENABLE_DOM_CHECKPERF=ON -DCMAKE_CXX_FLAGS="-Werror=old-style-cast -pedantic -Wpedantic" -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX:PATH=destination .. && + cmake --build . --target checkperf && + ctest --output-on-failure -R checkperf diff --git a/.github/workflows/ubuntu24-cxx20.yml b/.github/workflows/ubuntu24-cxx20.yml new file mode 100644 index 0000000000..cbd8f9cf92 --- /dev/null +++ b/.github/workflows/ubuntu24-cxx20.yml @@ -0,0 +1,22 @@ +name: Ubuntu 24.04 CI (CXX 20) + +on: [push, pull_request] +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-24.04 + strategy: + matrix: + cxx: [g++-13, clang++-16] + steps: + - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - name: Prepare + run: cmake -DSIMDJSON_CXX_STANDARD=20 -DSIMDJSON_DEVELOPER_MODE=ON -B build + env: + CXX: ${{matrix.cxx}} + - name: Build + run: cmake --build build -j=2 + - name: Test + run: ctest --output-on-failure --test-dir build \ No newline at end of file diff --git a/.github/workflows/ubuntu24-noexcept.yml b/.github/workflows/ubuntu24-noexcept.yml new file mode 100644 index 0000000000..cd126e916a --- /dev/null +++ b/.github/workflows/ubuntu24-noexcept.yml @@ -0,0 +1,34 @@ +name: Ubuntu 20.04 CI (GCC 9) without exceptions + +on: [push, pull_request] + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake + run: | + mkdir builddebug && + cd builddebug && + cmake -DCMAKE_BUILD_TYPE=Debug -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_EXCEPTIONS=OFF -DBUILD_SHARED_LIBS=OFF .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + cd .. && + mkdir build && + cd build && + cmake -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_EXCEPTIONS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX:PATH=destination .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + make install && + echo -e '#include <simdjson.h>\nint main(int argc,char**argv) {simdjson::dom::parser parser;simdjson::dom::element tweets = parser.load(argv[1]); }' > tmp.cpp && c++ -Idestination/include -Ldestination/lib -std=c++17 -Wl,-rpath,destination/lib -o linkandrun tmp.cpp -lsimdjson && ./linkandrun jsonexamples/twitter.json && + mkdir testfindpackage && + cd testfindpackage && + echo -e 'cmake_minimum_required(VERSION 3.1)\nproject(simdjsontester)\nset(CMAKE_CXX_STANDARD 17)\nfind_package(simdjson REQUIRED)'> CMakeLists.txt && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=../destination .. && cmake --build . diff --git a/.github/workflows/ubuntu24-nothread.yml b/.github/workflows/ubuntu24-nothread.yml new file mode 100644 index 0000000000..5c063e8eb0 --- /dev/null +++ b/.github/workflows/ubuntu24-nothread.yml @@ -0,0 +1,34 @@ +name: Ubuntu 20.04 CI (GCC 9) Without Threads + +on: [push, pull_request] + +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake + run: | + mkdir builddebug && + cd builddebug && + cmake -DCMAKE_BUILD_TYPE=Debug -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_ENABLE_THREADS=OFF -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + cd .. && + mkdir build && + cd build && + cmake -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_GOOGLE_BENCHMARKS=ON -DSIMDJSON_ENABLE_THREADS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX:PATH=destination .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j && + make install && + echo -e '#include <simdjson.h>\nint main(int argc,char**argv) {simdjson::dom::parser parser;simdjson::dom::element tweets = parser.load(argv[1]); }' > tmp.cpp && c++ -Idestination/include -Ldestination/lib -std=c++17 -Wl,-rpath,destination/lib -o linkandrun tmp.cpp -lsimdjson && ./linkandrun jsonexamples/twitter.json && + mkdir testfindpackage && + cd testfindpackage && + echo -e 'cmake_minimum_required(VERSION 3.1)\nproject(simdjsontester)\nset(CMAKE_CXX_STANDARD 17)\nfind_package(simdjson REQUIRED)'> CMakeLists.txt && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=../destination .. && cmake --build . diff --git a/.github/workflows/ubuntu24-sani.yml b/.github/workflows/ubuntu24-sani.yml new file mode 100644 index 0000000000..e658a8bbb3 --- /dev/null +++ b/.github/workflows/ubuntu24-sani.yml @@ -0,0 +1,41 @@ +name: Ubuntu 20.04 CI (GCC 9) With Memory Sanitizer + +on: [push, pull_request] + +jobs: + ubuntu-build-address-sanitizier: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake with address sanitizer + run: | + mkdir builddebug && + cd builddebug && + cmake -DSIMDJSON_SANITIZE=ON -DCMAKE_BUILD_TYPE=Debug -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j + ubuntu-build-undefined-sanitizer: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: Use cmake with undefined sanitizer + run: | + mkdir builddebugundefsani && + cd builddebugundefsani && + cmake -DSIMDJSON_SANITIZE_UNDEFINED=ON -DCMAKE_BUILD_TYPE=Debug -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_DEVELOPER_MODE=ON -DBUILD_SHARED_LIBS=OFF .. && + cmake --build . && + ctest --output-on-failure -LE explicitonly -j diff --git a/.github/workflows/ubuntu24.yml b/.github/workflows/ubuntu24.yml new file mode 100644 index 0000000000..1635504001 --- /dev/null +++ b/.github/workflows/ubuntu24.yml @@ -0,0 +1,25 @@ +name: Ubuntu 24.04 CI + +on: [push, pull_request] +jobs: + ubuntu-build: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + runs-on: ubuntu-24.04 + strategy: + matrix: + shared: [ON, OFF] + cxx: [g++-13, clang++-16] + sanitizer: [ON, OFF] + build_type: [RelWithDebInfo, Debug, Release] + steps: + - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - name: Prepare + run: cmake -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_SANITIZE=${{matrix.sanitizer}} -DBUILD_SHARED_LIBS=${{matrix.shared}} -B build + env: + CXX: ${{matrix.cxx}} + - name: Build + run: cmake --build build -j=2 + - name: Test + run: ctest --output-on-failure --test-dir build \ No newline at end of file diff --git a/.github/workflows/vs16-ci.yml b/.github/workflows/vs16-ci.yml deleted file mode 100644 index 12a6e917c2..0000000000 --- a/.github/workflows/vs16-ci.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: VS16-CI - -on: [push, pull_request] - -jobs: - ci: - name: windows-vs16 - runs-on: windows-latest - steps: - - uses: actions/checkout@v2 - - name: 'Run CMake with VS16' - uses: lukka/run-cmake@v2 - with: - cmakeListsOrSettingsJson: CMakeListsTxtAdvanced - cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt' - buildDirectory: "${{ github.workspace }}/../../_temp/windows" - cmakeBuildType: Release - buildWithCMake: true - cmakeGenerator: VS16Win64 - cmakeAppendedArgs: -DSIMDJSON_COMPETITION=OFF - buildWithCMakeArgs: --config Release - - - name: 'Run CTest' - run: ctest -C Release -E checkperf --output-on-failure - working-directory: "${{ github.workspace }}/../../_temp/windows" - diff --git a/.github/workflows/vs16-clang-ci.yml b/.github/workflows/vs16-clang-ci.yml deleted file mode 100644 index 19b2afecdb..0000000000 --- a/.github/workflows/vs16-clang-ci.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: VS16-CLANG-CI - -on: [push, pull_request] - -jobs: - ci: - name: windows-vs16 - runs-on: windows-latest - steps: - - uses: actions/checkout@v2 - - name: 'Run CMake with VS16' - uses: lukka/run-cmake@v2 
- with: - cmakeListsOrSettingsJson: CMakeListsTxtAdvanced - cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt' - buildDirectory: "${{ github.workspace }}/../../_temp/windows" - cmakeBuildType: Release - buildWithCMake: true - cmakeGenerator: VS16Win64 - cmakeAppendedArgs: -T ClangCL -DSIMDJSON_COMPETITION=OFF -DSIMDJSON_BUILD_STATIC=ON - buildWithCMakeArgs: --config Release - - - name: 'Run CTest' - run: ctest -C Release -E checkperf --output-on-failure - working-directory: "${{ github.workspace }}/../../_temp/windows" \ No newline at end of file diff --git a/.github/workflows/vs16-ninja-ci.yml b/.github/workflows/vs16-ninja-ci.yml deleted file mode 100644 index 855d382e3c..0000000000 --- a/.github/workflows/vs16-ninja-ci.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: VS16-Ninja-CI - -on: [push, pull_request] - -jobs: - ci: - name: windows-vs16 - runs-on: windows-latest - steps: - - uses: actions/checkout@v2 - - name: 'Run CMake with VS16' - uses: lukka/run-cmake@v2 - with: - cmakeListsOrSettingsJson: CMakeListsTxtAdvanced - cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt' - buildDirectory: "${{ github.workspace }}/../../_temp/windows" - cmakeBuildType: Release - buildWithCMake: true - cmakeGenerator: VS16Win64 - cmakeAppendedArgs: -G Ninja -DSIMDJSON_COMPETITION=OFF -DSIMDJSON_BUILD_STATIC=ON - buildWithCMakeArgs: --config Release - - - name: 'Run CTest' - run: ctest -C Release -E checkperf --output-on-failure - working-directory: "${{ github.workspace }}/../../_temp/windows" - diff --git a/.github/workflows/vs17-arm-ci.yml b/.github/workflows/vs17-arm-ci.yml new file mode 100644 index 0000000000..aacdb3a6da --- /dev/null +++ b/.github/workflows/vs17-arm-ci.yml @@ -0,0 +1,21 @@ +name: VS17-ARM-CI + +on: [push, pull_request] + +jobs: + ci: + name: windows-vs17 + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + include: + - {arch: ARM64} + - {arch: ARM64EC} + steps: + - name: checkout + uses: actions/checkout@v4 + - name: Use cmake + 
run: | + cmake -A ${{ matrix.arch }} -DCMAKE_SYSTEM_VERSION="10.0.22621.0" -DCMAKE_CROSSCOMPILING=1 -DSIMDJSON_DEVELOPER_MODE=ON -D SIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_EXCEPTIONS=OFF -B build && + cmake --build build --verbose diff --git a/.github/workflows/vs17-ci-cxx20.yml b/.github/workflows/vs17-ci-cxx20.yml new file mode 100644 index 0000000000..2543110991 --- /dev/null +++ b/.github/workflows/vs17-ci-cxx20.yml @@ -0,0 +1,44 @@ +name: VS17-CI CXX20 + +on: [push, pull_request] + +jobs: + ci: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! contains(toJSON(github.event.commits.*.message), '[skip github]') + name: windows-vs17 + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + include: + - {gen: Visual Studio 17 2022, arch: Win32, shared: ON} + - {gen: Visual Studio 17 2022, arch: Win32, shared: OFF} + - {gen: Visual Studio 17 2022, arch: x64, shared: ON} + - {gen: Visual Studio 17 2022, arch: x64, shared: OFF} + steps: + - name: checkout + uses: actions/checkout@v4 + - name: Configure + run: | + cmake -DSIMDJSON_CXX_STANDARD=20 -G "${{matrix.gen}}" -A ${{matrix.arch}} -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_COMPETITION=OFF -DBUILD_SHARED_LIBS=${{matrix.shared}} -B build + - name: Build Debug + run: cmake --build build --config Debug --verbose + - name: Build Release + run: cmake --build build --config Release --verbose + - name: Run Release tests + run: | + cd build + ctest -C Release -LE explicitonly --output-on-failure + - name: Run Debug tests + run: | + cd build + ctest -C Debug -LE explicitonly --output-on-failure + - name: Install + run: | + cmake --install build --config Release + - name: Test Installation + run: | + cmake -G "${{matrix.gen}}" -A ${{matrix.arch}} -B build_install_test tests/installation_tests/find + cmake --build build_install_test --config Release \ No newline at end of file diff --git a/.github/workflows/vs17-ci.yml b/.github/workflows/vs17-ci.yml new file mode 100644 index 
0000000000..348e75e741 --- /dev/null +++ b/.github/workflows/vs17-ci.yml @@ -0,0 +1,40 @@ +name: VS17-CI + +on: [push, pull_request] + +jobs: + ci: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! contains(toJSON(github.event.commits.*.message), '[skip github]') + name: windows-vs17 + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + include: + - {gen: Visual Studio 17 2022, arch: Win32, shared: ON, build_type: Release} + - {gen: Visual Studio 17 2022, arch: Win32, shared: OFF, build_type: Release} + - {gen: Visual Studio 17 2022, arch: x64, shared: ON, build_type: Release} + - {gen: Visual Studio 17 2022, arch: x64, shared: OFF, build_type: Debug} + - {gen: Visual Studio 17 2022, arch: x64, shared: OFF, build_type: Release} + - {gen: Visual Studio 17 2022, arch: x64, shared: OFF, build_type: RelWithDebInfo} + steps: + - name: checkout + uses: actions/checkout@v4 + - name: Configure + run: | + cmake -G "${{matrix.gen}}" -A ${{matrix.arch}} -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_COMPETITION=OFF -DBUILD_SHARED_LIBS=${{matrix.shared}} -B build + - name: Build Debug + run: cmake --build build --config ${{matrix.build_type}} --verbose + - name: Run tests + run: | + cd build + ctest -C ${{matrix.build_type}} -LE explicitonly --output-on-failure + - name: Install + run: | + cmake --install build --config ${{matrix.build_type}} + - name: Test Installation + run: | + cmake -G "${{matrix.gen}}" -A ${{matrix.arch}} -B build_install_test tests/installation_tests/find + cmake --build build_install_test --config ${{matrix.build_type}} \ No newline at end of file diff --git a/.github/workflows/vs17-clang-ci-cxx20.yml b/.github/workflows/vs17-clang-ci-cxx20.yml new file mode 100644 index 0000000000..af3ab02a00 --- /dev/null +++ b/.github/workflows/vs17-clang-ci-cxx20.yml @@ -0,0 +1,37 @@ +name: VS17-CLANG-CI + +on: [push, pull_request] + +jobs: + ci: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + name: windows-vs17 + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + include: + - {gen: Visual Studio 17 2022, arch: x64, build_type: Debug} + - {gen: Visual Studio 17 2022, arch: x64, build_type: Release} + - {gen: Visual Studio 17 2022, arch: x64, build_type: RelWithDebInfo} + steps: + - name: checkout + uses: actions/checkout@v4 + - name: Configure + run: | + cmake -G "${{matrix.gen}}" -A ${{matrix.arch}} -T ClangCL -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_COMPETITION=OFF -B build + - name: Build + run: cmake --build build --config ${{matrix.build_type}} --verbose + - name: Run tests + run: | + cd build + ctest -C ${{matrix.build_type}} -LE explicitonly --output-on-failure + - name: Install + run: | + cmake --install build --config ${{matrix.build_type}} + - name: Test Installation + run: | + cmake -G "${{matrix.gen}}" -A ${{matrix.arch}} -B build_install_test tests/installation_tests/find + cmake --build build_install_test --config ${{matrix.build_type}} \ No newline at end of file diff --git a/.github/workflows/vs17-clang-ci.yml b/.github/workflows/vs17-clang-ci.yml new file mode 100644 index 0000000000..af3ab02a00 --- /dev/null +++ b/.github/workflows/vs17-clang-ci.yml @@ -0,0 +1,37 @@ +name: VS17-CLANG-CI + +on: [push, pull_request] + +jobs: + ci: + if: >- + ! contains(toJSON(github.event.commits.*.message), '[skip ci]') && + ! 
contains(toJSON(github.event.commits.*.message), '[skip github]') + name: windows-vs17 + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + include: + - {gen: Visual Studio 17 2022, arch: x64, build_type: Debug} + - {gen: Visual Studio 17 2022, arch: x64, build_type: Release} + - {gen: Visual Studio 17 2022, arch: x64, build_type: RelWithDebInfo} + steps: + - name: checkout + uses: actions/checkout@v4 + - name: Configure + run: | + cmake -G "${{matrix.gen}}" -A ${{matrix.arch}} -T ClangCL -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_COMPETITION=OFF -B build + - name: Build + run: cmake --build build --config ${{matrix.build_type}} --verbose + - name: Run tests + run: | + cd build + ctest -C ${{matrix.build_type}} -LE explicitonly --output-on-failure + - name: Install + run: | + cmake --install build --config ${{matrix.build_type}} + - name: Test Installation + run: | + cmake -G "${{matrix.gen}}" -A ${{matrix.arch}} -B build_install_test tests/installation_tests/find + cmake --build build_install_test --config ${{matrix.build_type}} \ No newline at end of file diff --git a/.github/workflows/vs17-noexcept-ci.yml b/.github/workflows/vs17-noexcept-ci.yml new file mode 100644 index 0000000000..4e925e6722 --- /dev/null +++ b/.github/workflows/vs17-noexcept-ci.yml @@ -0,0 +1,30 @@ +name: VS17-NoExcept-CI + +on: [push, pull_request] + +jobs: + ci: + name: windows-vs17 + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: dependencies/.cache + key: ${{ hashFiles('dependencies/CMakeLists.txt') }} + - name: 'Run CMake with VS17' + uses: lukka/run-cmake@v3 + with: + cmakeListsOrSettingsJson: CMakeListsTxtAdvanced + cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt' + buildDirectory: "${{ github.workspace }}/../../_temp/windows" + cmakeBuildType: Release + buildWithCMake: true + cmakeGenerator: VS16Win64 + cmakeAppendedArgs: -DSIMDJSON_COMPETITION=OFF -DSIMDJSON_DEVELOPER_MODE=ON -DSIMDJSON_EXCEPTIONS=OFF 
+ buildWithCMakeArgs: --config Release + + - name: 'Run CTest' + run: ctest -C Release -LE explicitonly --output-on-failure + working-directory: "${{ github.workspace }}/../../_temp/windows" + diff --git a/.gitignore b/.gitignore index 29fb4f2c50..0608e31b2d 100644 --- a/.gitignore +++ b/.gitignore @@ -96,3 +96,14 @@ objs # Generated docs /doc/api +*.orig + +# VSCode workspace files +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json + +# clangd +.cache \ No newline at end of file diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 4622e8782c..0000000000 --- a/.gitmodules +++ /dev/null @@ -1,36 +0,0 @@ -[submodule "scalarvssimd/rapidjson"] - path = dependencies/rapidjson - url = https://github.com/Tencent/rapidjson.git -[submodule "dependencies/sajson"] - path = dependencies/sajson - url = https://github.com/chadaustin/sajson.git -[submodule "dependencies/json11"] - path = dependencies/json11 - url = https://github.com/dropbox/json11.git -[submodule "dependencies/fastjson"] - path = dependencies/fastjson - url = https://github.com/mikeando/fastjson.git -[submodule "dependencies/gason"] - path = dependencies/gason - url = https://github.com/vivkin/gason.git -[submodule "dependencies/ujson4c"] - path = dependencies/ujson4c - url = https://github.com/esnme/ujson4c.git -[submodule "dependencies/jsmn"] - path = dependencies/jsmn - url = https://github.com/zserge/jsmn.git -[submodule "dependencies/cJSON"] - path = dependencies/cJSON - url = https://github.com/DaveGamble/cJSON.git -[submodule "dependencies/jsoncpp"] - path = dependencies/jsoncpp - url = https://github.com/open-source-parsers/jsoncpp.git -[submodule "dependencies/json"] - path = dependencies/json - url = https://github.com/nlohmann/json.git -[submodule "dependencies/benchmark"] - path = dependencies/benchmark - url = https://github.com/google/benchmark.git -[submodule "dependencies/cxxopts"] - path = dependencies/cxxopts - url = 
https://github.com/jarro2783/cxxopts diff --git a/.travis.yml b/.travis.yml index aa3a73bf89..276d30b686 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,30 +1,190 @@ language: cpp -sudo: false -addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-7 - - g++-7 - - clang-format - - python -branches: - only: - - master - -script: - - export CXX=g++-7 - - export CC=gcc-7 - - make - - make test - - make everything - - make amalgamate - - make clean - - make SANITIZEGOLD=1 test - - make clean - - ARCHFLAGS="-march=nehalem" make - - ARCHFLAGS="-march=nehalem" make test - - ARCHFLAGS="-march=nehalem" make everything - - ./style/run-clang-format.py -r include/ benchmark/ src/ tests/ +dist: bionic + +arch: + - ppc64le + +cache: + directories: + - $HOME/.dep_cache + +env: + global: + - simdjson_DEPENDENCY_CACHE_DIR=$HOME/.dep_cache + +matrix: + include: + - os: linux + addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - g++-8 + env: + - COMPILER="CC=gcc-8 && CXX=g++-8" + compiler: gcc-8 + + - os: linux + addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - g++-9 + env: + - COMPILER="CC=gcc-9 && CXX=g++-9" + compiler: gcc-9 + + - os: linux + addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - g++-10 + env: + - COMPILER="CC=gcc-10 && CXX=g++-10" + compiler: gcc-10 + +# The sanitizer runs fail systematically +# - os: linux +# addons: +# apt: +# sources: +# - ubuntu-toolchain-r-test +# packages: +# - g++-10 +# env: +# - COMPILER="CC=gcc-10 && CXX=g++-10" +# - SANITIZE="on" +# compiler: gcc-10-sanitize + + - os: linux + addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - g++-10 + env: + - COMPILER="CC=gcc-10 && CXX=g++-10" + - STATIC="on" + compiler: gcc-10-static + + - os: linux + addons: + apt: + sources: + - llvm-toolchain-bionic-6.0 + packages: + - clang-6.0 + env: + - COMPILER="CC=clang-6.0 && CXX=clang++-6.0" + compiler: clang-6 + + - os: linux + addons: + apt: + sources: + - 
llvm-toolchain-bionic-7 + packages: + - clang-7 + env: + - COMPILER="CC=clang-7 && CXX=clang++-7" + compiler: clang-7 + + - os: linux + addons: + apt: + sources: + - llvm-toolchain-bionic-8 + packages: + - clang-8 + env: + - COMPILER="CC=clang-8 && CXX=clang++-8" + compiler: clang-8 + + - os: linux + addons: + apt: + sources: + - llvm-toolchain-bionic-9 + packages: + - clang-9 + env: + - COMPILER="CC=clang-9 && CXX=clang++-9" + compiler: clang-9 + + - os: linux + addons: + apt: + packages: + - clang-10 + sources: + - ubuntu-toolchain-r-test + - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main' + key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' + env: + - COMPILER="CC=clang-10 && CXX=clang++-10" + compiler: clang-10 + + - os: linux + addons: + apt: + packages: + - clang-10 + sources: + - ubuntu-toolchain-r-test + - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main' + key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' + env: + - COMPILER="CC=clang-10 && CXX=clang++-10" + - STATIC="on" + compiler: clang-10-static + +# The clang sanitizer runs fail frequently at setup time +# - os: linux +# addons: +# apt: +# packages: +# - clang-10 +# sources: +# - ubuntu-toolchain-r-test +# - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main' +# key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' +# env: +# - COMPILER="CC=clang-10 && CXX=clang++-10" +# - SANITIZE="on" +# compiler: clang-10-sanitize + +before_install: + - eval "${COMPILER}" + +install: + - wget -q -O - "https://raw.githubusercontent.com/simdjson/debian-ppa/master/key.gpg" | sudo apt-key add - + - sudo apt-add-repository "deb https://raw.githubusercontent.com/simdjson/debian-ppa/master simdjson main" + - sudo apt-get -qq update + - sudo apt-get purge cmake cmake-data + - sudo apt-get -t simdjson -y install cmake + - export CMAKE_CXX_FLAGS="-maltivec -mcpu=power9 -mtune=power9" + - export CMAKE_C_FLAGS="${CMAKE_CXX_FLAGS}" + - export 
CMAKE_FLAGS="-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DSIMDJSON_IMPLEMENTATION=ppc64;fallback"; + - if [[ "${SANITIZE}" == "on" ]]; then + export CMAKE_FLAGS="${CMAKE_FLAGS} -DSIMDJSON_SANITIZE=ON"; + export ASAN_OPTIONS="detect_leaks=0"; + fi + - if [[ "${STATIC}" == "on" ]]; then + export CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_SHARED_LIBS=OFF"; + fi + - export CTEST_FLAGS="-j4 --output-on-failure -LE explicitonly" + +script: + - mkdir build + - cd build + - cmake $CMAKE_FLAGS .. + - cmake --build . -- -j2 + - SIMDJSON_FORCE_IMPLEMENTATION=ppc64 ctest $CTEST_FLAGS -L per_implementation + - SIMDJSON_FORCE_IMPLEMENTATION=fallback ctest $CTEST_FLAGS -L per_implementation + - ctest $CTEST_FLAGS -LE "acceptance|per_implementation" diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000000..42ee1fa239 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,22 @@ +{ + // See https://go.microsoft.com/fwlink/?LinkId=827846 to learn about workspace recommendations. + // Extension identifier format: ${publisher}.${name}. Example: vscode.csharp + + // List of extensions which should be recommended for users of this workspace. + "recommendations": [ + // C++ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + // Python + "ms-python.python", + // .github/* + "github.vscode-github-actions", + // cmake + "ms-vscode.cmake-tools", + "twxs.cmake" + ], + // List of extensions recommended by VS Code that should not be recommended for users of this workspace. 
+ "unwantedRecommendations": [ + + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..ffdfabba89 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,131 @@ +{ + "editor.rulers": [ + {"column": 95 }, + {"column": 120 } + ], + "files.trimTrailingWhitespace": true, + "files.associations": { + ".clangd": "yaml", + "array": "cpp", + "iterator": "cpp", + "chrono": "cpp", + "optional": "cpp", + "__locale": "cpp", + "__tuple": "cpp", + "__bit_reference": "cpp", + "__config": "cpp", + "__debug": "cpp", + "__errc": "cpp", + "__functional_base": "cpp", + "__hash_table": "cpp", + "__mutex_base": "cpp", + "__node_handle": "cpp", + "__nullptr": "cpp", + "__split_buffer": "cpp", + "__string": "cpp", + "__threading_support": "cpp", + "__tree": "cpp", + "algorithm": "cpp", + "atomic": "cpp", + "bit": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "cinttypes": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "codecvt": "cpp", + "complex": "cpp", + "condition_variable": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "exception": "cpp", + "forward_list": "cpp", + "fstream": "cpp", + "functional": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "ios": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "list": "cpp", + "locale": "cpp", + "map": "cpp", + "memory": "cpp", + "mutex": "cpp", + "new": "cpp", + "numeric": "cpp", + "ostream": "cpp", + "random": "cpp", + "ratio": "cpp", + "regex": "cpp", + "set": "cpp", + "sstream": "cpp", + "stack": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "string": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "thread": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "utility": "cpp", 
+ "valarray": "cpp", + "vector": "cpp", + "*.ipp": "cpp", + "__functional_base_03": "cpp", + "filesystem": "cpp", + "*.inc": "cpp", + "compare": "cpp", + "concepts": "cpp", + "variant": "cpp", + "__bits": "cpp", + "csignal": "cpp", + "future": "cpp", + "queue": "cpp", + "shared_mutex": "cpp", + "ranges": "cpp", + "span": "cpp", + "__verbose_abort": "cpp", + "charconv": "cpp", + "source_location": "cpp", + "strstream": "cpp", + "typeindex": "cpp", + "*.tcc": "cpp", + "memory_resource": "cpp", + "numbers": "cpp", + "semaphore": "cpp", + "stop_token": "cpp", + "cfenv": "cpp", + "format": "cpp", + "xlocmes": "cpp", + "xlocmon": "cpp", + "xlocnum": "cpp", + "xloctime": "cpp", + "xutility": "cpp", + "coroutine": "cpp", + "xfacet": "cpp", + "xhash": "cpp", + "xiosbase": "cpp", + "xlocale": "cpp", + "xlocbuf": "cpp", + "xlocinfo": "cpp", + "xmemory": "cpp", + "xstring": "cpp", + "xtr1common": "cpp", + "xtree": "cpp" + } +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d4841ac96..da4749081c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,62 +1,321 @@ -cmake_minimum_required(VERSION 3.9) # CMP0069 NEW +cmake_minimum_required(VERSION 3.14) -project(simdjson - DESCRIPTION "Parsing gigabytes of JSON per second" - LANGUAGES CXX C +project( + simdjson + # The version number is modified by tools/release.py + VERSION 3.12.3 + DESCRIPTION "Parsing gigabytes of JSON per second" + HOMEPAGE_URL "https://simdjson.org/" + LANGUAGES CXX C ) -set(PROJECT_VERSION_MAJOR 0) -set(PROJECT_VERSION_MINOR 4) -set(PROJECT_VERSION_PATCH 6) -set(SIMDJSON_SEMANTIC_VERSION "0.4.6" CACHE STRING "simdjson semantic version") -set(SIMDJSON_LIB_VERSION "2.0.0" CACHE STRING "simdjson library version") -set(SIMDJSON_LIB_SOVERSION "2" CACHE STRING "simdjson library soversion") -set(SIMDJSON_GITHUB_REPOSITORY https://github.com/simdjson/simdjson) +set(SIMDJSON_GITHUB_REPOSITORY "https://github.com/simdjson/simdjson") +string( + COMPARE EQUAL + "${CMAKE_SOURCE_DIR}" 
"${CMAKE_CURRENT_SOURCE_DIR}" + is_top_project +) + +# ---- Options, variables ---- + +# These version numbers are modified by tools/release.py +set(SIMDJSON_LIB_VERSION "25.0.0" CACHE STRING "simdjson library version") +set(SIMDJSON_LIB_SOVERSION "25" CACHE STRING "simdjson library soversion") + +option(SIMDJSON_BUILD_STATIC_LIB "Build simdjson_static library along with simdjson (only makes sense if BUILD_SHARED_LIBS=ON)" OFF) +if(SIMDJSON_BUILD_STATIC_LIB AND NOT BUILD_SHARED_LIBS) + message(WARNING "SIMDJSON_BUILD_STATIC_LIB only makes sense if BUILD_SHARED_LIBS is set to ON") + message(WARNING "You might be building and installing a two identical static libraries.") +endif() + +option(SIMDJSON_ENABLE_THREADS "Link with thread support" ON) + +include(cmake/simdjson-props.cmake) +include(cmake/implementation-flags.cmake) +include(cmake/exception-flags.cmake) + +option(SIMDJSON_DISABLE_DEPRECATED_API "Disables deprecated APIs" OFF) +if(SIMDJSON_DISABLE_DEPRECATED_API) + simdjson_add_props( + target_compile_definitions PUBLIC + SIMDJSON_DISABLE_DEPRECATED_API=1 + ) +endif() + +option(SIMDJSON_DEVELOPMENT_CHECKS "Enable development-time aids, such as \ +checks for incorrect API usage. Enabled by default in DEBUG." 
OFF) +if(SIMDJSON_DEVELOPMENT_CHECKS) + simdjson_add_props( + target_compile_definitions PUBLIC + SIMDJSON_DEVELOPMENT_CHECKS + ) +endif() + +if(is_top_project) + option(SIMDJSON_DEVELOPER_MODE "Enable targets for developing simdjson" OFF) + option(BUILD_SHARED_LIBS "Build simdjson as a shared library" OFF) + option(SIMDJSON_SINGLEHEADER "Disable singleheader generation" ON) +endif() + +include(cmake/handle-deprecations.cmake) +include(cmake/developer-options.cmake) + +# ---- simdjson library ---- + +set(SIMDJSON_SOURCES src/simdjson.cpp) + +add_library(simdjson ${SIMDJSON_SOURCES}) +add_library(simdjson::simdjson ALIAS simdjson) +set(SIMDJSON_LIBRARIES simdjson) + +if(SIMDJSON_BUILD_STATIC_LIB) + add_library(simdjson_static STATIC ${SIMDJSON_SOURCES}) + add_library(simdjson::simdjson_static ALIAS simdjson_static) + list(APPEND SIMDJSON_LIBRARIES simdjson_static) +endif() + +set_target_properties( + simdjson PROPERTIES + VERSION "${SIMDJSON_LIB_VERSION}" + SOVERSION "${SIMDJSON_LIB_SOVERSION}" + # FIXME: symbols should be hidden by default + WINDOWS_EXPORT_ALL_SYMBOLS YES +) + +# FIXME: Use proper CMake integration for exports +if(MSVC AND BUILD_SHARED_LIBS) + target_compile_definitions( + simdjson + PRIVATE SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY=1 + INTERFACE SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY=1 + ) +endif() + +simdjson_add_props( + target_include_directories + PUBLIC "$" + PRIVATE "$" +) + +simdjson_add_props(target_compile_features PUBLIC cxx_std_11) + +# workaround for GNU GCC poor AVX load/store code generation +if( + CMAKE_CXX_COMPILER_ID STREQUAL "GNU" + AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86(_64)?)$" +) + simdjson_add_props( + target_compile_options PRIVATE + -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store + ) +endif() + +option(SIMDJSON_MINUS_ZERO_AS_FLOAT "Treat -0 as a floating-point value" OFF) + +if(SIMDJSON_MINUS_ZERO_AS_FLOAT) + simdjson_add_props(target_compile_definitions PRIVATE SIMDJSON_MINUS_ZERO_AS_FLOAT=1) 
+endif(SIMDJSON_MINUS_ZERO_AS_FLOAT) + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64)$") + option(SIMDJSON_PREFER_LSX "Prefer LoongArch SX" ON) + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag(-mlasx COMPILER_SUPPORTS_LASX) + check_cxx_compiler_flag(-mlsx COMPILER_SUPPORTS_LSX) + if(COMPILER_SUPPORTS_LASX AND NOT SIMDJSON_PREFER_LSX) + simdjson_add_props( + target_compile_options PRIVATE + -mlasx + ) + elseif(COMPILER_SUPPORTS_LSX) + simdjson_add_props( + target_compile_options PRIVATE + -mlsx + ) + endif() +endif() + +# GCC and Clang have horrendous Debug builds when using SIMD. +# A common fix is to use '-Og' instead. +# bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 +if( + (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR + CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR + CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") +) + message(STATUS "Adding -Og to compile flag") + simdjson_add_props( + target_compile_options PRIVATE + $<$:-Og> + ) +endif() + +if(SIMDJSON_ENABLE_THREADS) + find_package(Threads REQUIRED) + simdjson_add_props(target_link_libraries PUBLIC Threads::Threads) + simdjson_add_props(target_compile_definitions PUBLIC SIMDJSON_THREADS_ENABLED=1) +endif() + +simdjson_apply_props(simdjson) +if(SIMDJSON_BUILD_STATIC_LIB) + simdjson_apply_props(simdjson_static) +endif() + +# ---- Install rules ---- + +include(CMakePackageConfigHelpers) include(GNUInstallDirs) -include(cmake/simdjson-flags.cmake) -include(cmake/simdjson-user-cmakecache.cmake) +if(SIMDJSON_SINGLEHEADER) + install( + FILES singleheader/simdjson.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + COMPONENT simdjson_Development + ) +endif() + +install( + TARGETS simdjson + EXPORT simdjsonTargets + RUNTIME COMPONENT simdjson_Runtime + LIBRARY COMPONENT simdjson_Runtime + NAMELINK_COMPONENT simdjson_Development + ARCHIVE COMPONENT simdjson_Development + INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" +) +configure_file(cmake/simdjson-config.cmake.in simdjson-config.cmake @ONLY) + 
+write_basic_package_version_file( + simdjson-config-version.cmake + COMPATIBILITY SameMinorVersion +) +set( + SIMDJSON_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/simdjson" + CACHE STRING "CMake package config location relative to the install prefix" +) +mark_as_advanced(SIMDJSON_INSTALL_CMAKEDIR) -if(SIMDJSON_JUST_LIBRARY) - message( STATUS "Building just the library, omitting all tests, tools and benchmarks." ) +install( + FILES + "${PROJECT_BINARY_DIR}/simdjson-config.cmake" + "${PROJECT_BINARY_DIR}/simdjson-config-version.cmake" + DESTINATION "${SIMDJSON_INSTALL_CMAKEDIR}" + COMPONENT simdjson_Development +) + +install( + EXPORT simdjsonTargets + NAMESPACE simdjson:: + DESTINATION "${SIMDJSON_INSTALL_CMAKEDIR}" + COMPONENT simdjson_Development +) + +if(SIMDJSON_BUILD_STATIC_LIB) + install( + TARGETS simdjson_static + EXPORT simdjson_staticTargets + ARCHIVE COMPONENT simdjson_Development + INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + ) + install( + EXPORT simdjson_staticTargets + NAMESPACE simdjson:: + DESTINATION "${SIMDJSON_INSTALL_CMAKEDIR}" + COMPONENT simdjson_Development + ) endif() -# -# Set up test data -# -if(NOT(SIMDJSON_JUST_LIBRARY)) - enable_testing() - add_subdirectory(jsonchecker) - add_subdirectory(jsonexamples) - add_library(test-data INTERFACE) - target_link_libraries(test-data INTERFACE jsonchecker-data jsonexamples-data) +# pkg-config +include(cmake/JoinPaths.cmake) +join_paths(PKGCONFIG_INCLUDEDIR "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}") +join_paths(PKGCONFIG_LIBDIR "\${prefix}" "${CMAKE_INSTALL_LIBDIR}") + +if(SIMDJSON_ENABLE_THREADS) + set(PKGCONFIG_CFLAGS "-DSIMDJSON_THREADS_ENABLED=1") + if(CMAKE_THREAD_LIBS_INIT) + set(PKGCONFIG_LIBS_PRIVATE "Libs.private: ${CMAKE_THREAD_LIBS_INIT}") + endif() endif() +configure_file("simdjson.pc.in" "simdjson.pc" @ONLY) +install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/simdjson.pc" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig" +) + # -# Create the top level simdjson library (must be 
done at this level to use both src/ and include/ -# directories) and tools +# CPack # -add_subdirectory(include) -add_subdirectory(src) +if(is_top_project) + set(CPACK_PACKAGE_VENDOR "Daniel Lemire") + set(CPACK_PACKAGE_CONTACT "lemire@gmail.com") + set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE") + set(CPACK_RESOURCE_FILE_README "${PROJECT_SOURCE_DIR}/README.md") + + set(CPACK_RPM_PACKAGE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE") + + set(CPACK_SOURCE_GENERATOR "TGZ;ZIP") + + include(CPack) +endif() + +# ---- Developer mode extras ---- + +if(is_top_project AND NOT SIMDJSON_DEVELOPER_MODE) + message(STATUS "Building only the library. Advanced users and contributors may want to turn SIMDJSON_DEVELOPER_MODE to ON, e.g., via -D SIMDJSON_DEVELOPER_MODE=ON.") +elseif(SIMDJSON_DEVELOPER_MODE AND NOT is_top_project) + message(AUTHOR_WARNING "Developer mode in simdjson is intended for the developers of simdjson") +endif() + +if(NOT SIMDJSON_DEVELOPER_MODE) + return() +endif() + +simdjson_apply_props(simdjson-internal-flags) + +set( + SIMDJSON_USER_CMAKECACHE + "${CMAKE_BINARY_DIR}/.simdjson-user-CMakeCache.txt" +) +add_custom_target( + simdjson-user-cmakecache + COMMAND "${CMAKE_COMMAND}" + -D "BINARY_DIR=${CMAKE_BINARY_DIR}" + -D "USER_CMAKECACHE=${SIMDJSON_USER_CMAKECACHE}" + -P "${PROJECT_SOURCE_DIR}/cmake/simdjson-user-cmakecache.cmake" + VERBATIM +) + +# Setup tests +enable_testing() +# So we can build just tests with "make all_tests" +add_custom_target(all_tests) + add_subdirectory(windows) -if(NOT(SIMDJSON_JUST_LIBRARY)) - add_subdirectory(dependencies) ## This needs to be before tools because of cxxopts - add_subdirectory(tools) ## This needs to be before tests because of cxxopts - add_subdirectory(singleheader) +include(cmake/CPM.cmake) +add_subdirectory(dependencies) ## This needs to be before tools because of cxxopts +add_subdirectory(tools) ## This needs to be before tests because of cxxopts + +# Data: jsonexamples is left with only the bare 
essential. +# most of the data has been moved to https://github.com/simdjson/simdjson-data +add_subdirectory(jsonexamples) + +if(SIMDJSON_SINGLEHEADER) +add_subdirectory(singleheader) endif() -install(FILES singleheader/simdjson.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + + # # Compile tools / tests / benchmarks # -if(NOT(SIMDJSON_JUST_LIBRARY)) - add_subdirectory(tests) - add_subdirectory(examples) +add_subdirectory(tests) +add_subdirectory(examples) +if(CMAKE_SIZEOF_VOID_P EQUAL 8) # we only include the benchmarks on 64-bit systems. add_subdirectory(benchmark) - add_subdirectory(fuzz) endif() +add_subdirectory(fuzz) # # Source files should be just ASCII @@ -64,29 +323,59 @@ endif() find_program(FIND find) find_program(FILE file) find_program(GREP grep) -if((FIND) AND (FILE) AND (GREP)) +if(FIND AND FILE AND GREP) + add_test( + NAME just_ascii + COMMAND sh -c "\ +${FIND} include src windows tools singleheader tests examples benchmark \ +-path benchmark/checkperf-reference -prune -name '*.h' -o -name '*.cpp' \ +-type f -exec ${FILE} '{}' \; | ${GREP} -qv ASCII || exit 0 && exit 1" + WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}" + ) +endif() + +## +## In systems like R, libraries must not use stderr or abort to be acceptable. +## Thus we make it a hard rule that one is not allowed to call abort or stderr. +## The sanitized builds are allowed to abort. +## +if(NOT SIMDJSON_SANITIZE) + find_program(GREP grep) + find_program(NM nm) + if((NOT GREP) OR (NOT NM)) + message("grep and nm are unavailable on this system.") + else() + add_test( + NAME "avoid_abort" + # Under FreeBSD, the __cxa_guard_abort symbol may appear but it is fine. + # So we want to look for abort as a test. 
+ COMMAND sh -c "${NM} $ | ${GREP} ' _*abort' || exit 0 && exit 1" + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + ) + add_test( + NAME "avoid_cout" + COMMAND sh -c "${NM} $ | ${GREP} ' _*cout' || exit 0 && exit 1" + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + ) + add_test( + NAME "avoid_cerr" + COMMAND sh -c "${NM} $ | ${GREP} ' _*cerr' || exit 0 && exit 1" + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + ) add_test( - NAME "just_ascii" - COMMAND sh -c "${FIND} include src windows tools singleheader tests examples benchmark -path benchmark/checkperf-reference -prune -name '*.h' -o -name '*.cpp' -type f -exec ${FILE} '{}' \; |${GREP} -v ASCII || exit 0 && exit 1" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + NAME "avoid_printf" + COMMAND sh -c "${NM} $ | ${GREP} ' _*printf' || exit 0 && exit 1" + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} ) + add_test( + NAME "avoid_stdout" + COMMAND sh -c "${NM} $ | ${GREP} stdout || exit 0 && exit 1" + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + ) + add_test( + NAME "avoid_stderr" + COMMAND sh -c "${NM} $ | ${GREP} stderr || exit 0 && exit 1" + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + ) + endif() endif() - - -# -# CPack -# -set(CPACK_PACKAGE_VENDOR "Daniel Lemire") -set(CPACK_PACKAGE_CONTACT "lemire@gmail.com") -set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Parsing gigabytes of JSON per second") -set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR}) -set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR}) -set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH}) -set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") -set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md") - -set(CPACK_RPM_PACKAGE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") - -set(CPACK_SOURCE_GENERATOR "TGZ;ZIP") - -include(CPack) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e645cfbd58..a9cf186e45 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -41,6 +41,8 @@ We have few hard rules, but we have some: - Printing to standard 
output or standard error (`stderr`, `stdout`, `std::cerr`, `std::cout`) in the core library is forbidden. This follows from the [Writing R Extensions](https://cran.r-project.org/doc/manuals/R-exts.html) manual which states that "Compiled code should not write to stdout or stderr". - Calls to `abort()` are forbidden in the core library. This follows from the [Writing R Extensions](https://cran.r-project.org/doc/manuals/R-exts.html) manual which states that "Under no circumstances should your compiled code ever call abort or exit". - All source code files (.h, .cpp) must be ASCII. +- All C macros introduced in public headers need to be prefixed with either `SIMDJSON_` or `simdjson_`. +- We avoid trailing white space characters within lines. That is, your lines of code should not terminate with unnecessary spaces. Generally, please avoid making unnecessary changes to white-space characters when contributing code. Tools, tests and benchmarks are not held to these same strict rules. @@ -50,24 +52,27 @@ General Guidelines Contributors are encouraged to : - Document their changes. Though we do not enforce a rule regarding code comments, we prefer that non-trivial algorithms and techniques be somewhat documented in the code. -- Follow as much as possible the existing code style. We do not enforce a specific code style, but we prefer consistency. +- Follow as much as possible the existing code style. We do not enforce a specific code style, but we prefer consistency. We avoid contractions (isn't, aren't) in the comments. - Modify as few lines of code as possible when working on an issue. The more lines you modify, the harder it is for your fellow human beings to understand what is going on. -- Tools may report "problems" with the code, but we never delegate programming to tools: if there is a problem with the code, we need to understand it. Thus we will not "fix" code merely to please a static analyzer if we do not understand. 
+- Tools may report "problems" with the code, but we never delegate programming to tools: if there is a problem with the code, we need to understand it. Thus we will not "fix" code merely to please a static analyzer. - Provide tests for any new feature. We will not merge a new feature without tests. +- Run before/after benchmarks so that we can appreciate the effect of the changes on the performance. Pull Requests -------------- Pull requests are always invited. However, we ask that you follow these guidelines: -- It is wiser to discuss your ideas first as part of an issue before you start coding. If you omit this step and code first, be prepare to have your code receive scrutiny and be dropped. -- Users should provide a rationale for their changes. Does it improve performance? Does it add a feature? Does it improve maintainability? Does fix a bug? This must be explicitly stated as part of the pull request. Do not propose changes based on taste or intuition. We do not delegate programming to tools: that some tool suggested a code change is not reason enough to change the code. +- It is wise to discuss your ideas first as part of an issue before you start coding. If you omit this step and code first, be prepared to have your code receive scrutiny and be dropped. +- Users should provide a rationale for their changes. Does it improve performance? Does it add a feature? Does it improve maintainability? Does it fix a bug? This must be explicitly stated as part of the pull request. Do not propose changes based on taste or intuition. We do not delegate programming to tools: that some tool suggested a code change is not reason enough to change the code. 1. When your code improves performance, please document the gains with a benchmark using hard numbers. - 2. If your code fixes a bug, please be either fix a failing test, or propose a new test. + 2. If your code fixes a bug, please either fix a failing test, or propose a new test. 3. 
Other types of changes must be clearly motivated. We openly discourage changes with no identifiable benefits. - Changes should be focused and minimal. You should change as few lines of code as possible. Please do not reformat or touch files needlessly. -- New features must be accompanied of new tests, in general. -- Your code should pass our continuous-integration tests. It is your responsability to ensure that your proposal pass the tests. We do not merge pull requests that would break our build. +- New features must be accompanied by new tests, in general. +- Your code should pass our continuous-integration tests. It is your responsibility to ensure that your proposal pass the tests. We do not merge pull requests that would break our build. + - An exception to this would be changes to non-code files, such as documentation and assets, or trivial changes to code, such as comments, where it is encouraged to explicitly ask for skipping a CI run using the `[skip ci]` prefix in your Pull Request title **and** in the first line of the most recent commit in a push. Example for such a commit: `[skip ci] Fixed typo in power_of_ten's docs` + This benefits the project in such a way that the CI pipeline is not burdened by running jobs on changes that don't change any behavior in the code, which reduces wait times for other Pull Requests that do change behavior and require testing. If the benefits of your proposed code remain unclear, we may choose to discard your code: that is not an insult, we frequently discard our own code. We may also consider various alternatives and choose another path. Again, that is not an insult or a sign that you have wasted your time. @@ -87,7 +92,7 @@ We welcome contributions from women and less represented groups. If you need hel Consider the following points when engaging with the project: -- We discourage arguments from authority: ideas are discusssed on their own merits and not based on who stated it. 
+- We discourage arguments from authority: ideas are discussed on their own merits and not based on who stated it. - Be mindful that what you may view as an aggression is maybe merely a difference of opinion or a misunderstanding. - Be mindful that a collection of small aggressions, even if mild in isolation, can become harmful. diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 608841d7d7..557127a9ed 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -35,5 +35,11 @@ Matjaž OstroverÅ¡nik Nong Li Furkan TaÅŸkale Brendan Knapp -# if you have contributed to the project and your name does not +Danila Kutenin +Pavel Pavlov +Hao Chen +Nicolas Boyer +Kim Walisch and Jatin Bhateja (AVX-512 bitset decoder) +Fangzheng Zhang and Weiqiang Wan (AVX-512 kernel) +# if you have contributed to the project and your name does not # appear in this list, please let us know! diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 264c28f0f3..0000000000 --- a/Dockerfile +++ /dev/null @@ -1,88 +0,0 @@ -### -# -# Though simdjson requires only commonly available compilers and tools, it can -# be convenient to build it and test it inside a docker container: it makes it -# possible to test and benchmark simdjson under even relatively out-of-date -# Linux servers. It should also work under macOS and Windows, though not -# at native speeds, maybe. -# -# Assuming that you have a working docker server, this file -# allows you to build, test and benchmark simdjson. -# -# We build the library and associated files in the dockerbuild subdirectory. -# It may be necessary to delete it before creating the image: -# -# rm -r -f dockerbuild -# -# The need to delete the directory has nothing to do with docker per se: it is -# simply cleaner in CMake to start from a fresh directory. This is important: if you -# reuse the same directory with different configurations, you may get broken builds. 
-# -# -# Then you can build the image as follows: -# -# docker build -t simdjson --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) . -# -# Please note that the image does not contain a copy of the code. However, the image will contain the -# the compiler and the build system. This means that if you change the source code, after you have built -# the image, you won't need to rebuild the image. In fact, unless you want to try a different compiler, you -# do not need to ever rebuild the image, even if you do a lot of work on the source code. -# -# We specify the users to avoid having files owned by a privileged user (root) in our directory. Some -# people like to run their machine as the "root" user. We do not think it is cool. -# -# Then you need to build the project: -# -# docker run -v $(pwd):/project:Z simdjson -# -# Should you change a source file, you may need to call this command again. Because the output -# files are persistent between calls to this command (they reside in the dockerbuild directory), -# this command can be fast. -# -# Next you can test it as follows: -# -# docker run -it -v $(pwd):/project:Z simdjson sh -c "cd dockerbuild && ctest . --output-on-failure -E checkperf" -# -# The run the complete tests requires you to have built all of simdjson. -# -# Building all of simdjson takes a long time. Instead, you can build just one target: -# -# docker run -it -v $(pwd):/project:Z simdjson sh -c "[ -d dockerbuild ] || mkdir dockerbuild && cd dockerbuild && cmake .. && cmake --build . --target parse" -# -# Note that it is safe to remove dockerbuild before call the previous command, as the repository gets rebuild. It is also possible, by changing the command, to use a different directory name. 
-# -# You can run performance tests: -# -# docker run -it --privileged -v $(pwd):/project:Z simdjson sh -c "cd dockerbuild && for i in ../jsonexamples/*.json; do echo \$i; ./benchmark/parse \$i; done" -# -# The "--privileged" is recommended so you can get performance counters under Linux. -# -# You can also grab a fresh copy of simdjson and rebuild it, to make comparisons: -# -# docker run -it -v $(pwd):/project:Z simdjson sh -c "git clone https://github.com/simdjson/simdjson.git && cd simdjson && mkdir build && cd build && cmake .. && cmake --build . --target parse " -# -# Then you can run comparisons: -# -# docker run -it --privileged -v $(pwd):/project:Z simdjson sh -c "for i in jsonexamples/*.json; do echo \$i; dockerbuild/benchmark/parse \$i| grep GB| head -n 1; simdjson/build/benchmark/parse \$i | grep GB |head -n 1; done" -# -#### -FROM ubuntu:20.10 -################ -# We would prefer to use the conan io images but they do not support 64-bit ARM? The small gcc images appear to -# be broken on ARM. -# Furthermore, we would not expect users to frequently rebuild the container, so using ubuntu is probably fine. -############### -ARG USER_ID -ARG GROUP_ID -RUN apt-get update -qq -RUN DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata -RUN apt-get install -y cmake g++ git -RUN mkdir project - -RUN addgroup --gid $GROUP_ID user; exit 0 -RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user; exit 0 -USER user -RUN gcc --version -WORKDIR /project - -CMD ["sh","-c","[ -d dockerbuild ] || mkdir dockerbuild && cd dockerbuild && cmake .. && cmake --build . "] diff --git a/Doxyfile b/Doxyfile index b6af5cd3da..0bf0f9996c 100644 --- a/Doxyfile +++ b/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = simdjson # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = "0.4.6" +PROJECT_NUMBER = "3.12.3" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -829,7 +829,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = doc include +INPUT = doc include/simdjson include/simdjson/dom include/simdjson/generic # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1246,7 +1246,10 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = + +HTML_EXTRA_STYLESHEET = theme/doxygen-awesome.css \ + theme/doxygen-awesome-sidebar-only.css \ + theme/doxygen-awesome-sidebar-only-darkmode-toggle.css # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1256,7 +1259,10 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = theme/doxygen-awesome-darkmode-toggle.js \ + theme/doxygen-awesome-interactive-toc.js \ + theme/doxygen-awesome-fragment-copy-button.js \ + theme/doxygen-awesome-paragraph-link.js # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1543,7 +1549,7 @@ DISABLE_INDEX = NO # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -GENERATE_TREEVIEW = NO +GENERATE_TREEVIEW = YES # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. 
diff --git a/HACKING.md b/HACKING.md index df2fe9e080..8ee62672f7 100644 --- a/HACKING.md +++ b/HACKING.md @@ -1,11 +1,71 @@ + Hacking simdjson ================ -Here is wisdom about how to build, test and run simdjson from within the repository. *Users* of -simdjson should use the released simdjson.h and simdjson.cpp files. +Here is wisdom about how to build, test and run simdjson from within the repository. This is mostly useful for people who plan to contribute simdjson, or maybe study the design. If you plan to contribute to simdjson, please read our [CONTRIBUTING](https://github.com/simdjson/simdjson/blob/master/CONTRIBUTING.md) guide. +- [Hacking simdjson](#hacking-simdjson) + - [Build Quickstart](#build-quickstart) + - [Design notes](#design-notes) + - [Developer mode](#developer-mode) + - [Directory Structure and Source](#directory-structure-and-source) + - [Runtime Dispatching](#runtime-dispatching) + - [Regenerating Single-Header Files](#regenerating-single-header-files) + - [Usage (CMake on 64-bit platforms like Linux, FreeBSD or macOS)](#usage-cmake-on-64-bit-platforms-like-linux-freebsd-or-macos) + - [Usage (CMake on 64-bit Windows using Visual Studio 2019 or better)](#usage-cmake-on-64-bit-windows-using-visual-studio-2019-or-better) + - [Various References](#various-references) + +Build Quickstart +------------------------------ + +```bash +mkdir build +cd build +cmake -D SIMDJSON_DEVELOPER_MODE=ON .. +cmake --build . +``` + +Design notes +------------------------------ + +The parser works in two stages: + +- Stage 1. (Find marks) Identifies quickly structure elements, strings, and so forth. We validate UTF-8 encoding at that stage. +- Stage 2. (Structure building) Involves constructing a "tree" of sort (materialized as a tape) to navigate through the data. Strings and numbers are parsed at this stage. + + +The role of stage 1 is to identify pseudo-structural characters as quickly as possible. A character is pseudo-structural if and only if: + +1. 
Not enclosed in quotes, AND +2. Is a non-whitespace character, AND +3. Its preceding character is either: + (a) a structural character, OR + (b) whitespace OR + (c) the final quote in a string. + +This helps as we redefine some new characters as pseudo-structural such as the characters 1, G, n in the following: + +> { "foo" : 1.5, "bar" : 1.5 GEOFF_IS_A_DUMMY bla bla , "baz", null } + +Stage 1 also does unicode validation. + +Stage 2 handles all of the rest: number parsings, recognizing atoms like true, false, null, and so forth. + +Developer mode +-------------- + +Build system targets that are only useful for developers of the simdjson +library are behind the `SIMDJSON_DEVELOPER_MODE` option. Enabling this option +makes tests, examples, benchmarks and other developer targets available. Not +enabling this option means that you are a consumer of simdjson and thus you +only get the library targets and options. + +Developer mode is forced to be on when the `CI` environment variable is set to +a value that CMake recognizes as "on", which is set to `true` in all of the CI +workflows used by simdjson. + Directory Structure and Source ------------------------------ @@ -13,69 +73,85 @@ simdjson's source structure, from the top level, looks like this: * **CMakeLists.txt:** The main build system. * **include:** User-facing declarations and inline definitions (most user-facing functions are inlined). - * simdjson.h: A "master include" that includes files from include/simdjson/. This is equivalent to + * simdjson.h: the `simdjson` namespace. A "main include" that includes files from include/simdjson/. This is equivalent to the distributed simdjson.h. - * simdjson/*.h: Declarations for public simdjson classes and functions. - * simdjson/inline/*.h: Definitions for public simdjson classes and functions. + * simdjson/*.h: Declarations for public simdjson classes and functions. + * simdjson/*-inl.h: Definitions for public simdjson classes and functions. 
+ * simdjson/internal/*.h: the `simdjson::internal` namespace. Private classes and functions used by the rest of simdjson. + * simdjson/dom.h: the `simdjson::dom` namespace. Includes all public DOM classes. + * simdjson/dom/*.h: Declarations/definitions for individual DOM classes. + * simdjson/arm64|fallback|haswell|icelake|ppc64|westmere.h: `simdjson::<implementation>` namespace. Common implementation-specific tools like number and string parsing, as well as minification. + * simdjson/arm64|fallback|haswell|icelake|ppc64|westmere/*.h: implementation-specific functions, etc. + * simdjson/generic/*.h: the bulk of the actual code, written generically and compiled for each implementation, using functions defined in the implementation's .h files. + * simdjson/generic/dependencies.h: dependencies on common, non-implementation-specific simdjson classes. This will be included before including amalgamated.h. + * simdjson/generic/amalgamated.h: all generic ondemand classes for an implementation. + * simdjson/ondemand.h: the `simdjson::ondemand` namespace. Includes all public ondemand classes. + * simdjson/builtin.h: the `simdjson::builtin` namespace. Aliased to the most universal implementation available. + * simdjson/builtin/ondemand.h: the `simdjson::builtin::ondemand` namespace. + * simdjson/arm64|fallback|haswell|icelake|ppc64|westmere/ondemand.h: the `simdjson::<implementation>::ondemand` namespace. On-Demand compiled for the specific implementation. + * simdjson/generic/ondemand/*.h: individual On-Demand classes, generically written. + * simdjson/generic/ondemand/dependencies.h: dependencies on common, non-implementation-specific simdjson classes. This will be included before including amalgamated.h. + * simdjson/generic/ondemand/amalgamated.h: all generic ondemand classes for an implementation. * **src:** The source files for non-inlined functionality (e.g. the architecture-specific parser implementations).
- * simdjson.cpp: A "master source" that includes all implementation files from src/. This is + * simdjson.cpp: A "main source" that includes all implementation files from src/. This is equivalent to the distributed simdjson.cpp. - * arm64/|fallback/|haswell/|westmere/: Architecture-specific implementations. All functions are - Each architecture defines its own namespace, e.g. simdjson::haswell. - * generic/: Generic implementations of the simdjson parser. These files may be included and - compiled multiple times, from whichever architectures use them. They assume they are already - enclosed in a namespace, e.g.: - ```c++ - namespace simdjson { - namespace haswell { - #include "generic/stage1/json_structural_indexer.h" - } - } - ``` + * *.cpp: other misc. implementations, such as `simdjson::implementation` and the minifier. + * arm64|fallback|haswell|icelake|ppc64|westmere.cpp: Architecture-specific parser implementations. + * generic/*.h: `simdjson::<implementation>` namespace. Generic implementation of the parser, particularly the `dom_parser_implementation`. + * generic/stage1/*.h: `simdjson::<implementation>::stage1` namespace. Generic implementation of the simd-heavy tokenizer/indexer pass of the simdjson parser. Used for the On-Demand interface + * generic/stage2/*.h: `simdjson::<implementation>::stage2` namespace. Generic implementation of the tape creator, which consumes the index from stage 1 and actually parses numbers and string and such. Used for the DOM interface. Other important files and directories: * **.drone.yml:** Definitions for Drone CI. * **.appveyor.yml:** Definitions for Appveyor CI (Windows). * **.circleci:** Definitions for Circle CI. -* **amalgamate.sh:** Generates singleheader/simdjson.h and singleheader/simdjson.cpp for release. +* **.github/workflows:** Definitions for GitHub Actions (CI). +* **singleheader:** Contains generated `simdjson.h` and `simdjson.cpp` that we release. The files `singleheader/simdjson.h` and `singleheader/simdjson.cpp` should never be edited by hand.
+* **singleheader/amalgamate.py:** Generates `singleheader/simdjson.h` and `singleheader/simdjson.cpp` for release (python script). * **benchmark:** This is where we do benchmarking. Benchmarking is core to every change we make; the cardinal rule is don't regress performance without knowing exactly why, and what you're trading - for it. Many of our benchmarks are microbenchmarks. We are effectively doing controlled scientific experiments for the purpose of understanding what affects our performance. So we simplify as much as possible. We try to avoid irrelevant factors such as page faults, interrupts, unnnecessary system calls. We recommend checking the performance as follows: + for it. Many of our benchmarks are microbenchmarks. We are effectively doing controlled scientific experiments for the purpose of understanding what affects our performance. So we simplify as much as possible. We try to avoid irrelevant factors such as page faults, interrupts, unnecessary system calls. We recommend checking the performance as follows: ```bash mkdir build cd build - cmake .. + cmake -D SIMDJSON_DEVELOPER_MODE=ON .. cmake --build . --config Release - benchmark/parse ../jsonexamples/twitter.json + benchmark/dom/parse ../jsonexamples/twitter.json ``` The last line becomes `./benchmark/Release/parse.exe ../jsonexample/twitter.json` under Windows. You may also use Google Benchmark: ```bash mkdir build cd build - cmake .. + cmake -D SIMDJSON_DEVELOPER_MODE=ON .. cmake --build . --target bench_parse_call --config Release ./benchmark/bench_parse_call ``` - The last line becomes `./benchmark/Release/bench_parse_call.exe` under Windows. Under Windows, you can also build with the clang compiler by adding `-T ClangCL` to the call to `cmake ..`: `cmake .. - TClangCL`. + The last line becomes `./benchmark/Release/bench_parse_call.exe` under Windows. Under Windows, you can also build with the clang compiler by adding `-T ClangCL` to the call to `cmake ..`: `cmake -T ClangCL ..`. 
* **fuzz:** The source for fuzz testing. This lets us explore important edge and middle cases automatically, and is run in CI. * **jsonchecker:** A set of JSON files used to check different functionality of the parser. * **pass*.json:** Files that should pass validation. * **fail*.json:** Files that should fail validation. + * **jsonchecker/minefield/y_*.json:** Files that should pass validation. + * **jsonchecker/minefield/n_*.json:** Files that should fail validation. * **jsonexamples:** A wide spread of useful, real-world JSON files with different characteristics and sizes. -* **singleheader:** Contains generated simdjson.h and simdjson.cpp that we release. * **test:** The tests are here. basictests.cpp and errortests.cpp are the primary ones. -* **tools:** Source for executables that can be distributed with simdjson +* **tools:** Source for executables that can be distributed with simdjson. Some examples: + * `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output. + * `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file `tape.md`. + * `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space characters. + * `jsonpointer mydoc.json ... ` parses the document, constructs a model and then processes a series of [JSON Pointer paths](https://tools.ietf.org/html/rfc6901). The result is itself a JSON document. + + > **Don't modify the files in singleheader/ directly; these are automatically generated.** -> -> While we distribute those files on release, we *maintain* the files under include/ and src/.
+ While simdjson distributes just two files from the singleheader/ directory, we *maintain* the code in -multiple files under include/ and src/. include/simdjson.h and src/simdjson.cpp are the "spine" for -these, and you can include +multiple files under include/ and src/. The files include/simdjson.h and src/simdjson.cpp are the "spine" for +these, and you can include them as if they were the corresponding singleheader/ files. @@ -112,7 +188,7 @@ processor. At this point, we are require to use one of two main strategies. -1. On POSIX systems, the main compilers (LLVM clang, GNU gcc) allow us to use any intrinsic function after including the header, but they fail to inline the resulting instruction if the target processor does not support them. Because we compile for a generic processor, we would not be able to use most intrinsic functions. Thankfully, more recent versions of these compilers allow us to flag a region of code with a specific target, so that we can compile only some of the code with support for advanced instructions. Thus in our C++, one might notice macros like `TARGET_HASWELL`. It is then our responsability, at runtime, to only run the regions of code (that we call kernels) matching the properties of the runtime processor. The benefit of this approach is that the compiler not only let us use intrinsic functions, but it can also optimize the rest of the code in the kernel with advanced instructions we enabled. +1. On POSIX systems, the main compilers (LLVM clang, GNU gcc) allow us to use any intrinsic function after including the header, but they fail to inline the resulting instruction if the target processor does not support them. Because we compile for a generic processor, we would not be able to use most intrinsic functions. Thankfully, more recent versions of these compilers allow us to flag a region of code with a specific target, so that we can compile only some of the code with support for advanced instructions. 
Thus in our C++, one might notice macros like `TARGET_HASWELL`. It is then our responsibility, at runtime, to only run the regions of code (that we call kernels) matching the properties of the runtime processor. The benefit of this approach is that the compiler not only let us use intrinsic functions, but it can also optimize the rest of the code in the kernel with advanced instructions we enabled. 2. Under Visual Studio, the problem is somewhat simpler. Visual Studio will not only provide the intrinsic functions, but it will also allow us to use them. They will compile just fine. It is at runtime that they may cause a crash. So we do not need to mark regions of code for compilation toward advanced processors (e.g., with `TARGET_HASWELL` macros). The downside of the Visual Studio approach is that the compiler is not allowed to use advanced instructions others than those we specify. In principle, this means that Visual Studio has weaker optimization opportunities. @@ -124,26 +200,29 @@ We also handle the special case where a user is compiling using LLVM clang under -Regenerating Single Headers From Master +Regenerating Single-Header Files --------------------------------------- -simdjson.h and simdjson.cpp are not always up to date in master. To ensure you have the latest copy, -you can regenerate them by running this at the top level: +The simdjson.h and simdjson.cpp files in the singleheader directory are not always up-to-date with the rest of the code; they are only ever +systematically regenerated on releases. To ensure you have the latest code, you can regenerate them by running this at the top level: ```bash mkdir build cd build -cmake .. +cmake -D SIMDJSON_DEVELOPER_MODE=ON .. +cmake --build . # needed, because currently dependencies do not work fully for the amalgamate target cmake --build . --target amalgamate ``` -The amalgamator is at `amalgamate.sh` at the top level. 
It generates singleheader/simdjson.h by +You need to have python3 installed on your system. + +The amalgamator script `amalgamate.py` generates singleheader/simdjson.h by reading through include/simdjson.h, copy/pasting each header file into the amalgamated file at the point it gets included (but only once per header). singleheader/simdjson.cpp is generated from src/simdjson.cpp the same way, except files under generic/ may be included and copy/pasted multiple times. -### Usage (CMake on 64-bit platforms like Linux, freeBSD or macOS) +## Usage (CMake on 64-bit platforms like Linux, FreeBSD or macOS) Requirements: In addition to git, we require a recent version of CMake as well as bash. @@ -156,7 +235,7 @@ brew install cmake apt-get update -qq apt-get install -y cmake ``` -3. On freeBSD, you might be able to install bash and CMake as follows: +3. On FreeBSD, you might be able to install bash and CMake as follows: ``` pkg update -f pkg install bash @@ -171,31 +250,31 @@ Building: While in the project repository, do the following: ``` mkdir build cd build -cmake .. +cmake -D SIMDJSON_DEVELOPER_MODE=ON .. cmake --build . ctest ``` -CMake will build a library. By default, it builds a shared library (e.g., libsimdjson.so on Linux). +CMake will build a library. By default, it builds a static library (e.g., libsimdjson.a on Linux). -You can build a static library: +You can build a shared library: ``` -mkdir buildstatic -cd buildstatic -cmake -DSIMDJSON_BUILD_STATIC=ON .. +mkdir buildshared +cd buildshared +cmake -D BUILD_SHARED_LIBS=ON -D SIMDJSON_DEVELOPER_MODE=ON .. cmake --build . ctest ``` -In some cases, you may want to specify your compiler, especially if the default compiler on your system is too old. You need to tell cmake which compiler you wish to use by setting the CC and CXX variables. Under bash, you can do so with commands such as `export CC=gcc-7` and `export CXX=g++-7`. You can also do it as part of the `cmake` command: `cmake .. -DCMAKE_CXX_COMPILER=g++`. 
You may proceed as follows: +In some cases, you may want to specify your compiler, especially if the default compiler on your system is too old. You need to tell cmake which compiler you wish to use by setting the CC and CXX variables. Under bash, you can do so with commands such as `export CC=gcc-7` and `export CXX=g++-7`. You can also do it as part of the `cmake` command: `cmake -DCMAKE_CXX_COMPILER=g++ ..`. You may proceed as follows: ``` brew install gcc@8 mkdir build cd build export CXX=g++-8 CC=gcc-8 -cmake .. +cmake -D SIMDJSON_DEVELOPER_MODE=ON .. cmake --build . ctest ``` @@ -204,17 +283,17 @@ If your compiler does not default on C++11 support or better you may get failing Note that the name of directory (`build`) is arbitrary, you can name it as you want (e.g., `buildgcc`) and you can have as many different such directories as you would like (one per configuration). +## Usage (CMake on 64-bit Windows using Visual Studio 2019 or better) +Recent versions of Visual Studio support CMake natively, [please refer to the Visual Studio documentation](https://learn.microsoft.com/en-us/cpp/build/cmake-projects-in-visual-studio?view=msvc-170). -### Usage (CMake on 64-bit Windows using Visual Studio) - -We assume you have a common 64-bit Windows PC with at least Visual Studio 2017 and an x64 processor with AVX2 support (2013 Intel Haswell or later) or SSE 4.2 + CLMUL (2010 Westmere or later). +We assume you have a common 64-bit Windows PC with at least Visual Studio 2019. - Grab the simdjson code from GitHub, e.g., by cloning it using [GitHub Desktop](https://desktop.github.com/). - Install [CMake](https://cmake.org/download/). When you install it, make sure to ask that `cmake` be made available from the command line. Please choose a recent version of cmake. - Create a subdirectory within simdjson, such as `build`. - Using a shell, go to this newly created directory. You can start a shell directly from GitHub Desktop (Repository > Open in Command Prompt). 
-- Type `cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..` in the shell while in the `build` repository. (Alternatively, if you want to build a DLL, you may use the command line `cmake -DCMAKE_GENERATOR_PLATFORM=x64 -DSIMDJSON_BUILD_STATIC=OFF ..`.) +- Type `cmake ..` in the shell while in the `build` repository. - This last command (`cmake ...`) created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`. @@ -223,391 +302,21 @@ Though having Visual Studio installed is necessary, one can build simdjson using - `mkdir build` - `cd build` - `cmake ..` -- `cmake --build . -config Release` +- `cmake --build . --config Release` Furthermore, if you have installed LLVM clang on Windows, for example as a component of Visual Studio 2019, you can configure and build simdjson using LLVM clang on Windows using cmake: - - `mkdir build` - `cd build` -- `cmake .. -T ClangCL` -- `cmake --build . -config Release` - - -### Usage (Using `vcpkg` on 64-bit Windows, Linux and macOS) - -[vcpkg](https://github.com/Microsoft/vcpkg) users on Windows, Linux and macOS can download and install `simdjson` with one single command from their favorite shell. - -On 64-bit Linux and macOS: - -``` -$ ./vcpkg install simdjson -``` - -will build and install `simdjson` as a static library. - -On Windows (64-bit): - -``` -.\vcpkg.exe install simdjson:x64-windows -``` - -will build and install `simdjson` as a shared library. - -``` -.\vcpkg.exe install simdjson:x64-windows-static -``` - -will build and install `simdjson` as a static library. - -These commands will also print out instructions on how to use the library from MSBuild or CMake-based projects. 
- -If you find the version of `simdjson` shipped with `vcpkg` is out-of-date, feel free to report it to -`vcpkg` community either by submitting an issue or by creating a PR. - -### Usage (Docker) - -One can run tests and benchmarks using docker. It especially makes sense under Linux. Privileged -access may be needed to get performance counters. - -``` -git clone https://github.com/simdjson/simdjson.git -cd simdjson -docker build -t simdjson . -docker run --privileged -t simdjson -``` - -## Architecture and Design Notes - -### Requirements - -- 64-bit platforms like Linux or macOS, as well as Windows through Visual Studio 2017 or later. -- Any 64-bit processor: - - AVX2 (i.e., Intel processors starting with the Haswell microarchitecture released 2013 and AMD - processors starting with the Zen microarchitecture released 2017), - - SSE 4.2 and CLMUL (i.e., Intel processors going back to Westmere released in 2010 or AMD - processors starting with the Jaguar used in the PS4 and XBox One), - - 64-bit ARM processor (ARMv8-A NEON): this covers a wide range of mobile processors, including - all Apple processors currently available for sale, going as far back as the iPhone 5s (2013). - - Any 64-bit processor (simdjson has a fallback generic 64-bit implementation that is still super - fast). -- A recent C++ compiler (e.g., GNU GCC or LLVM CLANG or Visual Studio 2017), we assume C++17. GNU - GCC 7 or better or LLVM's clang 6 or better. -- Some benchmark scripts assume bash and other common utilities, but they are optional. - -### Scope - -We provide a fast parser, that fully validates an input according to various specifications. -The parser builds a useful immutable (read-only) DOM (document-object model) which can be later accessed. - -To simplify the engineering, we make some assumptions. - -- We support UTF-8 (and thus ASCII), nothing else (no Latin, no UTF-16). 
We do not believe this is a - genuine limitation, because we do not think there is any serious application that needs to process - JSON data without an ASCII or UTF-8 encoding. If the UTF-8 contains a leading BOM, it should be - omitted: the user is responsible for detecting and skipping the BOM; UTF-8 BOMs are discouraged. -- All strings in the JSON document may have up to 4294967295 bytes in UTF-8 (4GB). To enforce this - constraint, we refuse to parse a document that contains more than 4294967295 bytes (4GB). This - should accommodate most JSON documents. -- As allowed by the specification, we allow repeated keys within an object (other parsers like - sajson do the same). -- [The simdjson library is fast for JSON documents spanning a few bytes up to many megabytes](https://github.com/lemire/simdjson/issues/312). - -_We do not aim to provide a general-purpose JSON library._ A library like RapidJSON offers much more -than just parsing, it helps you generate JSON and offers various other convenient functions. We -merely parse the document. This may change in the future. - -### Features - -- The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.) -- We parse integers and floating-point numbers as separate types which allows us to support large signed 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long` and large unsigned integers up to the value 18446744073709551615. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed or unsigned 64-bit value, we reject the JSON document. -- We support the full range of 64-bit floating-point numbers (binary64). 
The values range from ` std::numeric_limits::lowest()` to `std::numeric_limits::max()`, so from -1.7976e308 all the way to 1.7975e308. Extreme values (less or equal to -1e308, greater or equal to 1e308) are rejected: we refuse to parse the input document. -- We test for accurate float parsing with a perfect accuracy (ULP 0). Many parsers offer only approximate floating parsing. For example, RapidJSON also offers the option of accurate float parsing (`kParseFullPrecisionFlag`) but it comes at a significant performance penalty compared to the default settings. By default, RapidJSON tolerates an error of 3 ULP. -- The simdjson library does full UTF-8 validation as part of the parsing. Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation. The sajson parser does incomplete UTF-8 validation, accepting code point sequences like 0xb1 0x87. -- We fully validate the numbers. Parsers like gason and ultranjson will accept `[0e+]` as valid JSON. -- We validate string content for unescaped characters. Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings. -- We fully validate the white-space characters outside of the strings. Parsers like RapidJSON will accept JSON documents with null characters outside of strings. - -### Architecture - -The parser works in two stages: - -- Stage 1. (Find marks) Identifies quickly structure elements, strings, and so forth. We validate UTF-8 encoding at that stage. -- Stage 2. (Structure building) Involves constructing a "tree" of sort (materialized as a tape) to navigate through the data. Strings and numbers are parsed at this stage. - -### Remarks on JSON parsing - -- The JSON spec defines what a JSON parser is: - > A JSON parser transforms a JSON text into another representation. A JSON parser MUST accept all texts that conform to the JSON grammar. A JSON parser MAY accept non-JSON forms or extensions. An implementation may set limits on the size of texts that it accepts. 
An implementation may set limits on the maximum depth of nesting. An implementation may set limits on the range and precision of numbers. An implementation may set limits on the length and character contents of strings. - -* JSON is not JavaScript: - - > All JSON is Javascript but NOT all Javascript is JSON. So {property:1} is invalid because property does not have double quotes around it. {'property':1} is also invalid, because it's single quoted while the only thing that can placate the JSON specification is double quoting. JSON is even fussy enough that {"property":.1} is invalid too, because you should have of course written {"property":0.1}. Also, don't even think about having comments or semicolons, you guessed it: they're invalid. (credit:https://github.com/elzr/vim-json) - -* The structural characters are: +- `cmake -T ClangCL ..` +- `cmake --build . --config Release` - begin-array = [ left square bracket - begin-object = { left curly bracket - end-array = ] right square bracket - end-object = } right curly bracket - name-separator = : colon - value-separator = , comma +## Various References -### Pseudo-structural elements - -A character is pseudo-structural if and only if: - -1. Not enclosed in quotes, AND -2. Is a non-whitespace character, AND -3. Its preceding character is either: - (a) a structural character, OR - (b) whitespace. - -This helps as we redefine some new characters as pseudo-structural such as the characters 1, G, n in the following: - -> { "foo" : 1.5, "bar" : 1.5 GEOFF_IS_A_DUMMY bla bla , "baz", null } - - - -### UTF-8 validation (lookup2) - -The simdjson library relies on the lookup2 algorithm for UTF-8 validation on x64 platforms. - -This algorithm validates the length of multibyte characters (that each multibyte character has the right number of continuation characters, and that all continuation characters are part of a multibyte character).
- -#### Algorithm - -This algorithm compares *expected* continuation characters with *actual* continuation bytes, and emits an error anytime there is a mismatch. - -For example, in the string "𝄞₿֏ab", which has 4-, 3-, 2- and 1-byte -characters, the file will look like this: - -| Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | -|-----------------------|----|----|----|----|----|----|----|----|----|----|----| -| Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | -| Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | -| is_second_byte | | X | | | | X | | | X | | | -| is_third_byte | | | X | | | | X | | | | | -| is_fourth_byte | | | | X | | | | | | | | -| expected_continuation | | X | X | X | | X | X | | X | | | -| is_continuation | | X | X | X | | X | X | | X | | | - -The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): - -- **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not - part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just - floating around extra outside of any character, or that there is an illegal 5-byte character, - or maybe it's at the beginning of the file before any characters have started; but it's an - error in all these cases. -- **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means - we started a new character before we were finished with the current one. - -#### Getting the Previous Bytes - -Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte -character, we need to "shift the bytes" to find that out. This is what they mean: - -- `is_continuation`: if the current byte is a continuation. -- `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. -- `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. -- `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
- -We use shuffles to go n bytes back, selecting part of the current `input` and part of the -`prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller -function, because the 1-byte-back data is used by other checks as well. - -#### Getting the Continuation Mask - -Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as -numbers, using signed `<` and `>` operations to check if they are continuations or leads. -In fact, we treat the numbers as *signed*, partly because it helps us, and partly because -Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). - -In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," -respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. -Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. - -When treated as signed numbers, they look like this: - -| Type | High Bits | Binary Range | Signed | -|--------------|------------|--------------|--------| -| ASCII | `0` | `01111111` | 127 | -| | | `00000000` | 0 | -| 4+-Byte Lead | `1111` | `11111111` | -1 | -| | | `11110000` | -16 | -| 3-Byte Lead | `1110` | `11101111` | -17 | -| | | `11100000` | -32 | -| 2-Byte Lead | `110` | `11011111` | -33 | -| | | `11000000` | -64 | -| Continuation | `10` | `10111111` | -65 | -| | | `10000000` | -128 | - -This makes it pretty easy to get the continuation mask! It's just a single comparison: - -``` -is_continuation = input < -64 -``` - -We can do something similar for the others, but it takes two comparisons instead of one: "is -the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and -`> -64`. Surely we can do better, they're right next to each other! - -#### Getting the is_xxx Masks: Shifting the Range - -Notice *why* continuations were a single comparison.
The actual *range* would require two -comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get -that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be -just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. - -Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps -ASCII down into the negative, and puts 4+-Byte Lead at the top: - -| Type | High Bits | Binary Range | Signed | -|----------------------|------------|--------------|-------| -| 4+-Byte Lead (+ 128) | `0111` | `01111111` | 127 | -| | | `01110000` | 112 | -|----------------------|------------|--------------|-------| -| 3-Byte Lead (+ 128) | `0110` | `01101111` | 111 | -| | | `01100000` | 96 | -|----------------------|------------|--------------|-------| -| 2-Byte Lead (+ 128) | `010` | `01011111` | 95 | -| | | `01000000` | 64 | -|----------------------|------------|--------------|-------| -| Continuation (+ 128) | `00` | `00111111` | 63 | -| | | `00000000` | 0 | -|----------------------|------------|--------------|-------| -| ASCII (+ 128) | `1` | `11111111` | -1 | -| | | `10000000` | -128 | -|----------------------|------------|--------------|-------| - -*Now* we can use signed `>` on all of them: - -``` -prev1 = input.prev<1> -prev2 = input.prev<2> -prev3 = input.prev<3> -prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` -prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` -prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` -is_second_byte = prev1_flipped > 63; // 2+-byte lead -is_third_byte = prev2_flipped > 95; // 3+-byte lead -is_fourth_byte = prev3_flipped > 111; // 4+-byte lead -``` - -NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number -of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 -`^`'s at a time on Haswell,
but only 2 `+`'s). - -That doesn't look like it saved us any instructions, did it? Well, because we're adding the -same number to all of them, we can save one of those `+ 128` operations by assembling -`prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 -to it. One more instruction saved! - -``` -prev1 = input.prev<1> -prev3 = input.prev<3> -prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` -prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` -prev2_flipped = prev1_flipped.concat<2>(prev3_flipped); // | C -> ^ D, or -you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can -be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and -then add the results together. Same number of operations, but if the processor can run -independent things in parallel (which most can), it runs faster. - -This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have -a super nice advantage in that more of them can be run at the same time (they can run on 3 -ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, -saving us the cycle we would have earned by using +. Even more, using an instruction with a -wider array of ports can help *other* code run ahead, too, since these instructions can "get -out of the way," running on a port other instructions can't. - -#### Epilogue II: One More Trick - -There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay -for" the (prev<1> + 128) instruction, because it can be used to save an instruction in -check_special_cases()--but we'll talk about that there :) - - - - -## About the Project - -### Bindings and Ports of simdjson - -We distinguish between "bindings" (which just wrap the C++ code) and a port to another programming language (which reimplements everything).
- - -- [ZippyJSON](https://github.com/michaeleisel/zippyjson): Swift bindings for the simdjson project. -- [pysimdjson](https://github.com/TkTech/pysimdjson): Python bindings for the simdjson project. -- [simdjson-rs](https://github.com/Licenser/simdjson-rs): Rust port. -- [simdjson-rust](https://github.com/SunDoge/simdjson-rust): Rust wrapper (bindings). -- [SimdJsonSharp](https://github.com/EgorBo/SimdJsonSharp): C# version for .NET Core (bindings and full port). -- [simdjson_nodejs](https://github.com/luizperes/simdjson_nodejs): Node.js bindings for the simdjson project. -- [simdjson_php](https://github.com/crazyxman/simdjson_php): PHP bindings for the simdjson project. -- [simdjson_ruby](https://github.com/saka1/simdjson_ruby): Ruby bindings for the simdjson project. -- [simdjson-go](https://github.com/minio/simdjson-go): Go port using Golang assembly. -- [rcppsimdjson](https://github.com/eddelbuettel/rcppsimdjson): R bindings. - -### Tools - -- `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output. -- `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file `tape.md`. -- `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space characters. -- `jsonpointer mydoc.json ... ` parses the document, constructs a model and then processes a series of [JSON Pointer paths](https://tools.ietf.org/html/rfc6901). The result is itself a JSON document. 
- - -### Various References - -- [Google double-conv](https://github.com/google/double-conversion/) - [How to implement atoi using SIMD?](https://stackoverflow.com/questions/35127060/how-to-implement-atoi-using-simd) - [Parsing JSON is a Minefield 💣](http://seriot.ch/parsing_json.php) - https://tools.ietf.org/html/rfc7159 -- The Mison implementation in rust https://github.com/pikkr/pikkr - http://rapidjson.org/md_doc_sax.html - https://github.com/Geal/parser_benchmarks/tree/master/json - Gron: A command line tool that makes JSON greppable https://news.ycombinator.com/item?id=16727665 @@ -621,31 +330,3 @@ Inspiring links: - https://auth0.com/blog/beating-json-performance-with-protobuf/ - https://gist.github.com/shijuvar/25ad7de9505232c87034b8359543404a - https://github.com/frankmcsherry/blog/blob/master/posts/2018-02-11.md - -Validating UTF-8 takes no more than 0.7 cycles per byte: - -- https://github.com/lemire/fastvalidate-utf-8 https://lemire.me/blog/2018/05/16/validating-utf-8-strings-using-as-little-as-0-7-cycles-per-byte/ - -### Academic References - -- T.Mühlbauer, W.Rödiger, R.Seilbeck, A.Reiser, A.Kemper, and T.Neumann. Instant loading for main memory databases. PVLDB, 6(14):1702–1713, 2013. (SIMD-based CSV parsing) -- Mytkowicz, Todd, Madanlal Musuvathi, and Wolfram Schulte. "Data-parallel finite-state machines." ACM SIGARCH Computer Architecture News. Vol. 42. No. 1. ACM, 2014. -- Lu, Yifan, et al. "Tree structured data processing on GPUs." Cloud Computing, Data Science & Engineering-Confluence, 2017 7th International Conference on. IEEE, 2017. -- Sidhu, Reetinder. "High throughput, tree automata based XML processing using FPGAs." Field-Programmable Technology (FPT), 2013 International Conference on. IEEE, 2013. -- Dai, Zefu, Nick Ni, and Jianwen Zhu. "A 1 cycle-per-byte XML parsing accelerator." Proceedings of the 18th annual ACM/SIGDA international symposium on Field programmable gate arrays. ACM, 2010. -- Lin, Dan, et al. 
"Parabix: Boosting the efficiency of text processing on commodity processors." High Performance Computer Architecture (HPCA), 2012 IEEE 18th International Symposium on. IEEE, 2012. http://parabix.costar.sfu.ca/export/1783/docs/HPCA2012/final_ieee/final.pdf -- Deshmukh, V. M., and G. R. Bamnote. "An empirical evaluation of optimization parameters in XML parsing for performance enhancement." Computer, Communication and Control (IC4), 2015 International Conference on. IEEE, 2015. -- Moussalli, Roger, et al. "Efficient XML Path Filtering Using GPUs." ADMS@ VLDB. 2011. -- Jianliang, Ma, et al. "Parallel speculative dom-based XML parser." High Performance Computing and Communication & 2012 IEEE 9th International Conference on Embedded Software and Systems (HPCC-ICESS), 2012 IEEE 14th International Conference on. IEEE, 2012. -- Li, Y., Katsipoulakis, N.R., Chandramouli, B., Goldstein, J. and Kossmann, D., 2017. Mison: a fast JSON parser for data analytics. Proceedings of the VLDB Endowment, 10(10), pp.1118-1129. http://www.vldb.org/pvldb/vol10/p1118-li.pdf -- Cameron, Robert D., et al. "Parallel scanning with bitstream addition: An xml case study." European Conference on Parallel Processing. Springer, Berlin, Heidelberg, 2011. -- Cameron, Robert D., Kenneth S. Herdy, and Dan Lin. "High performance XML parsing using parallel bit stream technology." Proceedings of the 2008 conference of the center for advanced studies on collaborative research: meeting of minds. ACM, 2008. -- Shah, Bhavik, et al. "A data parallel algorithm for XML DOM parsing." International XML Database Symposium. Springer, Berlin, Heidelberg, 2009. -- Cameron, Robert D., and Dan Lin. "Architectural support for SWAR text processing with parallel bit streams: the inductive doubling principle." ACM Sigplan Notices. Vol. 44. No. 3. ACM, 2009. -- Amagasa, Toshiyuki, Mana Seino, and Hiroyuki Kitagawa. "Energy-Efficient XML Stream Processing through Element-Skipping Parsing." 
Database and Expert Systems Applications (DEXA), 2013 24th International Workshop on. IEEE, 2013. -- Medforth, Nigel Woodland. "icXML: Accelerating Xerces-C 3.1. 1 using the Parabix Framework." (2013). -- Zhang, Qiang Scott. Embedding Parallel Bit Stream Technology Into Expat. Diss. Simon Fraser University, 2010. -- Cameron, Robert D., et al. "Fast Regular Expression Matching with Bit-parallel Data Streams." -- Lin, Dan. Bits filter: a high-performance multiple string pattern matching algorithm for malware detection. Diss. School of Computing Science-Simon Fraser University, 2010. -- Yang, Shiyang. Validation of XML Document Based on Parallel Bit Stream Technology. Diss. Applied Sciences: School of Computing Science, 2013. -- N. Nakasato, "Implementation of a parallel tree method on a GPU", Journal of Computational Science, vol. 3, no. 3, pp. 132-141, 2012. diff --git a/LICENSE b/LICENSE index 45b1ea773b..57fc54cc1f 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2018-2019 The simdjson authors + Copyright 2018-2025 The simdjson authors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000000..86a11f2f65 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,18 @@ +Copyright 2018-2025 The simdjson authors + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/README.md b/README.md index 69424a02fb..87f2263c33 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,9 @@ -[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/simdjson.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&q=proj%3Asimdjson&can=2) -[![CirrusCI](https://api.cirrus-ci.com/github/simdjson/simdjson.svg)](https://cirrus-ci.com/github/simdjson/simdjson) -![Ubuntu 18.04 CI](https://github.com/simdjson/simdjson/workflows/Ubuntu%2018.04%20CI%20(GCC%207)/badge.svg) -[![Ubuntu 20.04 CI](https://github.com/simdjson/simdjson/workflows/Ubuntu%2020.04%20CI%20(GCC%209)/badge.svg)](https://simdjson.org/plots.html) -![VS16-CI](https://github.com/simdjson/simdjson/workflows/VS16-CI/badge.svg) -![MinGW64-CI](https://github.com/simdjson/simdjson/workflows/MinGW64-CI/badge.svg) -[![][license img]][license] [![Doxygen Documentation](https://img.shields.io/badge/docs-doxygen-green.svg)](https://simdjson.org/api/0.4.0/index.html) + +[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/simdjson.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:simdjson) +[![][license img]][license] [![][licensemit img]][licensemit] + + +[![Doxygen Documentation](https://img.shields.io/badge/docs-doxygen-green.svg)](https://simdjson.github.io/simdjson/) simdjson : Parsing gigabytes of JSON per second =============================================== @@ -12,55 +11,97 @@ simdjson : Parsing gigabytes of JSON per second JSON is everywhere on the Internet. Servers spend a *lot* of time parsing it. We need a fresh approach. The simdjson library uses commonly available SIMD instructions and microparallel algorithms -to parse JSON 2.5x faster than anything else out there. +to parse JSON 4x faster than RapidJSON and 25x faster than JSON for Modern C++. -* **Fast:** Over 2.5x faster than other production-grade JSON parsers. -* **Easy:** First-class, easy to use API. 
+* **Fast:** Over 4x faster than commonly used production-grade JSON parsers. +* **Record Breaking Features:** Minify JSON at 6 GB/s, validate UTF-8 at 13 GB/s, NDJSON at 3.5 GB/s. +* **Easy:** First-class, easy to use and carefully documented APIs. * **Strict:** Full JSON and UTF-8 validation, lossless parsing. Performance with no compromises. * **Automatic:** Selects a CPU-tailored parser at runtime. No configuration needed. * **Reliable:** From memory allocation to error handling, simdjson's design avoids surprises. +* **Peer Reviewed:** Our research appears in venues like VLDB Journal, Software: Practice and Experience. This library is part of the [Awesome Modern C++](https://awesomecpp.com) list. Table of Contents ----------------- +* [Real-world usage](#real-world-usage) * [Quick Start](#quick-start) * [Documentation](#documentation) +* [Godbolt](#godbolt) * [Performance results](#performance-results) -* [Real-world usage](#real-world-usage) +* [Packages](#packages) * [Bindings and Ports of simdjson](#bindings-and-ports-of-simdjson) * [About simdjson](#about-simdjson) * [Funding](#funding) * [Contributing to simdjson](#contributing-to-simdjson) * [License](#license) + +Real-world usage +---------------- + +- [Node.js](https://nodejs.org/) +- [ClickHouse](https://github.com/ClickHouse/ClickHouse) +- [Meta Velox](https://velox-lib.io) +- [Google Pax](https://github.com/google/paxml) +- [milvus](https://github.com/milvus-io/milvus) +- [QuestDB](https://questdb.io/blog/questdb-release-8-0-3/) +- [Clang Build Analyzer](https://github.com/aras-p/ClangBuildAnalyzer) +- [Shopify HeapProfiler](https://github.com/Shopify/heap-profiler) +- [StarRocks](https://github.com/StarRocks/starrocks) +- [Microsoft FishStore](https://github.com/microsoft/FishStore) +- [Intel PCM](https://github.com/intel/pcm) +- [WatermelonDB](https://github.com/Nozbe/WatermelonDB) +- [Apache Doris](https://github.com/apache/doris) +- [Dgraph](https://github.com/dgraph-io/dgraph) +- 
[UJRPC](https://github.com/unum-cloud/ujrpc) +- [fastgltf](https://github.com/spnda/fastgltf) +- [vast](https://github.com/tenzir/vast) +- [ada-url](https://github.com/ada-url/ada) +- [fastgron](https://github.com/adamritter/fastgron) +- [WasmEdge](https://wasmedge.org) +- [RonDB](https://github.com/logicalclocks/rondb) +- [GreptimeDB](https://github.com/GreptimeTeam/greptimedb) + + +If you are planning to use simdjson in a product, please work from one of our releases. + Quick Start ----------- - The simdjson library is easily consumable with a single .h and .cpp file. -0. Prerequisites: `g++` (version 7 or better) or `clang++` (version 6 or better), and a 64-bit system with a command-line shell (e.g., Linux, macOS, freeBSD). We also support programming environnements like Visual Studio and Xcode, but different steps are needed. -1. Pull [simdjson.h](singleheader/simdjson.h) and [simdjson.cpp](singleheader/simdjson.cpp) into a directory, along with the sample file [twitter.json](jsonexamples/twitter.json). +0. Prerequisites: `g++` (version 7 or better) or `clang++` (version 6 or better), and a 64-bit + system with a command-line shell (e.g., Linux, macOS, freeBSD). We also support programming + environments like Visual Studio and Xcode, but different steps are needed. Users of clang++ may need to specify the C++ version (e.g., `c++ -std=c++17`) since clang++ tends to default on C++98. +1. Pull [simdjson.h](singleheader/simdjson.h) and [simdjson.cpp](singleheader/simdjson.cpp) into a + directory, along with the sample file [twitter.json](jsonexamples/twitter.json). You can download them with the `wget` utility: + ``` wget https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.h https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.cpp https://raw.githubusercontent.com/simdjson/simdjson/master/jsonexamples/twitter.json ``` 2. 
Create `quickstart.cpp`: - ```c++ - #include "simdjson.h" - int main(void) { - simdjson::dom::parser parser; - simdjson::dom::element tweets = parser.load("twitter.json"); - std::cout << tweets["search_metadata"]["count"] << " results." << std::endl; - } - ``` +```c++ +#include +#include "simdjson.h" +using namespace simdjson; +int main(void) { + ondemand::parser parser; + padded_string json = padded_string::load("twitter.json"); + ondemand::document tweets = parser.iterate(json); + std::cout << uint64_t(tweets["search_metadata"]["count"]) << " results." << std::endl; +} +``` 3. `c++ -o quickstart quickstart.cpp simdjson.cpp` 4. `./quickstart` - ``` + + ``` 100 results. - ``` + ``` + Documentation ------------- @@ -71,67 +112,45 @@ Usage documentation is available: * [Performance](doc/performance.md) shows some more advanced scenarios and how to tune for them. * [Implementation Selection](doc/implementation-selection.md) describes runtime CPU detection and how you can work with it. -* [API](https://simdjson.org/api/0.4.0/annotated.html) contains the automatically generated API documentation. +* [API](https://simdjson.github.io/simdjson/) contains the automatically generated API documentation. + +Godbolt +------------- + +Some users may want to browse code along with the compiled assembly. You want to check out the following lists of examples: +* [simdjson examples with errors handled through exceptions](https://godbolt.org/z/7G5qE4sr9) +* [simdjson examples with errors without exceptions](https://godbolt.org/z/e9dWb9E4v) Performance results ------------------- -The simdjson library uses three-quarters less instructions than state-of-the-art parser [RapidJSON](https://rapidjson.org) and -fifty percent less than sajson. To our knowledge, simdjson is the first fully-validating JSON parser +The simdjson library uses three-quarters less instructions than state-of-the-art parser [RapidJSON](https://rapidjson.org). 
To our knowledge, simdjson is the first fully-validating JSON parser to run at [gigabytes per second](https://en.wikipedia.org/wiki/Gigabyte) (GB/s) on commodity processors. It can parse millions of JSON documents per second on a single core. The following figure represents parsing speed in GB/s for parsing various files -on an Intel Skylake processor (3.4 GHz) using the GNU GCC 9 compiler (with the -O3 flag). -We compare against the best and fastest C++ libraries. +on an Intel Skylake processor (3.4 GHz) using the GNU GCC 10 compiler (with the -O3 flag). +We compare against the best and fastest C++ libraries on benchmarks that load and process the data. The simdjson library offers full unicode ([UTF-8](https://en.wikipedia.org/wiki/UTF-8)) validation and exact -number parsing. The RapidJSON library is tested in two modes: fast and -exact number parsing. The sajson library offers fast (but not exact) -number parsing and partial unicode validation. In this data set, the file -sizes range from 65KB (github_events) all the way to 3.3GB (gsoc-2018). -Many files are mostly made of numbers: canada, mesh.pretty, mesh, random -and numbers: in such instances, we see lower JSON parsing speeds due to the -high cost of number parsing. The simdjson library uses exact number parsing which -is particular taxing. - - - -On a Skylake processor, the parsing speeds (in GB/s) of various processors on the twitter.json file are as follows, using again GNU GCC 9.1 (with the -O3 flag). The popular JSON for Modern C++ library is particularly slow: it obviously trades parsing speed for other desirable features. 
- -| parser | GB/s | -| ------------------------------------- | ---- | -| simdjson | 2.5 | -| RapidJSON UTF8-validation | 0.29 | -| RapidJSON UTF8-valid., exact numbers | 0.28 | -| RapidJSON insitu, UTF8-validation | 0.41 | -| RapidJSON insitu, UTF8-valid., exact | 0.39 | -| sajson (insitu, dynamic) | 0.62 | -| sajson (insitu, static) | 0.88 | -| dropbox | 0.13 | -| fastjson | 0.27 | -| gason | 0.59 | -| ultrajson | 0.34 | -| jsmn | 0.25 | -| cJSON | 0.31 | -| JSON for Modern C++ (nlohmann/json) | 0.11 | +number parsing. + The simdjson library offers high speed whether it processes tiny files (e.g., 300 bytes) or larger files (e.g., 3MB). The following plot presents parsing speed for [synthetic files over various sizes generated with a script](https://github.com/simdjson/simdjson_experiments_vldb2019/blob/master/experiments/growing/gen.py) on a 3.4 GHz Skylake processor (GNU GCC 9, -O3). - + [All our experiments are reproducible](https://github.com/simdjson/simdjson_experiments_vldb2019). -Real-world usage ----------------- -- [Microsoft FishStore](https://github.com/microsoft/FishStore) -- [Yandex ClickHouse](https://github.com/yandex/ClickHouse) -- [Clang Build Analyzer](https://github.com/aras-p/ClangBuildAnalyzer) -- [Shopify HeapProfiler](https://github.com/Shopify/heap-profiler) +For NDJSON files, we can exceed 3 GB/s with [our multithreaded parsing functions](https://github.com/simdjson/simdjson/blob/master/doc/parse_many.md). + + +Packages +------------------------------ +[![Packaging status](https://repology.org/badge/vertical-allrepos/simdjson.svg)](https://repology.org/project/simdjson/versions) -If you are planning to use simdjson in a product, please work from one of our releases. Bindings and Ports of simdjson ------------------------------ @@ -141,6 +160,7 @@ We distinguish between "bindings" (which just wrap the C++ code) and a port to a - [ZippyJSON](https://github.com/michaeleisel/zippyjson): Swift bindings for the simdjson project. 
- [libpy_simdjson](https://github.com/gerrymanoim/libpy_simdjson/): high-speed Python bindings for simdjson using [libpy](https://github.com/quantopian/libpy). - [pysimdjson](https://github.com/TkTech/pysimdjson): Python bindings for the simdjson project. +- [cysimdjson](https://github.com/TeskaLabs/cysimdjson): high-speed Python bindings for the simdjson project. - [simdjson-rs](https://github.com/simd-lite): Rust port. - [simdjson-rust](https://github.com/SunDoge/simdjson-rust): Rust wrapper (bindings). - [SimdJsonSharp](https://github.com/EgorBo/SimdJsonSharp): C# version for .NET Core (bindings and full port). @@ -150,6 +170,16 @@ We distinguish between "bindings" (which just wrap the C++ code) and a port to a - [fast_jsonparser](https://github.com/anilmaurya/fast_jsonparser): Ruby bindings for the simdjson project. - [simdjson-go](https://github.com/minio/simdjson-go): Go port using Golang assembly. - [rcppsimdjson](https://github.com/eddelbuettel/rcppsimdjson): R bindings. +- [simdjson_erlang](https://github.com/ChomperT/simdjson_erlang): erlang bindings. +- [simdjsone](https://github.com/saleyn/simdjsone): erlang bindings. +- [lua-simdjson](https://github.com/FourierTransformer/lua-simdjson): lua bindings. +- [hermes-json](https://hackage.haskell.org/package/hermes-json): haskell bindings. +- [zimdjson](https://github.com/EzequielRamis/zimdjson): Zig port. +- [simdjzon](https://github.com/travisstaloch/simdjzon): Zig port. +- [JSON-Simd](https://github.com/rawleyfowler/JSON-simd): Raku bindings. +- [JSON::SIMD](https://metacpan.org/pod/JSON::SIMD): Perl bindings; fully-featured JSON module that uses simdjson for decoding. +- [gemmaJSON](https://github.com/sainttttt/gemmaJSON): Nim JSON parser based on simdjson bindings. +- [simdjson-java](https://github.com/simdjson/simdjson-java): Java port. 
About simdjson -------------- @@ -158,25 +188,38 @@ The simdjson library takes advantage of modern microarchitectures, parallelizing instructions, reducing branch misprediction, and reducing data dependency to take advantage of each CPU's multiple execution cores. -Some people [enjoy reading our paper](https://arxiv.org/abs/1902.08318): A description of the design -and implementation of simdjson is in our research article: Geoff Langdale, Daniel -Lemire, [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318), VLDB Journal 28 (6), 2019. +Our default front-end is called On-Demand, and we wrote a paper about it: + +- John Keiser, Daniel Lemire, [On-Demand JSON: A Better Way to Parse Documents?](http://arxiv.org/abs/2312.17149), Software: Practice and Experience 54 (6), 2024. + +Some people [enjoy reading the first (2019) simdjson paper](https://arxiv.org/abs/1902.08318): A description of the design +and implementation of simdjson is in our research article: +- Geoff Langdale, Daniel Lemire, [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318), VLDB Journal 28 (6), 2019. + +We have an in-depth paper focused on the UTF-8 validation: + +- John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice & Experience 51 (5), 2021. We also have an informal [blog post providing some background and context](https://branchfree.org/2019/02/25/paper-parsing-gigabytes-of-json-per-second/). For the video inclined,
[![simdjson at QCon San Francisco 2019](http://img.youtube.com/vi/wlvKAT7SZIQ/0.jpg)](http://www.youtube.com/watch?v=wlvKAT7SZIQ)
-(it was the best voted talk, we're kinda proud of it). +(It was the best voted talk, we're kinda proud of it.) Funding ------- -The work is supported by the Natural Sciences and Engineering Research Council of Canada under grant -number RGPIN-2017-03910. +The work is supported by the Natural Sciences and Engineering Research Council of Canada under grants +RGPIN-2017-03910 and RGPIN-2024-03787. [license]: LICENSE [license img]: https://img.shields.io/badge/License-Apache%202-blue.svg + +[licensemit]: LICENSE-MIT +[licensemit img]: https://img.shields.io/badge/License-MIT-blue.svg + + Contributing to simdjson ------------------------ @@ -186,8 +229,12 @@ Head over to [CONTRIBUTING.md](CONTRIBUTING.md) for information on contributing License ------- -This code is made available under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0.html). +This code is made available under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0.html) as well as under the MIT License. As a user, you can pick the license you prefer. + +Under Windows, we build some tools using the windows/dirent_portable.h file (which is outside our library code): it is under the liberal (business-friendly) MIT license. + +For compilers that do not support [C++17](https://en.wikipedia.org/wiki/C%2B%2B17), we bundle the string-view library which is published under the [Boost license](http://www.boost.org/LICENSE_1_0.txt). Like the Apache license, the Boost license is a permissive license allowing commercial redistribution. -Under Windows, we build some tools using the windows/dirent_portable.h file (which is outside our library code): it under the liberal (business-friendly) MIT license. +For efficient number serialization, we bundle Florian Loitsch's implementation of the Grisu2 algorithm for binary to decimal floating-point numbers. The implementation was slightly modified by JSON for Modern C++ library. 
Both Florian Loitsch's implementation and JSON for Modern C++ are provided under the MIT license. -For compilers that do not support [C++17](https://en.wikipedia.org/wiki/C%2B%2B17), we bundle the string-view library which is published under the Boost license (http://www.boost.org/LICENSE_1_0.txt). Like the Apache license, the Boost license is a permissive license allowing commercial redistribution. +For runtime dispatching, we use some code from the PyTorch project licensed under 3-clause BSD. diff --git a/RELEASES.md b/RELEASES.md deleted file mode 100644 index 8cd014c31a..0000000000 --- a/RELEASES.md +++ /dev/null @@ -1,50 +0,0 @@ -# 0.3 - -## Highlights - -- Test coverage has been greatly improved and we have resolved many static-analysis warnings on different systems. -- We added a fast (8GB/s) minifier that works directly on JSON strings. -- We added fast (10GB/s) UTF-8 validator that works directly on strings (any strings, including non-JSON). -- The array and object elements have a constant-time size() method. -- Performance improvements to the API (type(), get<>()). -- The parse_many function (ndjson) has been entirely reworked. It now uses a single secondary thread instead of several new threads. -- We have introduced a faster UTF-8 validation algorithm (lookup3) for all kernels (ARM, x64 SSE, x64 AVX). -- C++11 support for older compilers and systems. -- FreeBSD support (and tests). -- We support the clang front-end compiler (clangcl) under Visual Studio. -- It is now possible to target ARM platforms under Visual Studio. -- The simdjson library will never abort or print to standard output/error. - -# 0.3 - -## Highlights - -- **Multi-Document Parsing:** Read a bundle of JSON documents (ndjson) 2-4x faster than doing it - individually. 
[API docs](https://github.com/simdjson/simdjson/blob/master/doc/basics.md#newline-delimited-json-ndjson-and-json-lines) / [Design Details](https://github.com/simdjson/simdjson/blob/master/doc/parse_many.md) -- **Simplified API:** The API has been completely revamped for ease of use, including a new JSON - navigation API and fluent support for error code *and* exception styles of error handling with a - single API. [Docs](https://github.com/simdjson/simdjson/blob/master/doc/basics.md#the-basics-loading-and-parsing-json-documents) -- **Exact Float Parsing:** Now simdjson parses floats flawlessly *without* any performance loss, - thanks to [great work by @michaeleisel and @lemire](https://github.com/simdjson/simdjson/pull/558). - [Blog Post](https://lemire.me/blog/2020/03/10/fast-float-parsing-in-practice/) -- **Even Faster:** The fastest parser got faster! With a [shiny new UTF-8 validator](https://github.com/simdjson/simdjson/pull/387) - and meticulously refactored SIMD core, simdjson 0.3 is 15% faster than before, running at 2.5 GB/s - (where 0.2 ran at 2.2 GB/s). - -## Minor Highlights - -- Fallback implementation: simdjson now has a non-SIMD fallback implementation, and can run even on - very old 64-bit machines. -- Automatic allocation: as part of API simplification, the parser no longer has to be preallocated-- - it will adjust automatically when it encounters larger files. -- Runtime selection API: We've exposed simdjson's runtime CPU detection and implementation selection - as an API, so you can tell what implementation we detected and test with other implementations. -- Error handling your way: Whether you use exceptions or check error codes, simdjson lets you handle - errors in your style. APIs that can fail return simdjson_result, letting you check the error - code before using the result. But if you are more comfortable with exceptions, skip the error code - and cast straight to T, and exceptions will be thrown automatically if an error happens. 
Use the - same API either way! -- Error chaining: We also worked to keep non-exception error-handling short and sweet. Instead of - having to check the error code after every single operation, now you can *chain* JSON navigation - calls like looking up an object field or array element, or casting to a string, so that you only - have to check the error code once at the very end. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..87c4c9b35e --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,7 @@ +# Security Policy + +## Reporting a Vulnerability + +Please use the following contact information for reporting a vulnerability: + +- [Daniel Lemire](https://github.com/lemire) - daniel@lemire.me diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 7ceee84ad9..92df9bbc8b 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,37 +1,34 @@ +add_subdirectory(dom) + + include_directories( . linux ) -link_libraries(simdjson simdjson-flags simdjson-windows-headers test-data) +link_libraries(simdjson-windows-headers test-data) +link_libraries(simdjson) + add_executable(benchfeatures benchfeatures.cpp) add_executable(get_corpus_benchmark get_corpus_benchmark.cpp) -add_executable(perfdiff perfdiff.cpp) -add_executable(parse parse.cpp) -add_executable(parse_stream parse_stream.cpp) -add_executable(statisticalmodel statisticalmodel.cpp) - -add_executable(parse_noutf8validation parse.cpp) -target_compile_definitions(parse_noutf8validation PRIVATE SIMDJSON_SKIPUTF8VALIDATION) -add_executable(parse_nonumberparsing parse.cpp) -target_compile_definitions(parse_nonumberparsing PRIVATE SIMDJSON_SKIPNUMBERPARSING) -add_executable(parse_nostringparsing parse.cpp) -target_compile_definitions(parse_nostringparsing PRIVATE SIMDJSON_SKIPSTRINGPARSING) if (TARGET benchmark::benchmark) link_libraries(benchmark::benchmark) add_executable(bench_parse_call bench_parse_call.cpp) add_executable(bench_dom_api bench_dom_api.cpp) + 
if(SIMDJSON_EXCEPTIONS) + add_executable(bench_ondemand bench_ondemand.cpp) + if(TARGET yyjson) + target_link_libraries(bench_ondemand PRIVATE yyjson) + endif() + if(TARGET rapidjson) + target_link_libraries(bench_ondemand PRIVATE rapidjson) + endif() + if(TARGET sajson) + target_link_libraries(bench_ondemand PRIVATE sajson) + endif() + if(TARGET nlohmann_json) + target_link_libraries(bench_ondemand PRIVATE nlohmann_json) + endif() + if(TARGET boostjson) + target_link_libraries(bench_ondemand PRIVATE boostjson) + endif() + endif() endif() -if (TARGET competition-all) - add_executable(distinctuseridcompetition distinctuseridcompetition.cpp) - target_link_libraries(distinctuseridcompetition competition-core) - add_executable(minifiercompetition minifiercompetition.cpp) - target_link_libraries(minifiercompetition competition-core) - add_executable(parseandstatcompetition parseandstatcompetition.cpp) - target_link_libraries(parseandstatcompetition competition-core) - add_executable(parsingcompetition parsingcompetition.cpp) - target_link_libraries(parsingcompetition competition-core) - add_executable(allparsingcompetition parsingcompetition.cpp) - target_link_libraries(allparsingcompetition competition-all) - target_compile_definitions(allparsingcompetition PRIVATE ALLPARSER) -endif() - -include(checkperf.cmake) diff --git a/benchmark/Dockerfile b/benchmark/Dockerfile deleted file mode 100644 index e26b3402d6..0000000000 --- a/benchmark/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -# From the ROOT, run: -# docker build -t simdjsonbench -f benchmark/Dockerfile . && docker run --privileged -t simdjsonbench -FROM gcc:8.3 - -# # Build latest -# ENV latest_release=v0.2.1 -# WORKDIR /usr/src/$latest_release/ -# RUN git clone --depth 1 https://github.com/lemire/simdjson/ -b $latest_release . -# RUN make parse - -# # Build master -# WORKDIR /usr/src/master/ -# RUN git clone --depth 1 https://github.com/lemire/simdjson/ . -# RUN make parse - -# Build the current source -COPY . 
/usr/src/current/ -WORKDIR /usr/src/current/ -RUN make checkperf \ No newline at end of file diff --git a/benchmark/amazon_cellphones/amazon_cellphones.h b/benchmark/amazon_cellphones/amazon_cellphones.h new file mode 100644 index 0000000000..4f7f03a966 --- /dev/null +++ b/benchmark/amazon_cellphones/amazon_cellphones.h @@ -0,0 +1,73 @@ +#pragma once + +#include "json_benchmark/file_runner.h" +#include +#include + + +namespace amazon_cellphones { + +const bool UNTHREADED = false; +const bool THREADED = true; + +using namespace json_benchmark; + +struct brand { + double cumulative_rating; + uint64_t reviews_count; + simdjson_inline bool operator==(const brand &other) const { + return cumulative_rating == other.cumulative_rating && + reviews_count == other.reviews_count; + } + simdjson_inline bool operator!=(const brand &other) const { return !(*this == other); } +}; + +simdjson_unused static std::ostream &operator<<(std::ostream &o, const brand &b) { + o << "cumulative_rating: " << b.cumulative_rating << std::endl; + o << "reviews_count: " << b.reviews_count << std::endl; + return o; +} + +template +simdjson_unused static std::ostream &operator<<(std::ostream &o, const std::pair &p) { + o << "brand: " << p.first << std::endl; + o << p.second; + return o; +} + +template +struct runner : public file_runner { + std::map result{}; + + bool setup(benchmark::State &state) { + return this->load_json(state, AMAZON_CELLPHONES_NDJSON); + } + + bool before_run(benchmark::State &state) { + if (!file_runner::before_run(state)) { return false; } + result.clear(); + return true; + } + + bool run(benchmark::State &) { + return this->implementation.run(this->json, result); + } + + template + bool diff(benchmark::State &state, runner &reference) { + return diff_results(state, result, reference.result, diff_flags::NONE); + } + + size_t items_per_iteration() { + return result.size(); + } +}; + +template +struct simdjson_dom; + +template simdjson_inline static void 
amazon_cellphones(benchmark::State &state) { + run_json_benchmark, runner>>(state); +} + +} // namespace amazon_cellphones diff --git a/benchmark/amazon_cellphones/simdjson_dom.h b/benchmark/amazon_cellphones/simdjson_dom.h new file mode 100644 index 0000000000..04a407d852 --- /dev/null +++ b/benchmark/amazon_cellphones/simdjson_dom.h @@ -0,0 +1,51 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "amazon_cellphones.h" + +namespace amazon_cellphones { + +using namespace simdjson; + +template +struct simdjson_dom { + using StringType = std::string; + + dom::parser parser{}; + + bool run(simdjson::padded_string &json, std::map &result) { +#ifdef SIMDJSON_THREADS_ENABLED + parser.threaded = threaded; +#endif + auto stream = parser.parse_many(json); + auto i = stream.begin(); + ++i; // Skip first line + for (;i != stream.end(); ++i) { + auto doc = *i; + StringType copy(std::string_view(doc.at(1))); + auto x = result.find(copy); + if (x == result.end()) { // If key not found, add new key + result.emplace(copy, amazon_cellphones::brand{ + double(doc.at(5)) * uint64_t(doc.at(7)), + uint64_t(doc.at(7)) + }); + } else { // Otherwise, update key data + x->second.cumulative_rating += double(doc.at(5)) * uint64_t(doc.at(7)); + x->second.reviews_count += uint64_t(doc.at(7)); + } + } + + return true; + } + +}; + +BENCHMARK_TEMPLATE(amazon_cellphones, simdjson_dom)->UseManualTime(); +#ifdef SIMDJSON_THREADS_ENABLED +BENCHMARK_TEMPLATE(amazon_cellphones, simdjson_dom)->UseManualTime(); +#endif + +} // namespace amazon_cellphones + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/amazon_cellphones/simdjson_ondemand.h b/benchmark/amazon_cellphones/simdjson_ondemand.h new file mode 100644 index 0000000000..f2006141f5 --- /dev/null +++ b/benchmark/amazon_cellphones/simdjson_ondemand.h @@ -0,0 +1,72 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "amazon_cellphones.h" + +namespace amazon_cellphones { + +using namespace simdjson; + 
+template +struct simdjson_ondemand { + using StringType = std::string; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, std::map &result) { +#ifdef SIMDJSON_THREADS_ENABLED + parser.threaded = threaded; +#endif + ondemand::document_stream stream = parser.iterate_many(json); + ondemand::document_stream::iterator i = stream.begin(); + ++i; // Skip first line + for (;i != stream.end(); ++i) { + auto doc = *i; + size_t index{0}; + StringType copy; + double rating; + uint64_t reviews; + for ( auto value : doc ) { + switch (index) + { + case 1: + copy = StringType(std::string_view(value)); + break; + case 5: + rating = double(value); + break; + case 7: + reviews = uint64_t(value); + break; + default: + break; + } + index++; + } + + auto x = result.find(copy); + if (x == result.end()) { // If key not found, add new key + result.emplace(copy, amazon_cellphones::brand{ + rating * reviews, + reviews + }); + } else { // Otherwise, update key data + x->second.cumulative_rating += rating * reviews; + x->second.reviews_count += reviews; + } + } + + return true; + } + +}; + +BENCHMARK_TEMPLATE(amazon_cellphones, simdjson_ondemand)->UseManualTime(); +#ifdef SIMDJSON_THREADS_ENABLED +BENCHMARK_TEMPLATE(amazon_cellphones, simdjson_ondemand)->UseManualTime(); +#endif + +} // namespace amazon_cellphones + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/apple/apple_arm_events.h b/benchmark/apple/apple_arm_events.h new file mode 100644 index 0000000000..501ef9981e --- /dev/null +++ b/benchmark/apple/apple_arm_events.h @@ -0,0 +1,1107 @@ + +// Original design from: +// ============================================================================= +// XNU kperf/kpc +// Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges +// +// References: +// +// XNU source (since xnu 2422.1.72): +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h +// 
https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c +// +// Lightweight PET (Profile Every Thread, since xnu 3789.1.32): +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c +// +// System Private frameworks (since macOS 10.11, iOS 8.0): +// /System/Library/PrivateFrameworks/kperf.framework +// /System/Library/PrivateFrameworks/kperfdata.framework +// +// Xcode framework (since Xcode 7.0): +// /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework +// +// CPU database (plist files) +// macOS (since macOS 10.11): +// /usr/share/kpep/.plist +// iOS (copied from Xcode, since iOS 10.0, Xcode 8.0): +// /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform +// /DeviceSupport//DeveloperDiskImage.dmg/usr/share/kpep/.plist +// +// +// Created by YaoYuan on 2021. +// Released into the public domain (unlicense.org). +// ============================================================================= + +#ifndef M1CYCLES_H +#define M1CYCLES_H + +#include +#include +#include +#include +#include + +#include // for dlopen() and dlsym() +#include // for mach_absolute_time() +#include // for kdebug trace decode +#include // for sysctl() +#include // for usleep() + +struct performance_counters { + double cycles; + double branches; + double missed_branches; + double instructions; + performance_counters(uint64_t c, uint64_t b, uint64_t m, uint64_t i) + : cycles(c), branches(b), missed_branches(m), instructions(i) {} + performance_counters(double c, double b, double m, double i) + : cycles(c), branches(b), missed_branches(m), instructions(i) {} + performance_counters(double init) + : cycles(init), branches(init), missed_branches(init), + instructions(init) {} + + inline performance_counters &operator-=(const performance_counters &other) { + cycles -= other.cycles; + branches -= other.branches; + missed_branches -= other.missed_branches; + 
instructions -= other.instructions; + return *this; + } + inline performance_counters &min(const performance_counters &other) { + cycles = other.cycles < cycles ? other.cycles : cycles; + branches = other.branches < branches ? other.branches : branches; + missed_branches = other.missed_branches < missed_branches + ? other.missed_branches + : missed_branches; + instructions = + other.instructions < instructions ? other.instructions : instructions; + return *this; + } + inline performance_counters &operator+=(const performance_counters &other) { + cycles += other.cycles; + branches += other.branches; + missed_branches += other.missed_branches; + instructions += other.instructions; + return *this; + } + + inline performance_counters &operator/=(double numerator) { + cycles /= numerator; + branches /= numerator; + missed_branches /= numerator; + instructions /= numerator; + return *this; + } +}; + +inline performance_counters operator-(const performance_counters &a, + const performance_counters &b) { + return performance_counters(a.cycles - b.cycles, a.branches - b.branches, + a.missed_branches - b.missed_branches, + a.instructions - b.instructions); +} + +typedef float f32; +typedef double f64; +typedef int8_t i8; +typedef uint8_t u8; +typedef int16_t i16; +typedef uint16_t u16; +typedef int32_t i32; +typedef uint32_t u32; +typedef int64_t i64; +typedef uint64_t u64; +typedef size_t usize; + +// ----------------------------------------------------------------------------- +// header (reverse engineered) +// This framework wraps some sysctl calls to communicate with the kpc in kernel. +// Most functions requires root privileges, or process is "blessed". +// ----------------------------------------------------------------------------- + +// Cross-platform class constants. +#define KPC_CLASS_FIXED (0) +#define KPC_CLASS_CONFIGURABLE (1) +#define KPC_CLASS_POWER (2) +#define KPC_CLASS_RAWPMU (3) + +// Cross-platform class mask constants. 
+#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) // 1 +#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2 +#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) // 4 +#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) // 8 + +// PMU version constants. +#define KPC_PMU_ERROR (0) // Error +#define KPC_PMU_INTEL_V3 (1) // Intel +#define KPC_PMU_ARM_APPLE (2) // ARM64 +#define KPC_PMU_INTEL_V2 (3) // Old Intel +#define KPC_PMU_ARM_V2 (4) // Old ARM + +// The maximum number of counters we could read from every class in one go. +// ARMV7: FIXED: 1, CONFIGURABLE: 4 +// ARM32: FIXED: 2, CONFIGURABLE: 6 +// ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8) +// x86: 32 +#define KPC_MAX_COUNTERS 32 + +// Bits for defining what to do on an action. +// Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h +#define KPERF_SAMPLER_TH_INFO (1U << 0) +#define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1) +#define KPERF_SAMPLER_KSTACK (1U << 2) +#define KPERF_SAMPLER_USTACK (1U << 3) +#define KPERF_SAMPLER_PMC_THREAD (1U << 4) +#define KPERF_SAMPLER_PMC_CPU (1U << 5) +#define KPERF_SAMPLER_PMC_CONFIG (1U << 6) +#define KPERF_SAMPLER_MEMINFO (1U << 7) +#define KPERF_SAMPLER_TH_SCHEDULING (1U << 8) +#define KPERF_SAMPLER_TH_DISPATCH (1U << 9) +#define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10) +#define KPERF_SAMPLER_SYS_MEM (1U << 11) +#define KPERF_SAMPLER_TH_INSCYC (1U << 12) +#define KPERF_SAMPLER_TK_INFO (1U << 13) + +// Maximum number of kperf action ids. +#define KPERF_ACTION_MAX (32) + +// Maximum number of kperf timer ids. +#define KPERF_TIMER_MAX (8) + +// x86/arm config registers are 64-bit +typedef u64 kpc_config_t; + +/// Print current CPU identification string to the buffer (same as snprintf), +/// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC +/// database in /usr/share/kpep. +/// @return string's length, or negative value if error occurs. +/// @note This method does not requires root privileges. 
+/// @details sysctl get(hw.cputype), get(hw.cpusubtype), +/// get(hw.cpufamily), get(machdep.cpu.model) +static int (*kpc_cpu_string)(char *buf, usize buf_size); + +/// Get the version of KPC that's being run. +/// @return See `PMU version constants` above. +/// @details sysctl get(kpc.pmu_version) +static u32 (*kpc_pmu_version)(void); + +/// Get running PMC classes. +/// @return See `class mask constants` above, +/// 0 if error occurs or no class is set. +/// @details sysctl get(kpc.counting) +static u32 (*kpc_get_counting)(void); + +/// Set PMC classes to enable counting. +/// @param classes See `class mask constants` above, set 0 to shutdown counting. +/// @return 0 for success. +/// @details sysctl set(kpc.counting) +static int (*kpc_set_counting)(u32 classes); + +/// Get running PMC classes for current thread. +/// @return See `class mask constants` above, +/// 0 if error occurs or no class is set. +/// @details sysctl get(kpc.thread_counting) +static u32 (*kpc_get_thread_counting)(void); + +/// Set PMC classes to enable counting for current thread. +/// @param classes See `class mask constants` above, set 0 to shutdown counting. +/// @return 0 for success. +/// @details sysctl set(kpc.thread_counting) +static int (*kpc_set_thread_counting)(u32 classes); + +/// Get how many config registers there are for a given mask. +/// For example: Intel may return 1 for `KPC_CLASS_FIXED_MASK`, +/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`. +/// @param classes See `class mask constants` above. +/// @return 0 if error occurs or no class is set. +/// @note This method does not require root privileges. +/// @details sysctl get(kpc.config_count) +static u32 (*kpc_get_config_count)(u32 classes); + +/// Get config registers. +/// @param classes see `class mask constants` above. +/// @param config Config buffer to receive values, should be no smaller than +/// kpc_get_config_count(classes) * sizeof(kpc_config_t). +/// @return 0 for success.
+/// @details sysctl get(kpc.config_count), get(kpc.config) +static int (*kpc_get_config)(u32 classes, kpc_config_t *config); + +/// Set config registers. +/// @param classes see `class mask constants` above. +/// @param config Config buffer, should be no smaller than +/// kpc_get_config_count(classes) * sizeof(kpc_config_t). +/// @return 0 for success. +/// @details sysctl get(kpc.config_count), set(kpc.config) +static int (*kpc_set_config)(u32 classes, kpc_config_t *config); + +/// Get how many counters there are for a given mask. +/// For example: Intel may return 3 for `KPC_CLASS_FIXED_MASK`, +/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`. +/// @param classes See `class mask constants` above. +/// @note This method does not require root privileges. +/// @details sysctl get(kpc.counter_count) +static u32 (*kpc_get_counter_count)(u32 classes); + +/// Get counter accumulations. +/// If `all_cpus` is true, the buffer count should be no smaller than +/// (cpu_count * counter_count). Otherwise, the buffer count should be no smaller +/// than (counter_count). +/// @see kpc_get_counter_count(), kpc_cpu_count(). +/// @param all_cpus true for all CPUs, false for current cpu. +/// @param classes See `class mask constants` above. +/// @param curcpu A pointer to receive current cpu id, can be NULL. +/// @param buf Buffer to receive counter's value. +/// @return 0 for success. +/// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters) +static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu, + u64 *buf); + +/// Get counter accumulations for current thread. +/// @param tid Thread id, should be 0. +/// @param buf_count The number of buf's elements (not bytes), +/// should be no smaller than kpc_get_counter_count(). +/// @param buf Buffer to receive counter's value. +/// @return 0 for success.
+/// @details sysctl get(kpc.thread_counters) +static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf); + +/// Acquire/release the counters used by the Power Manager. +/// @param val 1:acquire, 0:release +/// @return 0 for success. +/// @details sysctl set(kpc.force_all_ctrs) +static int (*kpc_force_all_ctrs_set)(int val); + +/// Get the state of all_ctrs. +/// @return 0 for success. +/// @details sysctl get(kpc.force_all_ctrs) +static int (*kpc_force_all_ctrs_get)(int *val_out); + +/// Set number of actions, should be `KPERF_ACTION_MAX`. +/// @details sysctl set(kperf.action.count) +static int (*kperf_action_count_set)(u32 count); + +/// Get number of actions. +/// @details sysctl get(kperf.action.count) +static int (*kperf_action_count_get)(u32 *count); + +/// Set what to sample when a trigger fires an action, e.g. +/// `KPERF_SAMPLER_PMC_CPU`. +/// @details sysctl set(kperf.action.samplers) +static int (*kperf_action_samplers_set)(u32 actionid, u32 sample); + +/// Get what to sample when a trigger fires an action. +/// @details sysctl get(kperf.action.samplers) +static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample); + +/// Apply a task filter to the action, -1 to disable filter. +/// @details sysctl set(kperf.action.filter_by_task) +static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port); + +/// Apply a pid filter to the action, -1 to disable filter. +/// @details sysctl set(kperf.action.filter_by_pid) +static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid); + +/// Set number of time triggers, should be `KPERF_TIMER_MAX`. +/// @details sysctl set(kperf.timer.count) +static int (*kperf_timer_count_set)(u32 count); + +/// Get number of time triggers. +/// @details sysctl get(kperf.timer.count) +static int (*kperf_timer_count_get)(u32 *count); + +/// Set timer number and period. 
+/// @details sysctl set(kperf.timer.period) +static int (*kperf_timer_period_set)(u32 actionid, u64 tick); + +/// Get timer number and period. +/// @details sysctl get(kperf.timer.period) +static int (*kperf_timer_period_get)(u32 actionid, u64 *tick); + +/// Set timer number and actionid. +/// @details sysctl set(kperf.timer.action) +static int (*kperf_timer_action_set)(u32 actionid, u32 timerid); + +/// Get timer number and actionid. +/// @details sysctl get(kperf.timer.action) +static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid); + +/// Set which timer ID does PET (Profile Every Thread). +/// @details sysctl set(kperf.timer.pet_timer) +static int (*kperf_timer_pet_set)(u32 timerid); + +/// Get which timer ID does PET (Profile Every Thread). +/// @details sysctl get(kperf.timer.pet_timer) +static int (*kperf_timer_pet_get)(u32 *timerid); + +/// Enable or disable sampling. +/// @details sysctl set(kperf.sampling) +static int (*kperf_sample_set)(u32 enabled); + +/// Get is currently sampling. +/// @details sysctl get(kperf.sampling) +static int (*kperf_sample_get)(u32 *enabled); + +/// Reset kperf: stop sampling, kdebug, timers and actions. +/// @return 0 for success. +static int (*kperf_reset)(void); + +/// Nanoseconds to CPU ticks. +static u64 (*kperf_ns_to_ticks)(u64 ns); + +/// CPU ticks to nanoseconds. +static u64 (*kperf_ticks_to_ns)(u64 ticks); + +/// CPU ticks frequency (mach_absolute_time). +static u64 (*kperf_tick_frequency)(void); + +/// Get lightweight PET mode (not in kperf.framework). +static int kperf_lightweight_pet_get(u32 *enabled) { + if (!enabled) + return -1; + usize size = 4; + return sysctlbyname("kperf.lightweight_pet", enabled, &size, NULL, 0); +} + +/// Set lightweight PET mode (not in kperf.framework). 
+static int kperf_lightweight_pet_set(u32 enabled) { + return sysctlbyname("kperf.lightweight_pet", NULL, NULL, &enabled, 4); +} + +// ----------------------------------------------------------------------------- +// header (reverse engineered) +// This framework provides some functions to access the local CPU database. +// These functions do not require root privileges. +// ----------------------------------------------------------------------------- + +// KPEP CPU archtecture constants. +#define KPEP_ARCH_I386 0 +#define KPEP_ARCH_X86_64 1 +#define KPEP_ARCH_ARM 2 +#define KPEP_ARCH_ARM64 3 + +/// KPEP event (size: 48/28 bytes on 64/32 bit OS) +typedef struct kpep_event { + const char *name; ///< Unique name of a event, such as "INST_RETIRED.ANY". + const char *description; ///< Description for this event. + const char *errata; ///< Errata, currently NULL. + const char *alias; ///< Alias name, such as "Instructions", "Cycles". + const char *fallback; ///< Fallback event name for fixed counter. + u32 mask; + u8 number; + u8 umask; + u8 reserved; + u8 is_fixed; +} kpep_event; + +/// KPEP database (size: 144/80 bytes on 64/32 bit OS) +typedef struct kpep_db { + const char *name; ///< Database name, such as "haswell". + const char *cpu_id; ///< Plist name, such as "cpu_7_8_10b282dc". + const char *marketing_name; ///< Marketing name, such as "Intel Haswell". + void *plist_data; ///< Plist data (CFDataRef), currently NULL. + void *event_map; ///< All events (CFDict). + kpep_event + *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count). + kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *) + ///< * fixed_counter_count) + void *alias_map; ///< All aliases (CFDict). + usize reserved_1; + usize reserved_2; + usize reserved_3; + usize event_count; ///< All events count. 
+ usize alias_count; + usize fixed_counter_count; + usize config_counter_count; + usize power_counter_count; + u32 archtecture; ///< see `KPEP CPU archtecture constants` above. + u32 fixed_counter_bits; + u32 config_counter_bits; + u32 power_counter_bits; +} kpep_db; + +/// KPEP config (size: 80/44 bytes on 64/32 bit OS) +typedef struct kpep_config { + kpep_db *db; + kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL + usize *ev_map; ///< (sizeof(usize *) * counter_count), init 0 + usize *ev_idx; ///< (sizeof(usize *) * counter_count), init -1 + u32 *flags; ///< (sizeof(u32 *) * counter_count), init 0 + u64 *kpc_periods; ///< (sizeof(u64 *) * counter_count), init 0 + usize event_count; /// kpep_config_events_count() + usize counter_count; + u32 classes; ///< See `class mask constants` above. + u32 config_counter; + u32 power_counter; + u32 reserved; +} kpep_config; + +/// Error code for kpep_config_xxx() and kpep_db_xxx() functions. +typedef enum { + KPEP_CONFIG_ERROR_NONE = 0, + KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1, + KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2, + KPEP_CONFIG_ERROR_IO = 3, + KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4, + KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5, + KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6, + KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7, + KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8, + KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9, + KPEP_CONFIG_ERROR_DB_CORRUPT = 10, + KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11, + KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12, + KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13, + KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14, + KPEP_CONFIG_ERROR_ERRNO = 15, + KPEP_CONFIG_ERROR_MAX +} kpep_config_error_code; + +/// Error description for kpep_config_error_code. 
+static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = { + "none", + "invalid argument", + "out of memory", + "I/O", + "buffer too small", + "current system unknown", + "database path invalid", + "database not found", + "database architecture unsupported", + "database version unsupported", + "database corrupt", + "event not found", + "conflicting events", + "all counters must be forced", + "event unavailable", + "check errno"}; + +/// Error description. +static const char *kpep_config_error_desc(int code) { + if (0 <= code && code < KPEP_CONFIG_ERROR_MAX) { + return kpep_config_error_names[code]; + } + return "unknown error"; +} + +/// Create a config. +/// @param db A kpep db, see kpep_db_create() +/// @param cfg_ptr A pointer to receive the new config. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr); + +/// Free the config. +static void (*kpep_config_free)(kpep_config *cfg); + +/// Add an event to config. +/// @param cfg The config. +/// @param ev_ptr A event pointer. +/// @param flag 0: all, 1: user space only +/// @param err Error bitmap pointer, can be NULL. +/// If return value is `CONFLICTING_EVENTS`, this bitmap contains +/// the conflicted event indices, e.g. "1 << 2" means index 2. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr, + u32 flag, u32 *err); + +/// Remove event at index. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx); + +/// Force all counters. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_force_counters)(kpep_config *cfg); + +/// Get events count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr); + +/// Get all event pointers. +/// @param buf A buffer to receive event pointers. 
+/// @param buf_size The buffer's size in bytes, should not smaller than +/// kpep_config_events_count() * sizeof(void *). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf, + usize buf_size); + +/// Get kpc register configs. +/// @param buf A buffer to receive kpc register configs. +/// @param buf_size The buffer's size in bytes, should not smaller than +/// kpep_config_kpc_count() * sizeof(kpc_config_t). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf, + usize buf_size); + +/// Get kpc register config count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr); + +/// Get kpc classes. +/// @param classes See `class mask constants` above. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr); + +/// Get the index mapping from event to counter. +/// @param buf A buffer to receive indexes. +/// @param buf_size The buffer's size in bytes, should not smaller than +/// kpep_config_events_count() * sizeof(kpc_config_t). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size); + +/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/". +/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8". +/// Pass NULL for current CPU. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_create)(const char *name, kpep_db **db_ptr); + +/// Free the kpep database. +static void (*kpep_db_free)(kpep_db *db); + +/// Get the database's name. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_name)(kpep_db *db, const char **name); + +/// Get the event alias count. +/// @return kpep_config_error_code, 0 for success. 
+static int (*kpep_db_aliases_count)(kpep_db *db, usize *count); + +/// Get all alias. +/// @param buf A buffer to receive all alias strings. +/// @param buf_size The buffer's size in bytes, +/// should not smaller than kpep_db_aliases_count() * sizeof(void *). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size); + +/// Get counters count for given classes. +/// @param classes 1: Fixed, 2: Configurable. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count); + +/// Get all event count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_events_count)(kpep_db *db, usize *count); + +/// Get all events. +/// @param buf A buffer to receive all event pointers. +/// @param buf_size The buffer's size in bytes, +/// should not smaller than kpep_db_events_count() * sizeof(void *). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size); + +/// Get one event by name. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr); + +/// Get event's name. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr); + +/// Get event's alias. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr); + +/// Get event's description. +/// @return kpep_config_error_code, 0 for success. 
+static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr); + +// ----------------------------------------------------------------------------- +// load kperf/kperfdata dynamic library +// ----------------------------------------------------------------------------- + +typedef struct { + const char *name; + void **impl; +} lib_symbol; + +#define lib_nelems(x) (sizeof(x) / sizeof((x)[0])) +#define lib_symbol_def(name) \ + { \ +#name, (void **)&name \ + } + +static const lib_symbol lib_symbols_kperf[] = { + lib_symbol_def(kpc_pmu_version), + lib_symbol_def(kpc_cpu_string), + lib_symbol_def(kpc_set_counting), + lib_symbol_def(kpc_get_counting), + lib_symbol_def(kpc_set_thread_counting), + lib_symbol_def(kpc_get_thread_counting), + lib_symbol_def(kpc_get_config_count), + lib_symbol_def(kpc_get_counter_count), + lib_symbol_def(kpc_set_config), + lib_symbol_def(kpc_get_config), + lib_symbol_def(kpc_get_cpu_counters), + lib_symbol_def(kpc_get_thread_counters), + lib_symbol_def(kpc_force_all_ctrs_set), + lib_symbol_def(kpc_force_all_ctrs_get), + lib_symbol_def(kperf_action_count_set), + lib_symbol_def(kperf_action_count_get), + lib_symbol_def(kperf_action_samplers_set), + lib_symbol_def(kperf_action_samplers_get), + lib_symbol_def(kperf_action_filter_set_by_task), + lib_symbol_def(kperf_action_filter_set_by_pid), + lib_symbol_def(kperf_timer_count_set), + lib_symbol_def(kperf_timer_count_get), + lib_symbol_def(kperf_timer_period_set), + lib_symbol_def(kperf_timer_period_get), + lib_symbol_def(kperf_timer_action_set), + lib_symbol_def(kperf_timer_action_get), + lib_symbol_def(kperf_sample_set), + lib_symbol_def(kperf_sample_get), + lib_symbol_def(kperf_reset), + lib_symbol_def(kperf_timer_pet_set), + lib_symbol_def(kperf_timer_pet_get), + lib_symbol_def(kperf_ns_to_ticks), + lib_symbol_def(kperf_ticks_to_ns), + lib_symbol_def(kperf_tick_frequency), +}; + +static const lib_symbol lib_symbols_kperfdata[] = { + lib_symbol_def(kpep_config_create), + 
lib_symbol_def(kpep_config_free), + lib_symbol_def(kpep_config_add_event), + lib_symbol_def(kpep_config_remove_event), + lib_symbol_def(kpep_config_force_counters), + lib_symbol_def(kpep_config_events_count), + lib_symbol_def(kpep_config_events), + lib_symbol_def(kpep_config_kpc), + lib_symbol_def(kpep_config_kpc_count), + lib_symbol_def(kpep_config_kpc_classes), + lib_symbol_def(kpep_config_kpc_map), + lib_symbol_def(kpep_db_create), + lib_symbol_def(kpep_db_free), + lib_symbol_def(kpep_db_name), + lib_symbol_def(kpep_db_aliases_count), + lib_symbol_def(kpep_db_aliases), + lib_symbol_def(kpep_db_counters_count), + lib_symbol_def(kpep_db_events_count), + lib_symbol_def(kpep_db_events), + lib_symbol_def(kpep_db_event), + lib_symbol_def(kpep_event_name), + lib_symbol_def(kpep_event_alias), + lib_symbol_def(kpep_event_description), +}; + +#define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf" +#define lib_path_kperfdata \ + "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata" + +static bool lib_inited = false; +static bool lib_has_err = false; +static char lib_err_msg[256]; + +static void *lib_handle_kperf = NULL; +static void *lib_handle_kperfdata = NULL; + +static void lib_deinit(void) { + lib_inited = false; + lib_has_err = false; + if (lib_handle_kperf) + dlclose(lib_handle_kperf); + if (lib_handle_kperfdata) + dlclose(lib_handle_kperfdata); + lib_handle_kperf = NULL; + lib_handle_kperfdata = NULL; + for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { + const lib_symbol *symbol = &lib_symbols_kperf[i]; + *symbol->impl = NULL; + } + for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { + const lib_symbol *symbol = &lib_symbols_kperfdata[i]; + *symbol->impl = NULL; + } +} + +static bool lib_init(void) { +#define return_err() \ + do { \ + lib_deinit(); \ + lib_inited = true; \ + lib_has_err = true; \ + return false; \ + } while (false) + + if (lib_inited) + return !lib_has_err; + + // load dynamic library + 
lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY); + if (!lib_handle_kperf) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperf.framework, message: %s.", dlerror()); + return_err(); + } + lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY); + if (!lib_handle_kperfdata) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperfdata.framework, message: %s.", dlerror()); + return_err(); + } + + // load symbol address from dynamic library + for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { + const lib_symbol *symbol = &lib_symbols_kperf[i]; + *symbol->impl = dlsym(lib_handle_kperf, symbol->name); + if (!*symbol->impl) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperf function: %s.", symbol->name); + return_err(); + } + } + for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { + const lib_symbol *symbol = &lib_symbols_kperfdata[i]; + *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name); + if (!*symbol->impl) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperfdata function: %s.", symbol->name); + return_err(); + } + } + + lib_inited = true; + lib_has_err = false; + return true; + +#undef return_err +} + +// ----------------------------------------------------------------------------- +// kdebug private structs +// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h +// ----------------------------------------------------------------------------- + +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. 
+ */ +#if defined(__arm64__) +typedef uint64_t kd_buf_argtype; +#else +typedef uintptr_t kd_buf_argtype; +#endif + +typedef struct { + uint64_t timestamp; + kd_buf_argtype arg1; + kd_buf_argtype arg2; + kd_buf_argtype arg3; + kd_buf_argtype arg4; + kd_buf_argtype arg5; /* the thread ID */ + uint32_t debugid; /* see */ + +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. + */ +#if defined(__LP64__) || defined(__arm64__) + uint32_t cpuid; /* cpu index, from 0 */ + kd_buf_argtype unused; +#endif +} kd_buf; + +/* bits for the type field of kd_regtype */ +#define KDBG_CLASSTYPE 0x10000 +#define KDBG_SUBCLSTYPE 0x20000 +#define KDBG_RANGETYPE 0x40000 +#define KDBG_TYPENONE 0x80000 +#define KDBG_CKTYPES 0xF0000 + +/* only trace at most 4 types of events, at the code granularity */ +#define KDBG_VALCHECK 0x00200000U + +typedef struct { + unsigned int type; + unsigned int value1; + unsigned int value2; + unsigned int value3; + unsigned int value4; +} kd_regtype; + +typedef struct { + /* number of events that can fit in the buffers */ + int nkdbufs; + /* set if trace is disabled */ + int nolog; + /* kd_ctrl_page.flags */ + unsigned int flags; + /* number of threads in thread map */ + int nkdthreads; + /* the owning pid */ + int bufid; +} kbufinfo_t; + +// ----------------------------------------------------------------------------- +// kdebug utils +// ----------------------------------------------------------------------------- + +/// Clean up trace buffers and reset ktrace/kdebug/kperf. +/// @return 0 on success. +static int kdebug_reset(void) { + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE}; + return sysctl(mib, 3, NULL, NULL, NULL, 0); +} + +/// Disable and reinitialize the trace buffers. +/// @return 0 on success. +static int kdebug_reinit(void) { + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETUP}; + return sysctl(mib, 3, NULL, NULL, NULL, 0); +} + +/// Set debug filter. 
+static int kdebug_setreg(kd_regtype *kdr) { + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETREG}; + usize size = sizeof(kd_regtype); + return sysctl(mib, 3, kdr, &size, NULL, 0); +} + +/// Set maximum number of trace entries (kd_buf). +/// Only allow allocation up to half the available memory (sane_size). +/// @return 0 on success. +static int kdebug_trace_setbuf(int nbufs) { + int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, nbufs}; + return sysctl(mib, 4, NULL, NULL, NULL, 0); +} + +/// Enable or disable kdebug trace. +/// Trace buffer must already be initialized. +/// @return 0 on success. +static int kdebug_trace_enable(bool enable) { + int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, enable}; + return sysctl(mib, 4, NULL, 0, NULL, 0); +} + +/// Retrieve trace buffer information from kernel. +/// @return 0 on success. +static int kdebug_get_bufinfo(kbufinfo_t *info) { + if (!info) + return -1; + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDGETBUF}; + size_t needed = sizeof(kbufinfo_t); + return sysctl(mib, 3, info, &needed, NULL, 0); +} + +/// Retrieve trace buffers from kernel. +/// @param buf Memory to receive buffer data, array of `kd_buf`. +/// @param len Length of `buf` in bytes. +/// @param count Number of trace entries (kd_buf) obtained. +/// @return 0 on success. +static int kdebug_trace_read(void *buf, usize len, usize *count) { + if (count) + *count = 0; + if (!buf || !len) + return -1; + + // Note: the input and output units are not the same. + // input: bytes + // output: number of kd_buf + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREADTR}; + int ret = sysctl(mib, 3, buf, &len, NULL, 0); + if (ret != 0) + return ret; + *count = len; + return 0; +} + +/// Block until there are new buffers filled or `timeout_ms` have passed. +/// @param timeout_ms timeout milliseconds, 0 means wait forever. +/// @param suc set true if new buffers filled. +/// @return 0 on success. 
+static int kdebug_wait(usize timeout_ms, bool *suc) { + if (timeout_ms == 0) + return -1; + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDBUFWAIT}; + usize val = timeout_ms; + int ret = sysctl(mib, 3, NULL, &val, NULL, 0); + if (suc) + *suc = !!val; + return ret; +} + +// ----------------------------------------------------------------------------- +// Demo +// ----------------------------------------------------------------------------- + +#define EVENT_NAME_MAX 8 +typedef struct { + const char *alias; /// name for print + const char *names[EVENT_NAME_MAX]; /// name from pmc db +} event_alias; + +/// Event names from /usr/share/kpep/.plist +static const event_alias profile_events[] = { + {"cycles", + { + "FIXED_CYCLES", // Apple A7-A15 + "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th + "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom + }}, + {"instructions", + { + "FIXED_INSTRUCTIONS", // Apple A7-A15 + "INST_RETIRED.ANY" // Intel Yonah, Merom, Core 1th-10th + }}, + {"branches", + { + "INST_BRANCH", // Apple A7-A15 + "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1th-10th + "INST_RETIRED.ANY", // Intel Yonah, Merom + }}, + {"branch-misses", + { + "BRANCH_MISPRED_NONSPEC", // Apple A7-A15, since iOS 15, macOS 12 + "BRANCH_MISPREDICT", // Apple A7-A14 + "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2th-10th + "BR_INST_RETIRED.MISPRED", // Intel Yonah, Merom + }}, +}; + +static kpep_event *get_event(kpep_db *db, const event_alias *alias) { + for (usize j = 0; j < EVENT_NAME_MAX; j++) { + const char *name = alias->names[j]; + if (!name) + break; + kpep_event *ev = NULL; + if (kpep_db_event(db, name, &ev) == 0) { + return ev; + } + } + return NULL; +} + +struct AppleEvents { + kpc_config_t regs[KPC_MAX_COUNTERS] = {0}; + usize counter_map[KPC_MAX_COUNTERS] = {0}; + u64 counters_0[KPC_MAX_COUNTERS] = {0}; + u64 counters_1[KPC_MAX_COUNTERS] = {0}; + static constexpr usize ev_count = + sizeof(profile_events) / sizeof(profile_events[0]); + bool init = false; + bool worked = 
false; + inline bool setup_performance_counters() { + if (init) { + return worked; + } + init = true; + + // load dylib + if (!lib_init()) { + printf("Error: %s\n", lib_err_msg); + return (worked = false); + } + + // check permission + int force_ctrs = 0; + if (kpc_force_all_ctrs_get(&force_ctrs)) { + return (worked = false); + } + int ret; + // load pmc db + kpep_db *db = NULL; + if ((ret = kpep_db_create(NULL, &db))) { + printf("Error: cannot load pmc database: %d.\n", ret); + return (worked = false); + } + + // create a config + kpep_config *cfg = NULL; + if ((ret = kpep_config_create(db, &cfg))) { + printf("Failed to create kpep config: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_force_counters(cfg))) { + printf("Failed to force counters: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + + // get events + kpep_event *ev_arr[ev_count] = {0}; + for (usize i = 0; i < ev_count; i++) { + const event_alias *alias = profile_events + i; + ev_arr[i] = get_event(db, alias); + if (!ev_arr[i]) { + printf("Cannot find event: %s.\n", alias->alias); + return (worked = false); + } + } + + // add event to config + for (usize i = 0; i < ev_count; i++) { + kpep_event *ev = ev_arr[i]; + if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) { + printf("Failed to add event: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + } + + // prepare buffer and config + u32 classes = 0; + usize reg_count = 0; + if ((ret = kpep_config_kpc_classes(cfg, &classes))) { + printf("Failed get kpc classes: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_kpc_count(cfg, ®_count))) { + printf("Failed get kpc count: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) { + printf("Failed get kpc map: %d (%s).\n", ret, + 
kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) { + printf("Failed get kpc registers: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + + // set config to kernel + if ((ret = kpc_force_all_ctrs_set(1))) { + printf("Failed force all ctrs: %d.\n", ret); + return (worked = false); + } + if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) { + if ((ret = kpc_set_config(classes, regs))) { + printf("Failed set kpc config: %d.\n", ret); + return (worked = false); + } + } + + // start counting + if ((ret = kpc_set_counting(classes))) { + printf("Failed set counting: %d.\n", ret); + return (worked = false); + } + if ((ret = kpc_set_thread_counting(classes))) { + printf("Failed set thread counting: %d.\n", ret); + return (worked = false); + } + + return (worked = true); + } + + inline performance_counters get_counters() { + static bool warned = false; + int ret; + // get counters before + if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) { + if (!warned) { + + printf("Failed get thread counters before: %d.\n", ret); + warned = true; + } + return 1; + } + return performance_counters { + counters_0[counter_map[0]], counters_0[counter_map[2]], + counters_0[counter_map[3]], counters_0[counter_map[1]]}; + } +}; + +#endif diff --git a/benchmark/bench_dom_api.cpp b/benchmark/bench_dom_api.cpp index ec6ea3a39a..29a14dc1f9 100644 --- a/benchmark/bench_dom_api.cpp +++ b/benchmark/bench_dom_api.cpp @@ -1,4 +1,5 @@ #include +#include #include "simdjson.h" #include @@ -8,8 +9,8 @@ using namespace std; const padded_string EMPTY_ARRAY("[]", 2); -const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json"; -const char *NUMBERS_JSON = SIMDJSON_BENCHMARK_DATA_DIR "numbers.json"; +static const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json"; +static const char *NUMBERS_JSON = SIMDJSON_BENCHMARK_DATA_DIR "numbers.json"; static void 
recover_one_string(State& state) { dom::parser parser; @@ -22,11 +23,11 @@ static void recover_one_string(State& state) { return; } dom::element doc; - if (error = parser.parse(docdata).get(doc)) { + if ((error = parser.parse(docdata).get(doc))) { cerr << "could not parse string" << error << endl; return; } - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::string_view v; error = doc.get(v); if (error) { @@ -48,8 +49,7 @@ static void serialize_twitter(State& state) { return; } // we do not want mem. alloc. in the loop. - error = parser.allocate(docdata.size()); - if(error) { + if((error = parser.allocate(docdata.size()))) { cout << error << endl; return; } @@ -59,21 +59,141 @@ static void serialize_twitter(State& state) { return; } size_t bytes = 0; - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::string serial = simdjson::minify(doc); bytes += serial.size(); benchmark::DoNotOptimize(serial); } + // we validate the result + { + auto serial = simdjson::minify(doc); + dom::element doc2; // we parse the minified output + if ((error = parser.parse(serial).get(doc2))) { throw std::runtime_error("serialization error"); } + auto serial2 = simdjson::minify(doc2); // we minify a second time + if(serial != serial2) { throw std::runtime_error("serialization mismatch"); } + } + // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte + state.counters["Gigabytes"] = benchmark::Counter( + double(bytes), benchmark::Counter::kIsRate, + benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024 + state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate); +} +BENCHMARK(serialize_twitter)->ComputeStatistics("max", [](const std::vector& v) -> double { + return *(std::max_element(std::begin(v), std::end(v))); + })->DisplayAggregatesOnly(true); + + +static void serialize_big_string_to_string(State& state) { + dom::parser parser; + std::vector content; + content.push_back('\"'); + for(size_t i = 0 ; i < 
100000; i ++) { + content.push_back('0' + char(i%10)); // we add what looks like a long list of digits + } + content.push_back('\"'); + dom::element doc; + simdjson::error_code error; + if ((error = parser.parse(content.data(), content.size()).get(doc))) { + cerr << "could not parse big string" << error << endl; + return; + } + size_t bytes = 0; + for (simdjson_unused auto _ : state) { + auto serial = simdjson::to_string(doc); + bytes += serial.size(); + benchmark::DoNotOptimize(serial); + } // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte state.counters["Gigabytes"] = benchmark::Counter( double(bytes), benchmark::Counter::kIsRate, benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024 state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate); } -BENCHMARK(serialize_twitter)->Repetitions(10)->ComputeStatistics("max", [](const std::vector& v) -> double { +BENCHMARK(serialize_big_string_to_string)->ComputeStatistics("max", [](const std::vector& v) -> double { return *(std::max_element(std::begin(v), std::end(v))); })->DisplayAggregatesOnly(true); + +static void serialize_twitter_to_string(State& state) { + dom::parser parser; + padded_string docdata; + auto error = padded_string::load(TWITTER_JSON).get(docdata); + if(error) { + cerr << "could not parse twitter.json" << error << endl; + return; + } + // we do not want mem. alloc. in the loop. 
+ if((error = parser.allocate(docdata.size()))) { + cout << error << endl; + return; + } + dom::element doc; + if ((error = parser.parse(docdata).get(doc))) { + cerr << "could not parse twitter.json" << error << endl; + return; + } + size_t bytes = 0; + for (simdjson_unused auto _ : state) { + auto serial = simdjson::to_string(doc); + bytes += serial.size(); + benchmark::DoNotOptimize(serial); + } + // we validate the result + { + auto serial = simdjson::to_string(doc); + dom::element doc2; // we parse the stringify output + if ((error = parser.parse(serial).get(doc2))) { throw std::runtime_error("serialization error"); } + auto serial2 = simdjson::to_string(doc2); // we stringify again + if(serial != serial2) { throw std::runtime_error("serialization mismatch"); } + } + // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte + state.counters["Gigabytes"] = benchmark::Counter( + double(bytes), benchmark::Counter::kIsRate, + benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024 + state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate); +} +BENCHMARK(serialize_twitter_to_string)->ComputeStatistics("max", [](const std::vector& v) -> double { + return *(std::max_element(std::begin(v), std::end(v))); + })->DisplayAggregatesOnly(true); + +static void serialize_twitter_string_builder(State& state) { + dom::parser parser; + padded_string docdata; + auto error = padded_string::load(TWITTER_JSON).get(docdata); + if(error) { + cerr << "could not parse twitter.json" << error << endl; + return; + } + // we do not want mem. alloc. in the loop. 
+ if((error = parser.allocate(docdata.size()))) { + cout << error << endl; + return; + } + dom::element doc; + if ((error = parser.parse(docdata).get(doc))) { + cerr << "could not parse twitter.json" << error << endl; + return; + } + size_t bytes = 0; + simdjson::internal::string_builder<> sb;// not part of our public API, for internal use + for (simdjson_unused auto _ : state) { + sb.clear(); + sb.append(doc); + std::string_view serial = sb.str(); + bytes += serial.size(); + benchmark::DoNotOptimize(serial); + } + // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte + state.counters["Gigabytes"] = benchmark::Counter( + double(bytes), benchmark::Counter::kIsRate, + benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024 + state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate); +} +BENCHMARK(serialize_twitter_string_builder)->ComputeStatistics("max", [](const std::vector& v) -> double { + return *(std::max_element(std::begin(v), std::end(v))); + })->DisplayAggregatesOnly(true); + + static void numbers_scan(State& state) { // Prints the number of results in twitter.json dom::parser parser; @@ -83,7 +203,7 @@ static void numbers_scan(State& state) { cerr << "could not read " << NUMBERS_JSON << " as an array: " << error << endl; return; } - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::vector container; for (auto e : arr) { double x; @@ -92,7 +212,7 @@ static void numbers_scan(State& state) { } benchmark::DoNotOptimize(container.data()); benchmark::ClobberMemory(); - } + } } BENCHMARK(numbers_scan); @@ -105,7 +225,7 @@ static void numbers_size_scan(State& state) { cerr << "could not read " << NUMBERS_JSON << " as an array: " << error << endl; return; } - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::vector container; container.resize(arr.size()); size_t pos = 0; @@ -117,7 +237,7 @@ static void numbers_size_scan(State& state) { if(pos != container.size()) { cerr << "bad count" 
<< endl; } benchmark::DoNotOptimize(container.data()); benchmark::ClobberMemory(); - } + } } BENCHMARK(numbers_size_scan); @@ -131,7 +251,7 @@ static void numbers_type_scan(State& state) { cerr << "could not read " << NUMBERS_JSON << " as an array" << endl; return; } - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::vector container; for (auto e : arr) { dom::element_type actual_type = e.type(); @@ -157,7 +277,7 @@ static void numbers_type_size_scan(State& state) { cerr << "could not read " << NUMBERS_JSON << " as an array: " << error << endl; return; } - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::vector container; container.resize(arr.size()); size_t pos = 0; @@ -182,7 +302,7 @@ static void numbers_load_scan(State& state) { dom::parser parser; dom::array arr; simdjson::error_code error; - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { // this may hit the disk, but probably just once if ((error = parser.load(NUMBERS_JSON).get(arr))) { cerr << "could not read " << NUMBERS_JSON << " as an array: " << error << endl; @@ -196,7 +316,7 @@ static void numbers_load_scan(State& state) { } benchmark::DoNotOptimize(container.data()); benchmark::ClobberMemory(); - } + } } BENCHMARK(numbers_load_scan); @@ -205,7 +325,7 @@ static void numbers_load_size_scan(State& state) { dom::parser parser; dom::array arr; simdjson::error_code error; - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { // this may hit the disk, but probably just once if ((error = parser.load(NUMBERS_JSON).get(arr))) { cerr << "could not read " << NUMBERS_JSON << " as an array" << endl; @@ -222,7 +342,7 @@ static void numbers_load_size_scan(State& state) { if(pos != container.size()) { cerr << "bad count" << endl; } benchmark::DoNotOptimize(container.data()); benchmark::ClobberMemory(); - } + } } BENCHMARK(numbers_load_size_scan); @@ -234,14 +354,14 @@ static void numbers_exceptions_scan(State& state) { // 
Prints the number of results in twitter.json dom::parser parser; dom::array arr = parser.load(NUMBERS_JSON); - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::vector container; for (double x : arr) { container.push_back(x); } benchmark::DoNotOptimize(container.data()); benchmark::ClobberMemory(); - } + } } BENCHMARK(numbers_exceptions_scan); @@ -249,7 +369,7 @@ static void numbers_exceptions_size_scan(State& state) { // Prints the number of results in twitter.json dom::parser parser; dom::array arr = parser.load(NUMBERS_JSON); - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::vector container; container.resize(arr.size()); size_t pos = 0; @@ -259,7 +379,7 @@ static void numbers_exceptions_size_scan(State& state) { if(pos != container.size()) { cerr << "bad count" << endl; } benchmark::DoNotOptimize(container.data()); benchmark::ClobberMemory(); - } + } } BENCHMARK(numbers_exceptions_size_scan); @@ -269,7 +389,7 @@ static void numbers_type_exceptions_scan(State& state) { // Prints the number of results in twitter.json dom::parser parser; dom::array arr = parser.load(NUMBERS_JSON); - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::vector container; for (auto e : arr) { dom::element_type actual_type = e.type(); @@ -288,7 +408,7 @@ static void numbers_type_exceptions_size_scan(State& state) { // Prints the number of results in twitter.json dom::parser parser; dom::array arr = parser.load(NUMBERS_JSON); - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { std::vector container; container.resize(arr.size()); size_t pos = 0; @@ -309,7 +429,7 @@ BENCHMARK(numbers_type_exceptions_size_scan); static void numbers_exceptions_load_scan(State& state) { // Prints the number of results in twitter.json dom::parser parser; - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { // this may hit the disk, but probably just once dom::array arr = 
parser.load(NUMBERS_JSON); std::vector container; @@ -318,14 +438,14 @@ static void numbers_exceptions_load_scan(State& state) { } benchmark::DoNotOptimize(container.data()); benchmark::ClobberMemory(); - } + } } BENCHMARK(numbers_exceptions_load_scan); static void numbers_exceptions_load_size_scan(State& state) { // Prints the number of results in twitter.json dom::parser parser; - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { // this may hit the disk, but probably just once dom::array arr = parser.load(NUMBERS_JSON); std::vector container; @@ -337,7 +457,7 @@ static void numbers_exceptions_load_size_scan(State& state) { if(pos != container.size()) { cerr << "bad count" << endl; } benchmark::DoNotOptimize(container.data()); benchmark::ClobberMemory(); - } + } } BENCHMARK(numbers_exceptions_load_size_scan); @@ -346,43 +466,23 @@ static void twitter_count(State& state) { // Prints the number of results in twitter.json dom::parser parser; dom::element doc = parser.load(TWITTER_JSON); - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { uint64_t result_count = doc["search_metadata"]["count"]; if (result_count != 100) { return; } } } BENCHMARK(twitter_count); -SIMDJSON_PUSH_DISABLE_WARNINGS -SIMDJSON_DISABLE_DEPRECATED_WARNING -static void iterator_twitter_count(State& state) { - // Prints the number of results in twitter.json - padded_string json = padded_string::load(TWITTER_JSON); - ParsedJson pj = build_parsed_json(json); - for (UNUSED auto _ : state) { - ParsedJson::Iterator iter(pj); - // uint64_t result_count = doc["search_metadata"]["count"]; - if (!iter.move_to_key("search_metadata")) { return; } - if (!iter.move_to_key("count")) { return; } - if (!iter.is_integer()) { return; } - int64_t result_count = iter.get_integer(); - - if (result_count != 100) { return; } - } -} -BENCHMARK(iterator_twitter_count); -SIMDJSON_POP_DISABLE_WARNINGS - static void twitter_default_profile(State& state) { // Count unique users with 
a default profile. dom::parser parser; dom::element doc = parser.load(TWITTER_JSON); - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { set default_users; for (dom::object tweet : doc["statuses"]) { dom::object user = tweet["user"]; if (user["default_profile"]) { - default_users.insert(user["screen_name"]); + default_users.emplace(user["screen_name"]); } } if (default_users.size() != 86) { return; } @@ -395,14 +495,14 @@ static void twitter_image_sizes(State& state) { dom::parser parser; dom::element doc = parser.load(TWITTER_JSON); simdjson::error_code error; - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { set> image_sizes; for (dom::object tweet : doc["statuses"]) { dom::array media; - if (not (error = tweet["entities"]["media"].get(media))) { + if (! (error = tweet["entities"]["media"].get(media))) { for (dom::object image : media) { - for (auto size : image["sizes"].get()) { - image_sizes.insert({ size.value["w"], size.value["h"] }); + for (auto size : image["sizes"].get_object()) { + image_sizes.emplace(size.value["w"], size.value["h"]); } } } @@ -420,7 +520,7 @@ static void error_code_twitter_count(State& state) noexcept { simdjson::error_code error; dom::element doc; if ((error = parser.load(TWITTER_JSON).get(doc))) { return; } - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { uint64_t value; if ((error = doc["search_metadata"]["count"].get(value))) { return; } if (value != 100) { return; } @@ -434,7 +534,7 @@ static void error_code_twitter_default_profile(State& state) noexcept { simdjson::error_code error; dom::element doc; if ((error = parser.load(TWITTER_JSON).get(doc))) { std::cerr << error << std::endl; return; } - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { set default_users; dom::array tweets; @@ -442,7 +542,7 @@ static void error_code_twitter_default_profile(State& state) noexcept { for (dom::element tweet : tweets) { dom::object user; if ((error = 
tweet["user"].get(user))) { return; } - bool default_profile; + bool default_profile{}; if ((error = user["default_profile"].get(default_profile))) { return; } if (default_profile) { std::string_view screen_name; @@ -456,62 +556,19 @@ static void error_code_twitter_default_profile(State& state) noexcept { } BENCHMARK(error_code_twitter_default_profile); -SIMDJSON_PUSH_DISABLE_WARNINGS -SIMDJSON_DISABLE_DEPRECATED_WARNING -static void iterator_twitter_default_profile(State& state) { - // Count unique users with a default profile. - padded_string json; - auto error = padded_string::load(TWITTER_JSON).get(json); - if (error) { std::cerr << error << std::endl; return; } - ParsedJson pj = build_parsed_json(json); - for (UNUSED auto _ : state) { - set default_users; - ParsedJson::Iterator iter(pj); - - // for (dom::object tweet : doc["statuses"]) { - if (!(iter.move_to_key("statuses") && iter.is_array())) { return; } - if (iter.down()) { // first status - do { - - // dom::object user = tweet["user"]; - if (!(iter.move_to_key("user") && iter.is_object())) { return; } - - // if (user["default_profile"]) { - if (iter.move_to_key("default_profile")) { - if (iter.is_true()) { - if (!iter.up()) { return; } // back to user - - // default_users.insert(user["screen_name"]); - if (!(iter.move_to_key("screen_name") && iter.is_string())) { return; } - default_users.insert(string_view(iter.get_string(), iter.get_string_length())); - } - if (!iter.up()) { return; } // back to user - } - - if (!iter.up()) { return; } // back to status - - } while (iter.next()); // next status - } - - if (default_users.size() != 86) { return; } - } -} -SIMDJSON_POP_DISABLE_WARNINGS -BENCHMARK(iterator_twitter_default_profile); - static void error_code_twitter_image_sizes(State& state) noexcept { // Count unique image sizes dom::parser parser; simdjson::error_code error; dom::element doc; if ((error = parser.load(TWITTER_JSON).get(doc))) { std::cerr << error << std::endl; return; } - for (UNUSED auto _ : 
state) { + for (simdjson_unused auto _ : state) { set> image_sizes; dom::array statuses; if ((error = doc["statuses"].get(statuses))) { return; } for (dom::element tweet : statuses) { dom::array images; - if (not (error = tweet["entities"]["media"].get(images))) { + if (! (error = tweet["entities"]["media"].get(images))) { for (dom::element image : images) { dom::object sizes; if ((error = image["sizes"].get(sizes))) { return; } @@ -519,7 +576,7 @@ static void error_code_twitter_image_sizes(State& state) noexcept { uint64_t width, height; if ((error = size.value["w"].get(width))) { return; } if ((error = size.value["h"].get(height))) { return; } - image_sizes.insert({ width, height }); + image_sizes.emplace(width, height); } } } @@ -529,85 +586,28 @@ static void error_code_twitter_image_sizes(State& state) noexcept { } BENCHMARK(error_code_twitter_image_sizes); -SIMDJSON_PUSH_DISABLE_WARNINGS -SIMDJSON_DISABLE_DEPRECATED_WARNING -static void iterator_twitter_image_sizes(State& state) { - // Count unique image sizes - padded_string json; - auto error = padded_string::load(TWITTER_JSON).get(json); - if (error) { std::cerr << error << std::endl; return; } - ParsedJson pj = build_parsed_json(json); - for (UNUSED auto _ : state) { - set> image_sizes; - ParsedJson::Iterator iter(pj); - - // for (dom::object tweet : doc["statuses"]) { - if (!(iter.move_to_key("statuses") && iter.is_array())) { return; } - if (iter.down()) { // first status - do { - - // dom::object media; - // not_found = tweet["entities"]["media"].get(media); - // if (!not_found) { - if (iter.move_to_key("entities")) { - if (!iter.is_object()) { return; } - if (iter.move_to_key("media")) { - if (!iter.is_array()) { return; } - - // for (dom::object image : media) { - if (iter.down()) { // first media - do { - - // for (auto [key, size] : dom::object(image["sizes"])) { - if (!(iter.move_to_key("sizes") && iter.is_object())) { return; } - if (iter.down()) { // first size - do { - iter.move_to_value(); - - 
// image_sizes.insert({ size["w"], size["h"] }); - if (!(iter.move_to_key("w")) && !iter.is_integer()) { return; } - uint64_t width = iter.get_integer(); - if (!iter.up()) { return; } // back to size - if (!(iter.move_to_key("h")) && !iter.is_integer()) { return; } - uint64_t height = iter.get_integer(); - if (!iter.up()) { return; } // back to size - image_sizes.insert({ width, height }); - - } while (iter.next()); // next size - if (!iter.up()) { return; } // back to sizes - } - if (!iter.up()) { return; } // back to image - } while (iter.next()); // next image - if (!iter.up()) { return; } // back to media - } - if (!iter.up()) { return; } // back to entities - } - if (!iter.up()) { return; } // back to status - } - } while (iter.next()); // next status - } - - if (image_sizes.size() != 15) { return; }; - } -} -BENCHMARK(iterator_twitter_image_sizes); - -static void print_json(State& state) noexcept { - // Prints the number of results in twitter.json +static void parse_surrogate_pairs(State& state) { + // NOTE: This mostly exists to show there's a tiny benefit to + // loading and comparing both bytes of "\\u" simultaneously. + // (which should also reduce the compiled code size). + // The repeated surrogate pairs make this easier to measure. dom::parser parser; - - padded_string json; - auto error = padded_string::load(TWITTER_JSON).get(json); - if (error) { std::cerr << error << std::endl; return; } - - int code = json_parse(json, parser); - if (code) { cerr << error_message(code) << endl; return; } - for (UNUSED auto _ : state) { - std::stringstream s; - if (!parser.print_json(s)) { cerr << "print_json failed" << endl; return; } + const std::string_view data = "\"\\uD834\\uDD1E\\uD834\\uDD1E\\uD834\\uDD1E\\uD834\\uDD1E\\uD834\\uDD1E\\uD834\\uDD1E\\uD834\\uDD1E\\uD834\\uDD1E\\uD834\\uDD1E\\uD834\\uDD1E\""; + padded_string docdata{data}; + // we do not want mem. alloc. in the loop. 
+ auto error = parser.allocate(docdata.size()); + if (error) { + cout << error << endl; + return; + } + for (simdjson_unused auto _ : state) { + dom::element doc; + if ((error = parser.parse(docdata).get(doc))) { + cerr << "could not parse string" << error << endl; + return; + } } } -BENCHMARK(print_json); -SIMDJSON_POP_DISABLE_WARNINGS +BENCHMARK(parse_surrogate_pairs); -BENCHMARK_MAIN(); \ No newline at end of file +BENCHMARK_MAIN(); diff --git a/benchmark/bench_ondemand.cpp b/benchmark/bench_ondemand.cpp new file mode 100644 index 0000000000..74bc8db855 --- /dev/null +++ b/benchmark/bench_ondemand.cpp @@ -0,0 +1,151 @@ +#include "simdjson.h" + +SIMDJSON_PUSH_DISABLE_ALL_WARNINGS + +#ifdef SIMDJSON_COMPETITION_YYJSON +#include "yyjson.h" +#endif + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON +#include "rapidjson/document.h" +#include "rapidjson/reader.h" +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" +#endif + +#ifdef SIMDJSON_COMPETITION_SAJSON +#include "sajson.h" +#endif + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON +#include +#endif + +#ifdef SIMDJSON_COMPETITION_BOOSTJSON +#include +#endif + +// This has to be last, for reasons I don't yet understand +#include + +SIMDJSON_POP_DISABLE_WARNINGS +#include "json2msgpack/simdjson_ondemand.h" +#include "json2msgpack/simdjson_dom.h" +#include "json2msgpack/yyjson.h" +#include "json2msgpack/rapidjson.h" +#if SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "json2msgpack/sajson.h" +#endif // SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "json2msgpack/nlohmann_json.h" +#include "json2msgpack/boostjson.h" + +#include "partial_tweets/simdjson_ondemand.h" +#include "partial_tweets/simdjson_dom.h" +#include "partial_tweets/yyjson.h" +#if SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "partial_tweets/sajson.h" +#endif // SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "partial_tweets/rapidjson.h" +#if SIMDJSON_COMPETITION_SAX +#include "partial_tweets/rapidjson_sax.h" +#endif // SIMDJSON_COMPETITION_SAX 
+#include "partial_tweets/nlohmann_json.h" +#if SIMDJSON_COMPETITION_SAX +#include "partial_tweets/nlohmann_json_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "partial_tweets/boostjson.h" + + +#include "distinct_user_id/simdjson_ondemand.h" +#include "distinct_user_id/simdjson_ondemand_json_pointer.h" +#include "distinct_user_id/simdjson_dom.h" +#include "distinct_user_id/simdjson_dom_json_pointer.h" +#include "distinct_user_id/yyjson.h" +#if SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "distinct_user_id/sajson.h" +#endif // SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "distinct_user_id/rapidjson.h" +#if SIMDJSON_COMPETITION_SAX +#include "distinct_user_id/rapidjson_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "distinct_user_id/nlohmann_json.h" +#if SIMDJSON_COMPETITION_SAX +#include "distinct_user_id/nlohmann_json_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "distinct_user_id/boostjson.h" + +#include "find_tweet/simdjson_ondemand.h" +#include "find_tweet/simdjson_dom.h" +#include "find_tweet/yyjson.h" +#if SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "find_tweet/sajson.h" +#endif // SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "find_tweet/rapidjson.h" +#if SIMDJSON_COMPETITION_SAX +#include "find_tweet/rapidjson_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "find_tweet/nlohmann_json.h" +#if SIMDJSON_COMPETITION_SAX +#include "find_tweet/nlohmann_json_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "find_tweet/boostjson.h" + +#include "top_tweet/simdjson_ondemand.h" +#include "top_tweet/simdjson_dom.h" +#include "top_tweet/yyjson.h" +#if SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "top_tweet/sajson.h" +#endif // SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "top_tweet/rapidjson.h" +#if SIMDJSON_COMPETITION_SAX +#include "top_tweet/rapidjson_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "top_tweet/nlohmann_json.h" +#if SIMDJSON_COMPETITION_SAX +#include "top_tweet/nlohmann_json_sax.h" +#endif // 
SIMDJSON_COMPETITION_SAX +#include "top_tweet/boostjson.h" + + +#include "kostya/simdjson_ondemand.h" +#include "kostya/simdjson_dom.h" +#include "kostya/yyjson.h" +#if SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "kostya/sajson.h" +#endif // SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "kostya/rapidjson.h" +#if SIMDJSON_COMPETITION_SAX +#include "kostya/rapidjson_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "kostya/nlohmann_json.h" +#if SIMDJSON_COMPETITION_SAX +#include "kostya/nlohmann_json_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "kostya/boostjson.h" + +#include "large_random/simdjson_ondemand.h" +#if SIMDJSON_COMPETITION_ONDEMAND_UNORDERED +#include "large_random/simdjson_ondemand_unordered.h" +#endif // SIMDJSON_COMPETITION_ONDEMAND_UNORDERED +#include "large_random/simdjson_dom.h" +#include "large_random/yyjson.h" +#if SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "large_random/sajson.h" +#endif // SIMDJSON_COMPETITION_ONDEMAND_SAJSON +#include "large_random/rapidjson.h" +#if SIMDJSON_COMPETITION_SAX +#include "large_random/rapidjson_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "large_random/nlohmann_json.h" +#if SIMDJSON_COMPETITION_SAX +#include "large_random/nlohmann_json_sax.h" +#endif // SIMDJSON_COMPETITION_SAX +#include "large_random/boostjson.h" + +#include "amazon_cellphones/simdjson_dom.h" +#include "amazon_cellphones/simdjson_ondemand.h" + +#include "large_amazon_cellphones/simdjson_dom.h" +#include "large_amazon_cellphones/simdjson_ondemand.h" + +BENCHMARK_MAIN(); diff --git a/benchmark/bench_parse_call.cpp b/benchmark/bench_parse_call.cpp index 3b923cdb5d..1da636dbdc 100644 --- a/benchmark/bench_parse_call.cpp +++ b/benchmark/bench_parse_call.cpp @@ -1,4 +1,5 @@ #include +#include #include "simdjson.h" using namespace simdjson; using namespace benchmark; @@ -10,6 +11,64 @@ const char *GSOC_JSON = SIMDJSON_BENCHMARK_DATA_DIR "gsoc-2018.json"; + +static void fast_minify_twitter(State& state) { + dom::parser parser; 
+ padded_string docdata; + auto error = padded_string::load(TWITTER_JSON).get(docdata); + if(error) { + cerr << "could not parse twitter.json" << error << endl; + return; + } + std::unique_ptr buffer{new char[docdata.size()]}; + + size_t bytes = 0; + for (simdjson_unused auto _ : state) { + size_t new_length{}; // It will receive the minified length. + auto error = simdjson::minify(docdata.data(), docdata.size(), buffer.get(), new_length); + bytes += docdata.size(); + benchmark::DoNotOptimize(error); + } + // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte + state.counters["Gigabytes"] = benchmark::Counter( + double(bytes), benchmark::Counter::kIsRate, + benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024 + state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate); +} +BENCHMARK(fast_minify_twitter)->Repetitions(10)->ComputeStatistics("max", [](const std::vector& v) -> double { + return *(std::max_element(std::begin(v), std::end(v))); + })->DisplayAggregatesOnly(true); + + + + +static void fast_minify_gsoc(State& state) { + dom::parser parser; + padded_string docdata; + auto error = padded_string::load(GSOC_JSON).get(docdata); + if(error) { + cerr << "could not parse gsoc-2018.json" << error << endl; + return; + } + std::unique_ptr buffer{new char[docdata.size()]}; + + size_t bytes = 0; + for (simdjson_unused auto _ : state) { + size_t new_length{}; // It will receive the minified length. 
+ auto error = simdjson::minify(docdata.data(), docdata.size(), buffer.get(), new_length); + bytes += docdata.size(); + benchmark::DoNotOptimize(error); + } + // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte + state.counters["Gigabytes"] = benchmark::Counter( + double(bytes), benchmark::Counter::kIsRate, + benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024 + state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate); +} +BENCHMARK(fast_minify_gsoc)->Repetitions(10)->ComputeStatistics("max", [](const std::vector& v) -> double { + return *(std::max_element(std::begin(v), std::end(v))); + })->DisplayAggregatesOnly(true); + static void unicode_validate_twitter(State& state) { dom::parser parser; padded_string docdata; @@ -25,7 +84,7 @@ static void unicode_validate_twitter(State& state) { return; } size_t bytes = 0; - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { bool is_ok = simdjson::validate_utf8(docdata.data(), docdata.size()); bytes += docdata.size(); benchmark::DoNotOptimize(is_ok); @@ -55,7 +114,7 @@ static void parse_twitter(State& state) { return; } size_t bytes = 0; - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { dom::element doc; bytes += docdata.size(); if ((error = parser.parse(docdata).get(doc))) { @@ -90,7 +149,7 @@ static void parse_gsoc(State& state) { return; } size_t bytes = 0; - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { bytes += docdata.size(); dom::element doc; if ((error = parser.parse(docdata).get(doc))) { @@ -110,23 +169,10 @@ BENCHMARK(parse_gsoc)->Repetitions(10)->ComputeStatistics("max", [](const std::v })->DisplayAggregatesOnly(true); - -SIMDJSON_PUSH_DISABLE_WARNINGS -SIMDJSON_DISABLE_DEPRECATED_WARNING -static void json_parse(State& state) { - ParsedJson pj; - if (!pj.allocate_capacity(EMPTY_ARRAY.length())) { return; } - for (UNUSED auto _ : state) { - auto error = json_parse(EMPTY_ARRAY, pj); - if (error) { return; } - } 
-} -SIMDJSON_POP_DISABLE_WARNINGS -BENCHMARK(json_parse); static void parser_parse_error_code(State& state) { dom::parser parser; if (parser.allocate(EMPTY_ARRAY.length())) { return; } - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { auto error = parser.parse(EMPTY_ARRAY).error(); if (error) { return; } } @@ -138,9 +184,9 @@ BENCHMARK(parser_parse_error_code); static void parser_parse_exception(State& state) { dom::parser parser; if (parser.allocate(EMPTY_ARRAY.length())) { return; } - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { try { - UNUSED dom::element doc = parser.parse(EMPTY_ARRAY); + simdjson_unused dom::element doc = parser.parse(EMPTY_ARRAY); } catch(simdjson_error &j) { cout << j.what() << endl; return; @@ -151,19 +197,8 @@ BENCHMARK(parser_parse_exception); #endif // SIMDJSON_EXCEPTIONS -SIMDJSON_PUSH_DISABLE_WARNINGS -SIMDJSON_DISABLE_DEPRECATED_WARNING -static void build_parsed_json(State& state) { - for (UNUSED auto _ : state) { - dom::parser parser = simdjson::build_parsed_json(EMPTY_ARRAY); - if (!parser.valid) { return; } - } -} -SIMDJSON_POP_DISABLE_WARNINGS - -BENCHMARK(build_parsed_json); static void document_parse_error_code(State& state) { - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { dom::parser parser; auto error = parser.parse(EMPTY_ARRAY).error(); if (error) { return; } @@ -174,10 +209,10 @@ BENCHMARK(document_parse_error_code); #if SIMDJSON_EXCEPTIONS static void document_parse_exception(State& state) { - for (UNUSED auto _ : state) { + for (simdjson_unused auto _ : state) { try { dom::parser parser; - UNUSED dom::element doc = parser.parse(EMPTY_ARRAY); + simdjson_unused dom::element doc = parser.parse(EMPTY_ARRAY); } catch(simdjson_error &j) { cout << j.what() << endl; return; diff --git a/benchmark/benchfeatures.cpp b/benchmark/benchfeatures.cpp index 8900e9d0bd..fd042f85d3 100644 --- a/benchmark/benchfeatures.cpp +++ b/benchmark/benchfeatures.cpp @@ -63,8 
+63,8 @@ void print_usage(ostream& out) { out << "-s STAGE - Stop after the given stage." << endl; out << " -s stage1 - Stop after find_structural_bits." << endl; out << " -s all - Run all stages." << endl; - out << "-a ARCH - Use the parser with the designated architecture (HASWELL, WESTMERE" << endl; - out << " or ARM64). By default, detects best supported architecture." << endl; + out << "-a ARCH - Use the parser with the designated architecture (HASWELL, WESTMERE," << endl; + out << " PPC64 or ARM64). By default, detects best supported architecture." << endl; } void exit_usage(string message) { @@ -96,8 +96,14 @@ struct option_struct { case 'v': verbose = true; break; - case 'a': - simdjson::active_implementation = simdjson::available_implementations[optarg]; + case 'a': { + auto impl = simdjson::get_available_implementations()[optarg]; + if(impl && impl->supported_by_runtime_system()) { + simdjson::get_active_implementation() = impl; + } else { + std::cerr << "implementation " << optarg << " not found or not supported " << std::endl; + } + } break; case 's': if (!strcmp(optarg, "stage1")) { @@ -158,7 +164,7 @@ struct feature_benchmarker { } - really_inline void run_iterations(size_t iterations, bool stage1_only=false) { + simdjson_inline void run_iterations(size_t iterations, bool stage1_only=false) { struct7.run_iterations(iterations, stage1_only); struct7_miss.run_iterations(iterations, stage1_only); struct7_full.run_iterations(iterations, stage1_only); @@ -198,10 +204,13 @@ struct feature_benchmarker { } // Rate of 1-7-structural misses per 8-structural flip double struct1_7_miss_rate(BenchmarkStage stage) const { +#if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + return 1; +#else if (!has_events()) { return 1; } return struct7_miss[stage].best.branch_misses() - struct7[stage].best.branch_misses() / double(struct7_miss.stats->blocks_with_1_structural_flipped); +#endif } - // Extra cost of an 8-15 structural block over a 1-7 structural block double 
struct8_15_cost(BenchmarkStage stage) const { return cost_per_block(stage, struct15, struct15.stats->blocks_with_8_structurals, struct7); @@ -212,8 +221,12 @@ struct feature_benchmarker { } // Rate of 8-15-structural misses per 8-structural flip double struct8_15_miss_rate(BenchmarkStage stage) const { +#if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + return 1; +#else if (!has_events()) { return 1; } return double(struct15_miss[stage].best.branch_misses() - struct15[stage].best.branch_misses()) / double(struct15_miss.stats->blocks_with_8_structurals_flipped); +#endif } // Extra cost of a 16+-structural block over an 8-15 structural block (actual varies based on # of structurals!) @@ -226,10 +239,15 @@ struct feature_benchmarker { } // Rate of 16-structural misses per 16-structural flip double struct16_miss_rate(BenchmarkStage stage) const { +#if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + return 1; +#else if (!has_events()) { return 1; } return double(struct23_miss[stage].best.branch_misses() - struct23[stage].best.branch_misses()) / double(struct23_miss.stats->blocks_with_16_structurals_flipped); +#endif } + // Extra cost of having UTF-8 in a block double utf8_cost(BenchmarkStage stage) const { return cost_per_block(stage, utf8, utf8.stats->blocks_with_utf8, struct7_full); @@ -240,10 +258,13 @@ struct feature_benchmarker { } // Rate of UTF-8 misses per UTF-8 flip double utf8_miss_rate(BenchmarkStage stage) const { +#if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + return 1; +#else if (!has_events()) { return 1; } return double(utf8_miss[stage].best.branch_misses() - utf8[stage].best.branch_misses()) / double(utf8_miss.stats->blocks_with_utf8_flipped); +#endif } - // Extra cost of having escapes in a block double escape_cost(BenchmarkStage stage) const { return cost_per_block(stage, escape, escape.stats->blocks_with_escapes, struct7_full); @@ -254,10 +275,15 @@ struct feature_benchmarker { } // Rate of escape misses per escape flip double escape_miss_rate(BenchmarkStage stage) const 
{ +#if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + return 1; +#else if (!has_events()) { return 1; } return double(escape_miss[stage].best.branch_misses() - escape[stage].best.branch_misses()) / double(escape_miss.stats->blocks_with_escapes_flipped); +#endif } + double calc_expected_feature_cost(BenchmarkStage stage, const benchmarker& file) const { // Expected base ns/block (empty) json_stats& stats = *file.stats; @@ -294,7 +320,6 @@ struct feature_benchmarker { double calc_expected(BenchmarkStage stage, const benchmarker& file) const { return calc_expected_feature_cost(stage, file) + calc_expected_miss_cost(stage, file); } - void print(const option_struct& options) const { printf("\n"); printf("Features in ns/block (64 bytes):\n"); @@ -353,6 +378,22 @@ struct feature_benchmarker { } }; +#if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS +void print_file_effectiveness(BenchmarkStage stage, const char* filename, const benchmarker& results, const feature_benchmarker& features) { + double actual = results[stage].best.elapsed_ns() / double(results.stats->blocks); + double calc = features.calc_expected(stage, results); + double calc_misses = features.calc_expected_misses(stage, results); + double calc_miss_cost = features.calc_expected_miss_cost(stage, results); + printf(" | %-8s ", benchmark_stage_name(stage)); + printf("| %-15s ", filename); + printf("| %8.3g ", features.calc_expected_feature_cost(stage, results)); + printf("| %8.3g ", calc_miss_cost); + printf("| %8.3g ", calc); + printf("| %8.3g ", actual); + printf("| %+8.3g ", actual - calc); + printf("| %13llu ", (long long unsigned)(calc_misses)); +} +#else void print_file_effectiveness(BenchmarkStage stage, const char* filename, const benchmarker& results, const feature_benchmarker& features) { double actual = results[stage].best.elapsed_ns() / double(results.stats->blocks); double calc = features.calc_expected(stage, results); @@ -376,6 +417,7 @@ void print_file_effectiveness(BenchmarkStage stage, const char* filename, 
const } printf("|\n"); } +#endif int main(int argc, char *argv[]) { // Read options diff --git a/benchmark/benchmark.h b/benchmark/benchmark.h index 836f8507d1..cdcf7331fa 100644 --- a/benchmark/benchmark.h +++ b/benchmark/benchmark.h @@ -12,9 +12,9 @@ #define BEST_TIME(name, test, expected, pre, repeat, size, verbose) \ do { \ if (verbose) \ - printf("%-40s\t: ", name); \ + std::printf("%-40s\t: ", name); \ else \ - printf("\"%-40s\"", name); \ + std::printf("\"%-40s\"", name); \ fflush(NULL); \ event_collector collector; \ event_aggregate aggregate{}; \ @@ -23,7 +23,8 @@ std::atomic_thread_fence(std::memory_order_acquire); \ collector.start(); \ if (test != expected) { \ - fprintf(stderr, "not expected (%d , %d )", (int)test, (int)expected); \ + std::fprintf(stderr, "not expected (%d , %d )", (int)test, \ + (int)expected); \ break; \ } \ std::atomic_thread_fence(std::memory_order_release); \ @@ -31,39 +32,41 @@ aggregate << allocate_count; \ } \ if (collector.has_events()) { \ - printf("%7.3f", aggregate.best.cycles() / static_cast(size)); \ + std::printf("%7.3f", \ + aggregate.best.cycles() / static_cast(size)); \ if (verbose) { \ - printf(" cycles/byte "); \ + std::printf(" cycles/byte "); \ } \ - printf("\t"); \ - printf("%7.3f", \ - aggregate.best.instructions() / static_cast(size)); \ + std::printf("\t"); \ + std::printf("%7.3f", \ + aggregate.best.instructions() / static_cast(size)); \ if (verbose) { \ - printf(" instructions/byte "); \ + std::printf(" instructions/byte "); \ } \ - printf("\t"); \ + std::printf("\t"); \ } \ double gb = static_cast(size) / 1000000000.0; \ - printf("%7.3f", gb / aggregate.best.elapsed_sec()); \ + std::printf("%7.3f", gb / aggregate.best.elapsed_sec()); \ if (verbose) { \ - printf(" GB/s "); \ + std::printf(" GB/s "); \ } \ - printf("%7.3f", 1.0 / aggregate.best.elapsed_sec()); \ + std::printf("\t"); \ + std::printf("%7.3f", 1.0 / aggregate.best.elapsed_sec()); \ if (verbose) { \ - printf(" documents/s "); \ + std::printf(" 
documents/s "); \ } \ - printf("\n"); \ - fflush(NULL); \ + std::printf("\n"); \ + std::fflush(NULL); \ } while (0) // like BEST_TIME, but no check #define BEST_TIME_NOCHECK(name, test, pre, repeat, size, verbose) \ do { \ if (verbose) \ - printf("%-40s\t: ", name); \ + std::printf("%-40s\t: ", name); \ else \ - printf("\"%-40s\"", name); \ - fflush(NULL); \ + std::printf("\"%-40s\"", name); \ + std::fflush(NULL); \ event_collector collector; \ event_aggregate aggregate{}; \ for (decltype(repeat) i = 0; i < repeat; i++) { \ @@ -76,29 +79,31 @@ aggregate << allocate_count; \ } \ if (collector.has_events()) { \ - printf("%7.3f", aggregate.best.cycles() / static_cast(size)); \ + std::printf("%7.3f", \ + aggregate.best.cycles() / static_cast(size)); \ if (verbose) { \ - printf(" cycles/byte "); \ + std::printf(" cycles/byte "); \ } \ - printf("\t"); \ - printf("%7.3f", \ - aggregate.best.instructions() / static_cast(size)); \ + std::printf("\t"); \ + std::printf("%7.3f", \ + aggregate.best.instructions() / static_cast(size)); \ if (verbose) { \ - printf(" instructions/byte "); \ + std::printf(" instructions/byte "); \ } \ - printf("\t"); \ + std::printf("\t"); \ } \ double gb = static_cast(size) / 1000000000.0; \ - printf("%7.3f", gb / aggregate.best.elapsed_sec()); \ + std::printf("%7.3f", gb / aggregate.best.elapsed_sec()); \ if (verbose) { \ - printf(" GB/s "); \ + std::printf(" GB/s "); \ } \ - printf("%7.3f", 1.0 / aggregate.best.elapsed_sec()); \ + std::printf("\t"); \ + std::printf("%7.3f", 1.0 / aggregate.best.elapsed_sec()); \ if (verbose) { \ - printf(" documents/s "); \ + std::printf(" documents/s "); \ } \ - printf("\n"); \ - fflush(NULL); \ + std::printf("\n"); \ + std::fflush(NULL); \ } while (0) #endif diff --git a/benchmark/benchmarker.h b/benchmark/benchmarker.h index 3b4e05bab9..b90a5a407e 100644 --- a/benchmark/benchmarker.h +++ b/benchmark/benchmarker.h @@ -2,7 +2,7 @@ #define __BENCHMARKER_H #include "event_counter.h" -#include "simdjson.h" // For 
SIMDJSON_DISABLE_DEPRECATED_WARNINGS +#include "simdjson.h" #include #include @@ -228,7 +228,7 @@ struct progress_bar { /** * The speed at which we can allocate memory is strictly system specific. * It depends on the OS and the runtime library. It is subject to various - * system-specific knobs. It is not something that we can reasonably + * system-specific knobs. It is not something that we can reasonably * benchmark with crude timings. * If someone wants to optimize how simdjson allocate memory, then it will * almost surely require a distinct benchmarking tool. What is meant by @@ -308,20 +308,20 @@ struct benchmarker { return all_stages_without_allocation.iterations; } - really_inline void run_iteration(bool stage1_only, bool hotbuffers=false) { + simdjson_inline void run_iteration(bool stage1_only, bool hotbuffers=false) { // Allocate dom::parser collector.start(); dom::parser parser; // We always allocate at least 64KB. Smaller allocations may actually be slower under some systems. error_code error = parser.allocate(json.size() < 65536 ? 
65536 : json.size()); if (error) { - exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON result: " + error_message(error)); + exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON text: " + error_message(error)); } event_count allocate_count = collector.end(); allocate_stage << allocate_count; // Run it once to get hot buffers if(hotbuffers) { - auto result = parser.parse((const uint8_t *)json.data(), json.size()); + auto result = parser.parse(reinterpret_cast(json.data()), json.size()); if (result.error()) { exit_error(string("Failed to parse ") + filename + string(":") + error_message(result.error())); } @@ -331,7 +331,7 @@ struct benchmarker { // Stage 1 (find structurals) collector.start(); - error = parser.implementation->stage1((const uint8_t *)json.data(), json.size(), false); + error = parser.implementation->stage1(reinterpret_cast(json.data()), json.size(), stage1_mode::regular); event_count stage1_count = collector.end(); stage1 << stage1_count; if (error) { @@ -367,7 +367,7 @@ struct benchmarker { void run_loop(size_t iterations) { dom::parser parser; - auto firstresult = parser.parse((const uint8_t *)json.data(), json.size()); + auto firstresult = parser.parse(reinterpret_cast(json.data()), json.size()); if (firstresult.error()) { exit_error(string("Failed to parse ") + filename + string(":") + error_message(firstresult.error())); } @@ -375,7 +375,7 @@ struct benchmarker { collector.start(); // some users want something closer to "number of documents per second" for(size_t i = 0; i < iterations; i++) { - auto result = parser.parse((const uint8_t *)json.data(), json.size()); + auto result = parser.parse(reinterpret_cast(json.data()), json.size()); if (result.error()) { exit_error(string("Failed to parse ") + filename + string(":") + error_message(result.error())); } @@ -384,7 +384,7 @@ struct benchmarker { loop << all_loop_count; } - really_inline void 
run_iterations(size_t iterations, bool stage1_only, bool hotbuffers=false) { + simdjson_inline void run_iterations(size_t iterations, bool stage1_only, bool hotbuffers=false) { for (size_t i = 0; i(stats->structurals), stage.instructions() / static_cast(stage.cycles()) ); - +#if !SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS // NOTE: removed cycles/miss because it is a somewhat misleading stat printf("%s%-13s: %7.0f branch misses (%6.2f%%) - %.0f cache misses (%6.2f%%) - %.2f cache references\n", prefix, @@ -434,6 +434,7 @@ struct benchmarker { percent(stage.cache_misses(), all_stages_without_allocation.cache_misses()), stage.cache_references() ); +#endif } } @@ -444,9 +445,9 @@ struct benchmarker { return 100.0 * a / b; } - void print(bool tabbed_output) const { + void print(bool tabbed_output, bool stage1_only) const { if (tabbed_output) { - char* filename_copy = (char*)malloc(strlen(filename)+1); + char* filename_copy = reinterpret_cast(malloc(strlen(filename)+1)); SIMDJSON_PUSH_DISABLE_WARNINGS SIMDJSON_DISABLE_DEPRECATED_WARNING // Validated CRT_SECURE safe here strcpy(filename_copy, filename); @@ -502,24 +503,28 @@ struct benchmarker { stats->blocks_with_16_structurals_flipped, percent(stats->blocks_with_16_structurals_flipped, stats->blocks)); } printf("\n"); - printf("All Stages (excluding allocation)\n"); - print_aggregate("| " , all_stages_without_allocation.best); - // frequently, allocation is a tiny fraction of the running time so we omit it - if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages_without_allocation.best.elapsed_sec()) { - printf("|- Allocation\n"); - print_aggregate("| ", allocate_stage.best); + if(!stage1_only) { + printf("All Stages (excluding allocation)\n"); + print_aggregate("| " , all_stages_without_allocation.best); + // frequently, allocation is a tiny fraction of the running time so we omit it + if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages_without_allocation.best.elapsed_sec()) { + printf("|- Allocation\n"); + 
print_aggregate("| ", allocate_stage.best); + } } printf("|- Stage 1\n"); print_aggregate("| ", stage1.best); - printf("|- Stage 2\n"); - print_aggregate("| ", stage2.best); + if(!stage1_only) { + printf("|- Stage 2\n"); + print_aggregate("| ", stage2.best); + } if (collector.has_events()) { double freq1 = (stage1.best.cycles() / stage1.best.elapsed_sec()) / 1000000000.0; double freq2 = (stage2.best.cycles() / stage2.best.elapsed_sec()) / 1000000000.0; double freqall = (all_stages_without_allocation.best.cycles() / all_stages_without_allocation.best.elapsed_sec()) / 1000000000.0; double freqmin = min(freq1, freq2); double freqmax = max(freq1, freq2); - if((freqall < 0.95 * freqmin) or (freqall > 1.05 * freqmax)) { + if((freqall < 0.95 * freqmin) || (freqall > 1.05 * freqmax)) { printf("\nWarning: The processor frequency fluctuates in an expected way!!!\n" "Range for stage 1 and stage 2 : [%.3f GHz, %.3f GHz], overall: %.3f GHz.\n", freqmin, freqmax, freqall); diff --git a/benchmark/distinct_user_id/boostjson.h b/benchmark/distinct_user_id/boostjson.h new file mode 100644 index 0000000000..3d171ee8b7 --- /dev/null +++ b/benchmark/distinct_user_id/boostjson.h @@ -0,0 +1,29 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_BOOSTJSON + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +struct boostjson { + bool run(simdjson::padded_string &json, std::vector &result) { + + auto root = boost::json::parse(json); + for (const auto &tweet : root.at("statuses").as_array()) { + result.push_back(tweet.at("user").at("id").to_number()); + + if (tweet.as_object().if_contains("retweeted_status")) { + result.push_back(tweet.at("retweeted_status").at("user").at("id").to_number()); + } + } + + return true; + } +}; + +BENCHMARK_TEMPLATE(distinct_user_id, boostjson)->UseManualTime(); + +} // namespace distinct_user_id + +#endif // SIMDJSON_COMPETITION_BOOSTJSON diff --git a/benchmark/distinct_user_id/distinct_user_id.h b/benchmark/distinct_user_id/distinct_user_id.h new 
file mode 100644 index 0000000000..a68c15631f --- /dev/null +++ b/benchmark/distinct_user_id/distinct_user_id.h @@ -0,0 +1,53 @@ + +#pragma once + +#include "json_benchmark/file_runner.h" +#include + +namespace distinct_user_id { + +using namespace json_benchmark; + +template +struct runner : public file_runner { + std::vector result{}; + + bool setup(benchmark::State &state) { + return this->load_json(state, TWITTER_JSON); + } + + bool before_run(benchmark::State &state) { + if (!file_runner::before_run(state)) { return false; } + result.clear(); + return true; + } + + bool run(benchmark::State &) { + return this->implementation.run(this->json, result); + } + + bool after_run(benchmark::State &state) { + if (!file_runner::after_run(state)) { return false; } + std::sort(result.begin(), result.end()); + auto last = std::unique(result.begin(), result.end()); + result.erase(last, result.end()); + return true; + } + + template + bool diff(benchmark::State &state, runner &reference) { + return diff_results(state, result, reference.result, diff_flags::NONE); + } + + size_t items_per_iteration() { + return result.size(); + } +}; + +struct simdjson_dom; + +template simdjson_inline static void distinct_user_id(benchmark::State &state) { + run_json_benchmark, runner>(state); +} + +} // namespace distinct_user_id diff --git a/benchmark/distinct_user_id/nlohmann_json.h b/benchmark/distinct_user_id/nlohmann_json.h new file mode 100644 index 0000000000..c2562dad9e --- /dev/null +++ b/benchmark/distinct_user_id/nlohmann_json.h @@ -0,0 +1,27 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +struct nlohmann_json { + bool run(simdjson::padded_string &json, std::vector &result) { + auto root = nlohmann::json::parse(json.data(), json.data() + json.size()); + for (auto tweet : root["statuses"]) { + result.push_back(tweet["user"]["id"]); + if (tweet.contains("retweeted_status")) { + 
result.push_back(tweet["retweeted_status"]["user"]["id"]); + } + } + + return true; + } +}; + +BENCHMARK_TEMPLATE(distinct_user_id, nlohmann_json)->UseManualTime(); + +} // namespace distinct_user_id + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/distinct_user_id/nlohmann_json_sax.h b/benchmark/distinct_user_id/nlohmann_json_sax.h new file mode 100644 index 0000000000..e10702f1d4 --- /dev/null +++ b/benchmark/distinct_user_id/nlohmann_json_sax.h @@ -0,0 +1,59 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +using json = nlohmann::json; + +struct nlohmann_json_sax { + struct Handler : json::json_sax_t + { + std::vector& result; + bool user = false; + bool user_id = false; + Handler(std::vector &r) : result(r) { } + + bool key(string_t& val) override { + // Assume that valid user/id pairs appear only once in main array of user objects + if (user) { // If already found user object, find id key + if (val.compare("id") == 0) { user_id = true; } + } + else if (val.compare("user") == 0) { user = true; } // Otherwise, find user object + return true; + } + bool number_unsigned(number_unsigned_t val) override { + if (user_id) { + result.emplace_back(val); + user = false; + user_id = false; + } + return true; + } + // Irrelevant events + bool null() override { return true; } + bool boolean(bool val) override { return true; } + bool number_float(number_float_t val, const string_t& s) override { return true; } + bool number_integer(number_integer_t val) override { return true; } + bool string(string_t& val) override { return true; } + bool start_object(std::size_t elements) override { return true; } + bool end_object() override { return true; } + bool start_array(std::size_t elements) override { return true; } + bool end_array() override { return true; } + bool binary(json::binary_t& val) override { return true; } + bool parse_error(std::size_t 
position, const std::string& last_token, const json::exception& ex) override { return false; } + }; // Handler + + bool run(simdjson::padded_string &json, std::vector &result) { + Handler handler(result); + json::sax_parse(json.data(), &handler); + + return true; + } +}; // nlohmann_json_sax +BENCHMARK_TEMPLATE(distinct_user_id, nlohmann_json_sax)->UseManualTime(); +} // namespace distinct_user_id + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/distinct_user_id/rapidjson.h b/benchmark/distinct_user_id/rapidjson.h new file mode 100644 index 0000000000..240cfe1f64 --- /dev/null +++ b/benchmark/distinct_user_id/rapidjson.h @@ -0,0 +1,59 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +using namespace rapidjson; + +struct rapidjson_base { + Document doc{}; + + bool run(Document &root, std::vector &result) { + if (root.HasParseError()) { printf("parse error\n"); return false; } + if (!root.IsObject()) { printf("root is not an object\n"); return false; } + auto statuses = root.FindMember("statuses"); + if (statuses == root.MemberEnd() || !statuses->value.IsArray()) { printf("statuses is not an array\n"); return false; } + for (auto &tweet : statuses->value.GetArray()) { + if (!tweet.IsObject()) { return false; } + auto user = tweet.FindMember("user"); + if (user == tweet.MemberEnd() || !user->value.IsObject()) { printf("user is not an object\n"); return false; } + auto id = user->value.FindMember("id"); + if (id == user->value.MemberEnd() || !id->value.IsUint64()) { printf("id is not an int\n"); return false; } + result.push_back(id->value.GetUint64()); + + auto retweet = tweet.FindMember("retweeted_status"); + if (retweet != tweet.MemberEnd()) { + if (!retweet->value.IsObject()) { printf("retweet is not an object\n"); return false; } + user = retweet->value.FindMember("user"); + if (user == retweet->value.MemberEnd() || !user->value.IsObject()) 
{ printf("rewtweet.user is not an object\n"); return false; } + id = user->value.FindMember("id"); + if (id == user->value.MemberEnd() || !id->value.IsUint64()) { printf("retweet.id is not an int\n"); return false; } + result.push_back(id->value.GetUint64()); + } + } + + return true; + } +}; + +struct rapidjson : rapidjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return rapidjson_base::run(doc.Parse(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(distinct_user_id, rapidjson)->UseManualTime(); + +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct rapidjson_insitu : rapidjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return rapidjson_base::run(doc.ParseInsitu(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(distinct_user_id, rapidjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace partial_tweets + +#endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/distinct_user_id/rapidjson_sax.h b/benchmark/distinct_user_id/rapidjson_sax.h new file mode 100644 index 0000000000..a6eea48ad4 --- /dev/null +++ b/benchmark/distinct_user_id/rapidjson_sax.h @@ -0,0 +1,61 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "distinct_user_id.h" +#include +namespace distinct_user_id { + +using namespace rapidjson; + +struct rapidjson_sax { + struct Handler { + std::vector& result; + bool user = false; + bool user_id = false; + Handler(std::vector &r) : result(r) { } + + bool Key(const char* key, SizeType length, bool copy) { + // Assume that valid user/id pairs appear only once in main array of user objects + if (user) { // If already found user object, find id key + if ((length == 2) && memcmp(key,"id",2) == 0) { user_id = true; } + } + else if ((length == 4) && memcmp(key,"user",4) == 0) { user = true; } // Otherwise, find user object + return true; + } + bool Uint(unsigned i) { // id values are treated as Uint (not Uint64) by the reader + if 
(user_id) { // Getting id if previous key was "id" for a user + result.emplace_back(i); + user_id = false; + user = false; + } + return true; + } + // Irrelevant events + bool Null() { return true; } + bool Bool(bool b) { return true; } + bool Double(double d) { return true; } + bool Int(int i) { return true; } + bool Int64(int64_t i) { return true; } + bool Uint64(uint64_t i) { return true; } + bool RawNumber(const char* str, SizeType length, bool copy) { return true; } + bool String(const char* str, SizeType length, bool copy) { return true; } + bool StartObject() { return true; } + bool EndObject(SizeType memberCount) { return true; } + bool StartArray() { return true; } + bool EndArray(SizeType elementCount) { return true; } + }; // handler + + bool run(simdjson::padded_string &json, std::vector &result) { + Reader reader; + Handler handler(result); + InsituStringStream ss(json.data()); + reader.Parse(ss,handler); + return true; + } + +}; // rapid_jason_sax +BENCHMARK_TEMPLATE(distinct_user_id, rapidjson_sax)->UseManualTime(); +} // namespace distinct_user_id + +#endif // SIMDJSON_COMPETITION_RAPIDJSON \ No newline at end of file diff --git a/benchmark/distinct_user_id/sajson.h b/benchmark/distinct_user_id/sajson.h new file mode 100644 index 0000000000..af4c9d578f --- /dev/null +++ b/benchmark/distinct_user_id/sajson.h @@ -0,0 +1,82 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_SAJSON + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +struct sajson { + size_t ast_buffer_size{0}; + size_t *ast_buffer{nullptr}; + ~sajson() { free(ast_buffer); } + + simdjson_inline std::string_view get_string_view(const ::sajson::value &obj, std::string_view key) { + auto val = obj.get_value_of_key({key.data(), key.length()}); + if (val.get_type() != ::sajson::TYPE_STRING) { throw "field is not a string"; } + return { val.as_cstring(), val.get_string_length() }; + } + simdjson_inline uint64_t get_str_uint64(const ::sajson::value &obj, std::string_view key) { + 
// Since sajson only supports 53-bit numbers, and IDs in twitter.json can be > 53 bits, we read the corresponding id_str and parse that. + auto val = obj.get_value_of_key({key.data(), key.length()}); + if (val.get_type() != ::sajson::TYPE_STRING) { throw "field not a string"; } + auto str = val.as_cstring(); + char *endptr; + uint64_t result = strtoull(str, &endptr, 10); + if (endptr != &str[val.get_string_length()]) { throw "field is a string, but not an integer string"; } + return result; + } + + bool run(simdjson::padded_string &json, std::vector &result) { + using namespace sajson; + if (!ast_buffer) { + ast_buffer_size = json.size(); + ast_buffer = (size_t *)std::malloc(ast_buffer_size * sizeof(size_t)); + } + auto doc = parse( + bounded_allocation(ast_buffer, ast_buffer_size), + mutable_string_view(json.size(), json.data()) + ); + if (!doc.is_valid()) { return false; } + + auto root = doc.get_root(); + if (root.get_type() != TYPE_OBJECT) { return false; } + auto statuses = root.get_value_of_key({"statuses", strlen("statuses")}); + if (statuses.get_type() != TYPE_ARRAY) { return false; } + + for (size_t i=0; iUseManualTime(); + +} // namespace distinct_user_id + +#endif // SIMDJSON_COMPETITION_SAJSON + diff --git a/benchmark/distinct_user_id/simdjson_dom.h b/benchmark/distinct_user_id/simdjson_dom.h new file mode 100644 index 0000000000..a25e95afd5 --- /dev/null +++ b/benchmark/distinct_user_id/simdjson_dom.h @@ -0,0 +1,36 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +using namespace simdjson; + +struct simdjson_dom { + dom::parser parser{}; + + bool run(simdjson::padded_string &json, std::vector &result) { + // Walk the document, parsing as we go + auto doc = parser.parse(json); + for (dom::object tweet : doc["statuses"]) { + // We believe that all statuses have a matching + // user, and we are willing to throw when they do not. 
+ result.push_back(tweet["user"]["id"]); + // Not all tweets have a "retweeted_status", but when they do + // we want to go and find the user within. + auto retweet = tweet["retweeted_status"]; + if (retweet.error() != NO_SUCH_FIELD) { + result.push_back(retweet["user"]["id"]); + } + } + return true; + } +}; + +BENCHMARK_TEMPLATE(distinct_user_id, simdjson_dom)->UseManualTime(); + +} // namespace distinct_user_id + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/distinct_user_id/simdjson_dom_json_pointer.h b/benchmark/distinct_user_id/simdjson_dom_json_pointer.h new file mode 100644 index 0000000000..c852e018d1 --- /dev/null +++ b/benchmark/distinct_user_id/simdjson_dom_json_pointer.h @@ -0,0 +1,36 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +using namespace simdjson; + +struct simdjson_dom_json_pointer { + dom::parser parser{}; + + bool run(simdjson::padded_string &json, std::vector &result) { + // Walk the document, parsing as we go + auto doc = parser.parse(json); + for (dom::object tweet : doc["statuses"]) { + // We believe that all statuses have a matching + // user, and we are willing to throw when they do not. + result.push_back(tweet.at_pointer("/user/id")); + // Not all tweets have a "retweeted_status", but when they do + // we want to go and find the user within. 
+ auto retweet_id = tweet.at_pointer("/retweeted_status/user/id"); + if (retweet_id.error() != NO_SUCH_FIELD) { + result.push_back(retweet_id); + } + } + return true; + } +}; + +BENCHMARK_TEMPLATE(distinct_user_id, simdjson_dom_json_pointer)->UseManualTime(); + +} // namespace distinct_user_id + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/distinct_user_id/simdjson_ondemand.h b/benchmark/distinct_user_id/simdjson_ondemand.h new file mode 100644 index 0000000000..1ce8d59317 --- /dev/null +++ b/benchmark/distinct_user_id/simdjson_ondemand.h @@ -0,0 +1,37 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +using namespace simdjson; + +struct simdjson_ondemand { + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, std::vector &result) { + // Walk the document, parsing as we go + auto doc = parser.iterate(json); + for (ondemand::object tweet : doc.find_field("statuses")) { + // We believe that all statuses have a matching + // user, and we are willing to throw when they do not. + result.push_back(tweet.find_field("user").find_field("id")); + // Not all tweets have a "retweeted_status", but when they do + // we want to go and find the user within. 
+ auto retweet = tweet.find_field("retweeted_status"); + if (!retweet.error()) { + result.push_back(retweet.find_field("user").find_field("id")); + } + } + + return true; + } +}; + +BENCHMARK_TEMPLATE(distinct_user_id, simdjson_ondemand)->UseManualTime(); + +} // namespace distinct_user_id + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/distinct_user_id/simdjson_ondemand_json_pointer.h b/benchmark/distinct_user_id/simdjson_ondemand_json_pointer.h new file mode 100644 index 0000000000..df102c0b3d --- /dev/null +++ b/benchmark/distinct_user_id/simdjson_ondemand_json_pointer.h @@ -0,0 +1,37 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +using namespace simdjson; + +struct simdjson_ondemand_json_pointer { + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, std::vector &result) { + // Walk the document, parsing as we go + auto doc = parser.iterate(json); + for (ondemand::object tweet : doc.find_field("statuses")) { + // We believe that all statuses have a matching + // user, and we are willing to throw when they do not. + result.push_back(tweet.at_pointer("/user/id")); + // Not all tweets have a "retweeted_status", but when they do + // we want to go and find the user within. 
+ auto retweet_id = tweet.at_pointer("/retweeted_status/user/id"); + if (retweet_id.error() != NO_SUCH_FIELD) { + result.push_back(retweet_id); + } + } + + return true; + } +}; + +BENCHMARK_TEMPLATE(distinct_user_id, simdjson_ondemand_json_pointer)->UseManualTime(); + +} // namespace distinct_user_id + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/distinct_user_id/yyjson.h b/benchmark/distinct_user_id/yyjson.h new file mode 100644 index 0000000000..99b446cf9c --- /dev/null +++ b/benchmark/distinct_user_id/yyjson.h @@ -0,0 +1,62 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_YYJSON + +#include "distinct_user_id.h" + +namespace distinct_user_id { + +struct yyjson_base { + bool run(yyjson_doc *doc, std::vector &result) { + if (!doc) { return false; } + yyjson_val *root = yyjson_doc_get_root(doc); + if (!yyjson_is_obj(root)) { return false; } + yyjson_val *statuses = yyjson_obj_get(root, "statuses"); + if (!yyjson_is_arr(statuses)) { return false; } + + // Walk the document, parsing the tweets as we go + size_t tweet_idx, tweets_max; + yyjson_val *tweet; + yyjson_arr_foreach(statuses, tweet_idx, tweets_max, tweet) { + auto user = yyjson_obj_get(tweet, "user"); + if (!yyjson_is_obj(user)) { return false; } + auto id = yyjson_obj_get(user, "id"); + if (!yyjson_is_uint(id)) { return false; } + result.push_back(yyjson_get_uint(id)); + + // Not all tweets have a "retweeted_status", but when they do + // we want to go and find the user within. 
+ auto retweet = yyjson_obj_get(tweet, "retweeted_status"); + if (retweet) { + if (!yyjson_is_obj(retweet)) { return false; } + user = yyjson_obj_get(retweet, "user"); + if (!yyjson_is_obj(user)) { return false; } + id = yyjson_obj_get(user, "id"); + if (!yyjson_is_uint(id)) { return false; } + result.push_back(yyjson_get_sint(id)); + } + } + + return true; + } + +}; + +struct yyjson : yyjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return yyjson_base::run(yyjson_read(json.data(), json.size(), 0), result); + } +}; +BENCHMARK_TEMPLATE(distinct_user_id, yyjson)->UseManualTime(); + +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct yyjson_insitu : yyjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return yyjson_base::run(yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0), result); + } +}; +BENCHMARK_TEMPLATE(distinct_user_id, yyjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace distinct_user_id + +#endif // SIMDJSON_COMPETITION_YYJSON diff --git a/benchmark/distinctuseridcompetition.cpp b/benchmark/distinctuseridcompetition.cpp deleted file mode 100644 index f2c4a64348..0000000000 --- a/benchmark/distinctuseridcompetition.cpp +++ /dev/null @@ -1,406 +0,0 @@ -#include "simdjson.h" -#include -#include -#include -#include - -#include "benchmark.h" - -SIMDJSON_PUSH_DISABLE_ALL_WARNINGS - -// #define RAPIDJSON_SSE2 // bad for performance -// #define RAPIDJSON_SSE42 // bad for performance -#include "rapidjson/document.h" -#include "rapidjson/reader.h" -#include "rapidjson/stringbuffer.h" -#include "rapidjson/writer.h" - -#include "sajson.h" - -SIMDJSON_POP_DISABLE_WARNINGS - -using namespace rapidjson; - -bool equals(const char *s1, const char *s2) { return strcmp(s1, s2) == 0; } - -void remove_duplicates(std::vector &v) { - std::sort(v.begin(), v.end()); - auto last = std::unique(v.begin(), v.end()); - v.erase(last, v.end()); -} - -void 
print_vec(const std::vector &v) { - for (auto i : v) { - std::cout << i << " "; - } - std::cout << std::endl; -} - -// clang-format off - -// simdjson_recurse below come be implemented like so but it is slow: -/*void simdjson_recurse(std::vector & v, simdjson::dom::element element) { - error_code error; - if (element.is_array()) { - dom::array array; - error = element.get(array); - for (auto child : array) { - if (child.is() || child.is()) { - simdjson_recurse(v, child); - } - } - } else if (element.is_object()) { - int64_t id; - error = element["user"]["id"].get(id); - if(!error) { - v.push_back(id); - } - for (auto [key, value] : object) { - if (value.is() || value.is()) { - simdjson_recurse(v, value); - } - } - } -}*/ -// clang-format on - - -really_inline void simdjson_recurse(std::vector & v, simdjson::dom::element element); -void simdjson_recurse(std::vector & v, simdjson::dom::array array) { - for (auto child : array) { - simdjson_recurse(v, child); - } -} -void simdjson_recurse(std::vector & v, simdjson::dom::object object) { - for (auto [key, value] : object) { - if((key.size() == 4) && (memcmp(key.data(), "user", 4) == 0)) { - // we are in an object under the key "user" - simdjson::error_code error; - simdjson::dom::object child_object; - simdjson::dom::object child_array; - if (not (error = value.get(child_object))) { - for (auto [child_key, child_value] : child_object) { - if((child_key.size() == 2) && (memcmp(child_key.data(), "id", 2) == 0)) { - int64_t x; - if (not (error = child_value.get(x))) { - v.push_back(x); - } - } - simdjson_recurse(v, child_value); - } - } else if (not (error = value.get(child_array))) { - simdjson_recurse(v, child_array); - } - // end of: we are in an object under the key "user" - } else { - simdjson_recurse(v, value); - } - } -} -really_inline void simdjson_recurse(std::vector & v, simdjson::dom::element element) { - UNUSED simdjson::error_code error; - simdjson::dom::array array; - simdjson::dom::object object; - if (not 
(error = element.get(array))) { - simdjson_recurse(v, array); - } else if (not (error = element.get(object))) { - simdjson_recurse(v, object); - } -} - -really_inline std::vector -simdjson_just_dom(simdjson::dom::element doc) { - std::vector answer; - simdjson_recurse(answer, doc); - remove_duplicates(answer); - return answer; -} - -really_inline std::vector -simdjson_compute_stats(const simdjson::padded_string &p) { - std::vector answer; - simdjson::dom::parser parser; - simdjson::dom::element doc; - auto error = parser.parse(p).get(doc); - if (!error) { - simdjson_recurse(answer, doc); - remove_duplicates(answer); - } - return answer; -} - -really_inline simdjson::error_code -simdjson_just_parse(const simdjson::padded_string &p) { - simdjson::dom::parser parser; - return parser.parse(p).error(); -} - -void sajson_traverse(std::vector &answer, const sajson::value &node) { - using namespace sajson; - switch (node.get_type()) { - case TYPE_ARRAY: { - auto length = node.get_length(); - for (size_t i = 0; i < length; ++i) { - sajson_traverse(answer, node.get_array_element(i)); - } - break; - } - case TYPE_OBJECT: { - auto length = node.get_length(); - // sajson has O(log n) find_object_key, but we still visit each node anyhow - // because we need to visit all values. - for (auto i = 0u; i < length; ++i) { - auto key = node.get_object_key(i); // expected: sajson::string - bool found_user = - (key.length() == 4) && (memcmp(key.data(), "user", 4) == 0); - if (found_user) { // found a user!!! 
- auto user_value = node.get_object_value(i); // get the value - if (user_value.get_type() == - TYPE_OBJECT) { // the value should be an object - // now we know that we only need one value - auto user_value_length = user_value.get_length(); - auto right_index = - user_value.find_object_key(sajson::string("id", 2)); - if (right_index < user_value_length) { - auto v = user_value.get_object_value(right_index); - if (v.get_type() == TYPE_INTEGER) { // check that it is an integer - answer.push_back(v.get_integer_value()); // record it! - } else if (v.get_type() == TYPE_DOUBLE) { - answer.push_back((int64_t)v.get_double_value()); // record it! - } - } - } - } - sajson_traverse(answer, node.get_object_value(i)); - } - break; - } - case TYPE_NULL: - case TYPE_FALSE: - case TYPE_TRUE: - case TYPE_STRING: - case TYPE_DOUBLE: - case TYPE_INTEGER: - break; - default: - assert(false && "unknown node type"); - } -} - -really_inline std::vector -sasjon_just_dom(sajson::document &d) { - std::vector answer; - sajson_traverse(answer, d.get_root()); - remove_duplicates(answer); - return answer; -} - -really_inline std::vector -sasjon_compute_stats(const simdjson::padded_string &p) { - std::vector answer; - char *buffer = (char *)malloc(p.size()); - memcpy(buffer, p.data(), p.size()); - auto d = sajson::parse(sajson::dynamic_allocation(), - sajson::mutable_string_view(p.size(), buffer)); - if (!d.is_valid()) { - free(buffer); - return answer; - } - sajson_traverse(answer, d.get_root()); - free(buffer); - remove_duplicates(answer); - return answer; -} - -really_inline bool -sasjon_just_parse(const simdjson::padded_string &p) { - char *buffer = (char *)malloc(p.size()); - memcpy(buffer, p.data(), p.size()); - auto d = sajson::parse(sajson::dynamic_allocation(), - sajson::mutable_string_view(p.size(), buffer)); - bool answer = !d.is_valid(); - free(buffer); - return answer; -} - -void rapid_traverse(std::vector &answer, const rapidjson::Value &v) { - switch (v.GetType()) { - case 
kObjectType: - for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd(); - ++m) { - bool found_user = (m->name.GetStringLength() == 4) && - (memcmp(m->name.GetString(), "user", 4) == 0); - if (found_user) { - const rapidjson::Value &child = m->value; - if (child.GetType() == kObjectType) { - for (Value::ConstMemberIterator k = child.MemberBegin(); - k != child.MemberEnd(); ++k) { - if (equals(k->name.GetString(), "id")) { - const rapidjson::Value &val = k->value; - if (val.GetType() == kNumberType) { - answer.push_back(val.GetInt64()); - } - } - } - } - } - rapid_traverse(answer, m->value); - } - break; - case kArrayType: - for (Value::ConstValueIterator i = v.Begin(); i != v.End(); - ++i) { // v.Size(); - rapid_traverse(answer, *i); - } - break; - case kNullType: - case kFalseType: - case kTrueType: - case kStringType: - case kNumberType: - default: - break; - } -} - -really_inline std::vector -rapid_just_dom(rapidjson::Document &d) { - std::vector answer; - rapid_traverse(answer, d); - remove_duplicates(answer); - return answer; -} - -really_inline std::vector -rapid_compute_stats(const simdjson::padded_string &p) { - std::vector answer; - char *buffer = (char *)malloc(p.size() + 1); - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] = '\0'; - rapidjson::Document d; - d.ParseInsitu(buffer); - if (d.HasParseError()) { - free(buffer); - return answer; - } - rapid_traverse(answer, d); - free(buffer); - remove_duplicates(answer); - return answer; -} - -really_inline bool -rapid_just_parse(const simdjson::padded_string &p) { - char *buffer = (char *)malloc(p.size() + 1); - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] = '\0'; - rapidjson::Document d; - d.ParseInsitu(buffer); - bool answer = d.HasParseError(); - free(buffer); - return answer; -} - -int main(int argc, char *argv[]) { - bool verbose = false; - bool just_data = false; - - int c; - while ((c = getopt(argc, argv, "vt")) != -1) - switch (c) { - case 't': - just_data = true; - 
break; - case 'v': - verbose = true; - break; - default: - abort(); - } - if (optind >= argc) { - std::cerr - << "Using different parsers, we compute the content statistics of " - "JSON documents." - << std::endl; - std::cerr << "Usage: " << argv[0] << " " << std::endl; - std::cerr << "Or " << argv[0] << " -v " << std::endl; - exit(1); - } - const char *filename = argv[optind]; - if (optind + 1 < argc) { - std::cerr << "warning: ignoring everything after " << argv[optind + 1] - << std::endl; - } - simdjson::padded_string p; - auto error = simdjson::padded_string::load(filename).get(p); - if (error) { - std::cerr << "Could not load the file " << filename << std::endl; - return EXIT_FAILURE; - } - // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte - if (verbose) { - std::cout << "Input has "; - if (p.size() > 1000 * 1000) - std::cout << p.size() / (1000 * 1000) << " MB "; - else if (p.size() > 1000) - std::cout << p.size() / 1000 << " KB "; - else - std::cout << p.size() << " B "; - std::cout << std::endl; - } - std::vector s1 = simdjson_compute_stats(p); - if (verbose) { - printf("simdjson: "); - print_vec(s1); - } - std::vector s2 = rapid_compute_stats(p); - if (verbose) { - printf("rapid: "); - print_vec(s2); - } - std::vector s3 = sasjon_compute_stats(p); - if (verbose) { - printf("sasjon: "); - print_vec(s3); - } - assert(s1 == s2); - assert(s1 == s3); - size_t size = s1.size(); - - int repeat = 500; - size_t volume = p.size(); - if (just_data) { - printf( - "name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n"); - } - BEST_TIME("simdjson ", simdjson_compute_stats(p).size(), size, , repeat, - volume, !just_data); - BEST_TIME("rapid ", rapid_compute_stats(p).size(), size, , repeat, volume, - !just_data); - BEST_TIME("sasjon ", sasjon_compute_stats(p).size(), size, , repeat, volume, - !just_data); - BEST_TIME("simdjson (just parse) ", simdjson_just_parse(p), simdjson::error_code::SUCCESS, , repeat, - volume, !just_data); - BEST_TIME("rapid (just 
parse) ", rapid_just_parse(p), false, , repeat, - volume, !just_data); - BEST_TIME("sasjon (just parse) ", sasjon_just_parse(p), false, , repeat, - volume, !just_data); - simdjson::dom::parser parser; - simdjson::dom::element doc; - error = parser.parse(p).get(doc); - BEST_TIME("simdjson (just dom) ", simdjson_just_dom(doc).size(), size, - , repeat, volume, !just_data); - char *buffer = (char *)malloc(p.size() + 1); - buffer[p.size()] = '\0'; - memcpy(buffer, p.data(), p.size()); - rapidjson::Document drapid; - drapid.ParseInsitu(buffer); - BEST_TIME("rapid (just dom) ", rapid_just_dom(drapid).size(), size, , repeat, - volume, !just_data); - memcpy(buffer, p.data(), p.size()); - auto dsasjon = sajson::parse(sajson::dynamic_allocation(), - sajson::mutable_string_view(p.size(), buffer)); - BEST_TIME("sasjon (just dom) ", sasjon_just_dom(dsasjon).size(), size, , - repeat, volume, !just_data); - free(buffer); -} diff --git a/benchmark/dom/CMakeLists.txt b/benchmark/dom/CMakeLists.txt new file mode 100644 index 0000000000..0528cdf69f --- /dev/null +++ b/benchmark/dom/CMakeLists.txt @@ -0,0 +1,16 @@ +include_directories( .. 
../linux ) +link_libraries(simdjson-windows-headers test-data) +link_libraries(simdjson) + +add_executable(perfdiff perfdiff.cpp) +add_executable(parse parse.cpp) +add_executable(parse_stream parse_stream.cpp) +add_executable(statisticalmodel statisticalmodel.cpp) + +add_executable(parse_noutf8validation parse.cpp) +target_compile_definitions(parse_noutf8validation PRIVATE SIMDJSON_SKIPUTF8VALIDATION) +add_executable(parse_nonumberparsing parse.cpp) +target_compile_definitions(parse_nonumberparsing PRIVATE SIMDJSON_SKIPNUMBERPARSING) +add_executable(parse_nostringparsing parse.cpp) +target_compile_definitions(parse_nostringparsing PRIVATE SIMDJSON_SKIPSTRINGPARSING) +include(checkperf.cmake) diff --git a/benchmark/checkperf.cmake b/benchmark/dom/checkperf.cmake similarity index 88% rename from benchmark/checkperf.cmake rename to benchmark/dom/checkperf.cmake index d1ae21b0a2..e77cbb753e 100644 --- a/benchmark/checkperf.cmake +++ b/benchmark/dom/checkperf.cmake @@ -5,9 +5,12 @@ # checkperf-repo: initialize and sync reference repository (first time only) # TEST checkperf: runs the actual checkperf test +option(SIMDJSON_ENABLE_DOM_CHECKPERF "Enable DOM performance comparison with main branch" OFF) + + # Clone the repository if it's not there find_package(Git QUIET) -if (SIMDJSON_IS_UNDER_GIT AND Git_FOUND AND (GIT_VERSION_STRING VERSION_GREATER "2.1.4") AND (NOT CMAKE_GENERATOR MATCHES Ninja) ) # We use "-C" which requires a recent git +if (SIMDJSON_ENABLE_DOM_CHECKPERF AND Git_FOUND AND (GIT_VERSION_STRING VERSION_GREATER "2.1.4") AND (NOT CMAKE_GENERATOR MATCHES Ninja) AND (NOT MSVC) ) # We use "-C" which requires a recent git message(STATUS "Git is available and it is recent. We are enabling checkperf targets.") # sync_git_repository(myrepo ...) 
creates two targets: # myrepo - if the repo does not exist, creates and syncs it against the origin branch @@ -61,7 +64,13 @@ if (SIMDJSON_IS_UNDER_GIT AND Git_FOUND AND (GIT_VERSION_STRING VERSION_GREATER OUTPUT ${SIMDJSON_CHECKPERF_DIR}/build/cmake_install.cmake # We make many things but this seems the most cross-platform one we can depend on COMMAND ${CMAKE_COMMAND} -E env CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} - ${CMAKE_COMMAND} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DSIMDJSON_GOOGLE_BENCHMARKS=OFF -DSIMDJSON_COMPETITION=OFF -G ${CMAKE_GENERATOR} .. + ${CMAKE_COMMAND} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DSIMDJSON_GOOGLE_BENCHMARKS=OFF + -DSIMDJSON_COMPETITION=OFF + -DSIMDJSON_DEVELOPER_MODE=YES + -G ${CMAKE_GENERATOR} + .. WORKING_DIRECTORY ${SIMDJSON_CHECKPERF_DIR}/build DEPENDS ${SIMDJSON_CHECKPERF_DIR}/build/CMakeCache.txt ) @@ -70,7 +79,7 @@ if (SIMDJSON_IS_UNDER_GIT AND Git_FOUND AND (GIT_VERSION_STRING VERSION_GREATER if (CMAKE_CONFIGURATION_TYPES) set(CHECKPERF_PARSE ${SIMDJSON_CHECKPERF_DIR}/build/benchmark/$/parse) else() - set(CHECKPERF_PARSE ${SIMDJSON_CHECKPERF_DIR}/build/benchmark/parse) + set(CHECKPERF_PARSE ${SIMDJSON_CHECKPERF_DIR}/build/benchmark/dom/parse) endif() add_custom_target( checkperf-parse ALL # TODO is ALL necessary? 
@@ -89,13 +98,15 @@ if (SIMDJSON_IS_UNDER_GIT AND Git_FOUND AND (GIT_VERSION_STRING VERSION_GREATER # COMMAND ECHO $ \"$ -t ${SIMDJSON_CHECKPERF_ARGS}\" \"${CHECKPERF_PARSE} -t ${SIMDJSON_CHECKPERF_ARGS}\" } COMMAND $ $ ${CHECKPERF_PARSE} -H -t ${SIMDJSON_CHECKPERF_ARGS} ) - set_property(TEST checkperf APPEND PROPERTY LABELS per_implementation) + set_property(TEST checkperf APPEND PROPERTY LABELS per_implementation explicitonly) set_property(TEST checkperf APPEND PROPERTY DEPENDS parse perfdiff ${SIMDJSON_USER_CMAKECACHE}) set_property(TEST checkperf PROPERTY RUN_SERIAL TRUE) + add_dependencies(per_implementation_tests checkperf) + add_dependencies(explicitonly_tests checkperf) else() if (CMAKE_GENERATOR MATCHES Ninja) message(STATUS "We disable the checkperf targets under Ninja.") - else() + else() message(STATUS "Either git is unavailable or else it is too old. We are disabling checkperf targets.") endif() endif () diff --git a/benchmark/parse.cpp b/benchmark/dom/parse.cpp similarity index 88% rename from benchmark/parse.cpp rename to benchmark/dom/parse.cpp index bb997aa3b7..d26622da17 100644 --- a/benchmark/parse.cpp +++ b/benchmark/dom/parse.cpp @@ -66,8 +66,10 @@ void print_usage(ostream& out) { out << "-H - Make the buffers hot (reduce page allocation and related OS tasks during parsing) [default]" << endl; out << "-a IMPL - Use the given parser implementation. By default, detects the most advanced" << endl; out << " implementation supported on the host machine." << endl; - for (auto impl : simdjson::available_implementations) { - out << "-a " << std::left << std::setw(9) << impl->name() << " - Use the " << impl->description() << " parser implementation." << endl; + for (auto impl : simdjson::get_available_implementations()) { + if(impl->supported_by_runtime_system()) { + out << "-a " << std::left << std::setw(9) << impl->name() << " - Use the " << impl->description() << " parser implementation." 
<< endl; + } } } @@ -114,16 +116,18 @@ struct option_struct { verbose = true; break; case 'a': { - const implementation *impl = simdjson::available_implementations[optarg]; - if (!impl) { + const implementation *impl = simdjson::get_available_implementations()[optarg]; + if ((!impl) || (!impl->supported_by_runtime_system())) { std::string exit_message = string("Unsupported option value -a ") + optarg + ": expected -a with one of "; - for (auto imple : simdjson::available_implementations) { - exit_message += imple->name(); - exit_message += " "; + for (auto imple : simdjson::get_available_implementations()) { + if(imple->supported_by_runtime_system()) { + exit_message += imple->name(); + exit_message += " "; + } } exit_usage(exit_message); } - simdjson::active_implementation = impl; + simdjson::get_active_implementation() = impl; break; } case 'C': @@ -171,7 +175,7 @@ int main(int argc, char *argv[]) { option_struct options(argc, argv); if (options.verbose) { verbose_stream = &cout; - verbose() << "Implementation: " << simdjson::active_implementation->name() << endl; + verbose() << "Implementation: " << simdjson::get_active_implementation()->name() << endl; } // Start collecting events. 
We put this early so if it prints an error message, it's the @@ -214,7 +218,7 @@ int main(int argc, char *argv[]) { if (!options.verbose) { progress.erase(); } for (size_t i=0; iprint(options.tabbed_output); + benchmarkers[i]->print(options.tabbed_output, options.stage1_only); delete benchmarkers[i]; } diff --git a/benchmark/parse_stream.cpp b/benchmark/dom/parse_stream.cpp similarity index 98% rename from benchmark/parse_stream.cpp rename to benchmark/dom/parse_stream.cpp index d893fd1d48..1217535e7e 100644 --- a/benchmark/parse_stream.cpp +++ b/benchmark/dom/parse_stream.cpp @@ -25,11 +25,12 @@ int main(int argc, char *argv[]) { exit(1); } const char *filename = argv[1]; - auto[p, err] = simdjson::padded_string::load(filename); - if (err) { + auto v = simdjson::padded_string::load(filename); + if (v.error()) { std::cerr << "Could not load the file " << filename << std::endl; return EXIT_FAILURE; } + const simdjson::padded_string& p = v.value_unsafe(); if (test_baseline) { std::wclog << "Baseline: Getline + normal parse... 
" << std::endl; std::cout << "Gigabytes/second\t" diff --git a/benchmark/perfdiff.cpp b/benchmark/dom/perfdiff.cpp similarity index 100% rename from benchmark/perfdiff.cpp rename to benchmark/dom/perfdiff.cpp diff --git a/benchmark/statisticalmodel.cpp b/benchmark/dom/statisticalmodel.cpp similarity index 96% rename from benchmark/statisticalmodel.cpp rename to benchmark/dom/statisticalmodel.cpp index 89efe82d66..2d7c9a7cc9 100644 --- a/benchmark/statisticalmodel.cpp +++ b/benchmark/dom/statisticalmodel.cpp @@ -41,7 +41,7 @@ using stat_t = struct stat_s; -really_inline void simdjson_process_atom(stat_t &s, +simdjson_inline void simdjson_process_atom(stat_t &s, simdjson::dom::element element) { if (element.is()) { s.integer_count++; @@ -50,9 +50,9 @@ really_inline void simdjson_process_atom(stat_t &s, } else if(element.is()) { s.float_count++; } else if (element.is()) { - simdjson::error_code err; bool v; - err = element.get(v); + simdjson::error_code error; + if ((error = element.get(v))) { std::cerr << error << std::endl; abort(); } if (v) { s.true_count++; } else { @@ -183,7 +183,7 @@ int main(int argc, char *argv[]) { for (uint32_t i = 0; i < iterations; i++) { unified.start(); // The default template is simdjson::architecture::NATIVE. - bool isok = (parser.implementation->stage1((const uint8_t *)p.data(), p.size(), false) == simdjson::SUCCESS); + bool isok = (parser.implementation->stage1((const uint8_t *)p.data(), p.size(), simdjson::stage1_mode::regular) == simdjson::SUCCESS); unified.end(results); cy1 += results[0]; diff --git a/benchmark/event_counter.h b/benchmark/event_counter.h index 0610382ee4..1d4da4122a 100644 --- a/benchmark/event_counter.h +++ b/benchmark/event_counter.h @@ -1,6 +1,15 @@ #ifndef __EVENT_COUNTER_H #define __EVENT_COUNTER_H +#ifndef SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS +#ifdef __aarch64__ +// on ARM, we use just cycles and instructions +#define SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS 1 +#else +// elsewhere, we try to use four counters. 
+#define SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS 0 +#endif +#endif #include #include #ifndef _MSC_VER @@ -25,11 +34,15 @@ #include #include -#include "linux-perf-events.h" #ifdef __linux__ +#include "linux-perf-events.h" #include #endif +#if __APPLE__ && __aarch64__ +#include "apple/apple_arm_events.h" +#endif + #include "simdjson.h" using std::string; @@ -46,6 +59,12 @@ struct event_count { event_count(const event_count& other): elapsed(other.elapsed), event_counts(other.event_counts) { } // The types of counters (so we can read the getter more easily) + #if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + enum event_counter_types { + CPU_CYCLES, + INSTRUCTIONS + }; + #else enum event_counter_types { CPU_CYCLES, INSTRUCTIONS, @@ -53,15 +72,16 @@ struct event_count { CACHE_REFERENCES, CACHE_MISSES }; - + #endif double elapsed_sec() const { return duration(elapsed).count(); } double elapsed_ns() const { return duration(elapsed).count(); } double cycles() const { return static_cast(event_counts[CPU_CYCLES]); } double instructions() const { return static_cast(event_counts[INSTRUCTIONS]); } +#if !SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS double branch_misses() const { return static_cast(event_counts[BRANCH_MISSES]); } double cache_references() const { return static_cast(event_counts[CACHE_REFERENCES]); } double cache_misses() const { return static_cast(event_counts[CACHE_MISSES]); } - +#endif event_count& operator=(const event_count& other) { this->elapsed = other.elapsed; this->event_counts = other.event_counts; @@ -105,9 +125,11 @@ struct event_aggregate { double elapsed_ns() const { return total.elapsed_ns() / iterations; } double cycles() const { return total.cycles() / iterations; } double instructions() const { return total.instructions() / iterations; } +#if !SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS double branch_misses() const { return total.branch_misses() / iterations; } double cache_references() const { return total.cache_references() / iterations; } double cache_misses() const { 
return total.cache_misses() / iterations; } +#endif }; struct event_collector { @@ -117,15 +139,29 @@ struct event_collector { #if defined(__linux__) LinuxEvents linux_events; event_collector() : linux_events(vector{ + #if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + PERF_COUNT_HW_CPU_CYCLES, + PERF_COUNT_HW_INSTRUCTIONS, + #else PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, PERF_COUNT_HW_BRANCH_MISSES, PERF_COUNT_HW_CACHE_REFERENCES, PERF_COUNT_HW_CACHE_MISSES + #endif }) {} bool has_events() { return linux_events.is_working(); } +#elif __APPLE__ && __aarch64__ + AppleEvents apple_events; + performance_counters diff; + event_collector() : diff(0) { + apple_events.setup_performance_counters(); + } + bool has_events() { + return apple_events.setup_performance_counters(); + } #else event_collector() {} bool has_events() { @@ -133,16 +169,28 @@ struct event_collector { } #endif - really_inline void start() { + simdjson_inline void start() { #if defined(__linux) linux_events.start(); +#elif __APPLE__ && __aarch64__ + if(has_events()) { diff = apple_events.get_counters(); } #endif start_clock = steady_clock::now(); } - really_inline event_count& end() { + simdjson_inline event_count& end() { time_point end_clock = steady_clock::now(); #if defined(__linux) linux_events.end(count.event_counts); +#elif __APPLE__ && __aarch64__ + if(has_events()) { + performance_counters end = apple_events.get_counters(); + diff = end - diff; + } + count.event_counts[0] = diff.cycles; + count.event_counts[1] = diff.instructions; + count.event_counts[2] = diff.missed_branches; + count.event_counts[3] = 0; + count.event_counts[4] = 0; #endif count.elapsed = end_clock - start_clock; return count; diff --git a/benchmark/find_tweet/boostjson.h b/benchmark/find_tweet/boostjson.h new file mode 100644 index 0000000000..adbb7ad70c --- /dev/null +++ b/benchmark/find_tweet/boostjson.h @@ -0,0 +1,30 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_BOOSTJSON + +#include "find_tweet.h" + +namespace 
find_tweet { + +struct boostjson { + using StringType=std::string; + + bool run(simdjson::padded_string &json, uint64_t find_id, std::string &result) { + + auto root = boost::json::parse(json); + for (const auto &tweet : root.at("statuses").as_array()) { + if (tweet.at("id") == find_id) { + result = tweet.at("text").as_string(); + return true; + } + } + + return false; + } +}; + +BENCHMARK_TEMPLATE(find_tweet, boostjson)->UseManualTime(); + +} // namespace find_tweet + +#endif // SIMDJSON_COMPETITION_BOOSTJSON diff --git a/benchmark/find_tweet/find_tweet.h b/benchmark/find_tweet/find_tweet.h new file mode 100644 index 0000000000..e2f2a30382 --- /dev/null +++ b/benchmark/find_tweet/find_tweet.h @@ -0,0 +1,40 @@ + +#pragma once + +#include "json_benchmark/file_runner.h" + +namespace find_tweet { + +using namespace json_benchmark; + +template +struct runner : public file_runner { + typename I::StringType result; + + bool setup(benchmark::State &state) { + return this->load_json(state, TWITTER_JSON); + } + + bool before_run(benchmark::State &state) { + if (!file_runner::before_run(state)) { return false; } + result = ""; + return true; + } + + bool run(benchmark::State &) { + return this->implementation.run(this->json, 505874901689851904ULL, result); + } + + template + bool diff(benchmark::State &state, runner &reference) { + return diff_results(state, result, reference.result, diff_flags::NONE); + } +}; + +struct simdjson_dom; + +template simdjson_inline static void find_tweet(benchmark::State &state) { + run_json_benchmark, runner>(state); +} + +} // namespace find_tweet diff --git a/benchmark/find_tweet/nlohmann_json.h b/benchmark/find_tweet/nlohmann_json.h new file mode 100644 index 0000000000..0d7a7c8d71 --- /dev/null +++ b/benchmark/find_tweet/nlohmann_json.h @@ -0,0 +1,29 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "find_tweet.h" + +namespace find_tweet { + +struct nlohmann_json { + using StringType=std::string; + + bool 
run(simdjson::padded_string &json, uint64_t find_id, std::string &result) { + auto root = nlohmann::json::parse(json.data(), json.data() + json.size()); + for (auto tweet : root["statuses"]) { + if (tweet["id"] == find_id) { + result = to_string(tweet["text"]); + return true; + } + } + + return false; + } +}; + +BENCHMARK_TEMPLATE(find_tweet, nlohmann_json)->UseManualTime(); + +} // namespace find_tweet + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/find_tweet/nlohmann_json_sax.h b/benchmark/find_tweet/nlohmann_json_sax.h new file mode 100644 index 0000000000..0302648237 --- /dev/null +++ b/benchmark/find_tweet/nlohmann_json_sax.h @@ -0,0 +1,68 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "find_tweet.h" + +namespace find_tweet { + +using json = nlohmann::json; + +struct nlohmann_json_sax { + using StringType=std::string; + + struct Handler : json::json_sax_t + { + bool text_key = false; + bool id_key = false; + bool found_id = false; + uint64_t find_id; + std::string &result; + + Handler(std::string &r,uint64_t id): result(r), find_id(id) { } + + // We assume id is found before text + bool key(string_t& val) override { + if (found_id) { // If have found id, find text key + if (val.compare("text") == 0) { text_key = true; } + } + else if (val.compare("id") == 0) { id_key = true; } // Otherwise, find id key + return true; + } + bool number_unsigned(number_unsigned_t val) override { + if (id_key && (val == find_id)) { // If id key, check if id value matches find_id + found_id = true; + } + return true; + } + bool string(string_t& val) override { + if (text_key) { + result = val; + return false; // End parsing + } + return true; + } + // Irrelevant events + bool null() override { return true; } + bool boolean(bool val) override { return true; } + bool number_float(number_float_t val, const string_t& s) override { return true; } + bool number_integer(number_integer_t val) override { return 
true; } + bool start_object(std::size_t elements) override { return true; } + bool end_object() override { return true; } + bool start_array(std::size_t elements) override { return true; } + bool end_array() override { return true; } + bool binary(json::binary_t& val) override { return true; } + bool parse_error(std::size_t position, const std::string& last_token, const json::exception& ex) override { return false; } + }; // Handler + + bool run(simdjson::padded_string &json, uint64_t find_id, std::string &result) { + Handler handler(result,find_id); + json::sax_parse(json.data(), &handler); + + return true; + } +}; // nlohmann_json_sax +BENCHMARK_TEMPLATE(find_tweet, nlohmann_json_sax)->UseManualTime(); +} // namespace find_tweet + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/find_tweet/rapidjson.h b/benchmark/find_tweet/rapidjson.h new file mode 100644 index 0000000000..e63daf26f3 --- /dev/null +++ b/benchmark/find_tweet/rapidjson.h @@ -0,0 +1,53 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "find_tweet.h" + +namespace find_tweet { + +using namespace rapidjson; + +struct rapidjson_base { + using StringType=std::string_view; + + Document doc{}; + + bool run(Document &root, uint64_t find_id, std::string_view &result) { + if (root.HasParseError() || !root.IsObject()) { return false; } + auto statuses = root.FindMember("statuses"); + if (statuses == root.MemberEnd() || !statuses->value.IsArray()) { return false; } + for (auto &tweet : statuses->value.GetArray()) { + if (!tweet.IsObject()) { return false; } + auto id = tweet.FindMember("id"); + if (id == tweet.MemberEnd() || !id->value.IsUint64()) { return false; } + if (id->value.GetUint64() == find_id) { + auto text = tweet.FindMember("text"); + if (text == tweet.MemberEnd() || !text->value.IsString()) { return false; } + result = { text->value.GetString(), text->value.GetStringLength() }; + return true; + } + } + + return false; + } +}; + 
+struct rapidjson : rapidjson_base { + bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &result) { + return rapidjson_base::run(doc.Parse(json.data()), find_id, result); + } +}; +BENCHMARK_TEMPLATE(find_tweet, rapidjson)->UseManualTime(); + +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct rapidjson_insitu : rapidjson_base { + bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &result) { + return rapidjson_base::run(doc.ParseInsitu(json.data()), find_id, result); + } +}; +BENCHMARK_TEMPLATE(find_tweet, rapidjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace find_tweet + +#endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/find_tweet/rapidjson_sax.h b/benchmark/find_tweet/rapidjson_sax.h new file mode 100644 index 0000000000..24a8482f59 --- /dev/null +++ b/benchmark/find_tweet/rapidjson_sax.h @@ -0,0 +1,70 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "find_tweet.h" +#include + +namespace find_tweet { + +using namespace rapidjson; + +struct rapidjson_sax { +using StringType=std::string_view; + +struct Handler { + bool text_key = false; + bool id_key = false; + bool found_id = false; + uint64_t find_id; + std::string_view &result; + + Handler(std::string_view &r,uint64_t id): result(r), find_id(id) { } + + // We assume id is found before text + bool Key(const char* key, SizeType length, bool copy) { + if (found_id) { // If have found id, find text key + if ((length == 4) && (memcmp(key,"text",4) == 0)) { text_key = true; } + } + else if ((length == 2) && (memcmp(key,"id",2) == 0)) { id_key = true; } // Otherwise, find id key + return true; + } + bool Uint64(uint64_t i) { + if (id_key && (i == find_id)) { // If id key, check if id value matches find_id + found_id = true; + } + return true; + } + bool String(const char* str, SizeType length, bool copy) { + if (text_key) { + result = {str,length}; + return false; // End parsing + } + return 
true; + } + // Irrelevant events + bool Null() { return true; } + bool Bool(bool b) { return true; } + bool Double(double d) { return true; } + bool Int(int i) { return true; } + bool Int64(int64_t i) { return true; } + bool Uint(unsigned i) { return true; } + bool RawNumber(const char* str, SizeType length, bool copy) { return true; } + bool StartObject() { return true; } + bool EndObject(SizeType memberCount) { return true; } + bool StartArray() { return true; } + bool EndArray(SizeType elementCount) { return true; } + }; // handler + + bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &result) { + Reader reader; + Handler handler(result,find_id); + InsituStringStream ss(json.data()); + reader.Parse(ss,handler); + return true; + } +}; // rapidjson_sax +BENCHMARK_TEMPLATE(find_tweet, rapidjson_sax)->UseManualTime(); +} // namespace find_tweet + +#endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/find_tweet/sajson.h b/benchmark/find_tweet/sajson.h new file mode 100644 index 0000000000..5bfe9ea6a1 --- /dev/null +++ b/benchmark/find_tweet/sajson.h @@ -0,0 +1,68 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_SAJSON + +#include "find_tweet.h" + +namespace find_tweet { + +struct sajson { + using StringType=std::string_view; + + size_t ast_buffer_size{0}; + size_t *ast_buffer{nullptr}; + ~sajson() { free(ast_buffer); } + + simdjson_inline std::string_view get_string_view(const ::sajson::value &obj, std::string_view key) { + auto val = obj.get_value_of_key({key.data(), key.length()}); + if (val.get_type() != ::sajson::TYPE_STRING) { throw "field is not a string"; } + return { val.as_cstring(), val.get_string_length() }; + } + simdjson_inline uint64_t get_str_uint64(const ::sajson::value &obj, std::string_view key) { + // Since sajson only supports 53-bit numbers, and IDs in twitter.json can be > 53 bits, we read the corresponding id_str and parse that. 
+ auto val = obj.get_value_of_key({key.data(), key.length()}); + if (val.get_type() != ::sajson::TYPE_STRING) { throw "field not a string"; } + auto str = val.as_cstring(); + char *endptr; + uint64_t result = strtoull(str, &endptr, 10); + if (endptr != &str[val.get_string_length()]) { throw "field is a string, but not an integer string"; } + return result; + } + + bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &result) { + if (!ast_buffer) { + ast_buffer_size = json.size(); + ast_buffer = (size_t *)std::malloc(ast_buffer_size * sizeof(size_t)); + } + auto doc = ::sajson::parse( + ::sajson::bounded_allocation(ast_buffer, ast_buffer_size), + ::sajson::mutable_string_view(json.size(), json.data()) + ); + if (!doc.is_valid()) { return false; } + + auto root = doc.get_root(); + if (root.get_type() != ::sajson::TYPE_OBJECT) { printf("a\n"); return false; } + auto statuses = root.get_value_of_key({"statuses", strlen("statuses")}); + if (statuses.get_type() != ::sajson::TYPE_ARRAY) { return false; } + + for (size_t i=0; iUseManualTime(); + +} // namespace find_tweet + +#endif // SIMDJSON_COMPETITION_SAJSON + diff --git a/benchmark/find_tweet/simdjson_dom.h b/benchmark/find_tweet/simdjson_dom.h new file mode 100644 index 0000000000..0b32d8b2a6 --- /dev/null +++ b/benchmark/find_tweet/simdjson_dom.h @@ -0,0 +1,33 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "find_tweet.h" + +namespace find_tweet { + +using namespace simdjson; + +struct simdjson_dom { + using StringType=std::string_view; + + dom::parser parser{}; + + bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &result) { + result = ""; + auto doc = parser.parse(json); + for (auto tweet : doc["statuses"]) { + if (uint64_t(tweet["id"]) == find_id) { + result = tweet["text"]; + return true; + } + } + return false; + } +}; + +BENCHMARK_TEMPLATE(find_tweet, simdjson_dom)->UseManualTime(); + +} // namespace find_tweet + +#endif // SIMDJSON_EXCEPTIONS \ No newline 
at end of file diff --git a/benchmark/find_tweet/simdjson_ondemand.h b/benchmark/find_tweet/simdjson_ondemand.h new file mode 100644 index 0000000000..51595b8b93 --- /dev/null +++ b/benchmark/find_tweet/simdjson_ondemand.h @@ -0,0 +1,33 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "find_tweet.h" + +namespace find_tweet { + +using namespace simdjson; + +struct simdjson_ondemand { + using StringType=std::string_view; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &result) { + // Walk the document, parsing as we go + auto doc = parser.iterate(json); + for (auto tweet : doc.find_field("statuses")) { + if (uint64_t(tweet.find_field("id")) == find_id) { + result = tweet.find_field("text"); + return true; + } + } + return false; + } +}; + +BENCHMARK_TEMPLATE(find_tweet, simdjson_ondemand)->UseManualTime(); + +} // namespace find_tweet + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/find_tweet/yyjson.h b/benchmark/find_tweet/yyjson.h new file mode 100644 index 0000000000..8ce4251c7c --- /dev/null +++ b/benchmark/find_tweet/yyjson.h @@ -0,0 +1,53 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_YYJSON + +#include "find_tweet.h" + +namespace find_tweet { + +struct yyjson_base { + using StringType=std::string_view; + + bool run(yyjson_doc *doc, uint64_t find_id, std::string_view &result) { + if (!doc) { return false; } + yyjson_val *root = yyjson_doc_get_root(doc); + if (!yyjson_is_obj(root)) { return false; } + yyjson_val *statuses = yyjson_obj_get(root, "statuses"); + if (!yyjson_is_arr(statuses)) { return false; } + + // Walk the document, parsing the tweets as we go + size_t tweet_idx, tweets_max; + yyjson_val *tweet; + yyjson_arr_foreach(statuses, tweet_idx, tweets_max, tweet) { + if (!yyjson_is_obj(tweet)) { return false; } + auto id = yyjson_obj_get(tweet, "id"); + if (!yyjson_is_uint(id)) { return false; } + if (yyjson_get_uint(id) == find_id) { + auto text = yyjson_obj_get(tweet, 
"text"); + if (yyjson_is_str(id)) { return false; } + result = { yyjson_get_str(text), yyjson_get_len(text) }; + return true; + } + } + return false; + } +}; + +struct yyjson : yyjson_base { + bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &result) { + return yyjson_base::run(yyjson_read(json.data(), json.size(), 0), find_id, result); + } +}; +BENCHMARK_TEMPLATE(find_tweet, yyjson)->UseManualTime(); +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct yyjson_insitu : yyjson_base { + bool run(simdjson::padded_string &json, uint64_t find_id, std::string_view &result) { + return yyjson_base::run(yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0), find_id, result); + } +}; +BENCHMARK_TEMPLATE(find_tweet, yyjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace find_tweet + +#endif // SIMDJSON_COMPETITION_YYJSON diff --git a/benchmark/get_corpus_benchmark.cpp b/benchmark/get_corpus_benchmark.cpp index 855eaaac04..94ae46e2df 100644 --- a/benchmark/get_corpus_benchmark.cpp +++ b/benchmark/get_corpus_benchmark.cpp @@ -5,11 +5,11 @@ #include // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte -never_inline +simdjson_never_inline double bench(std::string filename, simdjson::padded_string& p) { std::chrono::time_point start_clock = std::chrono::steady_clock::now(); - simdjson::padded_string::load(filename).first.swap(p); + simdjson::padded_string::load(filename).value_unsafe().swap(p); std::chrono::time_point end_clock = std::chrono::steady_clock::now(); std::chrono::duration elapsed = end_clock - start_clock; @@ -29,7 +29,7 @@ int main(int argc, char *argv[]) { << std::endl; } simdjson::padded_string p; - bench(filename, p); + bench(filename, p); double meanval = 0; double maxval = 0; double minval = 10000; diff --git a/benchmark/json2msgpack/boostjson.h b/benchmark/json2msgpack/boostjson.h new file mode 100644 index 0000000000..b7dc7a7c8f --- /dev/null +++ b/benchmark/json2msgpack/boostjson.h 
@@ -0,0 +1,104 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_BOOSTJSON + +#include "json2msgpack.h" + +namespace json2msgpack { + +struct boostjson2msgpack { + inline std::string_view to_msgpack(const boost::json::value &root, uint8_t *buf) { + buff = buf; + recursive_processor(root); + return std::string_view(reinterpret_cast(buf), size_t(buff - buf)); + } + +private: + uint8_t *buff{}; + + inline void write_double(const double d) noexcept { + *buff++ = 0xcb; + ::memcpy(buff, &d, sizeof(d)); + buff += sizeof(d); + } + + inline void write_byte(const uint8_t b) noexcept { + *buff = b; + buff++; + } + + inline void write_uint32(const uint32_t w) noexcept { + ::memcpy(buff, &w, sizeof(w)); + buff += sizeof(w); + } + + inline void write_string(const std::string & str) { + write_byte(0xdb); + write_uint32(uint32_t(str.size())); + ::memcpy(buff, str.data(), str.size()); + buff += str.size(); + } + + inline void recursive_processor(const boost::json::value &element) { + switch(element.kind()) { + case boost::json::kind::array: { + write_byte(0xdd); + const auto &array = element.as_array(); + write_uint32(static_cast(array.size())); + for (const auto &child : array) { + recursive_processor(child); + } + } break; + + case boost::json::kind::object: { + write_byte(0xdf); + const auto &object = element.as_object(); + write_uint32(static_cast(object.size())); + for (const auto &child : object) { + write_string(child.key_c_str()); + recursive_processor(child.value()); + } + } break; + + case boost::json::kind::int64: + case boost::json::kind::uint64: + case boost::json::kind::double_: + write_double(element.to_number()); + break; + + case boost::json::kind::string: + write_string(element.as_string().c_str()); + break; + + case boost::json::kind::bool_: + write_byte(0xc2 + element.as_bool()); + break; + + case boost::json::kind::null: + write_byte(0xc0); + break; + + default: + printf("unexpected\n"); + break; + } + } +}; + +struct boostjson { + using StringType=std::string; + 
+ boostjson2msgpack parser{}; + + bool run(simdjson::padded_string &json, char *buffer, std::string_view &result) { + auto root = boost::json::parse(json); + result = parser.to_msgpack(root, reinterpret_cast(buffer)); + return true; + } +}; + +BENCHMARK_TEMPLATE(json2msgpack, boostjson)->UseManualTime(); + +} // namespace json2msgpack + +#endif // SIMDJSON_COMPETITION_BOOSTJSON diff --git a/benchmark/json2msgpack/json2msgpack.h b/benchmark/json2msgpack/json2msgpack.h new file mode 100644 index 0000000000..b6d9c723d0 --- /dev/null +++ b/benchmark/json2msgpack/json2msgpack.h @@ -0,0 +1,48 @@ +#pragma once + +#include "json_benchmark/file_runner.h" + +namespace json2msgpack { + +using namespace json_benchmark; + +template struct runner : public file_runner { + std::string_view result; + std::unique_ptr buffer; + + bool setup(benchmark::State &state) { + bool isok = this->load_json(state, TWITTER_JSON); + if (isok) { + // Let us allocate a sizeable buffer. + buffer = std::unique_ptr(new char[this->json.size() * 4 + 1024]); + } + return isok; + } + + bool before_run(benchmark::State &state) { + if (!file_runner::before_run(state)) { + return false; + } + // Clear the buffer. 
+ ::memset(buffer.get(), 0, this->json.size() * 4 + 1024); + return true; + } + + bool run(benchmark::State &) { + return this->implementation.run(this->json, buffer.get(), result); + } + + template + bool diff(benchmark::State &state, runner &reference) { + return diff_results(state, result.size(), reference.result.size(), diff_flags::NONE); + } +}; + +struct simdjson_ondemand; + +template +simdjson_inline static void json2msgpack(benchmark::State &state) { + run_json_benchmark, runner>(state); +} + +} // namespace json2msgpack diff --git a/benchmark/json2msgpack/nlohmann_json.h b/benchmark/json2msgpack/nlohmann_json.h new file mode 100644 index 0000000000..332d265577 --- /dev/null +++ b/benchmark/json2msgpack/nlohmann_json.h @@ -0,0 +1,117 @@ +#pragma once +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "json2msgpack.h" + +namespace json2msgpack { + +using namespace nlohmann; + +struct nlohmann_json2msgpack { + inline std::string_view to_msgpack(const simdjson::padded_string &json, + uint8_t *buf); + +private: + inline void write_double(const double d) noexcept; + inline void write_byte(const uint8_t b) noexcept; + inline void write_uint32(const uint32_t w) noexcept; + inline void write_string(const std::string& str); + inline void recursive_processor(basic_json<> element); + + uint8_t *buff{}; +}; + +std::string_view nlohmann_json2msgpack::to_msgpack(const simdjson::padded_string &json, + uint8_t *buf) { + buff = buf; + auto val = nlohmann::json::parse(json.data(), json.data() + json.size()); + recursive_processor(val); + return std::string_view(reinterpret_cast(buf), size_t(buff - buf)); +} + +void nlohmann_json2msgpack::write_double(const double d) noexcept { + *buff++ = 0xcb; + ::memcpy(buff, &d, sizeof(d)); + buff += sizeof(d); +} + +void nlohmann_json2msgpack::write_byte(const uint8_t b) noexcept { + *buff = b; + buff++; +} + +void nlohmann_json2msgpack::write_uint32(const uint32_t w) noexcept { + ::memcpy(buff, &w, sizeof(w)); + buff += sizeof(w); 
+} + +void nlohmann_json2msgpack::write_string(const std::string & str) { + write_byte(0xdb); + write_uint32(uint32_t(str.size())); + ::memcpy(buff, str.data(), str.size()); + buff += str.size(); +} + +void nlohmann_json2msgpack::recursive_processor(json element) { + switch (element.type()) { + case nlohmann::detail::value_t::array: { + uint32_t counter = 0; + write_byte(0xdd); + std::vector array = element.get>(); + write_uint32(uint32_t(array.size())); + for (auto child : array) { + recursive_processor(child); + } + } break; + case nlohmann::detail::value_t::object: { + write_byte(0xdf); + std::map object = element.get>(); + write_uint32(uint32_t(object.size())); + for (auto field : object) { + write_string(field.first); + recursive_processor(field.second); + } + } break; + + case nlohmann::detail::value_t::number_integer: + case nlohmann::detail::value_t::number_unsigned: + case nlohmann::detail::value_t::number_float: + write_double(double(element)); + break; + case nlohmann::detail::value_t::string: + write_string(std::string(element)); + break; + case nlohmann::detail::value_t::boolean: + write_byte(0xc2 + bool(element)); + break; + case nlohmann::detail::value_t::null: + write_byte(0xc0); + break; + case nlohmann::detail::value_t::discarded: + case nlohmann::detail::value_t::binary: + default: + printf("unexpected\n"); + break; + } +} + +struct nlohmann_json { + using StringType = std::string_view; + + nlohmann_json2msgpack parser{}; + + bool run(simdjson::padded_string &json, char *buffer, + std::string_view &result) { + result = parser.to_msgpack(json, reinterpret_cast(buffer)); + return true; + } +}; + +BENCHMARK_TEMPLATE(json2msgpack, nlohmann_json)->UseManualTime(); + +} // namespace json2msgpack + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON + + + diff --git a/benchmark/json2msgpack/rapidjson.h b/benchmark/json2msgpack/rapidjson.h new file mode 100644 index 0000000000..8a9f48bb30 --- /dev/null +++ b/benchmark/json2msgpack/rapidjson.h @@ -0,0 +1,142 @@ 
+ + +#pragma once +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "json2msgpack.h" + +namespace json2msgpack { + +using namespace rapidjson; + +template +struct rapidjson2msgpack { + inline std::string_view to_msgpack(char *json, uint8_t *buf); + +private: + inline void write_double(const double d) noexcept; + inline void write_byte(const uint8_t b) noexcept; + inline void write_uint32(const uint32_t w) noexcept; + inline void write_uint32_at(const uint32_t w, uint8_t *p) noexcept; + void write_string(const char * s, size_t length) noexcept; + inline void recursive_processor(Value &v); + + uint8_t *buff{}; +}; + +template +std::string_view rapidjson2msgpack::to_msgpack(char *json, uint8_t *buf) { + buff = buf; + Document doc{}; + if(parseflag & kParseInsituFlag) { + doc.ParseInsitu(json); + } else { + doc.Parse(json); + } + recursive_processor(doc); + return std::string_view(reinterpret_cast(buf), size_t(buff - buf)); +} + +template +void rapidjson2msgpack::write_double(const double d) noexcept { + *buff++ = 0xcb; + ::memcpy(buff, &d, sizeof(d)); + buff += sizeof(d); +} + +template +void rapidjson2msgpack::write_byte(const uint8_t b) noexcept { + *buff = b; + buff++; +} + +template +void rapidjson2msgpack::write_string(const char * c, size_t len) noexcept { + write_byte(0xdb); + write_uint32(uint32_t(len)); + ::memcpy(buff, c, len); + buff += len; +} + +template +void rapidjson2msgpack::write_uint32(const uint32_t w) noexcept { + ::memcpy(buff, &w, sizeof(w)); + buff += sizeof(w); +} + +template +void rapidjson2msgpack::write_uint32_at(const uint32_t w, uint8_t *p) noexcept { + ::memcpy(p, &w, sizeof(w)); +} + +template +void rapidjson2msgpack::recursive_processor(Value &v) { + switch (v.GetType()) { + case kArrayType: + write_byte(0xdd); + write_uint32(v.Size()); + for (Value::ValueIterator i = v.Begin(); i != v.End(); ++i) { + recursive_processor(*i); + } + break; + case kObjectType: + write_byte(0xdf); + write_uint32(uint32_t(v.MemberEnd()-v.MemberBegin())); 
+ for (Value::MemberIterator m = v.MemberBegin(); m != v.MemberEnd(); + ++m) { + write_string(m->name.GetString(), m->name.GetStringLength()); + recursive_processor(m->value); + } + break; + case kStringType: + write_string(v.GetString(), v.GetStringLength()); + break; + case kNumberType: + write_double(v.GetDouble()); + break; + case kFalseType: + write_byte(0xc2); + break; + case kTrueType: + write_byte(0xc3); + break; + case kNullType: + write_byte(0xc0); + break; + } +} + +template +struct rapidjson_base { + using StringType = std::string_view; + + rapidjson2msgpack parser{}; + + bool run(simdjson::padded_string &json, char *buffer, + std::string_view &result) { + result = + parser.to_msgpack(json.data(), reinterpret_cast(buffer)); + + return true; + } +}; + + +using rapidjson = rapidjson_base; + +BENCHMARK_TEMPLATE(json2msgpack, rapidjson)->UseManualTime(); + +#if SIMDJSON_COMPETITION_ONDEMAND_APPROX +using rapidjson_approx = rapidjson_base; + +BENCHMARK_TEMPLATE(json2msgpack, rapidjson_approx)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_APPROX + +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +using rapidjson_insitu = rapidjson_base; + +BENCHMARK_TEMPLATE(json2msgpack, rapidjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace json2msgpack + +#endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/json2msgpack/sajson.h b/benchmark/json2msgpack/sajson.h new file mode 100644 index 0000000000..7af88fc296 --- /dev/null +++ b/benchmark/json2msgpack/sajson.h @@ -0,0 +1,131 @@ +#pragma once +#ifdef SIMDJSON_COMPETITION_SAJSON + +#include "json2msgpack.h" + +namespace json2msgpack { + +using namespace sajson; + + +struct sajson2msgpack { + inline std::string_view to_msgpack(char *json, size_t size, uint8_t *buf); + virtual ~sajson2msgpack() { free(ast_buffer); } + +private: + inline void write_double(const double d) noexcept; + inline void write_byte(const uint8_t b) noexcept; + inline void write_uint32(const 
uint32_t w) noexcept; + inline void write_string(const char * s, size_t length) noexcept; + inline void recursive_processor(const sajson::value &v); + + uint8_t *buff{}; + size_t ast_buffer_size{0}; + size_t *ast_buffer{nullptr}; +}; + + +std::string_view sajson2msgpack::to_msgpack(char *json, size_t size, uint8_t *buf) { + buff = buf; + + if (!ast_buffer) { + ast_buffer_size = size; + ast_buffer = (size_t *)std::malloc(ast_buffer_size * sizeof(size_t)); + } + auto doc = parse( + bounded_allocation(ast_buffer, ast_buffer_size), + mutable_string_view(size, json) + ); + + auto root = doc.get_root(); + recursive_processor(root); + return std::string_view(reinterpret_cast(buf), size_t(buff - buf)); +} + +void sajson2msgpack::write_string(const char * c, size_t len) noexcept { + write_byte(0xdb); + write_uint32(uint32_t(len)); + ::memcpy(buff, c, len); + buff += len; +} + +void sajson2msgpack::write_double(const double d) noexcept { + *buff++ = 0xcb; + ::memcpy(buff, &d, sizeof(d)); + buff += sizeof(d); +} + +void sajson2msgpack::write_byte(const uint8_t b) noexcept { + *buff = b; + buff++; +} + +void sajson2msgpack::write_uint32(const uint32_t w) noexcept { + ::memcpy(buff, &w, sizeof(w)); + buff += sizeof(w); +} + +void sajson2msgpack::recursive_processor(const sajson::value &node) { + using namespace sajson; + switch (node.get_type()) { + case TYPE_NULL: + write_byte(0xc0); + break; + case TYPE_FALSE: + write_byte(0xc2); + break; + case TYPE_TRUE: + write_byte(0xc3); + break; + case TYPE_ARRAY: { + auto length = node.get_length(); + write_byte(0xdf); + write_uint32(uint32_t(length)); + for (size_t i = 0; i < length; ++i) { + recursive_processor(node.get_array_element(i)); + } + break; + } + case TYPE_OBJECT: { + auto length = node.get_length(); + write_byte(0xdd); + write_uint32(uint32_t(length)); + for (auto i = 0u; i < length; ++i) { + auto s = node.get_object_key(i); + write_string(s.data(), s.length()); + recursive_processor(node.get_object_value(i)); + } + 
break; + } + case TYPE_STRING: + write_string(node.as_cstring(), node.get_string_length()); + break; + case TYPE_DOUBLE: + case TYPE_INTEGER: + write_double(node.get_number_value()); + break; + default: + assert(false && "unknown node type"); + } +} + + +struct sajson { + using StringType = std::string_view; + + sajson2msgpack parser{}; + + bool run(simdjson::padded_string &json, char *buffer, + std::string_view &result) { + result = + parser.to_msgpack(json.data(), json.size(), reinterpret_cast(buffer)); + + return true; + } +}; + +BENCHMARK_TEMPLATE(json2msgpack, sajson)->UseManualTime(); + +} // namespace json2msgpack + +#endif // SIMDJSON_COMPETITION_SAJSON diff --git a/benchmark/json2msgpack/simdjson_dom.h b/benchmark/json2msgpack/simdjson_dom.h new file mode 100644 index 0000000000..1eb211260d --- /dev/null +++ b/benchmark/json2msgpack/simdjson_dom.h @@ -0,0 +1,138 @@ +#pragma once +#if SIMDJSON_EXCEPTIONS + + +#include "json2msgpack.h" + +namespace json2msgpack { + +using namespace simdjson; + +struct simdjsondom2msgpack { + /** + * @brief Converts the provided JSON into msgpack. 
+ * + * @param json JSON input + * @param buf temporary buffer (must be large enough, with simdjson::SIMDJSON_PADDING bytes + * of padding) + * @return std::string_view msgpack output, writing to the temporary buffer + */ + inline std::string_view to_msgpack(const simdjson::padded_string &json, + uint8_t *buf); + +private: + simdjson_really_inline void write_double(const double d) noexcept; + simdjson_really_inline void write_string(const std::string_view v) noexcept; + simdjson_really_inline void write_byte(const uint8_t b) noexcept; + simdjson_really_inline void write_uint32(const uint32_t w) noexcept; + simdjson_really_inline uint8_t *skip_uint32() noexcept; + simdjson_really_inline void write_uint32_at(const uint32_t w, + uint8_t *p) noexcept; + + inline void recursive_processor(simdjson::dom::element element); + + dom::parser parser; + uint8_t *buff{}; +}; + +std::string_view +simdjsondom2msgpack::to_msgpack(const simdjson::padded_string &json, + uint8_t *buf) { + buff = buf; + + recursive_processor(parser.parse(json)); + return std::string_view(reinterpret_cast(buf), size_t(buff - buf)); +} +void simdjsondom2msgpack::write_string(const std::string_view v) noexcept { + write_byte(0xdb); + write_uint32(uint32_t(v.size())); + ::memcpy(buff, v.data(), v.size()); + buff += v.size(); +} +void simdjsondom2msgpack::write_double(const double d) noexcept { + *buff++ = 0xcb; + ::memcpy(buff, &d, sizeof(d)); + buff += sizeof(d); +} + +void simdjsondom2msgpack::write_byte(const uint8_t b) noexcept { + *buff = b; + buff++; +} + +void simdjsondom2msgpack::write_uint32(const uint32_t w) noexcept { + ::memcpy(buff, &w, sizeof(w)); + buff += sizeof(w); +} + +uint8_t *simdjsondom2msgpack::skip_uint32() noexcept { + uint8_t *ret = buff; + buff += sizeof(uint32_t); + return ret; +} + +void simdjsondom2msgpack::write_uint32_at(const uint32_t w, uint8_t *p) noexcept { + ::memcpy(p, &w, sizeof(w)); +} + + +void simdjsondom2msgpack::recursive_processor(simdjson::dom::element element) 
{ + switch (element.type()) { + case dom::element_type::ARRAY: { + uint32_t counter = 0; + write_byte(0xdd); + uint8_t *location = skip_uint32(); + for (auto child : dom::array(element)) { + counter++; + recursive_processor(child); + } + write_uint32_at(counter, location);} + break; + case dom::element_type::OBJECT:{ + uint32_t counter = 0; + write_byte(0xdf); + uint8_t *location = skip_uint32(); + for (dom::key_value_pair field : dom::object(element)) { + counter++; + write_string(field.key); + recursive_processor(field.value); + } + write_uint32_at(counter, location); + } + break; + case dom::element_type::INT64: + case dom::element_type::UINT64: + case dom::element_type::DOUBLE: + write_double( double(element)); + break; + case dom::element_type::STRING: + write_string(std::string_view(element)); + break; + case dom::element_type::BOOL: + write_byte(0xc2 + bool(element)); + break; + case dom::element_type::NULL_VALUE: + write_byte(0xc0); + break; + default: + break; + } +} + +struct simdjson_dom { + using StringType = std::string_view; + + simdjsondom2msgpack parser{}; + + bool run(simdjson::padded_string &json, char *buffer, + std::string_view &result) { + result = parser.to_msgpack(json, reinterpret_cast(buffer)); + return true; + } +}; + +BENCHMARK_TEMPLATE(json2msgpack, simdjson_dom)->UseManualTime(); + +} // namespace json2msgpack + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/json2msgpack/simdjson_ondemand.h b/benchmark/json2msgpack/simdjson_ondemand.h new file mode 100644 index 0000000000..270eb499a0 --- /dev/null +++ b/benchmark/json2msgpack/simdjson_ondemand.h @@ -0,0 +1,250 @@ +#pragma once +#if SIMDJSON_EXCEPTIONS + +#include "json2msgpack.h" + +namespace json2msgpack { + +using namespace simdjson; + +/** + * @brief The simdjson2msgpack struct is used to quickly convert + * JSON strings to msgpack views. You must provide a pointer to + * a large memory region where the msgpack gets written. 
The + * buffer should be large enough to store the msgpack output (which + * can never be 3x larger than the input JSON) with an additional + * simdjson::SIMDJSON_PADDING bytes. + * + * Recommended usage: + * + * simdjson2msgpack parser{}; + * simdjson::padded_string json = "[1,2]"_padded; // some JSON + * uint8_t * buffer = new uint8_t[3*json.size() + simdjson::SIMDJSON_PADDING]; // large buffer + * + * std::string_view msgpack = parser.to_msgpack(json, buffer); + * + * The result (msgpack) is a string view to a msgpack serialization of the input JSON, + * it points inside the buffer you provided. + * + * You may reuse the simdjson2msgpack instance though you should use + * one per thread. + */ +struct simdjson2msgpack { + /** + * @brief Converts the provided JSON into msgpack. + * + * @param json JSON input + * @param buf temporary buffer (must be large enough, with simdjson::SIMDJSON_PADDING bytes + * of padding) + * @return std::string_view msgpack output, writing to the temporary buffer + */ + inline std::string_view to_msgpack(const simdjson::padded_string &json, + uint8_t *buf); + +private: + simdjson_inline void write_double(const double d) noexcept; + simdjson_inline void write_byte(const uint8_t b) noexcept; + simdjson_inline void write_uint32(const uint32_t w) noexcept; + simdjson_inline uint8_t *skip_uint32() noexcept; + simdjson_inline void write_uint32_at(const uint32_t w, + uint8_t *p) noexcept; + simdjson_inline void + write_raw_string(simdjson::ondemand::raw_json_string rjs); + inline void recursive_processor(simdjson::ondemand::value element); + inline void recursive_processor_ref(simdjson::ondemand::value& element); + + simdjson::ondemand::parser parser; + uint8_t *buff{}; +}; + +std::string_view +simdjson2msgpack::to_msgpack(const simdjson::padded_string &json, + uint8_t *buf) { + buff = buf; + ondemand::document doc = parser.iterate(json); + if (doc.is_scalar()) { + // we have a special case where the JSON document is a single document... 
+ switch (doc.type()) { + case simdjson::ondemand::json_type::number: + write_double(doc.get_double()); + break; + case simdjson::ondemand::json_type::string: + write_raw_string(doc.get_raw_json_string()); + break; + case simdjson::ondemand::json_type::boolean: + write_byte(0xc2 + doc.get_bool()); + break; + case simdjson::ondemand::json_type::null: + // We check that the value is indeed null + // otherwise: an error is thrown. + if(doc.is_null()) { + write_byte(0xc0); + } + break; + case simdjson::ondemand::json_type::array: + case simdjson::ondemand::json_type::object: + default: + // impossible + SIMDJSON_UNREACHABLE(); + } + } else { + simdjson::ondemand::value val = doc; +#define SIMDJSON_GCC_COMPILER ((__GNUC__) && !(__clang__) && !(__INTEL_COMPILER)) +#if SIMDJSON_GCC_COMPILER + // the GCC compiler does well with by-value passing. + // GCC has superior recursive inlining: + // https://stackoverflow.com/questions/29186186/why-does-gcc-generate-a-faster-program-than-clang-in-this-recursive-fibonacci-co + // https://godbolt.org/z/TeK4doE51 + recursive_processor(val); +#else + recursive_processor_ref(val); +#endif + } + if (!doc.at_end()) { + throw "There are unexpectedly tokens after the end of the json in the json2msgpack sample data"; + } + return std::string_view(reinterpret_cast(buf), size_t(buff - buf)); +} + +void simdjson2msgpack::write_double(const double d) noexcept { + *buff++ = 0xcb; + ::memcpy(buff, &d, sizeof(d)); + buff += sizeof(d); +} + +void simdjson2msgpack::write_byte(const uint8_t b) noexcept { + *buff = b; + buff++; +} + +void simdjson2msgpack::write_uint32(const uint32_t w) noexcept { + ::memcpy(buff, &w, sizeof(w)); + buff += sizeof(w); +} + +uint8_t *simdjson2msgpack::skip_uint32() noexcept { + uint8_t *ret = buff; + buff += sizeof(uint32_t); + return ret; +} + +void simdjson2msgpack::write_uint32_at(const uint32_t w, uint8_t *p) noexcept { + ::memcpy(p, &w, sizeof(w)); +} + +void simdjson2msgpack::write_raw_string( + 
simdjson::ondemand::raw_json_string in) { + write_byte(0xdb); + uint8_t *location = skip_uint32(); + std::string_view v = parser.unescape(in, buff); + write_uint32_at(uint32_t(v.size()), location); +} + +void simdjson2msgpack::recursive_processor(simdjson::ondemand::value element) { + switch (element.type()) { + case simdjson::ondemand::json_type::array: { + uint32_t counter = 0; + write_byte(0xdd); + uint8_t *location = skip_uint32(); + for (auto child : element.get_array()) { + counter++; + recursive_processor(child.value()); + } + write_uint32_at(counter, location); + } break; + case simdjson::ondemand::json_type::object: { + uint32_t counter = 0; + write_byte(0xdf); + uint8_t *location = skip_uint32(); + for (auto field : element.get_object()) { + counter++; + write_raw_string(field.key()); + recursive_processor(field.value()); + } + write_uint32_at(counter, location); + } break; + case simdjson::ondemand::json_type::number: + write_double(element.get_double()); + break; + case simdjson::ondemand::json_type::string: + write_raw_string(element.get_raw_json_string()); + break; + case simdjson::ondemand::json_type::boolean: + write_byte(0xc2 + element.get_bool()); + break; + case simdjson::ondemand::json_type::null: + // We check that the value is indeed null + // otherwise: an error is thrown. 
+ if(element.is_null()) { + write_byte(0xc0); + } + break; + default: + SIMDJSON_UNREACHABLE(); + } +} + + +void simdjson2msgpack::recursive_processor_ref(simdjson::ondemand::value& element) { + switch (element.type()) { + case simdjson::ondemand::json_type::array: { + uint32_t counter = 0; + write_byte(0xdd); + uint8_t *location = skip_uint32(); + for (auto child : element.get_array()) { + counter++; + simdjson::ondemand::value v = child.value(); + recursive_processor_ref(v); + } + write_uint32_at(counter, location); + } break; + case simdjson::ondemand::json_type::object: { + uint32_t counter = 0; + write_byte(0xdf); + uint8_t *location = skip_uint32(); + for (auto field : element.get_object()) { + counter++; + write_raw_string(field.key()); + simdjson::ondemand::value v = field.value(); + recursive_processor_ref(v); + } + write_uint32_at(counter, location); + } break; + case simdjson::ondemand::json_type::number: + write_double(element.get_double()); + break; + case simdjson::ondemand::json_type::string: + write_raw_string(element.get_raw_json_string()); + break; + case simdjson::ondemand::json_type::boolean: + write_byte(0xc2 + element.get_bool()); + break; + case simdjson::ondemand::json_type::null: + // We check that the value is indeed null + // otherwise: an error is thrown. 
+ if(element.is_null()) { + write_byte(0xc0); + } + break; + default: + SIMDJSON_UNREACHABLE(); + } +} + +struct simdjson_ondemand { + using StringType = std::string_view; + + simdjson2msgpack parser{}; + + bool run(simdjson::padded_string &json, char *buffer, + std::string_view &result) { + result = parser.to_msgpack(json, reinterpret_cast(buffer)); + return true; + } +}; + +BENCHMARK_TEMPLATE(json2msgpack, simdjson_ondemand)->UseManualTime(); + +} // namespace json2msgpack + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/json2msgpack/yyjson.h b/benchmark/json2msgpack/yyjson.h new file mode 100644 index 0000000000..6f95c97317 --- /dev/null +++ b/benchmark/json2msgpack/yyjson.h @@ -0,0 +1,123 @@ +#pragma once +#ifdef SIMDJSON_COMPETITION_YYJSON + +#include "json2msgpack.h" + +namespace json2msgpack { + +struct yyjson2msgpack { + inline std::string_view to_msgpack(yyjson_doc *doc, uint8_t *buf); + +private: + inline void write_double(const double d) noexcept; + inline void write_byte(const uint8_t b) noexcept; + inline void write_uint32(const uint32_t w) noexcept; + inline void write_string(const char *s, size_t length) noexcept; + inline void recursive_processor(yyjson_val *obj); + + uint8_t *buff{}; +}; + +std::string_view yyjson2msgpack::to_msgpack(yyjson_doc *doc, uint8_t *buf) { + buff = buf; + yyjson_val *root = yyjson_doc_get_root(doc); + recursive_processor(root); + return std::string_view(reinterpret_cast(buf), size_t(buff - buf)); +} + +void yyjson2msgpack::write_string(const char *c, size_t len) noexcept { + write_byte(0xdb); + write_uint32(uint32_t(len)); + ::memcpy(buff, c, len); + buff += len; +} + +void yyjson2msgpack::write_double(const double d) noexcept { + *buff++ = 0xcb; + ::memcpy(buff, &d, sizeof(d)); + buff += sizeof(d); +} + +void yyjson2msgpack::write_byte(const uint8_t b) noexcept { + *buff = b; + buff++; +} + +void yyjson2msgpack::write_uint32(const uint32_t w) noexcept { + ::memcpy(buff, &w, sizeof(w)); + buff += sizeof(w); +} + 
+void yyjson2msgpack::recursive_processor(yyjson_val *obj) { + size_t idx, max; + yyjson_val *val; + yyjson_val *key; + switch (yyjson_get_type(obj)) { + case YYJSON_TYPE_STR: + write_string(yyjson_get_str(obj), yyjson_get_len(obj)); + break; + case YYJSON_TYPE_ARR: + write_byte(0xdf); + write_uint32(uint32_t(yyjson_arr_size(obj))); + yyjson_arr_foreach(obj, idx, max, val) { recursive_processor(val); } + break; + case YYJSON_TYPE_OBJ: + write_byte(0xdd); + write_uint32(uint32_t(yyjson_obj_size(obj))); + yyjson_obj_foreach(obj, idx, max, key, val) { + write_string(yyjson_get_str(key), yyjson_get_len(key)); + recursive_processor(val); + } + break; + case YYJSON_TYPE_BOOL: + write_byte(0xc2 + yyjson_get_bool(obj)); + break; + case YYJSON_TYPE_NULL: + write_byte(0xc0); + break; + case YYJSON_TYPE_NUM: + switch (yyjson_get_subtype(obj)) { + case YYJSON_SUBTYPE_UINT: + write_double(double(yyjson_get_uint(obj))); + break; + case YYJSON_SUBTYPE_SINT: + write_double(double(yyjson_get_sint(obj))); + break; + case YYJSON_SUBTYPE_REAL: + write_double(yyjson_get_real(obj)); + break; + default: + SIMDJSON_UNREACHABLE(); + } + break; + default: + SIMDJSON_UNREACHABLE(); + } +} + +struct yyjson : yyjson2msgpack { + bool run(simdjson::padded_string &json, char *buffer, + std::string_view &result) { + yyjson_doc *doc = yyjson_read(json.data(), json.size(), 0); + result = to_msgpack(doc, reinterpret_cast(buffer)); + return true; + } +}; + +BENCHMARK_TEMPLATE(json2msgpack, yyjson)->UseManualTime(); + +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct yyjson_insitu : yyjson2msgpack { + bool run(simdjson::padded_string &json, char *buffer, + std::string_view &result) { + yyjson_doc *doc = + yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0); + result = to_msgpack(doc, reinterpret_cast(buffer)); + return true; + } +}; +BENCHMARK_TEMPLATE(json2msgpack, yyjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace json2msgpack + +#endif 
// SIMDJSON_COMPETITION_YYJSON \ No newline at end of file diff --git a/benchmark/json_benchmark/constants.h b/benchmark/json_benchmark/constants.h new file mode 100644 index 0000000000..f87f8b734a --- /dev/null +++ b/benchmark/json_benchmark/constants.h @@ -0,0 +1,9 @@ +#pragma once + +namespace json_benchmark { + +static constexpr const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json"; +static constexpr const char *NUMBERS_JSON = SIMDJSON_BENCHMARK_DATA_DIR "numbers.json"; +static constexpr const char *AMAZON_CELLPHONES_NDJSON = SIMDJSON_BENCHMARK_DATA_DIR "amazon_cellphones.ndjson"; + +} diff --git a/benchmark/json_benchmark/diff_results.h b/benchmark/json_benchmark/diff_results.h new file mode 100644 index 0000000000..a4ab732de5 --- /dev/null +++ b/benchmark/json_benchmark/diff_results.h @@ -0,0 +1,143 @@ +#pragma once + +#include +#include +#include +#include + +namespace json_benchmark { + +enum class diff_flags { + NONE = 0, + IMPRECISE_FLOATS = 1 +}; + +template +static bool diff_results(benchmark::State &state, const T &result, const U &reference, diff_flags flags); + +template +struct result_differ { + static bool diff(benchmark::State &state, const T &result, const U &reference, diff_flags flags) { + if (result != reference) { + std::stringstream str; + str << "result incorrect: " << result << " ... 
reference: " << reference; + state.SkipWithError(str.str().data()); + return false; + } + return true; + } +}; + +template +struct result_differ, std::vector> { + static bool diff(benchmark::State &state, const std::vector &result, const std::vector &reference, diff_flags flags) { + auto result_iter = result.begin(); + auto reference_iter = reference.begin(); + while (result_iter != result.end() && reference_iter != reference.end()) { + if (!diff_results(state, *result_iter, *reference_iter, flags)) { return false; } + result_iter++; + reference_iter++; + } + if (result_iter != result.end()) { + std::stringstream str; + str << "extra results (got " << result.size() << ", expected " << reference.size() << "): first extra element: " << *result_iter; + state.SkipWithError(str.str().data()); + return false; + } else if (reference_iter != reference.end()) { + std::stringstream str; + str << "missing results (got " << result.size() << ", expected " << reference.size() << "): first missing element: " << *reference_iter; + state.SkipWithError(str.str().data()); + return false; + } + return true; + } +}; + +template +struct result_differ, std::map> { + static bool diff(benchmark::State &state, const std::map &result, const std::map &reference, diff_flags flags) { + auto result_iter = result.begin(); + auto reference_iter = reference.begin(); + while (result_iter != result.end() && reference_iter != reference.end()) { + if (!diff_results(state, *result_iter, *reference_iter, flags)) { return false; } + result_iter++; + reference_iter++; + } + if (result_iter != result.end()) { + std::stringstream str; + str << "extra results (got " << result.size() << ", expected " << reference.size() << "): first extra element: " << *result_iter; + state.SkipWithError(str.str().data()); + return false; + } else if (reference_iter != reference.end()) { + std::stringstream str; + str << "missing results (got " << result.size() << ", expected " << reference.size() << "): first missing element: 
" << *reference_iter; + state.SkipWithError(str.str().data()); + return false; + } + return true; + } +}; + +template +struct result_differ, std::vector> { + static bool diff(benchmark::State &state, const std::map &result, const std::map &reference, diff_flags flags) { + auto result_iter = result.begin(); + auto reference_iter = reference.begin(); + while (result_iter != result.end() && reference_iter != reference.end()) { + if (!diff_results(state, *result_iter, *reference_iter, flags)) { return false; } + result_iter++; + reference_iter++; + } + if (result_iter != result.end()) { + std::stringstream str; + str << "extra results (got " << result.size() << ", expected " << reference.size() << "): first extra element: " << *result_iter; + state.SkipWithError(str.str().data()); + return false; + } else if (reference_iter != reference.end()) { + std::stringstream str; + str << "missing results (got " << result.size() << ", expected " << reference.size() << "): first missing element: " << *reference_iter; + state.SkipWithError(str.str().data()); + return false; + } + return true; + } +}; + +template<> +struct result_differ { + static bool diff(benchmark::State &state, const double &result, const double &reference, diff_flags flags) { + bool different; + if (int(flags) & int(diff_flags::IMPRECISE_FLOATS)) { + different = f64_ulp_dist(result, reference) > 1; + } else { + different = result != reference; + } + if (different) { + std::stringstream str; + // We print it out using full precision. 
+ constexpr auto precision = std::numeric_limits::max_digits10; + str << std::setprecision(precision); + str << "incorrect double result: " << std::endl; + str << " result: " << std::left << std::setw(precision+2) << result << " (hexfloat " << std::hexfloat << result << ")" << std::defaultfloat << std::endl; + str << "reference: " << std::left << std::setw(precision+2) << reference << " (hexfloat " << std::hexfloat << reference << ")" << std::defaultfloat << std::endl; + state.SkipWithError(str.str().data()); + } + return true; + } + + static uint64_t f64_ulp_dist(double a, double b) { + uint64_t ua, ub; + std::memcpy(&ua, &a, sizeof(ua)); + std::memcpy(&ub, &b, sizeof(ub)); + if ((int64_t)(ub ^ ua) >= 0) + return (int64_t)(ua - ub) >= 0 ? (ua - ub) : (ub - ua); + return ua + ub + 0x80000000; + } +}; + +template +static bool diff_results(benchmark::State &state, const T &result, const U &reference, diff_flags flags) { + return result_differ::diff(state, result, reference, flags); +} + +} // namespace json_benchmark diff --git a/benchmark/json_benchmark/file_runner.h b/benchmark/json_benchmark/file_runner.h new file mode 100644 index 0000000000..0dddc067c4 --- /dev/null +++ b/benchmark/json_benchmark/file_runner.h @@ -0,0 +1,48 @@ +#pragma once + +#include "json_benchmark/runner_base.h" +#include "simdjson.h" + +namespace json_benchmark { + +template +struct file_runner : public runner_base { + simdjson::padded_string original_json{}; + simdjson::padded_string json{}; + + simdjson_warn_unused bool load_json(benchmark::State &state, const char *file) { + simdjson::error_code error; + if ((error = simdjson::padded_string::load(file).get(original_json))) { + std::stringstream err; + err << "error loading " << file << ": " << error; + state.SkipWithError(err.str().data()); + return false; + } + json = simdjson::padded_string(original_json.data(), original_json.size()); + return true; + } + + simdjson_warn_unused bool before_run(benchmark::State &state) { + if 
(!runner_base::after_run(state)) { return false; }; + // Copy the original JSON in case we did *in situ* last time + std::memcpy(json.data(), original_json.data(), original_json.size()); + return true; + } + + /** Get the total number of bytes processed in each iteration. Used for metrics like bytes/second. */ + size_t bytes_per_iteration() { + return json.size(); + } + + /** Get the total number of documents processed in each iteration. Used for metrics like documents/second. */ + size_t documents_per_iteration() { + return 1; + } + + /** Get the total number of items processed in each iteration. Used for metrics like items/second. */ + size_t items_per_iteration() { + return 1; + } +}; + +} // namespace json_benchmark diff --git a/benchmark/json_benchmark/point.h b/benchmark/json_benchmark/point.h new file mode 100644 index 0000000000..579c69495e --- /dev/null +++ b/benchmark/json_benchmark/point.h @@ -0,0 +1,26 @@ +#pragma once + +#include "diff_results.h" + +namespace json_benchmark { + +struct point { + double x; + double y; + double z; +}; + +template<> +struct result_differ { + static bool diff(benchmark::State &state, const point &result, const point &reference, diff_flags flags) { + return diff_results(state, result.x, reference.x, flags) + && diff_results(state, result.y, reference.y, flags) + && diff_results(state, result.z, reference.z, flags); + } +}; + +static simdjson_unused std::ostream &operator<<(std::ostream &o, const point &p) { + return o << p.x << "," << p.y << "," << p.z << std::endl; +} + +} // namespace json_benchmark diff --git a/benchmark/json_benchmark/run_json_benchmark.h b/benchmark/json_benchmark/run_json_benchmark.h new file mode 100644 index 0000000000..29917697c4 --- /dev/null +++ b/benchmark/json_benchmark/run_json_benchmark.h @@ -0,0 +1,112 @@ +#pragma once + +#include "simdjson.h" +#include "event_counter.h" +#include + +namespace json_benchmark { + +void maybe_display_implementation() { + static bool displayed_implementation = 
false; + if(!displayed_implementation) { + displayed_implementation = true; + std::cout << "simdjson::dom implementation: " << simdjson::get_active_implementation()->name() << std::endl; + std::cout << "simdjson::ondemand implementation (stage 1): " << simdjson::get_active_implementation()->name() << std::endl; + std::cout << "simdjson::ondemand implementation (stage 2): " << simdjson::builtin_implementation()->name() << std::endl; + } +} + +template static void run_json_benchmark(benchmark::State &state) { + maybe_display_implementation(); + + event_collector collector; + event_aggregate events; + + // Warmup and equality check (make sure the data is right!) + B bench; + if (!bench.setup(state)) { return; } + if (!bench.before_run(state)) { state.SkipWithError("warmup document before_run failed"); return; } + if (!bench.run(state)) { state.SkipWithError("warmup document reading failed"); return; } + if (!bench.after_run(state)) { state.SkipWithError("warmup document after_run failed"); return; } + { + R reference; + if (!reference.setup(state)) { return; } + if (!reference.before_run(state)) { state.SkipWithError("reference before_run failed"); }; + if (!reference.run(state)) { state.SkipWithError("reference document reading failed"); return; } + if (!reference.after_run(state)) { state.SkipWithError("reference before_run failed"); }; + if (!bench.diff(state, reference)) { return; } + } + + // Run the benchmark + for (simdjson_unused auto _ : state) { + if (!bench.before_run(state)) { state.SkipWithError("before_run failed"); }; + collector.start(); + if (!bench.run(state)) { state.SkipWithError("run failed"); return; } + auto event = collector.end(); + events << event; + state.SetIterationTime(event.elapsed_sec()); + if (!bench.after_run(state)) { state.SkipWithError("after_run failed"); return; }; + } + + state.SetBytesProcessed(bench.bytes_per_iteration() * state.iterations()); + state.SetItemsProcessed(bench.items_per_iteration() * state.iterations()); + 
state.counters["best_docs_per_sec"] = benchmark::Counter(double(bench.documents_per_iteration()) / events.best.elapsed_sec()); + state.counters["best_bytes_per_sec"] = benchmark::Counter(double(bench.bytes_per_iteration()) / events.best.elapsed_sec()); + state.counters["best_items_per_sec"] = benchmark::Counter(double(bench.items_per_iteration()) / events.best.elapsed_sec()); + state.counters["docs_per_sec"] = benchmark::Counter(double(bench.documents_per_iteration()), benchmark::Counter::kIsIterationInvariantRate); + + if (collector.has_events()) { + state.counters["instructions"] = events.instructions(); + state.counters["cycles"] = events.cycles(); +#if !SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + state.counters["branch_miss"] = events.branch_misses(); + state.counters["cache_miss"] = events.cache_misses(); + state.counters["cache_ref"] = events.cache_references(); +#endif + state.counters["instructions_per_byte"] = events.instructions() / double(bench.bytes_per_iteration()); + state.counters["instructions_per_cycle"] = events.instructions() / events.cycles(); + state.counters["cycles_per_byte"] = events.cycles() / double(bench.bytes_per_iteration()); + state.counters["frequency"] = benchmark::Counter(events.cycles(), benchmark::Counter::kIsIterationInvariantRate); + + state.counters["best_instructions"] = events.best.instructions(); + state.counters["best_cycles"] = events.best.cycles(); +#if !SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + state.counters["best_branch_miss"] = events.best.branch_misses(); + state.counters["best_cache_miss"] = events.best.cache_misses(); + state.counters["best_cache_ref"] = events.best.cache_references(); +#endif + + state.counters["best_instructions_per_byte"] = events.best.instructions() / double(bench.bytes_per_iteration()); + state.counters["best_instructions_per_cycle"] = events.best.instructions() / events.best.cycles(); + state.counters["best_cycles_per_byte"] = events.best.cycles() / double(bench.bytes_per_iteration()); + 
state.counters["best_frequency"] = events.best.cycles() / events.best.elapsed_sec(); + } + state.counters["bytes"] = benchmark::Counter(double(bench.bytes_per_iteration())); + state.counters["items"] = benchmark::Counter(double(bench.items_per_iteration())); + + // Build the label + using namespace std; + stringstream label; + label << fixed << setprecision(2); + label << "[BEST:"; + label << " throughput=" << setw(6) << (double(bench.bytes_per_iteration()) / 1000000000.0 / events.best.elapsed_sec()) << " GB/s"; + label << " doc_throughput=" << setw(6) << uint64_t(bench.documents_per_iteration() / events.best.elapsed_sec()) << " docs/s"; + + if (collector.has_events()) { + label << " instructions=" << setw(12) << uint64_t(events.best.instructions()) << setw(0); + label << " cycles=" << setw(12) << uint64_t(events.best.cycles()) << setw(0); +#if !SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS + label << " branch_miss=" << setw(8) << uint64_t(events.best.branch_misses()) << setw(0); + label << " cache_miss=" << setw(8) << uint64_t(events.best.cache_misses()) << setw(0); + label << " cache_ref=" << setw(10) << uint64_t(events.best.cache_references()) << setw(0); +#endif + } + + label << " items=" << setw(10) << bench.items_per_iteration() << setw(0); + label << " avg_time=" << setw(10) << uint64_t(events.elapsed_ns()) << setw(0) << " ns"; + label << "]"; + + state.SetLabel(label.str()); +} + +} // namespace json_benchmark diff --git a/benchmark/json_benchmark/runner_base.h b/benchmark/json_benchmark/runner_base.h new file mode 100644 index 0000000000..a3758b4b5f --- /dev/null +++ b/benchmark/json_benchmark/runner_base.h @@ -0,0 +1,40 @@ +#pragma once + +#include "constants.h" +#include "run_json_benchmark.h" +#include "diff_results.h" + +namespace json_benchmark { + +// +// Extend this to create a new type of test (e.g. partial_tweets). +// +template +struct runner_base { + /** Run once, before all iterations. 
*/ + simdjson_warn_unused bool setup(benchmark::State &) { return true; } + + /** Run on each iteration. This is what gets benchmarked. */ + simdjson_warn_unused bool run(benchmark::State &state) { + return implementation.run(state); + } + + /** Called before each iteration, to clear / set up state. */ + simdjson_warn_unused bool before_run(benchmark::State &state) { return true; } + + /** Called after each iteration, to tear down / massage state. */ + simdjson_warn_unused bool after_run(benchmark::State &) { return true; } + + /** Get the total number of bytes processed in each iteration. Used for metrics like bytes/second. */ + size_t bytes_per_iteration(); + + /** Get the total number of documents processed in each iteration. Used for metrics like documents/second. */ + size_t documents_per_iteration(); + + /** Get the total number of items processed in each iteration. Used for metrics like items/second. */ + size_t items_per_iteration(); + + I implementation{}; +}; + +} \ No newline at end of file diff --git a/benchmark/json_benchmark/string_runner.h b/benchmark/json_benchmark/string_runner.h new file mode 100644 index 0000000000..f9b4603869 --- /dev/null +++ b/benchmark/json_benchmark/string_runner.h @@ -0,0 +1,37 @@ +#pragma once + +#include "runner_base.h" +#include "simdjson.h" + +namespace json_benchmark { + +template +struct string_runner : public runner_base { + const simdjson::padded_string &original_json; + simdjson::padded_string json; + string_runner(const simdjson::padded_string &_json) : original_json{_json}, json(original_json.data(), original_json.size()) {} + + simdjson_warn_unused bool before_run(benchmark::State &state) { + if (!runner_base::after_run(state)) { return false; }; + // Copy the original JSON in case we did *in situ* + std::memcpy(json.data(), original_json.data(), original_json.size()); + return true; + } + + /** Get the total number of bytes processed in each iteration. Used for metrics like bytes/second. 
*/ + size_t bytes_per_iteration() { + return json.size(); + } + + /** Get the total number of documents processed in each iteration. Used for metrics like documents/second. */ + size_t documents_per_iteration() { + return 1; + } + + /** Get the total number of items processed in each iteration. Used for metrics like items/second. */ + size_t items_per_iteration() { + return 1; + } +}; + +} // namespace json_benchmark \ No newline at end of file diff --git a/benchmark/kostya/boostjson.h b/benchmark/kostya/boostjson.h new file mode 100644 index 0000000000..bfe2b2c83a --- /dev/null +++ b/benchmark/kostya/boostjson.h @@ -0,0 +1,29 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_BOOSTJSON + +#include "kostya.h" + +namespace kostya { + +struct boostjson { + static constexpr diff_flags DiffFlags = diff_flags::IMPRECISE_FLOATS; + + bool run(simdjson::padded_string &json, std::vector &result) { + auto root = boost::json::parse(json); + for (const auto &point : root.at("coordinates").as_array()) { + result.emplace_back(json_benchmark::point{ + point.at("x").to_number(), + point.at("y").to_number(), + point.at("z").to_number() + }); + } + return true; + } +}; + +BENCHMARK_TEMPLATE(kostya, boostjson)->UseManualTime(); + +} // namespace kostya + +#endif // SIMDJSON_COMPETITION_BOOSTJSON diff --git a/benchmark/kostya/kostya.h b/benchmark/kostya/kostya.h new file mode 100644 index 0000000000..42ce8ebc8a --- /dev/null +++ b/benchmark/kostya/kostya.h @@ -0,0 +1,86 @@ +#pragma once + +#include "json_benchmark/string_runner.h" +#include "json_benchmark/point.h" +#include +#include + +namespace kostya { + +using namespace json_benchmark; + +static const simdjson::padded_string &get_built_json_array(); + +template +struct runner : public string_runner { + std::vector result; + + runner() : string_runner(get_built_json_array()) {} + + bool before_run(benchmark::State &state) { + if (!string_runner::before_run(state)) { return false; } + result.clear(); + return true; + } + + bool 
run(benchmark::State &) { + return this->implementation.run(this->json, result); + } + + template + bool diff(benchmark::State &state, runner &reference) { + return diff_results(state, result, reference.result, I::DiffFlags); + } + + size_t items_per_iteration() { + return result.size(); + } +}; + +static void append_coordinate(std::default_random_engine &e, std::uniform_real_distribution<> &dis, std::stringstream &myss) { + using std::endl; + myss << R"( {)" << endl; + myss << R"( "x": )" << dis(e) << "," << endl; + myss << R"( "y": )" << dis(e) << "," << endl; + myss << R"( "z": )" << dis(e) << "," << endl; + myss << R"( "name": ")" << char('a'+dis(e)*25) << char('a'+dis(e)*25) << char('a'+dis(e)*25) << char('a'+dis(e)*25) << char('a'+dis(e)*25) << char('a'+dis(e)*25) << " " << int(dis(e)*10000) << "\"," << endl; + myss << R"( "opts": {)" << endl; + myss << R"( "1": [)" << endl; + myss << R"( 1,)" << endl; + myss << R"( true)" << endl; + myss << R"( ])" << endl; + myss << R"( })" << endl; + myss << R"( })"; +} + +static std::string build_json_array(size_t N) { + using namespace std; + default_random_engine e; + uniform_real_distribution<> dis(0, 1); + stringstream myss; + myss << R"({)" << endl; + myss << R"( "coordinates": [)" << endl; + for (size_t i=1; i simdjson_inline static void kostya(benchmark::State &state) { + run_json_benchmark, runner>(state); +} + +} // namespace kostya diff --git a/benchmark/kostya/nlohmann_json.h b/benchmark/kostya/nlohmann_json.h new file mode 100644 index 0000000000..9552edca26 --- /dev/null +++ b/benchmark/kostya/nlohmann_json.h @@ -0,0 +1,25 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "kostya.h" + +namespace kostya { + +struct nlohmann_json { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + bool run(simdjson::padded_string &json, std::vector &result) { + auto root = nlohmann::json::parse(json.data(), json.data() + json.size()); + for (auto point : root["coordinates"]) { + 
result.emplace_back(json_benchmark::point{point["x"], point["y"], point["z"]}); + } + return true; + } +}; + +BENCHMARK_TEMPLATE(kostya, nlohmann_json)->UseManualTime(); + +} // namespace kostya + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON diff --git a/benchmark/kostya/nlohmann_json_sax.h b/benchmark/kostya/nlohmann_json_sax.h new file mode 100644 index 0000000000..24df872ac2 --- /dev/null +++ b/benchmark/kostya/nlohmann_json_sax.h @@ -0,0 +1,74 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "kostya.h" + +namespace kostya { + +using json = nlohmann::json; + +struct nlohmann_json_sax { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + struct Handler : json::json_sax_t + { + size_t k{0}; + double buffer[3]; + std::vector& result; + + Handler(std::vector& r) : result(r) { } + + bool key(string_t& val) override { + switch(val[0]) { + case 'x': + k = 0; + break; + case 'y': + k = 1; + break; + case 'z': + k = 2; + break; + } + return true; + } + bool number_float(number_float_t val, const string_t& s) override { + buffer[k] = val; + if (k == 2) { + result.emplace_back(json_benchmark::point{buffer[0],buffer[1],buffer[2]}); + k = 0; + } + return true; + } + bool number_unsigned(number_unsigned_t val) override { // Need this event because coordinate value can be equal to 1 + buffer[k] = double(val); + if (k == 2) { + result.emplace_back(json_benchmark::point{buffer[0],buffer[1],buffer[2]}); + k = 0; + } + return true; + } + // Irrelevant events + bool null() override { return true; } + bool boolean(bool val) override { return true; } + bool number_integer(number_integer_t val) override { return true; } + bool string(string_t& val) override { return true; } + bool start_object(std::size_t elements) override { return true; } + bool end_object() override { return true; } + bool start_array(std::size_t elements) override { return true; } + bool end_array() override { return true; } + bool binary(json::binary_t& val) override { return 
true; } + bool parse_error(std::size_t position, const std::string& last_token, const json::exception& ex) override { return false; } + }; // Handler + + bool run(simdjson::padded_string &json, std::vector &result) { + Handler handler(result); + json::sax_parse(json.data(), &handler); + return true; + } +}; // nlohmann_json_sax +BENCHMARK_TEMPLATE(kostya, nlohmann_json_sax)->UseManualTime(); +} // namespace kostya + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/kostya/rapidjson.h b/benchmark/kostya/rapidjson.h new file mode 100644 index 0000000000..960130afbc --- /dev/null +++ b/benchmark/kostya/rapidjson.h @@ -0,0 +1,62 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "kostya.h" + +namespace kostya { + +using namespace rapidjson; + +struct rapidjson_base { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + Document doc; + + simdjson_inline double get_double(Value &object, std::string_view key) { + auto field = object.FindMember(key.data()); + if (field == object.MemberEnd()) { throw "Missing double field"; } + if (!field->value.IsNumber()) { throw "Field is not double"; } + return field->value.GetDouble(); + } + + bool run(Document &root, std::vector &result) { + if (root.HasParseError()) { return false; } + if (!root.IsObject()) { return false; } + auto coords = root.FindMember("coordinates"); + if (coords == root.MemberEnd()) { return false; } + if (!coords->value.IsArray()) { return false; } + for (auto &coord : coords->value.GetArray()) { + if (!coord.IsObject()) { return false; } + result.emplace_back(json_benchmark::point{get_double(coord, "x"), get_double(coord, "y"), get_double(coord, "z")}); + } + + return true; + } +}; +#if SIMDJSON_COMPETITION_ONDEMAND_APPROX +struct rapidjson_approx : rapidjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return rapidjson_base::run(doc.Parse(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(kostya, 
rapidjson_approx)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_APPROX + +struct rapidjson : rapidjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return rapidjson_base::run(doc.Parse(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(kostya, rapidjson)->UseManualTime(); +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct rapidjson_insitu : rapidjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return rapidjson_base::run(doc.ParseInsitu(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(kostya, rapidjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace kostya + +#endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/kostya/rapidjson_sax.h b/benchmark/kostya/rapidjson_sax.h new file mode 100644 index 0000000000..be49a9304a --- /dev/null +++ b/benchmark/kostya/rapidjson_sax.h @@ -0,0 +1,70 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "kostya.h" + +namespace kostya { + +using namespace rapidjson; + +struct rapidjson_sax { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + struct Handler { + size_t k{0}; + double buffer[3]; + std::vector& result; + + Handler(std::vector &r) : result(r) { } + + bool Key(const char* key, SizeType length, bool copy) { + switch(key[0]) { + case 'x': + k = 0; + break; + case 'y': + k = 1; + break; + case 'z': + k = 2; + break; + } + return true; + } + bool Double(double d) { + buffer[k] = d; + if (k == 2) { + result.emplace_back(json_benchmark::point{buffer[0],buffer[1],buffer[2]}); + k = 0; + } + return true; + } + bool Uint(unsigned i) { return Double(i); } // Need this event because coordinate value can be equal to 1 + // Irrelevant events + bool Null() { return true; } + bool Bool(bool b) { return true; } + bool Int(int i) { return true; } + bool Int64(int64_t i) { return true; } + bool Uint64(uint64_t i) { return true; } + bool RawNumber(const char* str, SizeType length, bool 
copy) { return true; } + bool String(const char* str, SizeType length, bool copy) { return true; } + bool StartObject() { return true; } + bool EndObject(SizeType memberCount) { return true; } + bool StartArray() { return true; } + bool EndArray(SizeType elementCount) { return true; } + }; // handler + + bool run(simdjson::padded_string &json, std::vector &result) { + Reader reader; + Handler handler(result); + InsituStringStream ss(json.data()); + reader.Parse(ss,handler); + return true; + } + +}; // rapid_jason_sax +BENCHMARK_TEMPLATE(kostya, rapidjson_sax)->UseManualTime(); +} // namespace kostya + +#endif // SIMDJSON_COMPETITION_RAPIDJSON \ No newline at end of file diff --git a/benchmark/kostya/sajson.h b/benchmark/kostya/sajson.h new file mode 100644 index 0000000000..972f43c631 --- /dev/null +++ b/benchmark/kostya/sajson.h @@ -0,0 +1,66 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_SAJSON + +#include "kostya.h" + +namespace kostya { + +struct sajson { + static constexpr diff_flags DiffFlags = diff_flags::IMPRECISE_FLOATS; + + size_t ast_buffer_size{0}; + size_t *ast_buffer{nullptr}; + ~sajson() { free(ast_buffer); } + + simdjson_inline double get_double(const ::sajson::value &obj, std::string_view key) { + using namespace sajson; + + auto val = obj.get_value_of_key({key.data(), key.length()}); + switch (val.get_type()) { + case TYPE_INTEGER: + case TYPE_DOUBLE: + return val.get_number_value(); + default: + throw "field not double"; + } + } + + bool run(simdjson::padded_string &json, std::vector &result) { + using namespace sajson; + + if (!ast_buffer) { + ast_buffer_size = json.size(); + ast_buffer = (size_t *)std::malloc(ast_buffer_size * sizeof(size_t)); + } + auto doc = parse( + bounded_allocation(ast_buffer, ast_buffer_size), + mutable_string_view(json.size(), json.data()) + ); + if (!doc.is_valid()) { return false; } + + auto root = doc.get_root(); + if (root.get_type() != TYPE_OBJECT) { return false; } + auto points = 
root.get_value_of_key({"coordinates", strlen("coordinates")}); + if (points.get_type() != TYPE_ARRAY) { return false; } + + for (size_t i=0; iUseManualTime(); + +} // namespace kostya + +#endif // SIMDJSON_COMPETITION_SAJSON + diff --git a/benchmark/kostya/simdjson_dom.h b/benchmark/kostya/simdjson_dom.h new file mode 100644 index 0000000000..34c28614e4 --- /dev/null +++ b/benchmark/kostya/simdjson_dom.h @@ -0,0 +1,28 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "kostya.h" + +namespace kostya { + +using namespace simdjson; + +struct simdjson_dom { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + dom::parser parser{}; + + bool run(simdjson::padded_string &json, std::vector &result) { + for (auto point : parser.parse(json)["coordinates"]) { + result.emplace_back(json_benchmark::point{point["x"], point["y"], point["z"]}); + } + return true; + } +}; + +BENCHMARK_TEMPLATE(kostya, simdjson_dom)->UseManualTime(); + +} // namespace kostya + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/kostya/simdjson_ondemand.h b/benchmark/kostya/simdjson_ondemand.h new file mode 100644 index 0000000000..33236c18ab --- /dev/null +++ b/benchmark/kostya/simdjson_ondemand.h @@ -0,0 +1,29 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "kostya.h" + +namespace kostya { + +using namespace simdjson; + +struct simdjson_ondemand { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, std::vector &result) { + auto doc = parser.iterate(json); + for (ondemand::object point : doc.find_field("coordinates")) { + result.emplace_back(json_benchmark::point{point.find_field("x"), point.find_field("y"), point.find_field("z")}); + } + return true; + } +}; + +BENCHMARK_TEMPLATE(kostya, simdjson_ondemand)->UseManualTime(); + +} // namespace kostya + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/kostya/yyjson.h b/benchmark/kostya/yyjson.h new file mode 
100644 index 0000000000..544adb24ed --- /dev/null +++ b/benchmark/kostya/yyjson.h @@ -0,0 +1,66 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_YYJSON + +#include "kostya.h" + +namespace kostya { + +struct yyjson_base { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + simdjson_inline double get_double(yyjson_val *obj, std::string_view key) { + yyjson_val *val = yyjson_obj_getn(obj, key.data(), key.length()); + if (!val) { throw "missing point field!"; } + if (yyjson_get_type(val) != YYJSON_TYPE_NUM) { throw "Number is not a type!"; } + + switch (yyjson_get_subtype(val)) { + case YYJSON_SUBTYPE_UINT: + return double(yyjson_get_uint(val)); + case YYJSON_SUBTYPE_SINT: + return double(yyjson_get_sint(val)); + case YYJSON_SUBTYPE_REAL: + return yyjson_get_real(val); + default: + SIMDJSON_UNREACHABLE(); + } + SIMDJSON_UNREACHABLE(); + return 0.0; // unreachable + } + + bool run(yyjson_doc *doc, std::vector &result) { + if (!doc) { return false; } + yyjson_val *root = yyjson_doc_get_root(doc); + if (!yyjson_is_obj(root)) { return false; } + yyjson_val *coords = yyjson_obj_get(root, "coordinates"); + if (!yyjson_is_arr(coords)) { return false; } + + size_t idx, max; + yyjson_val *coord; + yyjson_arr_foreach(coords, idx, max, coord) { + if (!yyjson_is_obj(coord)) { return false; } + result.emplace_back(json_benchmark::point{get_double(coord, "x"), get_double(coord, "y"), get_double(coord, "z")}); + } + + return true; + } + +}; + +struct yyjson : yyjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return yyjson_base::run(yyjson_read(json.data(), json.size(), 0), result); + } +}; +BENCHMARK_TEMPLATE(kostya, yyjson)->UseManualTime(); +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct yyjson_insitu : yyjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return yyjson_base::run(yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0), result); + } +}; +BENCHMARK_TEMPLATE(kostya, 
yyjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace kostya + +#endif // SIMDJSON_COMPETITION_YYJSON diff --git a/benchmark/large_amazon_cellphones/large_amazon_cellphones.h b/benchmark/large_amazon_cellphones/large_amazon_cellphones.h new file mode 100644 index 0000000000..8a3b7119a9 --- /dev/null +++ b/benchmark/large_amazon_cellphones/large_amazon_cellphones.h @@ -0,0 +1,94 @@ +#pragma once + +#include "json_benchmark/string_runner.h" +#include +#include + +namespace large_amazon_cellphones { + +const bool UNTHREADED = false; +const bool THREADED = true; + +static const simdjson::padded_string &get_built_json(); + +using namespace json_benchmark; + +struct brand { + double cumulative_rating; + uint64_t reviews_count; + simdjson_inline bool operator==(const brand &other) const { + return cumulative_rating == other.cumulative_rating && + reviews_count == other.reviews_count; + } + simdjson_inline bool operator!=(const brand &other) const { return !(*this == other); } +}; + +simdjson_unused static std::ostream &operator<<(std::ostream &o, const brand &b) { + o << "cumulative_rating: " << b.cumulative_rating << std::endl; + o << "reviews_count: " << b.reviews_count << std::endl; + return o; +} + +template +simdjson_unused static std::ostream &operator<<(std::ostream &o, const std::pair &p) { + o << "brand: " << p.first << std::endl; + o << p.second; + return o; +} + +template +struct runner : public string_runner { + std::map result{}; + + runner() : string_runner(get_built_json()) {} + + bool before_run(benchmark::State &state) { + if (!string_runner::before_run(state)) { return false; } + result.clear(); + return true; + } + + bool run(benchmark::State &) { + return this->implementation.run(this->json, result); + } + + template + bool diff(benchmark::State &state, runner &reference) { + return diff_results(state, result, reference.result, diff_flags::NONE); + } + + size_t items_per_iteration() { + return result.size(); + 
} +}; + +static std::string build_json(size_t N) { + std::ifstream in(AMAZON_CELLPHONES_NDJSON); + std::string answer((std::istreambuf_iterator(in)), std::istreambuf_iterator()); + // Find position of first line to exclude it in further copies + size_t first_line = answer.find('\n'); + std::string copy(answer,first_line + 1); + size_t count{1}; + + while (answer.size() < N) { + answer.append(copy); + count++; + } + + std::cout << "Creating a source file spanning " << (answer.size() + 512) / (1024*1024) << " MB (" << count << " copies of original file)" << std::endl; + return answer; +} + +static const simdjson::padded_string &get_built_json() { + static simdjson::padded_string json = build_json(10*1024*1024); + return json; +} + +template +struct simdjson_dom; + +template simdjson_inline static void large_amazon_cellphones(benchmark::State &state) { + run_json_benchmark, runner>>(state); +} + +} // namespace large_amazon_cellphones diff --git a/benchmark/large_amazon_cellphones/simdjson_dom.h b/benchmark/large_amazon_cellphones/simdjson_dom.h new file mode 100644 index 0000000000..90f0a0865f --- /dev/null +++ b/benchmark/large_amazon_cellphones/simdjson_dom.h @@ -0,0 +1,52 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "large_amazon_cellphones.h" +#include + +namespace large_amazon_cellphones { + +using namespace simdjson; + +template +struct simdjson_dom { + using StringType = std::string; + + dom::parser parser{}; + + bool run(simdjson::padded_string &json, std::map &result) { +#ifdef SIMDJSON_THREADS_ENABLED + parser.threaded = threaded; +#endif + auto stream = parser.parse_many(json); + auto i = stream.begin(); + ++i; // Skip first line + for (;i != stream.end(); ++i) { + auto doc = *i; + StringType copy(std::string_view(doc.at(1))); + auto x = result.find(copy); + if (x == result.end()) { // If key not found, add new key + result.emplace(copy, large_amazon_cellphones::brand{ + double(doc.at(5)) * uint64_t(doc.at(7)), + uint64_t(doc.at(7)) + }); + } 
else { // Otherwise, update key data + x->second.cumulative_rating += double(doc.at(5)) * uint64_t(doc.at(7)); + x->second.reviews_count += uint64_t(doc.at(7)); + } + } + + return true; + } + +}; + +BENCHMARK_TEMPLATE(large_amazon_cellphones, simdjson_dom)->UseManualTime(); +#ifdef SIMDJSON_THREADS_ENABLED +BENCHMARK_TEMPLATE(large_amazon_cellphones, simdjson_dom)->UseManualTime(); +#endif + +} // namespace large_amazon_cellphones + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/large_amazon_cellphones/simdjson_ondemand.h b/benchmark/large_amazon_cellphones/simdjson_ondemand.h new file mode 100644 index 0000000000..c1acdc455c --- /dev/null +++ b/benchmark/large_amazon_cellphones/simdjson_ondemand.h @@ -0,0 +1,72 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "large_amazon_cellphones.h" + +namespace large_amazon_cellphones { + +using namespace simdjson; + +template +struct simdjson_ondemand { + using StringType = std::string; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, std::map &result) { +#ifdef SIMDJSON_THREADS_ENABLED + parser.threaded = threaded; +#endif + ondemand::document_stream stream = parser.iterate_many(json); + ondemand::document_stream::iterator i = stream.begin(); + ++i; // Skip first line + for (;i != stream.end(); ++i) { + auto doc = *i; + size_t index{0}; + StringType copy; + double rating; + uint64_t reviews; + for ( auto value : doc ) { + switch (index) + { + case 1: + copy = StringType(std::string_view(value)); + break; + case 5: + rating = double(value); + break; + case 7: + reviews = uint64_t(value); + break; + default: + break; + } + index++; + } + + auto x = result.find(copy); + if (x == result.end()) { // If key not found, add new key + result.emplace(copy, large_amazon_cellphones::brand{ + rating * reviews, + reviews + }); + } else { // Otherwise, update key data + x->second.cumulative_rating += rating * reviews; + x->second.reviews_count += reviews; + } + } + + 
return true; + } + +}; + +BENCHMARK_TEMPLATE(large_amazon_cellphones, simdjson_ondemand)->UseManualTime(); +#ifdef SIMDJSON_THREADS_ENABLED +BENCHMARK_TEMPLATE(large_amazon_cellphones, simdjson_ondemand)->UseManualTime(); +#endif + +} // namespace amazon_cellphones + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/large_random/boostjson.h b/benchmark/large_random/boostjson.h new file mode 100644 index 0000000000..da2fdf63cd --- /dev/null +++ b/benchmark/large_random/boostjson.h @@ -0,0 +1,29 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_BOOSTJSON + +#include "large_random.h" + +namespace large_random { + +struct boostjson { + static constexpr diff_flags DiffFlags = diff_flags::IMPRECISE_FLOATS; + + bool run(simdjson::padded_string &json, std::vector &result) { + auto root = boost::json::parse(json); + for (const auto &point : root.as_array()) { + result.emplace_back(json_benchmark::point{ + point.at("x").to_number(), + point.at("y").to_number(), + point.at("z").to_number() + }); + } + return true; + } +}; + +BENCHMARK_TEMPLATE(large_random, boostjson)->UseManualTime(); + +} // namespace large_random + +#endif // SIMDJSON_COMPETITION_BOOSTJSON diff --git a/benchmark/large_random/large_random.h b/benchmark/large_random/large_random.h new file mode 100644 index 0000000000..039884f4a9 --- /dev/null +++ b/benchmark/large_random/large_random.h @@ -0,0 +1,73 @@ +#pragma once + +#include "json_benchmark/string_runner.h" +#include "json_benchmark/point.h" +#include + +namespace large_random { + +static const simdjson::padded_string &get_built_json_array(); + +using namespace json_benchmark; + +simdjson_unused static std::ostream &operator<<(std::ostream &o, const point &p) { + return o << p.x << "," << p.y << "," << p.z << std::endl; +} + +template +struct runner : public string_runner { + std::vector result; + + runner() : string_runner(get_built_json_array()) {} + + bool before_run(benchmark::State &state) { + if 
(!string_runner::before_run(state)) { return false; } + result.clear(); + return true; + } + + bool run(benchmark::State &) { + return this->implementation.run(this->json, result); + } + + template + bool diff(benchmark::State &state, runner &reference) { + return diff_results(state, result, reference.result, I::DiffFlags); + } + + size_t items_per_iteration() { + return result.size(); + } +}; + +static std::string build_json_array(size_t N) { + std::default_random_engine e; + std::uniform_real_distribution<> dis(0, 1); + std::stringstream myss; + myss << "[" << std::endl; + if(N > 0) { + myss << "{ \"x\":" << dis(e) << ", \"y\":" << dis(e) << ", \"z\":" << dis(e) << "}" << std::endl; + } + for(size_t i = 1; i < N; i++) { + myss << "," << std::endl; + myss << "{ \"x\":" << dis(e) << ", \"y\":" << dis(e) << ", \"z\":" << dis(e) << "}"; + } + myss << std::endl; + myss << "]" << std::endl; + std::string answer = myss.str(); + std::cout << "Creating a source file spanning " << (answer.size() + 512) / 1024 << " KB " << std::endl; + return answer; +} + +static const simdjson::padded_string &get_built_json_array() { + static simdjson::padded_string json = build_json_array(1000000); + return json; +} + +struct simdjson_dom; + +template static void large_random(benchmark::State &state) { + run_json_benchmark, runner>(state); +} + +} // namespace large_random diff --git a/benchmark/large_random/nlohmann_json.h b/benchmark/large_random/nlohmann_json.h new file mode 100644 index 0000000000..754a53f6ad --- /dev/null +++ b/benchmark/large_random/nlohmann_json.h @@ -0,0 +1,24 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "large_random.h" + +namespace large_random { + +struct nlohmann_json { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + bool run(simdjson::padded_string &json, std::vector &result) { + for (auto point : nlohmann::json::parse(json.data(), json.data() + json.size())) { + 
result.emplace_back(json_benchmark::point{point["x"], point["y"], point["z"]}); + } + return true; + } +}; + +BENCHMARK_TEMPLATE(large_random, nlohmann_json)->UseManualTime(); + +} // namespace large_random + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON diff --git a/benchmark/large_random/nlohmann_json_sax.h b/benchmark/large_random/nlohmann_json_sax.h new file mode 100644 index 0000000000..51aa5f31d5 --- /dev/null +++ b/benchmark/large_random/nlohmann_json_sax.h @@ -0,0 +1,73 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "large_random.h" + +namespace large_random { + +using json = nlohmann::json; + +struct nlohmann_json_sax { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + struct Handler : json::json_sax_t + { + size_t k{0}; + double buffer[3]; + std::vector& result; + + Handler(std::vector &r) : result(r) { } + + bool key(string_t& val) override { + switch(val[0]) { + case 'x': + k = 0; + break; + case 'y': + k = 1; + break; + case 'z': + k = 2; + break; + } + return true; + } + bool number_unsigned(number_unsigned_t val) override { + buffer[k] = double(val); + if (k == 2) { + result.emplace_back(json_benchmark::point{buffer[0],buffer[1],buffer[2]}); + k = 0; + } + return true; + } + bool number_float(number_float_t val, const string_t& s) override { + buffer[k] = val; + if (k == 2) { + result.emplace_back(json_benchmark::point{buffer[0],buffer[1],buffer[2]}); + k = 0; + } + return true; + } + // Irrelevant events + bool null() override { return true; } + bool boolean(bool val) override { return true; } + bool number_integer(number_integer_t val) override { return true; } + bool string(string_t& val) override { return true; } + bool start_object(std::size_t elements) override { return true; } + bool end_object() override { return true; } + bool start_array(std::size_t elements) override { return true; } + bool end_array() override { return true; } + bool binary(json::binary_t& val) override { return true; } + bool 
parse_error(std::size_t position, const std::string& last_token, const json::exception& ex) override { return false; } + }; // Handler + + bool run(simdjson::padded_string &json, std::vector &result) { + Handler handler(result); + json::sax_parse(json.data(), &handler); + return true; + } +}; // nlohmann_json_sax +BENCHMARK_TEMPLATE(large_random, nlohmann_json_sax)->UseManualTime(); +} // namespace large_random +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/large_random/rapidjson.h b/benchmark/large_random/rapidjson.h new file mode 100644 index 0000000000..c0f83dde89 --- /dev/null +++ b/benchmark/large_random/rapidjson.h @@ -0,0 +1,61 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "large_random.h" + +namespace large_random { + +using namespace rapidjson; + +struct rapidjson_base { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + Document doc; + + simdjson_inline double get_double(Value &object, std::string_view key) { + auto field = object.FindMember(key.data()); + if (field == object.MemberEnd()) { throw "Missing double field"; } + if (!field->value.IsNumber()) { throw "Field is not double"; } + return field->value.GetDouble(); + } + + bool run(Document &coords, std::vector &result) { + if (coords.HasParseError()) { return false; } + if (!coords.IsArray()) { return false; } + for (auto &coord : coords.GetArray()) { + if (!coord.IsObject()) { return false; } + result.emplace_back(json_benchmark::point{get_double(coord, "x"), get_double(coord, "y"), get_double(coord, "z")}); + } + + return true; + } +}; +#if SIMDJSON_COMPETITION_ONDEMAND_APPROX +struct rapidjson_approx : rapidjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return rapidjson_base::run(doc.Parse(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(large_random, rapidjson_approx)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_APPROX + +struct rapidjson : rapidjson_base { + 
bool run(simdjson::padded_string &json, std::vector &result) { + return rapidjson_base::run(doc.Parse(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(large_random, rapidjson)->UseManualTime(); + +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct rapidjson_insitu : rapidjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return rapidjson_base::run(doc.ParseInsitu(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(large_random, rapidjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU + +} // namespace large_random + +#endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/large_random/rapidjson_sax.h b/benchmark/large_random/rapidjson_sax.h new file mode 100644 index 0000000000..a9dfc49967 --- /dev/null +++ b/benchmark/large_random/rapidjson_sax.h @@ -0,0 +1,69 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "large_random.h" + +namespace large_random { + +using namespace rapidjson; + +struct rapidjson_sax { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + struct Handler { + size_t k{0}; + double buffer[3]; + std::vector& result; + + Handler(std::vector &r) : result(r) { } + + bool Key(const char* key, SizeType length, bool copy) { + switch(key[0]) { + case 'x': + k = 0; + break; + case 'y': + k = 1; + break; + case 'z': + k = 2; + break; + } + return true; + } + bool Double(double d) { + buffer[k] = d; + if (k == 2) { + result.emplace_back(json_benchmark::point{buffer[0],buffer[1],buffer[2]}); + k = 0; + } + return true; + } + bool Uint(unsigned i) { return Double(i); } // Need this event because coordinate value can be equal to 1 + // Irrelevant events + bool Null() { return true; } + bool Bool(bool b) { return true; } + bool Int(int i) { return true; } + bool Int64(int64_t i) { return true; } + bool Uint64(uint64_t i) { return true; } + bool RawNumber(const char* str, SizeType length, bool copy) { return true; } + bool String(const char* str, SizeType length, bool 
copy) { return true; } + bool StartObject() { return true; } + bool EndObject(SizeType memberCount) { return true; } + bool StartArray() { return true; } + bool EndArray(SizeType elementCount) { return true; } + }; // handler + + bool run(simdjson::padded_string &json, std::vector &result) { + Reader reader; + Handler handler(result); + InsituStringStream ss(json.data()); + reader.Parse(ss,handler); + return true; + } +}; // rapid_jason_sax +BENCHMARK_TEMPLATE(large_random, rapidjson_sax)->UseManualTime(); +} // namespace large_random + +#endif // SIMDJSON_COMPETITION_RAPIDJSON \ No newline at end of file diff --git a/benchmark/large_random/sajson.h b/benchmark/large_random/sajson.h new file mode 100644 index 0000000000..a551436aec --- /dev/null +++ b/benchmark/large_random/sajson.h @@ -0,0 +1,64 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_SAJSON + +#include "large_random.h" + +namespace large_random { + +struct sajson { + static constexpr diff_flags DiffFlags = diff_flags::IMPRECISE_FLOATS; + + size_t ast_buffer_size{0}; + size_t *ast_buffer{nullptr}; + ~sajson() { free(ast_buffer); } + + simdjson_inline double get_double(const ::sajson::value &obj, std::string_view key) { + using namespace sajson; + + auto val = obj.get_value_of_key({key.data(), key.length()}); + switch (val.get_type()) { + case TYPE_INTEGER: + case TYPE_DOUBLE: + return val.get_number_value(); + default: + throw "field not double"; + } + } + + bool run(simdjson::padded_string &json, std::vector &result) { + using namespace sajson; + + if (!ast_buffer) { + ast_buffer_size = json.size(); + ast_buffer = (size_t *)std::malloc(ast_buffer_size * sizeof(size_t)); + } + auto doc = parse( + bounded_allocation(ast_buffer, ast_buffer_size), + mutable_string_view(json.size(), json.data()) + ); + if (!doc.is_valid()) { return false; } + + auto points = doc.get_root(); + if (points.get_type() != TYPE_ARRAY) { return false; } + + for (size_t i=0; iUseManualTime(); + +} // namespace large_random + +#endif 
// SIMDJSON_COMPETITION_SAJSON + diff --git a/benchmark/large_random/simdjson_dom.h b/benchmark/large_random/simdjson_dom.h new file mode 100644 index 0000000000..80e4e16573 --- /dev/null +++ b/benchmark/large_random/simdjson_dom.h @@ -0,0 +1,28 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "large_random.h" + +namespace large_random { + +using namespace simdjson; + +struct simdjson_dom { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + dom::parser parser{}; + + bool run(simdjson::padded_string &json, std::vector &result) { + for (auto point : parser.parse(json)) { + result.emplace_back(json_benchmark::point{point["x"], point["y"], point["z"]}); + } + return true; + } +}; + +BENCHMARK_TEMPLATE(large_random, simdjson_dom)->UseManualTime(); + +} // namespace large_random + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/large_random/simdjson_ondemand.h b/benchmark/large_random/simdjson_ondemand.h new file mode 100644 index 0000000000..4fd1fbfdb7 --- /dev/null +++ b/benchmark/large_random/simdjson_ondemand.h @@ -0,0 +1,29 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "large_random.h" + +namespace large_random { + +using namespace simdjson; + +struct simdjson_ondemand { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, std::vector &result) { + auto doc = parser.iterate(json); + for (ondemand::object coord : doc) { + result.emplace_back(json_benchmark::point{coord.find_field("x"), coord.find_field("y"), coord.find_field("z")}); + } + return true; + } +}; + +BENCHMARK_TEMPLATE(large_random, simdjson_ondemand)->UseManualTime(); + +} // namespace large_random + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/large_random/simdjson_ondemand_unordered.h b/benchmark/large_random/simdjson_ondemand_unordered.h new file mode 100644 index 0000000000..27164d4a88 --- /dev/null +++ 
b/benchmark/large_random/simdjson_ondemand_unordered.h @@ -0,0 +1,29 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "large_random.h" + +namespace large_random { + +using namespace simdjson; + +struct simdjson_ondemand_unordered { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, std::vector &result) { + auto doc = parser.iterate(json); + for (ondemand::object coord : doc) { + result.emplace_back(json_benchmark::point{coord["x"], coord["y"], coord["z"]}); + } + return true; + } +}; + +BENCHMARK_TEMPLATE(large_random, simdjson_ondemand_unordered)->UseManualTime(); + +} // namespace large_random + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/large_random/yyjson.h b/benchmark/large_random/yyjson.h new file mode 100644 index 0000000000..d8ccd2d4ea --- /dev/null +++ b/benchmark/large_random/yyjson.h @@ -0,0 +1,64 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_YYJSON + +#include "large_random.h" + +namespace large_random { + +struct yyjson_base { + static constexpr diff_flags DiffFlags = diff_flags::NONE; + + simdjson_inline double get_double(yyjson_val *obj, std::string_view key) { + yyjson_val *val = yyjson_obj_getn(obj, key.data(), key.length()); + if (!val) { throw "missing point field!"; } + if (yyjson_get_type(val) != YYJSON_TYPE_NUM) { throw "Number is not a type!"; } + + switch (yyjson_get_subtype(val)) { + case YYJSON_SUBTYPE_UINT: + return double(yyjson_get_uint(val)); + case YYJSON_SUBTYPE_SINT: + return double(yyjson_get_sint(val)); + case YYJSON_SUBTYPE_REAL: + return yyjson_get_real(val); + default: + SIMDJSON_UNREACHABLE(); + } + SIMDJSON_UNREACHABLE(); + return 0.0; // unreachable + } + + bool run(yyjson_doc *doc, std::vector &result) { + if (!doc) { return false; } + yyjson_val *coords = yyjson_doc_get_root(doc); + if (!yyjson_is_arr(coords)) { return false; } + + // Walk the document, parsing the tweets as we go + size_t idx, max; + yyjson_val *coord; 
+ yyjson_arr_foreach(coords, idx, max, coord) { + if (!yyjson_is_obj(coord)) { return false; } + result.emplace_back(json_benchmark::point{get_double(coord, "x"), get_double(coord, "y"), get_double(coord, "z")}); + } + + return true; + } +}; + +struct yyjson : yyjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return yyjson_base::run(yyjson_read(json.data(), json.size(), 0), result); + } +}; +BENCHMARK_TEMPLATE(large_random, yyjson)->UseManualTime(); +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct yyjson_insitu : yyjson_base { + bool run(simdjson::padded_string &json, std::vector &result) { + return yyjson_base::run(yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0), result); + } +}; +BENCHMARK_TEMPLATE(large_random, yyjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace large_random + +#endif // SIMDJSON_COMPETITION_YYJSON diff --git a/benchmark/largerandom/iter.h b/benchmark/largerandom/iter.h new file mode 100644 index 0000000000..2dd9d0a9a7 --- /dev/null +++ b/benchmark/largerandom/iter.h @@ -0,0 +1,52 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "largerandom.h" + +namespace largerandom { + +using namespace simdjson; + +class Iter { +public: + simdjson_inline bool Run(const padded_string &json); + + simdjson_inline const std::vector &Result() { return container; } + simdjson_inline size_t ItemCount() { return container.size(); } + +private: + ondemand::parser parser{}; + std::vector container{}; + + simdjson_inline double first_double(ondemand::json_iterator &iter) { + if (iter.start_object().error() || iter.field_key().error() || iter.field_value()) { throw "Invalid field"; } + return iter.consume_double(); + } + + simdjson_inline double next_double(ondemand::json_iterator &iter) { + if (!iter.has_next_field() || iter.field_key().error() || iter.field_value()) { throw "Invalid field"; } + return iter.consume_double(); + } + +}; + +simdjson_inline bool 
Iter::Run(const padded_string &json) { + container.clear(); + + auto iter = parser.iterate_raw(json).value(); + if (iter.start_array()) { + do { + container.emplace_back(my_point{first_double(iter), next_double(iter), next_double(iter)}); + if (iter.has_next_field()) { throw "Too many fields"; } + } while (iter.has_next_element()); + } + + return true; +} + +BENCHMARK_TEMPLATE(LargeRandom, Iter); + +} // namespace largerandom + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/largerandom/ondemand.h b/benchmark/largerandom/ondemand.h new file mode 100644 index 0000000000..eedf034b24 --- /dev/null +++ b/benchmark/largerandom/ondemand.h @@ -0,0 +1,37 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "largerandom.h" + +namespace largerandom { + +using namespace simdjson; + +class OnDemand { +public: + simdjson_inline bool Run(const padded_string &json); + simdjson_inline const std::vector &Result() { return container; } + simdjson_inline size_t ItemCount() { return container.size(); } + +private: + ondemand::parser parser{}; + std::vector container{}; +}; + +simdjson_inline bool OnDemand::Run(const padded_string &json) { + container.clear(); + + auto doc = parser.iterate(json); + for (ondemand::object coord : doc) { + container.emplace_back(my_point{coord.find_field("x"), coord.find_field("y"), coord.find_field("z")}); + } + + return true; +} + +BENCHMARK_TEMPLATE(LargeRandom, OnDemand); + +} // namespace largerandom + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/largerandom/sax.h b/benchmark/largerandom/sax.h new file mode 100644 index 0000000000..5b527bf37e --- /dev/null +++ b/benchmark/largerandom/sax.h @@ -0,0 +1,120 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "largerandom.h" + +namespace largerandom { + +using namespace simdjson; +using namespace simdjson::builtin::stage2; + +class Sax { +public: + simdjson_inline bool Run(const padded_string &json) noexcept; + + simdjson_inline const std::vector &Result() { return container; } + 
simdjson_inline size_t ItemCount() { return container.size(); } + +private: + simdjson_inline error_code RunNoExcept(const padded_string &json) noexcept; + error_code Allocate(size_t new_capacity); + std::unique_ptr string_buf{}; + size_t capacity{}; + dom_parser_implementation dom_parser{}; + std::vector container{}; +}; + +struct sax_point_reader_visitor { +public: + std::vector &points; + enum {GOT_X=0, GOT_Y=1, GOT_Z=2, GOT_SOMETHING_ELSE=4}; + size_t idx{GOT_SOMETHING_ELSE}; + double buffer[3]={}; + + explicit sax_point_reader_visitor(std::vector &_points) : points(_points) {} + + simdjson_inline error_code visit_object_start(json_iterator &) { + idx = 0; + return SUCCESS; + } + simdjson_inline error_code visit_primitive(json_iterator &, const uint8_t *value) { + if(idx == GOT_SOMETHING_ELSE) { return simdjson::SUCCESS; } + return numberparsing::parse_double(value).get(buffer[idx]); + } + simdjson_inline error_code visit_object_end(json_iterator &) { + points.emplace_back(my_point{buffer[0], buffer[1], buffer[2]}); + return SUCCESS; + } + + simdjson_inline error_code visit_document_start(json_iterator &) { return SUCCESS; } + simdjson_inline error_code visit_key(json_iterator &, const uint8_t * key) { + switch(key[1]) { + // Technically, we should check the other characters + // in the key, but we are cheating to go as fast + // as possible. 
+ case 'x': + idx = GOT_X; + break; + case 'y': + idx = GOT_Y; + break; + case 'z': + idx = GOT_Z; + break; + default: + idx = GOT_SOMETHING_ELSE; + } + return SUCCESS; + } + simdjson_inline error_code visit_array_start(json_iterator &) { return SUCCESS; } + simdjson_inline error_code visit_array_end(json_iterator &) { return SUCCESS; } + simdjson_inline error_code visit_document_end(json_iterator &) { return SUCCESS; } + simdjson_inline error_code visit_empty_array(json_iterator &) { return SUCCESS; } + simdjson_inline error_code visit_empty_object(json_iterator &) { return SUCCESS; } + simdjson_inline error_code visit_root_primitive(json_iterator &, const uint8_t *) { return SUCCESS; } + simdjson_inline error_code increment_count(json_iterator &) { return SUCCESS; } +}; + +// NOTE: this assumes the dom_parser is already allocated +bool Sax::Run(const padded_string &json) noexcept { + auto error = RunNoExcept(json); + if (error) { std::cerr << error << std::endl; return false; } + return true; +} + +error_code Sax::RunNoExcept(const padded_string &json) noexcept { + container.clear(); + + // Allocate capacity if needed + if (capacity < json.size()) { + SIMDJSON_TRY( Allocate(json.size()) ); + } + + // Run stage 1 first. 
+ SIMDJSON_TRY( dom_parser.stage1(json.u8data(), json.size(), false) ); + + // Then walk the document, parsing the tweets as we go + json_iterator iter(dom_parser, 0); + sax_point_reader_visitor visitor(container); + SIMDJSON_TRY( iter.walk_document(visitor) ); + return SUCCESS; +} + +error_code Sax::Allocate(size_t new_capacity) { + // string_capacity copied from document::allocate + size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + SIMDJSON_PADDING, 64); + string_buf.reset(new (std::nothrow) uint8_t[string_capacity]); + if (auto error = dom_parser.set_capacity(new_capacity)) { return error; } + if (capacity == 0) { // set max depth the first time only + if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; } + } + capacity = new_capacity; + return SUCCESS; +} + +BENCHMARK_TEMPLATE(LargeRandom, Sax); + +} // namespace largerandom + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/linux/linux-perf-events.h b/benchmark/linux/linux-perf-events.h index 0f411e0ea9..4fe872d2c8 100644 --- a/benchmark/linux/linux-perf-events.h +++ b/benchmark/linux/linux-perf-events.h @@ -1,4 +1,3 @@ -// https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h #pragma once #ifdef __linux__ @@ -43,13 +42,14 @@ template class LinuxEvents { uint32_t i = 0; for (auto config : config_vec) { attribs.config = config; - fd = static_cast(syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags)); - if (fd == -1) { + int _fd = static_cast(syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags)); + if (_fd == -1) { report_error("perf_event_open"); } - ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]); + ioctl(_fd, PERF_EVENT_IOC_ID, &ids[i++]); if (group == -1) { - group = fd; + group = _fd; + fd = _fd; } } @@ -81,10 +81,16 @@ template class LinuxEvents { } } // our actual results are in slots 1,3,5, ... 
of this structure - // we really should be checking our ids obtained earlier to be safe for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) { results[i / 2] = temp_result_vec[i]; } + for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) { + if(ids[i/2-1] != temp_result_vec[i]) { + report_error("event mismatch"); + } + } + + } bool is_working() { @@ -92,10 +98,8 @@ template class LinuxEvents { } private: - void report_error(const std::string &context) { - if (working) - std::cerr << (context + ": " + std::string(strerror(errno))) << std::endl; + void report_error(const std::string &) { working = false; } }; -#endif +#endif \ No newline at end of file diff --git a/benchmark/minifiercompetition.cpp b/benchmark/minifiercompetition.cpp deleted file mode 100644 index 954063a9d0..0000000000 --- a/benchmark/minifiercompetition.cpp +++ /dev/null @@ -1,192 +0,0 @@ -#include -#include - -#include "benchmark.h" -#include "simdjson.h" - -SIMDJSON_PUSH_DISABLE_ALL_WARNINGS - -// #define RAPIDJSON_SSE2 // bad -// #define RAPIDJSON_SSE42 // bad -#include "rapidjson/document.h" -#include "rapidjson/reader.h" // you have to check in the submodule -#include "rapidjson/stringbuffer.h" -#include "rapidjson/writer.h" -#include "sajson.h" - -SIMDJSON_POP_DISABLE_WARNINGS - -using namespace rapidjson; -using namespace simdjson; - -std::string rapid_stringme_insitu(char *json) { - Document d; - d.ParseInsitu(json); - if (d.HasParseError()) { - std::cerr << "problem!" << std::endl; - return ""; // should do something - } - StringBuffer buffer; - Writer writer(buffer); - d.Accept(writer); - return buffer.GetString(); -} - -std::string rapid_stringme(char *json) { - Document d; - d.Parse(json); - if (d.HasParseError()) { - std::cerr << "problem!" 
<< std::endl; - return ""; // should do something - } - StringBuffer buffer; - Writer writer(buffer); - d.Accept(writer); - return buffer.GetString(); -} - -std::string simdjson_stringme(simdjson::padded_string & json) { - std::stringstream ss; - dom::parser parser; - dom::element doc = parser.parse(json); - ss << simdjson::minify(doc); - return ss.str(); -} - - -int main(int argc, char *argv[]) { - int c; - bool verbose = false; - bool just_data = false; - - while ((c = getopt(argc, argv, "vt")) != -1) - switch (c) { - case 't': - just_data = true; - break; - case 'v': - verbose = true; - break; - default: - abort(); - } - if (optind >= argc) { - std::cerr << "Usage: " << argv[0] << " " << std::endl; - exit(1); - } - const char *filename = argv[optind]; - simdjson::padded_string p; - auto error = simdjson::padded_string::load(filename).get(p); - if (error) { - std::cerr << "Could not load the file " << filename << std::endl; - return EXIT_FAILURE; - } - // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte - if (verbose) { - std::cout << "Input has "; - if (p.size() > 1000 * 1000) - std::cout << p.size() / (1000 * 1000) << " MB "; - else if (p.size() > 1000) - std::cout << p.size() / 1000 << " KB "; - else - std::cout << p.size() << " B "; - std::cout << std::endl; - } - char *buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1); - if(buffer == nullptr) { - std::cerr << "Out of memory!" 
<< std::endl; - abort(); - } - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] = '\0'; - - int repeat = 50; - size_t volume = p.size(); - if (just_data) { - printf( - "name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n"); - } - size_t strlength = rapid_stringme((char *)p.data()).size(); - if (verbose) - std::cout << "input length is " << p.size() << " stringified length is " - << strlength << std::endl; - BEST_TIME_NOCHECK("despacing with RapidJSON", - rapid_stringme((char *)p.data()), , repeat, volume, - !just_data); - BEST_TIME_NOCHECK( - "despacing with RapidJSON Insitu", rapid_stringme_insitu((char *)buffer), - memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); - - BEST_TIME_NOCHECK( - "despacing with std::minify", simdjson_stringme(p),, repeat, volume, !just_data); - - - memcpy(buffer, p.data(), p.size()); - size_t outlength; - uint8_t *cbuffer = (uint8_t *)buffer; - for (auto imple : simdjson::available_implementations) { - BEST_TIME((std::string("simdjson->minify+")+imple->name()).c_str(), (imple->minify(cbuffer, p.size(), cbuffer, outlength) == simdjson::SUCCESS ? outlength : -1), - outlength, memcpy(buffer, p.data(), p.size()), repeat, volume, - !just_data); - } - - printf("minisize = %zu, original size = %zu (minified down to %.2f percent " - "of original) \n", - outlength, p.size(), static_cast(outlength) * 100.0 / static_cast(p.size())); - - /*** - * Is it worth it to minify before parsing? 
- ***/ - rapidjson::Document d; - BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), - false, memcpy(buffer, p.data(), p.size()), repeat, volume, - !just_data); - - char *mini_buffer = simdjson::internal::allocate_padded_buffer(p.size() + 1); - if(mini_buffer == nullptr) { - std::cerr << "Out of memory" << std::endl; - abort(); - } - size_t minisize; - auto minierror = minify(p.data(), p.size(),mini_buffer, minisize); - if (!minierror) { std::cerr << minierror << std::endl; exit(1); } - mini_buffer[minisize] = '\0'; - - BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), - false, memcpy(buffer, mini_buffer, p.size()), repeat, volume, - !just_data); - - size_t ast_buffer_size = p.size() * 2; - size_t *ast_buffer = (size_t *)malloc(ast_buffer_size * sizeof(size_t)); - - BEST_TIME( - "sajson orig", - sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size), - sajson::mutable_string_view(p.size(), buffer)) - .is_valid(), - true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); - - BEST_TIME( - "sajson despaced", - sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size), - sajson::mutable_string_view(minisize, buffer)) - .is_valid(), - true, memcpy(buffer, mini_buffer, p.size()), repeat, volume, !just_data); - - simdjson::dom::parser parser; - bool automated_reallocation = false; - BEST_TIME("simdjson orig", - parser.parse((const uint8_t *)buffer, p.size(), - automated_reallocation).error(), - simdjson::SUCCESS, memcpy(buffer, p.data(), p.size()), repeat, volume, - !just_data); - BEST_TIME("simdjson despaced", - parser.parse((const uint8_t *)buffer, minisize, - automated_reallocation).error(), - simdjson::SUCCESS, memcpy(buffer, mini_buffer, p.size()), repeat, volume, - !just_data); - - free(buffer); - free(ast_buffer); - free(mini_buffer); -} diff --git a/benchmark/parseandstatcompetition.cpp b/benchmark/parseandstatcompetition.cpp deleted file mode 100644 index 
442bad32b9..0000000000 --- a/benchmark/parseandstatcompetition.cpp +++ /dev/null @@ -1,489 +0,0 @@ -#include "simdjson.h" -#include - -#include "benchmark.h" - -SIMDJSON_PUSH_DISABLE_ALL_WARNINGS - -// #define RAPIDJSON_SSE2 // bad for performance -// #define RAPIDJSON_SSE42 // bad for performance -#include "rapidjson/document.h" -#include "rapidjson/reader.h" -#include "rapidjson/stringbuffer.h" -#include "rapidjson/writer.h" - -#include "sajson.h" - -SIMDJSON_POP_DISABLE_WARNINGS - -using namespace rapidjson; -using namespace simdjson; -struct stat_s { - size_t number_count; - size_t object_count; - size_t array_count; - size_t null_count; - size_t true_count; - size_t false_count; - bool valid; -}; - -typedef struct stat_s stat_t; - -bool stat_equal(const stat_t &s1, const stat_t &s2) { - return (s1.valid == s2.valid) && (s1.number_count == s2.number_count) && - (s1.object_count == s2.object_count) && - (s1.array_count == s2.array_count) && - (s1.null_count == s2.null_count) && (s1.true_count == s2.true_count) && - (s1.false_count == s2.false_count); -} - -void print_stat(const stat_t &s) { - if (!s.valid) { - printf("invalid\n"); - return; - } - printf("number: %zu object: %zu array: %zu null: %zu true: %zu false: %zu\n", - s.number_count, s.object_count, s.array_count, s.null_count, - s.true_count, s.false_count); -} - -really_inline void simdjson_process_atom(stat_t &s, - simdjson::dom::element element) { - if (element.is()) { - s.number_count++; - } else if (element.is()) { - simdjson::error_code error; - bool v; - if (not (error = element.get(v)) && v) { - s.true_count++; - } else { - s.false_count++; - } - } else if (element.is_null()) { - s.null_count++; - } -} - -void simdjson_recurse(stat_t &s, simdjson::dom::element element) { - error_code error; - if (element.is()) { - s.array_count++; - dom::array array; - if ((error = element.get(array))) { - std::cerr << error << std::endl; - abort(); - } - for (auto child : array) { - if (child.is() || - 
child.is()) { - simdjson_recurse(s, child); - } else { - simdjson_process_atom(s, child); - } - } - } else if (element.is()) { - s.object_count++; - dom::object object; - if ((error = element.get(object))) { - std::cerr << error << std::endl; - abort(); - } - for (auto field : object) { - if (field.value.is() || - field.value.is()) { - simdjson_recurse(s, field.value); - } else { - simdjson_process_atom(s, field.value); - } - } - } else { - simdjson_process_atom(s, element); - } -} - -never_inline stat_t simdjson_compute_stats(const simdjson::padded_string &p) { - stat_t s{}; - simdjson::dom::parser parser; - simdjson::dom::element doc; - auto error = parser.parse(p).get(doc); - if (error) { - s.valid = false; - return s; - } - s.valid = true; - simdjson_recurse(s, doc); - return s; -} - -/// -struct Stat { - size_t objectCount; - size_t arrayCount; - size_t numberCount; - size_t stringCount; - size_t trueCount; - size_t falseCount; - size_t nullCount; - - size_t memberCount; // Number of members in all objects - size_t elementCount; // Number of elements in all arrays - size_t stringLength; // Number of code units in all strings -}; - -static void GenStatPlus(Stat &stat, const dom::element v) { - switch (v.type()) { - case dom::element_type::ARRAY: - for (dom::element child : dom::array(v)) { - GenStatPlus(stat, child); - stat.elementCount++; - } - stat.arrayCount++; - break; - case dom::element_type::OBJECT: - for (dom::key_value_pair kv : dom::object(v)) { - GenStatPlus(stat, kv.value); - stat.stringLength += kv.key.size(); - stat.memberCount++; - stat.stringCount++; - } - stat.objectCount++; - break; - case dom::element_type::INT64: - case dom::element_type::UINT64: - case dom::element_type::DOUBLE: - stat.numberCount++; - break; - case dom::element_type::STRING: { - stat.stringCount++; - auto sv = std::string_view(v); - stat.stringLength += sv.size(); - } break; - case dom::element_type::BOOL: - if (bool(v)) { - stat.trueCount++; - } else { - 
stat.falseCount++; - } - break; - case dom::element_type::NULL_VALUE: - ++stat.nullCount; - break; - } -} - -static void RapidGenStat(Stat &stat, const rapidjson::Value &v) { - switch (v.GetType()) { - case kNullType: - stat.nullCount++; - break; - case kFalseType: - stat.falseCount++; - break; - case kTrueType: - stat.trueCount++; - break; - - case kObjectType: - for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd(); - ++m) { - stat.stringLength += m->name.GetStringLength(); - RapidGenStat(stat, m->value); - } - stat.objectCount++; - stat.memberCount += (v.MemberEnd() - v.MemberBegin()); - stat.stringCount += (v.MemberEnd() - v.MemberBegin()); // Key - break; - - case kArrayType: - for (Value::ConstValueIterator i = v.Begin(); i != v.End(); ++i) - RapidGenStat(stat, *i); - stat.arrayCount++; - stat.elementCount += v.Size(); - break; - - case kStringType: - stat.stringCount++; - stat.stringLength += v.GetStringLength(); - break; - - case kNumberType: - stat.numberCount++; - break; - } -} -never_inline Stat rapidjson_compute_stats_ref(const rapidjson::Value &doc) { - Stat s{}; - RapidGenStat(s, doc); - return s; -} - -never_inline Stat -simdjson_compute_stats_refplus(const simdjson::dom::element &doc) { - Stat s{}; - GenStatPlus(s, doc); - return s; -} - -// see -// https://github.com/miloyip/nativejson-benchmark/blob/master/src/tests/sajsontest.cpp -void sajson_traverse(stat_t &stats, const sajson::value &node) { - using namespace sajson; - switch (node.get_type()) { - case TYPE_NULL: - stats.null_count++; - break; - case TYPE_FALSE: - stats.false_count++; - break; - case TYPE_TRUE: - stats.true_count++; - break; - case TYPE_ARRAY: { - stats.array_count++; - auto length = node.get_length(); - for (size_t i = 0; i < length; ++i) { - sajson_traverse(stats, node.get_array_element(i)); - } - break; - } - case TYPE_OBJECT: { - stats.object_count++; - auto length = node.get_length(); - for (auto i = 0u; i < length; ++i) { - sajson_traverse(stats, 
node.get_object_value(i)); - } - break; - } - case TYPE_STRING: - // skip - break; - - case TYPE_DOUBLE: - case TYPE_INTEGER: - stats.number_count++; // node.get_number_value(); - break; - default: - assert(false && "unknown node type"); - } -} - -never_inline stat_t sasjon_compute_stats(const simdjson::padded_string &p) { - stat_t answer{}; - char *buffer = (char *)malloc(p.size()); - if (buffer == nullptr) { - return answer; - } - memcpy(buffer, p.data(), p.size()); - auto d = sajson::parse(sajson::dynamic_allocation(), - sajson::mutable_string_view(p.size(), buffer)); - answer.valid = d.is_valid(); - if (!answer.valid) { - free(buffer); - return answer; - } - answer.number_count = 0; - answer.object_count = 0; - answer.array_count = 0; - answer.null_count = 0; - answer.true_count = 0; - answer.false_count = 0; - sajson_traverse(answer, d.get_root()); - free(buffer); - return answer; -} - -void rapid_traverse(stat_t &stats, const rapidjson::Value &v) { - switch (v.GetType()) { - case kNullType: - stats.null_count++; - break; - case kFalseType: - stats.false_count++; - break; - case kTrueType: - stats.true_count++; - break; - - case kObjectType: - for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd(); - ++m) { - rapid_traverse(stats, m->value); - } - stats.object_count++; - break; - case kArrayType: - for (Value::ConstValueIterator i = v.Begin(); i != v.End(); - ++i) { // v.Size(); - rapid_traverse(stats, *i); - } - stats.array_count++; - break; - - case kStringType: - break; - - case kNumberType: - stats.number_count++; - break; - } -} - -never_inline stat_t rapid_compute_stats(const simdjson::padded_string &p) { - stat_t answer{}; - char *buffer = (char *)malloc(p.size() + 1); - if (buffer == nullptr) { - return answer; - } - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] = '\0'; - rapidjson::Document d; - d.ParseInsitu(buffer); - answer.valid = !d.HasParseError(); - if (!answer.valid) { - free(buffer); - return answer; - } - 
answer.number_count = 0; - answer.object_count = 0; - answer.array_count = 0; - answer.null_count = 0; - answer.true_count = 0; - answer.false_count = 0; - rapid_traverse(answer, d); - free(buffer); - return answer; -} - -never_inline stat_t -rapid_accurate_compute_stats(const simdjson::padded_string &p) { - stat_t answer{}; - char *buffer = (char *)malloc(p.size() + 1); - if (buffer == nullptr) { - return answer; - } - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] = '\0'; - rapidjson::Document d; - d.ParseInsitu(buffer); - answer.valid = !d.HasParseError(); - if (!answer.valid) { - free(buffer); - return answer; - } - answer.number_count = 0; - answer.object_count = 0; - answer.array_count = 0; - answer.null_count = 0; - answer.true_count = 0; - answer.false_count = 0; - rapid_traverse(answer, d); - free(buffer); - return answer; -} -int main(int argc, char *argv[]) { - bool verbose = false; - bool just_data = false; - - int c; - while ((c = getopt(argc, argv, "vt")) != -1) - switch (c) { - case 't': - just_data = true; - break; - case 'v': - verbose = true; - break; - default: - abort(); - } - if (optind >= argc) { - std::cerr - << "Using different parsers, we compute the content statistics of " - "JSON documents." 
- << std::endl; - std::cerr << "Usage: " << argv[0] << " " << std::endl; - std::cerr << "Or " << argv[0] << " -v " << std::endl; - exit(1); - } - const char *filename = argv[optind]; - if (optind + 1 < argc) { - std::cerr << "warning: ignoring everything after " << argv[optind + 1] - << std::endl; - } - simdjson::padded_string p; - auto error = simdjson::padded_string::load(filename).get(p); - if (error) { - std::cerr << "Could not load the file " << filename << std::endl; - return EXIT_FAILURE; - } - // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte - if (verbose) { - std::cout << "Input has "; - if (p.size() > 1000 * 1000) - std::cout << p.size() / (1000 * 1000) << " MB "; - else if (p.size() > 1000) - std::cout << p.size() / 1000 << " KB "; - else - std::cout << p.size() << " B "; - std::cout << std::endl; - } - stat_t s1 = simdjson_compute_stats(p); - if (verbose) { - printf("simdjson: "); - print_stat(s1); - } - stat_t s2 = rapid_compute_stats(p); - if (verbose) { - printf("rapid: "); - print_stat(s2); - } - stat_t s2a = rapid_accurate_compute_stats(p); - if (verbose) { - printf("rapid full: "); - print_stat(s2a); - } - stat_t s3 = sasjon_compute_stats(p); - if (verbose) { - printf("sasjon: "); - print_stat(s3); - } - assert(stat_equal(s1, s2)); - assert(stat_equal(s1, s3)); - int repeat = 50; - size_t volume = p.size(); - if (just_data) { - printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n"); - } - BEST_TIME("simdjson ", simdjson_compute_stats(p).valid, true, , - repeat, volume, !just_data); - BEST_TIME("RapidJSON ", rapid_compute_stats(p).valid, true, , - repeat, volume, !just_data); - BEST_TIME("RapidJSON (precise) ", rapid_accurate_compute_stats(p).valid, true, - , repeat, volume, !just_data); - BEST_TIME("sasjon ", sasjon_compute_stats(p).valid, true, , - repeat, volume, !just_data); - if (!just_data) { - printf("API traversal tests\n"); - printf("Based on https://github.com/miloyip/nativejson-benchmark\n"); - 
simdjson::dom::parser parser; - simdjson::dom::element doc; - auto error = parser.parse(p).get(doc); - if (error) { - std::cerr << error << std::endl; - } - size_t refval = simdjson_compute_stats_refplus(doc).objectCount; - - BEST_TIME("simdjson ", - simdjson_compute_stats_refplus(doc).objectCount, refval, , repeat, - volume, !just_data); - char *buffer = (char *)malloc(p.size() + 1); - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] = '\0'; - rapidjson::Document d; - d.ParseInsitu(buffer); - BEST_TIME("rapid ", rapidjson_compute_stats_ref(d).objectCount, - refval, , repeat, volume, !just_data); - free(buffer); - } -} diff --git a/benchmark/parsingcompetition.cpp b/benchmark/parsingcompetition.cpp deleted file mode 100644 index 509e440876..0000000000 --- a/benchmark/parsingcompetition.cpp +++ /dev/null @@ -1,361 +0,0 @@ -#include "simdjson.h" - -#include -#ifndef _MSC_VER -#include "linux-perf-events.h" -#ifdef __linux__ -#include -#endif //__linux__ -#endif // _MSC_VER - -#include - -#include "benchmark.h" - -SIMDJSON_PUSH_DISABLE_ALL_WARNINGS - -// #define RAPIDJSON_SSE2 // bad for performance -// #define RAPIDJSON_SSE42 // bad for performance -#include "rapidjson/document.h" -#include "rapidjson/reader.h" -#include "rapidjson/stringbuffer.h" -#include "rapidjson/writer.h" - -#include "sajson.h" - -#include -using json = nlohmann::json; - -#ifdef ALLPARSER - -#include "fastjson.cpp" -#include "fastjson_dom.cpp" -#include "gason.cpp" - -#include "json11.cpp" -extern "C" { -#include "cJSON.c" -#include "cJSON.h" -#include "jsmn.c" -#include "jsmn.h" -#include "ujdecode.h" -#include "ultrajsondec.c" -} - -#include "jsoncpp.cpp" -#include "json/json.h" - -#endif - -SIMDJSON_POP_DISABLE_WARNINGS - -using namespace rapidjson; - -#ifdef ALLPARSER -// fastjson has a tricky interface -void on_json_error(void *, UNUSED const fastjson::ErrorContext &ec) { - // std::cerr<<"ERROR: "<((50000000 * repeat_multiplier) / static_cast(p.size())); - if (repeat < 10) { repeat = 
10; } - // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte - if (verbose) { - std::cout << "Input " << filename << " has "; - if (p.size() > 1000 * 1000) - std::cout << p.size() / (1000 * 1000) << " MB"; - else if (p.size() > 1000) - std::cout << p.size() / 1000 << " KB"; - else - std::cout << p.size() << " B"; - std::cout << ": will run " << repeat << " iterations." << std::endl; - } - size_t volume = p.size(); - if (just_data) { - printf("%-42s %20s %20s %20s %20s \n", "name", "cycles_per_byte", - "cycles_per_byte_err", "gb_per_s", "gb_per_s_err"); - } - if (!just_data) { - const std::string inputcopy(p.data(), p.data()+p.size()); - std::stringstream is; - is.str(inputcopy); - const size_t lc = sum_line_lengths(is); - BEST_TIME("getline ",sum_line_lengths(is) , lc, reset_stream(is), - repeat, volume, !just_data); - } - - if (!just_data) { - auto parse_dynamic=[](auto& str){ - simdjson::dom::parser parser; - return parser.parse(str).error(); - }; - BEST_TIME("simdjson (dynamic mem) ", parse_dynamic(p), simdjson::SUCCESS, - , repeat, volume, !just_data); - } - // (static alloc) - simdjson::dom::parser parser; - BEST_TIME("simdjson ", parser.parse(p).error(), simdjson::SUCCESS, , repeat, volume, - !just_data); - - rapidjson::Document d; - - char *buffer = (char *)malloc(p.size() + 1); - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] = '\0'; -#ifndef ALLPARSER - if (!just_data) -#endif - { - memcpy(buffer, p.data(), p.size()); - BEST_TIME("RapidJSON ", - d.Parse((const char *)buffer) - .HasParseError(), - false, , repeat, volume, - !just_data); - } -#ifndef ALLPARSER - if (!just_data) -#endif - { - memcpy(buffer, p.data(), p.size()); - BEST_TIME("RapidJSON (accurate number parsing) ", - d.Parse((const char *)buffer) - .HasParseError(), - false, , repeat, volume, - !just_data); - } - BEST_TIME("RapidJSON (insitu)", - d.ParseInsitu(buffer).HasParseError(), - false, - memcpy(buffer, p.data(), p.size()) && (buffer[p.size()] = '\0'), - repeat, volume, 
!just_data); - BEST_TIME("RapidJSON (insitu, accurate number parsing)", - d.ParseInsitu(buffer).HasParseError(), - false, - memcpy(buffer, p.data(), p.size()) && (buffer[p.size()] = '\0'), - repeat, volume, !just_data); -#ifndef ALLPARSER - if (!just_data) -#endif - BEST_TIME("sajson (dynamic mem)", - sajson::parse(sajson::dynamic_allocation(), - sajson::mutable_string_view(p.size(), buffer)) - .is_valid(), - true, memcpy(buffer, p.data(), p.size()), repeat, volume, - !just_data); - - size_t ast_buffer_size = p.size(); - size_t *ast_buffer = (size_t *)malloc(ast_buffer_size * sizeof(size_t)); - // (static alloc, insitu) - BEST_TIME( - "sajson", - sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size), - sajson::mutable_string_view(p.size(), buffer)) - .is_valid(), - true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); - - memcpy(buffer, p.data(), p.size()); - size_t expected = json::parse(p.data(), p.data() + p.size()).size(); - BEST_TIME("nlohmann-json", json::parse(buffer, buffer + p.size()).size(), - expected, , repeat, volume, - !just_data); - -#ifdef ALLPARSER - std::string json11err; - BEST_TIME("dropbox (json11) ", - ((json11::Json::parse(buffer, json11err).is_null()) || - (!json11err.empty())), - false, memcpy(buffer, p.data(), p.size()), repeat, volume, - !just_data); - - BEST_TIME("fastjson ", fastjson_parse(buffer), true, - memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); - JsonValue value; - JsonAllocator allocator; - char *endptr; - BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator), - JSON_OK, memcpy(buffer, p.data(), p.size()), repeat, volume, - !just_data); - void *state; - BEST_TIME("ultrajson ", - (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, - memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data); - - { - std::unique_ptr tokens = - std::make_unique(p.size()); - jsmn_parser jparser; - jsmn_init(&jparser); - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] 
= '\0'; - BEST_TIME( - "jsmn ", - (jsmn_parse(&jparser, buffer, p.size(), tokens.get(), static_cast(p.size())) > 0), - true, jsmn_init(&jparser), repeat, volume, !just_data); - } - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] = '\0'; - cJSON *tree = cJSON_Parse(buffer); - BEST_TIME("cJSON ", ((tree = cJSON_Parse(buffer)) != NULL), true, - cJSON_Delete(tree), repeat, volume, !just_data); - cJSON_Delete(tree); - - Json::CharReaderBuilder b; - Json::CharReader *json_cpp_reader = b.newCharReader(); - Json::Value root; - Json::String errs; - BEST_TIME("jsoncpp ", - json_cpp_reader->parse(buffer, buffer + volume, &root, &errs), true, - , repeat, volume, !just_data); - delete json_cpp_reader; -#endif - if (!just_data) - BEST_TIME("memcpy ", - (memcpy(buffer, p.data(), p.size()) == buffer), true, , repeat, - volume, !just_data); -#ifdef __linux__ - if (!just_data) { - printf("\n \n \n"); - std::vector evts; - evts.push_back(PERF_COUNT_HW_CPU_CYCLES); - evts.push_back(PERF_COUNT_HW_INSTRUCTIONS); - evts.push_back(PERF_COUNT_HW_BRANCH_MISSES); - evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES); - evts.push_back(PERF_COUNT_HW_CACHE_MISSES); - LinuxEvents unified(evts); - std::vector results; - std::vector stats; - results.resize(evts.size()); - stats.resize(evts.size()); - std::fill(stats.begin(), stats.end(), 0); // unnecessary - for (decltype(repeat) i = 0; i < repeat; i++) { - unified.start(); - auto parse_error = parser.parse(p).error(); - if (parse_error) - printf("bug\n"); - unified.end(results); - std::transform(stats.begin(), stats.end(), results.begin(), stats.begin(), - std::plus()); - } - printf("simdjson : cycles %10.0f instructions %10.0f branchmisses %10.0f " - "cacheref %10.0f cachemisses %10.0f bytespercachemiss %10.0f " - "inspercycle %10.1f insperbyte %10.1f\n", - static_cast(stats[0]) / static_cast(repeat), static_cast(stats[1]) / static_cast(repeat), - static_cast(stats[2]) / static_cast(repeat), static_cast(stats[3]) / static_cast(repeat), - 
static_cast(stats[4]) / static_cast(repeat), static_cast(volume) * static_cast(repeat) / static_cast(stats[2]), - static_cast(stats[1]) / static_cast(stats[0]), static_cast(stats[1]) / (static_cast(volume) * static_cast(repeat))); - - std::fill(stats.begin(), stats.end(), 0); - for (decltype(repeat) i = 0; i < repeat; i++) { - memcpy(buffer, p.data(), p.size()); - buffer[p.size()] = '\0'; - unified.start(); - if (d.ParseInsitu(buffer).HasParseError() != - false) - printf("bug\n"); - unified.end(results); - std::transform(stats.begin(), stats.end(), results.begin(), stats.begin(), - std::plus()); - } - printf("RapidJSON: cycles %10.0f instructions %10.0f branchmisses %10.0f " - "cacheref %10.0f cachemisses %10.0f bytespercachemiss %10.0f " - "inspercycle %10.1f insperbyte %10.1f\n", - static_cast(stats[0]) / static_cast(repeat), static_cast(stats[1]) / static_cast(repeat), - static_cast(stats[2]) / static_cast(repeat), static_cast(stats[3]) / static_cast(repeat), - static_cast(stats[4]) / static_cast(repeat), static_cast(volume) * static_cast(repeat) / static_cast(stats[2]), - static_cast(stats[1]) / static_cast(stats[0]), static_cast(stats[1]) / (static_cast(volume) * static_cast(repeat))); - - std::fill(stats.begin(), stats.end(), 0); // unnecessary - for (decltype(repeat) i = 0; i < repeat; i++) { - memcpy(buffer, p.data(), p.size()); - unified.start(); - if (sajson::parse(sajson::bounded_allocation(ast_buffer, ast_buffer_size), - sajson::mutable_string_view(p.size(), buffer)) - .is_valid() != true) - printf("bug\n"); - unified.end(results); - std::transform(stats.begin(), stats.end(), results.begin(), stats.begin(), - std::plus()); - } - printf("sajson : cycles %10.0f instructions %10.0f branchmisses %10.0f " - "cacheref %10.0f cachemisses %10.0f bytespercachemiss %10.0f " - "inspercycle %10.1f insperbyte %10.1f\n", - static_cast(stats[0]) / static_cast(repeat), static_cast(stats[1]) / static_cast(repeat), - static_cast(stats[2]) / static_cast(repeat), 
static_cast(stats[3]) / static_cast(repeat), - static_cast(stats[4]) / static_cast(repeat), static_cast(volume) * static_cast(repeat) / static_cast(stats[2]), - static_cast(stats[1]) / static_cast(stats[0]), static_cast(stats[1]) / (static_cast(volume) * static_cast(repeat))); - - } -#endif // __linux__ - - free(ast_buffer); - free(buffer); - return true; -} - -int main(int argc, char *argv[]) { - bool verbose = false; - bool just_data = false; - double repeat_multiplier = 1; - int c; - while ((c = getopt(argc, argv, "r:vt")) != -1) - switch (c) { - case 'r': - repeat_multiplier = atof(optarg); - break; - case 't': - just_data = true; - break; - case 'v': - verbose = true; - break; - default: - abort(); - } - if (optind >= argc) { - std::cerr << "Usage: " << argv[0] << " " << std::endl; - std::cerr << "Or " << argv[0] << " -v " << std::endl; - std::cerr << "The '-t' flag outputs a table." << std::endl; - std::cerr << "The '-r ' flag sets the repeat multiplier: set it above 1 to do more iterations, and below 1 to do fewer." << std::endl; - exit(1); - } - int result = EXIT_SUCCESS; - for (int fileind = optind; fileind < argc; fileind++) { - if (!bench(argv[fileind], verbose, just_data, repeat_multiplier)) { result = EXIT_FAILURE; } - printf("\n\n"); - } - return result; -} diff --git a/benchmark/partial_tweets/boostjson.h b/benchmark/partial_tweets/boostjson.h new file mode 100644 index 0000000000..b402b35f41 --- /dev/null +++ b/benchmark/partial_tweets/boostjson.h @@ -0,0 +1,43 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_BOOSTJSON + +#include "partial_tweets.h" + +namespace partial_tweets { + +struct boostjson { + using StringType=std::string; + + bool run(simdjson::padded_string &json, std::vector> &result) { + + auto root = boost::json::parse(json); + for (const auto &tweet : root.at("statuses").as_array()) { + const auto &user = tweet.at("user"); + + auto in_reply_to_status_id = tweet.as_object().if_contains("in_reply_to_status_id") + ? 
tweet.at("in_reply_to_status_id") : boost::json::value(); + + result.emplace_back(partial_tweets::tweet{ + tweet.at("created_at").as_string().c_str(), + tweet.at("id").to_number(), + tweet.at("text").as_string().c_str(), + in_reply_to_status_id.is_null() ? 0 : in_reply_to_status_id.to_number(), + { + user.at("id").to_number(), + user.at("screen_name").as_string().c_str() + }, + tweet.at("retweet_count").to_number(), + tweet.at("favorite_count").to_number() + }); + } + + return true; + } +}; + +BENCHMARK_TEMPLATE(partial_tweets, boostjson)->UseManualTime(); + +} // namespace partial_tweets + +#endif // SIMDJSON_COMPETITION_BOOSTJSON diff --git a/benchmark/partial_tweets/nlohmann_json.h b/benchmark/partial_tweets/nlohmann_json.h new file mode 100644 index 0000000000..b2b015b823 --- /dev/null +++ b/benchmark/partial_tweets/nlohmann_json.h @@ -0,0 +1,40 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "partial_tweets.h" + +namespace partial_tweets { + +struct nlohmann_json { + using StringType=std::string; + + simdjson_inline uint64_t nullable_int(nlohmann::json value) { + if (value.is_null()) { return 0; } + return value; + } + + bool run(simdjson::padded_string &json, std::vector> &result) { + auto root = nlohmann::json::parse(json.data(), json.data() + json.size()); + for (auto tweet : root["statuses"]) { + auto user = tweet["user"]; + result.emplace_back(partial_tweets::tweet{ + tweet["created_at"], + tweet["id"], + tweet["text"], + nullable_int(tweet["in_reply_to_status_id"]), + { user["id"], user["screen_name"] }, + tweet["retweet_count"], + tweet["favorite_count"] + }); + } + + return true; + } +}; + +BENCHMARK_TEMPLATE(partial_tweets, nlohmann_json)->UseManualTime(); + +} // namespace partial_tweets + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON diff --git a/benchmark/partial_tweets/nlohmann_json_sax.h b/benchmark/partial_tweets/nlohmann_json_sax.h new file mode 100644 index 0000000000..8261713eaf --- /dev/null +++ 
b/benchmark/partial_tweets/nlohmann_json_sax.h @@ -0,0 +1,159 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "partial_tweets.h" + +namespace partial_tweets { + +using json = nlohmann::json; + +struct nlohmann_json_sax { + using StringType=std::string; + + struct Handler : json::json_sax_t + { + // 8 keys to parse for each tweet (in order of appearance): "created_at", "id", "text", "in_reply_status_id", "id"(user), + // "screen_name"(user), "retweet_count" and "favorite_count". + // Assume that the first valid key encountered will be the correct key to parse. + // Assume that each tweet/retweet start with a key "metadata" and has a key "retweeted" towards the end + // The previous assumption will be used to check for the beginning of a new tweet and the end of a retweet + enum state { // Bitset to store state of search + key_date = (1<<0), + key_id = (1<<1), + key_text = (1<<2), + key_reply = (1<<3), + key_userid = (1<<4), + key_screenname = (1<<5), + key_rt = (1<<6), + key_fav = (1<<7), + found_date = (1<<8), + found_id = (1<<9), + found_text = (1<<10), + found_reply = (1<<11), + found_userid = (1<<12), + found_screenname = (1<<13), + found_rt = (1<<14), + found_fav = (1<<15) + }; + int values = state::key_date; + bool userobject_id = false; // If in a user object (to find user.id) + bool userobject_screen_name = false; // If in a user object (to find user.screen_name) + bool inretweet = false; // If in a retweet (all keys irrelevant in retweet object) + // Fields to store partial tweet info + uint64_t user_id; + uint64_t id; + uint64_t rt; + uint64_t fav; + uint64_t reply_status; + string_t screen_name; + string_t date; + string_t text; + std::vector>& result; + + Handler(std::vector> &r) : result(r) { } + + bool key(string_t& val) override { + if (!inretweet) { // If not in a retweet object, find relevant keys + if (val.compare("retweeted_status") == 0) { inretweet = true; } // Check if entering retweet + else if 
(val.compare("metadata") == 0) { values = 0; } // Reset + // Check if key has been found and if key matches a valid key + else if (!(values & found_date) && (val.compare("created_at") == 0)) { values |= (key_date); } + // Must also check if not in a user object + else if (!(values & found_id) && !userobject_id && (val.compare("id") == 0)) { values |= (key_id); } + else if (!(values & found_text) && (val.compare("text") == 0)) { values |= (key_text); } + else if (!(values & found_reply) && (val.compare("in_reply_to_status_id") == 0)) { values |= (key_reply); } + // Check if entering user object + else if ((val.compare("user") == 0)) { userobject_id = userobject_screen_name = true; } + // Must also check if in a user object + else if (!(values & found_userid) && userobject_id && (val.compare("id") == 0)) { values |= (key_userid); } + // Must also check if in a user object + else if (!(values & found_screenname) && userobject_screen_name && (val.compare("screen_name") == 0)) { values |= (key_screenname); } + else if (!(values & found_rt) && (val.compare("retweet_count") == 0)) { values |= (key_rt); } + else if (!(values & found_fav) && (val.compare("favorite_count") == 0)) { values |= (key_fav); } + } + else if (val.compare("retweeted") == 0) { inretweet = false; } // Check if end of retweet + return true; + } + bool number_unsigned(number_unsigned_t val) override { + if (values & key_id && !(values & found_id)) { // id + id = val; + values &= ~(key_id); + values |= (found_id); + } + else if (values & key_reply && !(values & found_reply)) { // in_reply_status_id + reply_status = val; + values &= ~(key_reply); + values |= (found_reply); + } + else if (values & key_userid && !(values & found_userid)) { // user.id + user_id = val; + userobject_id = false; + values &= ~(key_userid); + values |= (found_userid); + } + else if (values & key_rt && !(values & found_rt)) { // retweet_count + rt = val; + values &= ~(key_rt); + values |= (found_rt); + } + else if (values & 
key_fav && !(values & found_fav)) { // favorite_count + fav = val; + values &= ~(key_fav); + values |= (found_fav); + // Assume that this is last key required, so add the partial_tweet to result + result.emplace_back(partial_tweets::tweet{ + date,id,text,reply_status,{user_id,screen_name},rt,fav}); + } + return true; + } + bool string(string_t& val) override { + if (values & key_date && !(values & found_date)) { // created_at + date = val; + values &= ~(key_date); + values |= (found_date); + } + else if (values & key_text && !(values & found_text)) { // text + text = val; + values &= ~(key_text); + values |= (found_text); + } + else if (values & key_screenname && !(values & found_screenname)) { // user.screen_name + screen_name = val; + userobject_screen_name = false; + values &= ~(key_screenname); + values |= (found_screenname); + } + return true; + } + bool null() override { + if (values & key_reply && !(values & found_reply)) { // in_reply_status (null case) + reply_status = 0; + values &= ~(key_reply); + values |= (found_reply); + } + return true; + } + // Irrelevant events + bool boolean(bool val) override { return true; } + bool number_float(number_float_t val, const string_t& s) override { return true; } + bool number_integer(number_integer_t val) override { return true; } + bool start_object(std::size_t elements) override { return true; } + bool end_object() override { return true; } + bool start_array(std::size_t elements) override { return true; } + bool end_array() override { return true; } + bool binary(json::binary_t& val) override { return true; } + bool parse_error(std::size_t position, const std::string& last_token, const json::exception& ex) override { return false; } + }; // Handler + + bool run(simdjson::padded_string &json, std::vector> &result) { + Handler handler(result); + json::sax_parse(json.data(), &handler); + + return true; + } +}; // nlohmann_json_sax +BENCHMARK_TEMPLATE(partial_tweets, nlohmann_json_sax)->UseManualTime(); +} // 
namespace partial_tweets + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/partial_tweets/ondemand.h b/benchmark/partial_tweets/ondemand.h new file mode 100644 index 0000000000..4c40ed14ff --- /dev/null +++ b/benchmark/partial_tweets/ondemand.h @@ -0,0 +1,63 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "partial_tweets.h" + +namespace partial_tweets { + +using namespace simdjson; + + +class OnDemand { +public: + OnDemand() { + if(!displayed_implementation) { + std::cout << "On-Demand implementation: " << builtin_implementation()->name() << std::endl; + displayed_implementation = true; + } + } + simdjson_inline bool Run(const padded_string &json); + simdjson_inline const std::vector &Result() { return tweets; } + simdjson_inline size_t ItemCount() { return tweets.size(); } + +private: + ondemand::parser parser{}; + std::vector tweets{}; + + simdjson_inline uint64_t nullable_int(ondemand::value value) { + if (value.is_null()) { return 0; } + return value; + } + + simdjson_inline twitter_user read_user(ondemand::object user) { + return { user.find_field("id"), user.find_field("screen_name") }; + } + + static inline bool displayed_implementation = false; +}; + +simdjson_inline bool OnDemand::Run(const padded_string &json) { + tweets.clear(); + + // Walk the document, parsing the tweets as we go + auto doc = parser.iterate(json); + for (ondemand::object tweet : doc.find_field("statuses")) { + tweets.emplace_back(partial_tweets::tweet{ + tweet.find_field("created_at"), + tweet.find_field("id"), + tweet.find_field("text"), + nullable_int(tweet.find_field("in_reply_to_status_id")), + read_user(tweet.find_field("user")), + tweet.find_field("retweet_count"), + tweet.find_field("favorite_count") + }); + } + return true; +} + +BENCHMARK_TEMPLATE(PartialTweets, OnDemand); + +} // namespace partial_tweets + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/partial_tweets/partial_tweets.h 
b/benchmark/partial_tweets/partial_tweets.h new file mode 100644 index 0000000000..406d0cf77f --- /dev/null +++ b/benchmark/partial_tweets/partial_tweets.h @@ -0,0 +1,46 @@ + +#pragma once + +#include "json_benchmark/file_runner.h" +#include "tweet.h" +#include + +namespace partial_tweets { + +using namespace json_benchmark; + +template +struct runner : public file_runner { + std::vector> result{}; + + bool setup(benchmark::State &state) { + return this->load_json(state, TWITTER_JSON); + } + + bool before_run(benchmark::State &state) { + if (!file_runner::before_run(state)) { return false; } + result.clear(); + return true; + } + + bool run(benchmark::State &) { + return this->implementation.run(this->json, result); + } + + template + bool diff(benchmark::State &state, runner &reference) { + return diff_results(state, result, reference.result, diff_flags::NONE); + } + + size_t items_per_iteration() { + return result.size(); + } +}; + +struct simdjson_dom; + +template simdjson_inline static void partial_tweets(benchmark::State &state) { + run_json_benchmark, runner>(state); +} + +} // namespace partial_tweets diff --git a/benchmark/partial_tweets/rapidjson.h b/benchmark/partial_tweets/rapidjson.h new file mode 100644 index 0000000000..e622969439 --- /dev/null +++ b/benchmark/partial_tweets/rapidjson.h @@ -0,0 +1,80 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "partial_tweets.h" + +namespace partial_tweets { + +using namespace rapidjson; + +struct rapidjson_base { + using StringType=std::string_view; + + Document doc{}; + + simdjson_inline std::string_view get_string_view(Value &object, std::string_view key) { + // TODO use version that supports passing string length? 
+ auto field = object.FindMember(key.data()); + if (field == object.MemberEnd()) { throw "Missing object field"; } + if (!field->value.IsString()) { throw "Field is not a string"; } + return { field->value.GetString(), field->value.GetStringLength() }; + } + simdjson_inline uint64_t get_uint64(Value &object, std::string_view key) { + auto field = object.FindMember(key.data()); + if (field == object.MemberEnd()) { throw "Missing object field"; } + if (!field->value.IsUint64()) { throw "Field is not uint64"; } + return field->value.GetUint64(); + } + simdjson_inline uint64_t get_nullable_uint64(Value &object, std::string_view key) { + auto field = object.FindMember(key.data()); + if (field == object.MemberEnd()) { throw "Missing nullable uint64 field"; } + if (field->value.IsNull()) { return 0; } + if (!field->value.IsUint64()) { throw "Field is not nullable uint64"; } + return field->value.GetUint64(); + } + simdjson_inline partial_tweets::twitter_user get_user(Value &object, std::string_view key) { + auto field = object.FindMember(key.data()); + if (field == object.MemberEnd()) { throw "Missing user field"; } + if (!field->value.IsObject()) { throw "User field is not an object"; } + return { get_uint64(field->value, "id"), get_string_view(field->value, "screen_name") }; + } + + bool run(Document &root, std::vector> &result) { + if (root.HasParseError() || !root.IsObject()) { return false; } + auto statuses = root.FindMember("statuses"); + if (statuses == root.MemberEnd() || !statuses->value.IsArray()) { return false; } + for (auto &tweet : statuses->value.GetArray()) { + if (!tweet.IsObject()) { return false; } + result.emplace_back(partial_tweets::tweet{ + get_string_view(tweet, "created_at"), + get_uint64 (tweet, "id"), + get_string_view(tweet, "text"), + get_nullable_uint64 (tweet, "in_reply_to_status_id"), + get_user (tweet, "user"), + get_uint64 (tweet, "retweet_count"), + get_uint64 (tweet, "favorite_count") + }); + } + + return true; + } +}; + +struct 
rapidjson : rapidjson_base { + bool run(simdjson::padded_string &json, std::vector> &result) { + return rapidjson_base::run(doc.Parse(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(partial_tweets, rapidjson)->UseManualTime(); +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct rapidjson_insitu : rapidjson_base { + bool run(simdjson::padded_string &json, std::vector> &result) { + return rapidjson_base::run(doc.ParseInsitu(json.data()), result); + } +}; +BENCHMARK_TEMPLATE(partial_tweets, rapidjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace partial_tweets + +#endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/partial_tweets/rapidjson_sax.h b/benchmark/partial_tweets/rapidjson_sax.h new file mode 100644 index 0000000000..4dbb3577e0 --- /dev/null +++ b/benchmark/partial_tweets/rapidjson_sax.h @@ -0,0 +1,165 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "partial_tweets.h" +#include +#include + +namespace partial_tweets { + +using namespace rapidjson; + +struct rapidjson_sax { + using StringType=std::string_view; + + // 8 keys to parse for each tweet (in order of appearance): "created_at", "id", "text", "in_reply_status_id", "id"(user), + // "screen_name"(user), "retweet_count" and "favorite_count". + // Assume that the first valid key encountered will be the correct key to parse. 
+ // Assume that each tweet/retweet start with a key "metadata" and has a key "retweeted" towards the end + // The previous assumption will be used to check for the beginning of a new tweet and the end of a retweet + struct Handler { + enum state { // Bitset to store state of search + key_date = (1<<0), + key_id = (1<<1), + key_text = (1<<2), + key_reply = (1<<3), + key_userid = (1<<4), + key_screenname = (1<<5), + key_rt = (1<<6), + key_fav = (1<<7), + found_date = (1<<8), + found_id = (1<<9), + found_text = (1<<10), + found_reply = (1<<11), + found_userid = (1<<12), + found_screenname = (1<<13), + found_rt = (1<<14), + found_fav = (1<<15) + }; + int values = state::key_date; + bool userobject_id = false; // If in a user object (to find user.id) + bool userobject_screen_name = false; // If in a user object (to find user.screen_name) + bool inretweet = false; // If in a retweet (all keys irrelevant in retweet object) + // Fields to store partial tweet info + uint64_t user_id; + uint64_t id; + uint64_t rt; + uint64_t fav; + uint64_t reply_status; + std::string_view screen_name; + std::string_view date; + std::string_view text; + std::vector>& result; + + Handler(std::vector> &r) : result(r) { } + + bool Key(const char* key, SizeType length, bool copy) { + if (!inretweet) { // If not in a retweet object, find relevant keys + if ((length == 16) && (memcmp(key,"retweeted_status",16) == 0)) { inretweet = true; } // Check if entering retweet + else if ((length == 8) && (memcmp(key,"metadata",8) == 0)) { values = 0; } // Reset + // Check if key has been found and if key matches a valid key + else if (!(values & found_date) && (length == 10) && (memcmp(key,"created_at",10) == 0)) { values |= (key_date); } + // Must also check if not in a user object + else if (!(values & found_id) && !userobject_id && (length == 2) && (memcmp(key,"id",2) == 0)) { values |= (key_id); } + else if (!(values & found_text) && (length == 4) && (memcmp(key,"text",4) == 0)) { values |= (key_text); 
} + else if (!(values & found_reply) && (length == 21) && (memcmp(key,"in_reply_to_status_id",21) == 0)) { values |= (key_reply); } + // Check if entering user object + else if ((length == 4) && (memcmp(key,"user",4) == 0)) { userobject_id = userobject_screen_name = true; } + // Must also check if in a user object + else if (!(values & found_userid) && userobject_id && (length == 2) && (memcmp(key,"id",2) == 0)) { values |= (key_userid); } + // Must also check if in a user object + else if (!(values & found_screenname) && userobject_screen_name && (length == 11) && (memcmp(key,"screen_name",11) == 0)) { values |= (key_screenname); } + else if (!(values & found_rt) && (length == 13) && (memcmp(key,"retweet_count",13) == 0)) { values |= (key_rt); } + else if (!(values & found_fav) && (length == 14) && (memcmp(key,"favorite_count",14) == 0)) { values |= (key_fav); } + } + else if ((length == 9) && (memcmp(key,"retweeted",9) == 0)) { inretweet = false; } // Check if end of retweet + return true; + } + bool Uint(unsigned i) { + if (values & key_userid && !(values & found_userid)) { // user.id + user_id = i; + userobject_id = false; + values &= ~(key_userid); + values |= (found_userid); + } + else if (values & key_rt && !(values & found_rt)) { // retweet_count + rt = i; + values &= ~(key_rt); + values |= (found_rt); + } + else if (values & key_fav && !(values & found_fav)) { // favorite_count + fav = i; + values &= ~(key_fav); + values |= (found_fav); + // Assume that this is last key required, so add the partial_tweet to result + result.emplace_back(partial_tweets::tweet{ + date,id,text,reply_status,{user_id,screen_name},rt,fav}); + } + return true; + } + bool Uint64(uint64_t i) { + if (values & key_id && !(values & found_id)) { // id + id = i; + values &= ~(key_id); + values |= (found_id); + } + else if (values & key_reply && !(values & found_reply)) { // in_reply_status_id + reply_status = i; + values &= ~(key_reply); + values |= (found_reply); + } + return true; + } 
+ bool String(const char* str, SizeType length, bool copy) { + if (values & key_date && !(values & found_date)) { // created_at + date = {str,length}; + values &= ~(key_date); + values |= (found_date); + } + else if (values & key_text && !(values & found_text)) { // text + text = {str,length}; + values &= ~(key_text); + values |= (found_text); + } + else if (values & key_screenname && !(values & found_screenname)) { // user.screen_name + screen_name = {str,length}; + userobject_screen_name = false; + values &= ~(key_screenname); + values |= (found_screenname); + } + return true; + } + bool Null() { + if (values & key_reply && !(values & found_reply)) { // in_reply_status (null case) + reply_status = 0; + values &= ~(key_reply); + values |= (found_reply); + } + return true; + } + // Irrelevant events + bool Bool(bool b) { return true; } + bool Double(double d) { return true; } + bool Int(int i) { return true; } + bool Int64(int64_t i) { return true; } + bool RawNumber(const char* str, SizeType length, bool copy) { return true; } + bool StartObject() { return true; } + bool EndObject(SizeType memberCount) { return true; } + bool StartArray() { return true; } + bool EndArray(SizeType elementCount) { return true; } + }; // handler + + bool run(simdjson::padded_string &json, std::vector> &result) { + Reader reader; + Handler handler(result); + InsituStringStream ss(json.data()); + reader.Parse(ss,handler); + return true; + } + +}; // rapid_jason_sax +BENCHMARK_TEMPLATE(partial_tweets, rapidjson_sax)->UseManualTime(); +} // namespace partial_tweets + +#endif // SIMDJSON_COMPETITION_RAPIDJSON \ No newline at end of file diff --git a/benchmark/partial_tweets/sajson.h b/benchmark/partial_tweets/sajson.h new file mode 100644 index 0000000000..1a8cf88ab6 --- /dev/null +++ b/benchmark/partial_tweets/sajson.h @@ -0,0 +1,98 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_SAJSON + +#include "partial_tweets.h" + +namespace partial_tweets { + +struct sajson { + using 
StringType=std::string_view; + + size_t ast_buffer_size{0}; + size_t *ast_buffer{nullptr}; + ~sajson() { free(ast_buffer); } + + simdjson_inline std::string_view get_string_view(const ::sajson::value &obj, std::string_view key) { + auto val = obj.get_value_of_key({key.data(), key.length()}); + if (val.get_type() != ::sajson::TYPE_STRING) { throw "field is not a string"; } + return { val.as_cstring(), val.get_string_length() }; + } + simdjson_inline uint64_t get_uint52(const ::sajson::value &obj, std::string_view key) { + auto val = obj.get_value_of_key({key.data(), key.length()}); + switch (val.get_type()) { + case ::sajson::TYPE_INTEGER: { + int64_t result; + if (!val.get_int53_value(&result) || result < 0) { throw "field is not uint52"; } + return uint64_t(result); + } + default: + throw "field not integer"; + } + } + simdjson_inline uint64_t get_str_uint64(const ::sajson::value &obj, std::string_view key) { + // Since sajson only supports 53-bit numbers, and IDs in twitter.json can be > 53 bits, we read the corresponding id_str and parse that. 
+ auto val = obj.get_value_of_key({key.data(), key.length()}); + if (val.get_type() != ::sajson::TYPE_STRING) { throw "field not a string"; } + auto str = val.as_cstring(); + char *endptr; + uint64_t result = strtoull(str, &endptr, 10); + if (endptr != &str[val.get_string_length()]) { throw "field is a string, but not an integer string"; } + return result; + } + simdjson_inline uint64_t get_nullable_str_uint64(const ::sajson::value &obj, std::string_view key) { + auto val = obj.get_value_of_key({key.data(), key.length()}); + if (val.get_type() == ::sajson::TYPE_NULL) { return 0; } + if (val.get_type() != ::sajson::TYPE_STRING) { throw "field not a string"; } + auto str = val.as_cstring(); + char *endptr; + uint64_t result = strtoull(str, &endptr, 10); + if (endptr != &str[val.get_string_length()]) { throw "field is a string, but not an integer string"; } + return result; + } + simdjson_inline partial_tweets::twitter_user get_user(const ::sajson::value &obj, std::string_view key) { + auto user = obj.get_value_of_key({key.data(), key.length()}); + if (user.get_type() != ::sajson::TYPE_OBJECT) { throw "user is not an object"; } + return { get_str_uint64(user, "id_str"), get_string_view(user, "screen_name") }; + } + + bool run(simdjson::padded_string &json, std::vector> &result) { + if (!ast_buffer) { + ast_buffer_size = json.size(); + ast_buffer = (size_t *)std::malloc(ast_buffer_size * sizeof(size_t)); + } + auto doc = ::sajson::parse( + ::sajson::bounded_allocation(ast_buffer, ast_buffer_size), + ::sajson::mutable_string_view(json.size(), json.data()) + ); + if (!doc.is_valid()) { return false; } + + auto root = doc.get_root(); + if (root.get_type() != ::sajson::TYPE_OBJECT) { return false; } + auto statuses = root.get_value_of_key({"statuses", strlen("statuses")}); + if (statuses.get_type() != ::sajson::TYPE_ARRAY) { return false; } + + for (size_t i=0; i{ + get_string_view(tweet, "created_at"), + get_str_uint64 (tweet, "id_str"), + get_string_view(tweet, "text"), 
+ get_nullable_str_uint64(tweet, "in_reply_to_status_id_str"), + get_user (tweet, "user"), + get_uint52 (tweet, "retweet_count"), + get_uint52 (tweet, "favorite_count") + }); + } + + return true; + } +}; + +BENCHMARK_TEMPLATE(partial_tweets, sajson)->UseManualTime(); + +} // namespace partial_tweets + +#endif // SIMDJSON_COMPETITION_SAJSON + diff --git a/benchmark/partial_tweets/simdjson_dom.h b/benchmark/partial_tweets/simdjson_dom.h new file mode 100644 index 0000000000..4c0846f3f9 --- /dev/null +++ b/benchmark/partial_tweets/simdjson_dom.h @@ -0,0 +1,43 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "partial_tweets.h" + +namespace partial_tweets { + +using namespace simdjson; + +struct simdjson_dom { + using StringType=std::string_view; + + dom::parser parser{}; + + simdjson_inline uint64_t nullable_int(dom::element element) { + if (element.is_null()) { return 0; } + return element; + } + + bool run(simdjson::padded_string &json, std::vector> &result) { + for (dom::element tweet : parser.parse(json)["statuses"]) { + auto user = tweet["user"]; + result.emplace_back(partial_tweets::tweet{ + tweet["created_at"], + tweet["id"], + tweet["text"], + nullable_int(tweet["in_reply_to_status_id"]), + { user["id"], user["screen_name"] }, + tweet["retweet_count"], + tweet["favorite_count"] + }); + } + + return true; + } +}; + +BENCHMARK_TEMPLATE(partial_tweets, simdjson_dom)->UseManualTime(); + +} // namespace partial_tweets + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/partial_tweets/simdjson_ondemand.h b/benchmark/partial_tweets/simdjson_ondemand.h new file mode 100644 index 0000000000..abc0eeeb01 --- /dev/null +++ b/benchmark/partial_tweets/simdjson_ondemand.h @@ -0,0 +1,48 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "partial_tweets.h" + +namespace partial_tweets { + +using namespace simdjson; + +struct simdjson_ondemand { + using StringType=std::string_view; + + ondemand::parser parser{}; + + simdjson_inline uint64_t 
nullable_int(ondemand::value value) { + if (value.is_null()) { return 0; } + return value; + } + + simdjson_inline twitter_user read_user(ondemand::object user) { + return { user.find_field("id"), user.find_field("screen_name") }; + } + + bool run(simdjson::padded_string &json, std::vector> &result) { + // Walk the document, parsing the tweets as we go + auto doc = parser.iterate(json); + for (ondemand::object tweet : doc.find_field("statuses")) { + result.emplace_back(partial_tweets::tweet{ + tweet.find_field("created_at"), + tweet.find_field("id"), + tweet.find_field("text"), + nullable_int(tweet.find_field("in_reply_to_status_id")), + read_user(tweet.find_field("user")), + tweet.find_field("retweet_count"), + tweet.find_field("favorite_count") + }); + } + + return true; + } +}; + +BENCHMARK_TEMPLATE(partial_tweets, simdjson_ondemand)->UseManualTime(); + +} // namespace partial_tweets + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/partial_tweets/tweet.h b/benchmark/partial_tweets/tweet.h new file mode 100644 index 0000000000..3cfbbaa618 --- /dev/null +++ b/benchmark/partial_tweets/tweet.h @@ -0,0 +1,61 @@ +#pragma once + +#include "simdjson.h" +#include "twitter_user.h" + +namespace partial_tweets { + +// { +// "statuses": [ +// { +// "created_at": "Sun Aug 31 00:29:15 +0000 2014", +// "id": 505874924095815700, +// "text": "@aym0566x \n\nåå‰:å‰ç”°ã‚ゆã¿\n第一å°è±¡:ãªã‚“ã‹æ€–ã£ï¼\n今ã®å°è±¡:ã¨ã‚Šã‚ãˆãšã‚­ãƒ¢ã„。噛ã¿åˆã‚ãªã„\n好ããªã¨ã“ã‚:ã¶ã™ã§ã‚­ãƒ¢ã„ã¨ã“😋✨✨\næ€ã„出:んーーーã€ã‚りã™ãŽðŸ˜Šâ¤ï¸\nLINE交æ›ã§ãる?:ã‚ã……ã”ã‚ん✋\nトプ画をã¿ã¦:照れã¾ã™ãŒãªðŸ˜˜âœ¨\n一言:ãŠå‰ã¯ä¸€ç”Ÿã‚‚ã‚“ã®ãƒ€ãƒðŸ’–", +// "in_reply_to_status_id": null, +// "user": { +// "id": 1186275104, +// "screen_name": "ayuu0123" +// }, +// "retweet_count": 0, +// "favorite_count": 0 +// } +// ] +// } + +template +struct tweet { + StringType created_at{}; + uint64_t id{}; + StringType result{}; + uint64_t in_reply_to_status_id{}; + twitter_user user{}; + uint64_t retweet_count{}; + uint64_t favorite_count{}; + 
template + simdjson_inline bool operator==(const tweet &other) const { + return created_at == other.created_at && + id == other.id && + result == other.result && + in_reply_to_status_id == other.in_reply_to_status_id && + user == other.user && + retweet_count == other.retweet_count && + favorite_count == other.favorite_count; + } + template + simdjson_inline bool operator!=(const tweet &other) const { return !(*this == other); } +}; + +template +simdjson_unused static std::ostream &operator<<(std::ostream &o, const tweet &t) { + o << "created_at: " << t.created_at << std::endl; + o << "id: " << t.id << std::endl; + o << "result: " << t.result << std::endl; + o << "in_reply_to_status_id: " << t.in_reply_to_status_id << std::endl; + o << "user.id: " << t.user.id << std::endl; + o << "user.screen_name: " << t.user.screen_name << std::endl; + o << "retweet_count: " << t.retweet_count << std::endl; + o << "favorite_count: " << t.favorite_count << std::endl; + return o; +} + +} // namespace partial_tweets diff --git a/benchmark/partial_tweets/twitter_user.h b/benchmark/partial_tweets/twitter_user.h new file mode 100644 index 0000000000..7807898dd0 --- /dev/null +++ b/benchmark/partial_tweets/twitter_user.h @@ -0,0 +1,18 @@ +#pragma once +#include "simdjson.h" + +namespace partial_tweets { + +template +struct twitter_user { + uint64_t id{}; + StringType screen_name{}; + + template + bool operator==(const twitter_user &other) const { + return id == other.id && + screen_name == other.screen_name; + } +}; + +} // namespace partial_tweets diff --git a/benchmark/partial_tweets/yyjson.h b/benchmark/partial_tweets/yyjson.h new file mode 100644 index 0000000000..7e96344aeb --- /dev/null +++ b/benchmark/partial_tweets/yyjson.h @@ -0,0 +1,80 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_YYJSON + +#include "partial_tweets.h" + +namespace partial_tweets { + +struct yyjson_base { + using StringType=std::string_view; + + simdjson_inline std::string_view get_string_view(yyjson_val 
*obj, std::string_view key) { + auto val = yyjson_obj_getn(obj, key.data(), key.length()); + if (!yyjson_is_str(val)) { throw "field is not uint64 or null!"; } + return { yyjson_get_str(val), yyjson_get_len(val) }; + } + simdjson_inline uint64_t get_uint64(yyjson_val *obj, std::string_view key) { + auto val = yyjson_obj_getn(obj, key.data(), key.length()); + if (!yyjson_is_uint(val)) { throw "field is not uint64 or null!"; } + return yyjson_get_uint(val); + } + simdjson_inline uint64_t get_nullable_uint64(yyjson_val *obj, std::string_view key) { + auto val = yyjson_obj_getn(obj, key.data(), key.length()); + if (!yyjson_is_uint(val)) { } + auto type = yyjson_get_type(val); + if (type != YYJSON_TYPE_NUM && type != YYJSON_TYPE_NULL ) { throw "field is not uint64 or null!"; } + return yyjson_get_uint(val); + } + simdjson_inline partial_tweets::twitter_user get_user(yyjson_val *obj, std::string_view key) { + auto user = yyjson_obj_getn(obj, key.data(), key.length()); + if (!yyjson_is_obj(user)) { throw "missing twitter user field!"; } + return { get_uint64(user, "id"), get_string_view(user, "screen_name") }; + } + + bool run(yyjson_doc *doc, std::vector> &result) { + if (!doc) { return false; } + yyjson_val *root = yyjson_doc_get_root(doc); + if (!yyjson_is_obj(root)) { return false; } + yyjson_val *statuses = yyjson_obj_get(root, "statuses"); + if (!yyjson_is_arr(statuses)) { return false; } + + // Walk the document, parsing the tweets as we go + size_t tweet_idx, tweets_max; + yyjson_val *tweet; + yyjson_arr_foreach(statuses, tweet_idx, tweets_max, tweet) { + if (!yyjson_is_obj(tweet)) { return false; } + // TODO these can't actually handle errors + result.emplace_back(partial_tweets::tweet{ + get_string_view(tweet, "created_at"), + get_uint64 (tweet, "id"), + get_string_view(tweet, "text"), + get_nullable_uint64 (tweet, "in_reply_to_status_id"), + get_user (tweet, "user"), + get_uint64 (tweet, "retweet_count"), + get_uint64 (tweet, "favorite_count") + }); + } + + 
return true; + } +}; + +struct yyjson : yyjson_base { + bool run(simdjson::padded_string &json, std::vector> &result) { + return yyjson_base::run(yyjson_read(json.data(), json.size(), 0), result); + } +}; +BENCHMARK_TEMPLATE(partial_tweets, yyjson)->UseManualTime(); +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct yyjson_insitu : yyjson_base { + bool run(simdjson::padded_string &json, std::vector> &result) { + return yyjson_base::run(yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0), result); + } +}; +BENCHMARK_TEMPLATE(partial_tweets, yyjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace partial_tweets + +#endif // SIMDJSON_COMPETITION_YYJSON + diff --git a/benchmark/top_tweet/README.md b/benchmark/top_tweet/README.md new file mode 100644 index 0000000000..b8fdc5d568 --- /dev/null +++ b/benchmark/top_tweet/README.md @@ -0,0 +1,49 @@ +# Top Tweet Benchmark + +The top_tweet benchmark finds the most-retweeted tweet in a twitter API response. + +## Purpose + +This scenario tends to measure an implementation's laziness: its ability to avoid parsing unneeded +values, without knowing beforehand which values are needed. + +To find the top tweet, an implementation needs to iterate through all tweets, remembering which one +had the highest retweet count. While it scans, it will find many "candidate" tweets with the highest +retweet count *up to that point.* However, While the implementation iterates through tweets, it will +have many "candidate" tweets. Essentially, it has to keep track of the "top tweet so far" while it +searches. However, only the text and screen_name of the *final* top tweet need to be parsed. +Therefore, JSON parsers that can only parse values on the first pass (such as DOM or streaming +parsers) will be forced to parse text and screen_name of every candidate (if not every single +tweet). Parsers which can delay parsing of values until later will therefore shine in scenarios like +this. 
+ +## Rules + +The benchmark will be called with `run(padded_string &json, int64_t max_retweet_count, top_tweet_result &result)`. +The benchmark must: +- Find the tweet with the highest retweet_count at the top level of the "statuses" array. +- Find the *last* such tweet: if multiple tweets have the same top retweet_count, the last one + should be returned. +- Exclude tweets with retweet_count above max_retweet_count. This restriction is solely here because + the default twitter.json has a rather high retweet count in the third tweet, and to test laziness + the matching tweet needs to be further down in the file. +- Fill in top_tweet_result with the corresponding fields from the matching tweet. + +### Abridged Schema + +The abridged schema (objects contain more fields than listed here): + +```json +{ + "statuses": [ + { + "text": "i like to tweet", // text containing UTF-8 and escape characters + "user": { + "screen_name": "AlexanderHamilton" // string containing UTF-8 (and escape characters?) + }, + "retweet_count": 2, // uint32 + }, + ... 
+ ] +} +``` diff --git a/benchmark/top_tweet/boostjson.h b/benchmark/top_tweet/boostjson.h new file mode 100644 index 0000000000..4202a74588 --- /dev/null +++ b/benchmark/top_tweet/boostjson.h @@ -0,0 +1,37 @@ +#pragma once + +#if SIMDJSON_COMPETITION_BOOSTJSON + +#include "top_tweet.h" + +namespace top_tweet { + +using namespace simdjson; + +struct boostjson { + using StringType=std::string; + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + boost::json::value top_tweet{}; + + auto root = boost::json::parse(json); + for (const auto &tweet : root.at("statuses").as_array()) { + int64_t retweet_count = tweet.at("retweet_count").as_int64(); + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + top_tweet = tweet; + } + } + + result.text = top_tweet.at("text").as_string(); + result.screen_name = top_tweet.at("user").at("screen_name").as_string(); + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, boostjson)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_BOOSTJSON \ No newline at end of file diff --git a/benchmark/top_tweet/nlohmann_json.h b/benchmark/top_tweet/nlohmann_json.h new file mode 100644 index 0000000000..c9321a348a --- /dev/null +++ b/benchmark/top_tweet/nlohmann_json.h @@ -0,0 +1,39 @@ +#pragma once + +#if SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "top_tweet.h" + +namespace top_tweet { + +using namespace simdjson; + +struct nlohmann_json { + using StringType=std::string; + + dom::parser parser{}; + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + nlohmann::json top_tweet{}; + + auto root = nlohmann::json::parse(json.data(), json.data() + json.size()); + for (auto tweet : root["statuses"]) { + int64_t retweet_count = tweet["retweet_count"]; + if (retweet_count <= 
max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + top_tweet = tweet; + } + } + + result.text = to_string(top_tweet["text"]); + result.screen_name = to_string(top_tweet["user"]["screen_name"]); + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, nlohmann_json)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/top_tweet/nlohmann_json_sax.h b/benchmark/top_tweet/nlohmann_json_sax.h new file mode 100644 index 0000000000..6f695c88ed --- /dev/null +++ b/benchmark/top_tweet/nlohmann_json_sax.h @@ -0,0 +1,99 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_NLOHMANN_JSON + +#include "top_tweet.h" + +namespace top_tweet { + +using json = nlohmann::json; + +struct nlohmann_json_sax { + using StringType=std::string; + + struct Handler : json::json_sax_t + { + // Assume every tweet/retweet starts with "metadata" key and ends with "retweeted" key. Ignore everything in a retweet. + // Assume that the first valid key encountered outside a retweet is the correct key. 
+ enum state { // Bitset to store state of search + key_text = (1<<0), + key_screen_name = (1<<1), + key_rt = (1<<2), + found_text = (1<<3), + found_screen_name = (1<<4), + found_rt = (1<<5) + }; + int values = state::key_text; + bool userobject = false; // If in a user object + bool inretweet = false; + int64_t max_rt; + int rt; + string_t screen_name; + string_t text; + top_tweet_result& result; + + Handler(top_tweet_result &r,int64_t m) : result(r), max_rt(m) { } + + bool key(string_t& val) override { + if (!inretweet) { // If not in a retweet object, find relevant keys + if (val.compare("retweeted_status") == 0) { inretweet = true; } // Check if entering retweet + else if (val.compare("metadata") == 0) { values = 0; } // Reset + else if (!(values & found_text) && (val.compare("text") == 0)) { values |= (key_text); } + else if ((val.compare("user") == 0)) { userobject = true; } + else if (!(values & found_screen_name) && userobject && (val.compare("screen_name") == 0)) { values |= (key_screen_name); } + else if (!(values & found_rt) && (val.compare("retweet_count") == 0)) { values |= (key_rt); } + } + else if (val.compare("retweeted") == 0) { inretweet = false; } // Check if end of retweet + return true; + } + bool number_unsigned(number_unsigned_t val) override { + if (values & key_rt && !(values & found_rt)) { // retweet_count + rt = int(val); + values &= ~(key_rt); + values |= (found_rt); + if (rt <= max_rt && rt >= result.retweet_count) { // Check if current tweet has more retweet than previous top tweet + result.retweet_count = rt; + result.text = text; + result.screen_name = screen_name; + } + } + return true; + } + bool string(string_t& val) override { + if (values & key_text && !(values & found_text)) { // text + text = val; + values &= ~(key_text); + values |= (found_text); + } + else if (values & key_screen_name && !(values & found_screen_name)) { // user.screen_name + screen_name = val; + userobject = false; + values &= ~(key_screen_name); + values |= 
(found_screen_name); + } + return true; + } + // Irrelevant events + bool null() override { return true; } + bool boolean(bool val) override { return true; } + bool number_float(number_float_t val, const string_t& s) override { return true; } + bool number_integer(number_integer_t val) override { return true; } + bool start_object(std::size_t elements) override { return true; } + bool end_object() override { return true; } + bool start_array(std::size_t elements) override { return true; } + bool end_array() override { return true; } + bool binary(json::binary_t& val) override { return true; } + bool parse_error(std::size_t position, const std::string& last_token, const json::exception& ex) override { return false; } + }; // Handler + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + Handler handler(result,max_retweet_count); + json::sax_parse(json.data(), &handler); + return true; + } +}; // nlohmann_json_sax +BENCHMARK_TEMPLATE(top_tweet, nlohmann_json_sax)->UseManualTime(); +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_NLOHMANN_JSON \ No newline at end of file diff --git a/benchmark/top_tweet/rapidjson.h b/benchmark/top_tweet/rapidjson.h new file mode 100644 index 0000000000..a73360a309 --- /dev/null +++ b/benchmark/top_tweet/rapidjson.h @@ -0,0 +1,69 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "top_tweet.h" + +namespace top_tweet { + +using namespace rapidjson; + +struct rapidjson_base { + using StringType=std::string_view; + + Document doc{}; + + bool run(Document &root, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + + // Loop over the tweets + if (root.HasParseError() || !root.IsObject()) { return false; } + const auto &statuses = root.FindMember("statuses"); + if (statuses == root.MemberEnd() || !statuses->value.IsArray()) { return false; } + for (const Value &tweet : statuses->value.GetArray()) { + if 
(!tweet.IsObject()) { return false; } + + // Check if this tweet has a higher retweet count than the current top tweet + const auto &retweet_count_json = tweet.FindMember("retweet_count"); + if (retweet_count_json == tweet.MemberEnd() || !retweet_count_json->value.IsInt64()) { return false; } + int64_t retweet_count = retweet_count_json->value.GetInt64(); + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + + // TODO I can't figure out if there's a way to keep the Value to use outside the loop ... + + // Get text and screen_name of top tweet + const auto &text = tweet.FindMember("text"); + if (text == tweet.MemberEnd() || !text->value.IsString()) { return false; } + result.text = { text->value.GetString(), text->value.GetStringLength() }; + + const auto &user = tweet.FindMember("user"); + if (user == tweet.MemberEnd() || !user->value.IsObject()) { return false; } + const auto &screen_name = user->value.FindMember("screen_name"); + if (screen_name == user->value.MemberEnd() || !screen_name->value.IsString()) { return false; } + result.screen_name = { screen_name->value.GetString(), screen_name->value.GetStringLength() }; + + } + } + + return result.retweet_count != -1; + } +}; + +struct rapidjson : rapidjson_base { + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + return rapidjson_base::run(doc.Parse(json.data()), max_retweet_count, result); + } +}; +BENCHMARK_TEMPLATE(top_tweet, rapidjson)->UseManualTime(); +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct rapidjson_insitu : rapidjson_base { + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + return rapidjson_base::run(doc.ParseInsitu(json.data()), max_retweet_count, result); + } +}; +BENCHMARK_TEMPLATE(top_tweet, rapidjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace top_tweet + +#endif // 
SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/top_tweet/rapidjson_sax.h b/benchmark/top_tweet/rapidjson_sax.h new file mode 100644 index 0000000000..74c8d4d556 --- /dev/null +++ b/benchmark/top_tweet/rapidjson_sax.h @@ -0,0 +1,101 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_RAPIDJSON + +#include "top_tweet.h" +#include + +namespace top_tweet { + +using namespace rapidjson; + +struct rapidjson_sax { + using StringType=std::string_view; + struct Handler { + // Assume every tweet/retweet starts with "metadata" key and ends with "retweeted" key. Ignore everything in a retweet. + // Assume that the first valid key encountered outside a retweet is the correct key. + enum state { // Bit set to keep track of state of search for keys + key_text = (1<<0), + key_screen_name = (1<<1), + key_rt = (1<<2), + found_text = (1<<3), + found_screen_name = (1<<4), + found_rt = (1<<5) + }; + int values = state::key_text; + int rt; + StringType text; + StringType screen_name; + bool inretweet = false; + bool userobject = false; + top_tweet_result& result; + int64_t max_rt; + + Handler(top_tweet_result &r,int64_t m) : result(r), max_rt(m) { } + + bool Key(const char* key, SizeType length, bool copy) { + if (!inretweet) { + if ((length == 16) && (memcmp(key,"retweeted_status",16) == 0)) { inretweet = true; } // Check if entering retweet + else if ((length == 8) && (memcmp(key,"metadata",8) == 0)) { values = 0; } // Reset + else if (!(values & found_text) && (length == 4) && (memcmp(key,"text",4) == 0)) { values |= (key_text); } + else if ((length == 4) && (memcmp(key,"user",4) == 0)) { userobject = true; } + else if (!(values & found_screen_name) && userobject && (length == 11) && memcmp(key,"screen_name",11) == 0) { values |= (key_screen_name); } + else if (!(values & found_rt) && (length == 13) && (memcmp(key,"retweet_count",13) == 0)) { values |= (key_rt); } + } + else if ((length == 9) && (memcmp(key,"retweeted",9) == 0)) { inretweet = false; } // Check if end of retweet + 
return true; + } + bool String(const char* str, SizeType length, bool copy) { + if (values & key_text && !(values & found_text)) { // text + text = {str,length}; + values &= ~(key_text); + values |= (found_text); + } + else if (values & key_screen_name && !(values & found_screen_name)) { // user.screen_name + screen_name = {str,length}; + values &= ~(key_screen_name); + values |= (found_screen_name); + userobject = false; + } + return true; + } + bool Uint(unsigned i) { + if (values & key_rt && !(values & found_rt)) { // retweet_count + rt = i; + values &= ~(key_rt); + values |= (found_rt); + if (rt <= max_rt && rt >= result.retweet_count) { // Check if current tweet has more retweet than previous top tweet + result.retweet_count = rt; + result.text = text; + result.screen_name = screen_name; + } + } + return true; + } + // Irrelevant events + bool Null() { return true; } + bool Bool(bool b) { return true; } + bool Double(double d) { return true; } + bool Int(int i) { return true; } + bool Int64(int64_t i) { return true; } + bool Uint64(uint64_t i) { return true; } + bool RawNumber(const char* str, SizeType length, bool copy) { return true; } + bool StartObject() { return true; } + bool EndObject(SizeType memberCount) { return true; } + bool StartArray() { return true; } + bool EndArray(SizeType elementCount) { return true; } + }; // handler + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + Reader reader; + Handler handler(result,max_retweet_count); + InsituStringStream ss(json.data()); + reader.Parse(ss,handler); + return true; + } +}; // rapidjson_sax +BENCHMARK_TEMPLATE(top_tweet, rapidjson_sax)->UseManualTime(); +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_RAPIDJSON diff --git a/benchmark/top_tweet/sajson.h b/benchmark/top_tweet/sajson.h new file mode 100644 index 0000000000..a84dbed84f --- /dev/null +++ b/benchmark/top_tweet/sajson.h @@ -0,0 +1,63 @@ +#pragma once + 
+#ifdef SIMDJSON_COMPETITION_SAJSON + +#include "top_tweet.h" + +namespace top_tweet { + +struct sajson { + using StringType=std::string_view; + + size_t ast_buffer_size{0}; + size_t *ast_buffer{nullptr}; + ~sajson() { free(ast_buffer); } + + bool run(simdjson::padded_string &json, int32_t max_retweet_count, top_tweet_result &result) { + if (!ast_buffer) { + ast_buffer_size = json.size(); + ast_buffer = (size_t *)std::malloc(ast_buffer_size * sizeof(size_t)); + } + auto doc = ::sajson::parse( + ::sajson::bounded_allocation(ast_buffer, ast_buffer_size), + ::sajson::mutable_string_view(json.size(), json.data()) + ); + if (!doc.is_valid()) { return false; } + + auto root = doc.get_root(); + if (root.get_type() != ::sajson::TYPE_OBJECT) { return false; } + auto statuses = root.get_value_of_key({ "statuses", strlen("statuses") }); + if (statuses.get_type() != ::sajson::TYPE_ARRAY) { return false; } + + for (size_t i=0; i= result.retweet_count) { + result.retweet_count = retweet_count; + + auto text = tweet.get_value_of_key({ "text", strlen("text") }); + if (text.get_type() != ::sajson::TYPE_STRING) { return false; } + result.text = { text.as_cstring(), text.get_string_length() }; + + auto user = tweet.get_value_of_key({ "user", strlen("user") }); + if (user.get_type() != ::sajson::TYPE_OBJECT) { return false; } + auto screen_name = user.get_value_of_key({ "screen_name", strlen("screen_name") }); + if (screen_name.get_type() != ::sajson::TYPE_STRING) { return false; } + result.screen_name = { screen_name.as_cstring(), screen_name.get_string_length() }; + } + } + + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, sajson)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_SAJSON \ No newline at end of file diff --git a/benchmark/top_tweet/simdjson_dom.h b/benchmark/top_tweet/simdjson_dom.h new file mode 100644 index 0000000000..3a6648f13d --- /dev/null +++ b/benchmark/top_tweet/simdjson_dom.h @@ -0,0 +1,39 @@ +#pragma 
once + +#if SIMDJSON_EXCEPTIONS + +#include "top_tweet.h" + +namespace top_tweet { + +using namespace simdjson; + +struct simdjson_dom { + using StringType=std::string_view; + + dom::parser parser{}; + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + dom::element top_tweet{}; + + auto doc = parser.parse(json); + for (auto tweet : doc["statuses"]) { + int64_t retweet_count = tweet["retweet_count"]; + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + top_tweet = tweet; + } + } + + result.text = top_tweet["text"]; + result.screen_name = top_tweet["user"]["screen_name"]; + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, simdjson_dom)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_EXCEPTIONS \ No newline at end of file diff --git a/benchmark/top_tweet/simdjson_ondemand.h b/benchmark/top_tweet/simdjson_ondemand.h new file mode 100644 index 0000000000..7aaa1ac4ed --- /dev/null +++ b/benchmark/top_tweet/simdjson_ondemand.h @@ -0,0 +1,80 @@ +#pragma once + +#if SIMDJSON_EXCEPTIONS + +#include "top_tweet.h" + +namespace top_tweet { + +using namespace simdjson; + +struct simdjson_ondemand { + using StringType=std::string_view; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + // We save these DOM values for later so we don't have to parse them + // into string_views until we're sure which ones we want to parse + // NOTE: simdjson does not presently support reuse of objects or arrays--just scalars. This is + // why we have to grab the text and screen_name fields instead of just saving the tweet object. 
+ ondemand::value screen_name, text; + + auto doc = parser.iterate(json); + for (auto tweet : doc["statuses"]) { + // Since text, user.screen_name, and retweet_count generally appear in order, it's nearly free + // for us to retrieve them here (and will cost a bit more if we do it in the if + // statement). + auto tweet_text = tweet["text"]; + auto tweet_screen_name = tweet["user"]["screen_name"]; + int64_t retweet_count = tweet["retweet_count"]; + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + // TODO std::move should not be necessary + text = std::move(tweet_text); + screen_name = std::move(tweet_screen_name); + } + } + + // Now that we know which was the most retweeted, parse the values in it + result.screen_name = screen_name; + result.text = text; + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, simdjson_ondemand)->UseManualTime(); + +struct simdjson_ondemand_forward_only { + using StringType=std::string_view; + + ondemand::parser parser{}; + + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + + auto doc = parser.iterate(json); + for (auto tweet : doc["statuses"]) { + // Since text, user.screen_name, and retweet_count generally appear in order, it's nearly free + // for us to retrieve them here (and will cost a bit more if we do it in the if + // statement). 
+ auto tweet_text = tweet["text"]; + auto tweet_screen_name = tweet["user"]["screen_name"]; + int64_t retweet_count = tweet["retweet_count"]; + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + result.text = tweet_text; + result.screen_name = tweet_screen_name; + } + } + + return result.retweet_count != -1; + } +}; + +BENCHMARK_TEMPLATE(top_tweet, simdjson_ondemand_forward_only)->UseManualTime(); + +} // namespace top_tweet + +#endif // SIMDJSON_EXCEPTIONS diff --git a/benchmark/top_tweet/top_tweet.h b/benchmark/top_tweet/top_tweet.h new file mode 100644 index 0000000000..aa8a27401c --- /dev/null +++ b/benchmark/top_tweet/top_tweet.h @@ -0,0 +1,67 @@ + +#pragma once + +#include "json_benchmark/file_runner.h" + +namespace top_tweet { + +using namespace json_benchmark; + +template +struct top_tweet_result { + int64_t retweet_count{}; + StringType screen_name{}; + StringType text{}; + template + simdjson_inline bool operator==(const top_tweet_result &other) const { + return retweet_count == other.retweet_count && + screen_name == other.screen_name && + text == other.text; + } + template + simdjson_inline bool operator!=(const top_tweet_result &other) const { return !(*this == other); } +}; + +template +simdjson_unused static std::ostream &operator<<(std::ostream &o, const top_tweet_result &t) { + o << "retweet_count: " << t.retweet_count << std::endl; + o << "screen_name: " << t.screen_name << std::endl; + o << "text: " << t.text << std::endl; + return o; +} + +template +struct runner : public file_runner { + top_tweet_result result{}; + + bool setup(benchmark::State &state) { + return this->load_json(state, TWITTER_JSON); + } + + bool before_run(benchmark::State &state) { + if (!file_runner::before_run(state)) { return false; } + result.retweet_count = -1; + return true; + } + + bool run(benchmark::State &) { + return this->implementation.run(this->json, 60, result); + } + + template + bool 
diff(benchmark::State &state, runner &reference) { + return diff_results(state, result, reference.result, diff_flags::NONE); + } + + size_t items_per_iteration() { + return 1; + } +}; + +struct simdjson_dom; + +template simdjson_inline static void top_tweet(benchmark::State &state) { + json_benchmark::run_json_benchmark, runner>(state); +} + +} // namespace top_tweet diff --git a/benchmark/top_tweet/yyjson.h b/benchmark/top_tweet/yyjson.h new file mode 100644 index 0000000000..8beee224f2 --- /dev/null +++ b/benchmark/top_tweet/yyjson.h @@ -0,0 +1,68 @@ +#pragma once + +#ifdef SIMDJSON_COMPETITION_YYJSON + +#include "top_tweet.h" + +namespace top_tweet { + +struct yyjson_base { + using StringType=std::string_view; + + bool run(yyjson_doc *doc, int64_t max_retweet_count, top_tweet_result &result) { + result.retweet_count = -1; + + yyjson_val *top_tweet{}; + + if (!doc) { return false; } + yyjson_val *root = yyjson_doc_get_root(doc); + if (!yyjson_is_obj(root)) { return false; } + yyjson_val *statuses = yyjson_obj_get(root, "statuses"); + if (!yyjson_is_arr(statuses)) { return false; } + + // Walk the document, parsing the tweets as we go + size_t tweet_idx, tweets_max; + yyjson_val *tweet; + yyjson_arr_foreach(statuses, tweet_idx, tweets_max, tweet) { + if (!yyjson_is_obj(tweet)) { return false; } + + auto retweet_count_val = yyjson_obj_get(tweet, "retweet_count"); + if (!yyjson_is_uint(retweet_count_val)) { return false; } + int64_t retweet_count = yyjson_get_uint(retweet_count_val); + if (retweet_count <= max_retweet_count && retweet_count >= result.retweet_count) { + result.retweet_count = retweet_count; + top_tweet = tweet; + } + } + + auto text = yyjson_obj_get(top_tweet, "text"); + if (!yyjson_is_str(text)) { return false; } + result.text = { yyjson_get_str(text), yyjson_get_len(text) }; + + auto user = yyjson_obj_get(top_tweet, "user"); + if (!yyjson_is_obj(user)) { return false; } + auto screen_name = yyjson_obj_get(user, "screen_name"); + if 
(!yyjson_is_str(screen_name)) { return false; } + result.screen_name = { yyjson_get_str(screen_name), yyjson_get_len(screen_name) }; + + return result.retweet_count != -1; + } +}; + +struct yyjson : yyjson_base { + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + return yyjson_base::run(yyjson_read(json.data(), json.size(), 0), max_retweet_count, result); + } +}; +BENCHMARK_TEMPLATE(top_tweet, yyjson)->UseManualTime(); +#if SIMDJSON_COMPETITION_ONDEMAND_INSITU +struct yyjson_insitu : yyjson_base { + bool run(simdjson::padded_string &json, int64_t max_retweet_count, top_tweet_result &result) { + return yyjson_base::run(yyjson_read_opts(json.data(), json.size(), YYJSON_READ_INSITU, 0, 0), max_retweet_count, result); + } +}; +BENCHMARK_TEMPLATE(top_tweet, yyjson_insitu)->UseManualTime(); +#endif // SIMDJSON_COMPETITION_ONDEMAND_INSITU +} // namespace top_tweet + +#endif // SIMDJSON_COMPETITION_YYJSON diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake new file mode 100644 index 0000000000..baf2d8c344 --- /dev/null +++ b/cmake/CPM.cmake @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: MIT +# +# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors + +set(CPM_DOWNLOAD_VERSION 0.40.2) +set(CPM_HASH_SUM "c8cdc32c03816538ce22781ed72964dc864b2a34a310d3b7104812a5ca2d835d") + +if(CPM_SOURCE_CACHE) + set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +elseif(DEFINED ENV{CPM_SOURCE_CACHE}) + set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +else() + set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +endif() + +# Expand relative path. 
This is important if the provided path contains a tilde (~) +get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) + +file(DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake + ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM} +) + +include(${CPM_DOWNLOAD_LOCATION}) diff --git a/cmake/JoinPaths.cmake b/cmake/JoinPaths.cmake new file mode 100644 index 0000000000..c68d91b84d --- /dev/null +++ b/cmake/JoinPaths.cmake @@ -0,0 +1,23 @@ +# This module provides function for joining paths +# known from most languages +# +# SPDX-License-Identifier: (MIT OR CC0-1.0) +# Copyright 2020 Jan Tojnar +# https://github.com/jtojnar/cmake-snips +# +# Modelled after Python’s os.path.join +# https://docs.python.org/3.7/library/os.path.html#os.path.join +# Windows not supported +function(join_paths joined_path first_path_segment) + set(temp_path "${first_path_segment}") + foreach(current_segment IN LISTS ARGN) + if(NOT ("${current_segment}" STREQUAL "")) + if(IS_ABSOLUTE "${current_segment}") + set(temp_path "${current_segment}") + else() + set(temp_path "${temp_path}/${current_segment}") + endif() + endif() + endforeach() + set(${joined_path} "${temp_path}" PARENT_SCOPE) +endfunction() diff --git a/cmake/add_compile_only_test.cmake b/cmake/add_compile_only_test.cmake new file mode 100644 index 0000000000..40ccf7e471 --- /dev/null +++ b/cmake/add_compile_only_test.cmake @@ -0,0 +1,8 @@ +function(add_compile_only_test TEST_NAME) + add_test( + NAME ${TEST_NAME} + COMMAND ${CMAKE_COMMAND} --build . 
--target ${TEST_NAME} --config $ + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + ) + set_target_properties(${TEST_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE EXCLUDE_FROM_DEFAULT_BUILD TRUE) +endfunction() \ No newline at end of file diff --git a/cmake/add_cpp_test.cmake b/cmake/add_cpp_test.cmake index b41087fa5d..c88a6940ec 100644 --- a/cmake/add_cpp_test.cmake +++ b/cmake/add_cpp_test.cmake @@ -3,12 +3,12 @@ # SOURCES defaults to testname.cpp if not specified. function(add_cpp_test TEST_NAME) # Parse arguments - cmake_parse_arguments(PARSE_ARGV 1 ARGS "COMPILE_ONLY;LIBRARY;WILL_FAIL" "" "SOURCES;LABELS") + cmake_parse_arguments(PARSE_ARGV 1 ARGS "COMPILE_ONLY;LIBRARY;WILL_FAIL" "" "SOURCES;LABELS;DEPENDENCY_OF") if (NOT ARGS_SOURCES) list(APPEND ARGS_SOURCES ${TEST_NAME}.cpp) endif() if (ARGS_COMPILE_ONLY) - list(APPEND ${ARGS_LABELS} compile) + list(APPEND ${ARGS_LABELS} compile_only) endif() # Add the compile target @@ -28,22 +28,29 @@ function(add_cpp_test TEST_NAME) set_target_properties(${TEST_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE EXCLUDE_FROM_DEFAULT_BUILD TRUE) else() add_test(${TEST_NAME} ${TEST_NAME}) + + # Add to