diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 060b7ea..b359f48 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -4,40 +4,26 @@ name: Main Extension Distribution Pipeline on: push: - paths-ignore: - - "*/**.md" - - "*/**.yml" pull_request: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' && github.sha || '' }} cancel-in-progress: true jobs: - duckdb-next-build: - name: Build extension binaries (next) - uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main - with: - duckdb_version: main - ci_tools_version: main - extension_name: chsql +# duckdb-next-build: +# name: Build extension binaries +# uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main +# with: +# duckdb_version: main +# ci_tools_version: main +# extension_name: chsql duckdb-stable-build: name: Build extension binaries - uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.1.1 - with: - duckdb_version: v1.1.1 - ci_tools_version: v1.1.1 - extension_name: chsql - - duckdb-stable-deploy: - name: Deploy extension binaries - needs: duckdb-stable-build - if: false - uses: ./.github/workflows/_extension_deploy.yml - secrets: inherit + uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: - duckdb_version: v1.1.1 + duckdb_version: v1.3.2 + ci_tools_version: main extension_name: chsql - deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} diff --git a/.github/workflows/_extension_deploy.yml b/.github/workflows/_extension_deploy.yml deleted file mode 100644 index c408f90..0000000 --- a/.github/workflows/_extension_deploy.yml +++ /dev/null @@ -1,121 +0,0 @@ -# -# Reusable workflow that deploys the artifacts produced by github.com/duckdb/duckdb/.github/workflows/_extension_distribution.yml -# -# note: this workflow needs to be located in the extension repository, as it requires secrets to be passed to the -# deploy script. However, it should generally not be necessary to modify this workflow in your extension repository, as -# this workflow can be configured to use a custom deploy script. - - -name: Extension Deployment -on: - workflow_call: - inputs: - # The name of the extension - extension_name: - required: true - type: string - # DuckDB version to build against - duckdb_version: - required: true - type: string - # ';' separated list of architectures to exclude, for example: 'linux_amd64;osx_arm64' - exclude_archs: - required: false - type: string - default: "" - # Whether to upload this deployment as the latest. This may overwrite a previous deployment. - deploy_latest: - required: false - type: boolean - default: false - # Whether to upload this deployment under a versioned path. These will not be deleted automatically - deploy_versioned: - required: false - type: boolean - default: false - # Postfix added to artifact names. 
Can be used to guarantee unique names when this workflow is called multiple times - artifact_postfix: - required: false - type: string - default: "" - # Override the default deploy script with a custom script - deploy_script: - required: false - type: string - default: "./scripts/extension-upload.sh" - # Override the default matrix parse script with a custom script - matrix_parse_script: - required: false - type: string - default: "./duckdb/scripts/modify_distribution_matrix.py" - -jobs: - generate_matrix: - name: Generate matrix - runs-on: ubuntu-latest - outputs: - deploy_matrix: ${{ steps.parse-matrices.outputs.deploy_matrix }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - submodules: 'true' - - - name: Checkout DuckDB to version - run: | - cd duckdb - git checkout ${{ inputs.duckdb_version }} - - - id: parse-matrices - run: | - python3 ${{ inputs.matrix_parse_script }} --input ./duckdb/.github/config/distribution_matrix.json --deploy_matrix --output deploy_matrix.json --exclude "${{ inputs.exclude_archs }}" --pretty - deploy_matrix="`cat deploy_matrix.json`" - echo deploy_matrix=$deploy_matrix >> $GITHUB_OUTPUT - echo `cat $GITHUB_OUTPUT` - - deploy: - name: Deploy - runs-on: ubuntu-latest - needs: generate_matrix - if: ${{ needs.generate_matrix.outputs.deploy_matrix != '{}' && needs.generate_matrix.outputs.deploy_matrix != '' }} - strategy: - matrix: ${{fromJson(needs.generate_matrix.outputs.deploy_matrix)}} - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - submodules: 'true' - - - name: Checkout DuckDB to version - run: | - cd duckdb - git checkout ${{ inputs.duckdb_version }} - - - uses: actions/download-artifact@v2 - with: - name: ${{ inputs.extension_name }}-${{ inputs.duckdb_version }}-extension-${{matrix.duckdb_arch}}${{inputs.artifact_postfix}}${{startsWith(matrix.duckdb, 'wasm') && '.wasm' || ''}} - path: | - /tmp/extension - - - name: Deploy - shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEPLOY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEPLOY_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.S3_REGION }} - BUCKET_NAME: ${{ secrets.S3_BUCKET }} - DUCKDB_EXTENSION_SIGNING_PK: ${{ secrets.S3_DUCKDB_ORG_EXTENSION_SIGNING_PK }} - run: | - pwd - python3 -m pip install pip awscli - git config --global --add safe.directory '*' - cd duckdb - git fetch --tags - export DUCKDB_VERSION=`git tag --points-at HEAD` - export DUCKDB_VERSION=${DUCKDB_VERSION:=`git log -1 --format=%h`} - cd .. - git fetch --tags - export EXT_VERSION=`git tag --points-at HEAD` - export EXT_VERSION=${EXT_VERSION:=`git log -1 --format=%h`} - ${{ inputs.deploy_script }} ${{ inputs.extension_name }} $EXT_VERSION $DUCKDB_VERSION ${{ matrix.duckdb_arch }} $BUCKET_NAME ${{inputs.deploy_latest || 'true' && 'false'}} ${{inputs.deploy_versioned || 'true' && 'false'}} diff --git a/CMakeLists.txt b/CMakeLists.txt index f750372..57edf1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,6 @@ set(TARGET_NAME chsql) # DuckDB's extension distribution supports vcpkg. As such, dependencies can be added in ./vcpkg.json and then # used in cmake with find_package. Feel free to remove or replace with other dependencies. # Note that it should also be removed from vcpkg.json to prevent needlessly installing it.. 
-find_package(OpenSSL REQUIRED) set(EXTENSION_NAME ${TARGET_NAME}_extension) set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) project(${TARGET_NAME}) diff --git a/chsql/CMakeLists.txt b/chsql/CMakeLists.txt index 5460e12..422c2d5 100644 --- a/chsql/CMakeLists.txt +++ b/chsql/CMakeLists.txt @@ -4,9 +4,9 @@ set(TARGET_NAME chsql) # DuckDB's extension distribution supports vcpkg. As such, dependencies can be added in ./vcpkg.json and then # used in cmake with find_package. Feel free to remove or replace with other dependencies. # Note that it should also be removed from vcpkg.json to prevent needlessly installing it.. -find_package(OpenSSL REQUIRED) set(EXTENSION_NAME ${TARGET_NAME}_extension) set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) +set(CHSQL_DUCKDB_VERSION ${DUCKDB_MAJOR_VERSION}) project(${TARGET_NAME}) include_directories( ./src/include ${CMAKE_CURRENT_SOURCE_DIR}/../duckdb/extension/parquet/include ../duckdb/third_party/lz4 ../duckdb/third_party/parquet ../duckdb/third_party/thrift ../duckdb/third_party/snappy ../duckdb/third_party/zstd/include - ../duckdb/third_party/mbedtls ../duckdb/third_party/mbedtls/include ../duckdb/third_party/brotli/include) -set(EXTENSION_SOURCES src/chsql_extension.cpp src/duck_flock.cpp src/chsql_system.cpp) +set(EXTENSION_SOURCES src/chsql_extension.cpp src/duck_flock.cpp src/chsql_system.cpp src/parquet_types.cpp) build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) -# Link OpenSSL in both the static library as the loadable extension -target_link_libraries(${EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto) -target_link_libraries(${LOADABLE_EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto) +target_link_libraries(${EXTENSION_NAME} duckdb_mbedtls) +target_link_libraries(${LOADABLE_EXTENSION_NAME} duckdb_mbedtls) install( TARGETS ${EXTENSION_NAME} EXPORT "${DUCKDB_EXPORT_SET}" diff --git a/chsql/extension_config.cmake b/chsql/extension_config.cmake index 776f13b..7ae21d2 100644 --- a/chsql/extension_config.cmake +++ b/chsql/extension_config.cmake @@ -1,5 +1,5 @@ # This file is included by DuckDB's build system. It specifies which extension to load - +set(CHSQL_DUCKDB_VERSION ${DUCKDB_MAJOR_VERSION}) include_directories( ./src/include ${CMAKE_CURRENT_SOURCE_DIR}/../duckdb/extension/parquet/include ../duckdb/third_party/lz4 ../duckdb/third_party/parquet ../duckdb/third_party/thrift ../duckdb/third_party/snappy ../duckdb/third_party/zstd/include - ../duckdb/third_party/mbedtls +# ../duckdb/third_party/mbedtls ../duckdb/third_party/mbedtls/include ../duckdb/third_party/brotli/include) diff --git a/chsql/src/chsql_extension.cpp b/chsql/src/chsql_extension.cpp index f5f0f9a..4a8b871 100644 --- a/chsql/src/chsql_extension.cpp +++ b/chsql/src/chsql_extension.cpp @@ -10,160 +10,159 @@ #include "duckdb/catalog/default/default_functions.hpp" #include "duckdb/catalog/default/default_table_functions.hpp" -// OpenSSL linked through vcpkg -#include <openssl/opensslv.h> #include "parquet_ordered_scan.cpp" #include "chsql_system.hpp" -namespace duckdb { - -// To add a new scalar SQL macro, add a new macro to this array! -// Copy and paste the top item in the array into the -// second-to-last position and make some modifications.
-// (essentially, leave the last entry in the array as {nullptr, nullptr, {nullptr}, nullptr}) - -// Keep the DEFAULT_SCHEMA (no change needed) -// Replace "times_two" with a name for your macro -// If your function has parameters, add their names in quotes inside of the {}, with a nullptr at the end -// If you do not have parameters, simplify to {nullptr} -// Add the text of your SQL macro as a raw string with the format R"( select 42 )" - -static DefaultMacro chsql_macros[] = { - // -- Type conversion macros - {DEFAULT_SCHEMA, "toString", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS VARCHAR))"}, - {DEFAULT_SCHEMA, "toInt8", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT8))"}, - {DEFAULT_SCHEMA, "toInt16", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT16))"}, - {DEFAULT_SCHEMA, "toInt32", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT32))"}, - {DEFAULT_SCHEMA, "toInt64", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT64))"}, - {DEFAULT_SCHEMA, "toInt128", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT128))"}, - {DEFAULT_SCHEMA, "toInt256", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS HUGEINT))"}, - {DEFAULT_SCHEMA, "toInt8OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT8) IS NOT NULL THEN CAST(x AS INT8) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt16OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT16) IS NOT NULL THEN CAST(x AS INT16) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt32OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT32) IS NOT NULL THEN CAST(x AS INT32) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt64OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT64) IS NOT NULL THEN CAST(x AS INT64) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt128OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT128) IS NOT NULL THEN CAST(x AS INT128) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt256OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS HUGEINT) IS NOT NULL THEN CAST(x AS HUGEINT) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt8OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT8))"}, - {DEFAULT_SCHEMA, "toInt16OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT16))"}, - {DEFAULT_SCHEMA, "toInt32OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT32))"}, - {DEFAULT_SCHEMA, "toInt64OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT64))"}, - {DEFAULT_SCHEMA, "toInt128OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT128))"}, - {DEFAULT_SCHEMA, "toInt256OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS HUGEINT))"}, - // -- Unsigned integer conversion macros - {DEFAULT_SCHEMA, "toUInt8", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UTINYINT))"}, - {DEFAULT_SCHEMA, "toUInt16", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS USMALLINT))"}, - {DEFAULT_SCHEMA, "toUInt32", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UINTEGER))"}, - {DEFAULT_SCHEMA, "toUInt64", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UBIGINT))"}, - {DEFAULT_SCHEMA, "toUInt8OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UTINYINT) IS NOT NULL THEN CAST(x AS UTINYINT) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toUInt16OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS USMALLINT) IS NOT NULL THEN CAST(x AS USMALLINT) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toUInt32OrZero", {"x", nullptr}, {{nullptr, 
nullptr}}, R"(CASE WHEN TRY_CAST(x AS UINTEGER) IS NOT NULL THEN CAST(x AS UINTEGER) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toUInt64OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UBIGINT) IS NOT NULL THEN CAST(x AS UBIGINT) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toUInt8OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UTINYINT))"}, // Fixed comma here - {DEFAULT_SCHEMA, "toUInt16OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS USMALLINT))"}, // And here - {DEFAULT_SCHEMA, "toUInt32OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UINTEGER))"}, // Also here - {DEFAULT_SCHEMA, "toUInt64OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UBIGINT))"}, // And here - // -- Floating-point conversion macros - {DEFAULT_SCHEMA, "toFloat", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS DOUBLE))"}, - {DEFAULT_SCHEMA, "toFloatOrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS DOUBLE))"}, - {DEFAULT_SCHEMA, "toFloatOrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS DOUBLE) IS NOT NULL THEN CAST(x AS DOUBLE) ELSE 0 END)"}, - // -- Arithmetic macros - {DEFAULT_SCHEMA, "intDiv", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((CAST(a AS BIGINT) // CAST(b AS BIGINT)))"}, - {DEFAULT_SCHEMA, "intDivOrNull", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST((TRY_CAST(a AS BIGINT) // TRY_CAST(b AS BIGINT)) AS BIGINT))"}, - {DEFAULT_SCHEMA, "intDivOZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(COALESCE((TRY_CAST((TRY_CAST(a AS BIGINT) // TRY_CAST(b AS BIGINT)) AS BIGINT)),0))"}, - {DEFAULT_SCHEMA, "plus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(add(a, b))"}, - {DEFAULT_SCHEMA, "minus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(subtract(a, b))"}, - {DEFAULT_SCHEMA, "modulo", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(CAST(a AS BIGINT) % CAST(b AS BIGINT))"}, - {DEFAULT_SCHEMA, "moduloOrZero", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(COALESCE(((TRY_CAST(a AS BIGINT) % TRY_CAST(b AS BIGINT))),0))"}, - // -- Tuple macros - {DEFAULT_SCHEMA, "tupleIntDiv", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] // CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleIntDivByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) // CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleDivide", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] / CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleMultiply", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> CAST(apply(b, x -> CAST(x AS BIGINT))[i] as BIGINT) * CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleMinus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] - CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tuplePlus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] + CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleMultiplyByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) * CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleDivideByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) / CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleModulo", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS 
BIGINT))[1] as BIGINT) % CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleModuloByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) % CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleConcat", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(list_concat(a, b))"}, - // -- String matching macros - {DEFAULT_SCHEMA, "match", {"string", "token"}, {{nullptr, nullptr}}, R"(string LIKE token)"}, - // -- Array macros - {DEFAULT_SCHEMA, "arrayExists", {"needle", "haystack", nullptr}, {{nullptr, nullptr}}, R"(haystack @> ARRAY[needle])"}, - {DEFAULT_SCHEMA, "arrayMap", {"x", "arr", nullptr}, {{nullptr, nullptr}}, R"(array_transform(arr, x))"}, - // Date and Time Functions - {DEFAULT_SCHEMA, "toYear", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(YEAR FROM date_expression))"}, - {DEFAULT_SCHEMA, "toMonth", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(MONTH FROM date_expression))"}, - {DEFAULT_SCHEMA, "toDayOfMonth", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(DAY FROM date_expression))"}, - {DEFAULT_SCHEMA, "toHour", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(HOUR FROM date_expression))"}, - {DEFAULT_SCHEMA, "toMinute", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(MINUTE FROM date_expression))"}, - {DEFAULT_SCHEMA, "toSecond", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(SECOND FROM date_expression))"}, - {DEFAULT_SCHEMA, "toYYYYMM", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m'))"}, - {DEFAULT_SCHEMA, "toYYYYMMDD", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m%d'))"}, - {DEFAULT_SCHEMA, "toYYYYMMDDhhmmss", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m%d%H%M%S'))"}, - {DEFAULT_SCHEMA, "formatDateTime", {"time", "format", "timezone", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN timezone IS NULL THEN strftime(time, format) ELSE strftime(time AT TIME ZONE timezone, format) END)"}, - // String Functions - {DEFAULT_SCHEMA, "empty", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str) = 0)"}, - {DEFAULT_SCHEMA, "notEmpty", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str) > 0)"}, - {DEFAULT_SCHEMA, "lengthUTF8", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str))"}, - {DEFAULT_SCHEMA, "leftPad", {"str", "length", "pad_str", nullptr}, {{nullptr, nullptr}}, R"(LPAD(str, length, pad_str))"}, - {DEFAULT_SCHEMA, "rightPad", {"str", "length", "pad_str", nullptr}, {{nullptr, nullptr}}, R"(RPAD(str, length, pad_str))"}, - {DEFAULT_SCHEMA, "extractAllGroups", {"text", "pattern", nullptr}, {{nullptr, nullptr}}, R"(regexp_extract_all(text, pattern))"}, - {DEFAULT_SCHEMA, "toFixedString", {"str", "length", nullptr}, {{nullptr, nullptr}}, R"(RPAD(LEFT(str, length), length, '\0'))"}, - {DEFAULT_SCHEMA, "ifNull", {"x", "y", nullptr}, {{nullptr, nullptr}}, R"(COALESCE(x, y))"}, - {DEFAULT_SCHEMA, "arrayJoin", {"arr", nullptr}, {{nullptr, nullptr}}, R"(UNNEST(arr))"}, - {DEFAULT_SCHEMA, "splitByChar", {"separator", "str", nullptr}, {{nullptr, nullptr}}, R"(string_split(str, separator))"}, - // URL Functions - {DEFAULT_SCHEMA, "protocol", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '^(\w+)://', 1))"}, - {DEFAULT_SCHEMA, "domain", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '://([^/]+)', 1))"}, - {DEFAULT_SCHEMA, "topLevelDomain", {"url", nullptr}, {{nullptr, nullptr}}, 
R"(REGEXP_EXTRACT(url, '\.([^./:]+)([:/]|$)', 1))"}, - {DEFAULT_SCHEMA, "path", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '://[^/]+(/.*)', 1))"}, - // IP Address Functions - {DEFAULT_SCHEMA, "IPv4NumToString", {"num", nullptr}, {{nullptr, nullptr}}, R"(CONCAT(CAST((num >> 24) & 255 AS VARCHAR), '.', CAST((num >> 16) & 255 AS VARCHAR), '.', CAST((num >> 8) & 255 AS VARCHAR), '.', CAST(num & 255 AS VARCHAR)))"}, - {DEFAULT_SCHEMA, "IPv4StringToNum", {"ip", nullptr}, {{nullptr, nullptr}}, R"(CAST(SPLIT_PART(ip, '.', 1) AS INTEGER) * 256 * 256 * 256 + CAST(SPLIT_PART(ip, '.', 2) AS INTEGER) * 256 * 256 + CAST(SPLIT_PART(ip, '.', 3) AS INTEGER) * 256 + CAST(SPLIT_PART(ip, '.', 4) AS INTEGER))"}, - // -- JSON Macros - {DEFAULT_SCHEMA, "JSONExtract", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key))"}, - {DEFAULT_SCHEMA, "JSONExtractString", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS VARCHAR))"}, - {DEFAULT_SCHEMA, "JSONExtractUInt", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS UINTEGER))"}, - {DEFAULT_SCHEMA, "JSONExtractInt", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS INT32))"}, - {DEFAULT_SCHEMA, "JSONExtractFloat", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS DOUBLE))"}, - {DEFAULT_SCHEMA, "JSONExtractRaw", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key))"}, - {DEFAULT_SCHEMA, "JSONHas", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key) IS NOT NULL)"}, - {DEFAULT_SCHEMA, "JSONLength", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_array_length(json))"}, - {DEFAULT_SCHEMA, "JSONType", {"json", "path", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN path IS NULL THEN json_each(json) ELSE json_each(json,path) END)"}, - {DEFAULT_SCHEMA, "JSONExtractKeys", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_object_keys(json))"}, - {DEFAULT_SCHEMA, "JSONExtractValues", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_each_text(json))"}, - // -- Compare Macros - {DEFAULT_SCHEMA, "equals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a = b))"}, - {DEFAULT_SCHEMA, "notEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a <> b))"}, - {DEFAULT_SCHEMA, "less", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a < b))"}, - {DEFAULT_SCHEMA, "greater", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a > b))"}, - {DEFAULT_SCHEMA, "lessOrEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a <= b))"}, - {DEFAULT_SCHEMA, "greaterOrEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a >= b))"}, - // -- Misc macros - {DEFAULT_SCHEMA, "generateUUIDv4", {nullptr}, {{nullptr, nullptr}}, R"(toString(uuid()))"}, - {DEFAULT_SCHEMA, "parseURL", {"url", "part", nullptr}, {{nullptr, nullptr}}, R"(CASE part WHEN 'protocol' THEN REGEXP_EXTRACT(url, '^(\w+)://') WHEN 'domain' THEN REGEXP_EXTRACT(url, '://([^/:]+)') WHEN 'port' THEN REGEXP_EXTRACT(url, ':(\d+)') WHEN 'path' THEN REGEXP_EXTRACT(url, '://[^/]+(/.+?)(\?|#|$)') WHEN 'query' THEN REGEXP_EXTRACT(url, '\?([^#]+)') WHEN 'fragment' THEN REGEXP_EXTRACT(url, '#(.+)$') END)"}, - {DEFAULT_SCHEMA, "bitCount", {"num", nullptr}, {{nullptr, nullptr}}, R"(BIT_COUNT(num))"}, - // Dictionary Emulation using MAP VARIABLES - {DEFAULT_SCHEMA, "dictGet", {"dict","attr", nullptr}, {{nullptr, nullptr}}, R"(getvariable(dict)[attr][1])"}, - // -- End Macro - {nullptr, nullptr, {nullptr}, {{nullptr, nullptr}}, nullptr}}; - -// To 
add a new table SQL macro, add a new macro to this array! -// Copy and paste the top item in the array into the -// second-to-last position and make some modifications. -// (essentially, leave the last entry in the array as {nullptr, nullptr, {nullptr}, nullptr}) - -// Keep the DEFAULT_SCHEMA (no change needed) -// Replace "times_two_table" with a name for your macro -// If your function has parameters without default values, add their names in quotes inside of the {}, with a nullptr at the end -// If you do not have parameters, simplify to {nullptr} -// If your function has parameters with default values, add their names and values in quotes inside of {}'s inside of the {}. -// Be sure to keep {nullptr, nullptr} at the end -// If you do not have parameters with default values, simplify to {nullptr, nullptr} -// Add the text of your SQL macro as a raw string with the format R"( select 42; )" - -// clang-format off +namespace duckdb +{ + + // To add a new scalar SQL macro, add a new macro to this array! + // Copy and paste the top item in the array into the + // second-to-last position and make some modifications. + // (essentially, leave the last entry in the array as {nullptr, nullptr, {nullptr}, nullptr}) + + // Keep the DEFAULT_SCHEMA (no change needed) + // Replace "times_two" with a name for your macro + // If your function has parameters, add their names in quotes inside of the {}, with a nullptr at the end + // If you do not have parameters, simplify to {nullptr} + // Add the text of your SQL macro as a raw string with the format R"( select 42 )" + + static DefaultMacro chsql_macros[] = { + // -- Type conversion macros + {DEFAULT_SCHEMA, "toString", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS VARCHAR))"}, + {DEFAULT_SCHEMA, "toInt8", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT8))"}, + {DEFAULT_SCHEMA, "toInt16", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT16))"}, + {DEFAULT_SCHEMA, "toInt32", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT32))"}, + {DEFAULT_SCHEMA, "toInt64", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT64))"}, + {DEFAULT_SCHEMA, "toInt128", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT128))"}, + {DEFAULT_SCHEMA, "toInt256", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS HUGEINT))"}, + {DEFAULT_SCHEMA, "toInt8OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT8) IS NOT NULL THEN CAST(x AS INT8) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt16OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT16) IS NOT NULL THEN CAST(x AS INT16) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt32OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT32) IS NOT NULL THEN CAST(x AS INT32) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt64OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT64) IS NOT NULL THEN CAST(x AS INT64) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt128OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT128) IS NOT NULL THEN CAST(x AS INT128) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt256OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS HUGEINT) IS NOT NULL THEN CAST(x AS HUGEINT) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt8OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT8))"}, + {DEFAULT_SCHEMA, "toInt16OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT16))"}, + {DEFAULT_SCHEMA, "toInt32OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT32))"}, 
+ {DEFAULT_SCHEMA, "toInt64OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT64))"}, + {DEFAULT_SCHEMA, "toInt128OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT128))"}, + {DEFAULT_SCHEMA, "toInt256OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS HUGEINT))"}, + // -- Unsigned integer conversion macros + {DEFAULT_SCHEMA, "toUInt8", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UTINYINT))"}, + {DEFAULT_SCHEMA, "toUInt16", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS USMALLINT))"}, + {DEFAULT_SCHEMA, "toUInt32", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UINTEGER))"}, + {DEFAULT_SCHEMA, "toUInt64", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UBIGINT))"}, + {DEFAULT_SCHEMA, "toUInt8OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UTINYINT) IS NOT NULL THEN CAST(x AS UTINYINT) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toUInt16OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS USMALLINT) IS NOT NULL THEN CAST(x AS USMALLINT) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toUInt32OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UINTEGER) IS NOT NULL THEN CAST(x AS UINTEGER) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toUInt64OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UBIGINT) IS NOT NULL THEN CAST(x AS UBIGINT) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toUInt8OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UTINYINT))"}, // Fixed comma here + {DEFAULT_SCHEMA, "toUInt16OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS USMALLINT))"}, // And here + {DEFAULT_SCHEMA, "toUInt32OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UINTEGER))"}, // Also here + {DEFAULT_SCHEMA, "toUInt64OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UBIGINT))"}, // And here + // -- Floating-point conversion macros + {DEFAULT_SCHEMA, "toFloat", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS DOUBLE))"}, + {DEFAULT_SCHEMA, "toFloatOrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS DOUBLE))"}, + {DEFAULT_SCHEMA, "toFloatOrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS DOUBLE) IS NOT NULL THEN CAST(x AS DOUBLE) ELSE 0 END)"}, + // -- Arithmetic macros + {DEFAULT_SCHEMA, "intDiv", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((CAST(a AS BIGINT) // CAST(b AS BIGINT)))"}, + {DEFAULT_SCHEMA, "intDivOrNull", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST((TRY_CAST(a AS BIGINT) // TRY_CAST(b AS BIGINT)) AS BIGINT))"}, + {DEFAULT_SCHEMA, "intDivOZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(COALESCE((TRY_CAST((TRY_CAST(a AS BIGINT) // TRY_CAST(b AS BIGINT)) AS BIGINT)),0))"}, + {DEFAULT_SCHEMA, "plus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(add(a, b))"}, + {DEFAULT_SCHEMA, "minus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(subtract(a, b))"}, + {DEFAULT_SCHEMA, "modulo", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(CAST(a AS BIGINT) % CAST(b AS BIGINT))"}, + {DEFAULT_SCHEMA, "moduloOrZero", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(COALESCE(((TRY_CAST(a AS BIGINT) % TRY_CAST(b AS BIGINT))),0))"}, + // -- Tuple macros + {DEFAULT_SCHEMA, "tupleIntDiv", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] // CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleIntDivByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) // CAST(x AS BIGINT)))"}, + 
{DEFAULT_SCHEMA, "tupleDivide", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] / CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleMultiply", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> CAST(apply(b, x -> CAST(x AS BIGINT))[i] as BIGINT) * CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleMinus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] - CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tuplePlus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] + CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleMultiplyByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) * CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleDivideByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) / CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleModulo", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) % CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleModuloByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) % CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleConcat", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(list_concat(a, b))"}, + // -- String matching macros + {DEFAULT_SCHEMA, "match", {"string", "token"}, {{nullptr, nullptr}}, R"(string LIKE token)"}, + // -- Array macros + {DEFAULT_SCHEMA, "arrayExists", {"needle", "haystack", nullptr}, {{nullptr, nullptr}}, R"(haystack @> ARRAY[needle])"}, + {DEFAULT_SCHEMA, "arrayMap", {"x", "arr", nullptr}, {{nullptr, nullptr}}, R"(array_transform(arr, x))"}, + // Date and Time Functions + {DEFAULT_SCHEMA, "toYear", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(YEAR FROM date_expression))"}, + {DEFAULT_SCHEMA, "toMonth", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(MONTH FROM date_expression))"}, + {DEFAULT_SCHEMA, "toDayOfMonth", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(DAY FROM date_expression))"}, + {DEFAULT_SCHEMA, "toHour", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(HOUR FROM date_expression))"}, + {DEFAULT_SCHEMA, "toMinute", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(MINUTE FROM date_expression))"}, + {DEFAULT_SCHEMA, "toSecond", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(SECOND FROM date_expression))"}, + {DEFAULT_SCHEMA, "toYYYYMM", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m'))"}, + {DEFAULT_SCHEMA, "toYYYYMMDD", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m%d'))"}, + {DEFAULT_SCHEMA, "toYYYYMMDDhhmmss", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m%d%H%M%S'))"}, + {DEFAULT_SCHEMA, "formatDateTime", {"time", "format", "timezone", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN timezone IS NULL THEN strftime(time, format) ELSE strftime(time AT TIME ZONE timezone, format) END)"}, + // String Functions + {DEFAULT_SCHEMA, "empty", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str) = 0)"}, + {DEFAULT_SCHEMA, "notEmpty", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str) > 0)"}, + {DEFAULT_SCHEMA, "lengthUTF8", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str))"}, + 
{DEFAULT_SCHEMA, "leftPad", {"str", "length", "pad_str", nullptr}, {{nullptr, nullptr}}, R"(LPAD(str, length, pad_str))"}, + {DEFAULT_SCHEMA, "rightPad", {"str", "length", "pad_str", nullptr}, {{nullptr, nullptr}}, R"(RPAD(str, length, pad_str))"}, + {DEFAULT_SCHEMA, "extractAllGroups", {"text", "pattern", nullptr}, {{nullptr, nullptr}}, R"(regexp_extract_all(text, pattern))"}, + {DEFAULT_SCHEMA, "toFixedString", {"str", "length", nullptr}, {{nullptr, nullptr}}, R"(RPAD(LEFT(str, length), length, '\0'))"}, + {DEFAULT_SCHEMA, "ifNull", {"x", "y", nullptr}, {{nullptr, nullptr}}, R"(COALESCE(x, y))"}, + {DEFAULT_SCHEMA, "arrayJoin", {"arr", nullptr}, {{nullptr, nullptr}}, R"(UNNEST(arr))"}, + {DEFAULT_SCHEMA, "splitByChar", {"separator", "str", nullptr}, {{nullptr, nullptr}}, R"(string_split(str, separator))"}, + // URL Functions + {DEFAULT_SCHEMA, "protocol", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '^(\w+)://', 1))"}, + {DEFAULT_SCHEMA, "domain", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '://([^/]+)', 1))"}, + {DEFAULT_SCHEMA, "topLevelDomain", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '\.([^./:]+)([:/]|$)', 1))"}, + {DEFAULT_SCHEMA, "path", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '://[^/]+(/.*)', 1))"}, + // IP Address Functions + {DEFAULT_SCHEMA, "IPv4NumToString", {"num", nullptr}, {{nullptr, nullptr}}, R"(CONCAT(CAST((num >> 24) & 255 AS VARCHAR), '.', CAST((num >> 16) & 255 AS VARCHAR), '.', CAST((num >> 8) & 255 AS VARCHAR), '.', CAST(num & 255 AS VARCHAR)))"}, + {DEFAULT_SCHEMA, "IPv4StringToNum", {"ip", nullptr}, {{nullptr, nullptr}}, R"(CAST(SPLIT_PART(ip, '.', 1) AS INTEGER) * 256 * 256 * 256 + CAST(SPLIT_PART(ip, '.', 2) AS INTEGER) * 256 * 256 + CAST(SPLIT_PART(ip, '.', 3) AS INTEGER) * 256 + CAST(SPLIT_PART(ip, '.', 4) AS INTEGER))"}, + // -- JSON Macros + {DEFAULT_SCHEMA, "JSONExtract", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key))"}, + {DEFAULT_SCHEMA, "JSONExtractString", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS VARCHAR))"}, + {DEFAULT_SCHEMA, "JSONExtractUInt", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS UINTEGER))"}, + {DEFAULT_SCHEMA, "JSONExtractInt", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS INT32))"}, + {DEFAULT_SCHEMA, "JSONExtractFloat", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS DOUBLE))"}, + {DEFAULT_SCHEMA, "JSONExtractRaw", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key))"}, + {DEFAULT_SCHEMA, "JSONHas", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key) IS NOT NULL)"}, + {DEFAULT_SCHEMA, "JSONLength", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_array_length(json))"}, + {DEFAULT_SCHEMA, "JSONType", {"json", "path", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN path IS NULL THEN json_each(json) ELSE json_each(json,path) END)"}, + {DEFAULT_SCHEMA, "JSONExtractKeys", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_object_keys(json))"}, + {DEFAULT_SCHEMA, "JSONExtractValues", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_each_text(json))"}, + // -- Compare Macros + {DEFAULT_SCHEMA, "equals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a = b))"}, + {DEFAULT_SCHEMA, "notEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a <> b))"}, + {DEFAULT_SCHEMA, "less", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a < b))"}, + 
{DEFAULT_SCHEMA, "greater", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a > b))"}, + {DEFAULT_SCHEMA, "lessOrEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a <= b))"}, + {DEFAULT_SCHEMA, "greaterOrEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a >= b))"}, + // -- Misc macros + {DEFAULT_SCHEMA, "generateUUIDv4", {nullptr}, {{nullptr, nullptr}}, R"(toString(uuid()))"}, + {DEFAULT_SCHEMA, "parseURL", {"url", "part", nullptr}, {{nullptr, nullptr}}, R"(CASE part WHEN 'protocol' THEN REGEXP_EXTRACT(url, '^(\w+)://') WHEN 'domain' THEN REGEXP_EXTRACT(url, '://([^/:]+)') WHEN 'port' THEN REGEXP_EXTRACT(url, ':(\d+)') WHEN 'path' THEN REGEXP_EXTRACT(url, '://[^/]+(/.+?)(\?|#|$)') WHEN 'query' THEN REGEXP_EXTRACT(url, '\?([^#]+)') WHEN 'fragment' THEN REGEXP_EXTRACT(url, '#(.+)$') END)"}, + {DEFAULT_SCHEMA, "bitCount", {"num", nullptr}, {{nullptr, nullptr}}, R"(BIT_COUNT(num))"}, + // Dictionary Emulation using MAP VARIABLES + {DEFAULT_SCHEMA, "dictGet", {"dict", "attr", nullptr}, {{nullptr, nullptr}}, R"(getvariable(dict)[attr][1])"}, + // -- End Macro + {nullptr, nullptr, {nullptr}, {{nullptr, nullptr}}, nullptr}}; + + // To add a new table SQL macro, add a new macro to this array! + // Copy and paste the top item in the array into the + // second-to-last position and make some modifications. + // (essentially, leave the last entry in the array as {nullptr, nullptr, {nullptr}, nullptr}) + + // Keep the DEFAULT_SCHEMA (no change needed) + // Replace "times_two_table" with a name for your macro + // If your function has parameters without default values, add their names in quotes inside of the {}, with a nullptr at the end + // If you do not have parameters, simplify to {nullptr} + // If your function has parameters with default values, add their names and values in quotes inside of {}'s inside of the {}. 
+ // Be sure to keep {nullptr, nullptr} at the end + // If you do not have parameters with default values, simplify to {nullptr, nullptr} + // Add the text of your SQL macro as a raw string with the format R"( select 42; )" + + // clang-format off static const DefaultTableMacro chsql_table_macros[] = { {DEFAULT_SCHEMA, "numbers", {"x", nullptr}, {{"z", "0"}, {nullptr, nullptr}}, R"(SELECT * as number FROM generate_series(z,x-1);)"}, {DEFAULT_SCHEMA, "url", {"url", "format", nullptr}, {{nullptr, nullptr}}, R"(WITH "JSON" as (SELECT * FROM read_json_auto(url)), "PARQUET" as (SELECT * FROM read_parquet(url)), "CSV" as (SELECT * FROM read_csv_auto(url)), "BLOB" as (SELECT * FROM read_blob(url)), "TEXT" as (SELECT * FROM read_text(url)) FROM query_table(format))"}, @@ -185,49 +184,23 @@ static const DefaultTableMacro chsql_table_macros[] = { )"}, {nullptr, nullptr, {nullptr}, {{nullptr, nullptr}}, nullptr} }; -// clang-format on - -inline void ChSqlScalarFun(DataChunk &args, ExpressionState &state, Vector &result) { - auto &name_vector = args.data[0]; - UnaryExecutor::Execute<string_t, string_t>( - name_vector, result, args.size(), - [&](string_t name) { - return StringVector::AddString(result, "ChSql "+name.GetString()+" 🐥");; - }); -} - -inline void ChSqlOpenSSLVersionScalarFun(DataChunk &args, ExpressionState &state, Vector &result) { - auto &name_vector = args.data[0]; - UnaryExecutor::Execute<string_t, string_t>( - name_vector, result, args.size(), - [&](string_t name) { - return StringVector::AddString(result, "ChSql " + name.GetString() + ", my linked OpenSSL version is " + OPENSSL_VERSION_TEXT );; - }); -} - -static void LoadInternal(DatabaseInstance &instance) { - // Register a scalar function - auto chsql_scalar_function = ScalarFunction("chsql", {LogicalType::VARCHAR}, LogicalType::VARCHAR, ChSqlScalarFun); - ExtensionUtil::RegisterFunction(instance, chsql_scalar_function); - - // Register another scalar function - auto chsql_openssl_version_scalar_function = ScalarFunction("chsql_openssl_version", {LogicalType::VARCHAR}, LogicalType::VARCHAR, ChSqlOpenSSLVersionScalarFun); - ExtensionUtil::RegisterFunction(instance, chsql_openssl_version_scalar_function); + // clang-format on + static void LoadInternal(DatabaseInstance &instance) + { // Macros - for (idx_t index = 0; chsql_macros[index].name != nullptr; index++) { - auto info = DefaultFunctionGenerator::CreateInternalMacroInfo(chsql_macros[index]); - ExtensionUtil::RegisterFunction(instance, *info); - } + for (idx_t index = 0; chsql_macros[index].name != nullptr; index++) + { + auto info = DefaultFunctionGenerator::CreateInternalMacroInfo(chsql_macros[index]); + ExtensionUtil::RegisterFunction(instance, *info); + } // Table Macros - for (idx_t index = 0; chsql_table_macros[index].name != nullptr; index++) { - auto table_info = DefaultTableFunctionGenerator::CreateTableMacroInfo(chsql_table_macros[index]); - ExtensionUtil::RegisterFunction(instance, *table_info); - } - ExtensionUtil::RegisterFunction(instance, ReadParquetOrderedFunction()); + for (idx_t index = 0; chsql_table_macros[index].name != nullptr; index++) + { + auto table_info = DefaultTableFunctionGenerator::CreateTableMacroInfo(chsql_table_macros[index]); + ExtensionUtil::RegisterFunction(instance, *table_info); + } + ExtensionUtil::RegisterFunction(instance, ReadParquetOrderedFunction()); // Flock ExtensionUtil::RegisterFunction(instance, DuckFlockTableFunction()); // System Table @@ -237,36 +210,41 @@ static void LoadInternal(DatabaseInstance &instance) { con.BeginTransaction(); CreateSystemViews(con); con.Commit(); - -} - -void ChsqlExtension::Load(DuckDB &db) { - LoadInternal(*db.instance); -} -std::string ChsqlExtension::Name() { - return "chsql"; -} - -std::string ChsqlExtension::Version() const { + } + + void ChsqlExtension::Load(DuckDB &db) + { + LoadInternal(*db.instance); + } + std::string ChsqlExtension::Name() + { + return "chsql"; + } + + std::string ChsqlExtension::Version() const + { #ifdef EXT_VERSION_CHSQL - return EXT_VERSION_CHSQL; + return EXT_VERSION_CHSQL; #else - return ""; + return ""; #endif -} + } } // namespace duckdb -extern "C" { +extern "C" +{ -DUCKDB_EXTENSION_API void chsql_init(duckdb::DatabaseInstance &db) { + DUCKDB_EXTENSION_API void chsql_init(duckdb::DatabaseInstance &db) + { duckdb::DuckDB db_wrapper(db); db_wrapper.LoadExtension<duckdb::ChsqlExtension>(); -} + } -DUCKDB_EXTENSION_API const char *chsql_version() { - return duckdb::DuckDB::LibraryVersion(); -} + DUCKDB_EXTENSION_API const char *chsql_version() + { + return duckdb::DuckDB::LibraryVersion(); + } } #ifndef DUCKDB_EXTENSION_MAIN
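Reviewer note: the whole macro mechanism above reduces to one loop — each array row is turned into a CreateMacroInfo and registered in the catalog. A minimal sketch of what a single chsql_macros row goes through (assumes DuckDB's usual header layout; RegisterOneMacro is a hypothetical helper, not part of this patch):

```cpp
// Hypothetical sketch mirroring the LoadInternal() loop above.
#include "duckdb.hpp"
#include "duckdb/catalog/default/default_functions.hpp"
#include "duckdb/main/extension_util.hpp"

using namespace duckdb;

static void RegisterOneMacro(DatabaseInstance &instance) {
	// One row of chsql_macros: toString(x) expands to CAST(x AS VARCHAR).
	DefaultMacro macro = {DEFAULT_SCHEMA, "toString", {"x", nullptr}, {{nullptr, nullptr}},
	                      R"(CAST(x AS VARCHAR))"};
	auto info = DefaultFunctionGenerator::CreateInternalMacroInfo(macro);
	ExtensionUtil::RegisterFunction(instance, *info);
}
```

Once registered, the entry behaves like any SQL macro, so `SELECT toString(123)` is expanded at bind time rather than dispatched to a compiled function.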
diff --git a/chsql/src/duck_flock.cpp b/chsql/src/duck_flock.cpp index 79a821d..b93eaaa 100644 --- a/chsql/src/duck_flock.cpp +++ b/chsql/src/duck_flock.cpp @@ -104,7 +104,7 @@ namespace duckdb { try { if (res->TryFetch(data_chunk, error_data)) { - if (data_chunk && !data_chunk->size() == 0) { + if (data_chunk && data_chunk->size() != 0) { output.Append(*data_chunk); return; }
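Reviewer note: the duck_flock.cpp hunk fixes an operator-precedence trap. `!` binds tighter than `==`, so `!data_chunk->size() == 0` parses as `(!data_chunk->size()) == 0`. That form happens to be logically equivalent to `size() != 0`, which is why it never misbehaved, but it reads like a mistake and modern compilers warn about it. A standalone illustration (not part of the patch):

```cpp
// Demonstrates the precedence pitfall fixed in duck_flock.cpp.
#include <cassert>
#include <cstddef>
#include <initializer_list>

int main() {
	for (std::size_t size : {0u, 1u, 42u}) {
		bool old_form = !size == 0;   // parses as (!size) == 0
		bool new_form = size != 0;    // states the intent directly
		assert(old_form == new_form); // accidentally equivalent, but unreadable
	}
	return 0;
}
```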
diff --git a/chsql/src/include/chsql_extension.hpp b/chsql/src/include/chsql_extension.hpp index b1c7a5e..50440bf 100644 --- a/chsql/src/include/chsql_extension.hpp +++ b/chsql/src/include/chsql_extension.hpp @@ -2,17 +2,18 @@ #include "duckdb.hpp" -namespace duckdb { +namespace duckdb +{ -class ChsqlExtension : public Extension { -public: - void Load(DuckDB &db) override; - std::string Name() override; - std::string Version() const override; -}; -duckdb::TableFunction ReadParquetOrderedFunction(); -static void RegisterSillyBTreeStore(DatabaseInstance &instance); + class ChsqlExtension : public Extension + { + public: + void Load(DuckDB &db) override; + std::string Name() override; + std::string Version() const override; + }; + duckdb::TableFunction ReadParquetOrderedFunction(); -TableFunction DuckFlockTableFunction(); + TableFunction DuckFlockTableFunction(); } // namespace duckdb diff --git a/chsql/src/include/chsql_parquet_types.h b/chsql/src/include/chsql_parquet_types.h new file mode 100644 index 0000000..0e44d11 --- /dev/null +++ b/chsql/src/include/chsql_parquet_types.h @@ -0,0 +1,55 @@ +// +// Created by hromozeka on 10.03.25. +// + +#ifndef PARQUET_TYPES_H +#define PARQUET_TYPES_H + + +#include "duckdb.hpp" +#include <mutex> + +struct ParquetType { + /*duckdb_parquet::ConvertedType::type -> replaced to int to support -1 nodata value*/ + int converted_type; + /* duckdb_parquet::Type::type -> replaced to int to support -1 for no matter value */ + int parquet_type; + const duckdb::LogicalType logical_type; + ParquetType(int converted_type, int parquet_type, const duckdb::LogicalType &logical_type) : converted_type(converted_type), parquet_type(parquet_type), logical_type(logical_type) {} + virtual bool check_type(const duckdb::vector<duckdb_parquet::SchemaElement> &schema, idx_t idx); + virtual duckdb::LogicalType get_logical_type(const duckdb_parquet::SchemaElement &schema); +}; + +struct LogicalParquetType : public ParquetType { + bool (*get_isset)(const duckdb_parquet::SchemaElement& el); + + LogicalParquetType(bool (*get_isset) (const duckdb_parquet::SchemaElement& el), const duckdb::LogicalType& logical_type) : ParquetType(-1, duckdb_parquet::Type::type::INT32, logical_type), get_isset(get_isset) {} + bool check_type(const duckdb::vector<duckdb_parquet::SchemaElement> &schema, idx_t idx) override; +}; + +struct JSONParquetType : public ParquetType { + JSONParquetType(): ParquetType(duckdb_parquet::ConvertedType::JSON, -1, duckdb::LogicalType::SQLNULL) {} + duckdb::LogicalType get_logical_type(const duckdb_parquet::SchemaElement &schema) override; +}; + +struct DecimalParquetType : public ParquetType { + DecimalParquetType(): ParquetType(-1, duckdb_parquet::Type::type::INT32, duckdb::LogicalType::SQLNULL) {} + bool check_type(const duckdb::vector<duckdb_parquet::SchemaElement> &schema, idx_t idx) override; + duckdb::LogicalType get_logical_type(const duckdb_parquet::SchemaElement &schema) override; +}; + +class ParquetTypesManager { + protected: + static ParquetTypesManager *instance; + static std::mutex instance_mutex; + ParquetTypesManager(); + static ParquetTypesManager* get_instance(); + duckdb::LogicalType derive_logical_type(const duckdb_parquet::SchemaElement &s_ele, bool binary_as_string); + public: + static duckdb::LogicalType get_logical_type(const duckdb::vector<duckdb_parquet::SchemaElement> &schema, idx_t idx); +}; + +#endif //PARQUET_TYPES_H
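Reviewer note: the header above defines a small rule table. `check_type()` decides whether a rule matches a given Parquet schema element, `get_logical_type()` yields the DuckDB type for it, and resolution is first-match-wins over the `_types` array defined in parquet_types.cpp (logical-type rules are listed before converted-type rules and bare physical-type fallbacks). A sketch of the call side, assuming only this header — `ResolveLeafType` is a hypothetical helper; the real call site is `GetColumnsFromParquetSchemas()` in parquet_ordered_scan.cpp:

```cpp
// Hypothetical caller of the type-mapping API declared above.
#include "chsql_parquet_types.h"

duckdb::LogicalType ResolveLeafType(const duckdb::vector<duckdb_parquet::SchemaElement> &schema,
                                    idx_t idx) {
	// Walks the registered rules in order and returns the first match;
	// throws std::runtime_error("Unsupported Parquet type") when none fits.
	return ParquetTypesManager::get_logical_type(schema, idx);
}
```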
diff --git a/chsql/src/parquet_ordered_scan.cpp b/chsql/src/parquet_ordered_scan.cpp index 88585c4..81dd069 100644 --- a/chsql/src/parquet_ordered_scan.cpp +++ b/chsql/src/parquet_ordered_scan.cpp @@ -2,25 +2,78 @@ #include "duckdb/common/exception.hpp" #include  #include "chsql_extension.hpp" -#include  +#include  +#include "chsql_parquet_types.h" namespace duckdb { + struct ReturnColumn { + string name; + LogicalType type; + }; + struct ReaderSet { unique_ptr<ParquetReader> reader; - idx_t orderByIdx; + vector<ReturnColumn> returnColumns; + int64_t orderByIdx; unique_ptr<DataChunk> chunk; unique_ptr<ParquetReaderScanState> scanState; - vector<idx_t> columnMap; + vector<int64_t> columnMap; idx_t result_idx; + bool haveAbsentColumns; + void populateColumnInfo(const vector<ReturnColumn>& returnCols, const string& order_by_column) { + this->returnColumns = returnCols; + columnMap.clear(); + haveAbsentColumns = false; + for (auto it = returnCols.begin(); it!= returnCols.end(); ++it) { + auto schema_column = find_if( + reader->metadata->metadata->schema.begin(), + reader->metadata->metadata->schema.end(), + [&](const SchemaElement& column) { return column.name == it->name; }); + if (schema_column == reader->metadata->metadata->schema.end()) { + columnMap.push_back(-1); + haveAbsentColumns = true; + continue; + } + columnMap.push_back(static_cast<int64_t>(schema_column - reader->metadata->metadata->schema.begin() - 1)); + reader->column_ids.push_back( + MultiFileLocalColumnId(static_cast<idx_t>(schema_column - reader->metadata->metadata->schema.begin() - 1))); + reader->column_indexes.emplace_back(static_cast<idx_t>(it - returnCols.begin())); + } + auto order_by_column_it = find_if( + reader->metadata->metadata->schema.begin(), + reader->metadata->metadata->schema.end(), + [&](const SchemaElement& column) { return column.name == order_by_column; }); + if (order_by_column_it == reader->metadata->metadata->schema.end()) { + orderByIdx = -1; + } else { + orderByIdx = find_if(columnMap.begin(), columnMap.end(), + [&](const int64_t& i) { return i == (order_by_column_it - reader->metadata->metadata->schema.begin() - 1); }) - + columnMap.begin(); + } + } + void Scan(ClientContext& ctx) { + chunk->Reset(); + reader->Scan(ctx, *scanState, *chunk); + if (!haveAbsentColumns || chunk->size() == 0) { + return; + } + for (auto it = columnMap.begin(); it!=columnMap.end(); ++it) { + if (*it != -1) { + continue; + } + chunk->data[it - columnMap.begin()].Initialize(false, chunk->size()); + for (idx_t j = 0; j < chunk->size(); j++) { + chunk->data[it - columnMap.begin()].SetValue(j, Value()); + } + } + } }; struct OrderedReadFunctionData : FunctionData { string orderBy; - vector<unique_ptr<ReaderSet>> sets; vector<string> files; - vector<LogicalType> returnTypes; - vector<string> names; + vector<ReturnColumn> returnCols; unique_ptr<FunctionData> Copy() const override { throw std::runtime_error("not implemented"); } @@ -44,9 +97,15 @@ namespace duckdb { }; }; + bool lt(const Value &left, const Value &right) { + return left.IsNull() || (!right.IsNull() && left < right); + } + bool le(const Value &left, const Value &right) { + return left.IsNull() || (!right.IsNull() && left <= right); + } - struct OrderedReadLocalState: LocalTableFunctionState { + struct OrderedReadLocalState: LocalTableFunctionState { vector<unique_ptr<ReaderSet>> sets; vector<idx_t> winner_group; void RecalculateWinnerGroup() { @@ -55,11 +114,17 @@ namespace duckdb { return; } idx_t winner_idx = 0; + auto first_unordered = std::find_if(sets.begin(), sets.end(), + [&](const unique_ptr<ReaderSet> &s) { return s->orderByIdx == -1; }); + if (first_unordered != sets.end()) { + winner_group.push_back(first_unordered - sets.begin()); + return; + } for (idx_t i = 1; i < sets.size(); i++) { const auto &s = sets[i]; const auto &w = sets[winner_idx]; - if (s->chunk->GetValue(s->orderByIdx, s->result_idx) < - w->chunk->GetValue(w->orderByIdx, w->result_idx)) { + if (lt(s->chunk->GetValue(s->orderByIdx, s->result_idx), + w->chunk->GetValue(w->orderByIdx, w->result_idx))) { winner_idx = i; } } @@ -70,7 +135,7 @@ namespace duckdb { if (i == winner_idx) continue; auto &s = sets[i]; const auto &sFirst = s->chunk->GetValue(s->orderByIdx, s->result_idx); - if (sFirst <= wLast) { + if (le(sFirst, wLast)) { winner_group.push_back(i); } } @@ -83,83 +148,71 @@ namespace duckdb { } }; + static vector<ReturnColumn> GetColumnsFromParquetSchemas(const vector<unique_ptr<ReaderSet>>& sets) { + vector<ReturnColumn> result; + for (auto &set : sets) { + const auto &schema = set->reader->metadata->metadata->schema; + for (auto it = schema.begin(); it != schema.end(); ++it) { + if (it->num_children > 0) { + continue; + } + auto type = ParquetTypesManager::get_logical_type(schema, it - schema.begin()); + auto existing_col = std::find_if(result.begin(), result.end(), + [it](const ReturnColumn &c) { return c.name == it->name; }); + if (existing_col == result.end()) { + result.push_back(ReturnColumn{it->name, type}); + continue; + } + if (existing_col->type != type) { + throw std::runtime_error("the files have incompatible schema"); + } + } + } + return result; + } + + + static void
OpenParquetFiles(ClientContext &context, const vector<string>& fileNames, + vector<unique_ptr<ReaderSet>>& res) { + for (auto & file : fileNames) { + auto set = make_uniq<ReaderSet>(); + ParquetOptions po; + po.binary_as_string = true; + set->reader = make_uniq<ParquetReader>(context, file, po, nullptr); + res.push_back(std::move(set)); + } + } static unique_ptr<FunctionData> OrderedParquetScanBind(ClientContext &context, TableFunctionBindInput &input, vector<LogicalType> &return_types, vector<string> &names) { Connection conn(*context.db); auto res = make_uniq<OrderedReadFunctionData>(); auto files = ListValue::GetChildren(input.inputs[0]); - vector<string> fileNames; + vector<OpenFileInfo> fileInfoList; for (auto & file : files) { - fileNames.push_back(file.ToString()); + fileInfoList.emplace_back(file.ToString()); } - GlobMultiFileList fileList(context, fileNames, FileGlobOptions::ALLOW_EMPTY); - string filename; + GlobMultiFileList fileList(context, fileInfoList, FileGlobOptions::ALLOW_EMPTY); + OpenFileInfo file_info; MultiFileListScanData it; fileList.InitializeScan(it); - vector<string> unglobbedFileList; - while (fileList.Scan(it, filename)) { - unglobbedFileList.push_back(filename); + while (fileList.Scan(it, file_info)) { + res->files.push_back(file_info.path); } - if (unglobbedFileList.empty()) { - throw duckdb::InvalidInputException("No files matched the provided pattern."); + if (res->files.empty()) { + throw InvalidInputException("No files matched the provided pattern."); } + vector<unique_ptr<ReaderSet>> sets; + OpenParquetFiles(context, res->files, sets); + + res->returnCols = GetColumnsFromParquetSchemas(sets); + std::transform(res->returnCols.begin(), res->returnCols.end(), std::back_inserter(names), + [](const ReturnColumn &c) { return c.name; }); + std::transform(res->returnCols.begin(), res->returnCols.end(), std::back_inserter(return_types), + [](const ReturnColumn &c) { return c.type; }); + res->orderBy = input.inputs[1].GetValue<string>(); - for (auto & file : unglobbedFileList) { - auto set = make_uniq<ReaderSet>(); - res->files.push_back(file); - ParquetOptions po; - po.binary_as_string = true; - ParquetReader reader(context, file, po, nullptr); - set->columnMap = vector<idx_t>(); - for (auto &el : reader.metadata->metadata->schema) { - if (el.num_children != 0) { - continue; - } - auto name_it = std::find(names.begin(), names.end(), el.name); - auto return_type = LogicalType::ANY; - switch (el.type) { - case Type::INT32: - return_type = LogicalType::INTEGER; - break; - case Type::INT64: - return_type = LogicalType::BIGINT; - break; - case Type::DOUBLE: - return_type = LogicalType::DOUBLE; - break; - case Type::FLOAT: - return_type = LogicalType::FLOAT; - break; - case Type::BYTE_ARRAY: - return_type = LogicalType::VARCHAR; - case Type::FIXED_LEN_BYTE_ARRAY: - return_type = LogicalType::VARCHAR; - break; - case Type::BOOLEAN: - return_type = LogicalType::TINYINT; - break; - default: - break;; - } - set->columnMap.push_back(name_it - names.begin()); - if (el.name == res->orderBy) { - set->orderByIdx = name_it - names.begin(); - } - if (name_it != names.end()) { - if (return_types[name_it - names.begin()] != return_type) { - throw std::runtime_error("incompatible schema"); - } - continue; - } - return_types.push_back(return_type); - names.push_back(el.name); - } - res->sets.push_back(std::move(set)); - } - res->returnTypes = return_types; - res->names = names; return std::move(res); } @@ -167,38 +220,23 @@ ParquetScanInitLocal(ExecutionContext &context, TableFunctionInitInput &input, GlobalTableFunctionState *gstate_p) { auto res = make_uniq<OrderedReadLocalState>(); const auto &bindData = input.bind_data->Cast<OrderedReadFunctionData>(); - ParquetOptions po; - po.binary_as_string =
true; - for (int i = 0; i < bindData.files.size(); i++) { - auto set = make_uniq<ReaderSet>(); - set->reader = make_uniq<ParquetReader>(context.client, bindData.files[i], po, nullptr); + OpenParquetFiles(context.client, bindData.files, res->sets); + + for (auto &set : res->sets) { + set->populateColumnInfo(bindData.returnCols, bindData.orderBy); set->scanState = make_uniq<ParquetReaderScanState>(); - int j = 0; - for (auto &el : set->reader->metadata->metadata->schema) { - if (el.num_children != 0) { - continue; - } - set->reader->reader_data.column_ids.push_back(j); - j++; - } - set->columnMap = bindData.sets[i]->columnMap; - set->reader->reader_data.column_mapping = set->columnMap; vector<idx_t> rgs(set->reader->metadata->metadata->row_groups.size(), 0); for (idx_t i = 0; i < rgs.size(); i++) { rgs[i] = i; } set->reader->InitializeScan(context.client, *set->scanState, rgs); set->chunk = make_uniq<DataChunk>(); - - set->orderByIdx = bindData.sets[i]->orderByIdx; set->result_idx = 0; auto ltypes = vector<LogicalType>(); - for (const auto idx : set->columnMap) { - ltypes.push_back(bindData.returnTypes[idx]); - } + std::transform(bindData.returnCols.begin(), bindData.returnCols.end(), std::back_inserter(ltypes), + [](const ReturnColumn &c) { return c.type; }); set->chunk->Initialize(context.client, ltypes); - set->reader->Scan(*set->scanState, *set->chunk); - res->sets.push_back(std::move(set)); + set->Scan(context.client); } res->RecalculateWinnerGroup(); return std::move(res); @@ -207,16 +245,13 @@ static void ParquetOrderedScanImplementation( ClientContext &context, duckdb::TableFunctionInput &data_p,DataChunk &output) { auto &loc_state = data_p.local_state->Cast<OrderedReadLocalState>(); - const auto &fieldNames = data_p.bind_data->Cast<OrderedReadFunctionData>().names; - const auto &returnTypes = data_p.bind_data->Cast<OrderedReadFunctionData>().returnTypes; + const auto &cols = data_p.bind_data->Cast<OrderedReadFunctionData>().returnCols; bool toRecalc = false; for (int i = loc_state.sets.size() - 1; i >= 0 ; i--) { if (loc_state.sets[i]->result_idx >= loc_state.sets[i]->chunk->size()) { auto &set = loc_state.sets[i]; set->chunk->Reset(); - loc_state.sets[i]->reader->Scan( - *loc_state.sets[i]->scanState, - *loc_state.sets[i]->chunk); + loc_state.sets[i]->Scan(context); loc_state.sets[i]->result_idx = 0; if (loc_state.sets[i]->chunk->size() == 0) { @@ -247,18 +282,17 @@ auto winnerSet = &loc_state.sets[loc_state.winner_group[0]]; Value winner_val = (*winnerSet)->chunk->GetValue( (*winnerSet)->orderByIdx, - (*winnerSet)->result_idx - ); + (*winnerSet)->result_idx); for (int k = 1; k < loc_state.winner_group.size(); k++) { const auto i = loc_state.winner_group[k]; const auto &set = loc_state.sets[i]; const Value &val = set->chunk->GetValue(set->orderByIdx, set->result_idx); - if (val < winner_val) { + if (lt(val, winner_val)) { winnerSet = &loc_state.sets[i]; winner_val = (*winnerSet)->chunk->GetValue(set->orderByIdx, set->result_idx); } } - for (int i = 0; i < fieldNames.size(); i++) { + for (int i = 0; i < cols.size(); i++) { const auto &val = (*winnerSet)->chunk->GetValue(i,(*winnerSet)->result_idx); output.SetValue(i, j, val); }
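Reviewer note: the ordered scan is a k-way merge with one reader per file. Each step picks the reader whose current order-by value is smallest, using the null-aware lt()/le() helpers so that NULLs — including the NULL padding generated for columns absent from a file — sort first instead of stalling the merge. A standalone sketch of the comparator contract (mirrors the patch's lt(); not part of the patch itself):

```cpp
// Mirrors the patch's lt() helper: NULL compares before any real value.
#include "duckdb.hpp"

static bool lt(const duckdb::Value &left, const duckdb::Value &right) {
	return left.IsNull() || (!right.IsNull() && left < right);
}

int main() {
	duckdb::Value null_value;                     // default-constructed: SQL NULL
	duckdb::Value one = duckdb::Value::BIGINT(1);
	bool nulls_first = lt(null_value, one);       // true: NULL sorts first
	bool value_after_null = lt(one, null_value);  // false
	return (nulls_first && !value_after_null) ? 0 : 1;
}
```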
diff --git a/chsql/src/parquet_types.cpp b/chsql/src/parquet_types.cpp
new file mode 100644
index 0000000..efdf9ca
--- /dev/null
+++ b/chsql/src/parquet_types.cpp
@@ -0,0 +1,139 @@
+#include "chsql_parquet_types.h"
+
+bool ParquetType::check_type(const duckdb::vector<duckdb_parquet::SchemaElement> &schema, idx_t idx) {
+	auto &el = schema[idx];
+	if (parquet_type >= 0 && int(el.type) != parquet_type) {
+		return false;
+	}
+	if (converted_type == -1) {
+		return !el.__isset.converted_type;
+	}
+	return int(el.converted_type) == converted_type;
+};
+
+duckdb::LogicalType ParquetType::get_logical_type(const duckdb_parquet::SchemaElement &schema) {
+	return logical_type;
+}
+
+bool LogicalParquetType::check_type(const duckdb::vector<duckdb_parquet::SchemaElement> &schema, idx_t idx) {
+	auto &el = schema[idx];
+	return el.__isset.logicalType && this->get_isset(el);
+}
+
+duckdb::LogicalType JSONParquetType::get_logical_type(const duckdb_parquet::SchemaElement &schema) {
+	return duckdb::LogicalType::JSON();
+}
+
+bool DecimalParquetType::check_type(const duckdb::vector<duckdb_parquet::SchemaElement> &schema, idx_t idx) {
+	auto &el = schema[idx];
+	return el.__isset.converted_type && el.converted_type == duckdb_parquet::ConvertedType::DECIMAL &&
+	       el.__isset.precision && el.__isset.scale && (el.precision > duckdb::DecimalType::MaxWidth() ||
+	       el.type == duckdb_parquet::Type::BYTE_ARRAY ||
+	       el.type == duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY ||
+	       el.type == duckdb_parquet::Type::INT32 ||
+	       el.type == duckdb_parquet::Type::INT64);
+}
+
+duckdb::LogicalType DecimalParquetType::get_logical_type(const duckdb_parquet::SchemaElement &el) {
+	if (el.precision > duckdb::DecimalType::MaxWidth()) {
+		return duckdb::LogicalType::DOUBLE;
+	}
+	return duckdb::LogicalType::DECIMAL(el.precision, el.scale);
+}
+
+bool isUUID(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.UUID;
+}
+
+bool isTimestampTZ(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIMESTAMP && el.logicalType.TIMESTAMP.isAdjustedToUTC;
+}
+
+bool isTimestampNS(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIMESTAMP && el.logicalType.TIMESTAMP.unit.__isset.NANOS;
+}
+
+bool isTimestamp(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIMESTAMP;
+}
+
+bool isTimeTZ(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIME && el.logicalType.TIME.isAdjustedToUTC;
+}
+
+bool isTime(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIME;
+}
+
+ParquetType *_types[] = {
+	new LogicalParquetType(isUUID, duckdb::LogicalType::UUID),
+	new LogicalParquetType(isTimestampTZ, duckdb::LogicalType::TIMESTAMP_TZ),
+	new LogicalParquetType(isTimestampNS, duckdb::LogicalType::TIMESTAMP_NS),
+	new LogicalParquetType(isTimestamp, duckdb::LogicalType::TIMESTAMP),
+	new LogicalParquetType(isTimeTZ, duckdb::LogicalType::TIME),
+	new LogicalParquetType(isTime, duckdb::LogicalType::TIME),
+
+	new ParquetType(24, -1, duckdb::LogicalType::SQLNULL),
+	new ParquetType(duckdb_parquet::ConvertedType::INT_8, duckdb_parquet::Type::INT32, duckdb::LogicalType::TINYINT),
+	new ParquetType(duckdb_parquet::ConvertedType::INT_16, duckdb_parquet::Type::INT32, duckdb::LogicalType::SMALLINT),
+	new ParquetType(duckdb_parquet::ConvertedType::INT_32, duckdb_parquet::Type::INT32, duckdb::LogicalType::INTEGER),
+	new ParquetType(duckdb_parquet::ConvertedType::INT_64, duckdb_parquet::Type::INT64, duckdb::LogicalType::BIGINT),
+	new ParquetType(duckdb_parquet::ConvertedType::UINT_8, duckdb_parquet::Type::INT32, duckdb::LogicalType::UTINYINT),
+	new ParquetType(duckdb_parquet::ConvertedType::UINT_16, duckdb_parquet::Type::INT32,
+	                duckdb::LogicalType::USMALLINT),
+	new ParquetType(duckdb_parquet::ConvertedType::UINT_32, duckdb_parquet::Type::INT32, duckdb::LogicalType::UINTEGER),
+	new ParquetType(duckdb_parquet::ConvertedType::UINT_64, duckdb_parquet::Type::INT64, duckdb::LogicalType::UBIGINT),
+	new ParquetType(duckdb_parquet::ConvertedType::DATE, duckdb_parquet::Type::INT32, duckdb::LogicalType::DATE),
+	new ParquetType(duckdb_parquet::ConvertedType::TIME_MICROS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIME),
+	new ParquetType(duckdb_parquet::ConvertedType::TIME_MILLIS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIME),
+	new ParquetType(duckdb_parquet::ConvertedType::TIMESTAMP_MILLIS, duckdb_parquet::Type::INT32,
+	                duckdb::LogicalType::TIME),
+	new ParquetType(duckdb_parquet::ConvertedType::TIMESTAMP_MICROS, duckdb_parquet::Type::INT64,
+	                duckdb::LogicalType::TIME),
+	new ParquetType(duckdb_parquet::ConvertedType::INTERVAL, -1, duckdb::LogicalType::INTERVAL),
+	new ParquetType(duckdb_parquet::ConvertedType::UTF8, duckdb_parquet::Type::BYTE_ARRAY,
+	                duckdb::LogicalType::VARCHAR),
+	new ParquetType(duckdb_parquet::ConvertedType::ENUM, duckdb_parquet::Type::BYTE_ARRAY,
+	                duckdb::LogicalType::VARCHAR),
+	new ParquetType(duckdb_parquet::ConvertedType::UTF8, duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY,
+	                duckdb::LogicalType::VARCHAR),
+	new ParquetType(duckdb_parquet::ConvertedType::ENUM, duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY,
+	                duckdb::LogicalType::VARCHAR),
+
+	new JSONParquetType(),
+	new DecimalParquetType(),
+
+	new ParquetType(-1, duckdb_parquet::Type::BOOLEAN, duckdb::LogicalType::BOOLEAN),
+	new ParquetType(-1, duckdb_parquet::Type::BOOLEAN, duckdb::LogicalType::BOOLEAN),
+	new ParquetType(-1, duckdb_parquet::Type::INT32, duckdb::LogicalType::INTEGER),
+	new ParquetType(-1, duckdb_parquet::Type::INT64, duckdb::LogicalType::BIGINT),
+	new ParquetType(-1, duckdb_parquet::Type::INT96, duckdb::LogicalType::TIMESTAMP),
+	new ParquetType(-1, duckdb_parquet::Type::FLOAT, duckdb::LogicalType::FLOAT),
+	new ParquetType(-1, duckdb_parquet::Type::DOUBLE, duckdb::LogicalType::DOUBLE),
+	new ParquetType(-1, duckdb_parquet::Type::BYTE_ARRAY, duckdb::LogicalType::BLOB),
+	new ParquetType(-1, duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY, duckdb::LogicalType::BLOB),
+};
+
+ParquetTypesManager::ParquetTypesManager() {
+};
+
+ParquetTypesManager *ParquetTypesManager::instance = nullptr;
+std::mutex ParquetTypesManager::instance_mutex;
+
+ParquetTypesManager *ParquetTypesManager::get_instance() {
+	std::lock_guard<std::mutex> lock(instance_mutex);
+	if (instance == nullptr) {
+		instance = new ParquetTypesManager();
+	}
+	return instance;
+}
+
+duckdb::LogicalType ParquetTypesManager::get_logical_type(const duckdb::vector<duckdb_parquet::SchemaElement> &schema,
+                                                          idx_t idx) {
+	for (auto &type : _types) {
+		if (type->check_type(schema, idx)) {
+			return type->get_logical_type(schema[idx]);
+		}
+	}
+	throw std::runtime_error("Unsupported Parquet type");
+}
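get_logical_type walks _types in declaration order and takes the first matcher that accepts the schema element, so registration order is load-bearing: the logicalType rules come first, then the converted-type rules and the JSON/DECIMAL specials, and only then the bare physical-type fallbacks; otherwise an INT32 annotated as INT_16 would resolve to INTEGER instead of SMALLINT. A self-contained model of that first-match dispatch, with hypothetical rule and type names rather than the extension's:

    #include <functional>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    // Toy schema element: a physical width plus an optional converted-type tag.
    struct Element {
    	int physical;
    	std::optional<int> converted;
    };

    struct Rule {
    	std::function<bool(const Element &)> matches;
    	std::string logical;
    };

    int main() {
    	// Specific rules must precede generic fallbacks, as in _types above.
    	std::vector<Rule> rules = {
    	    {[](const Element &e) { return e.physical == 32 && e.converted == 16; }, "SMALLINT"},
    	    {[](const Element &e) { return e.physical == 32; }, "INTEGER"},
    	};
    	for (const Element &e : {Element {32, 16}, Element {32, std::nullopt}}) {
    		for (auto &r : rules) {
    			if (r.matches(e)) {
    				std::cout << r.logical << '\n'; // SMALLINT, then INTEGER
    				break;
    			}
    		}
    	}
    }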
diff --git a/chsql/test/sql/chsql.test b/chsql/test/sql/chsql.test
index 84c2527..2212e6a 100644
--- a/chsql/test/sql/chsql.test
+++ b/chsql/test/sql/chsql.test
@@ -13,17 +13,6 @@ require chsql
 
 require parquet
 
-# Confirm the extension works
-query I
-SELECT chsql('Works');
-----
-ChSql Works 🐥
-
-query I
-SELECT chsql_openssl_version('Hello');
-----
-:.*ChSql Hello, my linked OpenSSL version is OpenSSL.*
-
 query I
 SELECT toString('123')
 ----
diff --git a/chsql/vcpkg.json b/chsql/vcpkg.json
index 85936bf..0b8fa67 100644
--- a/chsql/vcpkg.json
+++ b/chsql/vcpkg.json
@@ -1,5 +1,3 @@
 {
-  "dependencies": [
-    "openssl"
-  ]
+  "dependencies": []
 }
\ No newline at end of file
diff --git a/duckdb b/duckdb
index 1a3d614..0b83e5d 160000
--- a/duckdb
+++ b/duckdb
@@ -1 +1 @@
-Subproject commit 1a3d614f0eec5a2198af8ba4ea06eb9adee9d5f8
+Subproject commit 0b83e5d2f68bc02dfefde74b846bd039f078affa
diff --git a/extension-ci-tools b/extension-ci-tools
index 916d4ef..90757de 160000
--- a/extension-ci-tools
+++ b/extension-ci-tools
@@ -1 +1 @@
-Subproject commit 916d4ef4371068ca98a007378b52582c3e46b4e5
+Subproject commit 90757de3f06c6802cd49732849b9e46eef75761f