From b98b2d140c668674da7a68754715ac42909a434a Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani
Date: Sun, 2 Feb 2025 19:12:05 +0100
Subject: [PATCH 01/20] Submodule Action

---
 .github/workflows/main.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 .github/workflows/main.yml

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..ca94abb
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,24 @@
+name: Update Submodules
+on:
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+jobs:
+  update:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Update module
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          git submodule update --remote --merge
+          git config --global user.name "GitHub Action"
+          git config --global user.email "noreply@github.com"
+          git commit -am "update submodules"
+          # git push
+

From cb273d8133ec761484950e5d35e97c8384079a09 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani
Date: Sun, 2 Feb 2025 19:13:07 +0100
Subject: [PATCH 02/20] Update main.yml

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index ca94abb..e53841a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -16,7 +16,7 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          git submodule update --remote --merge
+          git submodule update --init --recursive
           git config --global user.name "GitHub Action"
           git config --global user.email "noreply@github.com"
           git commit -am "update submodules"

From 4f927abcd081735e481c339b1011070b10e8e497 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani
Date: Mon, 3 Feb 2025 22:26:52 +0100
Subject: [PATCH 03/20] Update MainDistributionPipeline.yml

---
 .github/workflows/MainDistributionPipeline.yml | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml
index 060b7ea..0cf5d44 100644
--- a/.github/workflows/MainDistributionPipeline.yml
+++ b/.github/workflows/MainDistributionPipeline.yml
@@ -25,19 +25,9 @@ jobs:
   duckdb-stable-build:
     name: Build extension binaries
-    uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.1.1
+    uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.1.3
     with:
-      duckdb_version: v1.1.1
-      ci_tools_version: v1.1.1
+      duckdb_version: v1.1.3
+      ci_tools_version: v1.1.3
       extension_name: chsql
-
-  duckdb-stable-deploy:
-    name: Deploy extension binaries
-    needs: duckdb-stable-build
-    if: false
-    uses: ./.github/workflows/_extension_deploy.yml
-    secrets: inherit
-    with:
-      duckdb_version: v1.1.1
-      extension_name: chsql
-      deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
+
From f20ef1352a2ec321a42a75e9f0944eba105bac5e Mon Sep 17 00:00:00 2001
From: akvlad
Date: Mon, 10 Mar 2025 18:17:15 +0200
Subject: [PATCH 04/20] WIP/init

---
 chsql/CMakeLists.txt                    |   2 +-
 chsql/src/include/chsql_parquet_types.h |  34 ++++
 chsql/src/parquet_ordered_scan.cpp      |  32 +---
 chsql/src/parquet_types.cpp             | 242 ++++++++++++++++++++++++
 4 files changed, 283 insertions(+), 27 deletions(-)
 create mode 100644 chsql/src/include/chsql_parquet_types.h
 create mode 100644 chsql/src/parquet_types.cpp

diff --git a/chsql/CMakeLists.txt b/chsql/CMakeLists.txt
index 5460e12..93ce21b 100644
--- a/chsql/CMakeLists.txt
+++ b/chsql/CMakeLists.txt
@@ -21,7 +21,7 @@ include_directories(
 	../duckdb/third_party/mbedtls
 	../duckdb/third_party/mbedtls/include
 	../duckdb/third_party/brotli/include)
-set(EXTENSION_SOURCES src/chsql_extension.cpp src/duck_flock.cpp src/chsql_system.cpp)
+set(EXTENSION_SOURCES src/chsql_extension.cpp src/duck_flock.cpp src/chsql_system.cpp src/parquet_types.cpp)
 build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
 build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})
 # Link OpenSSL in both the static library as the loadable extension
diff --git a/chsql/src/include/chsql_parquet_types.h b/chsql/src/include/chsql_parquet_types.h
new file mode 100644
index 0000000..179a9f0
--- /dev/null
+++ b/chsql/src/include/chsql_parquet_types.h
@@ -0,0 +1,34 @@
+//
+// Created by hromozeka on 10.03.25.
+//
+
+#ifndef PARQUET_TYPES_H
+#define PARQUET_TYPES_H
+
+#include "duckdb.hpp"
+#include
+
+struct ParquetType {
+	/*duckdb_parquet::ConvertedType::type -> replaced to int to support -1 nodata value*/
+	int converted_type;
+	duckdb_parquet::Type::type parquet_type;
+	const duckdb::LogicalType &logical_type;
+	ParquetType(int converted_type, duckdb_parquet::Type::type parquet_type,
+		const duckdb::LogicalType &logical_type)
+		: converted_type(converted_type), parquet_type(parquet_type), logical_type(logical_type) {}
+	bool check_type(const duckdb::vector &schema, idx_t idx);
+};
+
+class ParquetTypesManager {
+	protected:
+	std::vector types;
+	static ParquetTypesManager *instance;
+	static std::mutex instance_mutex;
+	ParquetTypesManager();
+	static ParquetTypesManager* get_instance();
+	duckdb::LogicalType derive_logical_type(const duckdb_parquet::SchemaElement &s_ele, bool binary_as_string);
+	public:
+	static duckdb::LogicalType get_logical_type(const duckdb::vector &schema, idx_t idx);
+};
+
+#endif //PARQUET_TYPES_H
diff --git a/chsql/src/parquet_ordered_scan.cpp b/chsql/src/parquet_ordered_scan.cpp
index 88585c4..f1a4e11 100644
--- a/chsql/src/parquet_ordered_scan.cpp
+++ b/chsql/src/parquet_ordered_scan.cpp
@@ -3,6 +3,7 @@
 #include
 #include "chsql_extension.hpp"
 #include
+#include "chsql_parquet_types.h"
 
 namespace duckdb {
 
@@ -113,36 +114,15 @@ namespace duckdb {
 			po.binary_as_string = true;
 			ParquetReader reader(context, file, po, nullptr);
 			set->columnMap = vector();
-			for (auto &el : reader.metadata->metadata->schema) {
+			for (auto it = reader.metadata->metadata->schema.begin();
+				it!= reader.metadata->metadata->schema.end(); ++it) {
+				auto &el = *it;
 				if (el.num_children != 0) {
 					continue;
 				}
 				auto name_it = std::find(names.begin(), names.end(), el.name);
-				auto return_type = LogicalType::ANY;
-				switch (el.type) {
-				case Type::INT32:
-					return_type = LogicalType::INTEGER;
-					break;
-				case Type::INT64:
-					return_type = LogicalType::BIGINT;
-					break;
-				case Type::DOUBLE:
-					return_type = LogicalType::DOUBLE;
-					break;
-				case Type::FLOAT:
-					return_type = LogicalType::FLOAT;
-					break;
-				case Type::BYTE_ARRAY:
-					return_type = LogicalType::VARCHAR;
-				case Type::FIXED_LEN_BYTE_ARRAY:
-					return_type = LogicalType::VARCHAR;
-					break;
-				case Type::BOOLEAN:
-					return_type = LogicalType::TINYINT;
-					break;
-				default:
-					break;;
-				}
+				auto return_type = ParquetTypesManager::get_logical_type(reader.metadata->metadata->schema,
+					it - reader.metadata->metadata->schema.begin());
 				set->columnMap.push_back(name_it - names.begin());
 				if (el.name == res->orderBy) {
 					set->orderByIdx = name_it - names.begin();
diff --git a/chsql/src/parquet_types.cpp b/chsql/src/parquet_types.cpp
new file mode 100644
index 0000000..6464adf
--- /dev/null
+++ b/chsql/src/parquet_types.cpp
@@ -0,0 +1,242 @@
+#include "chsql_parquet_types.h"
+
+bool ParquetType::check_type(const duckdb::vector &schema, idx_t idx) {
+	auto &el = schema[idx];
+	if (el.type != parquet_type) {
+		return false;
+	}
+	if (converted_type == -1) {
+		return !el.__isset.converted_type;
+	}
+	return int(el.converted_type) == converted_type;
+};
+
+ParquetTypesManager::ParquetTypesManager() {
+	/*UTF8 = 0,
+	MAP = 1, -> no support
+	MAP_KEY_VALUE = 2, -> no support
+	LIST = 3, -> no support
+	ENUM = 4, -> no support
+	DECIMAL = 5, -> no support
+	DATE = 6,
+	TIME_MILLIS = 7
+	TIME_MICROS = 8
+	TIMESTAMP_MILLIS = 9,
+	TIMESTAMP_MICROS = 10,
+	UINT_8 = 11,
+	UINT_16 = 12,
+	UINT_32 = 13,
+	UINT_64 = 14,
+	INT_8 = 15,
+	INT_16 = 16,
+	INT_32 = 17,
+	INT_64 = 18,
+	JSON = 19, -> no support
+	BSON = 20, -> no support
+	INTERVAL = 21 -> no support
+	*/
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UTF8, duckdb_parquet::Type::BYTE_ARRAY, duckdb::LogicalType::VARCHAR));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::DATE, duckdb_parquet::Type::INT32, duckdb::LogicalType::DATE));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::TIME_MILLIS, duckdb_parquet::Type::INT32, duckdb::LogicalType::TIME));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::TIME_MICROS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIME));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::TIMESTAMP_MILLIS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIMESTAMP));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::TIMESTAMP_MICROS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIMESTAMP));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UINT_8, duckdb_parquet::Type::INT32, duckdb::LogicalType::UTINYINT));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UINT_16, duckdb_parquet::Type::INT32, duckdb::LogicalType::USMALLINT));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UINT_32, duckdb_parquet::Type::INT32, duckdb::LogicalType::UINTEGER));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UINT_64, duckdb_parquet::Type::INT64, duckdb::LogicalType::UBIGINT));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::INT_8, duckdb_parquet::Type::INT32, duckdb::LogicalType::TINYINT));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::INT_16, duckdb_parquet::Type::INT32, duckdb::LogicalType::SMALLINT));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::INT_32, duckdb_parquet::Type::INT32, duckdb::LogicalType::INTEGER));
+	types.push_back(ParquetType(duckdb_parquet::ConvertedType::INT_64, duckdb_parquet::Type::INT64, duckdb::LogicalType::BIGINT));
+};
+
+ParquetTypesManager *ParquetTypesManager::instance = nullptr;
+std::mutex ParquetTypesManager::instance_mutex;
+
+ParquetTypesManager *ParquetTypesManager::get_instance() {
+	std::lock_guard lock(instance_mutex);
+	if (instance == nullptr) {
+		instance = new ParquetTypesManager();
+	}
+	return instance;
+}
+
+duckdb::LogicalType ParquetTypesManager::get_logical_type(const duckdb::vector &schema, idx_t idx) {
+	auto inst = get_instance();
+	return inst->derive_logical_type(schema[idx], false);
+	/*for (auto it = inst->types.begin(); it != inst->types.end(); ++it) {
+		if (it->check_type(schema, idx)) {
+			return it->logical_type;
+		}
+	}
+	throw std::runtime_error("Unsupported Parquet type");*/
+}
+
+duckdb::LogicalType ParquetTypesManager::derive_logical_type(const duckdb_parquet::SchemaElement &s_ele, bool binary_as_string) {
+	// inner node
+	if (s_ele.type == duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY && !s_ele.__isset.type_length) {
+		throw duckdb::IOException("FIXED_LEN_BYTE_ARRAY requires length to be set");
+	}
+	if (s_ele.__isset.logicalType) {
+		if (s_ele.logicalType.__isset.UUID) {
+			if (s_ele.type == duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY) {
+				return duckdb::LogicalType::UUID;
+			}
+		} else if (s_ele.logicalType.__isset.TIMESTAMP) {
+			if (s_ele.logicalType.TIMESTAMP.isAdjustedToUTC) {
+				return duckdb::LogicalType::TIMESTAMP_TZ;
+			} else if (s_ele.logicalType.TIMESTAMP.unit.__isset.NANOS) {
+				return duckdb::LogicalType::TIMESTAMP_NS;
+			}
+			return duckdb::LogicalType::TIMESTAMP;
+		} else if (s_ele.logicalType.__isset.TIME) {
+			if (s_ele.logicalType.TIME.isAdjustedToUTC) {
+				return duckdb::LogicalType::TIME_TZ;
+			}
+			return duckdb::LogicalType::TIME;
+		}
+	}
+	if (s_ele.__isset.converted_type) {
+		// Legacy NULL type, does no longer exist, but files are still around of course
+		if (static_cast(s_ele.converted_type) == 24) {
+			return duckdb::LogicalTypeId::SQLNULL;
+		}
+		switch (s_ele.converted_type) {
+		case duckdb_parquet::ConvertedType::INT_8:
+			if (s_ele.type == duckdb_parquet::Type::INT32) {
+				return duckdb::LogicalType::TINYINT;
+			} else {
+				throw duckdb::IOException("INT8 converted type can only be set for value of Type::INT32");
+			}
+		case duckdb_parquet::ConvertedType::INT_16:
+			if (s_ele.type == duckdb_parquet::Type::INT32) {
+				return duckdb::LogicalType::SMALLINT;
+			} else {
+				throw duckdb::IOException("INT16 converted type can only be set for value of Type::INT32");
+			}
+		case duckdb_parquet::ConvertedType::INT_32:
+			if (s_ele.type == duckdb_parquet::Type::INT32) {
+				return duckdb::LogicalType::INTEGER;
+			} else {
+				throw duckdb::IOException("INT32 converted type can only be set for value of Type::INT32");
+			}
+		case duckdb_parquet::ConvertedType::INT_64:
+			if (s_ele.type == duckdb_parquet::Type::INT64) {
+				return duckdb::LogicalType::BIGINT;
+			} else {
+				throw duckdb::IOException("INT64 converted type can only be set for value of Type::INT32");
+			}
+		case duckdb_parquet::ConvertedType::UINT_8:
+			if (s_ele.type == duckdb_parquet::Type::INT32) {
+				return duckdb::LogicalType::UTINYINT;
+			} else {
+				throw duckdb::IOException("UINT8 converted type can only be set for value of Type::INT32");
+			}
+		case duckdb_parquet::ConvertedType::UINT_16:
+			if (s_ele.type == duckdb_parquet::Type::INT32) {
+				return duckdb::LogicalType::USMALLINT;
+			} else {
+				throw duckdb::IOException("UINT16 converted type can only be set for value of Type::INT32");
+			}
+		case duckdb_parquet::ConvertedType::UINT_32:
+			if (s_ele.type == duckdb_parquet::Type::INT32) {
+				return duckdb::LogicalType::UINTEGER;
+			} else {
+				throw duckdb::IOException("UINT32 converted type can only be set for value of Type::INT32");
+			}
+		case duckdb_parquet::ConvertedType::UINT_64:
+			if (s_ele.type == duckdb_parquet::Type::INT64) {
+				return duckdb::LogicalType::UBIGINT;
+			} else {
+				throw duckdb::IOException("UINT64 converted type can only be set for value of Type::INT64");
+			}
+		case duckdb_parquet::ConvertedType::DATE:
+			if (s_ele.type == duckdb_parquet::Type::INT32) {
+				return duckdb::LogicalType::DATE;
+			} else {
+				throw duckdb::IOException("DATE converted type can only be set for value of Type::INT32");
+			}
+		case duckdb_parquet::ConvertedType::TIMESTAMP_MICROS:
+		case duckdb_parquet::ConvertedType::TIMESTAMP_MILLIS:
+			if (s_ele.type == duckdb_parquet::Type::INT64) {
+				return duckdb::LogicalType::TIMESTAMP;
+			} else {
+				throw duckdb::IOException("TIMESTAMP converted type can only be set for value of Type::INT64");
+			}
+		case duckdb_parquet::ConvertedType::DECIMAL:
+			if (!s_ele.__isset.precision || !s_ele.__isset.scale) {
+				throw duckdb::IOException("DECIMAL requires a length and scale specifier!");
+			}
+			if (s_ele.precision > duckdb::DecimalType::MaxWidth()) {
+				return duckdb::LogicalType::DOUBLE;
+			}
+			switch (s_ele.type) {
+			case duckdb_parquet::Type::BYTE_ARRAY:
+			case duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY:
+			case duckdb_parquet::Type::INT32:
+			case duckdb_parquet::Type::INT64:
+				return duckdb::LogicalType::DECIMAL(s_ele.precision, s_ele.scale);
+			default:
+				throw duckdb::IOException(
+					"DECIMAL converted type can only be set for value of Type::(FIXED_LEN_)BYTE_ARRAY/INT32/INT64");
+			}
+		case duckdb_parquet::ConvertedType::UTF8:
+		case duckdb_parquet::ConvertedType::ENUM:
+			switch (s_ele.type) {
+			case duckdb_parquet::Type::BYTE_ARRAY:
+			case duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY:
+				return duckdb::LogicalType::VARCHAR;
+			default:
+				throw duckdb::IOException("UTF8 converted type can only be set for Type::(FIXED_LEN_)BYTE_ARRAY");
+			}
+		case duckdb_parquet::ConvertedType::TIME_MILLIS:
+			if (s_ele.type == duckdb_parquet::Type::INT32) {
+				return duckdb::LogicalType::TIME;
+			} else {
+				throw duckdb::IOException("TIME_MILLIS converted type can only be set for value of Type::INT32");
+			}
+		case duckdb_parquet::ConvertedType::TIME_MICROS:
+			if (s_ele.type == duckdb_parquet::Type::INT64) {
+				return duckdb::LogicalType::TIME;
+			} else {
+				throw duckdb::IOException("TIME_MICROS converted type can only be set for value of Type::INT64");
+			}
+		case duckdb_parquet::ConvertedType::INTERVAL:
+			return duckdb::LogicalType::INTERVAL;
+		case duckdb_parquet::ConvertedType::JSON:
+			return duckdb::LogicalType::JSON();
+		case duckdb_parquet::ConvertedType::MAP:
+		case duckdb_parquet::ConvertedType::MAP_KEY_VALUE:
+		case duckdb_parquet::ConvertedType::LIST:
+		case duckdb_parquet::ConvertedType::BSON:
+		default:
+			throw duckdb::IOException("Unsupported converted type (%d)", (int32_t)s_ele.converted_type);
+		}
+	} else {
+		// no converted type set
+		// use default type for each physical type
+		switch (s_ele.type) {
+		case duckdb_parquet::Type::BOOLEAN:
+			return duckdb::LogicalType::BOOLEAN;
+		case duckdb_parquet::Type::INT32:
+			return duckdb::LogicalType::INTEGER;
+		case duckdb_parquet::Type::INT64:
+			return duckdb::LogicalType::BIGINT;
+		case duckdb_parquet::Type::INT96: // always a timestamp it would seem
+			return duckdb::LogicalType::TIMESTAMP;
+		case duckdb_parquet::Type::FLOAT:
+			return duckdb::LogicalType::FLOAT;
+		case duckdb_parquet::Type::DOUBLE:
+			return duckdb::LogicalType::DOUBLE;
+		case duckdb_parquet::Type::BYTE_ARRAY:
+		case duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY:
+			if (binary_as_string) {
+				return duckdb::LogicalType::VARCHAR;
+			}
+			return duckdb::LogicalType::BLOB;
+		default:
+			return duckdb::LogicalType::INVALID;
+		}
+	}
+}
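The mapping introduced in the patch above treats every supported Parquet column as a (converted type, physical type) pair, with -1 standing in for "no converted type set". A minimal, self-contained C++ sketch of that matching rule follows; the struct and names are invented for illustration and are not the extension's actual API:

#include <cstdio>

// Stand-in for the relevant fields of duckdb_parquet::SchemaElement.
struct Column {
	int physical_type;       // e.g. INT32
	bool has_converted_type; // mirrors el.__isset.converted_type
	int converted_type;      // meaningful only when has_converted_type
};

// Sketch of the idea behind ParquetType::check_type: a rule matches when
// the physical type agrees and the converted type is either equal or,
// for the -1 sentinel, absent altogether.
struct TypeRule {
	int converted_type; // -1 => column must carry no converted type
	int physical_type;
	bool Matches(const Column &c) const {
		if (c.physical_type != physical_type) {
			return false;
		}
		if (converted_type == -1) {
			return !c.has_converted_type;
		}
		return c.has_converted_type && c.converted_type == converted_type;
	}
};

int main() {
	const int INT32 = 1, DATE = 6; // illustrative ids only
	TypeRule plain_int32{-1, INT32}, date_rule{DATE, INT32};
	Column raw{INT32, false, 0}, date_col{INT32, true, DATE};
	printf("%d %d %d\n", plain_int32.Matches(raw),
	       date_rule.Matches(date_col), plain_int32.Matches(date_col)); // 1 1 0
}

The first rule only claims a bare INT32 column, so a DATE-annotated INT32 falls through to the dedicated rule; that is what lets a flat table of rules resolve one logical type per column.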
From abe9e4b2b4954c796144237a4595432df5949083 Mon Sep 17 00:00:00 2001
From: akvlad
Date: Fri, 14 Mar 2025 18:11:09 +0200
Subject: [PATCH 05/20] WIP

---
 chsql/src/include/chsql_parquet_types.h |  32 ++-
 chsql/src/parquet_ordered_scan.cpp      | 205 ++++++++------
 chsql/src/parquet_types.cpp             | 347 +++++++++---------------
 3 files changed, 276 insertions(+), 308 deletions(-)

diff --git a/chsql/src/include/chsql_parquet_types.h b/chsql/src/include/chsql_parquet_types.h
index 179a9f0..870d1ba 100644
--- a/chsql/src/include/chsql_parquet_types.h
+++ b/chsql/src/include/chsql_parquet_types.h
@@ -11,17 +11,37 @@
 struct ParquetType {
 	/*duckdb_parquet::ConvertedType::type -> replaced to int to support -1 nodata value*/
 	int converted_type;
-	duckdb_parquet::Type::type parquet_type;
-	const duckdb::LogicalType &logical_type;
-	ParquetType(int converted_type, duckdb_parquet::Type::type parquet_type,
-		const duckdb::LogicalType &logical_type)
+	/* duckdb_parquet::Type::type -> replaced to int to support -1 for no matter value */
+	int parquet_type;
+	const duckdb::LogicalType logical_type;
+	ParquetType(int converted_type, int parquet_type, const duckdb::LogicalType &logical_type)
 		: converted_type(converted_type), parquet_type(parquet_type), logical_type(logical_type) {}
-	bool check_type(const duckdb::vector &schema, idx_t idx);
+	virtual bool check_type(const duckdb::vector &schema, idx_t idx);
+	virtual duckdb::LogicalType get_logical_type(const duckdb_parquet::SchemaElement &schema);
+};
+
+struct LogicalParquetType : public ParquetType {
+	bool (*get_isset)(const duckdb_parquet::SchemaElement& el);
+
+	LogicalParquetType(bool (*get_isset) (const duckdb_parquet::SchemaElement& el),
+		const duckdb::LogicalType& logical_type)
+		: ParquetType(-1, duckdb_parquet::Type::type::INT32, logical_type), get_isset(get_isset) {}
+	bool check_type(const duckdb::vector &schema, idx_t idx) override;
+};
+
+struct JSONParquetType : public ParquetType {
+	JSONParquetType(): ParquetType(duckdb_parquet::ConvertedType::JSON, -1, duckdb::LogicalType::SQLNULL) {}
+	duckdb::LogicalType get_logical_type(const duckdb_parquet::SchemaElement &schema) override;
+};
+
+struct DecimalParquetType : public ParquetType {
+	DecimalParquetType(): ParquetType(-1, duckdb_parquet::Type::type::INT32, duckdb::LogicalType::SQLNULL) {}
+	bool check_type(const duckdb::vector &schema, idx_t idx) override;
+	duckdb::LogicalType get_logical_type(const duckdb_parquet::SchemaElement &schema) override;
 };
 
 class ParquetTypesManager {
 	protected:
-	std::vector types;
 	static ParquetTypesManager *instance;
 	static std::mutex instance_mutex;
 	ParquetTypesManager();
diff --git a/chsql/src/parquet_ordered_scan.cpp b/chsql/src/parquet_ordered_scan.cpp
index f1a4e11..a9715f7 100644
--- a/chsql/src/parquet_ordered_scan.cpp
+++ b/chsql/src/parquet_ordered_scan.cpp
@@ -7,21 +7,70 @@
 namespace duckdb {
 
+	struct ReturnColumn {
+		string name;
+		LogicalType type;
+	};
+
 	struct ReaderSet {
 		unique_ptr reader;
-		idx_t orderByIdx;
+		vector returnColumns;
+		int64_t orderByIdx;
 		unique_ptr chunk;
 		unique_ptr scanState;
-		vector columnMap;
+		vector columnMap;
 		idx_t result_idx;
+		bool haveAbsentColumns;
+		void populateColumnInfo(const vector& returnCols, const string& order_by_column) {
+			this->returnColumns = returnCols;
+			columnMap.clear();
+			haveAbsentColumns = false;
+			for (auto it = returnCols.begin(); it!= returnCols.end(); ++it) {
+				auto schema_column = find_if(
+					reader->metadata->metadata->schema.begin(),
+					reader->metadata->metadata->schema.end(),
+					[&](const SchemaElement& column) { return column.name == it->name; });
+				if (schema_column == reader->metadata->metadata->schema.end()) {
+					columnMap.push_back(-1);
+					haveAbsentColumns = true;
+					continue;
+				}
+				columnMap.push_back(schema_column - reader->metadata->metadata->schema.begin() - 1);
+				reader->reader_data.column_ids.push_back(schema_column - reader->metadata->metadata->schema.begin() - 1);
+				reader->reader_data.column_mapping.push_back(it - returnCols.begin());
+			}
+			auto order_by_column_it = find_if(
+				reader->metadata->metadata->schema.begin(),
+				reader->metadata->metadata->schema.end(),
+				[&](const SchemaElement& column) { return column.name == order_by_column; });
+			if (order_by_column_it == reader->metadata->metadata->schema.end()) {
+				orderByIdx = -1;
+			} else {
+				orderByIdx = order_by_column_it - reader->metadata->metadata->schema.begin() - 1;
+			}
+		}
+		void Scan() {
+			chunk->Reset();
+			reader->Scan(*scanState, *chunk);
+			if (!haveAbsentColumns || chunk->size() == 0) {
+				return;
+			}
+			for (auto it = columnMap.begin(); it!=columnMap.end(); ++it) {
+				if (*it != -1) {
+					continue;
+				}
+				chunk->data[it - columnMap.begin()].Initialize(false, chunk->size());
+				for (idx_t j = 0; j < chunk->size(); j++) {
+					chunk->data[it - columnMap.begin()].SetValue(j, Value());
+				}
+			}
+		}
 	};
 
 	struct OrderedReadFunctionData : FunctionData {
 		string orderBy;
-		vector> sets;
 		vector files;
-		vector returnTypes;
-		vector names;
+		vector returnCols;
 		unique_ptr Copy() const override {
 			throw std::runtime_error("not implemented");
 		}
@@ -45,9 +94,15 @@
 		};
 	};
 
+	bool lt(const Value &left, const Value &right) {
+		return left.IsNull() || (!right.IsNull() && left < right);
+	}
+	bool le(const Value &left, const Value &right) {
+		return left.IsNull() || (!right.IsNull() && left <= right);
+	}
 
-	struct OrderedReadLocalState: LocalTableFunctionState {
+	struct OrderedReadLocalState: LocalTableFunctionState {
 		vector> sets;
 		vector winner_group;
 		void RecalculateWinnerGroup() {
@@ -56,11 +111,17 @@
 				return;
 			}
 			idx_t winner_idx = 0;
+			auto first_unordered = std::find_if(sets.begin(), sets.end(),
+				[&](const unique_ptr &s) { return s->orderByIdx == -1; });
+			if (first_unordered != sets.end()) {
+				winner_group.push_back(first_unordered - sets.begin());
+				return;
+			}
 			for (idx_t i = 1; i < sets.size(); i++) {
 				const auto &s = sets[i];
 				const auto &w = sets[winner_idx];
-				if (s->chunk->GetValue(s->orderByIdx, s->result_idx) <
-					w->chunk->GetValue(w->orderByIdx, w->result_idx)) {
+				if (lt(s->chunk->GetValue(s->orderByIdx, s->result_idx),
+					w->chunk->GetValue(w->orderByIdx, w->result_idx))) {
 					winner_idx = i;
 				}
 			}
@@ -71,7 +132,7 @@
 				if (i == winner_idx) continue;
 				auto &s = sets[i];
 				const auto &sFirst = s->chunk->GetValue(s->orderByIdx, s->result_idx);
-				if (sFirst <= wLast) {
+				if (le(sFirst, wLast)) {
 					winner_group.push_back(i);
 				}
@@ -84,6 +145,40 @@
 		}
 	};
 
+	static vector GetColumnsFromParquetSchemas(const vector>& sets) {
+		vector result;
+		for (auto &set : sets) {
+			const auto &schema = set->reader->metadata->metadata->schema;
+			for (auto it = schema.begin(); it != schema.end(); ++it) {
+				if (it->num_children > 0) {
+					continue;
+				}
+				auto type = ParquetTypesManager::get_logical_type(schema, it - schema.begin());
+				auto existing_col = std::find_if(result.begin(), result.end(),
+					[it](const ReturnColumn &c) { return c.name == it->name; });
+				if (existing_col == result.end()) {
+					result.push_back(ReturnColumn{it->name, type});
+					continue;
+				}
+				if (existing_col->type != type) {
+					throw std::runtime_error("the files have incompatible schema");
+				}
+			}
+		}
+		return result;
+	}
+
+
+	static void OpenParquetFiles(ClientContext &context, const vector& fileNames,
+		vector>& res) {
+		for (auto & file : fileNames) {
+			auto set = make_uniq();
+			ParquetOptions po;
+			po.binary_as_string = true;
+			set->reader = make_uniq(context, file, po, nullptr);
+			res.push_back(move(set));
+		}
+	}
 
 	static unique_ptr OrderedParquetScanBind(ClientContext &context, TableFunctionBindInput &input,
 		vector &return_types, vector &names) {
@@ -98,48 +193,23 @@
 		string filename;
 		MultiFileListScanData it;
 		fileList.InitializeScan(it);
-		vector unglobbedFileList;
 		while (fileList.Scan(it, filename)) {
-			unglobbedFileList.push_back(filename);
+			res->files.push_back(filename);
 		}
-		if (unglobbedFileList.empty()) {
-			throw duckdb::InvalidInputException("No files matched the provided pattern.");
+		if (res->files.empty()) {
+			throw InvalidInputException("No files matched the provided pattern.");
 		}
+		vector> sets;
+		OpenParquetFiles(context, res->files, sets);
+
+		res->returnCols = GetColumnsFromParquetSchemas(sets);
+		std::transform(res->returnCols.begin(), res->returnCols.end(), std::back_inserter(names),
+			[](const ReturnColumn &c) { return c.name; });
+		std::transform(res->returnCols.begin(), res->returnCols.end(), std::back_inserter(return_types),
+			[](const ReturnColumn &c) { return c.type; });
+		res->orderBy = input.inputs[1].GetValue();
-		for (auto & file : unglobbedFileList) {
-			auto set = make_uniq();
-			res->files.push_back(file);
-			ParquetOptions po;
-			po.binary_as_string = true;
-			ParquetReader reader(context, file, po, nullptr);
-			set->columnMap = vector();
-			for (auto it = reader.metadata->metadata->schema.begin();
-				it!= reader.metadata->metadata->schema.end(); ++it) {
-				auto &el = *it;
-				if (el.num_children != 0) {
-					continue;
-				}
-				auto name_it = std::find(names.begin(), names.end(), el.name);
-				auto return_type = ParquetTypesManager::get_logical_type(reader.metadata->metadata->schema,
-					it - reader.metadata->metadata->schema.begin());
-				set->columnMap.push_back(name_it - names.begin());
-				if (el.name == res->orderBy) {
-					set->orderByIdx = name_it - names.begin();
-				}
-				if (name_it != names.end()) {
-					if (return_types[name_it - names.begin()] != return_type) {
-						throw std::runtime_error("incompatible schema");
-					}
-					continue;
-				}
-				return_types.push_back(return_type);
-				names.push_back(el.name);
-			}
-			res->sets.push_back(std::move(set));
-		}
-		res->returnTypes = return_types;
-		res->names = names;
 		return std::move(res);
 	}
 
@@ -147,38 +217,23 @@
 	static unique_ptr ParquetScanInitLocal(ExecutionContext &context, TableFunctionInitInput &input,
 		GlobalTableFunctionState *gstate_p) {
 		auto res = make_uniq();
 		const auto &bindData = input.bind_data->Cast();
-		ParquetOptions po;
-		po.binary_as_string = true;
-		for (int i = 0; i < bindData.files.size(); i++) {
-			auto set = make_uniq();
-			set->reader = make_uniq(context.client, bindData.files[i], po, nullptr);
+		OpenParquetFiles(context.client, bindData.files, res->sets);
+
+		for (auto &set : res->sets) {
+			set->populateColumnInfo(bindData.returnCols, bindData.orderBy);
 			set->scanState = make_uniq();
-			int j = 0;
-			for (auto &el : set->reader->metadata->metadata->schema) {
-				if (el.num_children != 0) {
-					continue;
-				}
-				set->reader->reader_data.column_ids.push_back(j);
-				j++;
-			}
-			set->columnMap = bindData.sets[i]->columnMap;
-			set->reader->reader_data.column_mapping = set->columnMap;
 			vector rgs(set->reader->metadata->metadata->row_groups.size(), 0);
 			for (idx_t i = 0; i < rgs.size(); i++) {
 				rgs[i] = i;
 			}
 			set->reader->InitializeScan(context.client, *set->scanState, rgs);
 			set->chunk = make_uniq();
-
-			set->orderByIdx = bindData.sets[i]->orderByIdx;
 			set->result_idx = 0;
 			auto ltypes = vector();
-			for (const auto idx : set->columnMap) {
-				ltypes.push_back(bindData.returnTypes[idx]);
-			}
+			std::transform(bindData.returnCols.begin(), bindData.returnCols.end(), std::back_inserter(ltypes),
+				[](const ReturnColumn &c) { return c.type; });
 			set->chunk->Initialize(context.client, ltypes);
-			set->reader->Scan(*set->scanState, *set->chunk);
-			res->sets.push_back(std::move(set));
+			set->Scan();
 		}
 		res->RecalculateWinnerGroup();
 		return std::move(res);
@@ -187,16 +242,13 @@
 	static void ParquetOrderedScanImplementation(
 		ClientContext &context, duckdb::TableFunctionInput &data_p,DataChunk &output) {
 		auto &loc_state = data_p.local_state->Cast();
-		const auto &fieldNames = data_p.bind_data->Cast().names;
-		const auto &returnTypes = data_p.bind_data->Cast().returnTypes;
+		const auto &cols = data_p.bind_data->Cast().returnCols;
 		bool toRecalc = false;
 		for (int i = loc_state.sets.size() - 1; i >= 0 ; i--) {
 			if (loc_state.sets[i]->result_idx >= loc_state.sets[i]->chunk->size()) {
 				auto &set = loc_state.sets[i];
 				set->chunk->Reset();
-				loc_state.sets[i]->reader->Scan(
-					*loc_state.sets[i]->scanState,
-					*loc_state.sets[i]->chunk);
+				loc_state.sets[i]->Scan();
 				loc_state.sets[i]->result_idx = 0;
 
 				if (loc_state.sets[i]->chunk->size() == 0) {
@@ -227,18 +279,17 @@
 			auto winnerSet = &loc_state.sets[loc_state.winner_group[0]];
 			Value winner_val = (*winnerSet)->chunk->GetValue(
 				(*winnerSet)->orderByIdx,
-				(*winnerSet)->result_idx
-			);
+				(*winnerSet)->result_idx);
 			for (int k = 1; k < loc_state.winner_group.size(); k++) {
 				const auto i = loc_state.winner_group[k];
 				const auto &set = loc_state.sets[i];
 				const Value &val = set->chunk->GetValue(set->orderByIdx, set->result_idx);
-				if (val < winner_val) {
+				if (lt(val, winner_val)) {
 					winnerSet = &loc_state.sets[i];
 					winner_val = (*winnerSet)->chunk->GetValue(set->orderByIdx, set->result_idx);
 				}
 			}
-			for (int i = 0; i < fieldNames.size(); i++) {
+			for (int i = 0; i < cols.size(); i++) {
 				const auto &val = (*winnerSet)->chunk->GetValue(i,(*winnerSet)->result_idx);
 				output.SetValue(i, j, val);
 			}
diff --git a/chsql/src/parquet_types.cpp b/chsql/src/parquet_types.cpp
index 6464adf..efdf9ca 100644
--- a/chsql/src/parquet_types.cpp
+++ b/chsql/src/parquet_types.cpp
@@ -1,242 +1,139 @@
 #include "chsql_parquet_types.h"
 
 bool ParquetType::check_type(const duckdb::vector &schema, idx_t idx) {
-	auto &el = schema[idx];
-	if (el.type != parquet_type) {
-		return false;
-	}
-	if (converted_type == -1) {
-		return !el.__isset.converted_type;
-	}
-	return int(el.converted_type) == converted_type;
+	auto &el = schema[idx];
+	if (parquet_type >= 0 && int(el.type) != parquet_type) {
+		return false;
+	}
+	if (converted_type == -1) {
+		return !el.__isset.converted_type;
+	}
+	return int(el.converted_type) == converted_type;
 };
+
+duckdb::LogicalType ParquetType::get_logical_type(const duckdb_parquet::SchemaElement &schema) {
+	return logical_type;
+}
+
+bool LogicalParquetType::check_type(const duckdb::vector &schema, idx_t idx) {
+	auto &el = schema[idx];
+	return el.__isset.logicalType && this->get_isset(el);
+}
+
+duckdb::LogicalType JSONParquetType::get_logical_type(const duckdb_parquet::SchemaElement &schema) {
+	return duckdb::LogicalType::JSON();
+}
+
+bool DecimalParquetType::check_type(const duckdb::vector &schema, idx_t idx) {
+	auto &el = schema[idx];
+	return el.__isset.converted_type && el.converted_type == duckdb_parquet::ConvertedType::DECIMAL &&
+		el.__isset.precision && el.__isset.scale && (el.precision > duckdb::DecimalType::MaxWidth() ||
+		el.type == duckdb_parquet::Type::BYTE_ARRAY ||
+		el.type == duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY ||
+		el.type == duckdb_parquet::Type::INT32 ||
+		el.type == duckdb_parquet::Type::INT64);
+}
+
+duckdb::LogicalType DecimalParquetType::get_logical_type(const duckdb_parquet::SchemaElement &el) {
+	if (el.precision > duckdb::DecimalType::MaxWidth()) {
+		return duckdb::LogicalType::DOUBLE;
+	}
+	return duckdb::LogicalType::DECIMAL(el.precision, el.scale);
+}
+
+bool isUUID(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.UUID;
+}
+
+bool isTimestampTZ(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIMESTAMP && el.logicalType.TIMESTAMP.isAdjustedToUTC;
+}
+
+bool isTimestampNS(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIMESTAMP && el.logicalType.TIMESTAMP.unit.__isset.NANOS;
+}
+
+bool isTimestamp(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIMESTAMP;
+}
+
+bool isTimeTZ(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIME && el.logicalType.TIME.isAdjustedToUTC;
+}
+
+bool isTime(const duckdb_parquet::SchemaElement &el) {
+	return el.logicalType.__isset.TIME;
+}
+
+ParquetType *_types[] = {
+	new LogicalParquetType(isUUID, duckdb::LogicalType::UUID),
+	new LogicalParquetType(isTimestampTZ, duckdb::LogicalType::TIMESTAMP_TZ),
+	new LogicalParquetType(isTimestampNS, duckdb::LogicalType::TIMESTAMP_NS),
+	new LogicalParquetType(isTimestamp, duckdb::LogicalType::TIMESTAMP),
+	new LogicalParquetType(isTimeTZ, duckdb::LogicalType::TIME),
+	new LogicalParquetType(isTime, duckdb::LogicalType::TIME),
+
+	new ParquetType(24, -1, duckdb::LogicalType::SQLNULL),
+	new ParquetType(duckdb_parquet::ConvertedType::INT_8, duckdb_parquet::Type::INT32, duckdb::LogicalType::TINYINT),
+	new ParquetType(duckdb_parquet::ConvertedType::INT_16, duckdb_parquet::Type::INT32, duckdb::LogicalType::SMALLINT),
+	new ParquetType(duckdb_parquet::ConvertedType::INT_32, duckdb_parquet::Type::INT32, duckdb::LogicalType::INTEGER),
+	new ParquetType(duckdb_parquet::ConvertedType::INT_64, duckdb_parquet::Type::INT64, duckdb::LogicalType::BIGINT),
+	new ParquetType(duckdb_parquet::ConvertedType::UINT_8, duckdb_parquet::Type::INT32, duckdb::LogicalType::UTINYINT),
+	new ParquetType(duckdb_parquet::ConvertedType::UINT_16, duckdb_parquet::Type::INT32,
+		duckdb::LogicalType::USMALLINT),
+	new ParquetType(duckdb_parquet::ConvertedType::UINT_32, duckdb_parquet::Type::INT32, duckdb::LogicalType::UINTEGER),
+	new ParquetType(duckdb_parquet::ConvertedType::UINT_64, duckdb_parquet::Type::INT64, duckdb::LogicalType::UBIGINT),
+	new ParquetType(duckdb_parquet::ConvertedType::DATE, duckdb_parquet::Type::INT32, duckdb::LogicalType::DATE),
+	new ParquetType(duckdb_parquet::ConvertedType::TIME_MICROS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIME),
+	new ParquetType(duckdb_parquet::ConvertedType::TIME_MILLIS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIME),
+	new ParquetType(duckdb_parquet::ConvertedType::TIMESTAMP_MILLIS, duckdb_parquet::Type::INT32,
+		duckdb::LogicalType::TIME),
+	new ParquetType(duckdb_parquet::ConvertedType::TIMESTAMP_MICROS, duckdb_parquet::Type::INT64,
+		duckdb::LogicalType::TIME),
+	new ParquetType(duckdb_parquet::ConvertedType::INTERVAL, -1, duckdb::LogicalType::INTERVAL),
+	new ParquetType(duckdb_parquet::ConvertedType::UTF8, duckdb_parquet::Type::BYTE_ARRAY,
+		duckdb::LogicalType::VARCHAR),
+	new ParquetType(duckdb_parquet::ConvertedType::ENUM, duckdb_parquet::Type::BYTE_ARRAY,
+		duckdb::LogicalType::VARCHAR),
+	new ParquetType(duckdb_parquet::ConvertedType::UTF8, duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY,
+		duckdb::LogicalType::VARCHAR),
+	new ParquetType(duckdb_parquet::ConvertedType::ENUM, duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY,
+		duckdb::LogicalType::VARCHAR),
+
+	new JSONParquetType(),
+	new DecimalParquetType(),
+
+	new ParquetType(-1, duckdb_parquet::Type::BOOLEAN, duckdb::LogicalType::BOOLEAN),
+	new ParquetType(-1, duckdb_parquet::Type::BOOLEAN, duckdb::LogicalType::BOOLEAN),
+	new ParquetType(-1, duckdb_parquet::Type::INT32, duckdb::LogicalType::INTEGER),
+	new ParquetType(-1, duckdb_parquet::Type::INT64, duckdb::LogicalType::BIGINT),
+	new ParquetType(-1, duckdb_parquet::Type::INT96, duckdb::LogicalType::TIMESTAMP),
+	new ParquetType(-1, duckdb_parquet::Type::FLOAT, duckdb::LogicalType::FLOAT),
+	new ParquetType(-1, duckdb_parquet::Type::DOUBLE, duckdb::LogicalType::DOUBLE),
+	new ParquetType(-1, duckdb_parquet::Type::BYTE_ARRAY, duckdb::LogicalType::BLOB),
+	new ParquetType(-1, duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY, duckdb::LogicalType::BLOB),
+};
 
 ParquetTypesManager::ParquetTypesManager() {
-	/*UTF8 = 0,
-	MAP = 1, -> no support
-	MAP_KEY_VALUE = 2, -> no support
-	LIST = 3, -> no support
-	ENUM = 4, -> no support
-	DECIMAL = 5, -> no support
-	DATE = 6,
-	TIME_MILLIS = 7
-	TIME_MICROS = 8
-	TIMESTAMP_MILLIS = 9,
-	TIMESTAMP_MICROS = 10,
-	UINT_8 = 11,
-	UINT_16 = 12,
-	UINT_32 = 13,
-	UINT_64 = 14,
-	INT_8 = 15,
-	INT_16 = 16,
-	INT_32 = 17,
-	INT_64 = 18,
-	JSON = 19, -> no support
-	BSON = 20, -> no support
-	INTERVAL = 21 -> no support
-	*/
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UTF8, duckdb_parquet::Type::BYTE_ARRAY, duckdb::LogicalType::VARCHAR));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::DATE, duckdb_parquet::Type::INT32, duckdb::LogicalType::DATE));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::TIME_MILLIS, duckdb_parquet::Type::INT32, duckdb::LogicalType::TIME));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::TIME_MICROS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIME));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::TIMESTAMP_MILLIS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIMESTAMP));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::TIMESTAMP_MICROS, duckdb_parquet::Type::INT64, duckdb::LogicalType::TIMESTAMP));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UINT_8, duckdb_parquet::Type::INT32, duckdb::LogicalType::UTINYINT));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UINT_16, duckdb_parquet::Type::INT32, duckdb::LogicalType::USMALLINT));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UINT_32, duckdb_parquet::Type::INT32, duckdb::LogicalType::UINTEGER));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::UINT_64, duckdb_parquet::Type::INT64, duckdb::LogicalType::UBIGINT));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::INT_8, duckdb_parquet::Type::INT32, duckdb::LogicalType::TINYINT));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::INT_16, duckdb_parquet::Type::INT32, duckdb::LogicalType::SMALLINT));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::INT_32, duckdb_parquet::Type::INT32, duckdb::LogicalType::INTEGER));
-	types.push_back(ParquetType(duckdb_parquet::ConvertedType::INT_64, duckdb_parquet::Type::INT64, duckdb::LogicalType::BIGINT));
 };
 
 ParquetTypesManager *ParquetTypesManager::instance = nullptr;
 std::mutex ParquetTypesManager::instance_mutex;
 
 ParquetTypesManager *ParquetTypesManager::get_instance() {
-	std::lock_guard lock(instance_mutex);
-	if (instance == nullptr) {
-		instance = new ParquetTypesManager();
-	}
-	return instance;
-}
-
-duckdb::LogicalType ParquetTypesManager::get_logical_type(const duckdb::vector &schema, idx_t idx) {
-	auto inst = get_instance();
-	return inst->derive_logical_type(schema[idx], false);
-	/*for (auto it = inst->types.begin(); it != inst->types.end(); ++it) {
-		if (it->check_type(schema, idx)) {
-			return it->logical_type;
-		}
-	}
-	throw std::runtime_error("Unsupported Parquet type");*/
+	std::lock_guard lock(instance_mutex);
+	if (instance == nullptr) {
+		instance = new ParquetTypesManager();
+	}
+	return instance;
 }
 
-duckdb::LogicalType ParquetTypesManager::derive_logical_type(const duckdb_parquet::SchemaElement &s_ele, bool binary_as_string) {
-	// inner node
-	if (s_ele.type == duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY && !s_ele.__isset.type_length) {
-		throw duckdb::IOException("FIXED_LEN_BYTE_ARRAY requires length to be set");
-	}
-	if (s_ele.__isset.logicalType) {
-		if (s_ele.logicalType.__isset.UUID) {
-			if (s_ele.type == duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY) {
-				return duckdb::LogicalType::UUID;
-			}
-		} else if (s_ele.logicalType.__isset.TIMESTAMP) {
-			if (s_ele.logicalType.TIMESTAMP.isAdjustedToUTC) {
-				return duckdb::LogicalType::TIMESTAMP_TZ;
-			} else if (s_ele.logicalType.TIMESTAMP.unit.__isset.NANOS) {
-				return duckdb::LogicalType::TIMESTAMP_NS;
-			}
-			return duckdb::LogicalType::TIMESTAMP;
-		} else if (s_ele.logicalType.__isset.TIME) {
-			if (s_ele.logicalType.TIME.isAdjustedToUTC) {
-				return duckdb::LogicalType::TIME_TZ;
-			}
-			return duckdb::LogicalType::TIME;
-		}
-	}
-	if (s_ele.__isset.converted_type) {
-		// Legacy NULL type, does no longer exist, but files are still around of course
-		if (static_cast(s_ele.converted_type) == 24) {
-			return duckdb::LogicalTypeId::SQLNULL;
-		}
-		switch (s_ele.converted_type) {
-		case duckdb_parquet::ConvertedType::INT_8:
-			if (s_ele.type == duckdb_parquet::Type::INT32) {
-				return duckdb::LogicalType::TINYINT;
-			} else {
-				throw duckdb::IOException("INT8 converted type can only be set for value of Type::INT32");
-			}
-		case duckdb_parquet::ConvertedType::INT_16:
-			if (s_ele.type == duckdb_parquet::Type::INT32) {
-				return duckdb::LogicalType::SMALLINT;
-			} else {
-				throw duckdb::IOException("INT16 converted type can only be set for value of Type::INT32");
-			}
-		case duckdb_parquet::ConvertedType::INT_32:
-			if (s_ele.type == duckdb_parquet::Type::INT32) {
-				return duckdb::LogicalType::INTEGER;
-			} else {
-				throw duckdb::IOException("INT32 converted type can only be set for value of Type::INT32");
-			}
-		case duckdb_parquet::ConvertedType::INT_64:
-			if (s_ele.type == duckdb_parquet::Type::INT64) {
-				return duckdb::LogicalType::BIGINT;
-			} else {
-				throw duckdb::IOException("INT64 converted type can only be set for value of Type::INT32");
-			}
-		case duckdb_parquet::ConvertedType::UINT_8:
-			if (s_ele.type == duckdb_parquet::Type::INT32) {
-				return duckdb::LogicalType::UTINYINT;
-			} else {
-				throw duckdb::IOException("UINT8 converted type can only be set for value of Type::INT32");
-			}
-		case duckdb_parquet::ConvertedType::UINT_16:
-			if (s_ele.type == duckdb_parquet::Type::INT32) {
-				return duckdb::LogicalType::USMALLINT;
-			} else {
-				throw duckdb::IOException("UINT16 converted type can only be set for value of Type::INT32");
-			}
-		case duckdb_parquet::ConvertedType::UINT_32:
-			if (s_ele.type == duckdb_parquet::Type::INT32) {
-				return duckdb::LogicalType::UINTEGER;
-			} else {
-				throw duckdb::IOException("UINT32 converted type can only be set for value of Type::INT32");
-			}
-		case duckdb_parquet::ConvertedType::UINT_64:
-			if (s_ele.type == duckdb_parquet::Type::INT64) {
-				return duckdb::LogicalType::UBIGINT;
-			} else {
-				throw duckdb::IOException("UINT64 converted type can only be set for value of Type::INT64");
-			}
-		case duckdb_parquet::ConvertedType::DATE:
-			if (s_ele.type == duckdb_parquet::Type::INT32) {
-				return duckdb::LogicalType::DATE;
-			} else {
-				throw duckdb::IOException("DATE converted type can only be set for value of Type::INT32");
-			}
-		case duckdb_parquet::ConvertedType::TIMESTAMP_MICROS:
-		case duckdb_parquet::ConvertedType::TIMESTAMP_MILLIS:
-			if (s_ele.type == duckdb_parquet::Type::INT64) {
-				return duckdb::LogicalType::TIMESTAMP;
-			} else {
-				throw duckdb::IOException("TIMESTAMP converted type can only be set for value of Type::INT64");
-			}
-		case duckdb_parquet::ConvertedType::DECIMAL:
-			if (!s_ele.__isset.precision || !s_ele.__isset.scale) {
-				throw duckdb::IOException("DECIMAL requires a length and scale specifier!");
-			}
-			if (s_ele.precision > duckdb::DecimalType::MaxWidth()) {
-				return duckdb::LogicalType::DOUBLE;
-			}
-			switch (s_ele.type) {
-			case duckdb_parquet::Type::BYTE_ARRAY:
-			case duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY:
-			case duckdb_parquet::Type::INT32:
-			case duckdb_parquet::Type::INT64:
-				return duckdb::LogicalType::DECIMAL(s_ele.precision, s_ele.scale);
-			default:
-				throw duckdb::IOException(
-					"DECIMAL converted type can only be set for value of Type::(FIXED_LEN_)BYTE_ARRAY/INT32/INT64");
-			}
-		case duckdb_parquet::ConvertedType::UTF8:
-		case duckdb_parquet::ConvertedType::ENUM:
-			switch (s_ele.type) {
-			case duckdb_parquet::Type::BYTE_ARRAY:
-			case duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY:
-				return duckdb::LogicalType::VARCHAR;
-			default:
-				throw duckdb::IOException("UTF8 converted type can only be set for Type::(FIXED_LEN_)BYTE_ARRAY");
-			}
-		case duckdb_parquet::ConvertedType::TIME_MILLIS:
-			if (s_ele.type == duckdb_parquet::Type::INT32) {
-				return duckdb::LogicalType::TIME;
-			} else {
-				throw duckdb::IOException("TIME_MILLIS converted type can only be set for value of Type::INT32");
-			}
-		case duckdb_parquet::ConvertedType::TIME_MICROS:
-			if (s_ele.type == duckdb_parquet::Type::INT64) {
-				return duckdb::LogicalType::TIME;
-			} else {
-				throw duckdb::IOException("TIME_MICROS converted type can only be set for value of Type::INT64");
-			}
-		case duckdb_parquet::ConvertedType::INTERVAL:
-			return duckdb::LogicalType::INTERVAL;
-		case duckdb_parquet::ConvertedType::JSON:
-			return duckdb::LogicalType::JSON();
-		case duckdb_parquet::ConvertedType::MAP:
-		case duckdb_parquet::ConvertedType::MAP_KEY_VALUE:
-		case duckdb_parquet::ConvertedType::LIST:
-		case duckdb_parquet::ConvertedType::BSON:
-		default:
-			throw duckdb::IOException("Unsupported converted type (%d)", (int32_t)s_ele.converted_type);
-		}
-	} else {
-		// no converted type set
-		// use default type for each physical type
-		switch (s_ele.type) {
-		case duckdb_parquet::Type::BOOLEAN:
-			return duckdb::LogicalType::BOOLEAN;
-		case duckdb_parquet::Type::INT32:
-			return duckdb::LogicalType::INTEGER;
-		case duckdb_parquet::Type::INT64:
-			return duckdb::LogicalType::BIGINT;
-		case duckdb_parquet::Type::INT96: // always a timestamp it would seem
-			return duckdb::LogicalType::TIMESTAMP;
-		case duckdb_parquet::Type::FLOAT:
-			return duckdb::LogicalType::FLOAT;
-		case duckdb_parquet::Type::DOUBLE:
-			return duckdb::LogicalType::DOUBLE;
-		case duckdb_parquet::Type::BYTE_ARRAY:
-		case duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY:
-			if (binary_as_string) {
-				return duckdb::LogicalType::VARCHAR;
-			}
-			return duckdb::LogicalType::BLOB;
-		default:
-			return duckdb::LogicalType::INVALID;
+duckdb::LogicalType ParquetTypesManager::get_logical_type(const duckdb::vector &schema,
+	idx_t idx) {
+	for (auto &type: _types) {
+		if (type->check_type(schema, idx)) {
+			return type->get_logical_type(schema[idx]);
 		}
 	}
+	throw std::runtime_error("Unsupported Parquet type");
 }
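The lt/le helpers introduced in this patch order NULLs before any concrete value, and RecalculateWinnerGroup uses them to pick, among the per-file sorted chunks, the reader whose current row is smallest; that is the winner step of a k-way merge. A simplified, self-contained sketch of the same ordering idea, assuming plain std::optional values in place of duckdb::Value (this is not the extension's code):

#include <cstdio>
#include <optional>
#include <vector>

// NULL-first "less than", as in lt(): an empty optional sorts first.
static bool LtNullsFirst(const std::optional<long> &l, const std::optional<long> &r) {
	return !l.has_value() || (r.has_value() && *l < *r);
}

// Winner step of a k-way merge: index of the run with the smallest head.
static size_t PickWinner(const std::vector<std::vector<std::optional<long>>> &runs,
                         const std::vector<size_t> &pos) {
	size_t winner = 0;
	for (size_t i = 1; i < runs.size(); i++) {
		if (LtNullsFirst(runs[i][pos[i]], runs[winner][pos[winner]])) {
			winner = i;
		}
	}
	return winner;
}

int main() {
	std::vector<std::vector<std::optional<long>>> runs = {
	    {1, 4, 9}, {std::nullopt, 2, 3}, {5, 6, 7}};
	std::vector<size_t> pos(runs.size(), 0);
	while (!runs.empty()) {
		size_t w = PickWinner(runs, pos);
		auto v = runs[w][pos[w]++];
		printf(v ? "%ld " : "NULL ", v.value_or(0));
		if (pos[w] == runs[w].size()) { // run exhausted: drop it
			runs.erase(runs.begin() + w);
			pos.erase(pos.begin() + w);
		}
	}
	printf("\n"); // NULL 1 2 3 4 5 6 7 9
}

The extra winner_group refinement in the patch batches together every reader whose head is still at or below the winner's last buffered row, so several chunks can be drained before the comparison has to run again.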
commit 8e52ec43959ab363643d63cb78ee214577111da4 +Subproject commit 7c0f8574bda9af7aa5b23166d7860d68ae3b9481 diff --git a/extension-ci-tools b/extension-ci-tools index 58970c5..00e6af0 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit 58970c538d35919db875096460c05806056f4de0 +Subproject commit 00e6af068429bf776a54f67cb1cd1ff5370a8dd7 From 2c33917468b282cb3013e61b693218c73cff23d9 Mon Sep 17 00:00:00 2001 From: lmangani <> Date: Mon, 17 Mar 2025 17:48:41 +0000 Subject: [PATCH 09/20] update action --- .github/workflows/MainDistributionPipeline.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 0cf5d44..a6c77d9 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -25,9 +25,9 @@ jobs: duckdb-stable-build: name: Build extension binaries - uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.1.3 + uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.2.1 with: - duckdb_version: v1.1.3 - ci_tools_version: v1.1.3 + duckdb_version: v1.2.1 + ci_tools_version: v1.2.1 extension_name: chsql From ef1779b86f1ad53042345169a2f6c5f4bc01bbd3 Mon Sep 17 00:00:00 2001 From: akvlad Date: Mon, 17 Mar 2025 20:55:28 +0200 Subject: [PATCH 10/20] fix main build --- .github/workflows/MainDistributionPipeline.yml | 15 ++++++++------- chsql/CMakeLists.txt | 1 + chsql/extension_config.cmake | 2 +- chsql/src/include/chsql_parquet_types.h | 1 + chsql/src/parquet_ordered_scan.cpp | 6 +++--- extension-ci-tools | 2 +- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index a6c77d9..5b1fddb 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -15,13 +15,14 @@ concurrency: cancel-in-progress: true jobs: - duckdb-next-build: - name: Build extension binaries (next) - uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main - with: - duckdb_version: main - ci_tools_version: main - extension_name: chsql +# Temporarily disabled because main is broken +# duckdb-next-build: +# name: Build extension binaries (next) +# uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main +# with: +# duckdb_version: 1.1.2 +# ci_tools_version: 1.1.2 +# extension_name: chsql duckdb-stable-build: name: Build extension binaries diff --git a/chsql/CMakeLists.txt b/chsql/CMakeLists.txt index 93ce21b..a3375c5 100644 --- a/chsql/CMakeLists.txt +++ b/chsql/CMakeLists.txt @@ -7,6 +7,7 @@ set(TARGET_NAME chsql) find_package(OpenSSL REQUIRED) set(EXTENSION_NAME ${TARGET_NAME}_extension) set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) +set(CHSQL_DUCKDB_VERSION ${DUCKDB_MAJOR_VERSION}) project(${TARGET_NAME}) include_directories( diff --git a/chsql/extension_config.cmake b/chsql/extension_config.cmake index 776f13b..4ec1a10 100644 --- a/chsql/extension_config.cmake +++ b/chsql/extension_config.cmake @@ -1,5 +1,5 @@ # This file is included by DuckDB's build system. 
It specifies which extension to load - +set(CHSQL_DUCKDB_VERSION ${DUCKDB_MAJOR_VERSION}) include_directories( ./src/include ${CMAKE_CURRENT_SOURCE_DIR}/../duckdb/extension/parquet/include diff --git a/chsql/src/include/chsql_parquet_types.h b/chsql/src/include/chsql_parquet_types.h index 870d1ba..0e44d11 100644 --- a/chsql/src/include/chsql_parquet_types.h +++ b/chsql/src/include/chsql_parquet_types.h @@ -5,6 +5,7 @@ #ifndef PARQUET_TYPES_H #define PARQUET_TYPES_H + #include "duckdb.hpp" #include diff --git a/chsql/src/parquet_ordered_scan.cpp b/chsql/src/parquet_ordered_scan.cpp index 3b720eb..65c3402 100644 --- a/chsql/src/parquet_ordered_scan.cpp +++ b/chsql/src/parquet_ordered_scan.cpp @@ -37,9 +37,9 @@ namespace duckdb { } columnMap.push_back(schema_column - reader->metadata->metadata->schema.begin() - 1); reader->reader_data.column_ids.push_back( - MultiFileLocalColumnId(schema_column - reader->metadata->metadata->schema.begin() - 1)); + schema_column - reader->metadata->metadata->schema.begin() - 1); reader->reader_data.column_mapping.push_back( - MultiFileGlobalIndex(it - returnCols.begin())); + it - returnCols.begin()); } auto order_by_column_it = find_if( reader->metadata->metadata->schema.begin(), @@ -53,7 +53,7 @@ namespace duckdb { } void Scan(ClientContext& ctx) { chunk->Reset(); - reader->Scan(ctx, *scanState, *chunk); + reader->Scan(*scanState, *chunk); if (!haveAbsentColumns || chunk->size() == 0) { return; } diff --git a/extension-ci-tools b/extension-ci-tools index 00e6af0..58970c5 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit 00e6af068429bf776a54f67cb1cd1ff5370a8dd7 +Subproject commit 58970c538d35919db875096460c05806056f4de0 From 3cf2d4462e380fbcf6b8a2cd350f290b270a1085 Mon Sep 17 00:00:00 2001 From: akvlad Date: Wed, 19 Mar 2025 14:10:16 +0200 Subject: [PATCH 11/20] reuse uploaded artifacts attempt --- .github/workflows/MainDistributionPipeline.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 5b1fddb..ba1212d 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -31,4 +31,19 @@ jobs: duckdb_version: v1.2.1 ci_tools_version: v1.2.1 extension_name: chsql - + process-artifacts: + name: Process Extension Artifacts + needs: duckdb-stable-build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: downloaded-artifacts + + - name: List downloaded artifacts + run: | + ls -la downloaded-artifacts/ + find downloaded-artifacts -type f | sort From 7daa9b5318570ee78ce3c71f2fdb38622e032c03 Mon Sep 17 00:00:00 2001 From: akvlad Date: Wed, 19 Mar 2025 19:56:11 +0200 Subject: [PATCH 12/20] move and wrap uploaded artifacts --- .github/workflows/MainDistributionPipeline.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index ba1212d..6670816 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -45,5 +45,11 @@ jobs: - name: List downloaded artifacts run: | - ls -la downloaded-artifacts/ - find downloaded-artifacts -type f | sort + mkdir to-upload; \ + for l in `ls downloaded-artifacts`; do \ + VER=`echo $l | cut -d '-' -f 2`; \ + ARCH=`echo $l| cut -d '-' -f 4`; \ 
+ EXT=`ls downloaded-artifacts/$l | cut -b 7-`; \ + mv downloaded-artifacts/$l/chsql.$EXT to-upload/chsql.$VER.$ARCH.$EXT; \ + done; \ + ls -la to-upload \ No newline at end of file From 462cd51fa55a7d33b67aa8e03dbca676df13bf9d Mon Sep 17 00:00:00 2001 From: akvlad Date: Wed, 19 Mar 2025 21:14:45 +0200 Subject: [PATCH 13/20] add 1.2.0 build --- .../workflows/MainDistributionPipeline.yml | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 6670816..152d29d 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -23,6 +23,14 @@ jobs: # duckdb_version: 1.1.2 # ci_tools_version: 1.1.2 # extension_name: chsql + # We have to build v1.2.0 based due to go-duckdb restrictions + duckdb-1-2-0-build: + name: Build extension binaries + uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.2.1 + with: + duckdb_version: v1.2.0 + ci_tools_version: v1.2.0 + extension_name: chsql duckdb-stable-build: name: Build extension binaries @@ -31,9 +39,11 @@ jobs: duckdb_version: v1.2.1 ci_tools_version: v1.2.1 extension_name: chsql - process-artifacts: + + process-all-artifacts: name: Process Extension Artifacts - needs: duckdb-stable-build + needs: [duckdb-1-2-0-build, duckdb-stable-build] + #if: github.event_name == 'release' && github.event.action == 'published' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -46,10 +56,16 @@ jobs: - name: List downloaded artifacts run: | mkdir to-upload; \ + ls -la downloaded-artifacts; \ for l in `ls downloaded-artifacts`; do \ VER=`echo $l | cut -d '-' -f 2`; \ ARCH=`echo $l| cut -d '-' -f 4`; \ EXT=`ls downloaded-artifacts/$l | cut -b 7-`; \ mv downloaded-artifacts/$l/chsql.$EXT to-upload/chsql.$VER.$ARCH.$EXT; \ done; \ - ls -la to-upload \ No newline at end of file + ls -la to-upload + + #- name: Upload Release Assets + # uses: softprops/action-gh-release@v1 + # with: + # files: to-upload/* \ No newline at end of file From 955125bfcd3fe0e0b8907d6e7f23d5570e9ee3f6 Mon Sep 17 00:00:00 2001 From: akvlad Date: Thu, 20 Mar 2025 08:40:09 +0200 Subject: [PATCH 14/20] upload artifacts on release --- .github/workflows/MainDistributionPipeline.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 152d29d..9c03355 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -9,6 +9,8 @@ on: - "*/**.yml" pull_request: workflow_dispatch: + release: + types: [published] concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} @@ -40,10 +42,10 @@ jobs: ci_tools_version: v1.2.1 extension_name: chsql - process-all-artifacts: + release-all-artifacts: name: Process Extension Artifacts needs: [duckdb-1-2-0-build, duckdb-stable-build] - #if: github.event_name == 'release' && github.event.action == 'published' + if: github.event_name == 'release' && github.event.action == 'published' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -56,6 +58,7 @@ jobs: - name: List downloaded artifacts run: | mkdir to-upload; \ + echo "Artifacts downloaded:" \ ls -la downloaded-artifacts; \ for l in `ls downloaded-artifacts`; do \ VER=`echo $l | cut -d '-' -f 2`; \ @@ 
-63,9 +66,10 @@ jobs: EXT=`ls downloaded-artifacts/$l | cut -b 7-`; \ mv downloaded-artifacts/$l/chsql.$EXT to-upload/chsql.$VER.$ARCH.$EXT; \ done; \
+ echo "Artifacts to be uploaded:"; \
 ls -la to-upload
- #- name: Upload Release Assets
- # uses: softprops/action-gh-release@v1
- # with:
- # files: to-upload/* \ No newline at end of file
+ - name: Upload Release Assets
+ uses: softprops/action-gh-release@v1
+ with:
+ files: to-upload/* \ No newline at end of file
From 9e6f522be4a7426af1bc7cad74d6558860fd3e0a Mon Sep 17 00:00:00 2001
From: akvlad
Date: Thu, 3 Apr 2025 22:56:40 +0300
Subject: [PATCH 15/20] debug order_by field index calculation
--- chsql/src/parquet_ordered_scan.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/chsql/src/parquet_ordered_scan.cpp b/chsql/src/parquet_ordered_scan.cpp
index 65c3402..e41117b 100644
--- a/chsql/src/parquet_ordered_scan.cpp
+++ b/chsql/src/parquet_ordered_scan.cpp
@@ -48,7 +48,9 @@ namespace duckdb { if (order_by_column_it == reader->metadata->metadata->schema.end()) { orderByIdx = -1; } else {
- orderByIdx = order_by_column_it - reader->metadata->metadata->schema.begin() - 1;
+ orderByIdx = find_if(columnMap.begin(), columnMap.end(),
+ [&](const int64_t& i) { return i == (order_by_column_it - reader->metadata->metadata->schema.begin() - 1); }) -
+ columnMap.begin();
 } } void Scan(ClientContext& ctx) {
From b992342a09fad4f45be67cab7972d72a824267b6 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani
Date: Sun, 8 Jun 2025 00:03:41 +0200
Subject: [PATCH 16/20] Fixes duckdb v1.3.0 (local) (#27)
* Fix duckdb v1.3.0 (#26)
--------- Co-authored-by: Votre Nom Co-authored-by: lmangani
--- .../workflows/MainDistributionPipeline.yml | 60 +++---------------- chsql/CMakeLists.txt | 5 +- chsql/extension_config.cmake | 2 +- chsql/src/duck_flock.cpp | 2 +- chsql/src/parquet_ordered_scan.cpp | 27 ++++----- duckdb | 2 +- extension-ci-tools | 2 +- 7 files changed, 26 insertions(+), 74 deletions(-)
diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml
index 9c03355..c130c02 100644
--- a/.github/workflows/MainDistributionPipeline.yml
+++ b/.github/workflows/MainDistributionPipeline.yml
@@ -4,72 +4,26 @@ name: Main Extension Distribution Pipeline on: push:
- paths-ignore:
- - "*/**.md"
- - "*/**.yml"
 pull_request: workflow_dispatch:
- release:
- types: [published]
 concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' && github.sha || '' }}
 cancel-in-progress: true
jobs:
# duckdb-next-build:
-# name: Build extension binaries
+# name: Build extension binaries
# uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
# with:
-# duckdb_version: 1.1.2
-# ci_tools_version: 1.1.2
+# duckdb_version: main
+# ci_tools_version: main
# extension_name: chsql
- # We have to build against v1.2.0 due to go-duckdb restrictions
- duckdb-1-2-0-build:
- name: Build extension binaries
- uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.2.1
- with:
- duckdb_version: v1.2.0
- ci_tools_version: v1.2.0
- extension_name: chsql
 duckdb-stable-build: name: Build extension binaries
- uses: 
duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.2.1
+ uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
 with:
- duckdb_version: v1.2.1
- ci_tools_version: v1.2.1
+ duckdb_version: v1.3.0
+ ci_tools_version: v1.3.0
 extension_name: chsql
-
- release-all-artifacts:
- name: Process Extension Artifacts
- needs: [duckdb-1-2-0-build, duckdb-stable-build]
- if: github.event_name == 'release' && github.event.action == 'published'
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
-
- - name: Download all artifacts
- uses: actions/download-artifact@v4
- with:
- path: downloaded-artifacts
-
- - name: List downloaded artifacts
- run: |
- mkdir to-upload; \
- echo "Artifacts downloaded:"; \
- ls -la downloaded-artifacts; \
- for l in `ls downloaded-artifacts`; do \
- VER=`echo $l | cut -d '-' -f 2`; \
- ARCH=`echo $l| cut -d '-' -f 4`; \
- EXT=`ls downloaded-artifacts/$l | cut -b 7-`; \
- mv downloaded-artifacts/$l/chsql.$EXT to-upload/chsql.$VER.$ARCH.$EXT; \
- done; \
- echo "Artifacts to be uploaded:"; \
- ls -la to-upload
-
- - name: Upload Release Assets
- uses: softprops/action-gh-release@v1
- with:
- files: to-upload/* \ No newline at end of file
diff --git a/chsql/CMakeLists.txt
index a3375c5..919a25a 100644
--- a/chsql/CMakeLists.txt
+++ b/chsql/CMakeLists.txt
@@ -19,15 +19,14 @@ include_directories( ../duckdb/third_party/thrift ../duckdb/third_party/snappy ../duckdb/third_party/zstd/include
- ../duckdb/third_party/mbedtls
 ../duckdb/third_party/mbedtls/include ../duckdb/third_party/brotli/include)
set(EXTENSION_SOURCES src/chsql_extension.cpp src/duck_flock.cpp src/chsql_system.cpp src/parquet_types.cpp)
build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})
# Link OpenSSL in both the static library as the loadable extension
-target_link_libraries(${EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto)
-target_link_libraries(${LOADABLE_EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto)
+target_link_libraries(${EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto duckdb_mbedtls)
+target_link_libraries(${LOADABLE_EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto duckdb_mbedtls)
 install( TARGETS ${EXTENSION_NAME} EXPORT "${DUCKDB_EXPORT_SET}"
diff --git a/chsql/extension_config.cmake b/chsql/extension_config.cmake
index 4ec1a10..7ae21d2 100644
--- a/chsql/extension_config.cmake
+++ b/chsql/extension_config.cmake
@@ -8,7 +8,7 @@ include_directories( ../duckdb/third_party/thrift ../duckdb/third_party/snappy ../duckdb/third_party/zstd/include
- ../duckdb/third_party/mbedtls
+# ../duckdb/third_party/mbedtls
 ../duckdb/third_party/mbedtls/include ../duckdb/third_party/brotli/include)
diff --git a/chsql/src/duck_flock.cpp b/chsql/src/duck_flock.cpp
index 79a821d..b93eaaa 100644
--- a/chsql/src/duck_flock.cpp
+++ b/chsql/src/duck_flock.cpp
@@ -104,7 +104,7 @@ try { if (res->TryFetch(data_chunk, error_data)) {
- if (data_chunk && !data_chunk->size() == 0) {
+ if (data_chunk && data_chunk->size() != 0) {
 output.Append(*data_chunk); return; }
diff --git a/chsql/src/parquet_ordered_scan.cpp b/chsql/src/parquet_ordered_scan.cpp
index e41117b..81dd069 100644
--- a/chsql/src/parquet_ordered_scan.cpp
+++ b/chsql/src/parquet_ordered_scan.cpp
@@ -2,7 +2,7 @@ #include "duckdb/common/exception.hpp" #include #include "chsql_extension.hpp"
-#include
+#include
 #include "chsql_parquet_types.h"
namespace duckdb {
@@ -35,11 +35,10 @@ 
haveAbsentColumns = true; continue; } - columnMap.push_back(schema_column - reader->metadata->metadata->schema.begin() - 1); - reader->reader_data.column_ids.push_back( - schema_column - reader->metadata->metadata->schema.begin() - 1); - reader->reader_data.column_mapping.push_back( - it - returnCols.begin()); + columnMap.push_back(static_cast(schema_column - reader->metadata->metadata->schema.begin() - 1)); + reader->column_ids.push_back( + MultiFileLocalColumnId(static_cast(schema_column - reader->metadata->metadata->schema.begin() - 1))); + reader->column_indexes.emplace_back(static_cast(it - returnCols.begin())); } auto order_by_column_it = find_if( reader->metadata->metadata->schema.begin(), @@ -55,7 +54,7 @@ namespace duckdb { } void Scan(ClientContext& ctx) { chunk->Reset(); - reader->Scan(*scanState, *chunk); + reader->Scan(ctx, *scanState, *chunk); if (!haveAbsentColumns || chunk->size() == 0) { return; } @@ -180,7 +179,7 @@ namespace duckdb { ParquetOptions po; po.binary_as_string = true; set->reader = make_uniq(context, file, po, nullptr); - res.push_back(move(set)); + res.push_back(std::move(set)); } } @@ -189,16 +188,16 @@ namespace duckdb { Connection conn(*context.db); auto res = make_uniq(); auto files = ListValue::GetChildren(input.inputs[0]); - vector fileNames; + vector fileInfoList; for (auto & file : files) { - fileNames.push_back(file.ToString()); + fileInfoList.emplace_back(file.ToString()); } - GlobMultiFileList fileList(context, fileNames, FileGlobOptions::ALLOW_EMPTY); - string filename; + GlobMultiFileList fileList(context, fileInfoList, FileGlobOptions::ALLOW_EMPTY); + OpenFileInfo file_info; MultiFileListScanData it; fileList.InitializeScan(it); - while (fileList.Scan(it, filename)) { - res->files.push_back(filename); + while (fileList.Scan(it, file_info)) { + res->files.push_back(file_info.path); } if (res->files.empty()) { throw InvalidInputException("No files matched the provided pattern."); diff --git a/duckdb b/duckdb index 7c0f857..71c5c07 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 7c0f8574bda9af7aa5b23166d7860d68ae3b9481 +Subproject commit 71c5c07cdd295e9409c0505885033ae9eb6b5ddd diff --git a/extension-ci-tools b/extension-ci-tools index 58970c5..71d2002 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit 58970c538d35919db875096460c05806056f4de0 +Subproject commit 71d20029c5314dfc34f3bbdab808b9bce03b8003 From f056022f4a9d20e280059d66458d907f4a080cdd Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Thu, 17 Jul 2025 18:58:18 +0200 Subject: [PATCH 17/20] v1.3.2 builder (#31) --- .github/workflows/MainDistributionPipeline.yml | 4 ++-- duckdb | 2 +- extension-ci-tools | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index c130c02..46d78a5 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -24,6 +24,6 @@ jobs: name: Build extension binaries uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: - duckdb_version: v1.3.0 - ci_tools_version: v1.3.0 + duckdb_version: v1.3.2 + ci_tools_version: v1.3.2 extension_name: chsql diff --git a/duckdb b/duckdb index 71c5c07..0b83e5d 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 71c5c07cdd295e9409c0505885033ae9eb6b5ddd +Subproject commit 0b83e5d2f68bc02dfefde74b846bd039f078affa diff --git a/extension-ci-tools b/extension-ci-tools index 
71d2002..90757de 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit 71d20029c5314dfc34f3bbdab808b9bce03b8003 +Subproject commit 90757de3f06c6802cd49732849b9e46eef75761f From da7a7003d7a4c396a7f3ef4dc90f28a548ad7148 Mon Sep 17 00:00:00 2001 From: Rusty Conover Date: Fri, 22 Aug 2025 12:28:11 -0400 Subject: [PATCH 18/20] fix: remove openssl dependency and unused functions --- .github/workflows/_extension_deploy.yml | 121 ------- .github/workflows/main.yml | 24 -- CMakeLists.txt | 1 - chsql/src/chsql_extension.cpp | 400 +++++++++++------------- chsql/src/include/chsql_extension.hpp | 21 +- chsql/test/sql/chsql.test | 11 - chsql/vcpkg.json | 4 +- 7 files changed, 201 insertions(+), 381 deletions(-) delete mode 100644 .github/workflows/_extension_deploy.yml delete mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/_extension_deploy.yml b/.github/workflows/_extension_deploy.yml deleted file mode 100644 index c408f90..0000000 --- a/.github/workflows/_extension_deploy.yml +++ /dev/null @@ -1,121 +0,0 @@ -# -# Reusable workflow that deploys the artifacts produced by github.com/duckdb/duckdb/.github/workflows/_extension_distribution.yml -# -# note: this workflow needs to be located in the extension repository, as it requires secrets to be passed to the -# deploy script. However, it should generally not be necessary to modify this workflow in your extension repository, as -# this workflow can be configured to use a custom deploy script. - - -name: Extension Deployment -on: - workflow_call: - inputs: - # The name of the extension - extension_name: - required: true - type: string - # DuckDB version to build against - duckdb_version: - required: true - type: string - # ';' separated list of architectures to exclude, for example: 'linux_amd64;osx_arm64' - exclude_archs: - required: false - type: string - default: "" - # Whether to upload this deployment as the latest. This may overwrite a previous deployment. - deploy_latest: - required: false - type: boolean - default: false - # Whether to upload this deployment under a versioned path. These will not be deleted automatically - deploy_versioned: - required: false - type: boolean - default: false - # Postfix added to artifact names. 
Can be used to guarantee unique names when this workflow is called multiple times - artifact_postfix: - required: false - type: string - default: "" - # Override the default deploy script with a custom script - deploy_script: - required: false - type: string - default: "./scripts/extension-upload.sh" - # Override the default matrix parse script with a custom script - matrix_parse_script: - required: false - type: string - default: "./duckdb/scripts/modify_distribution_matrix.py" - -jobs: - generate_matrix: - name: Generate matrix - runs-on: ubuntu-latest - outputs: - deploy_matrix: ${{ steps.parse-matrices.outputs.deploy_matrix }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - submodules: 'true' - - - name: Checkout DuckDB to version - run: | - cd duckdb - git checkout ${{ inputs.duckdb_version }} - - - id: parse-matrices - run: | - python3 ${{ inputs.matrix_parse_script }} --input ./duckdb/.github/config/distribution_matrix.json --deploy_matrix --output deploy_matrix.json --exclude "${{ inputs.exclude_archs }}" --pretty - deploy_matrix="`cat deploy_matrix.json`" - echo deploy_matrix=$deploy_matrix >> $GITHUB_OUTPUT - echo `cat $GITHUB_OUTPUT` - - deploy: - name: Deploy - runs-on: ubuntu-latest - needs: generate_matrix - if: ${{ needs.generate_matrix.outputs.deploy_matrix != '{}' && needs.generate_matrix.outputs.deploy_matrix != '' }} - strategy: - matrix: ${{fromJson(needs.generate_matrix.outputs.deploy_matrix)}} - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - submodules: 'true' - - - name: Checkout DuckDB to version - run: | - cd duckdb - git checkout ${{ inputs.duckdb_version }} - - - uses: actions/download-artifact@v2 - with: - name: ${{ inputs.extension_name }}-${{ inputs.duckdb_version }}-extension-${{matrix.duckdb_arch}}${{inputs.artifact_postfix}}${{startsWith(matrix.duckdb, 'wasm') && '.wasm' || ''}} - path: | - /tmp/extension - - - name: Deploy - shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEPLOY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEPLOY_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.S3_REGION }} - BUCKET_NAME: ${{ secrets.S3_BUCKET }} - DUCKDB_EXTENSION_SIGNING_PK: ${{ secrets.S3_DUCKDB_ORG_EXTENSION_SIGNING_PK }} - run: | - pwd - python3 -m pip install pip awscli - git config --global --add safe.directory '*' - cd duckdb - git fetch --tags - export DUCKDB_VERSION=`git tag --points-at HEAD` - export DUCKDB_VERSION=${DUCKDB_VERSION:=`git log -1 --format=%h`} - cd .. 
- git fetch --tags - export EXT_VERSION=`git tag --points-at HEAD` - export EXT_VERSION=${EXT_VERSION:=`git log -1 --format=%h`} - ${{ inputs.deploy_script }} ${{ inputs.extension_name }} $EXT_VERSION $DUCKDB_VERSION ${{ matrix.duckdb_arch }} $BUCKET_NAME ${{inputs.deploy_latest || 'true' && 'false'}} ${{inputs.deploy_versioned || 'true' && 'false'}} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index e53841a..0000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Update Submodules -on: - workflow_dispatch: - -permissions: - contents: write - -jobs: - update: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Update module - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - git submodule update --init --recursive - git config --global user.name "GitHub Action" - git config --global user.email "noreply@github.com" - git commit -am "update submodules" - # git push - diff --git a/CMakeLists.txt b/CMakeLists.txt index f750372..57edf1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,6 @@ set(TARGET_NAME chsql) # DuckDB's extension distribution supports vcpkg. As such, dependencies can be added in ./vcpkg.json and then # used in cmake with find_package. Feel free to remove or replace with other dependencies. # Note that it should also be removed from vcpkg.json to prevent needlessly installing it.. -find_package(OpenSSL REQUIRED) set(EXTENSION_NAME ${TARGET_NAME}_extension) set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) project(${TARGET_NAME}) diff --git a/chsql/src/chsql_extension.cpp b/chsql/src/chsql_extension.cpp index f5f0f9a..4a8b871 100644 --- a/chsql/src/chsql_extension.cpp +++ b/chsql/src/chsql_extension.cpp @@ -10,160 +10,159 @@ #include "duckdb/catalog/default/default_functions.hpp" #include "duckdb/catalog/default/default_table_functions.hpp" -// OpenSSL linked through vcpkg -#include #include "parquet_ordered_scan.cpp" #include "chsql_system.hpp" -namespace duckdb { - -// To add a new scalar SQL macro, add a new macro to this array! -// Copy and paste the top item in the array into the -// second-to-last position and make some modifications. 
-// (essentially, leave the last entry in the array as {nullptr, nullptr, {nullptr}, nullptr}) - -// Keep the DEFAULT_SCHEMA (no change needed) -// Replace "times_two" with a name for your macro -// If your function has parameters, add their names in quotes inside of the {}, with a nullptr at the end -// If you do not have parameters, simplify to {nullptr} -// Add the text of your SQL macro as a raw string with the format R"( select 42 )" - -static DefaultMacro chsql_macros[] = { - // -- Type conversion macros - {DEFAULT_SCHEMA, "toString", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS VARCHAR))"}, - {DEFAULT_SCHEMA, "toInt8", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT8))"}, - {DEFAULT_SCHEMA, "toInt16", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT16))"}, - {DEFAULT_SCHEMA, "toInt32", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT32))"}, - {DEFAULT_SCHEMA, "toInt64", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT64))"}, - {DEFAULT_SCHEMA, "toInt128", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT128))"}, - {DEFAULT_SCHEMA, "toInt256", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS HUGEINT))"}, - {DEFAULT_SCHEMA, "toInt8OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT8) IS NOT NULL THEN CAST(x AS INT8) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt16OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT16) IS NOT NULL THEN CAST(x AS INT16) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt32OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT32) IS NOT NULL THEN CAST(x AS INT32) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt64OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT64) IS NOT NULL THEN CAST(x AS INT64) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt128OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT128) IS NOT NULL THEN CAST(x AS INT128) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt256OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS HUGEINT) IS NOT NULL THEN CAST(x AS HUGEINT) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toInt8OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT8))"}, - {DEFAULT_SCHEMA, "toInt16OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT16))"}, - {DEFAULT_SCHEMA, "toInt32OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT32))"}, - {DEFAULT_SCHEMA, "toInt64OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT64))"}, - {DEFAULT_SCHEMA, "toInt128OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT128))"}, - {DEFAULT_SCHEMA, "toInt256OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS HUGEINT))"}, - // -- Unsigned integer conversion macros - {DEFAULT_SCHEMA, "toUInt8", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UTINYINT))"}, - {DEFAULT_SCHEMA, "toUInt16", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS USMALLINT))"}, - {DEFAULT_SCHEMA, "toUInt32", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UINTEGER))"}, - {DEFAULT_SCHEMA, "toUInt64", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UBIGINT))"}, - {DEFAULT_SCHEMA, "toUInt8OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UTINYINT) IS NOT NULL THEN CAST(x AS UTINYINT) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toUInt16OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS USMALLINT) IS NOT NULL THEN CAST(x AS USMALLINT) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toUInt32OrZero", {"x", nullptr}, {{nullptr, 
nullptr}}, R"(CASE WHEN TRY_CAST(x AS UINTEGER) IS NOT NULL THEN CAST(x AS UINTEGER) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toUInt64OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UBIGINT) IS NOT NULL THEN CAST(x AS UBIGINT) ELSE 0 END)"}, - {DEFAULT_SCHEMA, "toUInt8OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UTINYINT))"}, // Fixed comma here - {DEFAULT_SCHEMA, "toUInt16OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS USMALLINT))"}, // And here - {DEFAULT_SCHEMA, "toUInt32OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UINTEGER))"}, // Also here - {DEFAULT_SCHEMA, "toUInt64OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UBIGINT))"}, // And here - // -- Floating-point conversion macros - {DEFAULT_SCHEMA, "toFloat", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS DOUBLE))"}, - {DEFAULT_SCHEMA, "toFloatOrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS DOUBLE))"}, - {DEFAULT_SCHEMA, "toFloatOrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS DOUBLE) IS NOT NULL THEN CAST(x AS DOUBLE) ELSE 0 END)"}, - // -- Arithmetic macros - {DEFAULT_SCHEMA, "intDiv", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((CAST(a AS BIGINT) // CAST(b AS BIGINT)))"}, - {DEFAULT_SCHEMA, "intDivOrNull", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST((TRY_CAST(a AS BIGINT) // TRY_CAST(b AS BIGINT)) AS BIGINT))"}, - {DEFAULT_SCHEMA, "intDivOZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(COALESCE((TRY_CAST((TRY_CAST(a AS BIGINT) // TRY_CAST(b AS BIGINT)) AS BIGINT)),0))"}, - {DEFAULT_SCHEMA, "plus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(add(a, b))"}, - {DEFAULT_SCHEMA, "minus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(subtract(a, b))"}, - {DEFAULT_SCHEMA, "modulo", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(CAST(a AS BIGINT) % CAST(b AS BIGINT))"}, - {DEFAULT_SCHEMA, "moduloOrZero", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(COALESCE(((TRY_CAST(a AS BIGINT) % TRY_CAST(b AS BIGINT))),0))"}, - // -- Tuple macros - {DEFAULT_SCHEMA, "tupleIntDiv", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] // CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleIntDivByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) // CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleDivide", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] / CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleMultiply", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> CAST(apply(b, x -> CAST(x AS BIGINT))[i] as BIGINT) * CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleMinus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] - CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tuplePlus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] + CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleMultiplyByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) * CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleDivideByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) / CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleModulo", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS 
BIGINT))[1] as BIGINT) % CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleModuloByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) % CAST(x AS BIGINT)))"}, - {DEFAULT_SCHEMA, "tupleConcat", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(list_concat(a, b))"}, - // -- String matching macros - {DEFAULT_SCHEMA, "match", {"string", "token"}, {{nullptr, nullptr}}, R"(string LIKE token)"}, - // -- Array macros - {DEFAULT_SCHEMA, "arrayExists", {"needle", "haystack", nullptr}, {{nullptr, nullptr}}, R"(haystack @> ARRAY[needle])"}, - {DEFAULT_SCHEMA, "arrayMap", {"x", "arr", nullptr}, {{nullptr, nullptr}}, R"(array_transform(arr, x))"}, - // Date and Time Functions - {DEFAULT_SCHEMA, "toYear", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(YEAR FROM date_expression))"}, - {DEFAULT_SCHEMA, "toMonth", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(MONTH FROM date_expression))"}, - {DEFAULT_SCHEMA, "toDayOfMonth", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(DAY FROM date_expression))"}, - {DEFAULT_SCHEMA, "toHour", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(HOUR FROM date_expression))"}, - {DEFAULT_SCHEMA, "toMinute", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(MINUTE FROM date_expression))"}, - {DEFAULT_SCHEMA, "toSecond", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(SECOND FROM date_expression))"}, - {DEFAULT_SCHEMA, "toYYYYMM", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m'))"}, - {DEFAULT_SCHEMA, "toYYYYMMDD", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m%d'))"}, - {DEFAULT_SCHEMA, "toYYYYMMDDhhmmss", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m%d%H%M%S'))"}, - {DEFAULT_SCHEMA, "formatDateTime", {"time", "format", "timezone", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN timezone IS NULL THEN strftime(time, format) ELSE strftime(time AT TIME ZONE timezone, format) END)"}, - // String Functions - {DEFAULT_SCHEMA, "empty", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str) = 0)"}, - {DEFAULT_SCHEMA, "notEmpty", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str) > 0)"}, - {DEFAULT_SCHEMA, "lengthUTF8", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str))"}, - {DEFAULT_SCHEMA, "leftPad", {"str", "length", "pad_str", nullptr}, {{nullptr, nullptr}}, R"(LPAD(str, length, pad_str))"}, - {DEFAULT_SCHEMA, "rightPad", {"str", "length", "pad_str", nullptr}, {{nullptr, nullptr}}, R"(RPAD(str, length, pad_str))"}, - {DEFAULT_SCHEMA, "extractAllGroups", {"text", "pattern", nullptr}, {{nullptr, nullptr}}, R"(regexp_extract_all(text, pattern))"}, - {DEFAULT_SCHEMA, "toFixedString", {"str", "length", nullptr}, {{nullptr, nullptr}}, R"(RPAD(LEFT(str, length), length, '\0'))"}, - {DEFAULT_SCHEMA, "ifNull", {"x", "y", nullptr}, {{nullptr, nullptr}}, R"(COALESCE(x, y))"}, - {DEFAULT_SCHEMA, "arrayJoin", {"arr", nullptr}, {{nullptr, nullptr}}, R"(UNNEST(arr))"}, - {DEFAULT_SCHEMA, "splitByChar", {"separator", "str", nullptr}, {{nullptr, nullptr}}, R"(string_split(str, separator))"}, - // URL Functions - {DEFAULT_SCHEMA, "protocol", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '^(\w+)://', 1))"}, - {DEFAULT_SCHEMA, "domain", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '://([^/]+)', 1))"}, - {DEFAULT_SCHEMA, "topLevelDomain", {"url", nullptr}, {{nullptr, nullptr}}, 
R"(REGEXP_EXTRACT(url, '\.([^./:]+)([:/]|$)', 1))"}, - {DEFAULT_SCHEMA, "path", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '://[^/]+(/.*)', 1))"}, - // IP Address Functions - {DEFAULT_SCHEMA, "IPv4NumToString", {"num", nullptr}, {{nullptr, nullptr}}, R"(CONCAT(CAST((num >> 24) & 255 AS VARCHAR), '.', CAST((num >> 16) & 255 AS VARCHAR), '.', CAST((num >> 8) & 255 AS VARCHAR), '.', CAST(num & 255 AS VARCHAR)))"}, - {DEFAULT_SCHEMA, "IPv4StringToNum", {"ip", nullptr}, {{nullptr, nullptr}}, R"(CAST(SPLIT_PART(ip, '.', 1) AS INTEGER) * 256 * 256 * 256 + CAST(SPLIT_PART(ip, '.', 2) AS INTEGER) * 256 * 256 + CAST(SPLIT_PART(ip, '.', 3) AS INTEGER) * 256 + CAST(SPLIT_PART(ip, '.', 4) AS INTEGER))"}, - // -- JSON Macros - {DEFAULT_SCHEMA, "JSONExtract", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key))"}, - {DEFAULT_SCHEMA, "JSONExtractString", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS VARCHAR))"}, - {DEFAULT_SCHEMA, "JSONExtractUInt", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS UINTEGER))"}, - {DEFAULT_SCHEMA, "JSONExtractInt", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS INT32))"}, - {DEFAULT_SCHEMA, "JSONExtractFloat", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS DOUBLE))"}, - {DEFAULT_SCHEMA, "JSONExtractRaw", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key))"}, - {DEFAULT_SCHEMA, "JSONHas", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key) IS NOT NULL)"}, - {DEFAULT_SCHEMA, "JSONLength", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_array_length(json))"}, - {DEFAULT_SCHEMA, "JSONType", {"json", "path", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN path IS NULL THEN json_each(json) ELSE json_each(json,path) END)"}, - {DEFAULT_SCHEMA, "JSONExtractKeys", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_object_keys(json))"}, - {DEFAULT_SCHEMA, "JSONExtractValues", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_each_text(json))"}, - // -- Compare Macros - {DEFAULT_SCHEMA, "equals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a = b))"}, - {DEFAULT_SCHEMA, "notEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a <> b))"}, - {DEFAULT_SCHEMA, "less", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a < b))"}, - {DEFAULT_SCHEMA, "greater", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a > b))"}, - {DEFAULT_SCHEMA, "lessOrEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a <= b))"}, - {DEFAULT_SCHEMA, "greaterOrEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a >= b))"}, - // -- Misc macros - {DEFAULT_SCHEMA, "generateUUIDv4", {nullptr}, {{nullptr, nullptr}}, R"(toString(uuid()))"}, - {DEFAULT_SCHEMA, "parseURL", {"url", "part", nullptr}, {{nullptr, nullptr}}, R"(CASE part WHEN 'protocol' THEN REGEXP_EXTRACT(url, '^(\w+)://') WHEN 'domain' THEN REGEXP_EXTRACT(url, '://([^/:]+)') WHEN 'port' THEN REGEXP_EXTRACT(url, ':(\d+)') WHEN 'path' THEN REGEXP_EXTRACT(url, '://[^/]+(/.+?)(\?|#|$)') WHEN 'query' THEN REGEXP_EXTRACT(url, '\?([^#]+)') WHEN 'fragment' THEN REGEXP_EXTRACT(url, '#(.+)$') END)"}, - {DEFAULT_SCHEMA, "bitCount", {"num", nullptr}, {{nullptr, nullptr}}, R"(BIT_COUNT(num))"}, - // Dictionary Emulation using MAP VARIABLES - {DEFAULT_SCHEMA, "dictGet", {"dict","attr", nullptr}, {{nullptr, nullptr}}, R"(getvariable(dict)[attr][1])"}, - // -- End Macro - {nullptr, nullptr, {nullptr}, {{nullptr, nullptr}}, nullptr}}; - -// To 
add a new table SQL macro, add a new macro to this array! -// Copy and paste the top item in the array into the -// second-to-last position and make some modifications. -// (essentially, leave the last entry in the array as {nullptr, nullptr, {nullptr}, nullptr}) - -// Keep the DEFAULT_SCHEMA (no change needed) -// Replace "times_two_table" with a name for your macro -// If your function has parameters without default values, add their names in quotes inside of the {}, with a nullptr at the end -// If you do not have parameters, simplify to {nullptr} -// If your function has parameters with default values, add their names and values in quotes inside of {}'s inside of the {}. -// Be sure to keep {nullptr, nullptr} at the end -// If you do not have parameters with default values, simplify to {nullptr, nullptr} -// Add the text of your SQL macro as a raw string with the format R"( select 42; )" - -// clang-format off +namespace duckdb +{ + + // To add a new scalar SQL macro, add a new macro to this array! + // Copy and paste the top item in the array into the + // second-to-last position and make some modifications. + // (essentially, leave the last entry in the array as {nullptr, nullptr, {nullptr}, nullptr}) + + // Keep the DEFAULT_SCHEMA (no change needed) + // Replace "times_two" with a name for your macro + // If your function has parameters, add their names in quotes inside of the {}, with a nullptr at the end + // If you do not have parameters, simplify to {nullptr} + // Add the text of your SQL macro as a raw string with the format R"( select 42 )" + + static DefaultMacro chsql_macros[] = { + // -- Type conversion macros + {DEFAULT_SCHEMA, "toString", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS VARCHAR))"}, + {DEFAULT_SCHEMA, "toInt8", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT8))"}, + {DEFAULT_SCHEMA, "toInt16", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT16))"}, + {DEFAULT_SCHEMA, "toInt32", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT32))"}, + {DEFAULT_SCHEMA, "toInt64", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT64))"}, + {DEFAULT_SCHEMA, "toInt128", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS INT128))"}, + {DEFAULT_SCHEMA, "toInt256", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS HUGEINT))"}, + {DEFAULT_SCHEMA, "toInt8OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT8) IS NOT NULL THEN CAST(x AS INT8) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt16OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT16) IS NOT NULL THEN CAST(x AS INT16) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt32OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT32) IS NOT NULL THEN CAST(x AS INT32) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt64OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT64) IS NOT NULL THEN CAST(x AS INT64) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt128OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS INT128) IS NOT NULL THEN CAST(x AS INT128) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt256OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS HUGEINT) IS NOT NULL THEN CAST(x AS HUGEINT) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toInt8OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT8))"}, + {DEFAULT_SCHEMA, "toInt16OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT16))"}, + {DEFAULT_SCHEMA, "toInt32OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT32))"}, 
+ {DEFAULT_SCHEMA, "toInt64OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT64))"}, + {DEFAULT_SCHEMA, "toInt128OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS INT128))"}, + {DEFAULT_SCHEMA, "toInt256OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS HUGEINT))"}, + // -- Unsigned integer conversion macros + {DEFAULT_SCHEMA, "toUInt8", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UTINYINT))"}, + {DEFAULT_SCHEMA, "toUInt16", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS USMALLINT))"}, + {DEFAULT_SCHEMA, "toUInt32", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UINTEGER))"}, + {DEFAULT_SCHEMA, "toUInt64", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS UBIGINT))"}, + {DEFAULT_SCHEMA, "toUInt8OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UTINYINT) IS NOT NULL THEN CAST(x AS UTINYINT) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toUInt16OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS USMALLINT) IS NOT NULL THEN CAST(x AS USMALLINT) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toUInt32OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UINTEGER) IS NOT NULL THEN CAST(x AS UINTEGER) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toUInt64OrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS UBIGINT) IS NOT NULL THEN CAST(x AS UBIGINT) ELSE 0 END)"}, + {DEFAULT_SCHEMA, "toUInt8OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UTINYINT))"}, // Fixed comma here + {DEFAULT_SCHEMA, "toUInt16OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS USMALLINT))"}, // And here + {DEFAULT_SCHEMA, "toUInt32OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UINTEGER))"}, // Also here + {DEFAULT_SCHEMA, "toUInt64OrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS UBIGINT))"}, // And here + // -- Floating-point conversion macros + {DEFAULT_SCHEMA, "toFloat", {"x", nullptr}, {{nullptr, nullptr}}, R"(CAST(x AS DOUBLE))"}, + {DEFAULT_SCHEMA, "toFloatOrNull", {"x", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST(x AS DOUBLE))"}, + {DEFAULT_SCHEMA, "toFloatOrZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN TRY_CAST(x AS DOUBLE) IS NOT NULL THEN CAST(x AS DOUBLE) ELSE 0 END)"}, + // -- Arithmetic macros + {DEFAULT_SCHEMA, "intDiv", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((CAST(a AS BIGINT) // CAST(b AS BIGINT)))"}, + {DEFAULT_SCHEMA, "intDivOrNull", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(TRY_CAST((TRY_CAST(a AS BIGINT) // TRY_CAST(b AS BIGINT)) AS BIGINT))"}, + {DEFAULT_SCHEMA, "intDivOZero", {"x", nullptr}, {{nullptr, nullptr}}, R"(COALESCE((TRY_CAST((TRY_CAST(a AS BIGINT) // TRY_CAST(b AS BIGINT)) AS BIGINT)),0))"}, + {DEFAULT_SCHEMA, "plus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(add(a, b))"}, + {DEFAULT_SCHEMA, "minus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(subtract(a, b))"}, + {DEFAULT_SCHEMA, "modulo", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(CAST(a AS BIGINT) % CAST(b AS BIGINT))"}, + {DEFAULT_SCHEMA, "moduloOrZero", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(COALESCE(((TRY_CAST(a AS BIGINT) % TRY_CAST(b AS BIGINT))),0))"}, + // -- Tuple macros + {DEFAULT_SCHEMA, "tupleIntDiv", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] // CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleIntDivByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) // CAST(x AS BIGINT)))"}, + 
{DEFAULT_SCHEMA, "tupleDivide", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] / CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleMultiply", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> CAST(apply(b, x -> CAST(x AS BIGINT))[i] as BIGINT) * CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleMinus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] - CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tuplePlus", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x,i) -> apply(b, x -> CAST(x AS BIGINT))[i] + CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleMultiplyByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) * CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleDivideByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) / CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleModulo", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) % CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleModuloByNumber", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(apply(a, (x) -> CAST(apply(b, x -> CAST(x AS BIGINT))[1] as BIGINT) % CAST(x AS BIGINT)))"}, + {DEFAULT_SCHEMA, "tupleConcat", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"(list_concat(a, b))"}, + // -- String matching macros + {DEFAULT_SCHEMA, "match", {"string", "token"}, {{nullptr, nullptr}}, R"(string LIKE token)"}, + // -- Array macros + {DEFAULT_SCHEMA, "arrayExists", {"needle", "haystack", nullptr}, {{nullptr, nullptr}}, R"(haystack @> ARRAY[needle])"}, + {DEFAULT_SCHEMA, "arrayMap", {"x", "arr", nullptr}, {{nullptr, nullptr}}, R"(array_transform(arr, x))"}, + // Date and Time Functions + {DEFAULT_SCHEMA, "toYear", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(YEAR FROM date_expression))"}, + {DEFAULT_SCHEMA, "toMonth", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(MONTH FROM date_expression))"}, + {DEFAULT_SCHEMA, "toDayOfMonth", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(DAY FROM date_expression))"}, + {DEFAULT_SCHEMA, "toHour", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(HOUR FROM date_expression))"}, + {DEFAULT_SCHEMA, "toMinute", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(MINUTE FROM date_expression))"}, + {DEFAULT_SCHEMA, "toSecond", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(EXTRACT(SECOND FROM date_expression))"}, + {DEFAULT_SCHEMA, "toYYYYMM", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m'))"}, + {DEFAULT_SCHEMA, "toYYYYMMDD", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m%d'))"}, + {DEFAULT_SCHEMA, "toYYYYMMDDhhmmss", {"date_expression", nullptr}, {{nullptr, nullptr}}, R"(DATE_FORMAT(date_expression, '%Y%m%d%H%M%S'))"}, + {DEFAULT_SCHEMA, "formatDateTime", {"time", "format", "timezone", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN timezone IS NULL THEN strftime(time, format) ELSE strftime(time AT TIME ZONE timezone, format) END)"}, + // String Functions + {DEFAULT_SCHEMA, "empty", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str) = 0)"}, + {DEFAULT_SCHEMA, "notEmpty", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str) > 0)"}, + {DEFAULT_SCHEMA, "lengthUTF8", {"str", nullptr}, {{nullptr, nullptr}}, R"(LENGTH(str))"}, + 
{DEFAULT_SCHEMA, "leftPad", {"str", "length", "pad_str", nullptr}, {{nullptr, nullptr}}, R"(LPAD(str, length, pad_str))"}, + {DEFAULT_SCHEMA, "rightPad", {"str", "length", "pad_str", nullptr}, {{nullptr, nullptr}}, R"(RPAD(str, length, pad_str))"}, + {DEFAULT_SCHEMA, "extractAllGroups", {"text", "pattern", nullptr}, {{nullptr, nullptr}}, R"(regexp_extract_all(text, pattern))"}, + {DEFAULT_SCHEMA, "toFixedString", {"str", "length", nullptr}, {{nullptr, nullptr}}, R"(RPAD(LEFT(str, length), length, '\0'))"}, + {DEFAULT_SCHEMA, "ifNull", {"x", "y", nullptr}, {{nullptr, nullptr}}, R"(COALESCE(x, y))"}, + {DEFAULT_SCHEMA, "arrayJoin", {"arr", nullptr}, {{nullptr, nullptr}}, R"(UNNEST(arr))"}, + {DEFAULT_SCHEMA, "splitByChar", {"separator", "str", nullptr}, {{nullptr, nullptr}}, R"(string_split(str, separator))"}, + // URL Functions + {DEFAULT_SCHEMA, "protocol", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '^(\w+)://', 1))"}, + {DEFAULT_SCHEMA, "domain", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '://([^/]+)', 1))"}, + {DEFAULT_SCHEMA, "topLevelDomain", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '\.([^./:]+)([:/]|$)', 1))"}, + {DEFAULT_SCHEMA, "path", {"url", nullptr}, {{nullptr, nullptr}}, R"(REGEXP_EXTRACT(url, '://[^/]+(/.*)', 1))"}, + // IP Address Functions + {DEFAULT_SCHEMA, "IPv4NumToString", {"num", nullptr}, {{nullptr, nullptr}}, R"(CONCAT(CAST((num >> 24) & 255 AS VARCHAR), '.', CAST((num >> 16) & 255 AS VARCHAR), '.', CAST((num >> 8) & 255 AS VARCHAR), '.', CAST(num & 255 AS VARCHAR)))"}, + {DEFAULT_SCHEMA, "IPv4StringToNum", {"ip", nullptr}, {{nullptr, nullptr}}, R"(CAST(SPLIT_PART(ip, '.', 1) AS INTEGER) * 256 * 256 * 256 + CAST(SPLIT_PART(ip, '.', 2) AS INTEGER) * 256 * 256 + CAST(SPLIT_PART(ip, '.', 3) AS INTEGER) * 256 + CAST(SPLIT_PART(ip, '.', 4) AS INTEGER))"}, + // -- JSON Macros + {DEFAULT_SCHEMA, "JSONExtract", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key))"}, + {DEFAULT_SCHEMA, "JSONExtractString", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS VARCHAR))"}, + {DEFAULT_SCHEMA, "JSONExtractUInt", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS UINTEGER))"}, + {DEFAULT_SCHEMA, "JSONExtractInt", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS INT32))"}, + {DEFAULT_SCHEMA, "JSONExtractFloat", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(CAST(json_extract(json, key) AS DOUBLE))"}, + {DEFAULT_SCHEMA, "JSONExtractRaw", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key))"}, + {DEFAULT_SCHEMA, "JSONHas", {"json", "key", nullptr}, {{nullptr, nullptr}}, R"(json_extract(json, key) IS NOT NULL)"}, + {DEFAULT_SCHEMA, "JSONLength", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_array_length(json))"}, + {DEFAULT_SCHEMA, "JSONType", {"json", "path", nullptr}, {{nullptr, nullptr}}, R"(CASE WHEN path IS NULL THEN json_each(json) ELSE json_each(json,path) END)"}, + {DEFAULT_SCHEMA, "JSONExtractKeys", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_object_keys(json))"}, + {DEFAULT_SCHEMA, "JSONExtractValues", {"json", nullptr}, {{nullptr, nullptr}}, R"(json_each_text(json))"}, + // -- Compare Macros + {DEFAULT_SCHEMA, "equals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a = b))"}, + {DEFAULT_SCHEMA, "notEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a <> b))"}, + {DEFAULT_SCHEMA, "less", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a < b))"}, + 
{DEFAULT_SCHEMA, "greater", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a > b))"}, + {DEFAULT_SCHEMA, "lessOrEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a <= b))"}, + {DEFAULT_SCHEMA, "greaterOrEquals", {"a", "b", nullptr}, {{nullptr, nullptr}}, R"((a >= b))"}, + // -- Misc macros + {DEFAULT_SCHEMA, "generateUUIDv4", {nullptr}, {{nullptr, nullptr}}, R"(toString(uuid()))"}, + {DEFAULT_SCHEMA, "parseURL", {"url", "part", nullptr}, {{nullptr, nullptr}}, R"(CASE part WHEN 'protocol' THEN REGEXP_EXTRACT(url, '^(\w+)://') WHEN 'domain' THEN REGEXP_EXTRACT(url, '://([^/:]+)') WHEN 'port' THEN REGEXP_EXTRACT(url, ':(\d+)') WHEN 'path' THEN REGEXP_EXTRACT(url, '://[^/]+(/.+?)(\?|#|$)') WHEN 'query' THEN REGEXP_EXTRACT(url, '\?([^#]+)') WHEN 'fragment' THEN REGEXP_EXTRACT(url, '#(.+)$') END)"}, + {DEFAULT_SCHEMA, "bitCount", {"num", nullptr}, {{nullptr, nullptr}}, R"(BIT_COUNT(num))"}, + // Dictionary Emulation using MAP VARIABLES + {DEFAULT_SCHEMA, "dictGet", {"dict", "attr", nullptr}, {{nullptr, nullptr}}, R"(getvariable(dict)[attr][1])"}, + // -- End Macro + {nullptr, nullptr, {nullptr}, {{nullptr, nullptr}}, nullptr}}; + + // To add a new table SQL macro, add a new macro to this array! + // Copy and paste the top item in the array into the + // second-to-last position and make some modifications. + // (essentially, leave the last entry in the array as {nullptr, nullptr, {nullptr}, nullptr}) + + // Keep the DEFAULT_SCHEMA (no change needed) + // Replace "times_two_table" with a name for your macro + // If your function has parameters without default values, add their names in quotes inside of the {}, with a nullptr at the end + // If you do not have parameters, simplify to {nullptr} + // If your function has parameters with default values, add their names and values in quotes inside of {}'s inside of the {}. 
+ // Be sure to keep {nullptr, nullptr} at the end + // If you do not have parameters with default values, simplify to {nullptr, nullptr} + // Add the text of your SQL macro as a raw string with the format R"( select 42; )" + + // clang-format off static const DefaultTableMacro chsql_table_macros[] = { {DEFAULT_SCHEMA, "numbers", {"x", nullptr}, {{"z", "0"}, {nullptr, nullptr}}, R"(SELECT * as number FROM generate_series(z,x-1);)"}, {DEFAULT_SCHEMA, "url", {"url", "format", nullptr}, {{nullptr, nullptr}}, R"(WITH "JSON" as (SELECT * FROM read_json_auto(url)), "PARQUET" as (SELECT * FROM read_parquet(url)), "CSV" as (SELECT * FROM read_csv_auto(url)), "BLOB" as (SELECT * FROM read_blob(url)), "TEXT" as (SELECT * FROM read_text(url)) FROM query_table(format))"}, @@ -185,49 +184,23 @@ static const DefaultTableMacro chsql_table_macros[] = { )"}, {nullptr, nullptr, {nullptr}, {{nullptr, nullptr}}, nullptr} }; -// clang-format on - -inline void ChSqlScalarFun(DataChunk &args, ExpressionState &state, Vector &result) { - auto &name_vector = args.data[0]; - UnaryExecutor::Execute( - name_vector, result, args.size(), - [&](string_t name) { - return StringVector::AddString(result, "ChSql "+name.GetString()+" 🐥");; - }); -} - -inline void ChSqlOpenSSLVersionScalarFun(DataChunk &args, ExpressionState &state, Vector &result) { - auto &name_vector = args.data[0]; - UnaryExecutor::Execute( - name_vector, result, args.size(), - [&](string_t name) { - return StringVector::AddString(result, "ChSql " + name.GetString() + - ", my linked OpenSSL version is " + - OPENSSL_VERSION_TEXT );; - }); -} - -static void LoadInternal(DatabaseInstance &instance) { - // Register a scalar function - auto chsql_scalar_function = ScalarFunction("chsql", {LogicalType::VARCHAR}, LogicalType::VARCHAR, ChSqlScalarFun); - ExtensionUtil::RegisterFunction(instance, chsql_scalar_function); - - // Register another scalar function - auto chsql_openssl_version_scalar_function = ScalarFunction("chsql_openssl_version", {LogicalType::VARCHAR}, - LogicalType::VARCHAR, ChSqlOpenSSLVersionScalarFun); - ExtensionUtil::RegisterFunction(instance, chsql_openssl_version_scalar_function); + // clang-format on + static void LoadInternal(DatabaseInstance &instance) + { // Macros - for (idx_t index = 0; chsql_macros[index].name != nullptr; index++) { - auto info = DefaultFunctionGenerator::CreateInternalMacroInfo(chsql_macros[index]); - ExtensionUtil::RegisterFunction(instance, *info); - } + for (idx_t index = 0; chsql_macros[index].name != nullptr; index++) + { + auto info = DefaultFunctionGenerator::CreateInternalMacroInfo(chsql_macros[index]); + ExtensionUtil::RegisterFunction(instance, *info); + } // Table Macros - for (idx_t index = 0; chsql_table_macros[index].name != nullptr; index++) { - auto table_info = DefaultTableFunctionGenerator::CreateTableMacroInfo(chsql_table_macros[index]); - ExtensionUtil::RegisterFunction(instance, *table_info); - } - ExtensionUtil::RegisterFunction(instance, ReadParquetOrderedFunction()); + for (idx_t index = 0; chsql_table_macros[index].name != nullptr; index++) + { + auto table_info = DefaultTableFunctionGenerator::CreateTableMacroInfo(chsql_table_macros[index]); + ExtensionUtil::RegisterFunction(instance, *table_info); + } + ExtensionUtil::RegisterFunction(instance, ReadParquetOrderedFunction()); // Flock ExtensionUtil::RegisterFunction(instance, DuckFlockTableFunction()); // System Table @@ -237,36 +210,41 @@ static void LoadInternal(DatabaseInstance &instance) { con.BeginTransaction(); CreateSystemViews(con); 
con.Commit(); - -} - -void ChsqlExtension::Load(DuckDB &db) { - LoadInternal(*db.instance); -} -std::string ChsqlExtension::Name() { - return "chsql"; -} - -std::string ChsqlExtension::Version() const { + } + + void ChsqlExtension::Load(DuckDB &db) + { + LoadInternal(*db.instance); + } + std::string ChsqlExtension::Name() + { + return "chsql"; + } + + std::string ChsqlExtension::Version() const + { #ifdef EXT_VERSION_CHSQL - return EXT_VERSION_CHSQL; + return EXT_VERSION_CHSQL; #else - return ""; + return ""; #endif -} + } } // namespace duckdb -extern "C" { +extern "C" +{ -DUCKDB_EXTENSION_API void chsql_init(duckdb::DatabaseInstance &db) { + DUCKDB_EXTENSION_API void chsql_init(duckdb::DatabaseInstance &db) + { duckdb::DuckDB db_wrapper(db); db_wrapper.LoadExtension(); -} + } -DUCKDB_EXTENSION_API const char *chsql_version() { - return duckdb::DuckDB::LibraryVersion(); -} + DUCKDB_EXTENSION_API const char *chsql_version() + { + return duckdb::DuckDB::LibraryVersion(); + } } #ifndef DUCKDB_EXTENSION_MAIN diff --git a/chsql/src/include/chsql_extension.hpp b/chsql/src/include/chsql_extension.hpp index b1c7a5e..50440bf 100644 --- a/chsql/src/include/chsql_extension.hpp +++ b/chsql/src/include/chsql_extension.hpp @@ -2,17 +2,18 @@ #include "duckdb.hpp" -namespace duckdb { +namespace duckdb +{ -class ChsqlExtension : public Extension { -public: - void Load(DuckDB &db) override; - std::string Name() override; - std::string Version() const override; -}; -duckdb::TableFunction ReadParquetOrderedFunction(); -static void RegisterSillyBTreeStore(DatabaseInstance &instance); + class ChsqlExtension : public Extension + { + public: + void Load(DuckDB &db) override; + std::string Name() override; + std::string Version() const override; + }; + duckdb::TableFunction ReadParquetOrderedFunction(); -TableFunction DuckFlockTableFunction(); + TableFunction DuckFlockTableFunction(); } // namespace duckdb diff --git a/chsql/test/sql/chsql.test b/chsql/test/sql/chsql.test index 84c2527..2212e6a 100644 --- a/chsql/test/sql/chsql.test +++ b/chsql/test/sql/chsql.test @@ -13,17 +13,6 @@ require chsql require parquet -# Confirm the extension works -query I -SELECT chsql('Works'); ----- -ChSql Works 🐥 - -query I -SELECT chsql_openssl_version('Hello'); ----- -:.*ChSql Hello, my linked OpenSSL version is OpenSSL.* - query I SELECT toString('123') ---- diff --git a/chsql/vcpkg.json b/chsql/vcpkg.json index 85936bf..0b8fa67 100644 --- a/chsql/vcpkg.json +++ b/chsql/vcpkg.json @@ -1,5 +1,3 @@ { - "dependencies": [ - "openssl" - ] + "dependencies": [] } \ No newline at end of file From babdc1d5f23509e4e89ae3a2a22023316a5839a7 Mon Sep 17 00:00:00 2001 From: Rusty Conover Date: Fri, 22 Aug 2025 12:48:53 -0400 Subject: [PATCH 19/20] fix: ci --- chsql/CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/chsql/CMakeLists.txt b/chsql/CMakeLists.txt index 919a25a..422c2d5 100644 --- a/chsql/CMakeLists.txt +++ b/chsql/CMakeLists.txt @@ -4,7 +4,6 @@ set(TARGET_NAME chsql) # DuckDB's extension distribution supports vcpkg. As such, dependencies can be added in ./vcpkg.json and then # used in cmake with find_package. Feel free to remove or replace with other dependencies. # Note that it should also be removed from vcpkg.json to prevent needlessly installing it.. 
-find_package(OpenSSL REQUIRED) set(EXTENSION_NAME ${TARGET_NAME}_extension) set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) set(CHSQL_DUCKDB_VERSION ${DUCKDB_MAJOR_VERSION}) @@ -24,9 +23,8 @@ include_directories( set(EXTENSION_SOURCES src/chsql_extension.cpp src/duck_flock.cpp src/chsql_system.cpp src/parquet_types.cpp) build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) -# Link OpenSSL in both the static library as the loadable extension -target_link_libraries(${EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto duckdb_mbedtls) -target_link_libraries(${LOADABLE_EXTENSION_NAME} OpenSSL::SSL OpenSSL::Crypto duckdb_mbedtls) +target_link_libraries(${EXTENSION_NAME} duckdb_mbedtls) +target_link_libraries(${LOADABLE_EXTENSION_NAME} duckdb_mbedtls) install( TARGETS ${EXTENSION_NAME} EXPORT "${DUCKDB_EXPORT_SET}" From 88f0ffd3085624d23cac23bf3230ea2e7abc505d Mon Sep 17 00:00:00 2001 From: Rusty Conover Date: Fri, 22 Aug 2025 12:53:26 -0400 Subject: [PATCH 20/20] fix: ci --- .github/workflows/MainDistributionPipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 46d78a5..b359f48 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -25,5 +25,5 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: duckdb_version: v1.3.2 - ci_tools_version: v1.3.2 + ci_tools_version: main extension_name: chsql
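
The scalar macros carried through this series are plain SQL rewrites, so the quickest sanity check after a build is to load the extension and evaluate a few of them by hand. The session below is a minimal sketch, not part of the patches: it assumes a DuckDB shell with the freshly built chsql extension loaded, and the expected results follow directly from the macro bodies shown in PATCH 18/20:

    LOAD 'chsql';
    SELECT toString(42);                    -- '42'
    SELECT toUInt8OrNull('300');            -- NULL: 300 does not fit in UTINYINT
    SELECT intDiv(7, 2);                    -- 3
    SELECT IPv4StringToNum('192.168.1.1');  -- 3232235777
    SELECT IPv4NumToString(3232235777);     -- '192.168.1.1'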
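
The table macros and the MAP-variable dictionary emulation can be exercised the same way. Another sketch, assuming DuckDB's SET VARIABLE / getvariable() support and the list-returning map-extraction behavior that the [1] subscript in the dictGet macro body relies on; the variable name and values here are hypothetical:

    -- numbers(x) expands to generate_series(0, x - 1)
    SELECT * FROM numbers(3);                 -- number: 0, 1, 2

    -- dictGet(dict, attr) looks up a MAP stored in a SQL variable
    SET VARIABLE endpoints = MAP {'primary': '10.0.0.1', 'backup': '10.0.0.2'};
    SELECT dictGet('endpoints', 'primary');   -- '10.0.0.1'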